diff --git a/.bazelrc b/.bazelrc
index ceba7bfdbac74d1e44aadc3010e5e84bd36ce3ee..c70c57136102b483a4332ca22f775d7a2c5b849e 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -25,12 +25,14 @@ build --define framework_shared_object=true
 # If you would like to use a local MKL instead of downloading, please set the
 # environment variable "TF_MKL_ROOT" every time before build.
 build:mkl --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl -c opt
 
 # This config option is used to enable MKL-DNN open source library only,
 # without depending on MKL binary version.
 build:mkl_open_source_only --define=build_with_mkl_dnn_only=true
 build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=0
 
 build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
 build:download_clang --define=using_clang=true
@@ -78,7 +80,7 @@ build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
 
 build --spawn_strategy=standalone
-build --genrule_strategy=standalone
+build --strategy=Genrule=standalone
 build -c opt
 
 # Other build flags.
@@ -93,9 +95,6 @@ build --define=PREFIX=/usr
 build --define=LIBDIR=$(PREFIX)/lib
 build --define=INCLUDEDIR=$(PREFIX)/include
 
-# Disable MKL-DNN contraction kernels by default.
-build --define=tensorflow_mkldnn_contraction_kernel=0
-
 # Default options should come above this line
 
 # Options from ./configure
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4a296f265f7b9521c46d350cec26ff199f43eb6c..b978f89f9e1d79dd4f7481711a59c2b94e8bf01b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -150,41 +150,45 @@ may exist in your changes.
 
 There are two ways to run TensorFlow unit tests.
 
-1. Using tools and libraries installed directly on your system.
+1.  Using tools and libraries installed directly on your system.
 
-   Refer to the
-   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
-   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
-   for the required packages. Alternatively, use the said
-   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
-   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`
-   for development to avoid installing the packages directly on your system.
+    Refer to the
+    [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel)
+    and
+    [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+    for the required packages. Alternatively, use the said
+    [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+    `tensorflow/tensorflow:nightly-devel` and
+    `tensorflow/tensorflow:nightly-devel-gpu` for development to avoid
+    installing the packages directly on your system (in which case remember to
+    change directory from `/root` to `/tensorflow` once you get into the running
+    container so `bazel` can find the `tensorflow` workspace).
 
-   Once you have the packages installed, you can run a specific unit test in
-   bazel by doing as follows:
+    Once you have the packages installed, you can run a specific unit test in
+    bazel by doing as follows:
 
-   If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
-   the `cuda` option flag
+    If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
+    the `cuda` option flag
 
-   ```bash
-   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+    ```bash
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
 
-   export flags="--config=opt --config=cuda -k"
-   ```
+    export flags="--config=opt --config=cuda -k"
+    ```
 
-   For example, to run all tests under tensorflow/python, do:
+    For example, to run all tests under tensorflow/python, do:
 
-   ```bash
-   bazel test ${flags} //tensorflow/python/...
-   ```
+    ```bash
+    bazel test ${flags} //tensorflow/python/...
+    ```
 
-2. Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
+2.  Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
 
-   ```bash
-   # Install Docker first, then this will build and run cpu tests
-   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
-   ```
-
-   See
-   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
+    ```bash
+    # Install Docker first, then this will build and run cpu tests
+    tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+    ```
 
+    See
+    [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build)
+    for details.
diff --git a/README.md b/README.md
index 519815d006cc33be10132909baf414a4bd843435..4e37b239b16e6eeefc587aeb242a03e1f88eddbd 100644
--- a/README.md
+++ b/README.md
@@ -57,21 +57,24 @@ Simply run `pip install tf-nightly` or `pip install tf-nightly-gpu` in a clean
 environment to install the nightly TensorFlow build. We support CPU and GPU
 packages on Linux, Mac, and Windows.
 
-
 #### *Try your first TensorFlow program*
+
 ```shell
 $ python
 ```
+
 ```python
 >>> import tensorflow as tf
 >>> tf.enable_eager_execution()
->>> tf.add(1, 2)
+>>> tf.add(1, 2).numpy()
 3
 >>> hello = tf.constant('Hello, TensorFlow!')
 >>> hello.numpy()
 'Hello, TensorFlow!'
 ```
-Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/).
+
+Learn more examples about how to do specific tasks in TensorFlow at the
+[tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/).
 
 ## Contribution guidelines
 
diff --git a/RELEASE.md b/RELEASE.md
index 282430d12303bde980e19e3c3602eb91b1a54d63..0a56e6909870e398c9d6349576cd2f8e6734f072 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -849,7 +849,7 @@ answered questions, and were part of inspiring discussions.
 * Remove `tf.contrib.data.Iterator.from_dataset()` method. Use
   `Dataset.make_initializable_iterator()` instead.
 * Remove seldom used and unnecessary `tf.contrib.data.Iterator.dispose_op()`.
-* Reorder some TFGAN loss functions in a non-backwards compatible way.
+* Reorder some TF-GAN loss functions in a non-backwards compatible way.
 
 ## Known Issues
 * In Python 3, `Dataset.from_generator()` does not support Unicode strings.
diff --git a/WORKSPACE b/WORKSPACE
index 2277e83a3f67b62cf4ee1311767ee06c0549c697..957b8d8528dc9b5e2ea134921b28601aa6fed2d1 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,11 +4,11 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file"
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
-    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
+    sha256 = "43c9b882fa921923bcba764453f4058d102bece35a37c9f6383c713004aacff1",
+    strip_prefix = "rules_closure-9889e2348259a5aad7e805547c1a0cf311cfcd91",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/9889e2348259a5aad7e805547c1a0cf311cfcd91.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/9889e2348259a5aad7e805547c1a0cf311cfcd91.tar.gz",  # 2018-12-21
     ],
 )
 
@@ -73,7 +73,7 @@ swift_rules_dependencies()
 # files, in case the parsing of those build files depends on the bazel
 # version we require here.
 load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
-check_bazel_version_at_least("0.18.0")
+check_bazel_version_at_least("0.19.0")
 
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
diff --git a/configure.py b/configure.py
index 1e732db26404906901a9eeab97a5e75137ee8388..adc9ef9caca8c0128c63896fdebbbadf7f86da81 100644
--- a/configure.py
+++ b/configure.py
@@ -33,7 +33,7 @@ except ImportError:
   from distutils.spawn import find_executable as which
 # pylint: enable=g-import-not-at-top
 
-_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDA_VERSION = '10.0'
 _DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
@@ -480,7 +480,9 @@ def check_bazel_version(min_version, max_version):
   if (curr_version_int > max_version_int and
       'TF_IGNORE_MAX_BAZEL_VERSION' not in os.environ):
     print('Please downgrade your bazel installation to version %s or lower to '
-          'build TensorFlow!' % max_version)
+          'build TensorFlow! To downgrade: download the installer for the old '
+          'version (from https://github.com/bazelbuild/bazel/releases) then '
+          'run the installer.' % max_version)
     sys.exit(1)
   return curr_version
 
@@ -1554,7 +1556,7 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.19.0', '0.20.0')
+  check_bazel_version('0.19.0', '0.21.0')
 
   reset_tf_configure_bazelrc()
 
@@ -1692,7 +1694,7 @@ def main():
   config_info_line('noaws', 'Disable AWS S3 filesystem support.')
   config_info_line('nogcp', 'Disable GCP support.')
   config_info_line('nohdfs', 'Disable HDFS support.')
-  config_info_line('noignite', 'Disable Apacha Ignite support.')
+  config_info_line('noignite', 'Disable Apache Ignite support.')
   config_info_line('nokafka', 'Disable Apache Kafka support.')
   config_info_line('nonccl', 'Disable NVIDIA NCCL support.')
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f07e7365d3482cde5b7bb76ebf22890150e98651..413806fac14ca4605606507726d7ff87ce73a699 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -343,6 +343,13 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "using_rocm_hipcc",
+    define_values = {
+        "using_rocm_hipcc": "true",
+    },
+)
+
 config_setting(
     name = "with_mpi_support",
     values = {"define": "with_mpi_support=true"},
@@ -370,13 +377,22 @@ config_setting(
     define_values = {"tf_api_version": "2"},
 )
 
+# This flag is defined for select statements that match both
+# on 'windows' and 'api_version_2'. In this case, bazel requires
+# having a flag which is a superset of these two.
+config_setting(
+    name = "windows_and_api_version_2",
+    define_values = {"tf_api_version": "2"},
+    values = {"cpu": "x64_windows"},
+)
+
 package_group(
     name = "internal",
     packages = [
         "-//third_party/tensorflow/python/estimator",
         "//learning/deepmind/...",
         "//learning/meta_rank/...",
-        "//learning/pathways/...",  # While dataset C++ api requires internals
+        "//platforms/performance/autograppler/...",
         "//tensorflow/...",
         "//tensorflow_estimator/contrib/...",
         "//tensorflow_fold/llgtm/...",
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 2c0a7452692e5cdb184f7f0a77eb1b646a1772d4..a93799bfe84b0f9c4743e1ad0effd6e69ad7f3f2 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -52,7 +52,7 @@ elif _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
 # Enable TF2 behaviors
-from tensorflow.python.compat import compat as _compat  # pylint: disable=g-import-not-at-top
+from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
 _compat.enable_v2_behavior()
 
 
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 514aba1b59631f882523396aab0f4d3d5e88a893..eeca8f0d566a6401cb64e4fe3f0ee3c5aeb4ece2 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -62,11 +62,15 @@ if '__all__' in vars():
   vars()['__all__'].append('contrib')
 
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+# The 'app' module will be imported as part of the placeholder section above.
 app.flags = flags  # pylint: disable=undefined-variable
 
+# Also use 'app' module (choice is arbitrary) to derive the API directory below.
+_API_MODULE = app  # pylint: disable=undefined-variable
+
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__))  # pylint: disable=undefined-variable
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))  # pylint: disable=undefined-variable
 if not hasattr(_current_module, '__path__'):
   __path__ = [_tf_api_dir]
 elif _tf_api_dir not in __path__:
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 3e1f220db233001ba652120657631f8c1a296b35..6e50a09bfc5ed3a8f2f7e05e6a6a151525e8dfce 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -83,7 +83,7 @@ tf_cuda_library(
         ],
         "//conditions:default": [
             ":c_api_internal",
-            "//tensorflow/cc/saved_model:loader",
+            "//tensorflow/cc/saved_model:loader_lite",
             "//tensorflow/cc:gradients",
             "//tensorflow/cc:ops",
             "//tensorflow/cc:grad_ops",
@@ -129,6 +129,7 @@ tf_cuda_library(
         "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime/eager:attr_builder",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -252,12 +253,6 @@ tf_cc_test(
     name = "c_test",
     srcs = ["c_test.c"],
     extra_copts = ["-std=c11"],
-    tags = [
-        # TODO(b/121223209): Re-enable after fixing asan memory leaks and MacOS
-        # build errors.
-        "noasan",
-        "no_mac",
-    ],
     deps = [
         ":c_api",
         ":c_api_experimental",
@@ -294,13 +289,20 @@ tf_cuda_cc_test(
         "//tensorflow/cc/saved_model:signature_constants",
         "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/compiler/jit",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/kernels:array",
@@ -324,6 +326,7 @@ tf_cc_test(
     deps = [
         ":c_api",
         ":c_api_experimental",
+        ":c_api_internal",
         ":c_test_util",
         "//tensorflow/c/eager:c_api",
         "//tensorflow/c/eager:c_api_test_util",
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 9580215a317b1a6b1cdacbd430a1764af61be990..94d9f4a6fa2f14cb3343bdd51b7e4d61944444d0 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -257,6 +257,74 @@ int64_t TF_Dim(const TF_Tensor* t, int dim_index) {
 size_t TF_TensorByteSize(const TF_Tensor* t) { return t->buffer->size(); }
 void* TF_TensorData(const TF_Tensor* t) { return t->buffer->data(); }
 
+int64_t TF_TensorElementCount(const TF_Tensor* t) {
+  int64_t result = 1;
+  int rank = TF_NumDims(t);
+  for (int dim = 0; dim < rank; ++dim) {
+    result *= TF_Dim(t, dim);
+  }
+  return result;
+}
+
+// Returns the number of elements that would be present in a tensor with the
+// given shape.
+static int64_t ShapeNumElements(const int64_t* dims, int num_dims) {
+  int64_t result = 1;
+  for (int dim = 0; dim < num_dims; ++dim) {
+    result *= dims[dim];
+  }
+  return result;
+}
+
+static void UnrefIfNonNull(::tensorflow::TensorBuffer* buf) {
+  if (buf != nullptr) {
+    buf->Unref();
+  }
+}
+
+static void RefIfNonNull(::tensorflow::TensorBuffer* buf) {
+  if (buf != nullptr) {
+    buf->Ref();
+  }
+}
+
+void TF_TensorBitcastFrom(const TF_Tensor* from, TF_DataType type,
+                          TF_Tensor* to, const int64_t* new_dims,
+                          int num_new_dims, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  size_t in_size = TF_DataTypeSize(TF_TensorType(from));
+  if (in_size == 0) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "input tensor has a zero-sized data type");
+    return;
+  }
+  size_t out_size = TF_DataTypeSize(type);
+  if (out_size == 0) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "output tensor has a zero-sized data type");
+    return;
+  }
+
+  if (ShapeNumElements(new_dims, num_new_dims) * out_size !=
+      TF_TensorElementCount(from) * in_size) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "input tensor is not compatible with output shape");
+    return;
+  }
+
+  tensorflow::TensorShapeProto p;
+  for (int i = 0; i < num_new_dims; ++i) {
+    p.add_dim()->set_size(new_dims[i]);
+  }
+  to->shape = tensorflow::TensorShape(p);
+  to->dtype = type;
+  if (to->buffer != from->buffer) {
+    UnrefIfNonNull(to->buffer);
+    to->buffer = from->buffer;
+    RefIfNonNull(to->buffer);
+  }
+}
+
 // --------------------------------------------------------------------------
 size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
                        size_t dst_len, TF_Status* status) {
@@ -2881,6 +2949,9 @@ const char* TF_ServerTarget(TF_Server* server) {
 #endif
 }
 
-void TF_DeleteServer(TF_Server* server) { delete server; }
-
+void TF_DeleteServer(TF_Server* server) {
+#ifndef __ANDROID__
+  delete server;
+#endif
+}
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index c7abba85521fccec07983cd5ab4f94a8368d6181..8031928dac4de2391f0aec46e69d61a137606e4d 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -272,6 +272,39 @@ TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*);
 // Return a pointer to the underlying data buffer.
 TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*);
 
+// Returns the number of elements in the tensor.
+TF_CAPI_EXPORT extern int64_t TF_TensorElementCount(const TF_Tensor* tensor);
+
+// Copy the internal data representation of `from` to `to`. `new_dims` and
+// `num_new_dims` specify the new shape of the `to` tensor, `type` specifies its
+// data type. On success, *status is set to TF_OK and the two tensors share the
+// same data buffer.
+//
+// This call requires that the `from` tensor and the given type and shape (dims
+// and num_dims) are "compatible" (i.e. they occupy the same number of bytes).
+// Specifically, given from_type_size = TF_DataTypeSize(TF_TensorType(from)):
+//
+// ShapeElementCount(dims, num_dims) * TF_DataTypeSize(type)
+//
+// must equal
+//
+// TF_TensorElementCount(from) * from_type_size
+//
+// where TF_ShapeElementCount would be the number of elements in a tensor with
+// the given shape.
+//
+// In addition, this function requires:
+//   * TF_DataTypeSize(TF_TensorType(from)) != 0
+//   * TF_DataTypeSize(type) != 0
+//
+// If any of the requirements are not met, *status is set to
+// TF_INVALID_ARGUMENT.
+TF_CAPI_EXPORT extern void TF_TensorBitcastFrom(const TF_Tensor* from,
+                                                TF_DataType type, TF_Tensor* to,
+                                                const int64_t* new_dims,
+                                                int num_new_dims,
+                                                TF_Status* status);
+
 // --------------------------------------------------------------------------
 // Encode the string `src` (`src_len` bytes long) into `dst` in the format
 // required by TF_STRING tensors. Does not write to memory more than `dst_len`
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index f04b285037dff403428ed74fe90eac60339fe36b..6cc74cfb3246e9526e862f363590ce43e390ffaa 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api.h"
@@ -128,6 +129,14 @@ const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   return ret;
 }
 
+char* TF_FunctionDebugString(TF_Function* func, size_t* len) {
+  const auto& debug_str = func->fdef.DebugString();
+  *len = debug_str.size();
+  char* ret = static_cast<char*>(malloc(*len + 1));
+  memcpy(ret, debug_str.c_str(), *len + 1);
+  return ret;
+}
+
 // On success, returns a set of TF_Function instances from `text_proto` of
 // GraphDef type. These functions must be deleted by calling TF_DeleteFunction.
 //
@@ -8737,6 +8746,12 @@ static void CheckOk(TF_Status* status) {
 
 void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   auto* status = TF_NewStatus();
+  if (!TFE_TensorHandleIsConcrete(handle)) {
+    VLOG(1) << "Symbolic tensor: " << handle;
+    TF_DeleteStatus(status);
+    return;
+  }
+
   TF_Tensor* t = TFE_TensorHandleResolve(handle, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
@@ -8748,6 +8763,11 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
+void TFE_OpPrintDebugString(TFE_Op* op) {
+  VLOG(1) << "TFE_OpPrintDebugString() over " << op;
+  LOG(INFO) << op->operation.DebugString();
+}
+
 struct TFE_ExecuteOpNotification {
   TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {}
   tensorflow::Notification n;
@@ -8941,3 +8961,161 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx,
   }
   status->status = EnableCollectiveOps(server_def, ctx);
 }
+
+std::string tensorflow::getTF_OutputDebugString(TF_Output node) {
+  return absl::Substitute("TF_Output($0, $1)", node.oper, node.index);
+}
+
+using tensorflow::getTF_OutputDebugString;
+
+TFE_TensorHandle* TFE_NewTensorHandleFromTFOutput(TF_Output t,
+                                                  TF_DataType dtype) {
+  auto ret = new TFE_TensorHandle(t, dtype);
+  VLOG(1) << "Storing TFOutput " << getTF_OutputDebugString(t)
+          << " into tensor handle " << ret << " with internal handle "
+          << ret->handle;
+  return ret;
+}
+
+unsigned char TFE_TensorHandleIsConcrete(TFE_TensorHandle* handle) {
+  assert(handle->handle != nullptr);
+  return handle->handle->getSymbolicTensor() == nullptr;
+}
+
+TF_Output TFE_GetTFOutputFromTensorHandle(TFE_TensorHandle* handle,
+                                          TF_Status* status) {
+  if (TFE_TensorHandleIsConcrete(handle)) {
+    status->status =
+        tensorflow::errors::Internal("Not a symbolic tensor: ", handle);
+    return TF_Output{nullptr, -1};
+  }
+
+  auto* sym_tensor = handle->handle->getSymbolicTensor();
+  CHECK(sym_tensor != nullptr);
+  auto ret = TF_Output{sym_tensor->oper, sym_tensor->index};
+  VLOG(1) << "Retrieving " << getTF_OutputDebugString(ret)
+          << " from tensor handle " << handle;
+  CHECK_GE(sym_tensor->index, 0);
+  return ret;
+}
+
+TFE_TraceContext* TFE_NewTraceContext(TF_Graph* graph) {
+  return new TFE_TraceContext(graph);
+}
+
+void TFE_DeleteTraceContext(TFE_TraceContext* trace_ctx) { delete trace_ctx; }
+
+// If `handle` is already symbolic, return it. Otherwise map it to a new
+// symbolic tensor (a PlaceHolder op) and return that.
+static TF_Output getOrCreateSymbolicTensor(TFE_TraceContext* trace_ctx,
+                                           tensorflow::TensorHandle* handle,
+                                           TF_Status* status) {
+  VLOG(1) << "Getting symbolic tensor for input tensor handle " << handle
+          << ": " << handle->DebugString();
+
+  auto* sym_tensor = handle->getSymbolicTensor();
+  if (sym_tensor != nullptr) {
+    auto ret = TF_Output{sym_tensor->oper, sym_tensor->index};
+    VLOG(1) << "This handle is a symbolic tensor " << sym_tensor << ": "
+            << getTF_OutputDebugString(ret);
+    return ret;
+  }
+
+  auto find_it = trace_ctx->input_tensor_map.find(handle);
+  if (find_it != trace_ctx->input_tensor_map.end()) {
+    VLOG(1) << "There exists a map entry from this concrete tensor to: "
+            << getTF_OutputDebugString(find_it->second);
+    return find_it->second;
+  }
+
+  auto node_name = tensorflow::strings::StrCat("additional_input_",
+                                               trace_ctx->node_counter++);
+  VLOG(1) << "Adding a place holder node named " << node_name;
+  auto* desc =
+      TF_NewOperation(trace_ctx->graph, "Placeholder", node_name.c_str());
+  TF_SetAttrType(desc, "dtype",
+                 static_cast<TF_DataType>(handle->dtype) /*TF_FLOAT*/);
+  auto* result = TF_FinishOperation(desc, status);
+  if (!status->status.ok()) {
+    return TF_Output{nullptr, -1};
+  }
+
+  auto ret = TF_Output{result, 0};
+  VLOG(1) << "Creating a new map entry to map to: "
+          << getTF_OutputDebugString(ret);
+  trace_ctx->input_tensor_map[handle] = ret;
+  // `handle` could be destroyed before it's read from `input_tensor_map` (say
+  // during a subsequent TFE_FinalizeInputTensorsFromTraceContext() call), so we
+  // increment its ref count to extend its life span to that of `trace_ctx`.
+  handle->Ref();
+  VLOG(1) << "Ref count for handle " << handle
+          << " is 1?: " << handle->RefCountIsOne();
+  return ret;
+}
+
+void TFE_AddEagerOpToGraph(TFE_Op* op, TFE_TraceContext* trace_ctx,
+                           TFE_TensorHandle** retvals, int* num_retvals,
+                           TF_Status* status) {
+  VLOG(1) << "Calling TFE_AddEagerOpToGraph() with op " << op << ": "
+          << op->operation.DebugString();
+
+  const auto& op_type = op->operation.Name();
+  auto op_name =
+      tensorflow::strings::StrCat(op_type, "_", trace_ctx->node_counter++);
+  auto* desc =
+      TF_NewOperation(trace_ctx->graph, op_type.c_str(), op_name.c_str());
+  for (auto* input : op->operation.Inputs()) {
+    auto symbolic_input = getOrCreateSymbolicTensor(trace_ctx, input, status);
+    if (!status->status.ok()) return;
+    TF_AddInput(desc, symbolic_input);
+  }
+
+  VLOG(1) << "Adding attrs.";
+  // TODO(hongm): add attrs
+
+  auto* graph_op = TF_FinishOperation(desc, status);
+  if (!status->status.ok()) return;
+
+  VLOG(1) << "Op finalized; setting return tensors.";
+  *num_retvals = TF_OperationNumOutputs(graph_op);
+  VLOG(1) << "This op has " << *num_retvals << " outputs.";
+  for (int i = 0; i < *num_retvals; ++i) {
+    auto output = TF_Output{graph_op, i};
+    auto dtype = TF_OperationOutputType(output);
+    retvals[i] = TFE_NewTensorHandleFromTFOutput(output, dtype);
+  }
+}
+
+int TFE_FinalizeInputTensorsFromTraceContext(TFE_TraceContext* trace_ctx) {
+  if (trace_ctx->input_tensors == nullptr) {
+    trace_ctx->input_tensors =
+        new std::vector<std::pair<tensorflow::TensorHandle*, TF_Output>>();
+    trace_ctx->input_tensors->reserve(trace_ctx->input_tensor_map.size());
+
+    for (auto input : trace_ctx->input_tensor_map) {
+      trace_ctx->input_tensors->emplace_back(input.first, input.second);
+    }
+  }
+  return trace_ctx->input_tensor_map.size();
+}
+
+TF_Output TFE_GetInputGraphNodeFromTraceContext(TFE_TraceContext* trace_ctx,
+                                                unsigned int idx) {
+  CHECK(trace_ctx->input_tensors != nullptr);
+  CHECK(trace_ctx->input_tensors->size() > idx);
+  return trace_ctx->input_tensors->at(idx).second;
+}
+
+TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext(
+    TFE_TraceContext* trace_ctx, unsigned int idx) {
+  CHECK(trace_ctx->input_tensors != nullptr);
+  CHECK(trace_ctx->input_tensors->size() > idx);
+  auto* handle = trace_ctx->input_tensors->at(idx).first;
+  VLOG(1) << "Ref count for internal handle " << handle
+          << " is 1?: " << handle->RefCountIsOne();
+  handle->Ref();
+  auto* ret = new TFE_TensorHandle(handle);
+  VLOG(1) << "Returning a new tensor handle " << ret << ": "
+          << handle->DebugString();
+  return ret;
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index e6d04d0c2b25a3f7b1ebf50c58268f003595a520..48ea0ec1ed78a071b7bf7c858881d943a3ff3acd 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -84,6 +84,15 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_CreateRunOptions(
 TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
                                                       size_t* len);
 
+// Returns the function content in a human-readable format, with length set in
+// `len`. The format is subject to change in the future.
+// The returned string is heap-allocated, and caller should call free() on it.
+//
+// Do not return const char*, because some foreign language binding
+// (e.g. swift) cannot then call free() on the returned pointer.
+TF_CAPI_EXPORT extern char* TF_FunctionDebugString(TF_Function* func,
+                                                   size_t* len);
+
 // Creates a stack of data set + iterator nodes, currently hard-coded to return
 // a sequence of 3 float values <42.0, 43.0, 44.0> over 3 calls. On success,
 // returns the IteratorGetNext node, which caller can run or feed into an node.
@@ -181,6 +190,8 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
+TF_CAPI_EXPORT extern void TFE_OpPrintDebugString(TFE_Op* op);
+
 typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification;
 
 // Allows invoking a kernel asynchronously, and explicitly returns a
@@ -255,6 +266,55 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx,
                                                    const void* proto,
                                                    size_t proto_len,
                                                    TF_Status* status);
+
+// Create a symbolic tensor from the input graph node.
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromTFOutput(
+    TF_Output t, TF_DataType data_type);
+
+// Returns 0 if the input tensor handle represents a symbolic tensor (i.e., a
+// graph node). Otherwise returns non-0.
+TF_CAPI_EXPORT extern unsigned char TFE_TensorHandleIsConcrete(
+    TFE_TensorHandle* handle);
+
+// If `handle` is a symbolic tensor, return the corresponding graph node
+// represented by TF_Output. Otherwise, return an error status.
+TF_CAPI_EXPORT extern TF_Output TFE_GetTFOutputFromTensorHandle(
+    TFE_TensorHandle* handle, TF_Status* status);
+
+typedef struct TFE_TraceContext TFE_TraceContext;
+
+// A trace context contains a trace graph, to which TFE_AddEagerOpToGraph()
+// calls add graph nodes as a way to symbolically execute the eager ops.
+//
+// It also contains a hash map from concrete input tensors to symbolic
+// tensors. That map will be used to create input tensors to the trace graph.
+TF_CAPI_EXPORT extern TFE_TraceContext* TFE_NewTraceContext(TF_Graph* graph);
+
+TF_CAPI_EXPORT extern void TFE_DeleteTraceContext(TFE_TraceContext* trace_ctx);
+
+// Symbolically executes `op`, by adding a corresponding node to the graph
+// associated with `trace_ctx`. This graph node outputs a set of symbolic
+// tensors in `retvals` and `num_retvals`.
+TF_CAPI_EXPORT extern void TFE_AddEagerOpToGraph(TFE_Op* op,
+                                                 TFE_TraceContext* trace_ctx,
+                                                 TFE_TensorHandle** retvals,
+                                                 int* num_retvals,
+                                                 TF_Status* status);
+
+// Finalizes the trace graph and its inputs, and returns the number of inputs.
+// After this call, the next two APIs can be called to iterate over the input
+// tensors.
+TF_CAPI_EXPORT extern int TFE_FinalizeInputTensorsFromTraceContext(
+    TFE_TraceContext* trace_ctx);
+
+TF_CAPI_EXPORT extern TF_Output TFE_GetInputGraphNodeFromTraceContext(
+    TFE_TraceContext* trace_ctx, unsigned int idx);
+
+// Each input tensor should be consumed at most once.
+TF_CAPI_EXPORT extern TFE_TensorHandle*
+TFE_ConsumeInputConcreteTensorFromTraceContext(TFE_TraceContext* trace_ctx,
+                                               unsigned int idx);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index daa7701b7fe7e8ce757b6504329cf6434ad39778..4cfcf2ef3b2ccd9d8aedaf8efa4a31ac12d91c1b 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/c_test_util.h"
 #include "tensorflow/c/eager/c_api.h"
 #include "tensorflow/c/eager/c_api_test_util.h"
@@ -296,5 +297,73 @@ TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI_EXPERIMENTAL, SymbolicTensor) {
+  TF_Status* status = TF_NewStatus();
+  auto node = TF_Output{nullptr, 1};
+  auto* sym_handle = TFE_NewTensorHandleFromTFOutput(node, TF_FLOAT);
+  TFE_TensorHandlePrintDebugString(sym_handle);
+  CHECK_EQ(TFE_TensorHandleDataType(sym_handle), TF_FLOAT);
+  ASSERT_FALSE(TFE_TensorHandleIsConcrete(sym_handle));
+  auto same_node = TFE_GetTFOutputFromTensorHandle(sym_handle, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(same_node.oper, node.oper);
+  ASSERT_EQ(same_node.index, node.index);
+  TFE_DeleteTensorHandle(sym_handle);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  ASSERT_TRUE(TFE_TensorHandleIsConcrete(m));
+  (void)TFE_GetTFOutputFromTensorHandle(m, status);
+  CHECK_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(m);
+
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI_EXPERIMENTAL, DebugPrintAndSymbolicExecution) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* op = MatMulOp(ctx, m, m);
+
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpPrintDebugString(op);
+
+  auto* graph = TF_NewGraph();
+  auto* trace_ctx = TFE_NewTraceContext(graph);
+  TFE_TensorHandle* retvals[5];
+  int num_retvals = 5;
+  // Symbolically execute this op, which adds a graph node to `trace_ctx`.
+  TFE_AddEagerOpToGraph(op, trace_ctx, retvals, &num_retvals, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  int num_inputs = TFE_FinalizeInputTensorsFromTraceContext(trace_ctx);
+  CHECK_EQ(num_inputs, 1);
+  auto input_sym_tensor = TFE_GetInputGraphNodeFromTraceContext(trace_ctx,
+                                                                /*idx*/ 0);
+
+  LOG(INFO) << tensorflow::getTF_OutputDebugString(input_sym_tensor);
+  auto handle = TFE_ConsumeInputConcreteTensorFromTraceContext(trace_ctx,
+                                                               /*idx*/ 0);
+  TFE_TensorHandlePrintDebugString(handle);
+  TFE_DeleteTensorHandle(handle);
+
+  CHECK_EQ(num_retvals, 1);
+  CHECK_EQ(TFE_TensorHandleDataType(retvals[0]), TF_FLOAT);
+  TFE_DeleteTensorHandle(retvals[0]);
+
+  TFE_DeleteTraceContext(trace_ctx);
+  TF_DeleteGraph(graph);
+
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteOp(op);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 5ba26d3c585350aa510f9970cbfc246a9a108543..73283d775639b297857b2a50007dc7c28b1f39a3 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -228,6 +228,8 @@ void RecordMutation(TF_Graph* graph, const TF_Operation& op,
 bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status)
     LOCKS_EXCLUDED(session->graph->mu, session->mu);
 
+std::string getTF_OutputDebugString(TF_Output node);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index d5934a10395ae094f65d3bc8b6cd7b94dbd32410..2be03bf0de6277fc63c353ad6dc63bec096a6993 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -163,6 +163,7 @@ TEST(CAPI, AllocateTensor) {
   EXPECT_EQ(dims[0], TF_Dim(t, 0));
   EXPECT_EQ(dims[1], TF_Dim(t, 1));
   EXPECT_EQ(num_bytes, TF_TensorByteSize(t));
+  EXPECT_EQ(6, TF_TensorElementCount(t));
   TF_DeleteTensor(t);
 }
 
@@ -1467,6 +1468,41 @@ TEST(CAPI, DeletingNullPointerIsSafe) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, TestBitcastFrom_Reshape) {
+  int64_t dims[] = {2, 3};
+  TF_Tensor* a =
+      TF_AllocateTensor(TF_UINT64, dims, 2, 6 * TF_DataTypeSize(TF_UINT64));
+  TF_Tensor* b =
+      TF_AllocateTensor(TF_UINT64, nullptr, 0, TF_DataTypeSize(TF_UINT64));
+  EXPECT_NE(a, nullptr);
+  EXPECT_NE(b, nullptr);
+
+  EXPECT_EQ(6, TF_TensorElementCount(a));
+  EXPECT_EQ(1, TF_TensorElementCount(b));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(a));
+  EXPECT_EQ(TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(b));
+
+  int64_t new_dims[] = {3, 2};
+  TF_Status* status = TF_NewStatus();
+  TF_TensorBitcastFrom(a, TF_UINT64, b, new_dims, 2, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status));
+  TF_DeleteStatus(status);
+
+  EXPECT_EQ(6, TF_TensorElementCount(a));
+  EXPECT_EQ(6, TF_TensorElementCount(b));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(a));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(b));
+
+  // Check that a write to one tensor shows up in the other.
+  *(static_cast<int64_t*>(TF_TensorData(a))) = 4;
+  EXPECT_EQ(4, *(static_cast<int64_t*>(TF_TensorData(b))));
+  *(static_cast<int64_t*>(TF_TensorData(b))) = 6;
+  EXPECT_EQ(6, *(static_cast<int64_t*>(TF_TensorData(a))));
+
+  TF_DeleteTensor(a);
+  TF_DeleteTensor(b);
+}
+
 REGISTER_OP("TestOpWithNoGradient")
     .Input("x: T")
     .Output("y: T")
diff --git a/tensorflow/c/c_test.c b/tensorflow/c/c_test.c
index c0ed5ccd15d9524e2c14630d8ef92f6b3ef9b059..b86d8eb8e300e02a3871ecd5f424a82c521b18fc 100644
--- a/tensorflow/c/c_test.c
+++ b/tensorflow/c/c_test.c
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <limits.h>
-#include <malloc.h>
 #include <memory.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <sys/time.h>
 #include <unistd.h>
 
@@ -32,7 +32,12 @@ void compute(void* kernel, TF_OpKernelContext* ctx) {
   TF_Status* s = TF_NewStatus();
   TF_GetInput(ctx, 0, &input, s);
   TF_DeleteTensor(input);
+
+  TF_DataType type;
+  TF_OpKernelContext_GetAttrType(ctx, "foobar", &type, s);
+
   TF_DeleteStatus(s);
+
 }
 
 // Exercises tensorflow's C API.
@@ -68,6 +73,10 @@ int main(int argc, char** argv) {
   }
   fprintf(stderr, "wrote %s\n", full_path);
   free(full_path);
+  TF_CloseWritableFile(h, status);
+  if (TF_GetCode(status) != TF_OK) {
+    fprintf(stderr, "TF_CloseWritableFile failed: %s\n", TF_Message(status));
+  }
   TF_StringStreamDone(s);
 
   TF_KernelBuilder* b =
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index c34a84fcfee9b6ba9a7be86ae16e2856a2d343c7..04dfefa6da28429b73856d392d94fa402ecda580 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -3,11 +3,19 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cuda_cc_test",
-    "tf_cc_test",
     "tf_copts",
-    "tfe_xla_copts",
+    "tf_cuda_cc_test",
     "tf_cuda_library",
+    "tfe_xla_copts",
+)
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_additional_device_tracer_test_flags",
+    "tf_kernel_tests_linkstatic",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
 )
 
 tf_cuda_library(
@@ -62,6 +70,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:remote_device",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/profiler/lib:eager_profiler",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -101,6 +110,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
+        "//tensorflow/core/profiler/lib:eager_profiler",
     ],
 )
 
@@ -148,6 +158,88 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cuda_library(
+    name = "c_api_experimental",
+    srcs = [
+        "c_api_experimental.cc",
+    ],
+    hdrs = ["c_api_experimental.h"],
+    copts = tf_copts() + tfe_xla_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":c_api_internal",
+            "//tensorflow/c:c_api",
+            "//tensorflow/c:c_api_internal",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:attr_builder",
+            "//tensorflow/core/common_runtime/eager:context",
+            "//tensorflow/core/common_runtime/eager:eager_executor",
+            "//tensorflow/core/common_runtime/eager:execute",
+            "//tensorflow/core/common_runtime/eager:kernel_and_device",
+            "//tensorflow/core/common_runtime/eager:tensor_handle",
+            "//tensorflow/core/common_runtime/eager:copy_to_device_node",
+            "//tensorflow/core:core_cpu_internal",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }) + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+            "//tensorflow/compiler/jit:xla_device",
+        ],
+        "//conditions:default": [],
+    }) + [
+        "@com_google_absl//absl/memory",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:remote_device",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/profiler/rpc:profiler_server",
+        "//tensorflow/core:gpu_runtime",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = [
+        "c_api_experimental_test.cc",
+    ],
+    args =
+        ["--heap_check=local"] + tf_additional_device_tracer_test_flags(),
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags() + ["nomac"],
+    deps = [
+        ":c_api_experimental",
+        ":c_api_test_util",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/cc/profiler",
+        "//tensorflow/contrib/tpu/profiler:trace_events_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "tape",
     hdrs = ["tape.h"],
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 027d752f420238da867cb9d8c116640e1730caaa..af13f487af91594fedd4d5f77592682a6f98c34f 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -356,6 +356,8 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
 
 void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
   if (h == nullptr) return;
+  VLOG(1) << "Deleting tensor handle " << h << " with internal handle "
+          << h->handle;
   if (h->handle) {
     h->handle->Unref();
   }
@@ -443,15 +445,15 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
     return nullptr;
   }
   // TODO(agarwal): move this implementation inside TFE_TensorHandle.
-  tensorflow::Device* d = nullptr;
-  tensorflow::Device* op_device = nullptr;
   const tensorflow::Tensor* t = nullptr;
-  status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
-  if (!status->status.ok()) return nullptr;
   tensorflow::TensorHandle* h_cpu = nullptr;
-  if (!IsCPU(d)) {
-    status->status = h->handle->CopyToDevice(
-        h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+  tensorflow::Device* d = nullptr;
+  tensorflow::Device* op_device = nullptr;
+
+  if (h->handle->IsRemote()) {
+    status->status = EagerCopyToDevice(
+        h->handle, h->handle->Context(),
+        h->handle->Context()->HostCPU()->name().c_str(), &h_cpu);
     if (!status->status.ok()) {
       return nullptr;
     }
@@ -460,6 +462,22 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
       h_cpu->Unref();
       return nullptr;
     }
+  } else {
+    status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
+    if (!status->status.ok()) return nullptr;
+
+    if (!IsCPU(d)) {
+      status->status = h->handle->CopyToDevice(
+          h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+      if (!status->status.ok()) {
+        return nullptr;
+      }
+      status->status = h_cpu->TensorAndDevice(&t, &d, &op_device);
+      if (!status->status.ok()) {
+        h_cpu->Unref();
+        return nullptr;
+      }
+    }
   }
   TF_Tensor* retval = tensorflow::TF_TensorFromTensor(*t, status);
   if (h_cpu != nullptr) {
@@ -696,6 +714,7 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
 
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
+  VLOG(1) << "Calling TFE_Execute() on op " << op;
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
       *num_retvals);
   status->status =
@@ -738,6 +757,10 @@ void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
   status->status = ctx->context.AddFunctionDef(function->fdef);
 }
 
+unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) {
+  return ctx->context.FindFunctionDef(name) != nullptr;
+}
+
 void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
   ctx->context.SetShouldStoreMetadata(true);
 }
@@ -774,7 +797,7 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
   if (!status->status.ok()) return;
   tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
   status->status = MessageToBuffer(*ctx->context.RunMetadataProto(), buf);
-  ctx->context.RunMetadataProto()->Clear();
+  ctx->context.ClearRunMetadata();
 }
 
 namespace {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 120748ab763a3358b6e38e64bb3b6fd2ea32f7c3..044dfb7415b027b707af05a197fdb41fe1f6d2e5 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -393,6 +393,10 @@ TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx,
                                                   TF_Function* function,
                                                   TF_Status* status);
 
+// Checks whether a function is registered under `name`.
+TF_CAPI_EXPORT unsigned char TFE_ContextHasFunction(TFE_Context* ctx,
+                                                    const char* name);
+
 // Enables tracing of RunMetadata on the ops executed from this context.
 TF_CAPI_EXPORT extern void TFE_ContextEnableRunMetadata(TFE_Context* ctx);
 
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index 52b0824552855860dfb138f3ac9a5d3afa7dc965..ffcd5ace0b98597363abe63201bf6c328a03212f 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -83,7 +83,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
       }
     }
 
-    if (xla::ShapeUtil::IsTuple(padded_shape)) {
+    if (padded_shape.IsTuple()) {
       if (xla::ShapeUtil::TupleElementCount(padded_shape) != 2) {
         // Currently, the only case of XlaTensor containing a tuple shape is to
         // represent 64 bit ints, doubles, and complex numbers (we don't support
@@ -99,7 +99,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
       xla::Shape shape0 = xla::ShapeUtil::GetTupleElementShape(padded_shape, 0);
       const xla::Shape& shape1 =
           xla::ShapeUtil::GetTupleElementShape(padded_shape, 1);
-      if (xla::ShapeUtil::IsTuple(shape0) || xla::ShapeUtil::IsTuple(shape1)) {
+      if (shape0.IsTuple() || shape1.IsTuple()) {
         status->status = tensorflow::errors::InvalidArgument(
             "XlaTensors should not contain nested tuples. Shape: ",
             padded_shape.DebugString());
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dab17505643e791e6294a64247898ae23769a055
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/eager/c_api_experimental.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/profiler/rpc/profiler_server.h"
+
+using tensorflow::string;
+
+void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
+  op->operation.ConsumeInput(h->handle);
+}
+
+TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
+  return new TFE_Profiler(ctx);
+}
+
+void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
+
+void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
+                                   TF_Buffer* buf, TF_Status* status) {
+  TFE_ContextAsyncWait(ctx, status);
+  if (!status->status.ok()) return;
+  string content;
+  status->status = profiler->profiler->SerializeToString(&content);
+  void* data = tensorflow::port::Malloc(content.length());
+  content.copy(static_cast<char*>(data), content.length(), 0);
+  buf->data = data;
+  buf->length = content.length();
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
+}
+
+void TFE_StartProfilerServer(TFE_Context* ctx, int port) {
+  auto server_thread = tensorflow::StartProfilerServer(&ctx->context, port);
+  ctx->context.AddChildThread(std::move(server_thread));
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c85d0e51695fde09cf0e2bb3930f9173e6cfb54
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_
+#define TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/eager/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
+                                              TF_Status* status);
+
+// A profiler which will start profiling when creating the object and will stop
+// when the object is destroyed. It will profile all operations run under the
+// given TFE_Context. Multiple instance of it can be created, but at most one
+// of them will profile for each TFE_Context.
+// Thread-safety: TFE_Profiler is thread-safe.
+typedef struct TFE_Profiler TFE_Profiler;
+
+TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
+TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
+
+// The output string is a binary string of tensorflow.tpu.Trace. User can write
+// the string to file for offline analysis by tensorboard.
+TF_CAPI_EXPORT extern void TFE_ProfilerSerializeToString(TFE_Context* ctx,
+                                                         TFE_Profiler* profiler,
+                                                         TF_Buffer* buf,
+                                                         TF_Status* status);
+
+// Start a profiler grpc server which listens to specified port. It will start
+// the server on its own thread. It can be shutdown by destructing TFE_Context.
+// Creating multiple profiler server is allowed. The service defined in
+// tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
+// tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
+// file following
+// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
+TF_CAPI_EXPORT extern void TFE_StartProfilerServer(TFE_Context* ctx, int port);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif  // TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af55fee66e8708e39626da3b10b6dd2f73af92bb
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/eager/c_api_experimental.h"
+
+#include <string.h>
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/cc/profiler/profiler.h"
+#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+using tensorflow::string;
+
+namespace tensorflow {
+namespace {
+
+static bool HasSubstr(absl::string_view base, absl::string_view substr) {
+  bool ok = str_util::StrContains(base, substr);
+  EXPECT_TRUE(ok) << base << ", expected substring " << substr;
+  return ok;
+}
+
+void ExecuteWithProfiling(bool async) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  TFE_Profiler* profiler = TFE_NewProfiler(ctx);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+
+  // Run op on GPU if it is present.
+  string gpu_device_name;
+  if (GetDeviceName(ctx, &gpu_device_name, "GPU")) {
+    TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    const char* device_name = TFE_OpGetDevice(matmul, status);
+    ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr);
+  }
+
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+  TF_Buffer* profiler_result = TF_NewBuffer();
+  TFE_ProfilerSerializeToString(ctx, profiler, profiler_result, status);
+  TFE_DeleteProfiler(profiler);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  tensorflow::tpu::Trace profile_proto;
+  EXPECT_TRUE(profile_proto.ParseFromString(
+      {reinterpret_cast<const char*>(profiler_result->data),
+       profiler_result->length}));
+  string profile_proto_str = profile_proto.DebugString();
+  if (!gpu_device_name.empty()) {
+    EXPECT_TRUE(HasSubstr(profile_proto_str, "GPU:0"));
+    // device name with "stream:all" is collected by Device Tracer.
+    EXPECT_TRUE(HasSubstr(profile_proto_str, "stream:all"));
+  }
+  EXPECT_TRUE(HasSubstr(profile_proto_str, "CPU:0"));
+  TF_DeleteBuffer(profiler_result);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+  TF_DeleteStatus(status);
+}
+TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
+TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 67bc1bcd24605f8363d6a7c8d5d6a0836a42fc82..3b9e681194b7cebc61d9028525d200c692bbd529 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/profiler/lib/eager_profiler.h"
 #include "tensorflow/core/public/version.h"
 
 struct TFE_ContextOptions {
@@ -82,6 +83,12 @@ struct TFE_TensorHandle {
   TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
 
   tensorflow::TensorHandle* handle;
+
+  // Create a symbolic tensor.
+  TFE_TensorHandle(TF_Output t, TF_DataType dtype)
+      : handle(new tensorflow::TensorHandle(
+            tensorflow::OutputGraphNode{t.oper, t.index},
+            static_cast<tensorflow::DataType>(dtype))) {}
 };
 
 struct TFE_TensorDebugInfo {
@@ -100,6 +107,13 @@ struct TFE_Op {
   tensorflow::EagerOperation operation;
 };
 
+struct TFE_Profiler {
+  TFE_Profiler(TFE_Context* ctx)
+      : profiler(tensorflow::EagerProfiler::Create(&ctx->context)) {}
+
+  std::unique_ptr<tensorflow::EagerProfiler> profiler;
+};
+
 namespace tensorflow {
 // Set an AttrValue on the op. Doesn't handle the list types.
 void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
@@ -107,4 +121,24 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
                           const char* attr_name, TF_Status* status);
 }  // namespace tensorflow
 
+struct TFE_TraceContext {
+  TF_Graph* const graph;
+
+  unsigned int node_counter = 0;
+  // Each tensor handle will have its ref count incremented when it's added as a
+  // map key, and decremented when this object is destroyed.
+  std::map<tensorflow::TensorHandle*, TF_Output> input_tensor_map;
+  std::vector<std::pair<tensorflow::TensorHandle*, TF_Output>>* input_tensors =
+      nullptr;
+
+  TFE_TraceContext(TF_Graph* graph) : graph(graph) {}
+
+  ~TFE_TraceContext() {
+    delete input_tensors;
+    for (auto input : input_tensor_map) {
+      input.first->Unref();
+    }
+  }
+};
+
 #endif  // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 6b39b79ee82f9c7baaf856e573a42b7da65691e5..3d1ca4fb4b561a03ea9d879b1876fb1fd08a3139 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -175,13 +175,8 @@ void TestRemoteExecute(bool async) {
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
-  auto* retval_task0 = TFE_TensorHandleCopyToDevice(
-      retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status);
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-
-  TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status);
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  TFE_DeleteTensorHandle(retval_task0);
   float product[4] = {0};
   EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
   memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 2a4eaecb6cf2740a522b1e849d1306ebde6c4577..9505bf9dda32b9a338b574f1d31ec555a5628c6a 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -48,9 +48,10 @@ TF_KernelBuilder* TF_NewKernelBuilder(
 }
 
 void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) {
-  DCHECK_NE(builder, nullptr);
-  delete builder->cc_builder;
-  delete builder;
+  if (builder != nullptr) {
+    delete builder->cc_builder;
+    delete builder;
+  }
 }
 
 namespace tensorflow {
@@ -158,3 +159,44 @@ void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
     cc_ctx->set_output(i, cc_tensor);
   }
 }
+
+void TF_OpKernelConstruction_Failure(TF_OpKernelConstruction* ctx,
+                                     TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx);
+  ::tensorflow::Status s(::tensorflow::StatusFromTF_Status(status));
+  cc_ctx->CtxFailure(s);
+}
+
+void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  ::tensorflow::Status s(::tensorflow::StatusFromTF_Status(status));
+  cc_ctx->CtxFailure(s);
+}
+
+#define DEFINE_TF_GETATTR_(struct_name, func, c_type, cc_type)                 \
+  void struct_name##_GetAttr##func(struct_name* ctx, const char* attr_name,    \
+                                   c_type* val, TF_Status* status) {           \
+    TF_SetStatus(status, TF_OK, "");                                           \
+    cc_type v;                                                                 \
+    auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx); \
+    ::tensorflow::Status s = cc_ctx->GetAttr(attr_name, &v);                   \
+    ::tensorflow::Set_TF_Status_from_Status(status, s);                        \
+    if (s.ok()) {                                                              \
+      *val = static_cast<c_type>(v);                                           \
+    }                                                                          \
+  }
+
+#define DEFINE_TF_GETATTR(func, c_type, cc_type)                     \
+  DEFINE_TF_GETATTR_(TF_OpKernelConstruction, func, c_type, cc_type) \
+  DEFINE_TF_GETATTR_(TF_OpKernelContext, func, c_type, cc_type)
+
+DEFINE_TF_GETATTR(Type, TF_DataType, tensorflow::DataType)
+
+TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return static_cast<TF_DataType>(cc_ctx->expected_output_dtype(i));
+}
+
+int64_t TF_StepId(TF_OpKernelContext* ctx) {
+  return reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->step_id();
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index cefc30bcdf89bdc14a4406299cc29f74153e77ac..b015d0103969355e8566242bfcc007f697c6ae18 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -111,6 +111,41 @@ TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i,
                                         const TF_Tensor* tensor,
                                         TF_Status* status);
 
+// Notifies the given OpKernelConstruction that kernel construction has failed.
+TF_CAPI_EXPORT extern void TF_OpKernelConstruction_Failure(
+    TF_OpKernelConstruction* ctx, TF_Status* status);
+
+// Notifies the given OpKernelContext that the kernel's compute function has
+// failed.
+TF_CAPI_EXPORT extern void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx,
+                                                      TF_Status* status);
+
+// Returns the expected output data type of the ith output. If i < 0 or
+// i >= TF_NumOutputs(ctx), the program aborts.
+TF_CAPI_EXPORT extern TF_DataType TF_ExpectedOutputDataType(
+    TF_OpKernelContext* ctx, int i);
+
+// Returns the step ID of the given context.
+TF_CAPI_EXPORT extern int64_t TF_StepId(TF_OpKernelContext* ctx);
+
+// Interprets the named kernel construction attribute as a TF_DataType and
+// places it into *val. *status is set to TF_OK.
+//
+// If the attribute could not be found or could not be interpreted as
+// TF_DataType, *status is populated with an error.
+TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrType(
+    TF_OpKernelConstruction* ctx, const char* attr_name, TF_DataType* val,
+    TF_Status* status);
+
+// Interprets the named kernel context attribute as a TF_DataType and places it
+// into *val. *status is set to TF_OK.
+//
+// If the attribute could not be found or could not be interpreted as
+// TF_DataType, *status is populated with an error.
+TF_CAPI_EXPORT extern void TF_OpKernelContext_GetAttrType(
+    TF_OpKernelContext* ctx, const char* attr_name, TF_DataType* val,
+    TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index e659ee3c3d258a626ccf03a782ec031b5a703a48..0d2954717e7a83c102a35815809a554e3a917e07 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/kernels.h"
 
 #include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/op.h"
@@ -41,6 +42,19 @@ static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
 static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
   struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
   s->compute_called = true;
+  if (ctx != nullptr) {
+    TF_Status* status = TF_NewStatus();
+
+    EXPECT_EQ(43, TF_StepId(ctx));
+
+    // Exercise attribute reads.
+    TF_DataType type;
+    TF_OpKernelContext_GetAttrType(ctx, "SomeDataTypeAttr", &type, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    EXPECT_EQ(TF_FLOAT, type);
+
+    TF_DeleteStatus(status);
+  }
 }
 
 static void MyDeleteFunc(void* kernel) {
@@ -61,6 +75,11 @@ static std::unique_ptr<OpKernel> GetFakeKernel(const char* device_name,
   def.set_device(device_name);
   def.add_input("input1");
   def.add_input("input2");
+
+  AttrValue v;
+  v.set_type(DataType::DT_FLOAT);
+  (*def.mutable_attr())["SomeDataTypeAttr"] = v;
+
   return CreateOpKernel(DeviceType(device_name), nullptr, nullptr, def, 1,
                         status);
 }
@@ -75,7 +94,8 @@ TEST(TestKernel, TestRegisterKernelBuilder) {
   REGISTER_OP(op_name)
       .Input("input1: double")
       .Input("input2: uint8")
-      .Output("output1: uint8");
+      .Output("output1: uint8")
+      .Attr("SomeDataTypeAttr: type");
 
   TF_KernelBuilder* builder = TF_NewKernelBuilder(
       op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc);
@@ -126,7 +146,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
   REGISTER_OP(op_name)
       .Input("input1: double")
       .Input("input2: uint8")
-      .Output("output1: uint8");
+      .Output("output1: uint8")
+      .Attr("SomeDataTypeAttr: type");
 
   static int num_inputs = 0;
   static int num_outputs = 0;
@@ -155,6 +176,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
     TF_SetOutput(ctx, 24, input, s);
     EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
 
+    EXPECT_EQ(TF_UINT8, TF_ExpectedOutputDataType(ctx, 0));
+
     TF_DeleteStatus(s);
     if (input != nullptr) {
       TF_DeleteTensor(input);
@@ -175,6 +198,7 @@ TEST(TestKernel, TestInputAndOutputCount) {
     OpKernelContext::Params p;
     DummyDevice dummy_device(nullptr, false);
     p.device = &dummy_device;
+    p.step_id = 43;
 
     Tensor t(tensorflow::uint8(123));
 
@@ -200,4 +224,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
   }
 }
 
+TEST(TestKernel, DeleteKernelBuilderIsOkOnNull) {
+  TF_DeleteKernelBuilder(nullptr);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index cf65fe1ab99b49207a64e86310178141b30d07d7..e9838d9aba6554b40082187057851e9c896f8352 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -10,7 +10,7 @@ tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
     tags = [
-        "noguitar",  # b/77649654
+        "nogpu",  # b/77649654
     ],
     deps = [
         ":profiler",
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 85d3dd01fa51b3c3ba6fcbf5faac03f1ff5630e2..10f7abf09e925c0c31cfd595ecee4605f189476f 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/reader.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/monitoring/sampler.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
@@ -42,9 +43,28 @@ auto* load_latency = monitoring::Counter<1>::New(
     "/tensorflow/cc/saved_model/load_latency",
     "Latency in microseconds for SavedModels that were successfully loaded.",
     "model_path");
+auto* load_latency_by_stage = monitoring::Sampler<2>::New(
+    {
+        "/tensorflow/cc/saved_model/load_latency_by_stage",  // metric name
+        "Distribution of wall time spent (in microseconds) in each stage "
+        "(restore graph from disk, run init graph op, etc) when loading the "
+        "model",
+        "model_path",
+        "stage",
+    },
+    // Scale of 10, power of 1.8 with bucket count 33 (~20 minutes).
+    monitoring::Buckets::Exponential(10, 1.8, 33));
+
 constexpr char kLoadAttemptFail[] = "fail";
 constexpr char kLoadAttemptSuccess[] = "success";
 
+uint64 GetLatencyMicroseconds(const uint64 start_microseconds) {
+  const uint64 end_microseconds = Env::Default()->NowMicros();
+  // Avoid clock skew.
+  if (end_microseconds < start_microseconds) return 0;
+  return end_microseconds - start_microseconds;
+}
+
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
                                 const SessionOptions& session_options,
                                 std::unique_ptr<Session>* session) {
@@ -242,6 +262,7 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
                               const string& export_dir,
                               const std::unordered_set<string>& tags,
                               SavedModelBundle* const bundle) {
+  const uint64 read_start_microseconds = Env::Default()->NowMicros();
   TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(export_dir, tags,
                                                     &bundle->meta_graph_def));
 
@@ -256,12 +277,23 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
                  bundle->meta_graph_def.saver_def().restore_op_name(),
                  bundle->meta_graph_def.saver_def().filename_tensor_name(),
                  asset_file_defs, bundle->session.get()));
+  // Record walltime spent in restoring graph from disk, but postpone metric
+  // increments until graph init finishes.
+  const uint64 restore_graph_walltime =
+      GetLatencyMicroseconds(read_start_microseconds);
+
+  const uint64 graph_init_start_microseconds = Env::Default()->NowMicros();
   string init_op_name;
   TF_RETURN_IF_ERROR(
       GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name));
   TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def,
                                asset_file_defs, bundle->session.get(),
                                init_op_name));
+  load_latency_by_stage->GetCell(export_dir, "restore_graph")
+      ->Add(restore_graph_walltime);
+  // Record wall time spent in init op.
+  load_latency_by_stage->GetCell(export_dir, "init_graph")
+      ->Add(GetLatencyMicroseconds(graph_init_start_microseconds));
   return Status::OK();
 }
 
@@ -275,16 +307,10 @@ Status LoadSavedModel(const SessionOptions& session_options,
   const uint64 start_microseconds = Env::Default()->NowMicros();
   const Status status = LoadSavedModelInternal(session_options, run_options,
                                                export_dir, tags, bundle);
-  const uint64 load_latency_microsecs = [&]() -> uint64 {
-    const uint64 end_microseconds = Env::Default()->NowMicros();
-    // Avoid clock skew.
-    if (end_microseconds < start_microseconds) return 0;
-    return end_microseconds - start_microseconds;
-  }();
   auto log_and_count = [&](const string& status_str) {
     LOG(INFO) << "SavedModel load for tags { " << str_util::Join(tags, " ")
               << " }; Status: " << status_str << ". Took "
-              << load_latency_microsecs << " microseconds.";
+              << GetLatencyMicroseconds(start_microseconds) << " microseconds.";
     load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1);
   };
   if (status.ok()) {
@@ -292,7 +318,8 @@ Status LoadSavedModel(const SessionOptions& session_options,
   } else {
     log_and_count(kLoadAttemptFail);
   }
-  load_latency->GetCell(export_dir)->IncrementBy(load_latency_microsecs);
+  load_latency->GetCell(export_dir)
+      ->IncrementBy(GetLatencyMicroseconds(start_microseconds));
   return status;
 }
 
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index b966c22b2319aef3b87ef54a283911718d37cf84..9549a71c41a0ba2aac58abd8cfb182aa4eaf3b4f 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -28,7 +28,8 @@ from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
-    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v1.estimator'))
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow.python.keras.api._v1.keras'))
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index ab1c1be344e2257721507543bc7647d4ff4becb2..d016632da2a9d7c2c2f81c02dd573787a0502923 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -129,7 +129,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape,
   TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type));
   std::vector<string> dim_vars;
   string dim_sizes, indices;
-  if (xla::ShapeUtil::Rank(shape) == 0 ||
+  if (shape.rank() == 0 ||
       (shape.dimensions_size() == 1 && shape.dimensions(0) == 1)) {
     dim_sizes = "[1]";
     indices = "[0]";
@@ -384,8 +384,9 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
   // calling HloProfilePrinter::profile_counters_size.
   const string assign_profile_counters_size =
       opts.gen_hlo_profile_printer_data
-          ? "data->set_profile_counters_size("
-            "data->hlo_profile_printer_data()->profile_counters_size());"
+          ? "set_static_data_profile_counters_size(data, "
+            "get_static_data_hlo_profile_printer_data(data)->"
+            "profile_counters_size());"
           : "";
 
   // Use a poor-man's text templating mechanism; first populate the full header
@@ -449,7 +450,7 @@ extern "C" void {{ENTRY}}(
 //   arg bytes aligned:  {{ARG_BYTES_ALIGNED}}
 //   temp bytes total:   {{TEMP_BYTES_TOTAL}}
 //   temp bytes aligned: {{TEMP_BYTES_ALIGNED}}
-class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
+class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
   static constexpr size_t kNumArgs = {{ARG_NUM}};
@@ -464,16 +465,17 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
     static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
       XlaCompiledCpuFunction::StaticData* data =
         new XlaCompiledCpuFunction::StaticData;
-      data->set_raw_function({{ENTRY}});
-      data->set_buffer_infos(BufferInfos());
-      data->set_num_buffers(kNumBuffers);
-      data->set_arg_index_table(ArgIndexToBufferIndex());
-      data->set_num_args(kNumArgs);
-      data->set_result_index(kResultIndex);
-      data->set_arg_names(StaticArgNames());
-      data->set_result_names(StaticResultNames());
-      data->set_program_shape(StaticProgramShape());
-      data->set_hlo_profile_printer_data(StaticHloProfilePrinterData());
+      set_static_data_raw_function(data, {{ENTRY}});
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
 {{ASSIGN_PROFILE_COUNTERS_SIZE}}
       return data;
     }();
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 968afad65ed6d4b5510687df484b7ce6743f6a85..35994fc785d3e1d5e883c49bec96de315e189d2e 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -59,7 +59,7 @@ namespace bar {
 //   arg bytes aligned:  192
 //   temp bytes total:   126
 //   temp bytes aligned: 320
-class MyClass : public tensorflow::XlaCompiledCpuFunction {
+class MyClass final : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
   static constexpr size_t kNumArgs = 2;
@@ -74,16 +74,17 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
     static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
       XlaCompiledCpuFunction::StaticData* data =
         new XlaCompiledCpuFunction::StaticData;
-      data->set_raw_function(entry_point);
-      data->set_buffer_infos(BufferInfos());
-      data->set_num_buffers(kNumBuffers);
-      data->set_arg_index_table(ArgIndexToBufferIndex());
-      data->set_num_args(kNumArgs);
-      data->set_result_index(kResultIndex);
-      data->set_arg_names(StaticArgNames());
-      data->set_result_names(StaticResultNames());
-      data->set_program_shape(StaticProgramShape());
-      data->set_hlo_profile_printer_data(StaticHloProfilePrinterData());
+      set_static_data_raw_function(data, entry_point);
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
 
       return data;
     }();
@@ -256,7 +257,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   static const xla::ProgramShapeProto* StaticProgramShape() {
     static const xla::ProgramShapeProto* kShape = []() {
     xla::ProgramShapeProto* proto = new xla::ProgramShapeProto;
-    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 52);
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 64);
     return proto;
   }();
     return kShape;
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
index ce8e5ec8c96a2c3696f14b8eea206d648182ecb5..7f7b96428572705f30144e6c95cd4cf9c44ce2a3 100644
Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 64b861a73091642b03573543a5c55618bf33915d..7bac79ec062af7e790134286e34eda4e123e138a 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -50,7 +50,7 @@ def tfadd_with_ckpt(out_dir):
   y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
-  init_op = variables.initialize_all_variables()
+  init_op = variables.global_variables_initializer()
   saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
   with session.Session() as sess:
     sess.run(init_op)
@@ -65,7 +65,7 @@ def tfadd_with_ckpt_saver(out_dir):
   y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
-  init_op = variables.initialize_all_variables()
+  init_op = variables.global_variables_initializer()
   saver = saver_lib.Saver(name='abcprefix', write_version=saver_pb2.SaverDef.V1)
   with session.Session() as sess:
     sess.run(init_op)
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 4051664c24cacad4a2d151ad3ac9009015900609..2abe3e29b78dbbe719637b13418704acc213d050 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -207,7 +207,7 @@ def tf_library(
         #
         # Note that setting the local=1 attribute on a *test target* causes the
         # test infrastructure to skip that test.  However this is a genrule, not
-        # a test target, and runs with --genrule_strategy=forced_forge, meaning
+        # a test target, and runs with --strategy=Genrule=forced_forge, meaning
         # the local=1 attribute is ignored, and the genrule is still run.
         #
         # https://www.bazel.io/versions/master/docs/be/general.html#genrule
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index d548de8c44285f6d21dd778db464a31e1b19645b..0b6ab7e723d6e3a55da2f1c30b75f44cbdaa75bb 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -136,6 +136,10 @@ int main(int argc, char** argv) {
 
   tensorflow::string usage = tensorflow::tfcompile::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
+  if (argc > 1 && absl::string_view(argv[1]) == "--help") {
+    std::cerr << usage << "\n";
+    return 0;
+  }
   bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   QCHECK(parsed_flags_ok) << "\n" << usage;
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index b9a87ba296abfc6b9d9aaeff3b3e26678e4e1b94..55e2e6d11f7a0984d2e1a40990c3d3db85bd1ff4 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -175,12 +175,18 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:stream_pool",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:resource_variable_ops_op_lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:constant_op",
@@ -634,10 +640,10 @@ tf_cc_test(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/grappler/optimizers/data:graph_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index 48a23a4c1711ac88a329723c46559112d5a39dbd..390ffa694b6f127544d92f3024a02d877556aacd 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/jit/node_matchers.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 0562838f628c66b1eb03af9d2a5139c01dca31c5..0ef0d3db8c16e4b3f78d29aad5a2ae75a81d96f6 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -20,7 +20,10 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
@@ -222,29 +225,40 @@ class NotPredicate : public Predicate {
   std::array<Predicate*, 1> operands_;
 };
 
-// Represents an infinite list of predicates.
+// Represents the liveness of an induction variable.  For users inside the loop
+// this represents the "current" liveness of the induction variable.  For users
+// outside the loop it represents the "last" liveness of the induction variable.
 //
-// An AndRecurrence with start = S and step = X is printed as {S,&,X} and stands
-// for the list of predicates:
+// More concretely, an and recurrence {S,&,X}<loop> represents the liveness of V
+// in the following graph:
 //
-//   S, S & GenSym(X,1), S & GenSym(X,1) & GenSym(X,2), ...
+//   V = Merge(S', V_NextIt)
+//   V = Op(V, X')
+//   V_NextIt = NextIteration(V)
 //
-// where GenSym(<expression>, <id>) renames every SymbolPredicate in
-// <expression> by appending <id> to it, in effect creating a "fresh" symbol.
-// This means {P,&,Q} is not equal to "P on the first iteration; P&Q on
-// subsequent iterations".
+// where Predicate(S') = S and Predicate(X') = X.
+//
+// `X` may contain symbolic predicates and the operations corresponding to these
+// symbolic predicates are either in frame `loop` or outside it.  The symbols
+// that are inside frame `loop` are loop variant (i.e. can have different
+// liveness in each loop iteration) and the symbols that are outside frame
+// `loop` are loop invariant (i.e. have the same liveness across all
+// iterations).
 class AndRecurrencePredicate : public Predicate {
  public:
-  explicit AndRecurrencePredicate(Predicate* start, Predicate* step)
-      : Predicate(HashPredicateSequence(Kind::kAndRecurrence, {start, step})),
-        operands_({start, step}) {}
+  explicit AndRecurrencePredicate(Predicate* start, Predicate* step,
+                                  std::vector<string> frame)
+      : Predicate(Hash(start, step, frame)),
+        operands_({start, step}),
+        frame_(std::move(frame)) {}
 
   Predicate* start() const { return operands_[0]; }
   Predicate* step() const { return operands_[1]; }
+  absl::Span<const string> frame() const { return frame_; }
 
   string ToString() const override {
     return absl::StrCat("{", start()->ToString(), ",&,", step()->ToString(),
-                        "}");
+                        "}<", absl::StrJoin(frame(), ";"), ">");
   }
 
   Kind kind() const override { return Kind::kAndRecurrence; }
@@ -255,6 +269,17 @@ class AndRecurrencePredicate : public Predicate {
 
  private:
   std::array<Predicate*, 2> operands_;
+  std::vector<string> frame_;
+
+  static int64 Hash(Predicate* start, Predicate* step,
+                    const std::vector<string>& frame) {
+    uint64 frame_hash = 0;
+    for (const string& sub_frame : frame) {
+      frame_hash = Hash64Combine(Hash64(sub_frame), frame_hash);
+    }
+    return Hash64Combine(
+        HashPredicateSequence(Kind::kAndRecurrence, {start, step}), frame_hash);
+  }
 };
 
 // Represents an uninterpreted symbol in a logical predicate.
@@ -281,7 +306,7 @@ class SymbolPredicate : public Predicate {
   // "tensor_id() is live and evaluates to true".
   //
   // If `must_be_true()` is false then this SymbolPredicate represents the
-  // proposition "tensor_id() is live (and may evalutate to any value)"
+  // proposition "tensor_id() is live (and may evaluate to any value)"
   TensorId tensor_id() const { return tensor_id_; }
   bool must_be_true() const { return must_be_true_; }
 
@@ -333,34 +358,58 @@ class PredicateFactory {
   }
 
   Predicate* MakeNotPredicate(Predicate* pred) {
-    SignatureForNot signature = pred;
-    auto it = interned_not_instances_.find(signature);
-    if (it == interned_not_instances_.end()) {
-      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
-      Predicate* new_pred_ptr = new_pred.get();
-      interned_not_instances_.emplace(signature, std::move(new_pred));
-      return new_pred_ptr;
-    } else {
-      return it->second.get();
+    auto it = make_not_predicate_cache_.find(pred);
+    if (it != make_not_predicate_cache_.end()) {
+      return it->second;
     }
+
+    Predicate* result = MakeNotPredicateImpl(pred);
+
+    bool insert_successful =
+        make_not_predicate_cache_.insert({pred, result}).second;
+    (void)insert_successful;
+    DCHECK(insert_successful);
+
+    return result;
   }
 
-  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step) {
-    auto it = interned_and_rec_instances_.find({start, step});
+  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step,
+                                        std::vector<string> frame) {
+    SignatureForAndRec signature(start, step, std::move(frame));
+    auto it = interned_and_rec_instances_.find(signature);
     if (it != interned_and_rec_instances_.end()) {
       return it->second.get();
     }
 
-    std::unique_ptr<Predicate> new_pred =
-        Make<AndRecurrencePredicate>(start, step);
+    std::unique_ptr<Predicate> new_pred = Make<AndRecurrencePredicate>(
+        std::get<0>(signature), std::get<1>(signature), std::get<2>(signature));
     Predicate* new_pred_ptr = new_pred.get();
-    CHECK(interned_and_rec_instances_
-              .emplace(SignatureForAndRec(start, step), std::move(new_pred))
-              .second);
+    bool inserted =
+        interned_and_rec_instances_.emplace(signature, std::move(new_pred))
+            .second;
+    (void)inserted;
+    DCHECK(inserted);
     return new_pred_ptr;
   }
 
-  Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) {
+  Status MakeSymbolPredicate(Node* node, int output_idx, bool must_be_true,
+                             Predicate** predicate) {
+    TensorId tensor_id(node->name(), output_idx);
+
+    bool is_boolean_tensor = node->output_type(tensor_id.index()) == DT_BOOL;
+    TF_RET_CHECK(!must_be_true || is_boolean_tensor);
+
+    if (node->type_string() == "Const" && must_be_true) {
+      const TensorProto* proto = nullptr;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "value", &proto));
+
+      Tensor tensor(proto->dtype());
+      TF_RET_CHECK(tensor.FromProto(*proto));
+
+      *predicate = tensor.scalar<bool>()() ? MakeTrue() : MakeFalse();
+      return Status::OK();
+    }
+
     SignatureForSymbol signature = {tensor_id, must_be_true};
     auto it = interned_symbol_instances_.find(signature);
     if (it == interned_symbol_instances_.end()) {
@@ -369,16 +418,63 @@ class PredicateFactory {
       Predicate* new_pred_ptr = new_pred.get();
       interned_symbol_instances_.emplace(std::move(signature),
                                          std::move(new_pred));
-      return new_pred_ptr;
+      *predicate = new_pred_ptr;
     } else {
-      return it->second.get();
+      *predicate = it->second.get();
     }
+
+    return Status::OK();
   }
 
   Predicate* MakeTrue() { return MakeAndPredicate({}); }
   Predicate* MakeFalse() { return MakeOrPredicate({}); }
 
+  ~PredicateFactory() {
+    DCHECK_EQ(stack_depth_, 0) << "Unnested IncrementStackDepth?";
+  }
+
  private:
+  Predicate* MakeNotPredicateImpl(Predicate* pred) {
+    IncrementStackDepth stack_frame(this);
+    if (!stack_frame.HasOverflowed()) {
+      if (Predicate* simplified = SimplifyUsingDeMorgan(pred)) {
+        return simplified;
+      }
+
+      // ~~A => A
+      if (auto* not_pred = dynamic_cast<NotPredicate*>(pred)) {
+        return not_pred->operand();
+      }
+    }
+
+    SignatureForNot signature = pred;
+    auto it = interned_not_instances_.find(signature);
+    if (it == interned_not_instances_.end()) {
+      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
+      Predicate* new_pred_ptr = new_pred.get();
+      interned_not_instances_.emplace(signature, std::move(new_pred));
+      return new_pred_ptr;
+    } else {
+      return it->second.get();
+    }
+  }
+
+  Predicate* SimplifyUsingDeMorgan(Predicate* pred) {
+    // ~(A & B & C & ...) => ~A | ~B | ~C | ~...
+    // ~(A | B | C | ...) -> ~A & ~B & ~C & ~...
+    Predicate::Kind kind = pred->kind();
+
+    if (kind == Predicate::Kind::kAnd || kind == Predicate::Kind::kOr) {
+      std::vector<Predicate*> new_operands;
+      absl::c_transform(pred->GetOperands(), std::back_inserter(new_operands),
+                        [&](Predicate* p) { return MakeNotPredicate(p); });
+      return kind == Predicate::Kind::kOr ? MakeAndPredicate(new_operands)
+                                          : MakeOrPredicate(new_operands);
+    }
+
+    return nullptr;
+  }
+
   template <typename PredicateT, typename... Args>
   std::unique_ptr<Predicate> Make(Args&&... args) {
     return std::unique_ptr<PredicateT>(
@@ -402,7 +498,8 @@ class PredicateFactory {
   using SignatureForAndOr =
       std::pair<Predicate::Kind, absl::Span<Predicate* const>>;
   using SignatureForNot = Predicate*;
-  using SignatureForAndRec = std::pair<Predicate*, Predicate*>;
+  using SignatureForAndRec =
+      std::tuple<Predicate*, Predicate*, std::vector<string>>;
   using SignatureForSymbol = std::pair<SafeTensorId, bool>;
 
   struct HashSignatureForAndOr {
@@ -422,6 +519,36 @@ class PredicateFactory {
     }
   };
 
+  // Used to limit recursion to avoid blowing up the stack and cap compile time.
+  class IncrementStackDepth {
+   public:
+    explicit IncrementStackDepth(PredicateFactory* parent) : parent_(parent) {
+      parent_->stack_depth_++;
+    }
+
+    bool HasOverflowed() const {
+      const int kMaxStackDepth = 8;
+      return parent_->stack_depth_ >= kMaxStackDepth;
+    }
+
+    ~IncrementStackDepth() { parent_->stack_depth_--; }
+
+   private:
+    PredicateFactory* parent_;
+  };
+
+  // A cache for the MakeNotPredicate function.
+  //
+  // NB! This is *not* the same as `interned_not_instances_`.
+  // `interned_not_instances_` maps ensures pointer identity for `NotPredicate`
+  // instances, i.e., it ensures there at most one instance of Not(predicate)
+  // for any given predicate whereas `make_not_predicate_cache_` simply caches
+  // the result of the `MakeNotPredicate` function.  The values in
+  // `interned_not_instances_` are always instance of `NotPredicate` whereas the
+  // values in `make_not_predicate_cache_` may not be (for instance it will map
+  // Not(Not(A)) to A).
+  absl::flat_hash_map<Predicate*, Predicate*> make_not_predicate_cache_;
+
   absl::flat_hash_map<SignatureForAndOr, std::unique_ptr<Predicate>,
                       HashSignatureForAndOr>
       interned_and_or_instances_;
@@ -432,6 +559,7 @@ class PredicateFactory {
   absl::flat_hash_map<SignatureForSymbol, std::unique_ptr<Predicate>,
                       HashSignatureForSymbol>
       interned_symbol_instances_;
+  int stack_depth_ = 0;
 };
 
 Predicate* PredicateFactory::MakeInternedAndOr(
@@ -466,6 +594,13 @@ Predicate* PredicateFactory::MakeAndOrImpl(
     absl::Span<Predicate* const> operands, bool is_and) {
   Predicate::Kind pred_kind =
       is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr;
+
+  IncrementStackDepth stack_frame(this);
+  if (stack_frame.HasOverflowed()) {
+    return MakeInternedAndOr(
+        std::vector<Predicate*>(operands.begin(), operands.end()), pred_kind);
+  }
+
   Predicate::Kind other_pred_kind =
       is_and ? Predicate::Kind::kOr : Predicate::Kind::kAnd;
   absl::flat_hash_set<Predicate*> simplified_ops_set;
@@ -494,16 +629,31 @@ Predicate* PredicateFactory::MakeAndOrImpl(
 
   // Simplify "A&~A=>False" and "A|~A=>True".
   absl::flat_hash_set<Predicate*> negated_ops;
-  for (Predicate* op : simplified_ops) {
-    if (op->kind() == Predicate::Kind::kNot) {
-      negated_ops.insert(dynamic_cast<NotPredicate&>(*op).operand());
-    }
-  }
-
   for (Predicate* op : simplified_ops) {
     if (negated_ops.count(op)) {
+      // Simple case:
+      //
+      //   A & ~A & ... == False
+      //   A | ~A | ... == True
       return is_and ? MakeFalse() : MakeTrue();
     }
+
+    Predicate* negated_op = MakeNotPredicate(op);
+    if (negated_op->kind() == pred_kind) {
+      // Slightly more complicated case:
+      //
+      //   (~A | ~B | ~C) & A & B & C & ... ==
+      //   ~(A & B & C) & (A & B & C) & ... == False
+      //
+      //   (~A & ~B & ~C) | A | B | C | ... ==
+      //   ~(A | B | C) | (A | B | C) | ... == True
+      if (absl::c_all_of(negated_op->GetOperands(), [&](Predicate* p) {
+            return simplified_ops_set.contains(p);
+          })) {
+        return is_and ? MakeFalse() : MakeTrue();
+      }
+    }
+    negated_ops.insert(negated_op);
   }
 
   // If all ops contain the same subop, then factor it out thanks to the
@@ -619,6 +769,7 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   const Graph& graph_;
   absl::flat_hash_map<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
   PredicateFactory predicate_factory_;
+  std::vector<ControlFlowInfo> control_flow_info_;
   bool vlog_;
 };
 
@@ -661,9 +812,12 @@ Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
   TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
   const Edge* pred_edge;
   TF_RETURN_IF_ERROR(n->input_edge(1, &pred_edge));
-  Predicate* true_switch = predicate_factory_.MakeSymbolPredicate(
-      TensorId(pred_edge->src()->name(), pred_edge->src_output()),
-      /*must_be_true=*/true);
+
+  Predicate* true_switch;
+  TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+      pred_edge->src(), pred_edge->src_output(),
+      /*must_be_true=*/true, &true_switch));
+
   Predicate* false_switch = predicate_factory_.MakeNotPredicate(true_switch);
 
   // Output 0 is alive iff all inputs are alive and the condition is false.
@@ -761,6 +915,23 @@ Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory,
 
   return found_sym ? predicate_factory->MakeAndPredicate(and_ops) : nullptr;
 }
+
+Status GetFullFrame(const Node* n, absl::Span<const ControlFlowInfo> cfi_infos,
+                    std::vector<string>* frame) {
+  int depth = 0;
+  for (const ControlFlowInfo* cfi_iter = &cfi_infos[n->id()]; !n->IsSource();
+       n = cfi_iter->parent_frame, cfi_iter = &cfi_infos[n->id()]) {
+    frame->push_back(cfi_iter->frame_name);
+
+    if (depth++ > 5000) {
+      return errors::Internal(
+          "Frame of depth > 5000:  Probably malformed graph or a bug in "
+          "BuildControlFlowInfo");
+    }
+  }
+
+  return Status::OK();
+}
 }  // namespace
 
 Status DeadnessAnalysisImpl::HandleMerge(Node* n,
@@ -783,8 +954,10 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
     if (has_unvisited_backedge) {
       // We're visiting this merge for the first time and it has an unvisited
       // backedge.
-      Predicate* input_data_pred = predicate_factory_.MakeSymbolPredicate(
-          TensorId(n->name(), 0), /*must_be_true=*/false);
+      Predicate* input_data_pred;
+      TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+          n, /*output_idx=*/0, /*must_be_true=*/false, &input_data_pred));
+
       SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred,
                    should_revisit);
       return Status::OK();
@@ -825,8 +998,10 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
 
         Predicate* start =
             predicate_factory_.MakeOrPredicate(non_recurrent_inputs);
-        Predicate* and_rec =
-            predicate_factory_.MakeAndRecurrencePredicate(start, step);
+        std::vector<string> frame;
+        TF_RETURN_IF_ERROR(GetFullFrame(n, control_flow_info_, &frame));
+        Predicate* and_rec = predicate_factory_.MakeAndRecurrencePredicate(
+            start, step, std::move(frame));
         SetPredicate(n, {0, 1, Graph::kControlSlot}, and_rec, should_revisit);
         return Status::OK();
       }
@@ -841,8 +1016,10 @@ Status DeadnessAnalysisImpl::HandleRecv(Node* n,
   // acquire a dead signal from a _Send.
   std::vector<Predicate*> input_preds;
   TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
-  input_preds.push_back(predicate_factory_.MakeSymbolPredicate(
-      TensorId(n->name(), 0), /*must_be_true=*/false));
+  Predicate* signal_is_alive;
+  TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+      n, /*output_idx=*/0, /*must_be_true=*/false, &signal_is_alive));
+  input_preds.push_back(signal_is_alive);
   SetPredicate(n, {0, Graph::kControlSlot},
                predicate_factory_.MakeAndPredicate(input_preds),
                should_revisit);
@@ -892,6 +1069,24 @@ Status DeadnessAnalysisImpl::Populate() {
 
 Status DeadnessAnalysisImpl::PopulateWithReversePostOrder(
     absl::Span<Node* const> rpo) {
+  std::vector<string> unreachable_nodes;
+  // Compute the loop structure of the graph.
+  TF_RETURN_IF_ERROR(
+      BuildControlFlowInfo(&graph_, &control_flow_info_, &unreachable_nodes));
+
+  // Do some opportunistic error checking:
+  if (!unreachable_nodes.empty()) {
+    if (unreachable_nodes.size() > 5) {
+      unreachable_nodes.erase(unreachable_nodes.begin() + 5,
+                              unreachable_nodes.end());
+    }
+
+    return errors::InvalidArgument(
+        "Found unreachable nodes, most likely source and sink nodes not "
+        "connected: ",
+        absl::StrJoin(unreachable_nodes, ", "));
+  }
+
   // This an abstract interpretation over the deadness propagation semantics of
   // the graph executor.
   //
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 8a73101c184e6190921fd7729742922bd96f4bcf..16ee8f86d55c72785368ac2fd67635eba2fa7cd7 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -123,10 +123,9 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   Output increment_by = ops::Const(root.WithOpName(prefix + "/incr"), 1);
   Output final_value = ops::Const(root.WithOpName(prefix + "/final"), 10);
   Output loop_cond_expr =
-      ops::Less(root.WithOpName(prefix + "/less"), iv.output, final_value);
-  Output loop_cond =
-      ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
-  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
+      ops::Less(root.WithOpName(prefix + "/cond"), iv.output, final_value);
+  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output,
+                    loop_cond_expr);
   ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
                            latch.output_false);
   Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
@@ -140,7 +139,7 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   root.graph()->AddControlEdge(iv.output.node(), increment_by.node());
   root.graph()->AddControlEdge(iv.output.node(), final_value.node());
 
-  return {iv.output, loop_cond};
+  return {iv.output, loop_cond_expr};
 }
 
 InductionVarInfo CreateInductionVariable(const Scope& root,
@@ -515,24 +514,27 @@ TEST(DeadnessAnalysisTest, Loop) {
 
     // In theory we should be able to tell that iv0/cond:0 and iv1/cond:0
     // produce the same deadness.  But we're not that smart today.
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)], "{#true,&,*iv0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)], "{#true,&,*iv1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)], "{#true,&,*iv2/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)],
+              "{#true,&,*iv0/cond:0}<fr0>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)],
+              "{#true,&,*iv1/cond:0}<fr0>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)],
+              "{#true,&,*iv2/cond:0}<fr0>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({#true,&,*iv1/cond:0} & {#true,&,*iv0/cond:0})");
+              "({#true,&,*iv1/cond:0}<fr0> & {#true,&,*iv0/cond:0}<fr0>)");
     EXPECT_EQ(predicate_map[ControlOutputFor(add1)],
-              "({#true,&,*iv1/cond:0} & {#true,&,*iv2/cond:0})");
+              "({#true,&,*iv1/cond:0}<fr0> & {#true,&,*iv2/cond:0}<fr0>)");
   }
 }
 
 TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "loop", 0);
   Output dependent_iv0 =
-      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "div0", "loop", iv.loop_cond, 0)
           .induction_var;
   Output dependent_iv1 =
-      CreateDependentLoopInvariantValue(root, "div1", "frame", iv.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "div1", "loop", iv.loop_cond, 0)
           .induction_var;
   Output add0 = ops::Add(root.WithOpName("add0"), dependent_iv0, dependent_iv1);
 
@@ -549,13 +551,13 @@ TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
     EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)],
-              "{#true,&,*iv0/cond:0}");
+              "{#true,&,*iv0/cond:0}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}<loop>");
   }
 }
 
@@ -595,32 +597,33 @@ TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) {
 TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
   InductionVarInfo iv_outer =
-      CreateInductionVariable(root, "iv_outer", "frame", 0);
+      CreateInductionVariable(root, "iv_outer", "outer_loop", 0);
+  Output enter_constant_outer_loop = ops::internal::Enter(
+      root.WithOpName("constant_enter_outer_loop"),
+      ops::Const(root.WithOpName("constant"), 5), "outer_loop",
+      ops::internal::Enter::Attrs().IsConstant(true));
   ops::Switch inner_value(root.WithOpName("outer_is_live"),
-                          ops::Const(root.WithOpName("constant"), 5),
-                          iv_outer.loop_cond);
+                          enter_constant_outer_loop, iv_outer.loop_cond);
   InductionVarInfo iv_inner = CreateInductionVariable(
-      root, "iv_inner", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner/enter"),
-                           inner_value.output_true, "frame_inner"));
+      root, "iv_inner", "inner_loop", inner_value.output_true);
 
   Output dependent_outer_iv0 =
-      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0", "frame",
-                                        iv_outer.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0",
+                                        "outer_loop", iv_outer.loop_cond, 0)
           .induction_var;
   Output dependent_outer_iv1 =
-      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1", "frame",
-                                        iv_outer.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1",
+                                        "outer_loop", iv_outer.loop_cond, 0)
           .induction_var;
 
-  Output dependent_inner_iv0 =
-      CreateDependentLoopInvariantValue(root, "dependent_inner_iv0", "frame",
-                                        iv_inner.loop_cond, dependent_outer_iv0)
-          .induction_var;
-  Output dependent_inner_iv1 =
-      CreateDependentLoopInvariantValue(root, "dependent_inner_iv1", "frame",
-                                        iv_inner.loop_cond, dependent_outer_iv1)
-          .induction_var;
+  Output dependent_inner_iv0 = CreateDependentLoopInvariantValue(
+                                   root, "dependent_inner_iv0", "inner_loop",
+                                   iv_inner.loop_cond, dependent_outer_iv0)
+                                   .induction_var;
+  Output dependent_inner_iv1 = CreateDependentLoopInvariantValue(
+                                   root, "dependent_inner_iv1", "inner_loop",
+                                   iv_inner.loop_cond, dependent_outer_iv1)
+                                   .induction_var;
 
   Output add0 = ops::Add(root.WithOpName("add0"), dependent_inner_iv0,
                          dependent_inner_iv1);
@@ -638,46 +641,50 @@ TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)],
-              "{#true,&,*iv_outer/cond:0}");
+              "{#true,&,*iv_outer/cond:0}<outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)],
-              "{(*iv_outer/cond:0 & {#true,&,*iv_outer/cond:0}),&,"
-              "*iv_inner/cond:0}");
+              "{({#true,&,*iv_outer/cond:0}<outer_loop> & "
+              "*iv_outer/cond:0),&,*iv_inner/cond:0}<inner_loop;outer_loop>");
 
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(*iv_inner/cond:0 & "
+              "iv_inner/iv:0)}<inner_loop;outer_loop>");
+
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(*iv_inner/cond:0 & "
+              "iv_inner/iv:0)}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(*iv_inner/cond:0 & "
+              "iv_inner/iv:0)}<inner_loop;outer_loop>");
   }
 }
 
 TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  InductionVarInfo iv_outer_0 =
-      CreateInductionVariable(root, "iv_outer_0", "frame", 0);
-  ops::Switch inner_value_0(root.WithOpName("outer_0_is_live"),
-                            ops::Const(root.WithOpName("constant"), 5),
-                            iv_outer_0.loop_cond);
-  InductionVarInfo iv_inner_0 = CreateInductionVariable(
-      root, "iv_inner_0", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner_0/enter"),
-                           inner_value_0.output_true, "frame_inner"));
-
-  InductionVarInfo iv_outer_1 =
-      CreateInductionVariable(root, "iv_outer_1", "frame", 1);
-  ops::Switch inner_init_value_1(root.WithOpName("outer_1_is_live"),
-                                 ops::Const(root.WithOpName("constant"), 5),
-                                 iv_outer_1.loop_cond);
-  InductionVarInfo iv_inner_1 = CreateInductionVariable(
-      root, "iv_inner_1", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner_1/enter"),
-                           inner_init_value_1.output_true, "frame_inner"));
-  Output add0 = ops::Add(root.WithOpName("add0"), iv_inner_0.induction_var,
-                         iv_inner_1.induction_var);
+
+  std::array<Output, 2> outer_iv;
+  std::array<Output, 2> inner_iv;
+
+  for (int i : {0, 1}) {
+    InductionVarInfo iv_outer =
+        CreateInductionVariable(root, "iv_outer", "outer_loop", 0);
+    Output enter_constant_outer_loop = ops::internal::Enter(
+        root.WithOpName("constant_enter_outer_loop"),
+        ops::Const(root.WithOpName("constant"), 5), "outer_loop",
+        ops::internal::Enter::Attrs().IsConstant(true));
+    ops::Switch inner_value(root.WithOpName("outer_is_live"),
+                            enter_constant_outer_loop, iv_outer.loop_cond);
+    InductionVarInfo iv_inner = CreateInductionVariable(
+        root, "iv_inner", "inner_loop", inner_value.output_true);
+
+    outer_iv[i] = iv_outer.induction_var;
+    inner_iv[i] = iv_inner.induction_var;
+  }
+
+  Output add0 = ops::Add(root.WithOpName("add0"), inner_iv[0], inner_iv[1]);
 
   VLogGraphIfAsked(*root.graph());
 
@@ -692,21 +699,76 @@ TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
     PredicateMapTy predicate_map;
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_0.induction_var)],
-              "{#true,&,*iv_outer_0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_0.induction_var)],
-              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
-              "*iv_inner_0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_1.induction_var)],
-              "{#true,&,*iv_outer_1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_1.induction_var)],
-              "{(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
-              "*iv_inner_1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
-              "*iv_inner_1/cond:0} & "
-              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
-              "*iv_inner_0/cond:0})");
+    EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[0])],
+              "{#true,&,*iv_outer/cond:0}<outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[0])],
+              "{({#true,&,*iv_outer/cond:0}<outer_loop> & "
+              "*iv_outer/cond:0),&,*iv_inner/cond:0}<inner_loop;outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[1])],
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>");
+    EXPECT_EQ(
+        predicate_map[ControlOutputFor(inner_iv[1])],
+        "{({#true,&,*iv_outer/cond_1:0}<outer_loop> & "
+        "*iv_outer/cond_1:0),&,*iv_inner/cond_1:0}<inner_loop;outer_loop>");
+    EXPECT_EQ(
+        predicate_map[ControlOutputFor(add0)],
+        "({({#true,&,*iv_outer/cond:0}<outer_loop> & "
+        "*iv_outer/cond:0),&,*iv_inner/cond:0}<inner_loop;outer_loop> & "
+        "{({#true,&,*iv_outer/cond_1:0}<outer_loop> & "
+        "*iv_outer/cond_1:0),&,*iv_inner/cond_1:0}<inner_loop;outer_loop>)");
+  }
+}
+
+TEST(DeadnessAnalysisTest, AndRecurrenceNeedsFrameName) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_0 = CreateInductionVariable(root, "iv_0", "frame_0", 10);
+  InductionVarInfo iv_1 = CreateInductionVariable(root, "iv_1", "frame_1", 9);
+
+  Output init = CreateSwitch(root, "init").output_true;
+  Output step = CreateSwitch(root, "step").output_true;
+
+  std::array<Output, 2> exits;
+  std::array<Output, 2> next_iterations;
+
+  for (int i : {0, 1}) {
+    Output init_enter = ops::internal::Enter(
+        root.WithOpName(absl::StrCat("init_enter_frame_", i)), init,
+        absl::StrCat("frame_", i),
+        ops::internal::Enter::Attrs().IsConstant(true));
+    Output step_enter = ops::internal::Enter(
+        root.WithOpName(absl::StrCat("step_enter_frame_", i)), step,
+        absl::StrCat("frame_", i),
+        ops::internal::Enter::Attrs().IsConstant(true));
+
+    ops::Merge iv(root.WithOpName(absl::StrCat("expr_", i)),
+                  {init_enter, init_enter});
+    Output add = ops::Add(root.WithOpName(absl::StrCat("add_", i)), iv.output,
+                          step_enter);
+    next_iterations[i] = ops::NextIteration(
+        root.WithOpName(absl::StrCat("expr_", i, "_next_iteration")), add);
+    EXPECT_TRUE(
+        root.graph()
+            ->UpdateEdge(next_iterations[i].node(), 0, iv.output.node(), 1)
+            .ok());
+    exits[i] = ops::internal::Exit(root.WithOpName(absl::StrCat("exit_", i)),
+                                   iv.output);
+  }
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[0])],
+              predicate_map[ControlOutputFor(exits[1])]);
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[0])], "");
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[1])], "");
+
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[0])],
+              predicate_map[ControlOutputFor(next_iterations[1])]);
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[0])], "");
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[1])], "");
   }
 }
 
@@ -818,5 +880,82 @@ TEST(DeadnessAnalysisTest, RecvVsSwitchText) {
   EXPECT_EQ(predicate_map[logical_and_output_0], "(recv:0 & *recv:0)");
 }
 
+TEST(DeadnessAnalysisTest, DeMorgan) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output cond_0 = ops::Placeholder(root.WithOpName("cond_0"), DT_BOOL);
+  Output cond_1 = ops::Placeholder(root.WithOpName("cond_1"), DT_BOOL);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+
+  ops::Switch sw_0(root.WithOpName("switch_0"), value, cond_0);
+  ops::Switch sw_1(root.WithOpName("switch_1"), value, cond_1);
+
+  Output and_0_1 =
+      ops::Add(root.WithOpName("and_0_1"), sw_0.output_true, sw_1.output_true);
+
+  Output or_not0_not1 = ops::Merge(root.WithOpName("or_not0_not1"),
+                                   {sw_0.output_false, sw_1.output_false})
+                            .output;
+
+  // Predicate(should_always_be_dead) =
+  // (A & B) & (~A | ~B) = (A & B) & ~(A & B) = False
+  Output should_always_be_dead =
+      ops::Add(root.WithOpName("should_always_be_dead"), and_0_1, or_not0_not1);
+
+  // Predicate(should_always_be_dead) =
+  // (A & B) | (~A | ~B) = (A & B) | ~(A & B) = True
+  Output should_always_be_alive =
+      ops::Merge(root.WithOpName("should_always_be_alive"),
+                 {and_0_1, or_not0_not1})
+          .output;
+
+  std::unique_ptr<DeadnessAnalysis> result;
+  TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(should_always_be_dead)], "#false");
+  EXPECT_EQ(predicate_map[ControlOutputFor(should_always_be_alive)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, ConstantTrueSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output constant_true = ops::Const(root.WithOpName("const_true"), true);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, constant_true);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "#false");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, ConstantFalseSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output constant_false = ops::Const(root.WithOpName("const_false"), false);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, constant_false);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "#true");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#false");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 03aba97bbe81a11f6366d118ee5bc573d0c6b31b..d0d7a3f3785469acd79a83b6897668f94fc6ea2e 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -1008,13 +1008,15 @@ Status Encapsulator::Subgraph::AddHostComputes(
       // subgraph.
       for (const auto& src_node : oc_subgraph.control_inputs) {
         Node* src_image = node_images.at(src_node);
-        graph_->AddControlEdge(src_image, host_compute);
+        graph_->AddControlEdge(src_image, host_compute,
+                               /* allow_duplicates= */ true);
       }
 
       // Connect the _HostCompute node to its ancestor host compute nodes.
       for (const auto& ancestor_name : host_compute_ancestors) {
         Node* ancestor = host_compute_node[ancestor_name];
-        graph_->AddControlEdge(ancestor, host_compute);
+        graph_->AddControlEdge(ancestor, host_compute,
+                               /* allow_duplicates= */ true);
       }
 
       // Connect the consumers in the subgraph to the _HostCompute node.
@@ -1031,7 +1033,8 @@ Status Encapsulator::Subgraph::AddHostComputes(
       // node.
       for (const auto& dst_node : oc_subgraph.control_outputs) {
         Node* dst_image = node_images.at(dst_node);
-        graph_->AddControlEdge(host_compute, dst_image);
+        graph_->AddControlEdge(host_compute, dst_image,
+                               /* allow_duplicates= */ true);
       }
     }
   }
@@ -1059,7 +1062,8 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
 void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) {
   if (sequencer_ != nullptr) {
     VLOG(2) << "ConnectSequencerToCallNode";
-    graph_out->AddControlEdge(sequencer_, call_node_);
+    graph_out->AddControlEdge(sequencer_, call_node_,
+                              /* allow_duplicates= */ true);
   }
 }
 
@@ -1279,7 +1283,8 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   // completes. This has no effect on execution order but prevents the
   // RecvAtHost being pruned.
   TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
-  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_);
+  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_,
+                            true /* skip duplicates check */);
 
   return Status::OK();
 }
@@ -1336,7 +1341,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   // subgraph completes. This has no effect on execution order but prevents the
   // RecvAtHost being pruned.
   TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
-  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_);
+  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_,
+                            /* allow_duplicates= */ true);
 
   return Status::OK();
 }
@@ -1446,7 +1452,8 @@ Status Encapsulator::CopySubgraphEdges(
         src_func_id == dst_func_id) {
       Graph* g = subgraphs_[src_func_id].GetGraph();
       if (edge->IsControlEdge()) {
-        g->AddControlEdge(src_image, dst_image);
+        g->AddControlEdge(src_image, dst_image,
+                          /* allow_duplicates= */ true);
       } else {
         g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
       }
@@ -1732,7 +1739,8 @@ Status Encapsulator::CopyEdgeToOutputGraph(
     if (edges_added
             ->emplace(OutputTensor(src_image, -1), InputTensor(dst_image, -1))
             .second) {
-      graph_out->AddControlEdge(src_image, dst_image);
+      graph_out->AddControlEdge(src_image, dst_image,
+                                /* allow_duplicates= */ true);
     }
 
     return Status::OK();
@@ -1761,7 +1769,8 @@ Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
     const string& subgraph = ancestors.first;
     for (const string& ancestor : ancestors.second) {
       graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNode(),
-                                subgraphs_[subgraph].GetCallNode());
+                                subgraphs_[subgraph].GetCallNode(),
+                                /* allow_duplicates= */ true);
     }
   }
   return Status::OK();
@@ -2129,7 +2138,8 @@ Status CheckClusterDependencyForCycles(
     const string& ancestor, const string& successor,
     const std::unordered_map<string, std::unordered_set<string>>& ancestors,
     const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
-    GraphCycles* cycle_detector, std::map<string, int>* cycle_detector_map) {
+    GraphCycles* cycle_detector,
+    std::unordered_map<string, int>* cycle_detector_map) {
   if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
     (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
   }
@@ -2173,7 +2183,7 @@ Status Encapsulator::FindClusterDependencies() {
   // We check that clusters are acyclic using this cycle detector.
   GraphCycles cycle_detector;
   // Map from cluster name to cycle detector node id.
-  std::map<string, int> cycle_detector_map;
+  std::unordered_map<string, int> cycle_detector_map;
   // Process the nodes in topologically-sorted order.
   std::vector<Node*> nodes;
   GetReversePostOrder(*graph_in_, &nodes);
@@ -2535,7 +2545,33 @@ Status EncapsulateSubgraphsPass::Run(
             std::vector<int>* input_permutation,
             std::vector<int>* output_permutation, NodeDef* node) {
         // Optimize the subgraph.
-        OptimizeGraph(flr, subgraph);
+        // Do not constant fold nodes that output DT_VARIANT type tensors.
+        // XLA does not support Const nodes of Variant type since it needs
+        // to know the original ops to be able to compile them to the relevant
+        // XLA form.
+        // TODO(srbs): This filter is a little conservative. E.g. a subgraph of
+        // the form:
+        //                          Const
+        //                            |
+        // EmptyTensorList -> TensorListPushBack -> TensorListPopBack -> Op
+        //                                                  |
+        //                                        (Discard popped list)
+        //
+        // Would have been reduced to "Const -> Op" without this filter.
+        // However since we are only allowed to specify the filter at the "Node"
+        // level there is no good way to allow the above behavior. So we
+        // disallow any sort of constant folding on Variant nodes for now.
+        auto cf_consider_fn = [](const Node* n) {
+          for (const auto& output_arg : n->op_def().output_arg()) {
+            if (output_arg.type() == DT_VARIANT) {
+              return false;
+            }
+          }
+          return true;
+        };
+        GraphOptimizer::Options graph_optimizer_options;
+        graph_optimizer_options.cf_consider_fn = cf_consider_fn;
+        OptimizeGraph(flr, subgraph, graph_optimizer_options);
 
         const int num_args = input_permutation->size();
         std::vector<bool> const_args(num_args);
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 8617beec004d0fe912155f054442c5b6249bb6b5..1f8ec09e19c01d0a8b2a3761135ed53dfb2ad3b0 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -32,6 +34,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -513,6 +517,18 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
   s = PerformStaticShapeInferenceBeforeEncapsulation(graph.get());
   if (!s.ok()) return s;
 
+  // Create FunctionLibraryRuntime.
+  SessionOptions session_options;
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_CHECK_OK(DeviceFactory::AddDevices(
+      session_options, "/job:localhost/replica:0/task:0", &devices));
+  OptimizerOptions opts;
+  auto device_mgr = absl::make_unique<DeviceMgr>(std::move(devices));
+  auto pflr = absl::make_unique<ProcessFunctionLibraryRuntime>(
+      device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def.get(),
+      opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+  auto flr = pflr->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+
   std::unique_ptr<Graph> graph_out;
   s = EncapsulateSubgraphsInFunctions(
       "_encapsulate", /*outside_compilation_attribute=*/"", *graph,
@@ -538,7 +554,7 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
                                     std::map<string, int>{}});
   }
   s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters,
-                                graph_out.get(), lib_def.get());
+                                graph_out.get(), flr, lib_def.get());
   if (!s.ok()) return s;
 
   GraphDef graphdef_out;
@@ -941,7 +957,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"c"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1101,7 +1119,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O2"}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"F"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
@@ -1112,7 +1132,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph1},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"g_0_retval_retval", "outside_compilation_O2_host_compute:outputs:0"},
@@ -1244,7 +1266,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
@@ -1269,7 +1293,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"g_0_retval_retval", "G:o:0"}, {"i_0_retval_retval", "I:o:0"}});
 
@@ -1397,7 +1423,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1419,7 +1447,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"i_0_retval_retval", "I:o:0"}});
 
@@ -1527,7 +1557,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
 
@@ -1615,7 +1647,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
             {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1716,7 +1750,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"f_0_retval_retval", "F:o:0"}});
@@ -1821,7 +1857,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"f_0_retval_retval", "F:o:0"}});
@@ -1949,7 +1987,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph1},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
           {{"outside_compilation_O2_host_compute"},
            "XlaHostCompute",
            {"F:o:0"},
@@ -1959,7 +1999,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O2"}}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"h_0_retval_retval", "H:o:0"}});
@@ -2082,7 +2124,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph", NameAttrList()},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O2"}}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"D:o:0"},
@@ -2092,7 +2136,9 @@ TEST(EncapsulateSubgraphsTest,
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"h_0_retval_retval", "H:o:0"}});
@@ -2214,7 +2260,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"key", "host_compute_channel_F1_O1"},
          {"shape_inference_graph", shape_inference_graph},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O1"}}},
+         {"_outside_compilation_subgraph", "O1"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}}},
        {{"outside_compilation_O2_host_compute"},
         "XlaHostCompute",
         {"D:o:0"},
@@ -2224,7 +2272,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"key", "host_compute_channel_F1_O2"},
          {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O2"}},
+         {"_outside_compilation_subgraph", "O2"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}},
         {}},
        {{"outside_compilation_O3_host_compute"},
         "XlaHostCompute",
@@ -2235,7 +2285,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"key", "host_compute_channel_F1_O3"},
          {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O3"}},
+         {"_outside_compilation_subgraph", "O3"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}},
         {}}},
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"h_0_retval_retval", "H:o:0"}});
@@ -2354,7 +2406,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
        {"f_0_retval_retval", "F:o:0"}});
@@ -2465,7 +2519,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"c"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index 8b01768c49422b331b52a8ba31bade000c95722e..2a770c527b2fae91352fd17dacb13495a3a73f34 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 
@@ -308,6 +309,10 @@ xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef(
     host_compute_builder.Attr("tpu_core", core);
   }
 
+  // Set input tokens.
+  host_compute_builder.Attr(kXlaTokenInputNodesAttrName,
+                            std::vector<string>{kXlaTokenArgNodeName});
+
   // Populate inputs.
   std::vector<DataType> input_dtypes;
   TF_RETURN_IF_ERROR(GetNodeAttr(call_node->attrs(), "Tinputs", &input_dtypes));
@@ -398,8 +403,8 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
 }
 
 // Resets "device_ordinal" attr to placeholder value for related nodes
-// (XlaRecvAtHost nodes; XlaSendFromHost nodes; If nodes containing
-// XlaRecvAtHost/XlaSendFromHost).
+// (XlaRecvAtHost nodes; XlaSendFromHost nodes; If/While/FuncCall nodes
+// containing XlaRecvAtHost/XlaSendFromHost).
 Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) {
   AttrValue device_ordinal_value;
   device_ordinal_value.set_placeholder("device_ordinal");
@@ -429,6 +434,10 @@ Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) {
         n->ClearAttr(attr_name);
         n->AddAttr(attr_name, branch_func);
       }
+    } else if (HasNodeAttr(n->def(), "device_ordinal")) {
+      // Function call node containing outside compilation.
+      n->ClearAttr("device_ordinal");
+      n->AddAttr("device_ordinal", device_ordinal_value);
     } else {
       return errors::Internal("Unknown node marked with ",
                               kXlaHasHostTransferAttrName, ": ",
@@ -1217,20 +1226,129 @@ Status BuildHostGraphForWhileNode(
   return Status::OK();
 }
 
+// Builds host graph for func call nodes.
+Status BuildHostGraphForFuncCallNode(const string& func_call_node_name,
+                                     const string& xla_cluster_name,
+                                     const string& func_call_host_func_name,
+                                     const string& host_graph_func_name,
+                                     FunctionLibraryDefinition* fld) {
+  Graph host_graph(fld);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: rewrite `host_func_name`, replace key placeholder with an _Arg
+  // node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, func_call_host_func_name, fld));
+
+  // Step 3: build a function call node with `host_func_name`, with
+  // `key_placeholder` as input.
+  NodeDefBuilder call_builder(absl::StrCat("oc_call_", func_call_node_name),
+                              func_call_host_func_name, fld);
+  call_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  call_builder.Attr("device_ordinal", device_ordinal_value);
+  call_builder.Attr(kXlaHasHostTransferAttrName, true);
+  NodeDef call_def;
+  TF_RETURN_IF_ERROR(call_builder.Finalize(&call_def));
+  Status s;
+  Node* call_node = host_graph.AddNode(call_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, call_node, 0);
+
+  // Convert `host_graph` to function, and add a "device_ordinal" attr.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
 Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
     Graph* g, const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
-    const std::map<string, int>& host_compute_core,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
     FunctionLibraryDefinition* fld, std::vector<string>* host_graphs,
     std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation) {
-  std::vector<Node*> if_nodes, while_nodes;
+  std::vector<Node*> if_nodes, while_nodes, func_call_nodes;
   for (Node* n : g->nodes()) {
     if (n->type_string() == "If") {
       if_nodes.push_back(n);
     } else if (n->type_string() == "While") {
       while_nodes.push_back(n);
+    } else if (fld->Contains(n->type_string())) {
+      func_call_nodes.push_back(n);
+    } else if (n->type_string() == FunctionLibraryDefinition::kGradientOp) {
+      // Only gradient for user-defined function should be considered as
+      // function call node.
+      NameAttrList original_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(
+          n->def(), FunctionLibraryDefinition::kFuncAttr, &original_func));
+      if (fld->Contains(original_func.name())) {
+        func_call_nodes.push_back(n);
+      }
+    }
+  }
+
+  for (Node* n : func_call_nodes) {
+    // Extract outside compilation for the function call.
+    bool func_has_outside_compilation = false;
+    NameAttrList func;
+    func.set_name(n->type_string());
+    typedef protobuf::Map<string, AttrValue> AttrMap;
+    *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end());
+    string new_func_name = absl::StrCat(n->name(), "_oc");
+    string host_func_name = absl::StrCat("oc_func_call_host_", n->name());
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func, new_func_name, host_func_name, host_compute_core, flr, fld,
+        shape_inference_graphs, &func_has_outside_compilation));
+
+    // If the function call does not have outside compilation, nothing to do.
+    if (!func_has_outside_compilation) {
+      continue;
     }
+
+    *has_outside_compilation = true;
+
+    // Change `n` to call the new function directly.
+    NodeDefBuilder replace_builder(n->name(), new_func_name, fld);
+    for (const Edge* e : n->in_edges()) {
+      if (e->IsControlEdge()) {
+        continue;
+      }
+      replace_builder.Input(e->src()->name(), e->src_output(),
+                            e->src()->output_type(e->src_output()));
+    }
+    for (const auto& attr : n->attrs()) {
+      replace_builder.Attr(attr.first, attr.second);
+    }
+    NodeDef replace_def;
+    TF_RETURN_IF_ERROR(replace_builder.Finalize(&replace_def));
+    TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, replace_def));
+    replace->AddAttr(kXlaTokenInputNodesAttrName,
+                     std::vector<string>{kXlaTokenArgNodeName});
+
+    // Build host side graph for the function call.
+    string oc_host_graph_name =
+        absl::StrCat("oc_func_host_graph_", replace->name());
+    TF_RETURN_IF_ERROR(
+        BuildHostGraphForFuncCallNode(replace->name(), xla_cluster_name,
+                                      host_func_name, oc_host_graph_name, fld));
+
+    // Record the host graph.
+    host_graphs->push_back(oc_host_graph_name);
   }
 
   for (Node* n : if_nodes) {
@@ -1251,12 +1369,12 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
         then_branch, then_branch_xla_func_name, then_branch_host_func_name,
-        host_compute_core, fld, shape_inference_graphs,
+        host_compute_core, flr, fld, shape_inference_graphs,
         &then_branch_has_outside_compilation));
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
         else_branch, else_branch_xla_func_name, else_branch_host_func_name,
-        host_compute_core, fld, shape_inference_graphs,
+        host_compute_core, flr, fld, shape_inference_graphs,
         &else_branch_has_outside_compilation));
 
     // If then/else branch do not have outside compilation, nothing to do.
@@ -1316,12 +1434,12 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
            body_xla_func_name = absl::StrCat(body.name(), "_oc");
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        cond, cond_xla_func_name, cond_host_func_name, host_compute_core, fld,
-        shape_inference_graphs, &cond_has_outside_compilation));
+        cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr,
+        fld, shape_inference_graphs, &cond_has_outside_compilation));
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        body, body_xla_func_name, body_host_func_name, host_compute_core, fld,
-        shape_inference_graphs, &body_has_outside_compilation));
+        body, body_xla_func_name, body_host_func_name, host_compute_core, flr,
+        fld, shape_inference_graphs, &body_has_outside_compilation));
 
     // If cond/body do not have outside compilation, nothing to do.
     if (!cond_has_outside_compilation && !body_has_outside_compilation) {
@@ -1469,17 +1587,27 @@ Status ExtractOutsideCompilationForFunction(
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
     const string& host_graph_func_name,
-    const std::map<string, int>& host_compute_core,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
     FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation) {
+  // Convert the function to graph.
   const string& func_name = func_name_attrs.name();
-  const FunctionDef* fdef = fld->Find(func_name);
-  if (!fdef) {
-    return errors::Internal("Cannot find function ", func_name);
-  }
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(
+      flr->Instantiate(func_name, AttrSlice(&func_name_attrs.attr()), &handle));
+  Status ret_status = Status::OK();
+  auto cleanup_handle = gtl::MakeCleanup([&]() {
+    auto s = flr->ReleaseHandle(handle);
+    if (!s.ok()) {
+      ret_status.Update(s);
+    }
+  });
+  const FunctionBody* fbody = flr->GetFunctionBody(handle);
+
+  // Check if we have outside compilation nodes.
   *has_outside_compilation = false;
-  for (auto& node_def : fdef->node_def()) {
-    if (HasNodeAttr(node_def, outside_compilation_attr_name)) {
+  for (Node* n : fbody->graph->nodes()) {
+    if (HasNodeAttr(n->def(), outside_compilation_attr_name)) {
       *has_outside_compilation = true;
       break;
     }
@@ -1487,16 +1615,6 @@ Status ExtractOutsideCompilationForFunction(
   // We cannot early return here, because we might have outside compilation in
   // If/While function body.
 
-  // Convert the function to graph.
-  FunctionBody* fbody = nullptr;
-  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-      *fld->Find(func_name), AttrSlice(&func_name_attrs.attr()), fld,
-      [&](const string& op, const OpDef** sig) {
-        return fld->LookUpOpDef(op, sig);
-      },
-      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
-
   // Preprocess edges between different outside compilations. They will be
   // restored in `ConstructHostGraph()`.
   TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
@@ -1553,16 +1671,11 @@ Status ExtractOutsideCompilationForFunction(
     TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode(
         graph_out.get(), n, host_compute_core));
   }
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
-        *graph_out, fld);
-  }
 
   // Handle nodes with associated functions.
   TF_RETURN_IF_ERROR(ExtractOutsideCompilationForNodesWithAssociatedFunctions(
       graph_out.get(), xla_cluster_attr_name, outside_compilation_attr_name,
-      xla_cluster_name, host_compute_core, fld,
+      xla_cluster_name, host_compute_core, flr, fld,
       &outside_compilation_host_graphs, shape_inference_graphs,
       has_outside_compilation));
 
@@ -1580,20 +1693,31 @@ Status ExtractOutsideCompilationForFunction(
   FunctionDef updated_fdef;
   TF_RETURN_IF_ERROR(
       GraphToFunctionDef(*graph_out, new_func_name, &updated_fdef));
+  const FunctionDef* original_fdef = fld->Find(func_name);
+  if (original_fdef) {
+    for (const auto& attr : original_fdef->attr()) {
+      (*updated_fdef.mutable_attr())[attr.first] = attr.second;
+    }
+  }
   if (fld->Find(new_func_name)) {
     TF_RETURN_IF_ERROR(fld->ReplaceFunction(new_func_name, updated_fdef));
   } else {
     TF_RETURN_IF_ERROR(fld->AddFunctionDef(updated_fdef));
   }
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
+        *graph_out, fld);
+  }
 
-  return Status::OK();
+  return ret_status;
 }
 
 Status ExtractOutsideCompilation(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
-    FunctionLibraryDefinition* fld) {
+    FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld) {
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile("extract_outside_compilation_before", *g, fld);
   }
@@ -1610,7 +1734,7 @@ Status ExtractOutsideCompilation(
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
         func_name_attrs, func_name_attrs.name(), host_graph_func_name,
-        host_compute_core, fld, &shape_inference_graphs,
+        host_compute_core, flr, fld, &shape_inference_graphs,
         &has_outside_compilation));
     TF_RETURN_IF_ERROR(
         ExpandHostGraphIntoMainGraph(g, fld, host_graph_func_name, n));
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
index e07e7c5dd0cd42ddd4d643d8b36583c82056bbb2..d64cc2a103ed040cbf413ac736f97f84459e869b 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
@@ -89,7 +89,7 @@ Status ExtractOutsideCompilationForFunction(
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
     const string& host_graph_func_name,
-    const std::map<string, int>& host_compute_core,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
     FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation);
 
@@ -101,7 +101,7 @@ Status ExtractOutsideCompilation(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
-    FunctionLibraryDefinition* fld);
+    FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index e9a89e34e0c7b04b4be34e367b2d0bf627c0061a..7c3a24feff81b21a5d2347d21fb80988bc3e6065 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.h"
@@ -31,6 +32,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 
@@ -222,7 +225,42 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) {
   EXPECT_EQ(shapes[0].dim_size(), 1);
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
+class ExtractOutsideCompilationForFunctionTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions session_options;
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(
+        session_options, "/job:localhost/replica:0/task:0", &devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+  }
+
+  Status ExtractOutsideCompilationTest(
+      const string &xla_cluster_attr_name,
+      const string &outside_compilation_attr_name,
+      const string &xla_cluster_name, const NameAttrList &func_name_attrs,
+      const string &new_func_name, const string &host_graph_func_name,
+      const std::map<string, int> &host_compute_core,
+      FunctionLibraryDefinition *fld,
+      std::vector<string> *shape_inference_graphs,
+      bool *has_outside_compilation) {
+    OptimizerOptions opts;
+    pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, fld, opts,
+        /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+    auto flr = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+    return ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func_name_attrs, new_func_name, host_graph_func_name, host_compute_core,
+        flr, fld, shape_inference_graphs, has_outside_compilation);
+  }
+
+ private:
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+};
+
+TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   // Build the XLA computation func.
   // "const0"
   // "identity0" = "const0" (outside compilation cluster "0")
@@ -256,7 +294,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -362,7 +400,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   }
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   // Build the XLA computation func.
   // "const0"
   FunctionDefLibrary fdl;
@@ -384,7 +422,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -406,7 +444,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   EXPECT_EQ(host_graph->num_nodes(), 2);
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   // Build the XLA computation func.
   // "const0"
   // "const1" (outside compilation cluster "0")
@@ -432,7 +470,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -489,7 +527,7 @@ REGISTER_OP("XlaRecvFromHost")
     .Attr("key: string")
     .SetIsStateful();
 
-TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   // Build the XLA computation func.
   // "const0" (bool)
   // "const1" (int32)
@@ -555,7 +593,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -651,7 +689,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   }
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   // Build the XLA computation func.
   // "const0" (bool)
   // "while0" (input = "const0", cond = "cond_fn", body = "body_fn")
@@ -714,7 +752,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -782,4 +820,162 @@ TEST(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   }
 }
 
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
+  // Build the XLA computation func.
+  // "const0" (int32)
+  // "fn" (input = "const0")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *true_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+  {
+    std::unique_ptr<Graph> g(new Graph(&fld));
+
+    tensorflow::TensorProto tensor_proto;
+    tensor_proto.set_dtype(tensorflow::DT_INT32);
+    tensorflow::TensorShapeProto shape;
+    shape.add_dim()->set_size(2);
+    *tensor_proto.mutable_tensor_shape() = shape;
+    for (int i = 0; i < 2; ++i) {
+      tensor_proto.add_int_val(1);
+    }
+    NodeDef const_def;
+    TF_CHECK_OK(NodeDefBuilder("const", "Const")
+                    .Attr("dtype", DT_INT32)
+                    .Attr("value", tensor_proto)
+                    .Finalize(&const_def));
+    Status s;
+    Node *const_node = g->AddNode(const_def, &s);
+    TF_CHECK_OK(s);
+
+    NodeDef fn_def;
+    TF_CHECK_OK(NodeDefBuilder("fn", "fn", &fld)
+                    .Input("const", 0, DT_INT32)
+                    .Finalize(&fn_def));
+    Node *fn_node = g->AddNode(fn_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(const_node, 0, fn_node, 0);
+
+    NodeDef ret_def;
+    TF_CHECK_OK(NodeDefBuilder("ret", "_Retval")
+                    .Attr("index", 0)
+                    .Attr("T", DT_INT32)
+                    .Input("fn", 0, DT_INT32)
+                    .Finalize(&ret_def));
+    Node *ret_node = g->AddNode(ret_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(fn_node, 0, ret_node, 0);
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef));
+  }
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have call node for outside compilation in `fn`.
+    Node *call_node = node_name_index["oc_call_fn"];
+    EXPECT_NE(call_node, nullptr);
+
+    FunctionBody *call_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("oc_func_call_host_fn"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &call_fbody));
+    std::unique_ptr<FunctionBody> call_fbody_deleter(call_fbody);
+
+    // Verify we have _XlaRecvAtHost and _XlaSendFromHost nodes.
+    bool has_recv = false, has_send = false;
+    for (Node *n : call_fbody->graph->nodes()) {
+      if (n->type_string() == "_XlaRecvAtHost") {
+        has_recv = true;
+      } else if (n->type_string() == "_XlaSendFromHost") {
+        has_send = true;
+      }
+    }
+    EXPECT_TRUE(has_recv);
+    EXPECT_TRUE(has_send);
+  }
+
+  // Check XLA graph.
+  {
+    FunctionBody *xla_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &xla_fbody));
+    std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+    Graph *xla_graph = xla_fbody->graph;
+    auto node_name_index = xla_graph->BuildNodeNameIndex();
+
+    // Check that we have call node.
+    Node *fn_node = node_name_index["fn"];
+    EXPECT_NE(fn_node, nullptr);
+    EXPECT_EQ(fn_node->type_string(), "fn_oc");
+
+    FunctionBody *call_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("fn_oc"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &call_fbody));
+    std::unique_ptr<FunctionBody> call_fbody_deleter(call_fbody);
+
+    // Verify we have XlaHostCompute nodes.
+    bool has_hc = false;
+    for (Node *n : call_fbody->graph->nodes()) {
+      if (n->type_string() == "XlaHostCompute") {
+        has_hc = true;
+      }
+    }
+    EXPECT_TRUE(has_hc);
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 98e344b3a080aa8aab27cd41564a90427bac151e..fba69dfccc31e01e73d8f86006b41ce5e3283f15 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -68,7 +68,12 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
       Flag("tf_xla_fusion_only",
            &mark_for_compilation_flags->tf_xla_fusion_only,
            "enable fusion of element-wise operations only using XLA when "
-           "global_jit_level is ON*.")};
+           "global_jit_level is ON*."),
+      Flag("tf_xla_disable_deadness_safety_checks_for_debugging",
+           &mark_for_compilation_flags
+                ->tf_xla_disable_deadness_safety_checks_for_debugging,
+           "Disable deadness related safety checks when clustering (this is "
+           "unsound).")};
   flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
 }
 
@@ -89,6 +94,8 @@ void AllocateAndParseFlags() {
   mark_for_compilation_flags->tf_xla_clustering_fuel =
       std::numeric_limits<int64>::max();
   mark_for_compilation_flags->tf_xla_fusion_only = false;
+  mark_for_compilation_flags
+      ->tf_xla_disable_deadness_safety_checks_for_debugging = false;
 
   device_flags = new XlaDeviceFlags;
   device_flags->tf_xla_compile_on_demand = false;
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index 5ddea588eef5270880d91623dc05893da265960a..ed7810fcfd85c17db70d42e691446b60dc696939 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -25,27 +25,39 @@ namespace tensorflow {
 
 // Flags associated with the XLA bridge's mark_for_compilation_pass module.
 struct MarkForCompilationPassFlags {
-  int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
-                          // computations on CPU and GPU devices.  0 = use
-                          // ConfigProto setting; -1 = off; 1 = on for things
-                          // very likely to be improved; 2 = on for everything.
-                          // Experimental.
-  int32 tf_xla_min_cluster_size;  // Minimum number of operators in an XLA
-                                  // compilation. Ignored for operators placed
-                                  // on an XLA device or operators explicitly
-                                  // marked for compilation.
-  int32 tf_xla_max_cluster_size;  // Maximum number of operators in an XLA
-                                  // compilation.
-  bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
-  bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
-                                  // via SessionOptions.
-  int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
-                                  // many ops will be marked as eligible for
-                                  // clustering.
-  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
-                            // is set to ON* and overrides its behavior. If
-                            // true, enable fusion of element-wise operations
-                            // only using XLA.
+  // Control compilation of operators into XLA computations on CPU and GPU
+  // devices.  0 = use ConfigProto setting; -1 = off; 1 = on for things very
+  // likely to be improved; 2 = on for everything.
+  //
+  // Experimental.
+  int32 tf_xla_auto_jit;
+
+  // Minimum number of operators in an XLA compilation. Ignored for operators
+  // placed on an XLA device or operators explicitly marked for compilation.
+  int32 tf_xla_min_cluster_size;
+
+  // Maximum number of operators in an XLA compilation.
+  int32 tf_xla_max_cluster_size;
+
+  // Dump graphs during XLA compilation.
+  bool tf_xla_clustering_debug;
+
+  // Enables global JIT compilation for CPU via SessionOptions.
+  bool tf_xla_cpu_global_jit;
+
+  // "Compiler fuel" for clustering.  Only this many ops will be marked as
+  // eligible for clustering.
+  int64 tf_xla_clustering_fuel;
+
+  // tf_xla_fusion_only is effective only when global_jit_level is set to ON*
+  // and overrides its behavior. If true, enable fusion of element-wise
+  // operations only using XLA.
+  bool tf_xla_fusion_only;
+
+  // If tf_xla_disable_deadness_safety_checks_for_debugging is set to true then
+  // we do not do deadness related safety checks.  This is unsound in general,
+  // but can be used as a debugging aid.
+  bool tf_xla_disable_deadness_safety_checks_for_debugging;
 };
 
 // Flags associated with the XLA bridge's xla_device module.
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 6618e3a58ab7b6374ed775cd6e4e18a6a4975588..20c2cd7e0561f92a01486102c4d2c572fd80c957 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -41,7 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
@@ -677,12 +678,28 @@ Status MarkForCompilationPass::Run(
   VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
+  // Deadness analysis expects a graph with source and sink edges properly
+  // connected but sometimes the incoming graph does not follow this invariant.
+  // So fix up the source and sink edges before calling into deadness analysis.
+  FixupSourceAndSinkEdges(options.graph->get());
+
   std::unique_ptr<DeadnessAnalysis> deadness;
   {
     XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1);
     TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(**options.graph, &deadness));
   }
 
+  bool deadness_analysis_disabled =
+      GetMarkForCompilationPassFlags()
+          ->tf_xla_disable_deadness_safety_checks_for_debugging;
+
+  if (deadness_analysis_disabled) {
+    LOG(WARNING) << "Deadness analysis was manually disabled via "
+                    "--tf_xla_disable_deadness_safety_checks_for_debugging; "
+                    "auto-clustering "
+                    "is unsound!";
+  }
+
   auto is_compilable = [&](const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -715,9 +732,12 @@ Status MarkForCompilationPass::Run(
     // and some are dead) then don't compile it.  XLA cannot represent the
     // deadness semantics of these nodes correctly and auto-clustering these
     // nodes can cause deadness to propagate to nodes that should be live.
-    if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
-      VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
-      return false;
+    if (!deadness_analysis_disabled) {
+      if (node->IsMerge() ||
+          deadness->HasInputsWithMismatchingDeadness(*node)) {
+        VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
+        return false;
+      }
     }
 
     // Check for fusable ops only if requested.
@@ -1145,6 +1165,27 @@ Status MarkForCompilationPass::RunImpl(
   if (flags->tf_xla_clustering_debug) {
     dump_graph::DumpGraphToFile("mark_for_compilation", **options.graph,
                                 options.flib_def);
+
+    // We also dump out an annoated version of the TF graph where the nodes
+    // names are prefixed with the cluster names.  This can help visualizing the
+    // clustering decisions on TensorBoard.
+    Graph new_graph((*options.graph)->op_registry());
+    CopyGraph(**options.graph, &new_graph);
+
+    for (Node* n : new_graph.nodes()) {
+      if (absl::optional<absl::string_view> cluster_name =
+              GetXlaClusterForNode(*n)) {
+        n->set_name(absl::StrCat(*cluster_name, "/", n->name()));
+      } else {
+        // There is room for improvement here.  In particular, it may help to
+        // split these unclustered nodes into classes where every node in a
+        // specific class has edges to and from the same set of clusters.
+        n->set_name(absl::StrCat("unclustered/", n->name()));
+      }
+    }
+
+    dump_graph::DumpGraphToFile("mark_for_compilation_annotated", new_graph,
+                                options.flib_def);
   }
 
   VLogClusteringSummary(*graph);
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index bf2c5508ea9e987e80093f4c2e15d3ff5191126f..c2b6250f738fafa35b2c5f79e97cf1281b50a316 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -151,7 +151,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
-TEST(XlaCompilationTest, Complex128Unsupported) {
+TEST(XlaCompilationTest, StringUnsupported) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
   {
@@ -159,10 +159,10 @@ TEST(XlaCompilationTest, Complex128Unsupported) {
     Node* a = ops::SourceOp(
         "Const", builder.opts()
                      .WithName("A")
-                     .WithAttr("dtype", DT_COMPLEX128)
-                     .WithAttr("value", Tensor(DT_COMPLEX128, TensorShape())));
-    Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
-    ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
+                     .WithAttr("dtype", DT_STRING)
+                     .WithAttr("value", Tensor(DT_STRING, TensorShape())));
+    Node* b = ops::UnaryOp("EncodeBase64", a, builder.opts().WithName("B"));
+    ops::BinaryOp("StringSplit", a, b, builder.opts().WithName("C"));
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 38a54cc5efae35ad77b6dc8039c653e920cfc071..1d81a8f4fcbf050663626b1f7660afd71f4027bc 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index fef28fc810cb4e544fe3f271f0b96cebd8a96779..80993861abba050fa3d6a133023d3c99f41f73e3 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/control_flow.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 3df5479a55e841380ca7b8cdd0add9fd17487091..611515cf33bc1abe21e06eb7f1513800276e095b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <numeric>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -38,6 +39,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr int64 XlaCompilationCache::kDefaultCompilationThreshold;
+
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
@@ -60,7 +63,7 @@ XlaCompilationCache::~XlaCompilationCache() {
   // about?
 }
 
-string XlaCompilationCache::DebugString() {
+string XlaCompilationCache::DebugString() const {
   return "XLA JIT compilation cache";
 }
 
@@ -68,9 +71,9 @@ string XlaCompilationCache::DebugString() {
 // arguments in the supplied list.
 string XlaCompilationCache::Signature::HumanString() const {
   string result = name;
-  for (const auto& a : arg_types) {
-    absl::StrAppend(&result, ",", DataTypeString(a.first),
-                    a.second.DebugString());
+  for (const auto& a : arg_shapes) {
+    absl::StrAppend(&result, ",", DataTypeString(a.first));
+    absl::StrAppend(&result, " [", absl::StrJoin(a.second, ","), "]");
   }
 
   for (const auto& v : arg_values) {
@@ -81,7 +84,7 @@ string XlaCompilationCache::Signature::HumanString() const {
 
 bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
   if (name != other.name) return false;
-  if (arg_types != other.arg_types) return false;
+  if (arg_shapes != other.arg_shapes) return false;
 
   if (arg_values.size() != other.arg_values.size()) return false;
   for (int i = 0; i < arg_values.size(); ++i) {
@@ -97,10 +100,10 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
 uint64 XlaCompilationCache::Signature::Hash::operator()(
     const XlaCompilationCache::Signature& signature) const {
   uint64 h = std::hash<string>()(signature.name);
-  for (const auto& arg : signature.arg_types) {
+  for (const auto& arg : signature.arg_shapes) {
     h = Hash64Combine(h, std::hash<int>()(static_cast<int>(arg.first)));
-    h = Hash64Combine(h, std::hash<int>()(arg.second.dims()));
-    for (int dim : arg.second.dim_sizes()) {
+    h = Hash64Combine(h, std::hash<int>()(arg.second.size()));
+    for (int dim : arg.second) {
       h = Hash64Combine(h, std::hash<int>()(dim));
     }
   }
@@ -124,7 +127,7 @@ XlaCompilationCache::BuildSignature(
         break;
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kResource:
-        signature.arg_types.emplace_back(arg.type, arg.shape);
+        signature.arg_shapes.emplace_back(arg.type, arg.DimensionSizes());
         break;
       default:
         return errors::InvalidArgument(
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 846d0c963dbfdf55f51120f2f138d12f5f63839b..7748b4700f39da4f952278ca6c6d2cadff4d3fb8 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -88,14 +88,16 @@ class XlaCompilationCache : public ResourceBase {
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
 
-  string DebugString() override;
+  string DebugString() const override;
 
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
   struct Signature {
     string name;
 
-    std::vector<std::pair<DataType, TensorShape>> arg_types;
+    // List of Tensor types & shapes for compile-time constant arguments to the
+    // compilation, ordered by argument number.
+    std::vector<std::pair<DataType, std::vector<int64>>> arg_shapes;
 
     // List of Tensor values for compile-time constant arguments to the
     // compilation, ordered by argument number. Tensors must be in host memory.
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index e9770647e7ba96cc1db026d12d5f11f52ce98d35..94dc61d55fb047c0ea81d98fde24cb55387c27d7 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -83,9 +83,9 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 12> kAllXlaCpuTypes = {
+constexpr std::array<DataType, 13> kAllXlaCpuTypes = {
     {DT_UINT8, DT_QUINT8, DT_INT8, DT_QINT8, DT_INT32, DT_QINT32, DT_INT64,
-     DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+     DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_CPU, XlaCompileOp, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 77cd2f44628677942da9e576070d1d295194cead..e2397f6fcb8677f4bd5151646f9ebacd3e23af5b 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -219,9 +219,6 @@ XlaDevice::XlaDevice(const SessionOptions& session_options,
 XlaDevice::~XlaDevice() {
   VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this;
   mutex_lock lock(mu_);
-  while (outstanding_asynchronous_operations_ > 0) {
-    outstanding_asynchronous_operations_cv_.wait(lock);
-  }
   if (device_context_) {
     device_context_->Unref();
   }
@@ -398,12 +395,6 @@ Status XlaDevice::Sync() {
   if (!stream) return Status::OK();
 
   Status status = stream->BlockHostUntilDone();
-  {
-    mutex_lock lock(mu_);
-    while (outstanding_asynchronous_operations_ > 0) {
-      outstanding_asynchronous_operations_cv_.wait(lock);
-    }
-  }
   TF_RETURN_IF_ERROR(status);
   if (!stream->ok()) {
     return errors::Internal("XlaDevice::Sync() failed.");
@@ -412,6 +403,8 @@ Status XlaDevice::Sync() {
   return Status::OK();
 }
 
+// TODO(b/112409994): This is no longer necessary. Consolidate it with the
+// synchronous version.
 void XlaDevice::Sync(const DoneCallback& done) {
   VLOG(1) << "XlaDevice::Sync (asynchronous)";
   std::shared_ptr<se::Stream> stream;
@@ -424,14 +417,20 @@ void XlaDevice::Sync(const DoneCallback& done) {
     return;
   }
 
+  // The call to ThenEnqueueOnBackgroundThread below enqueues a host callback at
+  // the end of the stream, after everything that has already been enqueued
+  // there at this moment. When the host callback is called, everything before
+  // it must have already finished, and the host callback will then place the
+  // task below onto a background thread. (See the implementation of
+  // ThenEnqueueOnBackgroundThread for details.) Therefore, when the done
+  // callback is finally called from that background thread, we know for sure
+  // that everything enqueued onto the stream (i.e., the device) at this very
+  // moment--when ThenEnqueueOnBackgroundThread is called--will have finished.
+  // This achieves a device-wide sync.
   stream->ThenEnqueueOnBackgroundThread(
       [this, stream, done](se::StreamExecutor*) {
         tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
                                          /*is_expensive=*/true);
-        mutex_lock lock(mu_);
-        while (outstanding_asynchronous_operations_ > 0) {
-          outstanding_asynchronous_operations_cv_.wait(lock);
-        }
         done(stream->ok() ? Status::OK()
                           : errors::Internal("XlaDevice::Sync() failed."));
       });
@@ -470,57 +469,26 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
   return status;
 }
 
-void XlaDevice::SetRequiresSyncOnCompletion(bool sync_on_completion) {
+void XlaDevice::SetAllowsSyncOnCompletion(bool sync_on_completion) {
   mutex_lock lock(mu_);
   sync_on_completion_ = sync_on_completion;
 }
 
-bool XlaDevice::RequiresSyncOnCompletion() const {
+bool XlaDevice::AllowsSyncOnCompletion() const {
   mutex_lock lock(mu_);
   return sync_on_completion_;
 }
 
-XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
-    XlaDevice* device)
-    : device_(device) {
-  mutex_lock lock(device_->mu_);
-  ++device_->outstanding_asynchronous_operations_;
-}
-
-XlaDevice::AsynchronousOperationHandle::~AsynchronousOperationHandle() {
-  if (device_) {
-    mutex_lock lock(device_->mu_);
-    --device_->outstanding_asynchronous_operations_;
-    device_->outstanding_asynchronous_operations_cv_.notify_all();
+Status XlaDevice::CurrentStatus() {
+  std::shared_ptr<se::Stream> stream;
+  {
+    mutex_lock lock(mu_);
+    stream = stream_;
   }
-}
-
-XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
-    const XlaDevice::AsynchronousOperationHandle& other)
-    : device_(other.device_) {
-  mutex_lock lock(device_->mu_);
-  ++device_->outstanding_asynchronous_operations_;
-}
-
-XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
-    XlaDevice::AsynchronousOperationHandle&& other)
-    : device_(other.device_) {
-  other.device_ = nullptr;
-}
-
-XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
-operator=(const XlaDevice::AsynchronousOperationHandle& other) {
-  device_ = other.device_;
-  mutex_lock lock(device_->mu_);
-  ++device_->outstanding_asynchronous_operations_;
-  return *this;
-}
-
-XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
-operator=(XlaDevice::AsynchronousOperationHandle&& other) {
-  device_ = other.device_;
-  other.device_ = nullptr;
-  return *this;
+  if (!stream) {
+    return Status::OK();
+  }
+  return stream->ok() ? Status::OK() : errors::Internal("XlaDevice is not OK.");
 }
 
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 45f18ac9ee6d403c192bd421d7823f2d408d994b..e35a1c7d29514dc5777bdbd3858c56401d7b9044 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -167,35 +167,14 @@ class XlaDevice : public LocalDevice {
   Status UseGpuDeviceInfo() LOCKS_EXCLUDED(mu_);
 
   // Instructs this XlaDevice to return 'sync_on_completion' for
-  // RequiresSyncOnCompletion().
-  void SetRequiresSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
+  // AllowsSyncOnCompletion().
+  void SetAllowsSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
 
-  bool RequiresSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
+  bool AllowsSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
-  // A simple RAII handle. On construction the device's
-  // outstanding_asynchronous_operations_ field is incremented; on destruction
-  // it is decremented.
-  class AsynchronousOperationHandle {
-   public:
-    AsynchronousOperationHandle(XlaDevice* device);
-    ~AsynchronousOperationHandle();
-    AsynchronousOperationHandle(const AsynchronousOperationHandle& other);
-    AsynchronousOperationHandle(AsynchronousOperationHandle&& other);
-    AsynchronousOperationHandle& operator=(
-        const AsynchronousOperationHandle& other);
-    AsynchronousOperationHandle& operator=(AsynchronousOperationHandle&& other);
-
-   private:
-    XlaDevice* device_ = nullptr;
-  };
-
-  AsynchronousOperationHandle CreateAsynchronousOperationHandle() {
-    return AsynchronousOperationHandle(this);
-  }
+  Status CurrentStatus() override LOCKS_EXCLUDED(mu_);
 
  private:
-  friend class AsynchronousOperationHandle;
-
   xla::LocalClient* client() const;
   Allocator* GetAllocatorLocked(AllocatorAttributes attr)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -255,14 +234,9 @@ class XlaDevice : public LocalDevice {
   // Thread pool used for running closures
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 
-  // True if the device requires XlaDevice::Sync to be called on completion
+  // True if the device allows XlaDevice::Sync to be called on completion
   // regardless of status.
-  bool sync_on_completion_ GUARDED_BY(mu_) = false;
-
-  // Count of outstanding asynchronous operations which must be zero on Sync()
-  // completion.
-  int64 outstanding_asynchronous_operations_ GUARDED_BY(mu_) = 0;
-  condition_variable outstanding_asynchronous_operations_cv_;
+  bool sync_on_completion_ GUARDED_BY(mu_) = true;
 
   // Set of devices to use. This controls which of the devices on the given
   // platform will have resources allocated. For GPUs this will be
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 1f3afe8822d441a5ce37617fe18d7767e9bc72e4..28681bb8b03dbf97e8145972f9a04b5855fafdae 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -131,7 +131,7 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
         xla::ShapeUtil::MakeShape(shape.element_type(),
                                   xla::AsInt64Slice(shape.dimensions())));
 
-    VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
+    VLOG(2) << "Transfer to device as literal: " << literal.ToString() << " "
             << xla_tensor->shaped_buffer().ToString();
     if (UseMultipleStreams() &&
         !transfer_manager_->CanShapedBufferBeAccessedNow(
@@ -214,7 +214,7 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
       device_to_host_stream_.get(), xla_tensor->shaped_buffer(), literal,
       [ref, xla_tensor, done](xla::Status status) {
         done([&]() -> Status {
-          VLOG(1) << "Transfer from device as literal: "
+          VLOG(2) << "Transfer from device as literal: "
                   << xla_tensor->shaped_buffer().ToString();
           return status;
         }());
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index 4007309ed1c57b663dca5bac0df11260bf1327f3..e1a582406153d2af447fa9d4ebcaf0bf0842b132 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -26,9 +26,9 @@ namespace tensorflow {
 const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER";
 const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT";
 
-constexpr std::array<DataType, 9> kExecAllTypes = {
+constexpr std::array<DataType, 10> kExecAllTypes = {
     {DT_INT8, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
-     DT_BOOL, DT_BFLOAT16}};
+     DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}};
 
 class XlaInterpreterDeviceFactory : public DeviceFactory {
  public:
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 3b0bda4caa161a7561a3098b89420329998ff8a7..c64981053fad2dbf1e8bcd623a940ded8b4d9150 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -237,7 +237,7 @@ void XlaComputationLaunchContext::PopulateInputs(
 
     const xla::Shape on_device_shape =
         client_->backend().transfer_manager()->HostShapeToDeviceShape(shape);
-    if (xla::ShapeUtil::IsTuple(on_device_shape)) {
+    if (on_device_shape.IsTuple()) {
       const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
       CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
       arg_ptrs_[i] = const_cast<ShapedBuffer*>(&xla_tensor->shaped_buffer());
@@ -274,7 +274,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
   // If the on-host-shape isn't a tuple, create a new single-element tuple
   // buffer with a nullptr root index table. This allows the code below to treat
   // output as a tuple unconditionally.
-  if (!xla::ShapeUtil::IsTuple(output.on_host_shape())) {
+  if (!output.on_host_shape().IsTuple()) {
     ShapedBuffer nontuple_buffer = output.release();
     ShapedBuffer buffer(
         xla::ShapeUtil::MakeTupleShape({nontuple_buffer.on_host_shape()}),
@@ -377,7 +377,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
     }
 
     if (VLOG_IS_ON(3)) {
-      VLOG(3) << ctx->mutable_output(i)->DebugString();
+      VLOG(3) << ctx->mutable_output(i)->DeviceSafeDebugString();
     }
   }
 
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index fa02cf9cbef45188a6dc2f861ff036649ea92b03..2b9f5d8dbd5152c74936ca92b1066760c4caa00f 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -230,6 +230,7 @@ tf_xla_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:standard_ops",
     ],
 )
 
@@ -406,7 +407,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "eager_test",
-    size = "large",
+    size = "medium",
     srcs = ["eager_test.py"],
     deps = [
         ":xla_test",
@@ -677,6 +678,7 @@ tf_xla_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:standard_ops",
     ],
 )
 
@@ -826,6 +828,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:standard_ops",
         "//tensorflow/python:stateless_random_ops",
     ],
 )
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 9a5423c1b2a5df7880453cbb328f6a8174066255..c829c50b5518b29c96c0b0117a6cd143911bd1fc 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -311,6 +311,30 @@ class BinaryOpsTest(xla_test.XLATestCase):
           dtype(7),
           expected=np.array([[-6], [-5]], dtype=dtype))
 
+      if dtype in [np.float32, np.float64]:
+        x = np.array([
+            -0.0, 0.0, -0.0, +0.0, np.inf, np.inf, -np.inf, -np.inf, 2.0, 2.0,
+            1.0
+        ],
+                     dtype=dtype)
+        y = np.array(
+            [-0.0, 0.0, +0.0, -0.0, 1.0, -1.0, 1.0, -1.0, 2.0, 1.0, 2.0],
+            dtype=dtype)
+        expected = np.nextafter(x, y)
+
+        # We use assertAllEqual to expose any bugs hidden by relative or
+        # absolute error tolerances.
+        def NextAfterEqualityTest(result, expected, rtol):
+          del rtol
+          return self.assertAllEqual(result, expected)
+
+        self._testBinary(
+            math_ops.nextafter,
+            x,
+            y,
+            expected=expected,
+            equality_test=NextAfterEqualityTest)
+
       # min/max not supported for complex
       if dtype not in self.complex_types | {np.uint8, np.int8}:
         self._testBinary(
@@ -400,7 +424,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
 
   def testComplexOps(self):
     for dtype in self.complex_types:
-      ctypes = {np.complex64: np.float32}
+      ctypes = {np.complex64: np.float32, np.complex128: np.float64}
       self._testBinary(
           math_ops.complex,
           np.array([[[[-1, 2], [2, 0]]]], dtype=ctypes[dtype]),
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 447a7de2cb6526a5dcf7789d4f2bffb5e733e8c0..ed580f95b6c2f57dfdf46cfcd64cabb452980c5d 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -5,6 +5,7 @@ load("//tensorflow/compiler/tests:plugin.bzl", "plugins")
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
+    "tf_exec_compatible_with",
 )
 
 def all_backends():
@@ -64,7 +65,7 @@ def tf_xla_py_test(
         if backend == "cpu":
             backend_args += [
                 "--test_device=XLA_CPU",
-                "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64",
+                "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_COMPLEX128",
             ]
         elif backend == "gpu":
             backend_args += [
@@ -84,6 +85,7 @@ def tf_xla_py_test(
         else:
             fail("Unknown backend {}".format(backend))
 
+        test_tags = tags + backend_tags
         native.py_test(
             name = test_name,
             srcs = srcs,
@@ -92,7 +94,8 @@ def tf_xla_py_test(
             main = "{}.py".format(name) if main == None else main,
             data = data + backend_data,
             deps = deps + backend_deps,
-            tags = tags + backend_tags,
+            tags = test_tags,
+            exec_compatible_with = tf_exec_compatible_with({"tags": test_tags}),
             **kwargs
         )
         test_names.append(test_name)
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index bf5ea7b1fb6fb3c774c4db20d059f131990d20d3..b7d08df9f7d144b71fd0b09535e10b8f596ea6ca 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -72,7 +72,7 @@ class DenseLayerTest(test.TestCase):
       x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
       y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -97,7 +97,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -126,7 +126,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 2af32b537ba53723370faf81aebf308a465718c7..c9fce39f6c5111f93a54708b59b4c42c3ba844b6 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -24,6 +24,7 @@ from tensorflow.compiler.tests import xla_test
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -31,6 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.layers import convolutional
 from tensorflow.python.layers import pooling
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import init_ops
@@ -463,7 +465,7 @@ class EagerFunctionTest(xla_test.XLATestCase):
       def f(x, y):
         return x[0::2, y:, ...]
 
-      x = array_ops.ones([2, 3, 4])
+      x = array_ops.ones([2, 3, 4], dtype=dtypes.float32)
       y = array_ops.ones([], dtype=dtypes.int32)
       with backprop.GradientTape() as tape:
         tape.watch(x)
@@ -479,15 +481,15 @@ class EagerFunctionTest(xla_test.XLATestCase):
 
       @function.defun
       def times_two(x):
-        return 2 * x
+        return 2. * x
 
       @function.defun
       def two_x_plus_1(x):
-        return times_two(x) + 1
+        return times_two(x) + 1.
 
-      x = constant_op.constant([2, 3, 4])
+      x = constant_op.constant([2., 3., 4.])
       y = two_x_plus_1(x)
-      self.assertAllEqual([5, 7, 9], y.numpy())
+      self.assertAllEqual([5., 7., 9.], y.numpy())
 
   def testNestedDefunWithVariable(self):
     with self.test_scope():
@@ -506,7 +508,7 @@ class EagerFunctionTest(xla_test.XLATestCase):
       x = constant_op.constant(3.0)
       y = f(x)
 
-    self.assertEqual(75, y.numpy())
+    self.assertEqual(75.0, y.numpy())
 
   def testNestedDefunInGradientTape(self):
     with self.test_scope():
@@ -555,6 +557,56 @@ class EagerFunctionTest(xla_test.XLATestCase):
     self.assertEqual(9, dy_v0.numpy())
     self.assertEqual(15, dy_v1.numpy())
 
+  def testWhileInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(start):
+        c = lambda x: math_ops.less(x, 13.0)
+        b = lambda x: math_ops.add(x, 1.0)
+        return control_flow_ops.while_loop(c, b, [start])
+
+      y = f(constant_op.constant(3.0))
+    self.assertEqual(13.0, y.numpy())
+
+  def testAutoGraphWhileInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(start):
+        x = start
+        while x < 13.0:
+          x += 1.0
+        return x
+
+      y = f(constant_op.constant(3.0))
+    self.assertEqual(13.0, y.numpy())
+
+  def testCondInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(pred, value):
+        fn1 = lambda: math_ops.add(value, 1.0)
+        fn2 = lambda: math_ops.subtract(value, 1.0)
+        return control_flow_ops.cond(pred, fn1, fn2)
+
+      plus_one = f(constant_op.constant(True), constant_op.constant(10.0))
+      minus_one = f(constant_op.constant(False), constant_op.constant(10.0))
+    self.assertEqual(11.0, plus_one.numpy())
+    self.assertEqual(9.0, minus_one.numpy())
+
+  def testAutoGraphCondInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(pred, value):
+        if pred:
+          return value + 1.0
+        else:
+          return value - 1.0
+
+      plus_one = f(constant_op.constant(True), constant_op.constant(10.0))
+      minus_one = f(constant_op.constant(False), constant_op.constant(10.0))
+    self.assertEqual(11.0, plus_one.numpy())
+    self.assertEqual(9.0, minus_one.numpy())
+
 
 class ExcessivePaddingTest(xla_test.XLATestCase):
   """Test that eager execution works with TPU flattened tensors.
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 0e2d840418156d825e2d141018e49f42374c8fee..42e688174fce9e939feb09e1767ebab31e30a6ee 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -403,6 +403,117 @@ class AdjustSaturationTest(xla_test.XLATestCase):
           self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5)
 
 
+class ResizeNearestNeighborTest(xla_test.XLATestCase):
+  # TODO(ilch): Wrap each test with `for dtype in self.float_types:`
+  # Some work to understand how that should be done was presented here:
+  # cl/227850213
+
+  def _assertForwardOpMatchesExpected(self,
+                                      image_np,
+                                      target_shape,
+                                      expected=None,
+                                      large_tolerance=False,
+                                      align_corners=True):
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.cached_session() as sess, self.test_scope():
+      image = array_ops.placeholder(image_np.dtype)
+      resized = gen_image_ops.resize_nearest_neighbor(
+          image, target_shape, align_corners=align_corners)
+      out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
+      if large_tolerance:
+        self.assertAllClose(
+            expected[np.newaxis, :, :, np.newaxis], out, rtol=2e-4, atol=2e-4)
+      else:
+        self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def testAlignCorners2x2To1x1(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2], [3, 4]], dtype=np.float32), [1, 1],
+        expected=np.array([[1]], dtype=np.float32))
+
+  def testAlignCorners1x1To2x2(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1]], dtype=np.float32), [2, 2],
+        expected=np.array([[1, 1], [1, 1]], dtype=np.float32))
+
+  def testAlignCorners1x1To3x3(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1]], dtype=np.float32), [3, 3],
+        expected=np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2], [3, 4]], dtype=np.float32), [3, 3],
+        expected=np.array([[1, 2, 2], [3, 4, 4], [3, 4, 4]], dtype=np.float32))
+
+  def testAlignCorners2x2To4x4(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2], [3, 4]], dtype=np.float32), [4, 4],
+        expected=np.array(
+            [[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]],
+            dtype=np.float32), large_tolerance=True)
+
+  def testAlignCorners3x3To2x2(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [2, 2],
+        expected=np.array([[1, 3], [7, 9]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3(self):
+    self._assertForwardOpMatchesExpected(
+        np.array(
+            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
+            dtype=np.float32), [3, 3],
+        expected=np.array([[1, 3, 4], [9, 11, 12], [13, 15, 16]],
+                          dtype=np.float32))
+
+  def testAlignCorners3x3To4x4(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [4, 4],
+        expected=np.array(
+            [[1, 2, 2, 3], [4, 5, 5, 6], [4, 5, 5, 6], [7, 8, 8, 9]],
+            dtype=np.float32))
+
+  def testAlignCorners3x3To6x6(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [6, 6],
+        expected=np.array(
+            [[1, 1, 2, 2, 3, 3], [1, 1, 2, 2, 3, 3], [4, 4, 5, 5, 6, 6],
+             [4, 4, 5, 5, 6, 6], [7, 7, 8, 8, 9, 9], [7, 7, 8, 8, 9, 9]],
+            dtype=np.float32))
+
+  def testAlignCorners3x3To9x9(self):
+    # The expected matrix might look uneven in terms of how many of each number
+    # there is, but this is an artifact of doing the dilation and convolution
+    # iteratively. The behavior is less esoteric in the 3x3To12x12 case below.
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [9, 9],
+        expected=np.array(
+            [[1, 2, 2, 2, 2, 3, 3, 3, 3], [4, 5, 5, 5, 5, 6, 6, 6, 6],
+             [4, 5, 5, 5, 5, 6, 6, 6, 6], [4, 5, 5, 5, 5, 6, 6, 6, 6],
+             [4, 5, 5, 5, 5, 6, 6, 6, 6], [7, 8, 8, 8, 8, 9, 9, 9, 9],
+             [7, 8, 8, 8, 8, 9, 9, 9, 9], [7, 8, 8, 8, 8, 9, 9, 9, 9],
+             [7, 8, 8, 8, 8, 9, 9, 9, 9]],
+            dtype=np.float32))
+
+  def testAlignCorners3x3To12x12(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [12, 12],
+        expected=np.array([[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
+                          dtype=np.float32))
+
+
 class ResizeBilinearTest(xla_test.XLATestCase):
 
   def _assertForwardOpMatchesExpected(self,
@@ -444,14 +555,14 @@ class ResizeBilinearTest(xla_test.XLATestCase):
       self.assertAllCloseAccordingToType(expected[np.newaxis, :, :, np.newaxis],
                                          out)
 
-  def testAlignCorners1x2To3x2(self):
+  def testAlignCorners1x2To3x3(self):
     for dtype in self.float_types:
       self._assertForwardOpMatchesExpected(
           np.array([[1, 2]], dtype=dtype), [3, 3],
           expected=np.array([[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]],
                             dtype=np.float32))
 
-  def testAlignCorners1x2To3x2Grad(self):
+  def testAlignCorners1x2To3x3Grad(self):
     for dtype in self.float_types:
       self._assertBackwardOpMatchesExpected(
           np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index d23fd125163d1afe8c7fd5e008d4b617ff4b2874..1521cc760b85b176acb27c1489640e92ef90e247 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -63,6 +63,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -80,6 +81,7 @@ int64 tf_xla_random_seed = 0;
 int32 tf_xla_test_repetitions = 20;
 int64 tf_xla_max_tensor_size = 10000LL;
 string* tf_xla_test_device_ptr;  // initial value set in main()
+string* tf_xla_reference_device_ptr;  // initial value set in main()
 bool tf_xla_test_use_jit = true;
 
 string LocalDeviceToFullDeviceName(const string& device) {
@@ -321,6 +323,9 @@ class OpTest : public ::testing::Test {
   // for use as reduction indices.
   Tensor RandomReductionIndices(int rank);
 
+  // Returns a random bit.
+  bool RandomBool();
+
   struct WindowedSpatialDims {
     Padding padding;
     std::vector<int64> kernel_dims;
@@ -453,6 +458,11 @@ std::vector<int64> OpTest::RandomDims(int min_rank, int max_rank,
   return dims;
 }
 
+bool OpTest::RandomBool() {
+  std::bernoulli_distribution d(0.5);
+  return d(generator());
+}
+
 Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
                             absl::Span<const int64> shape) {
   Tensor tensor(dtype, TensorShape(shape));
@@ -760,8 +770,22 @@ Status TensorsAreEqualImpl(const Tensor& x, const Tensor& y) {
   for (int i = 0; i < Tx.size(); ++i) {
     if (Tx(i) != Ty(i)) {
       return errors::InvalidArgument(absl::StrCat(
-          i, "-th tensor element isn't equal: ", Tx(i), " vs. ", Ty(i),
-          ". x = ", x.DebugString(), "y = ", y.DebugString()));
+          i, "-th tensor element isn't equal: ", Str(Tx(i)), " vs. ",
+          Str(Ty(i)), ". x = ", x.DebugString(), "y = ", y.DebugString()));
+    }
+  }
+  return Status::OK();
+}
+
+Status TensorsAreEqualImplBfloat16(const Tensor& x, const Tensor& y) {
+  auto Tx = x.flat<bfloat16>();
+  auto Ty = y.flat<bfloat16>();
+  for (int i = 0; i < Tx.size(); ++i) {
+    if (Tx(i) != Ty(i)) {
+      return errors::InvalidArgument(absl::StrCat(
+          i, "-th tensor element isn't equal: ", static_cast<float>(Tx(i)),
+          " vs. ", static_cast<float>(Ty(i)), ". x = ", x.DebugString(),
+          "y = ", y.DebugString()));
     }
   }
   return Status::OK();
@@ -797,6 +821,8 @@ Status TensorsAreClose(const Tensor& a, const Tensor& b, double atol,
       return TensorsAreEqualImpl<int64>(a, b);
     case DT_BOOL:
       return TensorsAreEqualImpl<bool>(a, b);
+    case DT_BFLOAT16:
+      return TensorsAreEqualImplBfloat16(a, b);
     default:
       LOG(FATAL) << "Unexpected type : " << DataTypeString(a.dtype());
   }
@@ -829,8 +855,8 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose(
     VLOG(1) << "Input: " << input_tensors.back().DebugString();
   }
 
-  string cpu_device =
-      LocalDeviceToFullDeviceName(absl::StrCat(DEVICE_CPU, ":0"));
+  string reference_device =
+      LocalDeviceToFullDeviceName(*tf_xla_reference_device_ptr);
   string test_device = LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr);
 
   DeviceNameUtils::ParsedName parsed_name;
@@ -845,9 +871,9 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose(
   std::vector<string> expected_inputs, test_inputs;
   std::vector<string> expected_fetches, test_fetches;
   Status status = builder.BuildGraph(
-      absl::StrCat("test", num_tests_, "_expected"), cpu_device,
-      /* use_jit= */ false, &graph, /* test_node_def= */ nullptr,
-      &expected_inputs, &expected_fetches);
+      absl::StrCat("test", num_tests_, "_expected"), reference_device,
+      /*use_jit=*/false, &graph, /*test_node_def=*/nullptr, &expected_inputs,
+      &expected_fetches);
   if (!status.ok()) {
     LOG(ERROR) << "Expected graph construction failed: " << status;
     return kFatalError;
@@ -1371,6 +1397,19 @@ TEST_F(OpTest, Cast) {
   });
 }
 
+TEST_F(OpTest, CastBF16) {
+  Repeatedly([this]() {
+    DataType src_type, dst_type;
+    src_type = Choose<DataType>({DT_FLOAT});
+    dst_type = Choose<DataType>({DT_BFLOAT16});
+    return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Cast")
+                                             .RandomInput(src_type)
+                                             .Attr("SrcT", src_type)
+                                             .Attr("DstT", dst_type)
+                                             .Attr("Truncate", true));
+  });
+}
+
 TEST_F(OpTest, Ceil) {
   Repeatedly([this]() {
     return ExpectTfAndXlaOutputsAreClose(
@@ -3346,11 +3385,41 @@ TEST_F(OpTest, ZerosLike) {
   });
 }
 
+// Example failing run:
+//   --tf_xla_reference_device=GPU:0
+//   --tf_xla_test_use_jit=true --tf_xla_test_device=GPU:0
+//   --tf_xla_test_repetitions=2
+//   --gunit_filter='OpTest.FusedBatchNormTraining'
+//   --tf_xla_random_seed=2838146746
+TEST_F(OpTest, FusedBatchNormTraining) {
+  bool is_nhwc = RandomBool();
+  std::vector<int64> x_dims = RandomDims(/*min_rank=*/4, /*max_rank=*/4,
+                                         /*min_size=*/5, /*max_size=*/20);
+  std::vector<int64> scale_dims = {x_dims[is_nhwc ? 3 : 1]};
+  std::vector<int64> offset_dims = {x_dims[is_nhwc ? 3 : 1]};
+  std::vector<int64> mean_dims = {0};
+  std::vector<int64> variance_dims = {0};
+  DataType type = DT_FLOAT;
+  Repeatedly([&] {
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("FusedBatchNorm")
+            .RandomInput(type, x_dims)
+            .RandomInput(type, scale_dims)
+            .RandomInput(type, offset_dims)
+            .RandomInput(type, mean_dims)
+            .RandomInput(type, variance_dims)
+            .Attr("T", type)
+            .Attr("data_format", is_nhwc ? "NHWC" : "NCHW")
+            .Attr("epsilon", static_cast<float>(1.001e-05))
+            .Attr("is_training", true));
+  });
+}
 }  // anonymous namespace
 }  // namespace tensorflow
 
 int main(int argc, char** argv) {
   tensorflow::tf_xla_test_device_ptr = new tensorflow::string("GPU:0");
+  tensorflow::tf_xla_reference_device_ptr = new tensorflow::string("CPU:0");
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag(
           "tf_xla_random_seed", &tensorflow::tf_xla_random_seed,
@@ -3366,6 +3435,9 @@ int main(int argc, char** argv) {
                        "Maximum number of elements for random input tensors."),
       tensorflow::Flag("tf_xla_test_device", tensorflow::tf_xla_test_device_ptr,
                        "Tensorflow device type to use for test"),
+      tensorflow::Flag("tf_xla_reference_device",
+                       tensorflow::tf_xla_reference_device_ptr,
+                       "Tensorflow device type to use for reference"),
       tensorflow::Flag("tf_xla_test_use_jit", &tensorflow::tf_xla_test_use_jit,
                        "Use JIT compilation for the operator under test"),
   };
diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
index 5c079d595c440cac644f5461154509abe7b1d1ed..47e0f384a4f1e46ccc35584aaff3a0aceff8a985 100644
--- a/tensorflow/compiler/tests/tensor_list_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -23,24 +23,20 @@ from tensorflow.compiler.tests import xla_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.platform import test
 
 
-def scalar_shape():
-  return ops.convert_to_tensor([], dtype=dtypes.int32)
-
-
 class ListOpsTest(xla_test.XLATestCase):
 
   def testElementShape(self):
     with self.cached_session() as sess, self.test_scope():
       dim = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=(dim, 15), num_elements=20,
-          element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(dim, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=20)
       e32 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
       e64 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int64)
       self.assertAllEqual(sess.run(e32, {dim: 10}), (10, 15))
@@ -48,25 +44,44 @@ class ListOpsTest(xla_test.XLATestCase):
 
   def testPushPop(self):
     with self.cached_session() as sess, self.test_scope():
-      num = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=(7, 15), num_elements=num, element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(7, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=10)
       l = list_ops.tensor_list_push_back(
           l, constant_op.constant(1.0, shape=(7, 15)))
       l = list_ops.tensor_list_push_back(
           l, constant_op.constant(2.0, shape=(7, 15)))
       l, e2 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       _, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-      self.assertAllEqual(sess.run(e2, {num: 10}), 2.0 * np.ones((7, 15)))
-      self.assertAllEqual(sess.run(e1, {num: 10}), 1.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e2), 2.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e1), 1.0 * np.ones((7, 15)))
+
+  def testDoNotConstantFoldVariants(self):
+    with self.cached_session() as sess, self.test_scope():
+      val = array_ops.placeholder(dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(7, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=10)
+      # Note: Pushing a Placeholder will force the constant folding code
+      # to build a Const node with a DT_VARIANT output. This tests that XLA
+      # passes a cf_consider_fn which prevent folding such nodes.
+      l = list_ops.tensor_list_push_back(
+          l, array_ops.fill(value=val, dims=(7, 15)))
+      l = list_ops.tensor_list_push_back(
+          l, constant_op.constant(2.0, shape=(7, 15)))
+      l, e2 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      _, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e2, {val: 1.0}), 2.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e1, {val: 1.0}), 1.0 * np.ones((7, 15)))
 
   def testPushPopSeparateLists(self):
     with self.cached_session() as sess, self.test_scope():
-      num = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=scalar_shape(),
-          num_elements=num,
-          element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=[],
+          element_dtype=dtypes.float32,
+          max_num_elements=20)
       l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
       l2 = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
       l3 = list_ops.tensor_list_push_back(l, constant_op.constant(3.0))
@@ -75,22 +90,95 @@ class ListOpsTest(xla_test.XLATestCase):
       l2, e22 = list_ops.tensor_list_pop_back(l2, element_dtype=dtypes.float32)
       l3, e31 = list_ops.tensor_list_pop_back(l3, element_dtype=dtypes.float32)
       l3, e32 = list_ops.tensor_list_pop_back(l3, element_dtype=dtypes.float32)
-      result = sess.run([e11, [e21, e22], [e31, e32]], {num: 20})
+      result = sess.run([e11, [e21, e22], [e31, e32]])
       self.assertEqual(result, [1.0, [2.0, 1.0], [3.0, 1.0]])
 
-  def testEmptyTensorList(self):
-    dim = 7
+  def testEmptyTensorListNoMax(self):
     with self.cached_session() as sess, self.test_scope():
-      p = array_ops.placeholder(dtypes.int32)
       l = list_ops.empty_tensor_list(
-          element_shape=(p, 15), element_dtype=dtypes.float32)
+          element_shape=(7, 15), element_dtype=dtypes.float32)
       l = list_ops.tensor_list_push_back(
-          l, constant_op.constant(1.0, shape=(dim, 15)))
+          l, constant_op.constant(1.0, shape=(7, 15)))
       _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Use TensorListReserve instead"):
-        self.assertEqual(sess.run(e, {p: dim}), 1.0 * np.ones((dim, 15)))
+                                   "Set the max number of elements"):
+        self.assertEqual(sess.run(e), 1.0 * np.ones((7, 15)))
 
+  def testEmptyTensorListMax(self):
+    with self.cached_session() as sess, self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_shape=(10, 15), element_dtype=dtypes.float32,
+          max_num_elements=2)
+      l = list_ops.tensor_list_push_back(
+          l, array_ops.fill(value=3.0, dims=(10, 15)))
+      _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e), 3.0 * np.ones((10, 15)))
+
+  def testListFromTensor(self):
+    with self.cached_session(), self.test_scope():
+      t = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e, 1.0)
+      l, e0 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 2.0)
+      l, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(e1, 1.0)
+      self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+
+  def testGetSet(self):
+    with self.cached_session(), self.test_scope():
+      t = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 1.0)
+      l = list_ops.tensor_list_set_item(l, 0, 3.0)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [3.0, 2.0])
+
+  def testGetSetReserved(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 0.0)
+      l = list_ops.tensor_list_set_item(l, 0, 3.0)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [3.0, 0.0])
+
+  def testGetSetReservedNonScalar(self):
+    with self.cached_session() as sess, self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32,
+          element_shape=(7, 15),
+          num_elements=2)
+      l = list_ops.tensor_list_set_item(
+          l, 0, constant_op.constant(1.0, shape=(7, 15)))
+      e1 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      e2 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e1), np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e2), np.zeros((7, 15)))
+
+  def testStack(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[],
+          max_num_elements=2)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e, 1.0)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t.shape.as_list(), [None])
+      self.assertAllEqual(t, [1.0, 2.0])
+
+  def testStackWithUninitializedTensors(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [0., 0., 0.])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 95c9e7ffd4651642781143c2c1940b0e51e1e470..3c2875ba477fa71e9e56a18d10efe0808533dd03 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -647,7 +647,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
           expected=np.tan(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
 
-      ctypes = {np.complex64: np.float32}
+      ctypes = {np.complex64: np.float32, np.complex128: np.float64}
       self._assertOpOutputMatchesExpected(
           math_ops.abs,
           np.array([[3 - 4j, -1j, np.inf]], dtype=dtype),
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index fcd7ac5ba1ca5049246e93e6f5f76746fb28c6b8..18c5870e0decb686f4df1c16bbb4a340c93ad21d 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -485,7 +485,7 @@ class SliceAssignTest(xla_test.XLATestCase):
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+    with self.assertRaisesRegexp(errors.FailedPreconditionError,
                                  "uninitialized variable"):
       with self.test_session() as sess, self.test_scope():
         v = resource_variable_ops.ResourceVariable([1, 2])
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a67e511826ae161e78d504c1513934065cbfd19f
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -0,0 +1,440 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
+#   and provide TensorRT operators and converter package.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_copts",
+    "tf_cuda_library",
+    "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_cuda_cc_test(
+    name = "tensorrt_test_cc",
+    size = "small",
+    srcs = ["tensorrt_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        "//tensorflow/core:gpu_init",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_custom_op_library(
+    name = "python/ops/_trt_ops.so",
+    srcs = [
+        "ops/get_serialized_resource_op.cc",
+        "ops/trt_engine_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+cc_library(
+    name = "trt_op_kernels",
+    srcs = [
+        "kernels/get_serialized_resource_op.cc",
+        "kernels/trt_engine_op.cc",
+    ],
+    hdrs = [
+        "kernels/trt_engine_op.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":test_utils",
+        ":trt_allocator",
+        ":trt_conversion",
+        ":trt_logging",
+        ":trt_plugins",
+        ":trt_resources",
+        ":utils",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:stream_executor_headers_lib",
+        "//tensorflow/core/grappler/costs:graph_properties",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+    # TODO(laigd): fix this by merging header file in cc file.
+    alwayslink = 1,  # buildozer: disable=alwayslink-with-hdrs
+)
+
+tf_cuda_cc_test(
+    name = "get_serialized_resource_op_test",
+    size = "small",
+    srcs = ["kernels/get_serialized_resource_op_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":get_serialized_resource_op_op_lib",
+        ":trt_op_kernels",
+        ":trt_resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "trt_engine_op",
+        "get_serialized_resource_op",
+    ],
+)
+
+tf_cuda_library(
+    name = "trt_logging",
+    srcs = ["utils/trt_logger.cc"],
+    hdrs = ["utils/trt_logger.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_gen_op_wrapper_py(
+    name = "trt_ops",
+    deps = [
+        ":get_serialized_resource_op_op_lib",
+        ":trt_engine_op_op_lib",
+        ":trt_logging",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "trt_ops_loader",
+    srcs = ["python/ops/trt_ops.py"],
+    dso = [
+        "python/ops/_trt_ops.so",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+    kernels = [
+        ":trt_op_kernels",
+        ":trt_engine_op_op_lib",
+        ":get_serialized_resource_op_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_cuda_library(
+    name = "trt_resources",
+    srcs = [
+        "utils/trt_int8_calibrator.cc",
+        "utils/trt_resource_manager.cc",
+        "utils/trt_resources.cc",
+    ],
+    hdrs = [
+        "utils/trt_int8_calibrator.h",
+        "utils/trt_lru_cache.h",
+        "utils/trt_resource_manager.h",
+        "utils/trt_resources.h",
+    ],
+    deps = [
+        ":trt_allocator",
+        ":trt_logging",
+        ":utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_library(
+    name = "trt_allocator",
+    srcs = ["utils/trt_allocator.cc"],
+    hdrs = ["utils/trt_allocator.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cc_test(
+    name = "trt_allocator_test",
+    size = "small",
+    srcs = ["utils/trt_allocator_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_allocator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "trt_lru_cache_test",
+    size = "small",
+    srcs = ["utils/trt_lru_cache_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_resources",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# Library for the node-level conversion portion of TensorRT operation creation
+tf_cuda_library(
+    name = "trt_conversion",
+    srcs = [
+        "convert/convert_graph.cc",
+        "convert/convert_nodes.cc",
+        "convert/trt_optimization_pass.cc",
+    ],
+    hdrs = [
+        "convert/convert_graph.h",
+        "convert/convert_nodes.h",
+        "convert/trt_optimization_pass.h",
+    ],
+    deps = [
+        ":segment",
+        ":test_utils",
+        ":trt_allocator",
+        ":trt_plugins",
+        ":trt_logging",
+        ":trt_resources",
+        ":utils",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+tf_cuda_cc_test(
+    name = "convert_graph_test",
+    size = "medium",
+    srcs = ["convert/convert_graph_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_conversion",
+        "@com_google_googletest//:gtest",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "convert_nodes_test",
+    size = "medium",
+    srcs = ["convert/convert_nodes_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_logging",
+        ":trt_conversion",
+        ":trt_plugins",
+        "@com_google_googletest//:gtest",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+# Library for the segmenting portion of TensorRT operation creation
+cc_library(
+    name = "segment",
+    srcs = ["segment/segment.cc"],
+    hdrs = [
+        "segment/segment.h",
+        "segment/union_find.h",
+    ],
+    deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cc_test(
+    name = "segment_test",
+    size = "small",
+    srcs = ["segment/segment_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":segment",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Library for the plugin factory
+tf_cuda_library(
+    name = "trt_plugins",
+    srcs = [
+        "plugin/trt_plugin.cc",
+        "plugin/trt_plugin_factory.cc",
+        "plugin/trt_plugin_utils.cc",
+    ],
+    hdrs = [
+        "plugin/trt_plugin.h",
+        "plugin/trt_plugin_factory.h",
+        "plugin/trt_plugin_utils.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "trt_plugin_factory_test",
+    size = "small",
+    srcs = ["plugin/trt_plugin_factory_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_plugins",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+cc_library(
+    name = "utils",
+    srcs = ["convert/utils.cc"],
+    hdrs = ["convert/utils.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "test_utils",
+    srcs = ["utils/test_utils.cc"],
+    hdrs = ["utils/test_utils.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/convert/convert_graph.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index bf2de94e04ae3f6817f7a679ce9fd88e750827dd..1fdb099cc1d658b4259177e357b639ea72d636d0 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 
 #include <fstream>
 #include <list>
@@ -24,13 +24,14 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
@@ -63,8 +64,8 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 // Returns compiled TRT version information {Maj, Min, Patch}
 std::vector<int> GetLinkedTensorRTVersion() {
@@ -151,7 +152,7 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) {
     is_supported_op_type = true;
   }
-  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
+  // LINT.ThenChange(//tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc)
   if (!is_supported_op_type) {
     return errors::Unimplemented("Op type ", node->type_string(),
                                  " is not supported");
@@ -334,13 +335,12 @@ struct EdgePtrCompare {
 tensorflow::Status GetEngineInfo(
     const tensorflow::Graph* g,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& segment_nodes,
+    const std::set<const Node*>& segment_nodes,
     const std::unordered_map<string, tensorflow::Node*>& node_map,
     const std::vector<tensorflow::Node*>& reverse_topo_order,
     EngineInfo* info) {
-  std::vector<int> subgraph_node_ids;  // Topologically sorted node ids.
-  std::set<string> subgraph_node_names = segment_nodes;
-  std::set<int> added_const_node_ids;  // Used to prevent double insertion.
+  std::vector<const Node*> subgraph_nodes;  // Topologically sorted nodes.
+  std::set<const Node*> added_const_nodes;  // Used to prevent double insertion.
   std::set<string> segment_devices;
 
   // Map from src_node_name+port to the unique port numbers of the TRT op, where
@@ -352,22 +352,37 @@ tensorflow::Status GetEngineInfo(
   std::unordered_map<string, int> input_to_engine_port, output_to_engine_port;
   for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
        ++it) {
-    const auto& node_name = (*it)->name();
-    if (segment_nodes.count(node_name) == 0) continue;
-    auto node = *it;
+    const Node* node = *it;
+    if (segment_nodes.count(node) == 0) continue;
     auto node_device = node->requested_device();
     if (!node_device.empty()) {
-      segment_devices.insert(node_device);
+      // If device is CPU, treat as if no device was assigned. Don't add CPU to
+      // segment_device because that would cause a segfault in
+      // GetDeviceAndAllocator. This is because GetDeviceAndAllocator assumes
+      // any already set device is a GPU.
+      DeviceNameUtils::ParsedName parsed_name;
+      DeviceNameUtils::ParseFullName(node_device, &parsed_name);
+      if (parsed_name.type == "CPU") {
+        VLOG(1) << "Node " << node->name() << " was assigned to the CPU. "
+                << "Attempting to place on GPU.";
+      } else {
+        segment_devices.insert(node_device);
+      }
     } else {
       if (node->has_assigned_device_name()) {
+        // It appears that nodes will not have assigned devices at this point in
+        // execution.
         segment_devices.insert(node->assigned_device_name());
       } else {
         VLOG(2) << "Node " << node->name()
                 << " neither have requested device nor assigned device";
       }
     }
+    subgraph_nodes.push_back(node);
+
     const int node_id = node->id();
-    subgraph_node_ids.push_back(node_id);
+    const string& node_name = node->name();
+
     // Create input connections. Sort edges first to make determnistic since
     // in_edges is a set of pointers.
     std::vector<const tensorflow::Edge*> in_edges(node->in_edges().begin(),
@@ -375,7 +390,7 @@ tensorflow::Status GetEngineInfo(
     std::sort(in_edges.begin(), in_edges.end(), EdgePtrCompare());
     for (const auto edge : in_edges) {
       auto input_node = edge->src();
-      if (input_node->IsSource() || segment_nodes.count(input_node->name())) {
+      if (input_node->IsSource() || segment_nodes.count(input_node)) {
         continue;
       }
       if (edge->IsControlEdge()) {
@@ -392,12 +407,11 @@ tensorflow::Status GetEngineInfo(
         //
         // Note that the segmenter already ensure that the constant data input
         // is valid and suppported by the engine.
-        if (!added_const_node_ids.insert(input_node->id()).second) {
+        if (!added_const_nodes.insert(input_node).second) {
           // Already added before.
           continue;
         }
         VLOG(1) << "Adding const node " << input_node->name();
-        QCHECK(subgraph_node_names.insert(input_node->name()).second);
         // Since we already add (duplicate) the const input node to the segment
         // graphdef, it's now not a data dependency any more, but to make the
         // dependency correct we still add a control dependency.
@@ -428,7 +442,7 @@ tensorflow::Status GetEngineInfo(
     std::sort(out_edges.begin(), out_edges.end(), EdgePtrCompare());
     for (const auto edge : out_edges) {
       auto output_node = edge->dst();
-      if (output_node->IsSink() || segment_nodes.count(output_node->name())) {
+      if (output_node->IsSink() || segment_nodes.count(output_node)) {
         continue;
       }
       if (edge->IsControlEdge()) {
@@ -456,12 +470,11 @@ tensorflow::Status GetEngineInfo(
   }  // For each segment node in topological order.
 
   // Construct the const nodes first.
-  subgraph_node_ids.insert(subgraph_node_ids.begin(),
-                           added_const_node_ids.begin(),
-                           added_const_node_ids.end());
+  subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(),
+                        added_const_nodes.end());
   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
-      g, graph_properties, subgraph_node_names, subgraph_node_ids,
-      &info->connections, &info->segment_graph_def, &info->engine_name));
+      g, graph_properties, subgraph_nodes, &info->connections,
+      &info->segment_graph_def, &info->engine_name));
   // TODO(sami): This should not happen once segmenter is updated.
   if (segment_devices.size() == 1) {
     info->device = *segment_devices.begin();
@@ -654,14 +667,8 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     segment_string = info.segment_graph_def.SerializeAsString();
   }
 
-  // TODO(aaroey): use enum instead, and add a helper method to do the
-  // conversion.
   string prec_string;
   TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
-  if (info.precision_mode == INT8MODE && calibrate_int8 &&
-      !TRTResourceManager::instance()->getManager("TRTCalibration")) {
-    LOG(ERROR) << "Failed to construct calibration storage";
-  }
   tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
   if (!info.device.empty()) node_builder.Device(info.device);
   if (VLOG_IS_ON(1)) {
@@ -677,7 +684,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   }
 
   if (info.engine_type == EngineInfo::EngineType::TRTStatic &&
-      info.cached_engine_batches.size()) {
+      !info.cached_engine_batches.empty()) {
     LOG(WARNING) << "Cached engine batches are ignored for static engines";
   }
   tensorflow::NodeDef trt_node;
@@ -691,7 +698,6 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
           .Attr("serialized_segment", segment_string)
           .Attr("calibration_data", "")
           .Attr("max_cached_engines_count", info.maximum_cached_engines)
-          .Attr("cached_engine_batches", {max_batch_size})
           .Attr("workspace_size_bytes", info.max_workspace_size_bytes)
           .Attr("precision_mode", prec_string)
           .Attr("use_calibration", info.use_calibration)
@@ -1033,27 +1039,31 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     cudaSetDevice(cuda_device_id);
     auto status = CreateTRTNode(engine_segments, i, params.max_batch_size,
                                 &graph, alloc.get(), &engine_nodes);
-    // If status is ok, we successfully added the node to the graph and can
-    // remove segment ops. Otherwise graph is not modified.
+
     string msg = StrCat("TensorRT node ", engine.engine_name,
                         " added for segment ", i, " consisting of ",
                         converted_segments.at(i).first.size(), " nodes");
     if (status.ok()) {
       LOG(INFO) << msg << " succeeded.";
-      for (auto node_name : converted_segments.at(i).first) {
-        graph.RemoveNode(node_map.at(node_name));
-      }
     } else {
       // Graph is not modified.
       LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF...";
     }
     if (VLOG_IS_ON(1)) {
       msg = "Segment consists of nodes: ";
-      for (const string& node_name : converted_segments.at(i).first) {
-        StrAppend(&msg, node_name, ", ");
+      for (const Node* node : converted_segments.at(i).first) {
+        StrAppend(&msg, node->name(), ", ");
       }
       VLOG(1) << msg;
     }
+
+    // If status is ok, we successfully added the node to the graph and can
+    // remove segment ops. Otherwise graph is not modified.
+    if (status.ok()) {
+      for (const Node* node : converted_segments.at(i).first) {
+        graph.RemoveNode(const_cast<Node*>(node));
+      }
+    }
   }
   cudaSetDevice(old_cuda_device);
   graph.ToGraphDef(params.output_graph_def);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
similarity index 94%
rename from tensorflow/contrib/tensorrt/convert/convert_graph.h
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
index 1f39f56f6392ba33af3d74fec12c326ed4451cb6..fb82a430c632781047487a280e23e7da4c385929 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -123,4 +123,4 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
similarity index 98%
rename from tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
index 2d2bfeb192c1893824c7b30bfad593c62c203392..a3c3a8ac6561259c974aebb6c6eeac05c71b7161 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index adf8831b960172fc29b5d631e5b0533318d4764d..c08582a42e24fd55e785ad045725e06f1d414bfd 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 
 #include <algorithm>
 #include <cstring>
@@ -24,11 +24,12 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"        // NOLINT
@@ -43,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -81,9 +83,9 @@ const char* const kInputPHName = "TensorRTInputPH_";
 const char* const kOutputPHName = "TensorRTOutputPH_";
 
 namespace convert {
+using absl::StrAppend;
+using absl::StrCat;
 using ::tensorflow::str_util::Split;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
 
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
                                        nvinfer1::DataType* trt_dtype) {
@@ -334,6 +336,21 @@ Status Converter::GetTrtBroadcastShape(
   return Status::OK();
 }
 
+nvinfer1::ITensor* Converter::CreateConstantLayer(
+    const TRT_ShapedWeights& weights, const nvinfer1::Dims& dims) {
+  nvinfer1::Weights trt_weights = weights.GetTrtWeights();
+  nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights);
+  if (!layer) return nullptr;
+  const nvinfer1::DataType trt_dtype = trt_weights.type;
+  nvinfer1::ITensor* trt_tensor = layer->getOutput(0);
+  // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set
+  // the data type below, it will always be kFLOAT regardless what the data type
+  // of the weights is. Once NVIDIA fixes this bug, we should remove the data
+  // type setting logic below and test should still pass.
+  trt_tensor->setType(trt_dtype);
+  return trt_tensor;
+}
+
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
                       const nvinfer1::Dims& dim_r) {
   if (dim_l.nbDims != dim_r.nbDims) {
@@ -879,6 +896,8 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
     // We need to check the name before setting it. If the input is one of the
     // engine input, setting the name here will overwrite engine input
     // bindings which will cause runtime error.
+    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
+    // in ConvertIdentity.
     if (output.is_tensor()) {
       const char* tensor_name = output.tensor()->getName();
       if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) {
@@ -939,6 +958,22 @@ Status Converter::RenameAndMarkOutputTensors(
     if (tensor == nullptr) {
       return errors::NotFound("Output tensor not found: ", output.first);
     }
+    // Check if this tensor has already been marked as an output.
+    // ConvertIdentity can cause the same tensor to be repeated in
+    // output_tensors, which can cause us to overwrite the name of the output
+    // tensor binding. For example, if we rename OutputPH_0 to OutputPH_1 then
+    // we won't be able to locate OutputPH_0 during runtime. To fix this,
+    // duplicate the tensor using no-op shuffle.
+    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
+    // in ConvertIdentity.
+    if (tensorflow::str_util::StartsWith(tensor->getName(), kOutputPHName)) {
+      // Using shuffle layer for identity by not setting reshape or transpose.
+      nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor);
+      TFTRT_RETURN_ERROR_IF_NULLPTR(
+          layer, StrCat("Output Copy for ", tensor->getName()));
+      MarkQuantizationRangesAsInferrable(tensor, layer->getOutput(0));
+      tensor = layer->getOutput(0);
+    }
     tensor->setName(output.second.c_str());
     VLOG(1) << "Marking output tensor " << output.first << ", as output tensor "
             << output.second;
@@ -1086,10 +1121,8 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
       *tensor = layer->getOutput(0);
     }
   } else {
-    nvinfer1::IConstantLayer* layer =
-        this->network()->addConstant(dims, input.weights().GetTrtWeights());
-    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
-    *tensor = layer->getOutput(0);
+    *tensor = CreateConstantLayer(input.weights(), dims);
+    TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape");
     if (precision_mode() == INT8MODE && !use_calibration()) {
       // If we are in int8 mode and not calibrating, we need to explicitly set a
       // quantization range for the output tensor of the IConstantLayer. Here we
@@ -1538,6 +1571,11 @@ enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  if (inputs.size() != 2) {
+    return tensorflow::errors::InvalidArgument("Two inputs are expected for ",
+                                               node_def.op(), ", at ",
+                                               node_def.name());
+  }
   if (inputs.at(0).is_weights()) {
     return tensorflow::errors::Unimplemented(
         node_def.op(), " is only implemented for tensors, not weights, at ",
@@ -1549,39 +1587,61 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
                                              node_def.name());
   }
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  VLOG(2) << "weight shape: " << weights_rsck.DebugString();
   if (weights_rsck.shape_.nbDims != 4) {
-    return tensorflow::errors::Internal(
-        "Conv2D expects kernel of dimension 4, at: " + node_def.name());
+    return tensorflow::errors::InvalidArgument(
+        "Conv2D expects kernel of dimension 4, at " + node_def.name());
   }
+  TFAttrs attrs(node_def);
+  auto data_format = attrs.get<string>("data_format");
+  int c_index = (data_format == "NHWC") ? 3 : 1;
+  int h_index = (data_format == "NHWC") ? 1 : 2;
+  int w_index = (data_format == "NHWC") ? 2 : 3;
+  auto tf_dilations = attrs.get<std::vector<int>>("dilations");
+  if (tf_dilations.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Convolution dilations field must specify 4 dimensions, at ",
+        node_def.name());
+  }
+  if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Dilation rate must be 1 for batch and channel dimensions, at ",
+        node_def.name());
+  }
+  const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
+
+  const auto tf_stride = attrs.get<std::vector<int>>("strides");
+  if (tf_stride.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Convolution strides field must specify 4 dimensions, at ",
+        node_def.name());
+  }
+  if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Stride must be 1 for batch and channel dimensions, at ",
+        node_def.name());
+  }
+  const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
   if (params->validation_only) return tensorflow::Status::OK();
 
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  TFAttrs attrs(node_def);
 
-  int h_index = 2;
-  int w_index = 3;
-  auto data_format = attrs.get<string>("data_format");
-  if (data_format == "NHWC") {
+  // Transpose to NCHW (NCHW is required for IConvLayer).
+  const bool need_transpose = (data_format == "NHWC");
+  if (need_transpose) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
-    h_index = 1;
-    w_index = 2;
-    // TODO(jie): transpose it
   }
-
-  // tensor after transpose (NCHW)
+  // Dimensions of transposed tensor.
   const auto tensor_dim = tensor->getDimensions();
 
-  int num_groups = group;
-  if (num_groups == 0) num_groups = tensor_dim.d[0];  // depthwise convolution
-  VLOG(2) << "groups count: " << num_groups;
+  // For depthwise convolution, group will be 0 so set num_groups to size of
+  // input's channel dim. For a non-depthwise conv, num_groups will be 1.
+  const int num_groups = (group == 0) ? tensor_dim.d[0] : group;
 
   if (params->converter->precision_mode() == FP16MODE) {
     weights_rsck =
         ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
   }
-
   TRT_ShapedWeights weights =
       params->weight_store->GetTempWeights(weights_rsck);
   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
@@ -1590,35 +1650,22 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   nvinfer1::DimsHW kernel_size;
   kernel_size.h() = weights.shape_.d[2];
   kernel_size.w() = weights.shape_.d[3];
-  VLOG(2) << "RSCK: " << weights.DebugString();
-  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
-
-  // TODO(jie): stride. (NHWC/NCHW)
-  const auto tf_stride = attrs.get<std::vector<int>>("strides");
-  VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
-  VLOG(2) << "stride: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
-          << tf_stride[3];
-  const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
+  // Add padding.
   std::vector<std::pair<int, int>> padding;
-  // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
-    // This is NCHW tensor with no batch dimension.
-    //  1 -> h
-    //  2 -> w
+    nvinfer1::DimsHW effective_kernel_size = kernel_size;
+    effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1);
+    effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1);
     padding = CreateSamePadding(
-        stride, kernel_size,
+        stride, effective_kernel_size,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else {
     padding = {{0, 0}, {0, 0}};
   }
-
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
-    // TODO(jie): handle asymmetric padding
-    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
-            << padding[1].first << padding[1].second;
-    VLOG(2) << "TENSOR before: " << DebugString(tensor->getDimensions());
+    // Handle asymmetric padding.
     auto pad_layer = params->converter->network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1628,24 +1675,23 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
         const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
-    VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions());
   }
 
+  // Add convolution.
   nvinfer1::IConvolutionLayer* layer =
       params->converter->network()->addConvolution(
           *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
           weights.GetTrtWeights(), biases.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
   layer->setNbGroups(num_groups);
+  layer->setDilation(dilation);
   const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  VLOG(2) << "TENSOR out: " << DebugString(output_tensor->getDimensions());
-  VLOG(2) << "data_format: " << data_format;
-  if (data_format == "NHWC") {
-    // TODO(jie): transpose it back!
+
+  // Restore transpose.
+  if (need_transpose) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), {0, 2, 3, 1},
         &output_tensor));
@@ -1694,6 +1740,13 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
         "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ",
         status.error_message());
   }
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+  if (dtype == nvinfer1::DataType::kINT32) {
+    return errors::Unimplemented("Binary op ", node_def.op(),
+                                 " does not support INT32, at ",
+                                 node_def.name());
+  }
   if (params->validation_only) return Status::OK();
 
   const nvinfer1::ITensor* tensor_l = nullptr;
@@ -1710,8 +1763,6 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
   }
 
   // Check type consistency.
-  TFAttrs attrs(node_def);
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
   TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype)
       << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype);
   TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype)
@@ -2534,22 +2585,18 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   auto weights_ptr =
       static_cast<float*>(const_cast<void*>(weights.GetValues()));
   weights_ptr[0] = 6.0f;
-  nvinfer1::IConstantLayer* const6_layer =
-      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
-  TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name());
-  params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f,
-                                              6.0f);
+  nvinfer1::ITensor* const6_tensor =
+      params->converter->CreateConstantLayer(weights, dims);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const6_tensor, node_def.name());
+  params->converter->ProvideQuantizationRange(const6_tensor, 0.0f, 6.0f);
 
   // ElementWise Min Operation
   // Min op is a nop for INT8 execution path, as the input tensor
   // to this layer will only have values in range [0.f, 6.0f].
-  const nvinfer1::ITensor* tensor_l = relu_layer->getOutput(0);
-  const nvinfer1::ITensor* tensor_r = const6_layer->getOutput(0);
   nvinfer1::IElementWiseLayer* relu6_layer =
       params->converter->network()->addElementWise(
-          *const_cast<nvinfer1::ITensor*>(tensor_l),
-          *const_cast<nvinfer1::ITensor*>(tensor_r),
-          nvinfer1::ElementWiseOperation::kMIN);
+          *const_cast<nvinfer1::ITensor*>(relu_layer->getOutput(0)),
+          *const6_tensor, nvinfer1::ElementWiseOperation::kMIN);
   TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name());
   nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0);
   params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f);
@@ -2566,12 +2613,18 @@ tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
     return errors::InvalidArgument("Input expects tensor and weights, at ",
                                    node_def.name());
   }
+  TFAttrs attrs(node_def);
+  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
+  if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
+    return errors::Unimplemented("Data type is not supported, for node ",
+                                 node_def.name(), " got ",
+                                 DataTypeString(tf_dtype));
+  }
   if (params->validation_only) return Status::OK();
 
   nvinfer1::ITensor* tensor =
       const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
   const nvinfer1::Dims original_dims = tensor->getDimensions();
-  TFAttrs attrs(node_def);
   const string data_format = attrs.get<string>("data_format");
   const int channel_index =
       (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
@@ -2661,43 +2714,69 @@ tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   return Status::OK();
 }
 
-Status GetTensorDimsWithProtoShape(const Tensor& tensor,
-                                   int tensor_proto_array_len,
-                                   nvinfer1::Dims* dims) {
+void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) {
   if (tensor.dims() > 0) {
     *dims = GetTrtDimsForTensor(tensor);
-    if (TrtDimsNumElements(*dims) != tensor_proto_array_len &&
-        tensor_proto_array_len != 1) {
-      return errors::InvalidArgument(
-          "Broadcast on weights only supports kCHANNEL and kUNIFORM");
-    }
   } else {
     dims->nbDims = 1;
     // No dimension provided. Flatten it.
-    dims->d[0] = tensor_proto_array_len;
+    dims->d[0] = tensor.NumElements();
     dims->type[0] = nvinfer1::DimensionType::kSPATIAL;
     for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) {
       dims->d[i] = 0;
     }
   }
-  return Status::OK();
 }
 
-template <typename CType>
-Status TfTensorToTrtWeights(const DataType dtype, const Tensor& tensor,
-                            const CType* tensor_proto_array,
-                            int tensor_proto_array_len, TrtWeightStore* store,
+Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store,
                             TRT_ShapedWeights* weights) {
+  const DataType dtype = tensor.dtype();
+
+  // We always convert the integer constants to INT32, since TRT INT8 is for
+  // quantized inference.
+  //
+  // TODO(aaroey): FP16 will remain in half format and is not converted to
+  // FP32, but the converter currently uses all float weights as FP32. Fix
+  // this.
+  const DataType converted_dtype =
+      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
+                                                                  : dtype);
+
+  // Verify that the dtype is supported by TensorRT. Otherwise, return an error.
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
+
+  if (tensor.NumElements() == 0) {
+    // Return empty weights having converted dtype.
+    *weights = TRT_ShapedWeights(converted_dtype);
+    return Status::OK();
+  }
+
   nvinfer1::Dims weight_dims;
-  TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(tensor, tensor_proto_array_len,
-                                                 &weight_dims));
-  *weights = store->GetTempWeights(dtype, weight_dims);
-  void* dst = const_cast<void*>(weights->GetValues());
-  if (tensor_proto_array_len == 1) {
-    std::fill_n((CType*)dst, TrtDimsNumElements(weight_dims),
-                *tensor_proto_array);
+  GetTensorDimsWithProtoShape(tensor, &weight_dims);
+  *weights = weight_store->GetTempWeights(converted_dtype, weight_dims);
+
+  // Copy the tensor directly if the tensor does not require cast to the
+  // supported type.
+  if (converted_dtype == dtype) {
+    char* dst = static_cast<char*>(const_cast<void*>(weights->GetValues()));
+    memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes());
+    return Status::OK();
+  }
+
+  // Copy tensor elements after casting them to the converted DataType.
+  int32* dst = static_cast<int32*>(const_cast<void*>(weights->GetValues()));
+  if (dtype == DT_INT16) {
+    const int16* src = tensor.flat<int16>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
+  } else if (dtype == DT_INT8) {
+    const int8* src = tensor.flat<int8>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
   } else {
-    memcpy(dst, tensor_proto_array, weights->size_bytes());
+    // dtype can only be DT_UINT8 at this point.
+    TFTRT_CHECK_EQ_TYPE(dtype, DT_UINT8);
+    const uint8* src = tensor.flat<uint8>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
   }
   return Status::OK();
 }
@@ -2715,15 +2794,6 @@ tensorflow::Status ConvertConst(OpConverterParams* params) {
         "Constant node is expected to have empty input list: ",
         node_def.name());
   }
-  TFAttrs attrs(node_def);
-  const DataType dtype = attrs.get<tensorflow::DataType>("dtype");
-  // We always convert the integer constants to kINT32, since TRT kINT8 is for
-  // quantized inference.
-  const DataType converted_dtype =
-      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
-                                                                  : dtype);
-  nvinfer1::DataType trt_dtype;
-  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
 
   // Create shaped weights as output
   const auto& tensor_proto = node_def.attr().at("value").tensor();
@@ -2733,78 +2803,18 @@ tensorflow::Status ConvertConst(OpConverterParams* params) {
                                         node_def.name());
   }
 
-  TRT_ShapedWeights weights(converted_dtype);
-  if (tensor.NumElements() == 0) {
-    // Do nothing.
-  } else if (!tensor_proto.float_val().empty()) {
-    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
-        converted_dtype, tensor, tensor_proto.float_val().begin(),
-        tensor_proto.float_val_size(), params->weight_store, &weights));
-  } else if (!tensor_proto.int_val().empty()) {
-    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
-        converted_dtype, tensor, tensor_proto.int_val().begin(),
-        tensor_proto.int_val_size(), params->weight_store, &weights));
-  } else if (!tensor_proto.half_val().empty()) {
-    // TODO(aaroey): implement fp16 conversion.
-    return errors::Unimplemented("fp16 constant is not supported yet.");
-  } else if (!tensor_proto.tensor_content().empty()) {
-    // TODO(aaroey): fp16 will remain in half format and is not converted to
-    // fp32, but the converter currently uses all float weights as fp32. Fix
-    // this.
-    const auto& content = tensor_proto.tensor_content();
-    if (content.size() > 0) {
-      const int dtype_size = tensorflow::DataTypeSize(dtype);
-      if (content.size() % dtype_size != 0) {
-        return errors::FailedPrecondition("Tensor content size ",
-                                          content.size(),
-                                          " is not a multiple of ", dtype_size);
-      }
-      nvinfer1::Dims weights_dim;
-      TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(
-          tensor, content.size() / dtype_size, &weights_dim));
-      const int64_t size_bytes = TrtDimsNumElements(weights_dim) * dtype_size;
-      if (content.size() != size_bytes) {
-        return errors::FailedPrecondition(
-            "Tensor size and TensorProto content size mismatch: ", size_bytes,
-            " vs ", content.size());
-      } else if (tensor.NumElements() != content.size() / dtype_size) {
-        return errors::FailedPrecondition(
-            "Tensor elements count and TensorProto content size mismatch: ",
-            tensor.NumElements(), " vs ", content.size() / dtype_size);
-      }
-      weights =
-          params->weight_store->GetTempWeights(converted_dtype, weights_dim);
-      if (dtype_size == tensorflow::DataTypeSize(converted_dtype)) {
-        port::CopyToArray(content, static_cast<char*>(
-                                       const_cast<void*>(weights.GetValues())));
-      } else {
-        // Copy out the weights as original data type.
-        std::vector<uint8_t> temp_weights(content.size());
-        port::CopyToArray(content,
-                          reinterpret_cast<char*>(temp_weights.data()));
-        int32* dst =
-            static_cast<int32*>(const_cast<void*>(weights.GetValues()));
-        // Copy to the weight store as converted data type.
-        if (dtype == DT_INT16) {
-          int16* data = reinterpret_cast<int16*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else if (dtype == DT_INT8) {
-          int8* data = reinterpret_cast<int8*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else if (dtype == DT_UINT8) {
-          uint8* data = reinterpret_cast<uint8*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else {
-          return errors::FailedPrecondition(
-              "Unexpected data type: ", DataTypeString(dtype),
-              " at: ", node_def.name());
-        }
-      }
-    }
-  } else {
-    return errors::Unimplemented("Not supported constant type, at ",
-                                 node_def.name());
+  TFAttrs attrs(node_def);
+  const DataType dtype = attrs.get<tensorflow::DataType>("dtype");
+  if (dtype != tensor.dtype()) {
+    return errors::InvalidArgument("DataType mismatch between attr (",
+                                   DataTypeString(dtype), ") and tensor (",
+                                   DataTypeString(tensor.dtype()), ")");
   }
+
+  TRT_ShapedWeights weights;
+  TF_RETURN_IF_ERROR(
+      TfTensorToTrtWeights(tensor, params->weight_store, &weights));
+
   if (params->outputs != nullptr) {
     params->outputs->push_back(TRT_TensorOrWeights(weights));
   }
@@ -2947,18 +2957,15 @@ tensorflow::Status ConvertSquare(OpConverterParams* params) {
   auto weights_ptr =
       static_cast<float*>(const_cast<void*>(weights.GetValues()));
   weights_ptr[0] = 2.f;
-  nvinfer1::IConstantLayer* const2_layer =
-      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
-  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_layer, node_def.name());
+  nvinfer1::ITensor* const2_tensor =
+      params->converter->CreateConstantLayer(weights, dims);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_tensor, node_def.name());
 
   // ElementWise Pow Operation
-  const nvinfer1::ITensor* tensor_l = inputs.at(0).tensor();
-  const nvinfer1::ITensor* tensor_r = const2_layer->getOutput(0);
   nvinfer1::IElementWiseLayer* layer =
       params->converter->network()->addElementWise(
-          *const_cast<nvinfer1::ITensor*>(tensor_l),
-          *const_cast<nvinfer1::ITensor*>(tensor_r),
-          nvinfer1::ElementWiseOperation::kPOW);
+          *const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()),
+          *const2_tensor, nvinfer1::ElementWiseOperation::kPOW);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
@@ -3418,7 +3425,6 @@ tensorflow::Status ConvertMatMul(OpConverterParams* params) {
   }
 
   TFAttrs attrs(node_def);
-  // TODO(jie): INT32 should be converted?
   tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
   if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
     return errors::Unimplemented("Data type is not supported, for node ",
@@ -3444,7 +3450,6 @@ tensorflow::Status ConvertBatchMatMul(OpConverterParams* params) {
   const auto& node_def = params->node_def;
   TFAttrs attrs(node_def);
 
-  // TODO(jie): INT32 should be converted?
   tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
   if (tf_dtype != tensorflow::DataType::DT_FLOAT &&
       tf_dtype != tensorflow::DataType::DT_HALF) {
@@ -3566,6 +3571,9 @@ tensorflow::Status ConvertTopK(OpConverterParams* params) {
 
   nvinfer1::ITensor* output_value_tensor = layer->getOutput(0);
   nvinfer1::ITensor* output_indices_tensor = layer->getOutput(1);
+  // Tensor type for network output is not inferred. Indices should be INT32
+  // (default is float).
+  output_indices_tensor->setType(nvinfer1::DataType::kINT32);
   params->outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
   params->outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
   return tensorflow::Status::OK();
@@ -3686,7 +3694,7 @@ tensorflow::Status ConvertGraphDefToEngine(
     if (tensorflow::str_util::StartsWith(node_name, kInputPHName) &&
         (node_def.op() == "Placeholder")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(
+      if (!tensorflow::strings::safe_strto32(  // non-absl ok
               node_name.c_str() + strlen(kInputPHName), &slot_number)) {
         return tensorflow::errors::InvalidArgument(
             "Failed to parse slot number from ", node_name);
@@ -3715,7 +3723,7 @@ tensorflow::Status ConvertGraphDefToEngine(
     } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
                (node_def.op() == "Identity")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(
+      if (!tensorflow::strings::safe_strto32(  // non-absl ok
               node_name.c_str() + strlen(kOutputPHName), &slot_number)) {
         return tensorflow::errors::InvalidArgument(
             "Failed to parse slot number from ", node_name);
@@ -3749,8 +3757,7 @@ tensorflow::Status ConvertGraphDefToEngine(
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& subgraph_node_names,
-    const std::vector<int>& subgraph_node_ids,  // In topological order
+    const std::vector<const Node*>& subgraph_nodes,  // In topological order
     std::vector<EngineConnection>* connections,
     tensorflow::GraphDef* segment_def, string* common_scope) {
   std::set<string> marker_nodes;
@@ -3813,8 +3820,10 @@ tensorflow::Status ConvertSegmentToGraphDef(
       marker_nodes.insert(node_name);
       auto seg_node = segment_def->add_node();
       tensorflow::NodeDefBuilder builder(node_name, "Identity");
-      auto status = builder.Input(connection.inside_node_name, 0, dtype)
-                        .Finalize(seg_node);
+      auto status =
+          builder
+              .Input(connection.inside_node_name, connection.inside_port, dtype)
+              .Finalize(seg_node);
       VLOG(1) << "Constructing output " << node_name << " for the edge "
               << connection.inside_node_name << ":" << connection.inside_port
               << " -> " << connection.outside_node_name << ":"
@@ -3824,11 +3833,10 @@ tensorflow::Status ConvertSegmentToGraphDef(
 
   std::unordered_map<int, int> old_to_new_id_map;
   // Copy internal nodes to new graphdef
-  string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name();
-  for (const auto node_id : subgraph_node_ids) {
-    const auto node = graph->FindNodeId(node_id);
+  string local_scope = subgraph_nodes.front()->name();
+  for (const Node* node : subgraph_nodes) {
     local_scope = GetCommonNameScope(local_scope, node->name());
-    old_to_new_id_map[node_id] = segment_def->node_size();
+    old_to_new_id_map[node->id()] = segment_def->node_size();
     auto snode = segment_def->add_node();
     snode->CopyFrom(node->def());
     VLOG(2) << "Copying " << snode->name() << " to subgraph";
@@ -3846,6 +3854,11 @@ tensorflow::Status ConvertSegmentToGraphDef(
             << placeholder_name;
     snode->set_input(connection.inside_port, placeholder_name);
   }
+  std::set<string> subgraph_node_names;
+  for (const Node* node : subgraph_nodes) {
+    subgraph_node_names.insert(node->name());
+  }
+
   // Remove control inputs that are not inside the segment.
   for (int i = 0; i < segment_def->node_size(); ++i) {
     auto snode = segment_def->mutable_node(i);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
similarity index 94%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes.h
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index 54e19b73957bccdae2b23bd3556de9ad00b864e5..aebc0ca38de449dd716b3948f9a0b2e581fc8c80 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
 
 #include <set>
 #include <string>
@@ -22,11 +22,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -128,8 +128,7 @@ struct EngineInfo {
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& subgraph_node_names,
-    const std::vector<int>& subgraph_node_ids,
+    const std::vector<const Node*>& subgraph_nodes,
     std::vector<EngineConnection>* connections,
     tensorflow::GraphDef* segment_def, string* common_scope);
 
@@ -159,7 +158,10 @@ class OutputEdgeValidator {
   bool operator()(const tensorflow::Edge* out_edge) const;
 };
 
+string DebugString(const nvinfer1::DimensionType type);
+string DebugString(const nvinfer1::DataType trt_dtype);
 string DebugString(const nvinfer1::Dims& dims);
+string DebugString(const nvinfer1::Permutation& permutation, int len);
 string DebugString(const nvinfer1::ITensor& tensor);
 int64_t TrtDimsNumElements(const nvinfer1::Dims& dims);
 
@@ -195,6 +197,10 @@ class TRT_ShapedWeights {
   // underlying buffer.
   TRT_ShapedWeights(DataType type, nvinfer1::Dims dims, Tensor tensor);
 
+  // All weights should be stored inside TrtWeightStore to make sure lifetime of
+  // all the underlying tensors are available until the engine is built. For
+  // this reason, tensor_ should never be reassigned to a different value that
+  // is not already present in the TrtWeightStore.
   Tensor tensor_;
 
   friend class TrtWeightStore;
@@ -469,6 +475,11 @@ class Converter {
                               nvinfer1::Dims* operand_l_new_dims,
                               nvinfer1::Dims* operand_r_new_dims) const;
 
+  // Creates an IConstantLayer using 'weights' whose dimensions are specified by
+  // 'dims', and returns the output ITensor.
+  nvinfer1::ITensor* CreateConstantLayer(const TRT_ShapedWeights& weights,
+                                         const nvinfer1::Dims& dims);
+
  private:
   // Verify the provided batch_size is consistent with batch_size_ and update it
   // if necessary.
@@ -544,4 +555,4 @@ class Converter {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
similarity index 89%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index a2ddfbffa5b0d8c421bcfe054097a9e42b79fe8f..3a70423d12b35e46d2709dcdc25920a3143f41c4 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 
 #include <memory>
 #include <unordered_map>
@@ -21,11 +21,12 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
@@ -36,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
 #include "tensorflow/core/public/session.h"
@@ -50,7 +52,7 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
 using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
 
@@ -364,9 +366,6 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
       EXPECT_EQ(false, ptr->is_tensor());
       EXPECT_EQ(true, ptr->is_weights());
       EXPECT_TRUE(TrtShapedWeightsEquals(weights, ptr->weights()));
-
-      nvinfer1::Dims dims;
-      dims.nbDims = 0;
       ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims());
     }
   }
@@ -915,6 +914,20 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) {
                  "(tensor #dims 4 vs broadcast #dims 5)");
 }
 
+TEST_F(ConverterTest, CreateConstantLayer) {
+  for (auto dtype : {DT_FLOAT, DT_INT32}) {
+    TRT_ShapedWeights weights =
+        weight_store_->GetTempWeights(dtype, GetTestDims({2, 3, 5}));
+    nvinfer1::ITensor* tensor =
+        converter_->CreateConstantLayer(weights, GetTestDims({3, 10}));
+    ASSERT_NE(nullptr, tensor);
+    EXPECT_EQ(TfDataTypeToTrt(dtype), tensor->getType())
+        << "Expected " << DebugString(TfDataTypeToTrt(dtype)) << " vs. actual "
+        << DebugString(tensor->getType());
+    ExpectTrtDimsEqualsArray({3, 10}, tensor->getDimensions());
+  }
+}
+
 // Class to test various op converters, using both a TrtNodeValidator and
 // Converter.
 class OpConverterTest : public ::testing::Test {
@@ -1111,6 +1124,30 @@ class OpConverterTest : public ::testing::Test {
   std::unordered_map<string, NodeDef> validator_inputs_;
 };
 
+template <typename T>
+void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField<T>* out) {
+  out->Clear();
+  if (tensor.NumElements() == 0) return;
+
+  // TensorProto does not need to have all the elements present and can truncate
+  // trailing elements with the same value for compressed representation. Such
+  // elements are derived based on the tensor shape.
+  const auto flat = tensor.flat<T>();
+  int64 last_index = 0;
+  for (int64 i = 0; i < tensor.NumElements(); ++i) {
+    if (flat(i) != flat(last_index)) {
+      last_index = i;
+    }
+  }
+
+  int num_out_elements = last_index + 1;
+  out->Reserve(num_out_elements);
+  out->AddNAlreadyReserved(num_out_elements);
+  const T* src = flat.data();
+  T* dst = out->mutable_data();
+  std::copy(src, src + num_out_elements, dst);
+}
+
 template <DataType dtype, typename InputCType, typename OutputCType>
 void TestConvertConst(OpConverterTest* test) {
   NodeDef node_def;
@@ -1123,11 +1160,23 @@ void TestConvertConst(OpConverterTest* test) {
                             const std::vector<OutputCType>& expected_value) {
     test->Reset();
 
-    auto& attr = *node_def.mutable_attr();
+    TensorProto* tensor_attr =
+        (*node_def.mutable_attr())["value"].mutable_tensor();
+    tensor_attr->Clear();
+
     if (as_tensor_content) {
-      tensor.AsProtoTensorContent(attr["value"].mutable_tensor());
+      tensor.AsProtoTensorContent(tensor_attr);
     } else {
-      tensor.AsProtoField(attr["value"].mutable_tensor());
+      tensor.shape().AsProto(tensor_attr->mutable_tensor_shape());
+      tensor_attr->set_dtype(tensor.dtype());
+
+      if (tensor.dtype() == DT_FLOAT) {
+        CopyTensorElements<float>(tensor, tensor_attr->mutable_float_val());
+      } else if (tensor.dtype() == DT_INT32) {
+        CopyTensorElements<int32>(tensor, tensor_attr->mutable_int_val());
+      } else {
+        tensor.AsProtoField(tensor_attr);
+      }
     }
     test->RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
@@ -1140,8 +1189,7 @@ void TestConvertConst(OpConverterTest* test) {
   {
     // By default empty tensor will pick DT_FLOAT as data type and we fix it
     // here.
-    attr["value"].mutable_tensor()->set_dtype(dtype);
-    Tensor t;  // Empty tensor.
+    Tensor t(dtype);  // Empty tensor.
     reset_and_test(t, false, {}, {});
   }
   {
@@ -1160,6 +1208,22 @@ void TestConvertConst(OpConverterTest* test) {
     reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6});
     reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6});
   }
+  {
+    // Set all tensor elements to the same value. Such tensors are encoded
+    // using a single element list in tensor proto.
+    Tensor t = ::tensorflow::test::AsTensor<InputCType>({1, 1, 1, 1, 1, 1},
+                                                        TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {1, 1, 1, 1, 1, 1});
+    reset_and_test(t, true, {2, 3}, {1, 1, 1, 1, 1, 1});
+  }
+  {
+    // Set trailing tensor elements to the same value. Such tensors are
+    // encoded by truncating all equal elements except the first one.
+    Tensor t = ::tensorflow::test::AsTensor<InputCType>({2, 2, 1, 1, 1, 1},
+                                                        TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {2, 2, 1, 1, 1, 1});
+    reset_and_test(t, true, {2, 3}, {2, 2, 1, 1, 1, 1});
+  }
 }
 
 TEST_F(OpConverterTest, ConvertConst) {
@@ -2253,7 +2317,7 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     ops::Squeeze::Attrs squeeze_attrs;
-    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);
+    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);  // non-absl ok
     auto squeeze =
         ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs);
     return squeeze.operation.node()->def();
@@ -2378,6 +2442,8 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
   };
 
   {
+    // Input is weights, should fail.
+    Reset();
     NodeDef node_def = get_strided_slice_nodedef();
     AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
@@ -2619,6 +2685,240 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
   }
 }
 
+TEST_F(OpConverterTest, ConvertConv2D) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_conv2d", "Conv2D", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Two inputs are expected for Conv2D, at my_conv2d");
+  }
+
+  // Get nodedef for Conv2D layer.
+  auto get_conv2d_nodedef =
+      [](std::vector<int> strides = {1, 1, 1, 1}, string padding = "SAME",
+         string data_format = "NCHW",
+         std::vector<int> dilations = {1, 1, 1, 1}) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT);
+    ops::Conv2D::Attrs attrs =
+        ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations);
+    auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides,
+                              padding, attrs);
+    return conv2d.operation.node()->def();
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Conv2D is only implemented for tensors, not weights, at my_conv2d");
+  }
+  {
+    // Filter is tensor, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3, 3, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Kernel for Conv2D must be constant weights, at my_conv2d");
+  }
+  {
+    // Filter is not 4D, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Conv2D expects kernel of dimension 4, at my_conv2d");
+  }
+  {
+    // Dilations is not 4D, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Convolution dilations field must specify 4 dimensions, at my_conv2d");
+  }
+  {
+    // Dilation value is not 1 for channel, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation rate must be 1 for batch and channel "
+                               "dimensions, at my_conv2d");
+  }
+  {
+    // Dilation value is not 1 for channel (NHWC), should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2});
+    AddTestTensor("input", {2, 3, 1});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation rate must be 1 for batch and channel "
+                               "dimensions, at my_conv2d");
+  }
+  {
+    // Strides is not 4D, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Convolution strides field must specify 4 dimensions, at my_conv2d");
+  }
+  {
+    // Stride value is not 1 for channel, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Stride must be 1 for batch and channel dimensions, at my_conv2d");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims,
+               const std::vector<float>& input,
+               const std::vector<int>& filter_dims,
+               const std::vector<float>& filter,
+               const std::vector<int>& strides, const string& padding,
+               const string& data_format, const std::vector<int>& dilations,
+               const std::vector<int>& expected_output_dims,
+               const std::vector<float>& expected_output)
+        : input_dims(input_dims),
+          input(input),
+          filter_dims(filter_dims),
+          filter(filter),
+          strides(strides),
+          padding(padding),
+          data_format(data_format),
+          dilations(dilations),
+          expected_output_dims(expected_output_dims),
+          expected_output(expected_output) {}
+
+    std::vector<int> input_dims;
+    std::vector<float> input;
+    std::vector<int> filter_dims;
+    std::vector<float> filter;
+    std::vector<int> strides;
+    string padding;
+    string data_format;
+    std::vector<int> dilations;
+    std::vector<int> expected_output_dims;
+    std::vector<float> expected_output;
+  };
+
+  // Ok.
+  const int kConv2DOKCases = 6;
+  TestParams ok_params[kConv2DOKCases] = {
+      // Basic
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 2},
+                 /*expected_output=*/{1, 1, 0, 1}},
+      // SAME padding (Asymmetric)
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 3},
+                 /*expected_output=*/{1, 1, -2, 0, 1, -4}},
+      // SAME padding (Symmetric)
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 3, 1, 1},
+                 /*filter=*/{-1, 0, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 3},
+                 /*expected_output=*/{1, 2, -1, 3, 1, -3}},
+      // NHWC
+      TestParams{/*input_dims=*/{2, 3, 1},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NHWC",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{2, 2, 1},
+                 /*expected_output=*/{1, 1, 0, 1}},
+      // Dilated
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 2},
+                 /*expected_output_dims=*/{1, 2, 1},
+                 /*expected_output=*/{2, 1}},
+      // Strided
+      TestParams{/*input_dims=*/{1, 2, 4},
+                 /*input=*/{0, 1, 2, 2, 3, 4, 4, 7},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 2},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*expected_output_dims=*/{1, 2, 2},
+                 /*expected_output=*/{1, 0, 1, 3}},
+  };
+
+  for (int i = 0; i < kConv2DOKCases; i++) {
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding,
+                           ok_params[i].data_format, ok_params[i].dilations);
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<float>("weights", ok_params[i].filter_dims,
+                          ok_params[i].filter);
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+    std::vector<float> output_data(ok_params[i].expected_output.size());
+    BuildAndRun<float>({{"input", ok_params[i].input}}, "my_conv2d",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
rename to tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
index d57f2300f8e6e6ce79c538133da6bc5cf5ead2f5..ebf8df1349363e9986020ea705b32edfef43bc93 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
@@ -12,9 +12,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
@@ -30,9 +32,9 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 // TODO(sami): Remove VLOG messages once the code matures
+using absl::StrAppend;
+using absl::StrCat;
 using tensorflow::str_util::Uppercase;
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
 
 tensorflow::Status TRTOptimizationPass::Init(
     const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
@@ -243,7 +245,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     // If the last token is not an integer, it must be part of the name.
     // Otherwise it is port number.
     if (tokens.size() > 1 &&
-        !strings::safe_strto32(tokens.back(), &dumm_port)) {
+        !strings::safe_strto32(tokens.back(), &dumm_port)) {  // non-absl ok
       StrAppend(&s, ":", tokens.back());
     }
     nodes_to_preserve.push_back(s);
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
similarity index 91%
rename from tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
rename to tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
index 3e8dc0978e43e2e9ba07aaa09f74acfe8e59b9a7..bd6c6dbce1ddb8757227a1c71408770ee8be48d8 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
 
 #include <string>
 
@@ -77,4 +77,4 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
 
 #endif  // GOOGLE_CUDA
 #endif  // GOOGLE_TENSORRT
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/contrib/tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/convert/utils.cc
rename to tensorflow/compiler/tf2tensorrt/convert/utils.cc
index e7a1febb8c076891596741fe30721e7acca15a73..62a0f62ad6657f2d1551cd093f4f2d93c25f4cae 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h
similarity index 88%
rename from tensorflow/contrib/tensorrt/convert/utils.h
rename to tensorflow/compiler/tf2tensorrt/convert/utils.h
index 0592f31462af2b20f3a13fe5119e89c2ba42dd8a..9f9ee59087d461bdc825346d9adc976e42f47c5e 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
 
 #include <memory>
 
@@ -47,4 +47,4 @@ Status GetPrecisionMode(const string& name, int* precision_mode);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eae1f8e7525f1816d1c50072ebe4ba6713c96e47
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/refcount.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+class GetSerializedResourceOp : public OpKernel {
+ public:
+  explicit GetSerializedResourceOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  ~GetSerializedResourceOp() override {}
+
+  void Compute(OpKernelContext* context) override {
+    // TODO(laigd): it will allocate the tensor on the device and copy the
+    // serialized string to that tensor, and later sess.run() will copy it back
+    // to host. We need to optimize this.
+    const string& container = context->input(0).scalar<string>()();
+    const string& resource_name = context->input(1).scalar<string>()();
+
+    // Get the resource.
+    SerializableResourceBase* resource = nullptr;
+    OP_REQUIRES_OK(context, context->resource_manager()->Lookup(
+                                container, resource_name, &resource));
+    ::tensorflow::core::ScopedUnref sc(resource);
+
+    // Serialize the resource as output.
+    string serialized_resource;
+    OP_REQUIRES_OK(context, resource->SerializeToString(&serialized_resource));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+    output->scalar<string>()() = serialized_resource;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GetSerializedResourceOp").Device(DEVICE_GPU),
+                        GetSerializedResourceOp);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec038ebda073c8050321d5668b15a2c6faa72a4b
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
@@ -0,0 +1,80 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <dirent.h>
+#include <string.h>
+#include <fstream>
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+class GetSerializedResourceOpTest : public OpsTestBase {};
+
+TEST_F(GetSerializedResourceOpTest, Basic) {
+  // Create the GPU device.
+  std::unique_ptr<Device> device(
+      DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0"));
+
+  // Create the resource.
+  class MySerializableResource : public SerializableResourceBase {
+   public:
+    string DebugString() const override { return ""; }
+    Status SerializeToString(string* serialized) override {
+      *serialized = "my_serialized_str";
+      return Status::OK();
+    }
+  };
+  const string container = "mycontainer";
+  const string resource_name = "myresource";
+  SerializableResourceBase* resource = new MySerializableResource();
+  ResourceMgr* rm = device->resource_manager();
+  EXPECT_TRUE(rm->Create(container, resource_name, resource).ok());
+
+  // Create the op.
+  SetDevice(DEVICE_GPU, std::move(device));
+  TF_ASSERT_OK(NodeDefBuilder("op", "GetSerializedResourceOp")
+                   .Input(FakeInput(DT_STRING))
+                   .Input(FakeInput(DT_STRING))
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Execute the op.
+  AddInputFromArray<string>(TensorShape({}), {container});
+  AddInputFromArray<string>(TensorShape({}), {resource_name});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Verify the result.
+  // TODO(laigd): OpsTestBase::GetOutput() doesn't work.
+  Tensor* output = context_->mutable_output(0);
+  EXPECT_EQ("my_serialized_str", output->scalar<string>()());
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
similarity index 67%
rename from tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
rename to tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index bad568644bb1f8d01d4cb0a7c853ec47d6f19e45..198d68b60985d2b3f2ef958c4f13f94054d4875a 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
+#include "tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.h"
 
 #include <algorithm>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -38,9 +40,9 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 static Logger logger;
+using absl::StrAppend;
+using absl::StrCat;
 using ::nvinfer1::IRuntime;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
 
 // A helper class to call done() when destructed for asynchronous execution.
 // Helps simultaneous execution of native and TRT engines.
@@ -135,8 +137,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
   native_func_ = tensorflow::kInvalidHandle;
   OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count",
                                            &max_cached_engines_));
-  OP_REQUIRES_OK(context,
-                 context->GetAttr("fixed_input_size", &fixed_input_size_));
   OP_REQUIRES_OK(context, context->GetAttr("cached_engine_batches",
                                            &cached_engine_batches_));
   std::sort(cached_engine_batches_.begin(), cached_engine_batches_.end());
@@ -175,11 +175,13 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
   lib->Run(opts, native_func_, inputs, outputs,
            [this, ctx, outputs, helper](const tensorflow::Status& s) {
              tensorflow::core::ScopedUnref sc(helper);
-             VLOG(1) << "Native Segment completed";
              if (!s.ok()) {
+               LOG(ERROR) << "Failed to execute native segment " << this->name()
+                          << ": " << s;
                ctx->SetStatus(s);
                return;
              }
+             VLOG(1) << "Native Segment completed";
              for (size_t t = 0; t < outputs->size(); ++t) {
                ctx->set_output(t, outputs->at(t));
              }
@@ -194,18 +196,37 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
   VLOG(1) << "Executing TRT calibration: " << name();
   helper->Ref();
   tensorflow::core::ScopedUnref sc(helper);
-  // TODO(aaroey): remove the ResourceMgr singleton.
-  auto trt_rm = TRTResourceManager::instance();
-  auto res_mgr = trt_rm->getManager("TRTCalibration");
+  auto res_mgr = ctx->resource_manager();
   TRTCalibrationResource* calib_res = nullptr;
-  auto status = res_mgr->LookupOrCreate(
-      funcdef_name_, "Calibrator", &calib_res,
-      {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status {
-        return this->AllocateCalibrationResources(ctx, cr);
-      }});
-  if (!status.ok()) {
-    ctx->SetStatus(status);
-    return;
+  OP_REQUIRES_OK(
+      ctx,
+      res_mgr->LookupOrCreate(
+          "TF_TRT_Calibration", name(),
+          reinterpret_cast<SerializableResourceBase**>(&calib_res),
+          {[ctx, this](SerializableResourceBase** cr) -> tensorflow::Status {
+            return this->AllocateCalibrationResources(ctx, cr);
+          }}));
+  tensorflow::core::ScopedUnref calib_sc(calib_res);
+  // TODO(aaroey): here we also add the resource to the ResourceMgr singleton.
+  // This is needed before we migrate all uses of calib_graph_to_infer_graph()
+  // to the new calibration workflow. After that we'll remove this block.
+  {
+    auto deprecated_rm =
+        TRTResourceManager::instance()->getManager("TRTCalibration");
+    TRTCalibrationResource* copied_resource = nullptr;
+    // Check whether the resource exists, and create it if not.
+    if (deprecated_rm->Lookup(funcdef_name_, "Calibrator", &copied_resource)
+            .ok()) {
+      // Do nothing if the resource exists.
+      copied_resource->Unref();
+    } else {
+      copied_resource = calib_res;
+      // Increase the refcount by 1 then transfer the ownership of that refcount
+      // to the ResourceMgr singleton.
+      copied_resource->Ref();
+      OP_REQUIRES_OK(ctx, deprecated_rm->Create(funcdef_name_, "Calibrator",
+                                                copied_resource));
+    }
   }
   int num_inputs = ctx->num_inputs();
   // Pass input data to calibrator
@@ -219,7 +240,8 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
       return;
     }
     // Check the allocated buffer is sufficient for input
-    const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+    const auto device_tensor =
+        calib_res->device_tensors_.at(i).AccessTensor(ctx);
     CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
     input_data.emplace(StrCat(kInputPHName, i), data_address);
   }
@@ -236,32 +258,34 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
   ExecuteNativeSegment(ctx, helper);
 }
 
-int TRTEngineOp::GetEngineBatch(OpKernelContext* ctx) {
-  int num_batch = ctx->input(0).shape().dim_size(0);
-  int smallest_engine = 0;
-  for (const auto i : cached_engine_batches_) {
-    if (i >= num_batch) {
-      smallest_engine = i;
-      break;
-    }
-  }
-  // TODO(sami): Need an LRU here
-  if (smallest_engine == 0) {
-    if (max_cached_engines_ > cached_engine_batches_.size()) {
-      smallest_engine = num_batch;
-      cached_engine_batches_.push_back(num_batch);
-      VLOG(1) << "Running with batch size " << num_batch;
-    } else {
-      string msg =
-          StrCat("Engine buffer is full. buffer limit=", max_cached_engines_,
-                 ", current entries=");
-      for (auto i : cached_engine_batches_) StrAppend(&msg, i, ",");
-      StrAppend(&msg, " requested batch=", num_batch);
-      LOG(WARNING) << msg;
-      return -1;
+bool TRTEngineOp::GetCompatibleCachedEngine(
+    const std::vector<TensorShape>& actual_input_shapes,
+    std::vector<TensorShape>* engine_input_shapes) {
+  const int batch_size = actual_input_shapes[0].dim_size(0);
+  int smallest_batch_size = -1;
+  // Output shape will always be the same as the input but we will overwrite the
+  // batch size.
+  *engine_input_shapes = actual_input_shapes;
+  for (const int cached_batch_size : cached_engine_batches_) {
+    // Check if compatible: batch <= cached batch.
+    //
+    // TODO(laigd): here it only compare the first dim a.k.a the batch size,
+    // we'll need to to support non-batch dimensions as well. This will be done
+    // as part of the offline conversion implementation.
+    if (batch_size <= cached_batch_size) {
+      // First case: first compatible engine found
+      // Second case: smaller batch size engine found
+      if ((smallest_batch_size == -1) ||
+          (cached_batch_size < smallest_batch_size)) {
+        smallest_batch_size = cached_batch_size;
+        // Overwrite batch size for output
+        for (int i = 0; i < engine_input_shapes->size(); i++) {
+          (*engine_input_shapes)[i].set_dim(0, smallest_batch_size);
+        }
+      }
     }
   }
-  return smallest_engine;
+  return (smallest_batch_size != -1);
 }
 
 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
@@ -272,25 +296,20 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
     ExecuteCalibration(ctx, helper);
     return;
   }
-  const int smallest_engine = GetEngineBatch(ctx);
-  if (smallest_engine < 0) {
-    LOG(WARNING) << "Failed to get engine batch, running native segment for "
-                 << name();
-    ExecuteNativeSegment(ctx, helper);
-    return;
+  // Get shapes of inputs to engine.
+  std::vector<tensorflow::TensorShape> input_shapes;
+  for (int i = 0; i < ctx->num_inputs(); ++i) {
+    input_shapes.emplace_back(ctx->input(i).shape());
   }
-
-  const int num_batch = ctx->input(0).shape().dim_size(0);
-  auto& engine_ctx_pair = GetEngine(smallest_engine, ctx);
-  auto& trt_engine_ptr = engine_ctx_pair.first;
-  if (!trt_engine_ptr) {
-    LOG(WARNING) << "Engine retrieval for batch size " << num_batch
+  EngineContext* engine_context = GetEngine(input_shapes, ctx);
+  if (!engine_context->cuda_engine) {
+    LOG(WARNING) << "Engine retrieval for input shapes: "
+                 << TensorShapeUtils::ShapeListString(input_shapes)
                  << " failed. Running native segment for " << name();
     ExecuteNativeSegment(ctx, helper);
     return;
   }
-  const bool retry = ExecuteTrtEngine(ctx, num_batch, trt_engine_ptr.get(),
-                                      engine_ctx_pair.second.get());
+  const bool retry = ExecuteTrtEngine(ctx, engine_context);
   if (retry) {
     LOG(WARNING) << "Failed to execute engine, "
                  << "retrying with native segment for " << name();
@@ -299,18 +318,19 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
 }
 
-bool TRTEngineOp::ExecuteTrtEngine(
-    OpKernelContext* ctx, const int num_batch,
-    nvinfer1::ICudaEngine* trt_engine_ptr,
-    nvinfer1::IExecutionContext* trt_execution_context_ptr) {
+bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
+                                   EngineContext* engine_context) {
   VLOG(1) << "Executing TRT engine: " << name();
+  auto& cuda_engine = engine_context->cuda_engine;
   const bool kRetry = true;
+  // All inputs must have the same batch size, so just get it from the first
+  // input.
+  const int num_batch = ctx->input(0).shape().dim_size(0);
   const int num_binding = ctx->num_inputs() + ctx->num_outputs();
   std::vector<void*> buffers(num_binding);
   for (int i = 0; i < ctx->num_inputs(); i++) {
     const string input_name = StrCat(kInputPHName, i);
-    const int binding_index =
-        trt_engine_ptr->getBindingIndex(input_name.c_str());
+    const int binding_index = cuda_engine->getBindingIndex(input_name.c_str());
     if (binding_index == -1) {
       LOG(ERROR) << "Input node not found, at " << input_name;
       return kRetry;
@@ -323,7 +343,7 @@ bool TRTEngineOp::ExecuteTrtEngine(
                  << " vs " << input_shape.dim_size(0);
       return kRetry;
     }
-    auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
+    auto dtype = cuda_engine->getBindingDataType(binding_index);
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
@@ -346,13 +366,12 @@ bool TRTEngineOp::ExecuteTrtEngine(
   for (int i = 0; i < ctx->num_outputs(); i++) {
     // Create an output tensor
     const string output_name = StrCat(kOutputPHName, i);
-    const int binding_index =
-        trt_engine_ptr->getBindingIndex(output_name.c_str());
+    const int binding_index = cuda_engine->getBindingIndex(output_name.c_str());
     Tensor* output_tensor = nullptr;
 
     TensorShape output_shape;
     if (binding_index != -1) {
-      auto dims = trt_engine_ptr->getBindingDimensions(binding_index);
+      auto dims = cuda_engine->getBindingDimensions(binding_index);
       std::vector<int> trt_shape(dims.nbDims + 1);
       trt_shape[0] = num_batch;
       for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
@@ -374,7 +393,7 @@ bool TRTEngineOp::ExecuteTrtEngine(
       // TODO(aaroey): ideally we should retry, fix this.
       return !kRetry;
     }
-    auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
+    auto dtype = cuda_engine->getBindingDataType(binding_index);
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] =
@@ -402,9 +421,12 @@ bool TRTEngineOp::ExecuteTrtEngine(
                                                 ->implementation()
                                                 ->GpuStreamMemberHack()));
 
+  // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex
+  // for it.
+  tensorflow::mutex_lock lock(engine_context->mu);
   // TODO(jie): trt enqueue does not return error
-  auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream,
-                                                nullptr);
+  auto ret = engine_context->execution_context->enqueue(num_batch, &buffers[0],
+                                                        *stream, nullptr);
   if (!ret) {
     LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name();
     return kRetry;
@@ -414,50 +436,45 @@ bool TRTEngineOp::ExecuteTrtEngine(
   return !kRetry;
 }
 
-TRTEngineOp::~TRTEngineOp() {
-  // We need to manually destroy the engine and execution context before
-  // the allocator is destructed.
-  for (auto& eng : engine_map_) {
-    eng.second.first.reset();
-    eng.second.second.reset();
+EngineContext* TRTEngineOp::GetEngine(
+    const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
+  static EngineContext empty_context;
+  tensorflow::mutex_lock lock(engine_mutex_);
+  // TODO(tmorris): using first input to get batch size - is this reliable?
+  const int batch_size = input_shapes[0].dim_size(0);
+
+  // Get engine cache
+  TRTEngineCacheResource* cache_res = nullptr;
+  auto status = ctx->resource_manager()->LookupOrCreate(
+      "TRTEngineCache", funcdef_name_, &cache_res,
+      {[this, ctx](TRTEngineCacheResource** cr) -> tensorflow::Status {
+        *cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_);
+        return Status::OK();
+      }});
+  if (!status.ok()) {
+    ctx->SetStatus(status);
+    return &empty_context;
   }
-  allocator_.reset();
-}
-
-nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) {
-  if (allocator_) return allocator_.get();
-  auto device = ctx->device();
-  auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes());
-  if (!alloc) {
-    LOG(ERROR) << "Can't find device allocator for gpu device "
-               << device->name();
-    return nullptr;
+  tensorflow::core::ScopedUnref sc(cache_res);
+  auto& cache = cache_res->cache_;
+  auto allocator = cache_res->allocator_.get();
+  if (allocator == nullptr) {
+    return &empty_context;
   }
-  allocator_.reset(new TRTDeviceAllocator(alloc));
-  return allocator_.get();
-}
-
-TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
-                                                   OpKernelContext* ctx) {
-  static EngineCtxPair null_pair = {
-      TrtUniquePtrType<nvinfer1::ICudaEngine>(nullptr),
-      TrtUniquePtrType<nvinfer1::IExecutionContext>(nullptr)};
-  // TODO(sami): This method needs to be re-written to use resource manager and
-  // with LRU mechanism option.
-  tensorflow::mutex_lock lock(engine_mutex_);
 
+  // Handle the static engine case. For static engines, the cache will have a
+  // single element containing the only engine.
   if (static_engine_) {
-    if (engine_map_.size()) {
-      if (engine_map_.begin()->first >= batch_size) {
-        return engine_map_.begin()->second;
+    if (cache.size()) {
+      // Batch size of engine must be >= the input batch size
+      // TODO(tmorris): use match compatible function?
+      if (cache.begin()->first[0].dim_size(0) >= batch_size) {
+        return cache.begin()->second.get();
       }
-      return null_pair;
+      return &empty_context;
     }
+
     TrtUniquePtrType<IRuntime> infer(nvinfer1::createInferRuntime(logger));
-    auto allocator = GetAllocator(ctx);
-    if (allocator == nullptr) {
-      return null_pair;
-    }
     infer->setGpuAllocator(allocator);
     TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine(
         infer->deserializeCudaEngine(serialized_segment_.c_str(),
@@ -465,62 +482,87 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
                                      PluginFactoryTensorRT::GetInstance()));
     auto raw_static_engine = static_engine.get();
     const auto max_batch_size = raw_static_engine->getMaxBatchSize();
-    engine_map_[max_batch_size] = {
-        std::move(static_engine),
-        TrtUniquePtrType<nvinfer1::IExecutionContext>(
-            raw_static_engine->createExecutionContext())};
+    // Static engine will have max_batch_size for batch size so that all inputs
+    // will map to this single engine.
+    std::vector<TensorShape> engine_input_shapes(input_shapes);
+    for (int i = 0; i < engine_input_shapes.size(); i++) {
+      // TODO(tmorris): will all inputs have batch size as first dimension??
+      engine_input_shapes[i].set_dim(0, max_batch_size);
+    }
+    // TODO(laigd): here we assume engine_input_shapes matches the actual input
+    // shapes of the engine, we should verify that.
+    cache.emplace(engine_input_shapes,
+                  absl::make_unique<EngineContext>(
+                      std::move(static_engine),
+                      TrtUniquePtrType<nvinfer1::IExecutionContext>(
+                          raw_static_engine->createExecutionContext())));
     // Runtime is safe to delete after engine creation
     serialized_segment_.clear();
     if (max_batch_size < batch_size) {
-      return null_pair;
+      return &empty_context;
     }
-    return engine_map_.at(max_batch_size);
+    return cache.at(engine_input_shapes).get();
   }  // static_engine_
 
   // Handle the dynamic engine case.
-  auto engine_it = engine_map_.find(batch_size);
-  if (engine_it == engine_map_.end() &&
-      engine_map_.size() < (size_t)max_cached_engines_) {
-    nvinfer1::IGpuAllocator* allocator = nullptr;
-    allocator = GetAllocator(ctx);
-    if (allocator == nullptr) {
-      return null_pair;
-    }
-    std::vector<tensorflow::PartialTensorShape> shapes;
-    for (int i = 0; i < ctx->num_inputs(); ++i) {
-      shapes.emplace_back(ctx->input(i).shape());
+  // See if there is a compatible engine cached. The batch size should be <= the
+  // cached batch size.
+  std::vector<tensorflow::TensorShape> engine_input_shapes;
+  const bool matched_successfully =
+      GetCompatibleCachedEngine(input_shapes, &engine_input_shapes);
+  // If matched, use that engine. Otherwise, we will look in cache for that
+  // exact shape and possibly create a new engine if it is not in cache.
+  if (!matched_successfully) {
+    engine_input_shapes = input_shapes;
+    if (!cached_engine_batches_.empty()) {
+      // If user has explicitly defined cached_engine_batches, we should
+      // warn them that their input was non-compatible (batch size too high)
+      LOG(WARNING) << "No compatible cached engine was found for batch size: "
+                   << batch_size << ". A new engine will be created.";
+      cached_engine_batches_.push_back(batch_size);
     }
+  }
+
+  if (!cache.count(engine_input_shapes)) {
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     bool convert_successfully = false;
     LOG(INFO) << "Building a new TensorRT engine for " << name()
-              << " with batch size " << batch_size;
+              << " input shapes: "
+              << TensorShapeUtils::ShapeListString(engine_input_shapes);
+    // Convert to partial shapes
+    std::vector<PartialTensorShape> partial_shapes;
+    for (int i = 0; i < engine_input_shapes.size(); i++) {
+      partial_shapes.emplace_back(engine_input_shapes[i]);
+    }
     // Up to this point, calibrator_ can never be empty, since otherwise it
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
-        segment_graph_, precision_mode_, batch_size, workspace_size_, shapes,
-        &logger, allocator, calibrator_.get(), &engine, use_calibration_,
-        &convert_successfully);
+        segment_graph_, precision_mode_, batch_size, workspace_size_,
+        partial_shapes, &logger, allocator, calibrator_.get(), &engine,
+        use_calibration_, &convert_successfully);
     if (!status.ok()) {
       if (convert_successfully) {
         // This means it fail to build the engine even when the network is built
         // successfully, probably due to internal issues. In this case we don't
         // retry in the future.
-        engine_map_[batch_size] = {nullptr, nullptr};
+        cache.emplace(engine_input_shapes, absl::make_unique<EngineContext>());
       }
       LOG(WARNING) << "Engine creation for batch size " << batch_size
                    << " failed " << status;
-      return null_pair;
+      return &empty_context;
     }
     VLOG(1) << "Conversion is done";
     TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
         engine->createExecutionContext());
-    engine_map_[batch_size] = {std::move(engine), std::move(exec_context)};
+    cache.emplace(engine_input_shapes,
+                  absl::make_unique<EngineContext>(std::move(engine),
+                                                   std::move(exec_context)));
   }
-  return engine_map_.at(batch_size);
+  return cache.at(engine_input_shapes).get();
 }
 
 tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
-    OpKernelContext* ctx, TRTCalibrationResource** cr) {
+    OpKernelContext* ctx, SerializableResourceBase** cr) {
   auto cres = new TRTCalibrationResource();
   *cr = cres;
   // Get the allocator.
@@ -536,7 +578,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
   const int batch_size = ctx->input(0).dim_size(0);
   const int num_inputs = ctx->num_inputs();
   std::vector<tensorflow::PartialTensorShape> shapes;
-  dev_tensors_.resize(num_inputs);
+  cres->device_tensors_.resize(num_inputs);
   VLOG(1) << " Constructing calibrator";
   for (int i = 0; i < num_inputs; i++) {
     // allocate workspace on device for inputs
@@ -544,19 +586,19 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
     shapes.emplace_back(t.shape());
     Tensor* device_tensor;
     TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-        t.dtype(), t.shape(), &dev_tensors_.at(i), &device_tensor));
+        t.dtype(), t.shape(), &cres->device_tensors_.at(i), &device_tensor));
     CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
     void* device_address = GetTensorAddress(device_tensor);
     if (device_address == nullptr) {
       return tensorflow::errors::InvalidArgument(
           "Unsupported data type encountered in input ", i);
     }
-    device_buffers_.emplace(
+    cres->device_buffers_.emplace(
         StrCat(kInputPHName, i),
         std::pair<void*, size_t>(device_address, device_tensor->TotalBytes()));
   }
   cres->calibrator_.reset(
-      new TRTInt8Calibrator(device_buffers_, batch_size, name()));
+      new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
   const string label(name());
   auto segment_graph = &segment_graph_;
   const int platform_gpu_id =
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.h
similarity index 67%
rename from tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
rename to tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.h
index b545f497f32d5a1a6960b748467ca189b7debf6c..64f8c97a74092ac075de9cc7993283e3ce1e27cf 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.h
@@ -13,20 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_TRT_ENGINE_OP_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_TRT_ENGINE_OP_H_
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -36,7 +39,6 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 struct TRTInt8Calibrator;
-class TRTCalibrationResource;
 class AsyncHelper;
 //  TODO(Sami): Remove this file?
 
@@ -48,9 +50,10 @@ class TRTEngineOp : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* context,
                     AsyncOpKernel::DoneCallback done) override;
-  ~TRTEngineOp();
 
  private:
+  // TODO(samikama): context should go to a resource manager!
+
   // Execute calibration
   void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper);
 
@@ -62,33 +65,25 @@ class TRTEngineOp : public AsyncOpKernel {
 
   // Execute the tensorrt engine. Returns whether we need to retry by running
   // the native segment.
-  bool ExecuteTrtEngine(OpKernelContext* ctx, const int num_batch,
-                        nvinfer1::ICudaEngine* trt_engine_ptr,
-                        nvinfer1::IExecutionContext* trt_execution_context_ptr);
+  bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context);
 
   // Allocate necessary resources for calibration
   Status AllocateCalibrationResources(OpKernelContext* ctx,
-                                      TRTCalibrationResource** cr);
-
-  // TODO(samikama): context should go to a resource manager!
-  typedef std::pair<TrtUniquePtrType<nvinfer1::ICudaEngine>,
-                    TrtUniquePtrType<nvinfer1::IExecutionContext>>
-      EngineCtxPair;
-  EngineCtxPair& GetEngine(int batch_size, OpKernelContext* ctx);
+                                      SerializableResourceBase** cr);
 
-  // Return engine batch closest to input batch.
-  int GetEngineBatch(OpKernelContext* ctx);
+  // Get engine for the input shape
+  EngineContext* GetEngine(const std::vector<TensorShape>& input_shapes,
+                           OpKernelContext* ctx);
 
-  nvinfer1::IGpuAllocator* GetAllocator(OpKernelContext* ctx);
+  // Return engine batch in cached_engne_batch_sizes_ which is closest to input
+  // batch.
+  bool GetCompatibleCachedEngine(
+      const std::vector<TensorShape>& actual_input_shapes,
+      std::vector<TensorShape>* engine_input_shapes);
 
-  // map to keep engines and their execution context for given batch size.
-  std::unordered_map<int, EngineCtxPair> engine_map_;
   std::vector<string> input_nodes_;
   std::vector<string> output_nodes_;
 
-  // keep device allocator for TRT.
-  std::unique_ptr<TRTBaseAllocator> allocator_;
-
   // serialized protobuf segment or trt engine depending on static_engine_ flag.
   string serialized_segment_;
 
@@ -98,12 +93,6 @@ class TRTEngineOp : public AsyncOpKernel {
   // GraphDef representation of the segment.
   GraphDef segment_graph_;
 
-  // Lookup table for temporary staging areas of input tensors for calibration.
-  std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
-
-  // Temporary staging areas for calibration inputs.
-  std::vector<PersistentTensor> dev_tensors_;
-
   // Engine Precision mode.
   int precision_mode_;
 
@@ -114,10 +103,6 @@ class TRTEngineOp : public AsyncOpKernel {
   // Whether to calibrate INT8 engine.
   bool calibration_mode_;
 
-  // Whether non-batch ranks of the inputs are assumed to be fixed or not for
-  // engine construction.
-  bool fixed_input_size_;
-
   // Batches of the cached engines
   std::vector<int> cached_engine_batches_;
 
@@ -142,4 +127,4 @@ class TRTEngineOp : public AsyncOpKernel {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_KERNELS_TRT_ENGINE_OP_H_
diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59da73f5efc8eedc20c35cf35cb1eae6cda136c9
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+
+REGISTER_OP("GetSerializedResourceOp")
+    .Input("container: string")
+    .Input("resource_name: string")
+    .Output("serialized_resource: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Gets a resource from a container managed by the resource manager and returns
+its serialized representation.
+)doc");
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
similarity index 80%
rename from tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
rename to tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
index 92405906eb76b043bc08b68e25e16ab40197dddf..b84d2fe0b8cef3475f2a7d0f5383d5e11cde099a 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
@@ -28,16 +28,22 @@ namespace shape_inference {
 extern Status TRTEngineOpShapeInference(InferenceContext* c);
 }
 
+// NOTE: please try NOT to add/modify/remove attributes or inputs/outputs to the
+// list below, this will break backward compatibility!
+//
+// TODO(laigd): consider making this op stateful. The only problem is it uses TF
+// function which has to be stateless, but we can use function library as the
+// key to cache the instantiated functions for different executor subgraphs.
 REGISTER_OP("TRTEngineOp")
     .Attr("serialized_segment: string")
     .Attr("input_shapes: list(shape)")
     .Attr("output_shapes: list(shape)")
     .Attr("segment_funcdef_name: string")
-    .Attr("InT: list({int8,float16,float32})")
-    .Attr("OutT: list({int8,float16,float32})")
+    .Attr("InT: list({int8,float16,float32,int32})")
+    .Attr("OutT: list({int8,float16,float32,int32})")
     .Attr("static_engine: bool = true")
     .Attr("fixed_input_size: bool = true")
-    .Attr("cached_engine_batches: list(int) = []")
+    .Attr("cached_engine_batches: list(int) >= 0 = []")
     .Attr("max_cached_engines_count: int = 1")
     .Attr("workspace_size_bytes: int")
     .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}")
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
index 062f86e8bb4dc753925e4e2baf0bc80a5312a94f..a4341c530fffca88c82813cc2ace2c0ae1df5345 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
+
 #include <cassert>
 #include <cstring>
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
similarity index 92%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
index 754920b60ca7439513a91ad0354833a2482b29c1..f495d857037c79a1783f8eb232fb57c20e229169 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
 
 #include <iostream>
 #include <unordered_map>
@@ -71,4 +71,4 @@ class PluginTensorRT : public nvinfer1::IPlugin {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
index cccc91226265ed139fb8db0b71c40b868f729562..871fb1210bd495dc3f5e8153bb6c3a361bf569f5 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
similarity index 91%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
index bbae9fb65c22cf69d2e7954436fd04dd16f7f6c8..9aa99a40b80de92a4d9b9ad36e88e693b8aa42dc 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
 
 #include <memory>
 #include <unordered_map>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -99,4 +99,4 @@ class TrtPluginRegistrar {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
index 129bdcdbc2f8d9d5215f45f381bcadf35e4fa75e..7d9c465c22beed0e252cbc26d6c533a0789d4f49 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
index a8f60886c03c174a612e7a135b6eb7bb7cb9997a..f3d6b4ff476139693a5251ddf58a3200d8af8efc 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 #include <cassert>
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
similarity index 82%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
index 274ce42fec9283c643004d45fba461879fc5f2dc..e5eff15c19694093c7a5ea933a41375e8e01c8b9 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
 
 #include <functional>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
@@ -43,4 +43,4 @@ string ExtractOpName(const void* serial_data, size_t serial_length,
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
similarity index 84%
rename from tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
rename to tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
index 31a313182be9a2fca7457a539670dbc911ccabb1..86bfabf99e08a8e447a28504c72eebca4d3a582c 100644
--- a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
+++ b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
@@ -22,13 +22,13 @@ import platform
 
 if platform.system() != "Windows":
   # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
+  from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import *
 
-  from tensorflow.contrib.util import loader
+  from tensorflow.python.framework import load_library
   from tensorflow.python.platform import resource_loader
   # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
 
-  _trt_engine_op = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_trt_engine_op.so"))
+  _trt_ops = load_library.load_op_library(
+      resource_loader.get_path_to_datafile("_trt_ops.so"))
 else:
   raise RuntimeError("Windows platforms are not supported")
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/segment/segment.cc
rename to tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 084a96e0fa5c97edc58adf2590ed94e5ef0e4d85..4a8a4ac7589a4b68b129e8e88ee999e8a2495728 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 
 #include <queue>
 #include <set>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/segment/union_find.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -32,8 +33,8 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 // A simple graph representation to mirror tensorflow::Graph. This structure
 // helps saving memory since segmenter modifies the graph in place, preventing
@@ -673,10 +674,11 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 3 ---------------------------------
   // Convert the segments into the expected return format
   for (const auto& itr : sg_map) {
-    const std::set<const tensorflow::Node*, NodePtrCompare>& segment_nodes =
-        itr.second;
+    const string& segment_root = itr.first;
+    // Return format does not require set comparator.
+    std::set<const Node*> segment_nodes(itr.second.begin(), itr.second.end());
     if (VLOG_IS_ON(1)) {
-      string s = "parent=" + itr.first + ":";
+      string s = "parent=" + segment_root + ":";
       for (auto node : segment_nodes) s += " " + node->name();
       VLOG(1) << "Segment " << segments->size() << ": " << s;
     }
@@ -689,12 +691,10 @@ tensorflow::Status SegmentGraph(
     }
 
     // TODO(sami): Make segmenter placement aware once trtscopes are in place
-    std::set<string> segment_node_names;
-    for (auto node : itr.second) segment_node_names.insert(node->name());
-    const auto& dev_itr = device_maps.find(itr.first);
+    const auto& dev_itr = device_maps.find(segment_root);
     if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
       VLOG(1) << "No device assigned to segment " << segments->size();
-      segments->emplace_back(std::make_pair(segment_node_names, string()));
+      segments->emplace_back(std::make_pair(segment_nodes, string()));
     } else if (dev_itr->second.size() > 1) {
       string s("Segment ");
       StrAppend(&s, segments->size(), " has multiple devices attached: ");
@@ -703,10 +703,10 @@ tensorflow::Status SegmentGraph(
       }
       LOG(WARNING) << s << " choosing " << *(dev_itr->second.begin());
       segments->emplace_back(
-          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+          std::make_pair(segment_nodes, *(dev_itr->second.begin())));
     } else {
       segments->emplace_back(
-          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+          std::make_pair(segment_nodes, *(dev_itr->second.begin())));
     }
   }
   if (VLOG_IS_ON(1)) {
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
similarity index 83%
rename from tensorflow/contrib/tensorrt/segment/segment.h
rename to tensorflow/compiler/tf2tensorrt/segment/segment.h
index b9693aad1b764515459db6833b05221ea5b3a2d1..9a0ccc9aef475edfb0ffb83a2be21d4d4ca0e028 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
 
 #include <set>
 #include <vector>
@@ -29,10 +29,10 @@ namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
-// Vector of segments, each entry contains a set of node names and a device name
-// in the segment.
-// TODO(aaroey): use node pointer instead of node name.
-using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>;
+// Vector of segments, each entry contains a set of node pointers and a device
+// name in the segment.
+using SegmentNodesVector =
+    std::vector<std::pair<std::set<const Node*>, string>>;
 
 struct SegmentOptions {
   // Segment must contain at least this many nodes.
@@ -60,4 +60,4 @@ tensorflow::Status SegmentGraph(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/segment/segment_test.cc
rename to tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
index 4805ef9c61a7784a1c08cf5eaf504691bc9dbedc..58512d3b09d7c6f523710bc09843c628a5838b53 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -75,7 +75,10 @@ class SegmentTest : public ::testing::Test {
                        const std::vector<std::set<string>>& expected_segments) {
     EXPECT_EQ(expected_segments.size(), segments.size());
     for (int i = 0; i < segments.size(); ++i) {
-      const auto& segment_node_names = segments[i].first;
+      std::set<string> segment_node_names;
+      for (const Node* node : segments[i].first) {
+        segment_node_names.insert(node->name());
+      }
       const auto& expected = expected_segments[i];
       for (const auto& name : expected) {
         EXPECT_TRUE(segment_node_names.count(name))
diff --git a/tensorflow/contrib/tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
similarity index 92%
rename from tensorflow/contrib/tensorrt/segment/union_find.h
rename to tensorflow/compiler/tf2tensorrt/segment/union_find.h
index 1c64ebbb0ae532a4776ab8963515d19fd3b23b4c..6458ae692fd7c922b5fc3bea2e55b613447dbde0 100644
--- a/tensorflow/contrib/tensorrt/segment/union_find.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
 
 namespace tensorflow {
 namespace tensorrt {
@@ -76,4 +76,4 @@ UnionFind<T>* UnionFind<T>::FindRoot() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
similarity index 100%
rename from tensorflow/contrib/tensorrt/tensorrt_test.cc
rename to tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
diff --git a/tensorflow/contrib/tensorrt/test/utils.cc b/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/test/utils.cc
rename to tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
index 276308b3a0a6ce864969afb0179c6a3f00d6b70b..3bcca99afbff8b84d2dd628ae9211ee94e86af2a 100644
--- a/tensorflow/contrib/tensorrt/test/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
 
 #include <unordered_map>
 #include <vector>
diff --git a/tensorflow/contrib/tensorrt/test/utils.h b/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
similarity index 89%
rename from tensorflow/contrib/tensorrt/test/utils.h
rename to tensorflow/compiler/tf2tensorrt/utils/test_utils.h
index 4bb4120206cfaae70107e55d1818e3af2f02717a..bcd628b62f0320f7ce9dfe6240316d876f1d5a20 100644
--- a/tensorflow/contrib/tensorrt/test/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -41,4 +41,4 @@ string GetTestValue(const string& label);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
similarity index 98%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
index 7a2e93414aed56525eaeac876cdac20404bcf6ab..1636cdc30c4df157ed124b160449af645f917252 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
similarity index 93%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
index f857a9de055ee7668f0bf9bc97e030354505081b..59ffb42bad348c78cde32035aff8c7081528b3a6 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
 
 #include <unordered_map>
 
@@ -81,4 +81,4 @@ class TRTDeviceAllocator : public TRTBaseAllocator {
 
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
similarity index 98%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
index beb1284208e4c10ffe1d36ef411cf08f11dbcb78..e457c64928e5df84c7e2726ba3621420f013dbc9 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
similarity index 98%
rename from tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
index dab1dd9343be7d5b033a3e04bf0b49fbbf37e9e5..bf111d3a2ee2fbec9151d12bbb6ff7181761c2aa 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
 
 #include <atomic>
 #include <unordered_map>
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
similarity index 93%
rename from tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
index 65466c9741989fda5f82fc27d813d026f35fe386..10587e99624acfb97730bbbd9dfbcde020ffc669 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
 
 #include <atomic>
 #include <string>
@@ -96,4 +96,4 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
 
 #endif
 #endif
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/log/trt_logger.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index dda0dc9e712eb726800abfb6084f4f708d04825b..c48bd6bf7747d1646c4e450b780822728e8573f1 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
similarity index 86%
rename from tensorflow/contrib/tensorrt/log/trt_logger.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
index 96ccacb791e40143c5c4d9d691bb353702f9a28b..22f4de970a80765b0e1e7e8816134d83aaec7c73 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -41,4 +41,4 @@ class Logger : public nvinfer1::ILogger {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..09c47b36b0ad8074e749342e7d08f139da7ea1f4
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -0,0 +1,192 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
+
+#include <list>
+#include <unordered_map>
+
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+template <class Key, class Value, class HashFunction>
+class LRUCache {
+ public:
+  typedef Value value_type;
+  typedef Key key_type;
+  typedef HashFunction hasher;
+  typedef typename std::unordered_map<key_type, value_type, hasher> map_type;
+  typedef typename map_type::iterator iterator;
+  typedef typename map_type::const_iterator const_iterator;
+
+  LRUCache() : capacity_(0) {}
+  explicit LRUCache(size_t capacity) : capacity_(capacity) {}
+
+  size_t capacity() const { return capacity_; }
+
+  void reserve(size_t capacity) {
+    capacity_ = capacity;
+    DiscardOld();
+  }
+
+  size_t size() const { return objects_.size(); }
+
+  size_t count(const key_type& key) const { return objects_.count(key); }
+
+  value_type& at(const key_type& key) { return Touch(key); }
+
+  const_iterator begin() const { return objects_.begin(); }
+  const_iterator end() const { return objects_.end(); }
+
+  iterator begin() { return objects_.begin(); }
+  iterator end() { return objects_.end(); }
+
+  template <typename... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    DiscardOld(1);
+    std::pair<iterator, bool> result =
+        objects_.emplace(std::forward<Args>(args)...);
+    key_type key = result.first->first;
+    if (result.second) {
+      keys_.push_front(key);
+    } else {
+      TouchNoCheck(key);  // The key must exist in this case.
+    }
+    return result;
+  }
+
+ private:
+  std::unordered_map<key_type, value_type, hasher> objects_;
+  std::list<key_type> keys_;
+  size_t capacity_;
+  value_type not_found_value_;
+
+  value_type& Touch(const key_type& key) {
+    // Check that the key exists, and let it return std::out_of_range error if
+    // not.
+    value_type& value = objects_.at(key);
+    TouchNoCheck(key);
+    return value;
+  }
+
+  void TouchNoCheck(const key_type& key) {
+    auto rank = std::find(keys_.begin(), keys_.end(), key);
+    if (rank != keys_.begin()) {
+      keys_.erase(rank);
+      keys_.push_front(key);
+    }
+  }
+
+  // Creates n free positions in cache
+  tensorflow::Status DiscardOld(size_t n = 0) {
+    if (n > capacity_) {
+      return tensorflow::errors::Internal(
+          "Insufficient capacity in cache (capacity = ", capacity_,
+          ", requested ", n, ")");
+    }
+    while (objects_.size() > (capacity_ - n)) {
+      key_type discard_key = keys_.back();
+      keys_.pop_back();
+      objects_.erase(discard_key);
+    }
+    return tensorflow::Status::OK();
+  }
+};
+
+// Define a hash function for vector<TensorShape> because it is used as the key
+// for the engine cache.
+struct VectorTensorShapeHasher {
+  std::size_t operator()(
+      const std::vector<tensorflow::TensorShape>& key) const {
+    return std::hash<std::string>()(TensorShapeUtils::ShapeListString(key));
+  }
+};
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+struct EngineContext {
+  EngineContext() {}  // Creates an empty context.
+  EngineContext(
+      TrtUniquePtrType<nvinfer1::ICudaEngine>&& input_cuda_engine,
+      TrtUniquePtrType<nvinfer1::IExecutionContext>&& input_execution_context)
+      : cuda_engine(std::move(input_cuda_engine)),
+        execution_context(std::move(input_execution_context)) {}
+
+  mutex mu;
+  TrtUniquePtrType<nvinfer1::ICudaEngine> cuda_engine;
+  TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context
+      GUARDED_BY(mu);
+};
+
+class TRTEngineCacheResource : public tensorflow::ResourceBase {
+ public:
+  TRTEngineCacheResource(OpKernelContext* ctx, size_t capacity)
+      : cache_(capacity) {
+    auto device = ctx->device();
+    auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes());
+    if (!alloc) {
+      LOG(ERROR) << "Can't find device allocator for gpu device "
+                 << device->name();
+      allocator_ = nullptr;
+    } else {
+      allocator_.reset(new TRTDeviceAllocator(alloc));
+    }
+  }
+
+  string DebugString() const override {
+    std::stringstream oss;
+    using std::dec;
+    using std::endl;
+    using std::hex;
+    oss << "TRTEngineCacheResource: ";
+    oss << "TRTBaseAllocator = " << hex << allocator_.get() << dec << ", ";
+    oss << "LRUCache = " << hex << &cache_ << dec << endl;
+    oss << "Containing " << cache_.size() << " entries: " << endl;
+    for (const auto& item : cache_) {
+      oss << TensorShapeUtils::ShapeListString(item.first) << ": " << hex
+          << "ICudaEngine: " << item.second.get()->cuda_engine.get() << ", "
+          << "IExecutionContext: " << item.second.get()->execution_context.get()
+          << dec << endl;
+    }
+    return oss.str();
+  }
+
+  // Keep device allocator for TRT.
+  std::unique_ptr<TRTBaseAllocator> allocator_;
+
+  // Declare cache after allocator so that it is destroyed before allocator is.
+  LRUCache<std::vector<TensorShape>, std::unique_ptr<EngineContext>,
+           VectorTensorShapeHasher>
+      cache_;
+};
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0aa5eb8f7d4ad062c2d8622fa5aa55f823f80dd5
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+TEST(LRUCacheTest, Basic) {
+  LRUCache<int, int, std::hash<int>> cache;
+  cache.reserve(2);
+  // Insert 10
+  cache.emplace(10, 100);
+  EXPECT_EQ(cache.size(), 1);
+  EXPECT_EQ(cache.count(10), 1);
+  EXPECT_EQ(cache.at(10), 100);
+  EXPECT_EQ(cache.count(100), 0);
+  // Insert 20
+  cache.emplace(20, 200);
+  EXPECT_EQ(cache.size(), 2);
+  EXPECT_EQ(cache.count(10), 1);
+  EXPECT_EQ(cache.count(20), 1);
+  EXPECT_EQ(cache.at(10), 100);
+  EXPECT_EQ(cache.at(20), 200);
+  EXPECT_EQ(cache.count(100), 0);
+  EXPECT_EQ(cache.count(200), 0);
+  // Insert 30, Evicting 10
+  cache.emplace(30, 300);
+  EXPECT_EQ(cache.count(10), 0);
+  EXPECT_EQ(cache.count(20), 1);
+  EXPECT_EQ(cache.count(30), 1);
+  // Touch 20
+  cache.at(20);
+  // Insert 40, Evicting 30
+  cache.emplace(40, 400);
+  EXPECT_EQ(cache.count(10), 0);
+  EXPECT_EQ(cache.count(20), 1);
+  EXPECT_EQ(cache.count(30), 0);
+  EXPECT_EQ(cache.count(40), 1);
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.cc
index 9c3698e5d1cc5d6d8d31a8fcaf03d103f1e1915d..0a72a88bc740101bcbadb40bfe106a5b8d284bbf 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h
similarity index 87%
rename from tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h
index 19f39e6d3db1571573fb290dd2c30fd43ea604ef..03879ffff2fa724b05cb1919753e4aaa99e2e702 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resource_manager.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCE_MANAGER_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCE_MANAGER_H_
 #include <memory>
 
 #include <string>
@@ -42,4 +42,4 @@ class TRTResourceManager {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCE_MANAGER_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37f7fe99fbb2b9e121953fc0de211db1bbf34b7a
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+TRTCalibrationResource::~TRTCalibrationResource() {
+  VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+  builder_.reset();
+  engine_.reset();
+  // We need to manually destroy the builder and engine before the allocator
+  // is destroyed.
+  allocator_.reset();
+}
+
+string TRTCalibrationResource::DebugString() const {
+  std::stringstream oss;
+  using std::dec;
+  using std::endl;
+  using std::hex;
+  oss << " Calibrator = " << hex << calibrator_.get() << dec << endl
+      << " Builder    = " << hex << builder_.get() << dec << endl
+      << " Engine     = " << hex << engine_.get() << dec << endl
+      << " Logger     = " << hex << &logger_ << dec << endl
+      << " Allocator  = " << hex << allocator_.get() << dec << endl
+      << " Thread     = " << hex << thr_.get() << dec << endl;
+  return oss.str();
+}
+
+Status TRTCalibrationResource::SerializeToString(string* serialized) {
+  calibrator_->waitAndSetDone();
+  thr_->join();
+  *serialized = calibrator_->getCalibrationTableAsString();
+  if (!serialized->size()) {
+    return tensorflow::errors::Unknown("Calibration table is empty.");
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e8d4b3b738df09b0c2ea82dcc06e9b23a708385
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
+
+#include <list>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class SerializableResourceBase : public tensorflow::ResourceBase {
+ public:
+  virtual Status SerializeToString(string* serialized) = 0;
+};
+
+class TRTCalibrationResource : public SerializableResourceBase {
+ public:
+  ~TRTCalibrationResource() override;
+
+  string DebugString() const override;
+
+  Status SerializeToString(string* serialized) override;
+
+  // Lookup table for temporary staging areas of input tensors for calibration.
+  std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
+
+  // Temporary staging areas for calibration inputs.
+  std::vector<PersistentTensor> device_tensors_;
+
+  std::unique_ptr<TRTInt8Calibrator> calibrator_;
+  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
+  std::unique_ptr<TRTBaseAllocator> allocator_;
+  tensorflow::tensorrt::Logger logger_;
+  // TODO(sami): Use threadpool threads!
+  std::unique_ptr<std::thread> thr_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index d8123e956fac04912b4fed5bf75cc9cb55c5baf9..92ba474fbcd085e3e33ceea4395cca4034969bd9 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -204,6 +204,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
@@ -224,6 +225,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@com_google_absl//absl/types:variant",
     ],
     alwayslink = 1,
 )
@@ -669,6 +671,7 @@ cc_library(
     name = "side_effect_util",
     srcs = ["side_effect_util.cc"],
     hdrs = ["side_effect_util.h"],
+    visibility = [":friends"],
     deps = [
         "//tensorflow/core:core_cpu",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.h b/tensorflow/compiler/tf2xla/cpu_function_runtime.h
index dfc1e8b8aebcf3142e9f61f60171c6b58634c71d..78970fb39bae7067c7668baa2aec65732b5b2352 100644
--- a/tensorflow/compiler/tf2xla/cpu_function_runtime.h
+++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.h
@@ -104,7 +104,7 @@ class BufferInfo {
  private:
   BufferInfo() = default;
 
-  enum class Kind : unsigned {
+  enum class Kind : uint64 {
     kConstant,
     kTempBuffer,
     kEntryParameter,
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index efb75749722893100494e089c0beb96944e9f1d4..5e4699bbb6218089d2e76a36c7351bf7fbd23264 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -88,6 +89,9 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
       case XlaExpression::Kind::kResource:
         return errors::Unimplemented(
             "Resource as function argument is not yet implemented.");
+      case XlaExpression::Kind::kTensorList:
+        return errors::Unimplemented(
+            "TensorList as function argument is not yet implemented.");
       case XlaExpression::Kind::kInvalid:
         return errors::InvalidArgument("Invalid function argument");
     }
@@ -191,6 +195,9 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   // into the functions.
   XlaOpKernelContext xla_op_context(op_context);
 
+  XlaContext& context = XlaContext::Get(op_context);
+  auto* b = context.builder();
+
   XlaCompiler* compiler = xla_op_context.compiler();
 
   NameAttrList func;
@@ -219,8 +226,12 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   TF_RETURN_IF_ERROR(
       PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments));
 
+  bool add_token_input_output =
+      HasNodeAttr(n->def(), kXlaTokenInputNodesAttrName);
+
   XlaCompiler::CompileOptions compile_options;
   compile_options.is_entry_computation = false;
+  compile_options.add_token_input_output = add_token_input_output;
   XlaCompiler::CompilationResult result;
   TF_RETURN_IF_ERROR(
       compiler->CompileFunction(compile_options, func, arguments, &result));
@@ -234,9 +245,19 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
     }
     handles.push_back(expressions[i]->handle());
   }
-
-  XlaContext& context = XlaContext::Get(op_context);
-  auto* b = context.builder();
+  if (add_token_input_output) {
+    std::vector<string> token_input_nodes;
+    TF_RETURN_IF_ERROR(
+        GetNodeAttr(n->def(), kXlaTokenInputNodesAttrName, &token_input_nodes));
+    std::vector<xla::XlaOp> token_inputs;
+    for (const string& node_name : token_input_nodes) {
+      auto token_or = compiler->GetNodeToken(node_name);
+      TF_RETURN_IF_ERROR(token_or.status());
+      token_inputs.push_back(token_or.ConsumeValueOrDie());
+    }
+    xla::XlaOp token_input = xla::AfterAll(b, token_inputs);
+    handles.push_back(token_input);
+  }
 
   auto output_handle = xla::Call(b, *result.computation, handles);
   // The output handle of `Call` computation is a tuple type. Unzip it so
@@ -251,6 +272,10 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
       ++computation_output;
     }
   }
+  if (add_token_input_output) {
+    TF_RETURN_IF_ERROR(compiler->SetNodeToken(
+        n->name(), xla::GetTupleElement(output_handle, computation_output)));
+  }
   return b->first_error();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 47209d285f1a077fd80f779a406e6980892f1646..52d2901e73d16f71ecbf633ede0d2cf553b6e521 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -144,13 +144,22 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client/lib:quantize",
         "//tensorflow/compiler/xla/client/lib:sorting",
         "//tensorflow/compiler/xla/client/lib:triangular_solve",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:linalg_ops_op_lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:resource_variable_ops_op_lib",
         "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:stateless_random_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:concat_lib",
         "//tensorflow/core/kernels:constant_op",
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 46e5d68c78fd9ff26a88dc2a1484c3a67b76f4f3..6b675fa8a94e0bc932baaa359565cbc8e4614ee5 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -39,7 +39,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input,
 
   OP_REQUIRES(
       ctx,
-      xla::ShapeUtil::Rank(crops.shape()) == 2 &&
+      crops.shape().rank() == 2 &&
           block_rank == xla::ShapeUtil::GetDimension(crops.shape(), 0) &&
           2 == xla::ShapeUtil::GetDimension(crops.shape(), 1),
       errors::InvalidArgument("crops should have shape [", block_rank,
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 5e9280c1fe692037b0a842a92ef5a8c28b854a54..ad6b334326a470442c8c0d79b725345d4165be10 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -165,12 +167,8 @@ XLA_MAKE_BINARY(
     xla::Div(xla::Mul(rhs, XlaHelpers::FloatLiteral(b, input_type(0), 0.5)),
              lhs, extend_dimensions));
 
-static xla::XlaOp Square(xla::XlaBuilder* builder, const xla::XlaOp& x) {
-  return xla::Mul(x, x);
-}
-
 XLA_MAKE_BINARY(SquaredDifference,
-                Square(b, xla::Sub(lhs, rhs, extend_dimensions)));
+                xla::Square(xla::Sub(lhs, rhs, extend_dimensions)));
 
 XLA_MAKE_BINARY(TruncateDiv, xla::Div(lhs, rhs, extend_dimensions));
 XLA_MAKE_BINARY(TruncateMod, xla::Rem(lhs, rhs, extend_dimensions));
@@ -195,8 +193,8 @@ XLA_MAKE_BINARY(SoftplusGrad,
 // softsigngrad(gradients, features) = gradients / (1 + abs(features)) ** 2
 XLA_MAKE_BINARY(SoftsignGrad,
                 xla::Div(lhs,
-                         Square(b, xla::Add(XlaHelpers::One(b, input_type(0)),
-                                            xla::Abs(rhs)))));
+                         xla::Square(xla::Add(XlaHelpers::One(b, input_type(0)),
+                                              xla::Abs(rhs)))));
 
 XLA_MAKE_BINARY(TanhGrad,
                 xla::Mul(rhs, xla::Sub(XlaHelpers::One(b, input_type(0)),
@@ -204,6 +202,8 @@ XLA_MAKE_BINARY(TanhGrad,
 
 XLA_MAKE_BINARY(Pow, xla::Pow(lhs, rhs, extend_dimensions));
 
+XLA_MAKE_BINARY(NextAfter, xla::NextAfter(lhs, rhs));
+
 #undef XLA_MAKE_BINARY
 
 class ApproximateEqualOp : public XlaOpKernel {
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 8cc2479dd555380da7500abe6b2aca380110333b..ca2152d6c103e05c06809d85d9529720ff112217 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/compiler/tf2xla/lib/util.h"
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -19,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 namespace tensorflow {
@@ -31,6 +33,7 @@ class CastOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_));
     OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(src_dtype_, &src_type_));
     OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dst_dtype_, &dst_type_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Truncate", &use_truncation_));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -48,6 +51,36 @@ class CastOp : public XlaOpKernel {
       // imaginary part.
       output = xla::ConvertElementType(xla::Real(input), dst_type_);
     } else {
+      if (use_truncation_) {
+        OP_REQUIRES(
+            ctx,
+            xla::primitive_util::IsFloatingPointType(src_type_) &&
+                xla::primitive_util::IsFloatingPointType(dst_type_),
+            errors::Unimplemented("Truncate attribute is only "
+                                  "implemented for floating point datatypes."));
+        int mantissa_difference =
+            xla::primitive_util::SignificandWidth(src_type_) -
+            xla::primitive_util::SignificandWidth(dst_type_);
+        OP_REQUIRES(ctx, mantissa_difference > 0,
+                    errors::Unimplemented(
+                        "Truncate attribute is only implemented in cases where "
+                        "dst datatype "
+                        "has fewer mantissa bits than the src datatype"));
+        int src_bitwidth = xla::primitive_util::BitWidth(src_type_);
+
+        // Bitcast to same-width integer, mask off the LSBs, bitcast back to the
+        // source datatype.
+        int64 mask = ~((1L << mantissa_difference) - 1);
+        xla::PrimitiveType same_width_int =
+            xla::primitive_util::UnsignedIntegralTypeForBitWidth(src_bitwidth);
+        OP_REQUIRES(ctx, same_width_int != xla::PRIMITIVE_TYPE_INVALID,
+                    errors::Unimplemented("Unexpected type bitwidth"));
+        input = xla::BitcastConvertType(
+            xla::And(
+                xla::BitcastConvertType(input, same_width_int),
+                ::tensorflow::IntegerLiteral(builder, same_width_int, mask)),
+            src_type_);
+      }
       output = xla::ConvertElementType(input, dst_type_);
     }
 
@@ -57,6 +90,7 @@ class CastOp : public XlaOpKernel {
  protected:
   DataType src_dtype_, dst_dtype_;
   xla::PrimitiveType src_type_, dst_type_;
+  bool use_truncation_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CastOp);
 };
@@ -79,8 +113,8 @@ class BitcastOp : public XlaOpKernel {
     if (src_dtype_ == dst_dtype_) {
       output = input;
     } else {
-      // The only complex type in XLA is C64, so error out if the bitcast has a
-      // complex source or destination type and the bitcast is not trivial.
+      // Error out if the bitcast has a complex source or destination type and
+      // the bitcast is not trivial.
       OP_REQUIRES(ctx,
                   !xla::primitive_util::IsComplexType(src_type_) &&
                       !xla::primitive_util::IsComplexType(dst_type_),
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index 7199b9b6feb36dd45ef51f4c38463bc715fcc38a..c2b4c28d1566f5429c5d8109db94af0c3762b131 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -99,8 +99,8 @@ class CategoricalOp : public XlaOpKernel {
     xla::PrimitiveType xla_output_type;
     OP_REQUIRES_OK(ctx,
                    DataTypeToPrimitiveType(output_type(0), &xla_output_type));
-    xla::XlaOp argmax = XlaHelpers::ArgMax(softmax_entries, xla_output_type,
-                                           /*axis=*/class_dimension);
+    xla::XlaOp argmax = xla::ArgMax(softmax_entries, xla_output_type,
+                                    /*axis=*/class_dimension);
     if (num_samples == 1) {
       argmax = xla::Reshape(argmax, {batch_size, 1});
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index cd7c7f4a82df7a65829787efcb1fd2f77870e945..91e4d9cea7cbf6075e30250587044174c4b8e7f4 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -24,13 +24,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index dff8af800229b9605bb93e0498bc5e5cf012f244..ff6c54e47c62f0555ef045e25051f6ec5a3c1d39 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -83,6 +83,17 @@ class ConstOp : public XlaOpKernel {
             return;
           }
           break;
+        case DT_COMPLEX128:
+          if (proto_.scomplex_val_size() == 2) {
+            ctx->SetOutput(
+                0,
+                xla::Broadcast(xla::ConstantR0<xla::complex128>(
+                                   b, xla::complex128(proto_.dcomplex_val(0),
+                                                      proto_.dcomplex_val(1))),
+                               shape.dim_sizes()));
+            return;
+          }
+          break;
         case DT_INT32:
           if (proto_.int_val_size() == 1) {
             ctx->SetOutput(
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index b0bc7640307149459a29e6b0b2e8e8132e4141c9..5f99b24e221ba6c926032ef7a1b4bf1e92df7a68 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -26,13 +26,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
@@ -212,8 +212,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
       XLAShapeToTensorShape(out_backprop_shape, &out_backprop_tensor_shape));
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_tensor_shape, filter_tensor_shape,
-      out_backprop_tensor_shape, dilations, strides, padding, data_format,
-      dims);
+      out_backprop_tensor_shape, dilations, strides, padding,
+      /*explicit_paddings=*/{}, data_format, dims);
 }
 
 }  // anonymous namespace
@@ -227,6 +227,11 @@ xla::StatusOr<ConvOpAttrs> ConvOpAttrs::Create(int num_spatial_dims,
   TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", &attrs.dilations));
   TF_RETURN_IF_ERROR(ctx->GetAttr("strides", &attrs.strides));
   TF_RETURN_IF_ERROR(ctx->GetAttr("padding", &attrs.padding));
+  // TODO(reedwm): Support explicit padding.
+  if (attrs.padding == EXPLICIT) {
+    return errors::Unimplemented(
+        "XLA does not yet support Conv2D with explicit padding.");
+  }
 
   string data_format;
   TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format));
@@ -428,23 +433,14 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   int n_dim = GetTensorBatchDimIndex(num_dims, attrs.data_format);
   int c_dim = GetTensorFeatureDimIndex(num_dims, attrs.data_format);
 
-  // The conversion logic below assumes that the data format is NHWC, so we also
-  // check that here.
   bool use_batch_group_count =
-      filter_tensor_shape.dim_size(num_dims - 1) == 1 && attrs.depthwise &&
-      attrs.data_format == FORMAT_NHWC;
+      filter_tensor_shape.dim_size(num_dims - 1) == 1 && attrs.depthwise;
 
   std::vector<std::pair<int64, int64>> padding(attrs.num_spatial_dims);
   std::vector<int64> rhs_dilation(attrs.num_spatial_dims);
   std::vector<int64> window_strides(attrs.num_spatial_dims);
   std::vector<int64> ones(attrs.num_spatial_dims, 1);
 
-  // The activations (inputs) form the LHS of the convolution.
-  // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
-  // For the gradient computation, we flip the roles of the batch and
-  // feature dimensions.
-  // Each spatial entry has size in_depth * batch
-
   // Swap n_dim and c_dim in the activations.
   dnums.set_input_batch_dimension(c_dim);
   dnums.set_input_feature_dimension(n_dim);
@@ -478,7 +474,7 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // convolution, we get the right size for the filter.
     // The padded_in_rows should be such that when we convolve this with the
     // expanded_out_rows as a filter, we should get filter_rows back.
-    //
+
     const int64 padded_in_size =
         dims.spatial_dims[i].expanded_output_size +
         (dims.spatial_dims[i].filter_size - 1) * attrs.dilations[dim];
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index eafdba876ae9e2c38694f065cf83bb3725b8460e..52c3c2c4a903a8c51f6b511774bc0312d39df826 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -25,13 +25,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 6e6ba21daf5bf3eab5bfc15378e77b6dd253da7c..b119997cf39e210ed8e0ae730a08829e72b238b4 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -22,10 +22,10 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 6df8b5367d2390e65995beb1583b225755e6ee9f..a623585aad3b1b8f1f096ca527e7694d74f1ba46 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -21,12 +21,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 41c31d0ed58fe9bc9bbde0bd58993c975f04fd60..6472045265e4d930a5da770a68f5c502192201ae 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -167,13 +167,13 @@ class GatherOp : public XlaOpKernel {
 
       OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
       const auto params_dims = input_shape.dims();
-      if (axis < 0) {
-        axis += params_dims;
-      }
       OP_REQUIRES(
-          context, 0 <= axis && axis < params_dims,
+          context, -params_dims <= axis && axis < params_dims,
           errors::InvalidArgument("Expected axis in the range [", -params_dims,
                                   ", ", params_dims, "), but got ", axis));
+      if (axis < 0) {
+        axis += params_dims;
+      }
     }
 
     DataType index_type = input_type(1);
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 4f0f0fd9aefecc3d31f8bd9c8ca40ebb0860c82d..aa5637e2669555da17af8bb05ab08beeba6a89c3 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -80,7 +80,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.name = resource->name();
       VLOG(2) << "Resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString()
+              << " shape: " << arg.HumanString()
               << " initialized: " << arg.initialized;
 
       num_resource_args++;
@@ -89,7 +89,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.type = input_types_[i];
       arg.shape = ctx->InputShape(i + 1);
       VLOG(2) << "Arg type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString();
+              << " shape: " << arg.HumanString();
     }
   }
 
@@ -150,12 +150,12 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
   OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape then_input_shape = then_result.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(then_input_shape),
+  OP_REQUIRES(ctx, then_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
   OP_REQUIRES(ctx, else_result.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape else_input_shape = else_result.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(else_input_shape),
+  OP_REQUIRES(ctx, else_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
   OP_REQUIRES(ctx,
               xla::ShapeUtil::Compatible(then_input_shape, else_input_shape),
@@ -248,7 +248,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
         xla::GetTupleElement(outputs, output_types_.size() + num_resource_args);
     auto shape_or = b->GetShape(token_output);
     OP_REQUIRES_OK(ctx, shape_or.status());
-    OP_REQUIRES(ctx, xla::ShapeUtil::IsToken(shape_or.ValueOrDie()),
+    OP_REQUIRES(ctx, shape_or.ValueOrDie().IsToken(),
                 errors::FailedPrecondition(
                     "Token output is not token type: ",
                     xla::ShapeUtil::HumanString(shape_or.ValueOrDie())));
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index 96ddd42e2ae04d454e4fb85628d139e17a543d2e..92b20fe0ba5611ca5314cd954026f7b71ea75f84 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -25,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -185,19 +187,20 @@ class AdjustContrastOpV2 : public XlaOpKernel {
                                         factor_shape.DebugString()));
 
     xla::XlaBuilder* b = context->builder();
-    xla::XlaOp input = context->Input(0);
-    xla::XlaOp factor = context->Input(1);
-
     DataType type = context->input_type(0);
 
+    xla::XlaOp input = context->Input(0);
+    xla::XlaOp factor = XlaHelpers::ConvertElementType(context->Input(1), type);
+
     const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
     auto converted = XlaHelpers::ConvertElementType(input, accumulation_type);
     auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                               *context->GetOrCreateAdd(accumulation_type),
                               {height_dim, width_dim});
-    auto output = XlaHelpers::ConvertElementType(reduce, type);
-    output =
-        xla::Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
+
+    auto output = xla::Div(
+        reduce, XlaHelpers::FloatLiteral(b, accumulation_type, height * width));
+    output = XlaHelpers::ConvertElementType(output, type);
 
     std::vector<int64> broadcast_dims(input_shape.dims() - 2);
     std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
@@ -233,8 +236,10 @@ class AdjustSaturationOp : public XlaOpKernel {
                                 channels, " channels."));
 
     xla::XlaBuilder* b = context->builder();
-    xla::XlaOp input = context->Input(0);
-    xla::XlaOp scale = context->Input(1);
+    xla::XlaOp input =
+        XlaHelpers::ConvertElementType(context->Input(0), DT_FLOAT);
+    xla::XlaOp scale =
+        XlaHelpers::ConvertElementType(context->Input(1), DT_FLOAT);
 
     DataType type = context->input_type(0);
 
@@ -249,15 +254,17 @@ class AdjustSaturationOp : public XlaOpKernel {
                                       /*dimno=*/channel_dim);
     TensorShape channel_shape = input_shape;
     channel_shape.set_dim(channel_dim, 1);
-    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
-                        channel_shape);
+    auto hsv =
+        RGBToHSV(context, b, {red, green, blue}, DT_FLOAT, channel_shape);
 
-    hsv[1] = xla::Clamp(XlaHelpers::Zero(b, type), xla::Mul(hsv[1], scale),
-                        XlaHelpers::One(b, type));
+    hsv[1] = xla::Clamp(XlaHelpers::Zero(b, DT_FLOAT), xla::Mul(hsv[1], scale),
+                        XlaHelpers::One(b, DT_FLOAT));
 
-    auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0));
+    auto rgb = HSVToRGB(context->builder(), hsv, DT_FLOAT);
 
-    context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim));
+    auto output = XlaHelpers::ConvertElementType(
+        xla::ConcatInDim(b, rgb, channel_dim), type);
+    context->SetOutput(0, output);
   }
 };
 REGISTER_XLA_OP(Name("AdjustSaturation"), AdjustSaturationOp);
@@ -283,8 +290,10 @@ class AdjustHueOp : public XlaOpKernel {
                                 channels, " channels."));
 
     xla::XlaBuilder* b = context->builder();
-    xla::XlaOp input = context->Input(0);
-    xla::XlaOp delta = context->Input(1);
+    xla::XlaOp input =
+        XlaHelpers::ConvertElementType(context->Input(0), DT_FLOAT);
+    xla::XlaOp delta =
+        XlaHelpers::ConvertElementType(context->Input(1), DT_FLOAT);
 
     DataType type = context->input_type(0);
 
@@ -299,20 +308,22 @@ class AdjustHueOp : public XlaOpKernel {
                                       /*dimno=*/channel_dim);
     TensorShape channel_shape = input_shape;
     channel_shape.set_dim(channel_dim, 1);
-    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
-                        channel_shape);
+    auto hsv =
+        RGBToHSV(context, b, {red, green, blue}, DT_FLOAT, channel_shape);
 
-    auto zero = XlaHelpers::Zero(b, type);
-    auto one = XlaHelpers::One(b, type);
+    auto zero = XlaHelpers::Zero(b, DT_FLOAT);
+    auto one = XlaHelpers::One(b, DT_FLOAT);
 
     auto& hue = hsv[0];
     hue = xla::Rem(xla::Add(hsv[0], delta), one);
     hue =
         xla::Select(xla::Lt(hue, zero), xla::Rem(xla::Add(one, hue), one), hue);
 
-    auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0));
+    auto rgb = HSVToRGB(context->builder(), hsv, DT_FLOAT);
 
-    context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim));
+    auto output = XlaHelpers::ConvertElementType(
+        xla::ConcatInDim(b, rgb, channel_dim), type);
+    context->SetOutput(0, output);
   }
 };
 REGISTER_XLA_OP(Name("AdjustHue"), AdjustHueOp);
@@ -351,24 +362,26 @@ struct SuppressBodyFn {
     auto num_outputs_so_far = values[1];
     auto iou_mask = values[2];
     auto included_iou = values[3];
-    auto zero_r1 = xla::ConstantR1<int32>(builder, {0});
+    auto zero = xla::ConstantR0<int32>(builder, 0);
     // Determine if current elem is active using a slice.
-    auto row_idx_r1 = xla::Reshape(row_idx, {1});
-    auto active_elem = xla::DynamicSlice(included_iou, row_idx_r1, {1});
+    // TODO(b/118437727): The only reason we need an explicit vector is because
+    // some old GCCs can't deduce the right type for MakeConstSpan, and
+    // providing a single-value initializer list directly uses the wrong
+    // overload. Delete this once the deprecated overload is gone.
+    std::vector<xla::XlaOp> row_idx_vector = {row_idx};
+    auto active_elem = xla::DynamicSlice(included_iou, row_idx_vector, {1});
     active_elem = xla::Reshape(active_elem, {});
     // Increment output count iff current elem is not suppressed.
     num_outputs_so_far = xla::Select(
         active_elem, num_outputs_so_far + xla::ConstantR0<int32>(builder, 1),
         num_outputs_so_far);
     // Slice out the row_idx.
-    auto starts = xla::ConcatInDim(builder, {row_idx_r1, zero_r1}, 0);
-    auto row_iou = xla::DynamicSlice(iou_mask, starts, {1, num_boxes});
+    auto row_iou = xla::DynamicSlice(iou_mask, {row_idx, zero}, {1, num_boxes});
     // Remove the diagonal from consideration. An elem cannot suppress
     // itself.
-    auto update_starts = xla::ConcatInDim(builder, {zero_r1, row_idx_r1}, 0);
     row_iou = xla::DynamicUpdateSlice(
         row_iou, xla::ConstantR2FromArray2D<bool>(builder, {{false}}),
-        update_starts);
+        {zero, row_idx});
     // Create a suppression by inverting polarity.
     row_iou = xla::Reshape(row_iou, {num_boxes});
     auto supp_mask = xla::Not(row_iou);
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 5a10c52ba8b6d4fab73f0dda67cbd52fd625e76b..b96d45316f626e678a64392a4315979eeeb6e83c 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -72,10 +72,10 @@ namespace {
 // from in_size to out_size.
 struct ResizeConvolutionDims {
   // Size of the kernel to use.
-  std::vector<int64> kernel_size;
+  std::vector<int64> kernel_size;  // k
 
   // Stride of the convolution to use.
-  std::vector<int64> stride;
+  std::vector<int64> stride;  // S
 };
 ResizeConvolutionDims ComputeResizeConvolutionParameters(
     absl::Span<const int64> in_size, absl::Span<const int64> out_size,
@@ -117,8 +117,10 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters(
 //                        + dims.stride * (out_size - 1)
 int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
                             int64 stride) {
-  return (2 * kernel_size - 1) + (out_size - 1) * stride - (kernel_size - 1) -
-         1 - (kernel_size * (in_size - 1));
+  int64 padding = (2 * kernel_size - 1) + (out_size - 1) * stride -
+                  (kernel_size - 1) - 1 - (kernel_size * (in_size - 1));
+
+  return padding;
 }
 
 // Form a 2D convolution kernel like:
@@ -132,7 +134,7 @@ int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
 // If the 2D kernel would be very large, the 1D kernel can be applied once in
 // each dimension due to the symmetry of the kernel along all axis to reduce the
 // computational intensity.
-xla::XlaOp Make1DKernel(xla::XlaBuilder* builder, int64 n) {
+xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder, int64 n) {
   std::vector<float> kernel(n * 2 - 1);
   for (int64 i = 0; i < n; ++i) {
     float v = (i + 1.0f) / n;
@@ -142,43 +144,64 @@ xla::XlaOp Make1DKernel(xla::XlaBuilder* builder, int64 n) {
   return xla::ConstantR1<float>(builder, kernel);
 }
 
+// Unlike the bilinear kernel, which is triangular, the nearest neighbor
+// kernel is a square. For example, a 1D kernel with n=3 would look like
+// [0 1 1 1 0]
+// and n=4 would look like
+// [0 0 1 1 1 1 0].
+// Note that in the second case, the kernel is not symmetric and we default
+// to the right (because an existing non TPU kernel
+// for nearest neighbor resize already chose to default to the right,
+// so we want to be consistent).
+xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder, int64 n) {
+  std::vector<float> kernel(n * 2 - 1, 0.0f);
+  std::fill(&kernel[n / 2], &kernel[(3 * n) / 2], 1.0f);
+
+  return xla::ConstantR1<float>(builder, kernel);
+}
+
 // Kernels with more than 16 spatial elements are considered intense and the
-// kernel should applied to each dimension independently.
+// kernel should be applied to each dimension independently.
 const int64 kMax2DKernelSize = 16;
 
-xla::XlaOp MakeBilinearResizeKernel(xla::XlaBuilder* builder,
-                                    absl::Span<const int64> kernel_size,
-                                    int64 channels) {
+xla::XlaOp MakeGeneralResizeKernel(xla::XlaBuilder* builder,
+                                   absl::Span<const int64> kernel_size,
+                                   int64 channels, bool is_kernel_bilinear) {
+  auto make_kernel_func =
+      is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
+
   auto depthwise_kernel = xla::Broadcast(
       xla::Zero(builder, xla::F32),
       {(2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1});
 
   return xla::Mul(
-      xla::Add(depthwise_kernel, Make1DKernel(builder, kernel_size[1]),
+      xla::Add(depthwise_kernel, make_kernel_func(builder, kernel_size[1]),
                /*broadcast_dimensions=*/{1}),
-      Make1DKernel(builder, kernel_size[0]),
+      make_kernel_func(builder, kernel_size[0]),
       /*broadcast_dimensions=*/{0});
 }
 
-xla::XlaOp MakeBilinearResizeKernelInDim(xla::XlaBuilder* builder,
-                                         absl::Span<const int64> kernel_size,
-                                         int64 channels, int64 dim) {
+xla::XlaOp MakeGeneralResizeKernelInDim(xla::XlaBuilder* builder,
+                                        absl::Span<const int64> kernel_size,
+                                        int64 channels, int64 dim,
+                                        bool is_kernel_bilinear) {
+  auto make_kernel_func =
+      is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
+
   auto depthwise_kernel =
       xla::Broadcast(xla::Zero(builder, xla::F32),
                      {dim == 0 ? (2 * kernel_size[0] - 1) : 1,
                       dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1});
-  return xla::Add(depthwise_kernel, Make1DKernel(builder, kernel_size[dim]),
+  return xla::Add(depthwise_kernel, make_kernel_func(builder, kernel_size[dim]),
                   /*broadcast_dimensions=*/{dim});
 }
 
-xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
-                                             const xla::XlaOp& input,
-                                             const int num_spatial_dims,
-                                             std::vector<int64> in_size,
-                                             std::vector<int64> out_size,
-                                             const int64 channels,
-                                             const bool align_corners) {
-  // Picture for a 1x3 to 1x4 resize:
+xla::XlaOp ResizeUsingDilationAndConvolution(
+    xla::XlaBuilder* builder, const xla::XlaOp& input,
+    const int num_spatial_dims, std::vector<int64> in_size,
+    std::vector<int64> out_size, const int64 channels, const bool align_corners,
+    bool is_kernel_bilinear) {
+  // Picture for a 1x3 to 1x4 bilinear resize:
   // stride = 2, kernel size = 3
   // Input:
   // 3 6 9
@@ -264,8 +287,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
   // Split convolutions into independent dimensions if they would be a very
   // large kernel.
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel =
-        MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, dims.kernel_size,
+                                                channels, is_kernel_bilinear);
     output =
         xla::ConvGeneralDilated(input_data, kernel, dims.stride,
                                 /*padding=*/
@@ -275,8 +298,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
                                 /*rhs_dilation=*/{1, 1}, dimension_numbers,
                                 /*feature_group_count=*/channels);
   } else {
-    xla::XlaOp kernel0 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
+    xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
+        builder, dims.kernel_size, channels, 0, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         input_data, kernel0, {dims.stride[0], 1},
         /*padding=*/
@@ -284,8 +307,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
         /*lhs_dilation=*/{dims.kernel_size[0], 1},
         /*rhs_dilation=*/{1, 1}, dimension_numbers,
         /*feature_group_count=*/channels);
-    xla::XlaOp kernel1 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1);
+    xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
+        builder, dims.kernel_size, channels, 1, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         output, kernel1, {1, dims.stride[1]},
         /*padding=*/
@@ -306,13 +329,11 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
   return output;
 }
 
-xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
-                                                   const xla::XlaOp& grad,
-                                                   const int num_spatial_dims,
-                                                   std::vector<int64> in_size,
-                                                   std::vector<int64> grad_size,
-                                                   const int64 channels,
-                                                   const bool align_corners) {
+xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
+    xla::XlaBuilder* builder, const xla::XlaOp& grad,
+    const int num_spatial_dims, std::vector<int64> in_size,
+    std::vector<int64> grad_size, const int64 channels,
+    const bool align_corners, bool is_kernel_bilinear) {
   ResizeConvolutionDims dims =
       ComputeResizeConvolutionParameters(in_size, grad_size, align_corners);
 
@@ -332,8 +353,8 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
   dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims);
   xla::XlaOp output;
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel =
-        MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, dims.kernel_size,
+                                                channels, is_kernel_bilinear);
 
     // Broadcast the input kernel where the forward op expanded from a size == 1
     // dimension to a size > 1 dimension. This has the effect of summing the
@@ -355,14 +376,14 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
         /*rhs_dilation=*/{1, 1}, dimension_numbers,
         /*feature_group_count=*/channels);
   } else {
-    xla::XlaOp kernel0 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
-    xla::XlaOp kernel1 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1);
-
-    // Broadcast the input kernel where the forward op expanded from a size == 1
-    // dimension to a size > 1 dimension. This has the effect of summing the
-    // gradient contributions in that dimension.
+    xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
+        builder, dims.kernel_size, channels, 0, is_kernel_bilinear);
+    xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
+        builder, dims.kernel_size, channels, 1, is_kernel_bilinear);
+
+    // Broadcast the input kernel where the forward op expanded from a
+    // size == 1 dimension to a size > 1 dimension. This has the effect of
+    // summing the gradient contributions in that dimension.
     if (in_size[0] == 1 && grad_size[0] > 1) {
       kernel0 =
           xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[0], 0),
@@ -407,109 +428,139 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
   return output;
 }
 
-class ResizeBilinearOp : public XlaOpKernel {
- public:
-  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
+                    bool is_kernel_bilinear) {
+  xla::XlaBuilder* b = ctx->builder();
+
+  TensorShape input_shape = ctx->InputShape(0);
+  OP_REQUIRES(ctx, input_shape.dims() == 4,
+              errors::InvalidArgument("input must be 4-dimensional",
+                                      input_shape.DebugString()));
+  // First dimension always assumed to be batch
+  const int64 batch = input_shape.dim_size(0);
+  std::vector<int64> in_size = {input_shape.dim_size(1),
+                                input_shape.dim_size(2)};
+  // Last/4th dimension always assumed to be num channels
+  const int64 channels = input_shape.dim_size(3);
+  OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+              errors::InvalidArgument("input size must be positive, got [",
+                                      in_size[0], ",", in_size[1], "]"));
+
+  std::vector<int64> out_size;
+  OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
+  OP_REQUIRES(ctx, out_size.size() == 2,
+              errors::InvalidArgument("output size must be length 2, got ",
+                                      out_size.size()));
+  OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
+              errors::InvalidArgument("output size must be positive, got [",
+                                      out_size[0], ",", out_size[1], "]"));
+
+  const int num_spatial_dims = 2;
+
+  xla::XlaOp input = ctx->Input(0);
+
+  // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
+  // dimension i.
+  bool slice_input = false;
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] > 1 && out_size[i] == 1) {
+      // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+      // entry before resizing.
+      slice_input = true;
+      in_size[i] = 1;
+    }
+  }
+  if (slice_input) {
+    input = xla::Slice(input, {0, 0, 0, 0},
+                       {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
   }
 
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
+  // Output is always type float.
+  input = xla::ConvertElementType(input, xla::F32);
+
+  // Special Case:
+  // Instead of doing a ResizeUsingDilationAndConvolution directly,
+  // while (out_size[0]-1) = c * 2^x * (in_size[0]-1) for x>1 c>1, resize the
+  // image to 2*(in_size[0]-1)+1 x-times and then resize by scale c(int here).
+  // Instead of resizing directly we resize it iteratively.
+  //
+  // Since bilinear resize can be broken down as 2 sequential linear
+  // operations along different dimensions.
+  // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize
+  // from image of size axb -> cxd is same as resizing axb -> exf -> cxd.
+  // This does not work in the case of align_corners_=false because of special
+  // padding requirements that cause multiple resizes to be very different
+  // from a single resize.
+  //
+  // This makes the convolutions kernels smaller and the operation faster.
+  xla::XlaOp output = input;
+  while (in_size != out_size) {
+    if (in_size[0] != 1 && in_size[1] != 1) {
+      std::vector<float> k = {
+          (static_cast<float>(out_size[0]) - 1) / ((in_size[0] - 1) * 2),
+          (static_cast<float>(out_size[1]) - 1) / ((in_size[1] - 1) * 2)};
+      if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
+          k[0] > 1 && k[1] > 1 && align_corners_) {
+        std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
+                                            (in_size[1] - 1) * 2 + 1};
+        output = ResizeUsingDilationAndConvolution(
+            b, input, num_spatial_dims, in_size, next_out_size, channels,
+            align_corners_, is_kernel_bilinear);
+        input = output;
+        in_size = next_out_size;
+      } else {
+        output = ResizeUsingDilationAndConvolution(
+            b, input, num_spatial_dims, in_size, out_size, channels,
+            align_corners_, is_kernel_bilinear);
+        in_size = out_size;
+      }
+    } else {
+      output = ResizeUsingDilationAndConvolution(
+          b, input, num_spatial_dims, in_size, out_size, channels,
+          align_corners_, is_kernel_bilinear);
+      in_size = out_size;
+    }
+  }
 
-    TensorShape input_shape = ctx->InputShape(0);
-    OP_REQUIRES(ctx, input_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input_shape.DebugString()));
-    const int64 batch = input_shape.dim_size(0);
-    std::vector<int64> in_size = {input_shape.dim_size(1),
-                                  input_shape.dim_size(2)};
-    const int64 channels = input_shape.dim_size(3);
-    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
-                errors::InvalidArgument("input size must be positive, got [",
-                                        in_size[0], ",", in_size[1], "]"));
+  ctx->SetOutput(0, output);
+}
 
-    std::vector<int64> out_size;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
-    OP_REQUIRES(ctx, out_size.size() == 2,
-                errors::InvalidArgument("output size must be length 2, got ",
-                                        out_size.size()));
-    OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
-                errors::InvalidArgument("output size must be positive, got [",
-                                        out_size[0], ",", out_size[1], "]"));
+class ResizeNearestNeighborOp : public XlaOpKernel {
+ public:
+  explicit ResizeNearestNeighborOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented("ResizeNearestNeighbor with align_corners=False "
+                              "is not yet implemented"));
+  }
 
-    const int num_spatial_dims = 2;
+  void Compile(XlaOpKernelContext* ctx) override {
+    GeneralCompile(ctx, align_corners_, is_kernel_bilinear_);
+  }
 
-    xla::XlaOp input = ctx->Input(0);
+ private:
+  bool align_corners_ = true;
+  bool is_kernel_bilinear_ = false;
+};
 
-    // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
-    // dimension i.
-    bool slice_input = false;
-    for (int i = 0; i < num_spatial_dims; ++i) {
-      if (in_size[i] > 1 && out_size[i] == 1) {
-        // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
-        // entry before resizing.
-        slice_input = true;
-        in_size[i] = 1;
-      }
-    }
-    if (slice_input) {
-      input =
-          xla::Slice(input, {0, 0, 0, 0},
-                     {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
-    }
+REGISTER_XLA_OP(Name("ResizeNearestNeighbor").CompileTimeConstantInput("size"),
+                ResizeNearestNeighborOp);
 
-    // Output is always type float.
-    input = xla::ConvertElementType(input, xla::F32);
-
-    // Special Case:
-    // Instead of doing a ResizeUsingDilationAndConvolution directly,
-    // while (out_size[0]-1) = c * 2^x * (in_size[0]-1) for x>1 c>1, resize the
-    // image to 2*(in_size[0]-1)+1 x-times and then resize by scale c(int here).
-    // Instead of resizing directly we resize it iteratively.
-    //
-    // Since bilinear resize can be broken down as 2 sequential linear
-    // operations along different dimensions.
-    // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize
-    // from image of size axb -> cxd is same as resizing axb -> exf -> cxd.
-    // This does not work in the case of align_corners_=false because of special
-    // padding requirements that cause multiple resizes to be very different
-    // from a single resize.
-    //
-    // This makes the convolutions kernels smaller and the operation faster.
-    xla::XlaOp output = input;
-    while (in_size != out_size) {
-      if (in_size[0] != 1 && in_size[1] != 1) {
-        std::vector<float> k = {
-            (static_cast<float>(out_size[0]) - 1) / ((in_size[0] - 1) * 2),
-            (static_cast<float>(out_size[1]) - 1) / ((in_size[1] - 1) * 2)};
-        if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
-            k[0] > 1 && k[1] > 1 && align_corners_) {
-          std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
-                                              (in_size[1] - 1) * 2 + 1};
-          output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                     in_size, next_out_size,
-                                                     channels, align_corners_);
-          input = output;
-          in_size = next_out_size;
-        } else {
-          output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                     in_size, out_size,
-                                                     channels, align_corners_);
-          in_size = out_size;
-        }
-      } else {
-        output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                   in_size, out_size, channels,
-                                                   align_corners_);
-        in_size = out_size;
-      }
-    }
+class ResizeBilinearOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+  }
 
-    ctx->SetOutput(0, output);
+  void Compile(XlaOpKernelContext* ctx) override {
+    GeneralCompile(ctx, align_corners_, is_kernel_bilinear_);
   }
 
  private:
-  bool align_corners_;
+  bool align_corners_ = true;
+  bool is_kernel_bilinear_ = true;
 };
 
 REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstantInput("size"),
@@ -581,19 +632,19 @@ class ResizeBilinearGradOp : public XlaOpKernel {
                                                (in_size[1] - 1) * 2 + 1};
           output = ResizeUsingDilationAndConvolutionGradOp(
               b, grad, num_spatial_dims, in_size, next_grad_size, channels,
-              align_corners_);
+              align_corners_, true);
           grad = output;
           in_size = next_grad_size;
         } else {
           output = ResizeUsingDilationAndConvolutionGradOp(
               b, grad, num_spatial_dims, in_size, grad_size, channels,
-              align_corners_);
+              align_corners_, true);
           in_size = grad_size;
         }
       } else {
         output = ResizeUsingDilationAndConvolutionGradOp(
             b, grad, num_spatial_dims, in_size, grad_size, channels,
-            align_corners_);
+            align_corners_, true);
         in_size = grad_size;
       }
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index 843b6bb4e658af16fd753c1a20b35dd3d18df027..c1539f48d4f729510b2d930de91666a7c31f1ef0 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -18,17 +18,16 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/index_ops.h"
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 XlaArgMinMaxOp::XlaArgMinMaxOp(OpKernelConstruction* ctx, bool is_min)
@@ -66,9 +65,9 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaOp input = ctx->Input(0);
   xla::XlaOp output;
   if (is_min_) {
-    output = XlaHelpers::ArgMin(input, index_xla_type, axis);
+    output = xla::ArgMin(input, index_xla_type, axis);
   } else {
-    output = XlaHelpers::ArgMax(input, index_xla_type, axis);
+    output = xla::ArgMax(input, index_xla_type, axis);
   }
 
   ctx->SetOutput(0, output);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index e2c05b648bb194b1b452c527ddb1a2c5995b1217..e4bbdef6480104a1051acfc647644deb65c80171 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -16,16 +16,16 @@ limitations under the License.
 // Native XLA implementations of indexing ops.
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -74,7 +74,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     // shape isn't supported.
     if (!ctx->compiler()->options().allow_cpu_custom_calls ||
         (input_dims != 1 && input_dims != 2)) {
-      xla::XlaOp output = XlaHelpers::ArgMax(ctx->Input(0), output_type, axis);
+      xla::XlaOp output = xla::ArgMax(ctx->Input(0), output_type, axis);
       ctx->SetOutput(0, output);
       return;
     }
@@ -110,8 +110,8 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
       auto shape_status = b.GetShape(arg);
       OP_REQUIRES_OK(ctx, shape_status.status());
       xla::Shape arg_shape = shape_status.ConsumeValueOrDie();
-      *arg_shape.mutable_layout() = xla::LayoutUtil::MakeDescendingLayout(
-          xla::ShapeUtil::Rank(arg_shape));
+      *arg_shape.mutable_layout() =
+          xla::LayoutUtil::MakeDescendingLayout(arg_shape.rank());
       arg_shapes.push_back(std::move(arg_shape));
     }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index 6440770c29894c951f010f6c1deb929f4fe79bbf..f36e0025250b3a196b31755a1ddf6620c415b6a3 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -24,8 +24,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr std::array<DataType, 5> kMatmulTypes = {
-    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
+constexpr std::array<DataType, 6> kMatmulTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}};
 
 class MatMulOp : public XlaOpKernel {
  public:
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index f6b8534f4d7c537e5b708ee000e00cb92123584b..656f9b898f32dfc05215014f51c2bbaf07580836 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -38,8 +38,7 @@ class MirrorPadOp : public XlaOpKernel {
     // - [1, 2, 3, 3, 2] in symmetric mode.
     int64 excluded_edges = mode == MirrorPadMode::REFLECT ? 1 : 0;
     xla::XlaOp accum = t;
-    for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0;
-         --dimno) {
+    for (int64 dimno = original_shape.rank() - 1; dimno >= 0; --dimno) {
       auto t_rev = xla::Rev(accum, {dimno});
       int64 lhs_padding = pad_literal.Get<int64>({dimno, 0});
       int64 rhs_padding = pad_literal.Get<int64>({dimno, 1});
diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
index a9b519d8928cc2807831fd6b4f12e60b7d58ea55..426a0941df57f19072d1cb9f3fa3d0079db465c5 100644
--- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 06c6cc37ec90192486ba15010bfeb763a9ffb987..23bb050a34d9246cdf73090aa6adfca054bf8bcf 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -26,10 +26,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/pooling_ops_common.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 2d92056e4f522f6206e7d632f0fa1e8b793fd6e3..01b047f732f0e9fb3b45b272e7886e2f8cf4fff4 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -160,17 +160,24 @@ class RandomShuffleOp : public XlaOpKernel {
         -> xla::StatusOr<std::vector<xla::XlaOp>> {
       auto swaps = loop_vars[0];
       auto indices = loop_vars[1];
-      i = xla::Reshape(i, {1});
+      // TODO(b/118437727): The absl::Span nonsense is only necessary because
+      // the deprecated overload creates ambiguity for the single-element span
+      // case. Remove it once the deprecated overload is gone.
       // temp = indices[i]
-      auto temp = xla::DynamicSlice(indices, i, {1});
+      auto temp =
+          xla::DynamicSlice(indices, absl::Span<const xla::XlaOp>({i}), {1});
       // swap_index = swaps[i]
-      auto swap_index = xla::DynamicSlice(swaps, i, {1});
+      auto swap_index = xla::Reshape(
+          xla::DynamicSlice(swaps, absl::Span<const xla::XlaOp>({i}), {1}), {});
       // swap_value = indices[swaps[i]]
-      auto swap_value = xla::DynamicSlice(indices, swap_index, {1});
+      auto swap_value = xla::DynamicSlice(
+          indices, absl::Span<const xla::XlaOp>({swap_index}), {1});
       // indices[i] = indices[swaps[i]]
-      indices = xla::DynamicUpdateSlice(indices, swap_value, i);
+      indices = xla::DynamicUpdateSlice(indices, swap_value,
+                                        absl::Span<const xla::XlaOp>({i}));
       // indices[swaps[i]] = temp
-      indices = xla::DynamicUpdateSlice(indices, temp, swap_index);
+      indices = xla::DynamicUpdateSlice(
+          indices, temp, absl::Span<const xla::XlaOp>({swap_index}));
       return std::vector<xla::XlaOp>{swaps, indices};
     };
     // for i in range(n):
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index 4b9e1a578be2445091228953df7e5c5e82b42c28..daefdfc58a4957d9e685d25aa90da6218f2041ad 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc
index 9e4c57c9bf73369662274f6b783418e18ff860c2..aaf8c6075dd292e33e70683774a6c1bf374183e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/select_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index b1fa2915d59e4e5e2f2523e20e9a37898d087117..7a620d2a6518f8686ef570b33aac971d1dccb6c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -157,9 +157,11 @@ class LinSpaceOp : public XlaOpKernel {
           flat(0) = start;
         } else {
           const float step = (stop - start) / (num - 1);
-          for (int64 i = 0; i < num; ++i) {
+          for (int64 i = 0; i < num - 1; ++i) {
             flat(i) = start + step * i;
           }
+          // The last value in the sequence must be equal to stop.
+          flat(num - 1) = stop;
         }
         break;
       }
@@ -171,9 +173,11 @@ class LinSpaceOp : public XlaOpKernel {
           flat(0) = start;
         } else {
           const double step = (stop - start) / (num - 1);
-          for (int64 i = 0; i < num; ++i) {
+          for (int64 i = 0; i < num - 1; ++i) {
             flat(i) = start + step * i;
           }
+          // The last value in the sequence must be equal to stop.
+          flat(num - 1) = stop;
         }
         break;
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 12830816ec16c9797f0fe4d8f3f13f5a8176161d..31d4cc131600f360c764ffa02831046c85d846e5 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -20,10 +20,11 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -91,14 +92,20 @@ class SizeOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_shape = ctx->InputShape(0);
-    const int64 size = input_shape.num_elements();
-    OP_REQUIRES(ctx, FastBoundsCheck(size, std::numeric_limits<int32>::max()),
+    OP_REQUIRES(ctx,
+                FastBoundsCheck(input_shape.num_elements(),
+                                std::numeric_limits<int32>::max()),
                 errors::InvalidArgument("Size does not work for tensors > "
                                         "int32 max."));
     Tensor size_constant(DT_INT32, TensorShape({}));
-    size_constant.scalar<int32>()() = static_cast<int32>(size);
-
-    ctx->SetConstantOutput(0, size_constant);
+    const int rank = input_shape.dims();
+    xla::XlaBuilder* builder = ctx->builder();
+    auto size = xla::One(builder, xla::U32);
+    for (int64 i = 0; i < rank; ++i) {
+      size = xla::Mul(size, xla::GetDimensionSize(ctx->Input(0), i));
+    }
+    size = xla::ConvertElementType(size, xla::S32);
+    ctx->SetOutput(0, size);
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
index 76ea5f525598f511f295eb5a30f3cf603fbf57aa..b18e3f965c427aec456ce2b188dad79485df23cc 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_util.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index 622efac81766fc3ddaf538b58170f34fce06927a..52bed2670b4b8408e3b2f72b64bf370aea5325f6 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -39,7 +39,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input,
 
   OP_REQUIRES(
       ctx,
-      xla::ShapeUtil::Rank(paddings.shape()) == 2 &&
+      paddings.shape().rank() == 2 &&
           block_rank == xla::ShapeUtil::GetDimension(paddings.shape(), 0) &&
           2 == xla::ShapeUtil::GetDimension(paddings.shape(), 1),
       errors::InvalidArgument("paddings should have shape [", block_rank,
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index 8e9e4daf99d3dd3b8e149e3f3e5f6c27665c0fcb..b6c96b1f582710e1cc39e6e1e0e800ef8170743d 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -24,13 +24,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -45,7 +45,7 @@ Status GetStackShape(xla::XlaBuilder* builder, XlaResource* resource,
     return shape_or_status.status();
   }
   xla::Shape shape = shape_or_status.ValueOrDie();
-  TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape));
+  TF_RET_CHECK(shape.IsTuple());
   return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
                                stack_shape);
 }
@@ -146,9 +146,9 @@ class StackPushOp : public XlaOpKernel {
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
@@ -202,9 +202,9 @@ class StackPopOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple(b, {ta, index})));
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, stack_shape.dims() - 1}}));
+    std::vector<xla::XlaOp> start_indices(stack_shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     auto slice_shape = stack_shape.dim_sizes();
     slice_shape[0] = 1LL;
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 10d990b3213ab882cf44a4df20a977633de3fdab..2273b592466431f59abcc43fcac4c37eecd53bff 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -288,19 +288,21 @@ class StridedSliceAssignOp : public XlaOpKernel {
     xla::XlaOp rhs = ctx->Input(4);
 
     absl::InlinedVector<int64, 4> dimensions_to_reverse;
-    absl::InlinedVector<int64, 4> slice_begin, slice_dims;
+    absl::InlinedVector<xla::XlaOp, 4> slice_begin;
+    absl::InlinedVector<int64, 4> slice_dims;
     for (int i = 0; i < begin.size(); ++i) {
       // TODO(phawkins): implement strides != 1
       OP_REQUIRES(
           ctx, strides[i] == 1 || strides[i] == -1,
           errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
       if (strides[i] > 0) {
-        slice_begin.push_back(begin[i]);
+        slice_begin.push_back(xla::ConstantR0<int64>(ctx->builder(), begin[i]));
         slice_dims.push_back(end[i] - begin[i]);
       } else {
         // Negative stride: swap begin and end, add 1 because the interval
         // is semi-open, and mark the dimension to be reversed.
-        slice_begin.push_back(end[i] + 1);
+        slice_begin.push_back(
+            xla::ConstantR0<int64>(ctx->builder(), end[i] + 1));
         slice_dims.push_back(begin[i] - end[i]);
         dimensions_to_reverse.push_back(i);
       }
@@ -311,14 +313,7 @@ class StridedSliceAssignOp : public XlaOpKernel {
     }
     rhs = xla::Reshape(rhs, slice_dims);
 
-    if (lhs_shape.dims() == 0) {
-      // TODO(b/38323843): DynamicUpdateSlice crashes on rank 0 inputs. Fix
-      // and remove this workaround.
-      lhs = rhs;
-    } else {
-      lhs = xla::DynamicUpdateSlice(
-          lhs, rhs, xla::ConstantR1<int64>(ctx->builder(), slice_begin));
-    }
+    lhs = xla::DynamicUpdateSlice(lhs, rhs, slice_begin);
 
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 939d7e19515a1cb41e3e23e9d1fa957ae09ecab7..77a3e5c001e1c715f23ae5148f94dae2faa81acf 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -27,13 +27,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_resource.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -123,7 +123,8 @@ Status GetTensorArrayShape(const XlaResource* resource,
 xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp& operand,
                            const xla::XlaOp& update,
                            absl::Span<const int64> update_dims,
-                           const xla::XlaOp& start_indices, DataType dtype) {
+                           absl::Span<const xla::XlaOp> start_indices,
+                           DataType dtype) {
   xla::XlaOp current = xla::DynamicSlice(operand, start_indices, update_dims);
   xla::XlaOp sum =
       dtype == DT_BOOL ? xla::Or(current, update) : xla::Add(current, update);
@@ -212,9 +213,9 @@ class TensorArrayWriteOp : public XlaOpKernel {
     xla::XlaOp flow = ctx->Input(3);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
@@ -263,9 +264,9 @@ class TensorArrayReadOp : public XlaOpKernel {
     xla::XlaOp index = ctx->Input(1);
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, ta_shape.dims() - 1}}));
+    std::vector<xla::XlaOp> start_indices(ta_shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     auto slice_shape = ta_shape.dim_sizes();
     slice_shape[0] = 1LL;
@@ -419,10 +420,10 @@ class TensorArrayScatterOp : public XlaOpKernel {
         auto slice = xla::Slice(value, value_starts, value_ends, value_strides);
 
         // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-        auto index = xla::Slice(indices, {i}, {i + 1}, {1});
-        auto start_indices =
-            xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                     xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+        auto index = xla::Reshape(xla::Slice(indices, {i}, {i + 1}, {1}), {});
+        std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                              xla::ConstantR0<int32>(b, 0));
+        start_indices[0] = index;
         ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices, dtype_);
       }
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index 64a24703ae1460abfedb6d9298e1e164076a199a..65020012283d9c5f62e5e2fd11fc2bf1110e019a 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 // XLA TensorList operators.
+// Tensor lists are represented as tuple consisting of a pre-allocated list
+// consisting of the tensors (and where dim 0 is the list index), along with a
+// scalar telling us the current number of elements.
 
 #include <limits>
 #include <vector>
@@ -24,13 +27,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -45,11 +48,27 @@ Status GetTensorListShape(xla::XlaBuilder* builder, xla::XlaOp op,
     return shape_or_status.status();
   }
   xla::Shape shape = shape_or_status.ValueOrDie();
-  TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape));
+  TF_RET_CHECK(shape.IsTuple());
   return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
                                tensor_list_shape);
 }
 
+class TensorListLengthOp : public XlaOpKernel {
+ public:
+  explicit TensorListLengthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp tl = ctx->Input(0);
+    xla::XlaOp index = xla::GetTupleElement(tl, 1);
+    ctx->SetOutput(0, index);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListLengthOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListLength"), TensorListLengthOp);
+
 class TensorListReserveOp : public XlaOpKernel {
  public:
   explicit TensorListReserveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -67,9 +86,10 @@ class TensorListReserveOp : public XlaOpKernel {
     tensor_shape.AppendShape(element_shape);
 
     xla::XlaBuilder* b = ctx->builder();
-    ctx->SetOutput(0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
-                                                    tensor_shape.dim_sizes()),
-                                     xla::ConstantR0<int32>(b, 0)}));
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
+                                         tensor_shape.dim_sizes()),
+                          xla::ConstantR0<int32>(b, num_elements)}));
   }
 
  private:
@@ -85,19 +105,41 @@ REGISTER_XLA_OP(Name("TensorListReserve")
 
 class EmptyTensorListOp : public XlaOpKernel {
  public:
-  explicit EmptyTensorListOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit EmptyTensorListOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    ctx->CtxFailure(
+    TensorShape element_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
+    int64 max_num_elements;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements));
+    OP_REQUIRES(
+        ctx, max_num_elements >= 0,
         errors::InvalidArgument("XLA compilation requires a fixed tensor list "
-                                "size. Use TensorListReserve instead."));
+                                "size. Set the max number of elements."));
+
+    TensorShape tensor_shape;
+    tensor_shape.AddDim(max_num_elements);
+    tensor_shape.AppendShape(element_shape);
+
+    xla::XlaBuilder* b = ctx->builder();
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
+                                         tensor_shape.dim_sizes()),
+                          xla::ConstantR0<int32>(b, 0)}));
   }
 
  private:
+  DataType dtype_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(EmptyTensorListOp);
 };
 
-REGISTER_XLA_OP(Name("EmptyTensorList"), EmptyTensorListOp);
+REGISTER_XLA_OP(Name("EmptyTensorList")
+                    .CompileTimeConstantInput("element_shape")
+                    .CompileTimeConstantInput("max_num_elements"),
+                EmptyTensorListOp);
 
 class TensorListElementShapeOp : public XlaOpKernel {
  public:
@@ -139,6 +181,136 @@ class TensorListElementShapeOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("TensorListElementShape"), TensorListElementShapeOp);
 
+class TensorListGetItemOp : public XlaOpKernel {
+ public:
+  explicit TensorListGetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp state = ctx->Input(0);
+
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, GetTensorListShape(b, state, &shape));
+
+    xla::XlaOp ta = xla::GetTupleElement(state, 0);
+    xla::XlaOp index = ctx->Input(1);
+
+    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
+    std::vector<xla::XlaOp> start_indices(shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
+    auto slice_shape = shape.dim_sizes();
+    slice_shape[0] = 1LL;
+
+    xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
+    // Remove the leading '1' dimension.
+    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
+
+    ctx->SetOutput(0, xla::Reshape(read, value_shape));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListGetItemOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListGetItem"), TensorListGetItemOp);
+
+class TensorListStackOp : public XlaOpKernel {
+ public:
+  explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp state = ctx->Input(0);
+    xla::XlaOp ta = xla::GetTupleElement(state, 0);
+    ctx->SetOutput(0, ta);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListStackOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListStack"), TensorListStackOp);
+
+class TensorListFromTensorOp : public XlaOpKernel {
+ public:
+  explicit TensorListFromTensorOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape element_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &element_shape));
+
+    const TensorShape tensor_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, tensor_shape.dims() > 0,
+                errors::InvalidArgument("Input value must be at least a "
+                                        "vector but received shape: ",
+                                        tensor_shape.DebugString()));
+    const int num_elements = tensor_shape.dim_size(0);
+
+    xla::XlaBuilder* b = ctx->builder();
+    const xla::XlaOp tensor = ctx->Input(0);
+
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {tensor, xla::ConstantR0<int32>(b, num_elements)}));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListFromTensorOp);
+};
+
+REGISTER_XLA_OP(
+    Name("TensorListFromTensor").CompileTimeConstantInput("element_shape"),
+    TensorListFromTensorOp);
+
+class TensorListSetItemOp : public XlaOpKernel {
+ public:
+  explicit TensorListSetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp tl = ctx->Input(0);
+    TensorShape elem_shape = ctx->InputShape(2);
+
+    xla::XlaOp ta = xla::GetTupleElement(tl, 0);
+    xla::XlaOp index = ctx->Input(1);
+    xla::XlaOp value = ctx->Input(2);
+
+    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
+
+    TensorShape slice_shape = elem_shape;
+    slice_shape.InsertDim(0, 1LL);
+    auto update = xla::Reshape(value, slice_shape.dim_sizes());
+
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
+                          index + xla::ConstantR0<int32>(b, 1)}));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListSetItemOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListSetItem"), TensorListSetItemOp);
+
 class TensorListPushBackOp : public XlaOpKernel {
  public:
   explicit TensorListPushBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -147,25 +319,23 @@ class TensorListPushBackOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp list = ctx->Input(0);
+    xla::XlaOp tl = ctx->Input(0);
     TensorShape elem_shape = ctx->InputShape(1);
 
-    xla::XlaOp ta = xla::GetTupleElement(list, 0);
-    xla::XlaOp index = xla::GetTupleElement(list, 1);
+    xla::XlaOp ta = xla::GetTupleElement(tl, 0);
+    xla::XlaOp index = xla::GetTupleElement(tl, 1);
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
     auto update = xla::Reshape(value, slice_shape.dim_sizes());
 
-    // TODO(phawkins): We don't check the index is in bounds --- there is no
-    // error mechanism in XLA.
-    ctx->SetOutput(
+    ctx->SetTensorListOutput(
         0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
                           index + xla::ConstantR0<int32>(b, 1)}));
   }
@@ -197,20 +367,17 @@ class TensorListPopBackOp : public XlaOpKernel {
     index = index - xla::ConstantR0<int32>(b, 1);
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, shape.dims() - 1}}));
-
+    std::vector<xla::XlaOp> start_indices(shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
     auto slice_shape = shape.dim_sizes();
     slice_shape[0] = 1LL;
 
-    // TODO(phawkins): We don't check the index is in bounds --- there is no
-    // error mechanism in XLA.
     xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
     // Remove the leading '1' dimension.
     std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
 
-    ctx->SetOutput(0, xla::Tuple(b, {ta, index}));
+    ctx->SetTensorListOutput(0, xla::Tuple(b, {ta, index}));
     ctx->SetOutput(1, xla::Reshape(read, value_shape));
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index c9b324a243e4cc3ec64daa3ca0d285336a0d0154..76793d677ba45f8e863e684a149da684c8ce8787 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index 8671632976023fded04c26a9780c1a67638b0916..2fc5619de737b8977e4249e4d2297a0303c339ce 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index 2c92a585f5679242d672d0402e617ff199b94f17..dfa09b16081e93ba843a1858e68e6ff756de20c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -291,5 +291,19 @@ class ResourceScatterNdAddOp : public ResourceScatterOp {
 };
 REGISTER_XLA_OP(Name("ResourceScatterNdAdd"), ResourceScatterNdAddOp);
 
+class ResourceScatterNdSubOp : public ResourceScatterOp {
+ public:
+  explicit ResourceScatterNdSubOp(OpKernelConstruction* context)
+      : ResourceScatterOp(context, /*indices_are_vectors=*/true,
+                          /*combiner=*/Combine) {}
+
+ private:
+  static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y,
+                            xla::XlaBuilder* builder) {
+    return xla::Sub(x, y);
+  }
+};
+REGISTER_XLA_OP(Name("ResourceScatterNdSub"), ResourceScatterNdSubOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index ff5255028bd012ea4d839faa59ef5930a17c5767..fd5ff10ae0a8cb39075fa6c594707dbc833f5f16 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -70,13 +70,20 @@ Status MakeXlaCompilerArgumentsFromInputs(
       arg.name = resource->name();
       VLOG(2) << "    resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString()
+              << " shape: " << arg.ShapeHumanString()
               << " initialized: " << arg.initialized;
 
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = ctx->input_type(i);
-      arg.shape = ctx->InputShape(i);
+
+      xla::XlaBuilder* builder = ctx->builder();
+      xla::XlaOp handle = ctx->Input(i);
+      auto shape_or_status = builder->GetShape(handle);
+      if (!shape_or_status.ok()) {
+        return shape_or_status.status();
+      }
+      arg.shape = shape_or_status.ValueOrDie();
     }
   }
   return Status::OK();
@@ -206,12 +213,12 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   OP_REQUIRES(ctx, body.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape body_input_shape = body.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(body_input_shape),
+  OP_REQUIRES(ctx, body_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
   OP_REQUIRES(ctx, cond.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape cond_input_shape = cond.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(cond_input_shape),
+  OP_REQUIRES(ctx, cond_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
 
   VLOG(2) << "Body shape: " << xla::ShapeUtil::HumanString(body_input_shape)
@@ -291,20 +298,15 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
 
   xla::XlaOp while_result = xla::While(cond_wrapper, *body.computation, init);
 
-  auto while_shape_or = builder->GetShape(while_result);
-  OP_REQUIRES_OK(ctx, while_shape_or.status());
-  auto count = xla::ShapeUtil::TupleElementCount(while_shape_or.ValueOrDie());
-  int max_index = body.outputs.size() + body.resource_updates.size() - 1;
-  OP_REQUIRES(
-      ctx, max_index < count,
-      errors::Internal("Max tuple element requested (", max_index,
-                       ") needs to be less than tuple size (", count, ")"));
-
-  // Sets non-variable outputs.
+  // Sets non-variable outputs and determine when resource variables start.
+  int resource_index = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
     if (ctx->input_type(i) != DT_RESOURCE) {
       ctx->SetOutput(body.input_mapping[i],
                      xla::GetTupleElement(while_result, i));
+      ++resource_index;
+    } else {
+      break;
     }
   }
   if (has_token_input_output_) {
@@ -313,7 +315,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
         xla::GetTupleElement(while_result, ctx->num_outputs());
     auto shape_or = builder->GetShape(token_output);
     OP_REQUIRES_OK(ctx, shape_or.status());
-    OP_REQUIRES(ctx, xla::ShapeUtil::IsToken(shape_or.ValueOrDie()),
+    OP_REQUIRES(ctx, shape_or.ValueOrDie().IsToken(),
                 errors::FailedPrecondition(
                     "Token output is not token type: ",
                     xla::ShapeUtil::HumanString(shape_or.ValueOrDie())));
@@ -326,7 +328,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
     XlaResource* resource;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index, &resource));
     if (update.modified) {
-      int pos = body.outputs.size() + i;
+      int pos = resource_index + i;
       OP_REQUIRES_OK(ctx,
                      resource->SetFromPack(
                          arguments[update.input_index].tensor_array_gradients,
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
index 688056791f9750e6b22df4b2cd4643de0b780651..1cd5a79171dccd57fc1b7941cdf16417301ff7f8 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.cc
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -48,7 +48,7 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
   if (indices_are_vectors) {
     TF_RET_CHECK(!indices_dims.empty());
     num_index_dims = indices_dims.back();
-    if (num_index_dims > xla::ShapeUtil::Rank(buffer_shape)) {
+    if (num_index_dims > buffer_shape.rank()) {
       return errors::InvalidArgument(
           "The size of the minor dimension of the indices (shape: ",
           xla::ShapeUtil::HumanString(indices_shape),
@@ -140,8 +140,8 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
                                        ? indices_shape.dimensions_size() - 1
                                        : indices_shape.dimensions_size());
 
-  int64 updates_rank = xla::ShapeUtil::Rank(updates_shape);
-  int64 buffer_rank = xla::ShapeUtil::Rank(buffer_shape);
+  int64 updates_rank = updates_shape.rank();
+  int64 buffer_rank = buffer_shape.rank();
   int64 num_window_dims_in_updates = buffer_rank - num_index_dims;
 
   // If the rank of `updates` is 0 and does not match the expected rank of
@@ -156,7 +156,7 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
   if (updates_rank == 0 && expected_updates_rank != 0) {
     new_updates = xla::Broadcast(updates, expected_updates_dims);
     TF_ASSIGN_OR_RETURN(updates_shape, builder->GetShape(new_updates));
-    updates_rank = xla::ShapeUtil::Rank(updates_shape);
+    updates_rank = updates_shape.rank();
   }
 
   if (updates_rank > 0) {
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index c0bd172d17c192435ba8ee196f9def0491c0bf5c..06eda41611861060a1f1c4d028b96405d288efdb 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -54,6 +54,9 @@ xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
     case xla::C64:
       return xla::ConstantR0<xla::complex64>(builder, value);
       break;
+    case xla::C128:
+      return xla::ConstantR0<xla::complex128>(builder, value);
+      break;
     default:
       LOG(FATAL) << "unhandled element type " << type;
   }
@@ -90,6 +93,9 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
     case xla::C64:
       literal = xla::LiteralUtil::CreateR0<complex64>(value);
       break;
+    case xla::C128:
+      literal = xla::LiteralUtil::CreateR0<complex128>(value);
+      break;
     case xla::PRED:
       LOG(FATAL) << "pred element type is not integral";
     case xla::S16:
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 67d08290033361f16dfff42b06af9b253e84963a..749a7c3054a65d6ec9f9dc13f6f4a713ac9d3d5a 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -77,7 +77,7 @@ Status HostTensorsToBorrowingLiteralTuple(absl::Span<const Tensor> host_tensors,
 
 Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal,
                                Tensor* host_tensor) {
-  TF_RET_CHECK(xla::ShapeUtil::IsArray(literal.shape()) &&
+  TF_RET_CHECK(literal.shape().IsArray() &&
                xla::ShapeUtil::ElementsIn(literal.shape()) ==
                    host_tensor->NumElements());
   xla::PrimitiveType primitive_type;
diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc
index 15f4c38da29507da9e092c1d5725b5f95a81d1b9..44bccfe6474d175beda392ca17dfbcb08c0b1b11 100644
--- a/tensorflow/compiler/tf2xla/literal_util_test.cc
+++ b/tensorflow/compiler/tf2xla/literal_util_test.cc
@@ -49,7 +49,7 @@ using Types =
                      std::pair<int16, qint16>, std::pair<uint16, quint16>,
                      std::pair<int32, qint32>>;
 
-TYPED_TEST_CASE(LiteralUtilTest, Types);
+TYPED_TEST_SUITE(LiteralUtilTest, Types);
 
 TYPED_TEST(LiteralUtilTest, LiteralToQuantizedHostTensor) {
   using int_type = typename TypeParam::first_type;
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 4dce0a2102cf9c782850ccc7af4f14b59bd51e53..7140b6a1227a53290c3747892a55886a7f48513b 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -4,7 +4,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_wrapper_py",
+)
 
 cc_library(
     name = "xla_ops",
@@ -24,3 +28,14 @@ tf_gen_op_wrapper_py(
         ":xla_ops",
     ],
 )
+
+tf_custom_op_library(
+    name = "_xla_ops.so",
+    srcs = [
+        "xla_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index ab77984684db4525f4d3f42b2c9c0f093c82ec45..af641131ed76a8d6a7291c360302fa17c94af014 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -369,7 +369,11 @@ REGISTER_OP("XlaKeyValueSort")
     .Output("sorted_values: V")
     .Attr("K: realnumbertype")
     .Attr("V: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      c->set_output(1, c->input(1));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Wraps the XLA Sort operator, documented at
  https://www.tensorflow.org/performance/xla/operation_semantics#sort
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index fef97b98c376d9df8bbfd9cb6651216895e46bf4..9abdb04d7736e8ff5225688af4759a522d3e7fc7 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -15,6 +15,7 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_py_clif_cc",
 )
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_py_clif_cc(
     name = "xla_op_registry",
@@ -27,9 +28,13 @@ tf_py_clif_cc(
     ],
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "xla",
     srcs = ["xla.py"],
+    dso = ["//tensorflow/compiler/tf2xla/ops:_xla_ops.so"],
+    kernels = [
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+    ],
     deps = [
         "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
         "//tensorflow/compiler/xla:xla_data_proto_py",
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
index ff9f1b9ccba2c4f3307890d5aac4ddb6cfaafcd9..c20d6a5fd1f3bd7dad30cb3359d13ed4609a2250 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -77,6 +77,7 @@ CreateResourceOpInfoMap() {
   add("ResourceScatterMin"                   , kReadWrite, kVariable);
   add("ResourceScatterMul"                   , kReadWrite, kVariable);
   add("ResourceScatterNdAdd"                 , kReadWrite, kVariable);
+  add("ResourceScatterNdSub"                 , kReadWrite, kVariable);
   add("ResourceScatterNdUpdate"              , kReadWrite, kVariable);
   add("ResourceScatterSub"                   , kReadWrite, kVariable);
   add("ResourceScatterUpdate"                , kReadWrite, kVariable);
diff --git a/tensorflow/compiler/tf2xla/shape_util.cc b/tensorflow/compiler/tf2xla/shape_util.cc
index ec604af13867171d558cd7324919fb9531caf460..8997b2f5c68da480e9d4cb1f7ff8776690363392 100644
--- a/tensorflow/compiler/tf2xla/shape_util.cc
+++ b/tensorflow/compiler/tf2xla/shape_util.cc
@@ -27,7 +27,7 @@ namespace {
 
 Status PopulateInfeedLayoutVector(const xla::Shape& shape,
                                   std::vector<int>* layouts) {
-  if (xla::ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     int64 tuple_elements = xla::ShapeUtil::TupleElementCount(shape);
     for (int64 i = 0; i < tuple_elements; ++i) {
       const xla::Shape& subshape =
@@ -39,23 +39,60 @@ Status PopulateInfeedLayoutVector(const xla::Shape& shape,
       layouts->push_back(dim);
     }
   } else {
-    layouts->insert(layouts->end(), xla::ShapeUtil::Rank(shape), -1);
+    layouts->insert(layouts->end(), shape.rank(), -1);
   }
   return Status::OK();
 }
 
+// Populate the output layout unless the minor_to_major array contains all -1
+// value, in which case the layout is considered missing and the API returns
+// false.
+xla::StatusOr<bool> MakeLayout(absl::Span<const int64> minor_to_major,
+                               xla::Layout* layout) {
+  if (std::all_of(minor_to_major.begin(), minor_to_major.end(),
+                  [](int64 dim) { return dim == -1; })) {
+    return false;
+  }
+  std::vector<bool> dim_present(minor_to_major.size(), false);
+  for (auto dim : minor_to_major) {
+    if (dim < 0 || dim >= minor_to_major.size()) {
+      return errors::InvalidArgument("Layout dimension out of range: dim=", dim,
+                                     " rank=", minor_to_major.size());
+    }
+    if (dim_present[dim]) {
+      return errors::InvalidArgument("Repeated layout dimension: dim=", dim);
+    }
+    dim_present[dim] = true;
+  }
+  *layout = xla::LayoutUtil::MakeLayout(minor_to_major);
+  return true;
+}
+
+Status AssignLayout(
+    absl::Span<const int64> minor_to_major,
+    const std::function<xla::Layout(const xla::Shape&)>& layout_func,
+    xla::Shape* shape) {
+  xla::Layout layout;
+  TF_ASSIGN_OR_RETURN(bool has_layout, MakeLayout(minor_to_major, &layout));
+  if (!has_layout && layout_func) {
+    layout = layout_func(*shape);
+  }
+  *shape->mutable_layout() = layout;
+  return Status::OK();
+}
+
 }  // namespace
 
 // Convert an XLA Shape into the equivalent TensorFlow shape.
 Status XLAShapeToTensorShape(const xla::Shape& shape,
                              TensorShape* tensor_shape) {
-  if (xla::ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     return errors::InvalidArgument("XLA shape ",
                                    xla::ShapeUtil::HumanString(shape),
                                    " cannot be converted to a TensorShape");
   }
   *tensor_shape = TensorShape();
-  for (int i = 0; i < xla::ShapeUtil::Rank(shape); ++i) {
+  for (int i = 0; i < shape.rank(); ++i) {
     tensor_shape->AddDim(shape.dimensions(i));
   }
   return Status::OK();
@@ -84,10 +121,64 @@ xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type,
   return xla::ShapeUtil::MakeShapeWithLayout(type, dimensions, layout);
 }
 
-xla::StatusOr<std::vector<int>> GetInfeedLayoutVector(const xla::Shape& shape) {
+xla::StatusOr<std::vector<int>> GetShapeLayoutVector(const xla::Shape& shape) {
   std::vector<int> layouts;
   TF_RETURN_IF_ERROR(PopulateInfeedLayoutVector(shape, &layouts));
   return layouts;
 }
 
+Status GetShapeWithLayout(
+    const xla::Shape& input_shape, absl::Span<const int64> minor_to_major,
+    const std::function<xla::Layout(const xla::Shape&)>& layout_func,
+    xla::Shape* output_shape) {
+  if (input_shape.IsTuple()) {
+    int64 tuple_elements = xla::ShapeUtil::TupleElementCount(input_shape);
+    std::vector<xla::Shape> shapes;
+    shapes.reserve(tuple_elements);
+    size_t position = 0;
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      const xla::Shape& shape =
+          xla::ShapeUtil::GetTupleElementShape(input_shape, i);
+      if (shape.IsTuple()) {
+        return errors::InvalidArgument(
+            "Nested tuples not supported: ",
+            xla::ShapeUtil::HumanString(input_shape));
+      }
+      int64 rank = shape.rank();
+      if (position + rank > minor_to_major.size()) {
+        return errors::InvalidArgument(
+            "Not enough layout attribute elements: position=", position,
+            " rank=", rank, " elements=", minor_to_major.size());
+      }
+      shapes.push_back(shape);
+      TF_RETURN_IF_ERROR(AssignLayout(
+          absl::Span<const int64>(minor_to_major).subspan(position, rank),
+          layout_func, &shapes.back()));
+      position += rank;
+
+      VLOG(4) << "Shape[" << i
+              << "] = " << xla::ShapeUtil::HumanStringWithLayout(shapes.back());
+    }
+    if (position != minor_to_major.size()) {
+      return errors::InvalidArgument(
+          "Too many elements passed in the layout attribute: position=",
+          position, " size=", minor_to_major.size());
+    }
+    *output_shape = xla::ShapeUtil::MakeTupleShape(shapes);
+  } else {
+    int64 rank = input_shape.rank();
+    if (rank != minor_to_major.size()) {
+      return errors::InvalidArgument(
+          "Wrong number of layout attribute elements: rank=", rank,
+          " elements=", minor_to_major.size());
+    }
+    *output_shape = input_shape;
+    TF_RETURN_IF_ERROR(AssignLayout(minor_to_major, layout_func, output_shape));
+
+    VLOG(4) << "Shape[] = "
+            << xla::ShapeUtil::HumanStringWithLayout(*output_shape);
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h
index cf52bf46e7c2a237d57f4c87e7d6efbf3fa9b1c2..e775c4462c3dc15cf4b8d9e8d8e7d9a61e024cd0 100644
--- a/tensorflow/compiler/tf2xla/shape_util.h
+++ b/tensorflow/compiler/tf2xla/shape_util.h
@@ -45,12 +45,23 @@ xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type,
                                  const TensorShape& tensor_shape);
 
 // Given an XLA shape with layouts, builds a layout vector in the form able to
-// be fed to an InfeedEnqueue/InfeedEnqueueTuple ops.
+// be fed to ops like InfeedEnqueue/InfeedEnqueueTuple/XRTAllocateV2/....
 // THe returned vector is a linearized sequence of the minor-to-major values of
 // the layouts held within the input shape.
 // In case the input shape is a tuple, the minor-to-major values will be in the
 // order of the tuple elements within the tuple shape.
-xla::StatusOr<std::vector<int>> GetInfeedLayoutVector(const xla::Shape& shape);
+// If a shape (or a subshape of a tuple shape) has missing layout, a rank long
+// sequence of -1 values will be emittted.
+xla::StatusOr<std::vector<int>> GetShapeLayoutVector(const xla::Shape& shape);
+
+// Given the input shape and a linearized sequence of the minor-to-major values
+// of the layouts, create the output shape by rewriting the input shape layouts.
+// If a layout is missing (has -1 values) for a matching tuple subshape, the
+// layout_func will be called, if not nullptr.
+Status GetShapeWithLayout(
+    const xla::Shape& input_shape, absl::Span<const int64> minor_to_major,
+    const std::function<xla::Layout(const xla::Shape&)>& layout_func,
+    xla::Shape* output_shape);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc
index b62f8e9115229ac35c657d374c68336f1168ff77..412f31adbb7df52b2d6933be054cc6d40947dc44 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.cc
+++ b/tensorflow/compiler/tf2xla/side_effect_util.cc
@@ -26,6 +26,49 @@ const char kXlaTokenArgNodeName[] = "_xla_token_arg_node";
 
 const char kXlaHasHostTransferAttrName[] = "_xla_has_host_transfer";
 
+Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) {
+  if (!HasNodeAttr(node->def(), kXlaHasHostTransferAttrName)) {
+    return errors::InvalidArgument("Node ", node->DebugString(),
+                                   " does not have attribute ",
+                                   kXlaHasHostTransferAttrName);
+  }
+
+  if (node->type_string() == "_XlaRecvAtHost" ||
+      node->type_string() == "_XlaSendFromHost") {
+    node->ClearAttr("device_ordinal");
+    node->AddAttr("device_ordinal", device_ordinal);
+  } else if (node->type_string() == "If") {
+    AttrValue device_ordinal_value;
+    device_ordinal_value.set_i(device_ordinal);
+    for (const string& attr_name :
+         std::vector<string>{"then_branch", "else_branch"}) {
+      NameAttrList branch_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func));
+      (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+      node->ClearAttr(attr_name);
+      node->AddAttr(attr_name, branch_func);
+    }
+  } else if (node->type_string() == "While") {
+    AttrValue device_ordinal_value;
+    device_ordinal_value.set_i(device_ordinal);
+    for (const string& attr_name : std::vector<string>{"cond", "body"}) {
+      NameAttrList branch_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func));
+      (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+      node->ClearAttr(attr_name);
+      node->AddAttr(attr_name, branch_func);
+    }
+  } else if (HasNodeAttr(node->def(), "device_ordinal")) {
+    // Function call node containing outside compilation.
+    node->ClearAttr("device_ordinal");
+    node->AddAttr("device_ordinal", device_ordinal);
+  } else {
+    return errors::Internal("Unknown node type to set 'device_ordinal': ",
+                            node->DebugString());
+  }
+  return Status::OK();
+}
+
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g) {
   std::set<std::string> results;
   Node* first_side_effecting_node_on_path = nullptr;
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h
index 7081b362c36c4785164b29003a5f89cd73bcf3af..75e1f253fb08ae61b0336a8783b7449c69197dd1 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.h
+++ b/tensorflow/compiler/tf2xla/side_effect_util.h
@@ -38,6 +38,10 @@ extern const char kXlaTokenArgNodeName[];
 // This node have XlaRecvAtHost/XlaSendFromHost in its associated functions.
 extern const char kXlaHasHostTransferAttrName[];
 
+// Sets device ordinal attribute for nodes with attribute
+// `kXlaHasHostTransferAttrName`.
+Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal);
+
 // Calculates side-effect dependencies for the graph's token output.
 // Returns a set of node names representing these dependencies.
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g);
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index 9fac16a9700419b189bf5393c2b8bd7d76c6c1cc..cf48576ec2746fb29779633275eac4c638b91e45 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -243,7 +243,9 @@ Status CreateXlaArgs(const Graph& graph,
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
+    TensorShape shape;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape));
+    arg.shape = shape;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc
index d00b1376620c0c9d112c7d7426758f6d3f25e86f..732f957d7329c93ad104dacf5190948fbfd7974b 100644
--- a/tensorflow/compiler/tf2xla/type_util.cc
+++ b/tensorflow/compiler/tf2xla/type_util.cc
@@ -69,6 +69,9 @@ Status DataTypeToPrimitiveType(DataType data_type, xla::PrimitiveType* type) {
     case tensorflow::DT_COMPLEX64:
       *type = xla::C64;
       return Status::OK();
+    case tensorflow::DT_COMPLEX128:
+      *type = xla::C128;
+      return Status::OK();
     default:
       return errors::InvalidArgument(
           "Unsupported type in DataTypeToPrimitiveType ",
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index c7341cf8b9e8d7a06fd304ae8766420d20f0c16e..de2e485a47c18ae8e58a06aba408dbb61a30d00a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -59,45 +59,8 @@ class XlaCompiledCpuFunction {
   // AOT this is backed by data compiled into the object file.
   //
   // The contents of StaticData are XLA-internal implementation details and
-  // should not be relied on by clients.
-  //
-  // TODO(sanjoy): Come up with a cleaner way to express the contraint we want
-  // here: generated XlaCompiledCpuFunction subclasses should be able to create
-  // instances of StaticData but only XlaCompiledCpuFunction should be able to
-  // read from StaticData instances.
+  // should not be relied on by clients (and therefore are private).
   class StaticData {
-   public:
-    void set_raw_function(RawFunction raw_function) {
-      raw_function_ = raw_function;
-    }
-    void set_buffer_infos(
-        const cpu_function_runtime::BufferInfo* buffer_infos) {
-      buffer_infos_ = buffer_infos;
-    }
-    void set_num_buffers(size_t num_buffers) { num_buffers_ = num_buffers; }
-    void set_arg_index_table(const int32* arg_index_table) {
-      arg_index_table_ = arg_index_table;
-    }
-    void set_num_args(int64 num_args) { num_args_ = num_args; }
-    void set_result_index(size_t result_index) { result_index_ = result_index; }
-    void set_arg_names(const char** arg_names) { arg_names_ = arg_names; }
-    void set_result_names(const char** result_names) {
-      result_names_ = result_names;
-    }
-    void set_program_shape(const xla::ProgramShapeProto* program_shape) {
-      program_shape_ = program_shape;
-    }
-    const xla::HloProfilePrinterData* hlo_profile_printer_data() const {
-      return hlo_profile_printer_data_;
-    }
-    void set_hlo_profile_printer_data(
-        const xla::HloProfilePrinterData* hlo_profile_printer_data) {
-      hlo_profile_printer_data_ = hlo_profile_printer_data;
-    }
-    void set_profile_counters_size(int64 profile_counters_size) {
-      profile_counters_size_ = profile_counters_size;
-    }
-
    private:
     // The raw function to call.
     RawFunction raw_function_;
@@ -134,7 +97,8 @@ class XlaCompiledCpuFunction {
     // declared so we don't have access to that information here.
     int64 profile_counters_size_ = 0;
 
-    // Only XlaCompiledCpuFunction is allowed to read the above fields.
+    // Only XlaCompiledCpuFunction is allowed to read and write the above
+    // fields.
     friend class XlaCompiledCpuFunction;
   };
 
@@ -148,7 +112,7 @@ class XlaCompiledCpuFunction {
     RESULTS_PROFILES_AND_TEMPS_ONLY,
   };
 
-  XlaCompiledCpuFunction(
+  explicit XlaCompiledCpuFunction(
       const StaticData& static_data,
       AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS);
   virtual ~XlaCompiledCpuFunction();
@@ -280,6 +244,76 @@ class XlaCompiledCpuFunction {
     return *hlo_profile_printer_data_;
   }
 
+ protected:
+  // ---------------------------------------------------------------------------
+  // Accessors for reading from and writing to instances of `StaticData`.
+  //
+  // Classes generated by tfcompile can call these because the generated classes
+  // inherit from `XlaCompiledCpuFunction`.  `XlaJitCompiledCpuFunction` can
+  // call these because it is explicitly added as a friend.
+
+  static void set_static_data_raw_function(StaticData* static_data,
+                                           RawFunction raw_function) {
+    static_data->raw_function_ = raw_function;
+  }
+
+  static void set_static_data_buffer_infos(
+      StaticData* static_data,
+      const cpu_function_runtime::BufferInfo* buffer_infos) {
+    static_data->buffer_infos_ = buffer_infos;
+  }
+
+  static void set_static_data_num_buffers(StaticData* static_data,
+                                          size_t num_buffers) {
+    static_data->num_buffers_ = num_buffers;
+  }
+
+  static void set_static_data_arg_index_table(StaticData* static_data,
+                                              const int32* arg_index_table) {
+    static_data->arg_index_table_ = arg_index_table;
+  }
+
+  static void set_static_data_num_args(StaticData* static_data,
+                                       int64 num_args) {
+    static_data->num_args_ = num_args;
+  }
+
+  static void set_static_data_result_index(StaticData* static_data,
+                                           size_t result_index) {
+    static_data->result_index_ = result_index;
+  }
+
+  static void set_static_data_arg_names(StaticData* static_data,
+                                        const char** arg_names) {
+    static_data->arg_names_ = arg_names;
+  }
+
+  static void set_static_data_result_names(StaticData* static_data,
+                                           const char** result_names) {
+    static_data->result_names_ = result_names;
+  }
+
+  static void set_static_data_program_shape(
+      StaticData* static_data, const xla::ProgramShapeProto* program_shape) {
+    static_data->program_shape_ = program_shape;
+  }
+
+  static void set_static_data_hlo_profile_printer_data(
+      StaticData* static_data,
+      const xla::HloProfilePrinterData* hlo_profile_printer_data) {
+    static_data->hlo_profile_printer_data_ = hlo_profile_printer_data;
+  }
+
+  static const xla::HloProfilePrinterData*
+  get_static_data_hlo_profile_printer_data(StaticData* static_data) {
+    return static_data->hlo_profile_printer_data_;
+  }
+
+  static void set_static_data_profile_counters_size(
+      StaticData* static_data, int64 profile_counters_size) {
+    static_data->profile_counters_size_ = profile_counters_size;
+  }
+
  private:
   const RawFunction raw_function_;
   const size_t result_index_;
@@ -313,6 +347,10 @@ class XlaCompiledCpuFunction {
   const char** result_names_ = nullptr;
   const xla::ProgramShapeProto* program_shape_ = nullptr;
   const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
+
+  // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the
+  // `set_static_data_*` static methods above.
+  friend class XlaJitCompiledCpuFunction;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index ee461a3c07d4db514c7697e005a9371be4b54dd0..1f9cfcdd246f36bd7e0325bca34c7480d4ce2843 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -192,6 +193,8 @@ Status BuildComputation(
         output.shape = output.constant_value.shape();
         break;
 
+      case XlaExpression::Kind::kTensorList:
+        TF_FALLTHROUGH_INTENDED;
       case XlaExpression::Kind::kXlaOp: {
         output.is_constant = false;
         TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape());
@@ -333,8 +336,21 @@ bool XlaCompiler::Argument::operator==(
                other.tensor_array_gradients)) {
     return false;
   }
-  if (shape != other.shape) {
-    return false;
+  if (absl::holds_alternative<xla::Shape>(shape)) {
+    if (!absl::holds_alternative<xla::Shape>(other.shape)) {
+      return false;
+    }
+    if (!xla::Shape::Equal()(absl::get<xla::Shape>(shape),
+                             absl::get<xla::Shape>(other.shape))) {
+      return false;
+    }
+  } else {
+    if (!absl::holds_alternative<TensorShape>(other.shape)) {
+      return false;
+    }
+    if (absl::get<TensorShape>(shape) != absl::get<TensorShape>(other.shape)) {
+      return false;
+    }
   }
   if (constant_value.shape() != other.constant_value.shape()) {
     return false;
@@ -348,7 +364,7 @@ string XlaCompiler::Argument::HumanString() const {
     common = absl::StrCat(" name=", name);
   }
   absl::StrAppend(&common, " type=", DataTypeString(type),
-                  " shape=", shape.DebugString());
+                  " shape=", ShapeHumanString());
   switch (kind) {
     case kInvalid:
       return "invalid";
@@ -375,6 +391,23 @@ string XlaCompiler::Argument::HumanString() const {
   }
 }
 
+std::vector<int64> XlaCompiler::Argument::DimensionSizes() const {
+  if (absl::holds_alternative<TensorShape>(shape)) {
+    return xla::InlinedVectorToVector(
+        absl::get<TensorShape>(shape).dim_sizes());
+  } else {
+    return absl::get<xla::Shape>(shape).dimensions();
+  }
+}
+
+string XlaCompiler::Argument::ShapeHumanString() const {
+  if (absl::holds_alternative<TensorShape>(shape)) {
+    return absl::get<TensorShape>(shape).DebugString();
+  } else {
+    return absl::get<xla::Shape>(shape).DebugString();
+  }
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
@@ -462,8 +495,34 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
   opts.set_do_function_inlining(true);
   opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
+  // Do not constant fold nodes that output DT_VARIANT type tensors.
+  // XLA does not support Const nodes of Variant type since it needs
+  // to know the original ops to be able to compile them to the relevant
+  // XLA form.
+  // TODO(srbs): This filter is a little conservative. E.g. a subgraph of
+  // the form:
+  //                          Const
+  //                            |
+  // EmptyTensorList -> TensorListPushBack -> TensorListPopBack -> Op
+  //                                                  |
+  //                                        (Discard popped list)
+  //
+  // Would have been reduced to "Const -> Op" without this filter.
+  // However since we are only allowed to specify the filter at the "Node"
+  // level there is no good way to allow the above behavior. So we
+  // disallow any sort of constant folding on Variant nodes for now.
+  auto cf_consider_fn = [](const Node* n) {
+    for (const auto& output_arg : n->op_def().output_arg()) {
+      if (output_arg.type() == DT_VARIANT) {
+        return false;
+      }
+    }
+    return true;
+  };
+  GraphOptimizer::Options graph_optimizer_options;
+  graph_optimizer_options.cf_consider_fn = cf_consider_fn;
   optimizer.Optimize(flib_runtime_, flib_runtime_->env(),
-                     /*device=*/nullptr, &graph, /*shape_map=*/nullptr);
+                     /*device=*/nullptr, &graph, graph_optimizer_options);
 
   return graph;
 }
@@ -548,11 +607,22 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
       LOG(FATAL) << "Unreachable case";
     case XlaCompiler::Argument::kParameter: {
       if (is_entry_computation) {
-        TF_ASSIGN_OR_RETURN(
-            *xla_shape, options_.shape_representation_fn(arg.shape, arg.type));
+        TensorShape shape;
+        if (absl::holds_alternative<TensorShape>(arg.shape)) {
+          shape = absl::get<TensorShape>(arg.shape);
+        } else {
+          TF_RETURN_IF_ERROR(
+              XLAShapeToTensorShape(absl::get<xla::Shape>(arg.shape), &shape));
+        }
+        TF_ASSIGN_OR_RETURN(*xla_shape,
+                            options_.shape_representation_fn(shape, arg.type));
       } else {
-        TF_RETURN_IF_ERROR(
-            TensorShapeToXLAShape(arg.type, arg.shape, xla_shape));
+        if (absl::holds_alternative<xla::Shape>(arg.shape)) {
+          *xla_shape = absl::get<xla::Shape>(arg.shape);
+        } else {
+          TF_RETURN_IF_ERROR(TensorShapeToXLAShape(
+              arg.type, absl::get<TensorShape>(arg.shape), xla_shape));
+        }
       }
       return Status::OK();
     }
@@ -561,8 +631,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
 
       switch (arg.resource_kind) {
         case XlaResource::kVariable: {
-          TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn(
-                                              arg.shape, arg.type));
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
+          TF_ASSIGN_OR_RETURN(*xla_shape,
+                              options_.shape_representation_fn(
+                                  absl::get<TensorShape>(arg.shape), arg.type));
 
           return Status::OK();
         }
@@ -571,9 +643,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
             return errors::InvalidArgument(
                 "Negative max_array_size in XLAShapeForArgument");
           }
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
           TensorShape shape;
           shape.AddDim(arg.max_array_size);
-          shape.AppendShape(arg.shape);
+          shape.AppendShape(absl::get<TensorShape>(arg.shape));
           TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
 
           if (!arg.tensor_array_gradients.empty()) {
@@ -588,9 +661,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
             return errors::InvalidArgument(
                 "Negative max_array_size in XLAShapeForArgument");
           }
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
           TensorShape shape;
           shape.AddDim(arg.max_array_size);
-          shape.AppendShape(arg.shape);
+          shape.AppendShape(absl::get<TensorShape>(arg.shape));
           xla::Shape buffer_shape;
           TF_RETURN_IF_ERROR(
               TensorShapeToXLAShape(arg.type, shape, &buffer_shape));
@@ -620,14 +694,15 @@ Status XlaCompiler::BuildArguments(
     bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context,
     const std::map<int, int>& arg_cores,
     std::vector<XlaExpression>* arg_expressions,
-    std::vector<int>* input_mapping, std::vector<xla::Shape>* input_shapes,
+    std::vector<int>* input_to_args, std::vector<xla::Shape>* input_shapes,
     bool is_entry_computation) {
   arg_expressions->resize(args.size());
 
   // Argument numbers of arguments and resources that are to be passed to the
-  // XLA computation as runtime parameters.
-  input_mapping->clear();
-  input_mapping->reserve(args.size());
+  // XLA computation as runtime parameters. `input_to_args[a] = b` means that
+  // the a'th XLA input corresponds to the b'th original arg indexes.
+  input_to_args->clear();
+  input_to_args->reserve(args.size());
 
   // Fills in constant arguments, and computes non-constant argument order.
   for (std::vector<XlaCompiler::Argument>::size_type i = 0; i < args.size();
@@ -637,24 +712,25 @@ Status XlaCompiler::BuildArguments(
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.resource_kind != XlaResource::kInvalid);
+        TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
         XlaResource* resource =
             context->AddResource(absl::make_unique<XlaResource>(
-                arg.resource_kind, i, arg.name, arg.type, arg.shape,
-                xla::XlaOp(),
+                arg.resource_kind, i, arg.name, arg.type,
+                absl::get<TensorShape>(arg.shape), xla::XlaOp(),
                 /*max_array_size=*/arg.max_array_size,
                 /*tensor_array_gradients=*/arg.tensor_array_gradients,
                 /*tensor_array_multiple_writes_aggregate=*/true));
         arg_expression = XlaExpression::Resource(resource);
         if (arg.initialized) {
-          input_mapping->push_back(i);
+          input_to_args->push_back(i);
         }
         break;
       }
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kToken: {
-        input_mapping->push_back(i);
+        input_to_args->push_back(i);
         break;
       }
       case XlaCompiler::Argument::kConstant:
@@ -666,15 +742,23 @@ Status XlaCompiler::BuildArguments(
     }
   }
 
-  if (input_mapping->empty()) {
+  if (input_to_args->empty()) {
     return Status::OK();
   }
 
-  std::vector<xla::Shape> arg_shapes(input_mapping->size());
-  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+  // `arg_to_inputs[c] = d` means that the c'th original arg index corresponds
+  // to the d'th XLA input. Note that the value -1 corresponds to constants, or
+  // other args that don't correspond to an input.
+  std::vector<int> arg_to_inputs(args.size(), -1);
+  for (int i = 0; i < input_to_args->size(); i++) {
+    arg_to_inputs[input_to_args->at(i)] = i;
+  }
+
+  std::vector<xla::Shape> arg_shapes(input_to_args->size());
+  for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
     // Computes the shapes of non-constant arguments.
     TF_RETURN_IF_ERROR(XLAShapeForArgument(
-        args[(*input_mapping)[i]], is_entry_computation, &arg_shapes[i]));
+        args[(*input_to_args)[i]], is_entry_computation, &arg_shapes[i]));
   }
 
   if (use_tuple_arg) {
@@ -691,13 +775,13 @@ Status XlaCompiler::BuildArguments(
   builder->SetOpMetadata(arg_metadata);
 
   // Build parameter handles for non-constant arguments.
-  std::vector<xla::XlaOp> arg_handles(input_mapping->size());
+  std::vector<xla::XlaOp> arg_handles(input_to_args->size());
   if (use_tuple_arg) {
     xla::XlaOp tuple;
     if (is_entry_computation) {
       xla::OpSharding tuple_sharding;
       tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
-      for (int64 parameter : *input_mapping) {
+      for (int64 parameter : *input_to_args) {
         auto it = arg_cores.find(parameter);
         const int core = it == arg_cores.end() ? 0 : it->second;
         *tuple_sharding.add_tuple_shardings() =
@@ -709,7 +793,19 @@ Status XlaCompiler::BuildArguments(
     } else {
       tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple");
     }
-    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+
+    for (int i = 0; i < input_to_args->size(); ++i) {
+      const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
+      for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) {
+        int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second);
+        TF_RETURN_IF_ERROR(builder->SetDynamicBinding(
+            /*dynamic_size_param_num=*/0, {dynamic_size_param_index},
+            /*target_param_num=*/0, /*target_param_index=*/{i},
+            dim_and_arg_num.first));
+      }
+    }
+
+    for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
       auto it = arg_cores.find(i);
       const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
@@ -718,7 +814,7 @@ Status XlaCompiler::BuildArguments(
       arg_handles[i] = xla::GetTupleElement(tuple, i);
     }
   } else {
-    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+    for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
       auto it = arg_cores.find(i);
       const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
@@ -727,6 +823,17 @@ Status XlaCompiler::BuildArguments(
       arg_handles[i] = xla::Parameter(builder, i, (*input_shapes)[i],
                                       absl::StrCat("arg", i));
     }
+
+    for (int i = 0; i < input_to_args->size(); ++i) {
+      const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
+      for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) {
+        int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second);
+        TF_RETURN_IF_ERROR(builder->SetDynamicBinding(
+            /*dynamic_size_param_num=*/dynamic_size_param_index, {},
+            /*target_param_num=*/i, /*target_param_index=*/{},
+            dim_and_arg_num.first));
+      }
+    }
   }
 
   builder->ClearOpMetadata();
@@ -734,12 +841,12 @@ Status XlaCompiler::BuildArguments(
   // Fill in the handles in non-constant arguments, and reshape parameters
   // back to their correct shapes.
   VLOG(2) << "XLA computation inputs:";
-  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-    const XlaCompiler::Argument& arg = args[input_mapping->at(i)];
+  for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
+    const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
     VLOG(2) << "  XLA arg " << i
             << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i])
-            << " name: " << arg.name << " TF arg " << input_mapping->at(i);
-    XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)];
+            << " name: " << arg.name << " TF arg " << input_to_args->at(i);
+    XlaExpression& arg_expression = (*arg_expressions)[input_to_args->at(i)];
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.initialized);
@@ -756,7 +863,7 @@ Status XlaCompiler::BuildArguments(
         // return values of functions, and then reshape unconditionally.
         if (is_entry_computation) {
           arg_expression = XlaExpression::XlaOp(
-              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()), arg.type);
+              xla::Reshape(arg_handles[i], arg.DimensionSizes()), arg.type);
         } else {
           arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 0d801b73a8c2651305328384377751254ecaa41d..ad3144b41bdf3fc8b75ab5230e8e128df2962884 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <stack>
 
 #include "absl/types/span.h"
+#include "absl/types/variant.h"
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_expression.h"
@@ -124,7 +125,8 @@ class XlaCompiler {
     DataType type = DT_INVALID;
 
     // The shape of the argument. For:
-    // * a parameter: the shape of the parameter.
+    // * a parameter: the shape of the parameter. We allow setting the xla shape
+    //   if known. This helps avoid conversions to and from TensorShape.
     // * a constant: ignored; the shape given by constant_value is used
     //     instead.
     // * an uninitialized resource: ignored. We don't yet know the shape of an
@@ -133,7 +135,7 @@ class XlaCompiler {
     // * an initialized TensorArray or Stack resource: the shape of an entry in
     //   the TensorArray/Stack. Note this is the size of a single entry, not the
     //   XLA data structure that represents the complete stack/array.
-    TensorShape shape;
+    absl::variant<TensorShape, xla::Shape> shape;
 
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
@@ -157,10 +159,20 @@ class XlaCompiler {
     // as `tensor_array_gradients`.
     std::set<string> tensor_array_gradients;
 
+    // dynamic dims to arg number map. Empty if no dynamic shapes.
+    std::map<int32, int32> dynamic_dim_to_arg_num_map;
+    bool is_pad_arg = false;
+
     bool operator==(const Argument& other) const;
 
     // Returns a human-readable summary of the argument.
     string HumanString() const;
+
+    // Returns the dimension sizes for either TensorShape or xla::Shape.
+    std::vector<int64> DimensionSizes() const;
+
+    // Returns the human-readable string for either TensorShape or xla::Shape.
+    string ShapeHumanString() const;
   };
 
   // Options pertaining to an individual call to CompileGraph() or
@@ -420,7 +432,7 @@ class XlaCompiler {
                         XlaContext* context,
                         const std::map<int, int>& arg_cores,
                         std::vector<XlaExpression>* arg_expressions,
-                        std::vector<int>* input_mapping,
+                        std::vector<int>* input_to_args,
                         std::vector<xla::Shape>* input_shapes,
                         bool is_entry_computation);
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index fe2a5f5b0c9ea6b5f2bb71df836fdcabf9a0cf23..492010f7317d32a8a620147cd2cd9356d4f13fde 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -82,7 +82,7 @@ namespace {
 // compiled kernels.
 class DummyResourceForTest : public ResourceBase {
  public:
-  string DebugString() override { return "dummy"; }
+  string DebugString() const override { return "dummy"; }
   void Increment() { ++value_; }
   int Get() { return value_; }
 
@@ -1362,7 +1362,7 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
                                        args, &result));
     EXPECT_EQ(result.xla_input_shapes.size(), 1);
-    EXPECT_TRUE(xla::ShapeUtil::IsTuple(result.xla_output_shape));
+    EXPECT_TRUE(result.xla_output_shape.IsTuple());
     EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 1);
   }
   {
@@ -1380,11 +1380,11 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
                                        args, &result));
     EXPECT_EQ(result.xla_input_shapes.size(), 2);
-    EXPECT_TRUE(xla::ShapeUtil::IsToken(result.xla_input_shapes[1]));
-    EXPECT_TRUE(xla::ShapeUtil::IsTuple(result.xla_output_shape));
+    EXPECT_TRUE(result.xla_input_shapes[1].IsToken());
+    EXPECT_TRUE(result.xla_output_shape.IsTuple());
     EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 2);
-    EXPECT_TRUE(xla::ShapeUtil::IsToken(
-        xla::ShapeUtil::GetTupleElementShape(result.xla_output_shape, 1)));
+    EXPECT_TRUE(xla::ShapeUtil::GetTupleElementShape(result.xla_output_shape, 1)
+                    .IsToken());
   }
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index a69af70503376b6c0905deb8980abdc3254a6e47..6139bf3cea0790c2697130a993e92be96c81848b 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -61,7 +61,7 @@ void XlaContext::set_args(std::vector<XlaExpression> args) {
 XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder)
     : compiler_(compiler), builder_(builder) {}
 
-string XlaContext::DebugString() { return "XLA JIT context"; }
+string XlaContext::DebugString() const { return "XLA JIT context"; }
 
 void XlaContext::SetRetval(int index, const XlaExpression& expression) {
   if (retvals_.size() <= index) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 0767d1faac14cedb8666f6cc37175eb7b55f6158..eb4ad3fe6a14b42a4df2c73c71cb6df1331fd796 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -47,7 +47,7 @@ class XlaContext : public ResourceBase {
   XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder);
 
   // Virtual method defined by ResourceBase.
-  string DebugString() override;
+  string DebugString() const override;
 
   XlaCompiler* compiler() const { return compiler_; }
 
diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc
index ca0309166b7c73d1a5a818091e2a30fa112a4de4..3d228c92adcbe3d093a4fe70d157e57ab3e80c80 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.cc
+++ b/tensorflow/compiler/tf2xla/xla_expression.cc
@@ -46,6 +46,14 @@ XlaExpression XlaExpression::XlaOp(xla::XlaOp value, DataType dtype) {
   return e;
 }
 
+XlaExpression XlaExpression::TensorList(xla::XlaOp tensor_list) {
+  XlaExpression e;
+  e.kind_ = Kind::kTensorList;
+  e.dtype_ = DT_VARIANT;
+  e.handle_ = tensor_list;
+  return e;
+}
+
 XlaExpression XlaExpression::Resource(XlaResource* resource) {
   XlaExpression e;
   e.kind_ = Kind::kResource;
@@ -64,6 +72,8 @@ string XlaExpression::HumanString() const {
       return "xla_op";
     case Kind::kResource:
       return "resource";
+    case Kind::kTensorList:
+      return "tensor_list";
   }
 }
 
@@ -76,6 +86,8 @@ xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const {
             HostTensorToBorrowingLiteral(constant_value_, &literal));
         return xla::ConstantLiteral(builder, literal);
       }
+      case Kind::kTensorList:
+        TF_FALLTHROUGH_INTENDED;
       case Kind::kXlaOp:
         if (builder != handle_.builder()) {
           return errors::InvalidArgument(
@@ -96,7 +108,10 @@ xla::StatusOr<absl::optional<Tensor>> XlaExpression::ResolveConstant(
       return {constant_value()};
     case Kind::kXlaOp:
       break;
+    case Kind::kTensorList:
+      TF_FALLTHROUGH_INTENDED;
     case Kind::kResource:
+      TF_FALLTHROUGH_INTENDED;
     case Kind::kInvalid:
       return errors::InvalidArgument(
           "ResolveConstant called on XlaExpression: ", HumanString());
@@ -134,6 +149,8 @@ xla::StatusOr<TensorShape> XlaExpression::GetShape() const {
       TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, &shape));
       return shape;
     }
+    case Kind::kTensorList:
+      return TensorShape({});
     case Kind::kResource:
       return TensorShape({});
     case Kind::kInvalid:
diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h
index bed6761d362a98d344003c1edea342e68c31ef07..ac0232d8924cf2c9e35ad3f0772a3a2adc18af87 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.h
+++ b/tensorflow/compiler/tf2xla/xla_expression.h
@@ -32,11 +32,16 @@ namespace tensorflow {
 // * a constant tensor.
 // * an xla::XlaOp, representing a symbolic XLA value.
 // * a resource, e.g., a variable, represented as an XlaResource pointer.
+// * a tensor list, represented by a tuple of tensors and the list length.
 //
 // Constant tensors are mostly an optimization to avoid passing large constants
 // to XLA, but are also sometimes used to represent tensors that have no XLA
 // representation, for example, DT_STRING tensors. A canonical use case might be
 // an error message string.
+//
+// Tensor lists are very similar to xla::XlaOp, however they require some
+// specific logic around shape management since the tuples are not supported by
+// TensorFlow.
 class XlaExpression {
  public:
   enum class Kind {
@@ -44,6 +49,7 @@ class XlaExpression {
     kConstant,
     kXlaOp,
     kResource,
+    kTensorList,
   };
 
   XlaExpression();
@@ -62,6 +68,9 @@ class XlaExpression {
   // be derived from the XLA type.
   static XlaExpression XlaOp(xla::XlaOp value, DataType dtype);
 
+  // Builds a tensor list expression.
+  static XlaExpression TensorList(xla::XlaOp tensor_list);
+
   // Builds a resource expression.
   static XlaExpression Resource(XlaResource* resource);
 
@@ -100,7 +109,8 @@ class XlaExpression {
 
   DataType dtype_ = DT_INVALID;
 
-  // The XLA handle of the expression's computation, if kind_ == kXlaOp.
+  // The XLA handle of the expression's computation, if kind_ == kXlaOp or
+  // a tuple expression if kind_ == kTensorList.
   xla::XlaOp handle_;
 
   // The value of the constant, if kind_ == kConstant.
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index c2c0751211180c3715a19d6c78e34659fd18914e..04a5d934064a9083a41cc210b48df65bbc862fff 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -34,63 +34,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-
-xla::XlaOp ArgMinMax(xla::XlaOp input, xla::PrimitiveType output_type, int axis,
-                     bool is_min) {
-  xla::XlaBuilder* builder = input.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
-    xla::XlaOp init_value;
-    xla::XlaComputation reducer;
-    if (is_min) {
-      init_value = xla::MaxValue(builder, input_shape.element_type());
-      reducer =
-          xla::CreateScalarMinComputation(input_shape.element_type(), builder);
-    } else {
-      init_value = xla::MinValue(builder, input_shape.element_type());
-      reducer =
-          xla::CreateScalarMaxComputation(input_shape.element_type(), builder);
-    }
-
-    xla::XlaOp input_max = xla::Reduce(input, init_value, reducer,
-                                       /*dimensions_to_reduce=*/{axis});
-    std::vector<int64> broadcast_dims(xla::ShapeUtil::Rank(input_shape) - 1);
-    std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
-    std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-    // Compute a mask that has 1s for elements equal to the maximum.
-    xla::XlaOp partial_mask = xla::ConvertElementType(
-        xla::Eq(input, input_max, broadcast_dims), output_type);
-
-    // In order to make identity elements for a bitwise And, we:
-    //   Left shift the 1 to the leftmost bit, yielding 0x10...0
-    //   Arithmetic right shift the 1 back to the rightmost bit, yielding
-    //   0xFF...F
-    int32 bits_in_type =
-        xla::ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1;
-    xla::XlaOp shift_amount =
-        xla::ConstantR0WithType(builder, output_type, bits_in_type);
-    xla::XlaOp full_mask = xla::ShiftRightArithmetic(
-        xla::ShiftLeft(partial_mask, shift_amount), shift_amount);
-
-    // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
-    // index.
-
-    const int64 axis_size = xla::ShapeUtil::GetDimension(input_shape, axis);
-    xla::XlaOp iota = xla::Iota(builder, output_type, axis_size);
-    xla::XlaOp product =
-        xla::And(full_mask, iota, /*broadcast_dimensions=*/{axis});
-
-    // If there are multiple maximum elements, choose the one with the highest
-    // index.
-    return xla::Reduce(product, xla::MinValue(builder, output_type),
-                       xla::CreateScalarMaxComputation(output_type, builder),
-                       /*dimensions_to_reduce=*/{axis});
-  });
-}
-
-}  // namespace
-
 xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
@@ -120,7 +63,7 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type,
 /* static */ Status XlaHelpers::ReshapeLiteral(
     const xla::Literal& input, absl::Span<const int64> dimensions,
     xla::Literal* output) {
-  if (xla::ShapeUtil::IsTuple(input.shape())) {
+  if (input.shape().IsTuple()) {
     return errors::InvalidArgument("ReshapeLiteral does not support tuples.");
   }
   xla::Shape shape =
@@ -148,16 +91,6 @@ static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
   return linspace;
 }
 
-xla::XlaOp XlaHelpers::ArgMax(xla::XlaOp input, xla::PrimitiveType output_type,
-                              int axis) {
-  return ArgMinMax(input, output_type, axis, /*is_min=*/false);
-}
-
-xla::XlaOp XlaHelpers::ArgMin(xla::XlaOp input, xla::PrimitiveType output_type,
-                              int axis) {
-  return ArgMinMax(input, output_type, axis, /*is_min=*/true);
-}
-
 Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis,
                           DataType index_type, const TensorShape& indices_shape,
                           const xla::XlaOp& indices, const xla::XlaOp& on_value,
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 4858dfee55a393d04cd2af83916eeb40820ee368..490923526bd3acd4b167ccb3faff1d6c9e631131 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -53,16 +53,6 @@ class XlaHelpers {
                                absl::Span<const int64> shape,
                                xla::Literal* output);
 
-  // Returns the argmax of `input` along `axis`. `output_type` is the type to
-  // use for the output.
-  static xla::XlaOp ArgMax(xla::XlaOp input, xla::PrimitiveType output_type,
-                           int axis);
-
-  // Returns the argmin of `input` along `axis`. `output_type` is the type to
-  // use for the output.
-  static xla::XlaOp ArgMin(xla::XlaOp input, xla::PrimitiveType output_type,
-                           int axis);
-
   // Converts `indices` into a one-hot representation. `depth` is the size
   // of the new axis to add. `axis` is the position at which to add the new
   // axis. `indices_shape` is the shape of `indices`. `on_value` and
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index fabbcd04fed96ad814d04c2df9394f43bfe0cf99..884dc45cb11b18ae557c3da3f4192b3805cb7980 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -135,24 +135,34 @@ XlaJitCompiledCpuFunction::Compile(
   jit->arg_index_table_ = std::move(arg_index_table);
   jit->program_shape_ =
       absl::make_unique<xla::ProgramShapeProto>(program_shape->ToProto());
-  jit->static_data_.set_raw_function(raw_function);
-  jit->static_data_.set_buffer_infos(jit->buffer_infos_.data());
-  jit->static_data_.set_num_buffers(jit->buffer_infos_.size());
-  jit->static_data_.set_arg_index_table(jit->arg_index_table_.data());
-  jit->static_data_.set_num_args(jit->arg_index_table_.size());
-  jit->static_data_.set_result_index(result_index);
+  XlaCompiledCpuFunction::set_static_data_raw_function(&jit->static_data_,
+                                                       raw_function);
+  XlaCompiledCpuFunction::set_static_data_buffer_infos(
+      &jit->static_data_, jit->buffer_infos_.data());
+  XlaCompiledCpuFunction::set_static_data_num_buffers(
+      &jit->static_data_, jit->buffer_infos_.size());
+  XlaCompiledCpuFunction::set_static_data_arg_index_table(
+      &jit->static_data_, jit->arg_index_table_.data());
+  XlaCompiledCpuFunction::set_static_data_num_args(
+      &jit->static_data_, jit->arg_index_table_.size());
+  XlaCompiledCpuFunction::set_static_data_result_index(&jit->static_data_,
+                                                       result_index);
   // Optional metadata is collected and set below.
   CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_);
   CollectNames(config.fetch(), &jit->nonempty_result_names_,
                &jit->result_names_);
-  jit->static_data_.set_arg_names(jit->arg_names_.data());
-  jit->static_data_.set_result_names(jit->result_names_.data());
-  jit->static_data_.set_program_shape(jit->program_shape_.get());
+  XlaCompiledCpuFunction::set_static_data_arg_names(&jit->static_data_,
+                                                    jit->arg_names_.data());
+  XlaCompiledCpuFunction::set_static_data_result_names(
+      &jit->static_data_, jit->result_names_.data());
+  XlaCompiledCpuFunction::set_static_data_program_shape(
+      &jit->static_data_, jit->program_shape_.get());
 
   if (cpu_executable->hlo_profiling_enabled()) {
-    jit->static_data_.set_hlo_profile_printer_data(
-        &cpu_executable->hlo_profile_printer_data());
-    jit->static_data_.set_profile_counters_size(
+    XlaCompiledCpuFunction::set_static_data_hlo_profile_printer_data(
+        &jit->static_data_, &cpu_executable->hlo_profile_printer_data());
+    XlaCompiledCpuFunction::set_static_data_profile_counters_size(
+        &jit->static_data_,
         cpu_executable->hlo_profile_printer_data().profile_counters_size());
   }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 58808c76de6330a6b28e21dbdead03dea25847f6..78bc2c94425e00c2b26058daf609d71f1853664e 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -93,7 +93,7 @@ TensorShape XlaOpKernelContext::InputShape(absl::string_view name) {
 }
 
 DataType XlaOpKernelContext::input_type(int index) const {
-  return context_->input(index).dtype();
+  return context_->input_dtype(index);
 }
 
 DataType XlaOpKernelContext::InputType(absl::string_view name) {
@@ -178,7 +178,7 @@ Status XlaOpKernelContext::ConstantInputReshaped(
 // Converts an int32 or int64 scalar literal to an int64.
 static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal,
                                    int64* out) {
-  if (xla::ShapeUtil::Rank(literal.shape()) != 0) {
+  if (literal.shape().rank() != 0) {
     return errors::InvalidArgument("value is not a scalar");
   }
   if (literal.shape().element_type() == xla::S32) {
@@ -194,7 +194,7 @@ static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal,
 // Converts an float32 or float64 scalar literal to a float64.
 static Status LiteralToFloat64Scalar(const xla::LiteralSlice& literal,
                                      double* out) {
-  if (xla::ShapeUtil::Rank(literal.shape()) != 0) {
+  if (literal.shape().rank() != 0) {
     return errors::InvalidArgument("value is not a scalar");
   }
   if (literal.shape().element_type() == xla::F32) {
@@ -228,8 +228,9 @@ Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) {
 // Converts an int32 or int64 1D literal to an int64 vector.
 static Status LiteralToInt64Vector(const xla::LiteralSlice& literal,
                                    std::vector<int64>* out) {
-  if (xla::ShapeUtil::Rank(literal.shape()) != 1) {
-    return errors::InvalidArgument("value is not 1D");
+  if (literal.shape().rank() != 1) {
+    return errors::InvalidArgument("value is not 1D, rank: ",
+                                   literal.shape().rank());
   }
   int64 size = xla::ShapeUtil::ElementsIn(literal.shape());
   if (literal.shape().element_type() == xla::S32) {
@@ -353,8 +354,8 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
   TF_RET_CHECK(variable != nullptr);
   TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
   if (!variable->initialized()) {
-    return errors::InvalidArgument("Read of uninitialized variable ",
-                                   variable->name());
+    return errors::FailedPrecondition("Read of uninitialized variable ",
+                                      variable->name());
   }
   if (variable->type() != type) {
     return errors::InvalidArgument(
@@ -456,6 +457,11 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   SetOutputExpression(index, XlaExpression::Constant(constant));
 }
 
+void XlaOpKernelContext::SetTensorListOutput(int index,
+                                             const xla::XlaOp& handle) {
+  SetOutputExpression(index, XlaExpression::TensorList(handle));
+}
+
 void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
   SetOutputExpression(index, XlaExpression::Resource(resource));
 }
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 1858844bc05a6e12abbf07af83cad816590ddd03..e44415f60bff82fb92d0cf4ec81935564a2f083a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -168,6 +168,9 @@ class XlaOpKernelContext {
   // Returns an XlaExpression describing the value of 'index'.
   void SetOutputExpression(int index, const XlaExpression& expression);
 
+  // Sets output `index` to the Tensor List `handle`.
+  void SetTensorListOutput(int index, const xla::XlaOp& handle);
+
   // Status handling.
   void SetStatus(const Status& status) { context_->SetStatus(status); }
   Status status() { return context_->status(); }
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 0bdd4a1085445420a5147756daac4a54f4725f11..ce3b6b298c6dc5a08e7b794bbab3a28575967d28 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -47,13 +47,14 @@ extern const char* const DEVICE_XLA_GPU;
 
 constexpr std::array<DataType, 4> kFloatTypes = {
     {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}};
-constexpr std::array<DataType, 11> kNumericTypes = {
+constexpr std::array<DataType, 12> kNumericTypes = {
     {DT_UINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_INT32, DT_INT64, DT_HALF,
-     DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BFLOAT16}};
+     DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 14> kCpuAllTypes = {
+constexpr std::array<DataType, 15> kCpuAllTypes = {
     {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32,
-     DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+     DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
+     DT_COMPLEX128, DT_BOOL}};
 
 constexpr std::array<DataType, 15> kGpuAllTypes = {
     {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32,
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 722d1376687efa1c04158e3fd9ce539aac9d0122..636e5ef721f58c009566c10a653d09a7667619c0 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -152,7 +152,7 @@ cc_library(
         ":status",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor/lib",
     ],
 )
 
@@ -717,6 +717,7 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -741,6 +742,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -824,6 +826,7 @@ cc_library(
         "debug_options_parsers.h",
     ],
     hdrs = ["debug_options_flags.h"],
+    visibility = [":friends"],
     deps =
         [
             ":parse_flags_from_env",
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 58cc1575858201b4508d7340cb47e59c4f4c5783..529e7f77cec43f3158fcb59a53efa9a085d7422a 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -272,6 +272,15 @@ class Array {
     std::iota(&values_[0], &values_[0] + num_elements(), value);
   }
 
+  // Fills the array with a repeating sequence:
+  //   [value, value + 1, ..., value + length - 1, value, ... ]
+  void FillRepeatedIota(const T& value, int64 length) {
+    for (int64 i = 0; i < num_elements(); i += length) {
+      std::iota(&values_[i], &values_[std::min(i + length, num_elements())],
+                value);
+    }
+  }
+
   // Fills the array with the sequence i*multiplier for i=0,1,...
   void FillWithMultiples(const T& multiplier) {
     for (int64 i = 0; i < num_elements(); ++i) {
@@ -280,11 +289,11 @@ class Array {
   }
 
   // Fills the array with random normal variables with the specified mean.
-  void FillRandom(const T& value, const double mean = 0.0,
+  void FillRandom(const T& stddev, const double mean = 0.0,
                   const int seed = 12345) {
     std::mt19937 g(seed);
     std::normal_distribution<double> distribution(mean,
-                                                  static_cast<double>(value));
+                                                  static_cast<double>(stddev));
     for (int64 i = 0; i < num_elements(); ++i) {
       values_[i] = static_cast<T>(distribution(g));
     }
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 27c075e8f13f6777af4e837501d97a33034313f5..f5d56e8a9e1f3a05e1039f7cc90194407200f1ab 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -246,6 +246,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 43127cae1e5d81521003a28288e27d291e33c9b9..4f020bcec2756a328755d86ab04154d54f532465 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -278,53 +278,51 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     const XlaComputation& computation, absl::Span<GlobalData* const> arguments,
     const ExecutionOptions* execution_options,
     ExecutionProfile* execution_profile) {
-  if (execution_options != nullptr &&
-      execution_options->device_handles_size() > 1) {
-    std::vector<XlaComputationInstance> computation_instances = {
-        XlaComputationInstance{
-            computation,
-            std::vector<GlobalData*>(arguments.begin(), arguments.end()),
-            *execution_options, execution_profile}};
-    TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances));
-    // The result selection is a bit hacky, but better than assuming it is
-    // device 0.
-    //
-    // TODO(b/118493728): Allow Execute to return one result per computation.
-    for (int64 i = 0; i < results.size(); i++) {
-      TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i]));
-      if (!ShapeUtil::IsEmptyTuple(shape)) {
-        VLOG(3) << "Fetching result from device " << i << ": "
-                << ShapeUtil::HumanString(shape);
-        return std::move(results[i]);
-      }
+  // Create an ExecutionOptions if necessary, or set its DeviceHandles.
+  absl::optional<ExecutionOptions> options_storage;
+  if (!execution_options || execution_options->device_handles().empty()) {
+    if (execution_options) {
+      options_storage.emplace(*execution_options);
+    } else {
+      options_storage.emplace(CreateDefaultExecutionOptions());
     }
-    TF_RET_CHECK(!results.empty());
-    VLOG(1) << "Defaulting to device 0 result";
-    return std::move(results[0]);
-  }
-
-  // The argument shapes affect how the computation is compiled.
-  std::vector<Shape> arg_shapes(arguments.size());
-  for (int i = 0; i < arguments.size(); i++) {
-    TF_ASSIGN_OR_RETURN(arg_shapes[i], GetShape(*arguments[i]));
-  }
-
-  TF_ASSIGN_OR_RETURN(auto handle,
-                      Compile(computation, arg_shapes, execution_options));
-
-  TF_ASSIGN_OR_RETURN(auto result,
-                      Execute(handle, arguments, execution_profile));
-
-  if (execution_profile != nullptr) {
-    if (VLOG_IS_ON(1)) {
-      TF_ASSIGN_OR_RETURN(
-          auto execution_stats,
-          ExecutionStatsAsString(computation, *execution_profile));
-      VLOG(1) << execution_stats;
+    execution_options = &*options_storage;
+
+    TF_ASSIGN_OR_RETURN(auto device_handles,
+                        GetDeviceHandles(/*device_count=*/1));
+    TF_RET_CHECK(!device_handles.empty());
+    *options_storage->add_device_handles() = std::move(device_handles[0]);
+  }
+
+  std::vector<XlaComputationInstance> computation_instances = {
+      XlaComputationInstance{
+          computation,
+          std::vector<GlobalData*>(arguments.begin(), arguments.end()),
+          *execution_options, execution_profile}};
+
+  // Instead of invoking Compile() and Execute(), invoke
+  // Service::ExecuteParallel() to execute our one computation.  Compile()
+  // caches the executable forever, which isn't what we want.
+  VLOG(1) << "Making ExecuteParallel request: "
+          << execution_options->DebugString();
+  TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances));
+  VLOG(1) << "ExecuteParallel request done.";
+
+  // The result selection is a bit hacky, but better than assuming it is
+  // device 0.
+  //
+  // TODO(b/118493728): Allow Execute to return one result per computation.
+  for (int64 i = 0; i < results.size(); i++) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i]));
+    if (!ShapeUtil::IsEmptyTuple(shape)) {
+      VLOG(3) << "Fetching result from device " << i << ": "
+              << ShapeUtil::HumanString(shape);
+      return std::move(results[i]);
     }
   }
-
-  return std::move(result);
+  TF_RET_CHECK(!results.empty());
+  VLOG(1) << "Defaulting to device 0 result";
+  return std::move(results[0]);
 }
 
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index d0ac4703c632e0e01d3c8911594b46fedf28930d..eff8713ac340e82ee7633f1f078334ba73b67b2f 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -52,6 +52,12 @@ class Client {
   //   need to live beyond this call.)
   // * If execution_options.device_handles should be empty. If you need
   //   non-empty device handles, call 'Execute' instead.
+  //
+  // TODO(b/122731460): This call caches the resulting Executable in the Service
+  // *forever*.  If you're only going to run the computation once, you may want
+  // to call the Execute(const XlaComputation&) overload.  If you're going to
+  // run the computation more than once but you want control over when the
+  // Executable is unloaded, use the LocalClient API.
   StatusOr<ExecutionHandle> Compile(
       const XlaComputation& computation,
       absl::Span<const Shape> argument_shapes,
@@ -76,6 +82,10 @@ class Client {
   //   device is chosen by the service.
   // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
   //   will be filled with profile data from the execution.
+  //
+  // TODO(b/122731460): The given computation is compiled and then thrown away
+  // immediately after it's run.  If you want control over how long the
+  // resulting Executable lives, use the LocalClient API.
   StatusOr<std::unique_ptr<GlobalData>> Execute(
       const XlaComputation& computation,
       absl::Span<GlobalData* const> arguments,
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 1f594e551af381d7537e947892cbf7e0b5b3b861..ec0e08975926f36c36c854f83a40b374b12a09a4 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -58,6 +58,12 @@ const Shape* ExecutableBuildOptions::result_layout() const {
   return result_layout_set_ ? &result_layout_ : nullptr;
 }
 
+ExecutableBuildOptions& ExecutableBuildOptions::set_num_replicas(
+    int num_replicas) {
+  num_replicas_ = num_replicas;
+  return *this;
+}
+
 string ExecutableBuildOptions::ToString() const {
   string result_layout = "nullopt";
   if (result_layout_set_) {
@@ -65,8 +71,9 @@ string ExecutableBuildOptions::ToString() const {
   }
   return absl::StrFormat(
       "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
-      "generate_hlo_graph=%s}",
-      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph());
+      "generate_hlo_graph=%s, num_replicas=%d}",
+      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph(),
+      num_replicas_);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index a58090253bfac7779e4b61bc7231a0f0d945cc00..1d85fb34304b95d1fccdb0b0d6a7a65e739fae18 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -67,12 +67,18 @@ class ExecutableBuildOptions {
   // debugging.
   string ToString() const;
 
+  // The number of replicas of this computation that are to be executed.
+  // Defaults to 1.
+  int num_replicas() const { return num_replicas_; }
+  ExecutableBuildOptions& set_num_replicas(int num_replicas);
+
  private:
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
+  int num_replicas_ = 1;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 826b13fe3733b3334d2213eeb1d10cdd53d2f134..26c5e8eb73f0908cdc2d7df65936fadeda627423 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -34,6 +34,21 @@ cc_library(
     ],
 )
 
+xla_test(
+    name = "arithmetic_test",
+    srcs = ["arithmetic_test.cc"],
+    deps = [
+        ":arithmetic",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "cholesky",
     srcs = ["cholesky.cc"],
@@ -76,6 +91,39 @@ xla_test(
     ],
 )
 
+cc_library(
+    name = "comparators",
+    srcs = ["comparators.cc"],
+    hdrs = ["comparators.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_test(
+    name = "comparators_test",
+    srcs = ["comparators_test.cc"],
+    deps = [
+        ":comparators",
+        ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:inlined_vector",
+    ],
+)
+
 cc_library(
     name = "constants",
     srcs = ["constants.cc"],
@@ -93,7 +141,6 @@ cc_library(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":constants",
         "//tensorflow/compiler/xla:test",
@@ -147,7 +194,22 @@ cc_library(
 xla_test(
     name = "math_test",
     srcs = ["math_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
+    deps = [
+        ":math",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
+xla_test(
+    name = "math_exhaustive_test",
+    srcs = ["math_exhaustive_test.cc"],
+    shard_count = 16,
     deps = [
         ":math",
         "//tensorflow/compiler/xla:literal_util",
@@ -168,12 +230,16 @@ cc_library(
         ":arithmetic",
         ":constants",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -181,16 +247,19 @@ cc_library(
 xla_test(
     name = "matrix_test",
     srcs = ["matrix_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":matrix",
         ":slicing",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -229,7 +298,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/base",
     ],
 )
@@ -281,12 +349,7 @@ cc_library(
     srcs = ["slicing.cc"],
     hdrs = ["slicing.h"],
     deps = [
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "@com_google_absl//absl/types:span",
     ],
@@ -295,13 +358,11 @@ cc_library(
 xla_test(
     name = "slicing_test",
     srcs = ["slicing_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":slicing",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -324,12 +385,10 @@ cc_library(
 xla_test(
     name = "sorting_test",
     srcs = ["sorting_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":sorting",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -352,7 +411,10 @@ cc_library(
 xla_test(
     name = "quantize_test",
     srcs = ["quantize_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
+    # TODO(b/122119490): re-enable TAP after fixing.
+    tags = [
+        "notap",
+    ],
     deps = [
         ":quantize",
         "//tensorflow/compiler/xla:test",
@@ -410,24 +472,23 @@ cc_library(
 xla_test(
     name = "triangular_solve_test",
     srcs = ["triangular_solve_test.cc"],
-    tags = ["noasan"],  # sometimes times out, http://b/78650012
+    tags = [
+        "enable_for_xla_interpreter",
+        "noasan",  # sometimes times out, http://b/78650012
+    ],
     deps = [
+        ":math",
         ":matrix",
         ":triangular_solve",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index e86c10f030f3990d67e5a6638100640f73c82307..3b875135af29f142463ffd783bfeaadc61ada1af 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -117,10 +117,70 @@ XlaOp Any(XlaOp predicates) {
     XlaComputation logical_or = CreateScalarOrComputation(PRED, builder);
     TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
                         builder->GetShape(predicates));
-    std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
+    std::vector<int64> all_dimensions(predicates_shape.rank());
     std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
     return Reduce(predicates, f, logical_or, all_dimensions);
   });
 }
 
+namespace {
+
+XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    XlaOp init_value;
+    XlaComputation reducer;
+    if (is_min) {
+      init_value = MaxValue(builder, input_shape.element_type());
+      reducer = CreateScalarMinComputation(input_shape.element_type(), builder);
+    } else {
+      init_value = MinValue(builder, input_shape.element_type());
+      reducer = CreateScalarMaxComputation(input_shape.element_type(), builder);
+    }
+
+    XlaOp input_max = Reduce(input, init_value, reducer,
+                             /*dimensions_to_reduce=*/{axis});
+    std::vector<int64> broadcast_dims(input_shape.rank() - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
+    std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
+    // Compute a mask that has 1s for elements equal to the maximum.
+    XlaOp partial_mask =
+        ConvertElementType(Eq(input, input_max, broadcast_dims), output_type);
+
+    // In order to make identity elements for a bitwise And, we:
+    //   Left shift the 1 to the leftmost bit, yielding 0x10...0
+    //   Arithmetic right shift the 1 back to the rightmost bit, yielding
+    //   0xFF...F
+    int32 bits_in_type =
+        ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1;
+    XlaOp shift_amount = ConstantR0WithType(builder, output_type, bits_in_type);
+    XlaOp full_mask = ShiftRightArithmetic(
+        ShiftLeft(partial_mask, shift_amount), shift_amount);
+
+    // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
+    // index.
+
+    const int64 axis_size = ShapeUtil::GetDimension(input_shape, axis);
+    XlaOp iota = Iota(builder, output_type, axis_size);
+    XlaOp product = And(full_mask, iota, /*broadcast_dimensions=*/{axis});
+
+    // If there are multiple maximum elements, choose the one with the highest
+    // index.
+    return Reduce(product, MinValue(builder, output_type),
+                  CreateScalarMaxComputation(output_type, builder),
+                  /*dimensions_to_reduce=*/{axis});
+  });
+}
+
+}  // namespace
+
+XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis) {
+  return ArgMinMax(input, output_type, axis, /*is_min=*/false);
+}
+
+XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis) {
+  return ArgMinMax(input, output_type, axis, /*is_min=*/true);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index 632e8cc8bc64fad236a0226c6e93079aadde7050..d4a7812c441c351b121e5d72faf9642b06728b18 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -57,6 +57,14 @@ XlaComputation CreateScalarOrComputation(PrimitiveType type,
 // Note: if predicates is zero-sized, Any() vacuously returns false.
 XlaOp Any(XlaOp predicates);
 
+// Returns the argmax of `input` along `axis`. `output_type` is the type to
+// use for the output.
+XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis);
+
+// Returns the argmin of `input` along `axis`. `output_type` is the type to
+// use for the output.
+XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a13839f9db89b9c07f2465867a503ef2193f8160
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+using ArithmeticTest = ClientLibraryTestBase;
+
+XLA_TEST_F(ArithmeticTest, ArgMinR2Axis0) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMin(x, S32, /*axis=*/0);
+
+  std::vector<int32> expected = {0, 2, 2};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMinR2Axis1) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMin(x, S32, /*axis=*/1);
+
+  std::vector<int32> expected = {0, 1, 2};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis0) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMax(x, S32, /*axis=*/0);
+
+  std::vector<int32> expected = {2, 0, 1};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis1) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMax(x, S32, /*axis=*/1);
+
+  std::vector<int32> expected = {1, 0, 0};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/cholesky.cc b/tensorflow/compiler/xla/client/lib/cholesky.cc
index fd98049968491d80b9717a2de1f34997bd9d18c1..414bd1494cd32f32a5c37e84119de930678a776b 100644
--- a/tensorflow/compiler/xla/client/lib/cholesky.cc
+++ b/tensorflow/compiler/xla/client/lib/cholesky.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,7 +55,7 @@ XlaOp CholeskyUnblocked(XlaOp a, PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
-    const int n_dims = ShapeUtil::Rank(a_shape);
+    const int n_dims = a_shape.rank();
     const int64 n = ShapeUtil::GetDimension(a_shape, -1);
     auto major_dims = AsInt64Slice(a_shape.dimensions())
                           .subspan(
@@ -67,29 +68,26 @@ XlaOp CholeskyUnblocked(XlaOp a, PrecisionConfig::Precision precision) {
     auto body_fn =
         [&](XlaOp i, absl::Span<const XlaOp> loop_vars,
             XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
-      Shape col_shape;
-      Shape row_shape;
-      for (int64 d : major_dims) {
-        row_shape.add_dimensions(d);
-        col_shape.add_dimensions(d);
-      }
-      row_shape.add_dimensions(1);
-      row_shape.add_dimensions(n);
-      row_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_row = Zeros(body_builder, row_shape);
-
-      col_shape.add_dimensions(n);
-      col_shape.add_dimensions(1);
-      col_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_col = Zeros(body_builder, col_shape);
-
-      std::vector<int32> mask_vector(n);
-      std::iota(mask_vector.begin(), mask_vector.end(), 0);
-      auto mask_range = ConstantR1<int32>(body_builder, mask_vector);
+      std::vector<int64> row_shape_dims(major_dims.begin(), major_dims.end());
+      std::vector<int64> col_shape_dims(major_dims.begin(), major_dims.end());
+      row_shape_dims.push_back(1);
+      row_shape_dims.push_back(n);
+      auto mask_zeros_row =
+          Zeros(body_builder,
+                ShapeUtil::MakeShape(a_shape.element_type(), row_shape_dims));
+
+      col_shape_dims.push_back(n);
+      col_shape_dims.push_back(1);
+      auto mask_zeros_col =
+          Zeros(body_builder,
+                ShapeUtil::MakeShape(a_shape.element_type(), col_shape_dims));
+
       auto mask_range_row =
-          Broadcast(Reshape(mask_range, {0}, {1, n}), major_dims);
+          Iota(body_builder, ShapeUtil::MakeShape(S32, row_shape_dims),
+               /*iota_dimension=*/n_dims - 1);
       auto mask_range_col =
-          Broadcast(Reshape(mask_range, {0}, {n, 1}), major_dims);
+          Iota(body_builder, ShapeUtil::MakeShape(S32, col_shape_dims),
+               /*iota_dimension=*/n_dims - 2);
       auto body_a = loop_vars[0];
       auto body_l = loop_vars[1];
 
@@ -144,7 +142,7 @@ XlaOp Cholesky(XlaOp a, int64 block_size,
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
-    const int ndims = ShapeUtil::Rank(a_shape);
+    const int ndims = a_shape.rank();
     if (ndims < 2) {
       return InvalidArgument(
           "Argument to Cholesky must have rank >= 2; shape was %s",
@@ -158,6 +156,12 @@ XlaOp Cholesky(XlaOp a, int64 block_size,
           ShapeUtil::HumanString(a_shape));
     }
 
+    if (primitive_util::IsComplexType(a_shape.element_type())) {
+      return Unimplemented(
+          "Complex types are not implemented in Cholesky; got shape %s",
+          ShapeUtil::HumanString(a_shape));
+    }
+
     if (block_size < 1) {
       return InvalidArgument(
           "block_size argument to Cholesky must be >= 1; got %d", block_size);
diff --git a/tensorflow/compiler/xla/client/lib/cholesky_test.cc b/tensorflow/compiler/xla/client/lib/cholesky_test.cc
index ba9580a3d32225625acc1447344b7d2c16c5d8a5..095dd4fbf8b7c90047c4428b50c626c16e9c1e94 100644
--- a/tensorflow/compiler/xla/client/lib/cholesky_test.cc
+++ b/tensorflow/compiler/xla/client/lib/cholesky_test.cc
@@ -157,10 +157,10 @@ XLA_TEST_P(RandomCholeskyTest, Random) {
                              xla::ErrorSpec(1e-4, 1e-4));
 }
 
-INSTANTIATE_TEST_CASE_P(RandomCholeskyTestInstance, RandomCholeskyTest,
-                        ::testing::Values(CholeskyTestCase{1, 1},
-                                          CholeskyTestCase{1, 2},
-                                          CholeskyTestCase{10, 5},
-                                          CholeskyTestCase{2, 20}));
+INSTANTIATE_TEST_SUITE_P(RandomCholeskyTestInstance, RandomCholeskyTest,
+                         ::testing::Values(CholeskyTestCase{1, 1},
+                                           CholeskyTestCase{1, 2},
+                                           CholeskyTestCase{10, 5},
+                                           CholeskyTestCase{2, 20}));
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c620c9841a5146618e3a142adeb3fe2da525950a
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/comparators.cc
@@ -0,0 +1,159 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+using XlaOpGenerator = XlaOp (*)(const XlaOp&, const XlaOp&,
+                                 absl::Span<const int64>);
+
+XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
+                                            int64 bit_width) {
+  PrimitiveType signed_type;
+  PrimitiveType unsigned_type;
+  XlaOp max_value;
+  switch (bit_width) {
+    case 16:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint16>(std::numeric_limits<int16>::max()));
+      signed_type = S16;
+      unsigned_type = U16;
+      break;
+    case 32:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint32>(std::numeric_limits<int32>::max()));
+      signed_type = S32;
+      unsigned_type = U32;
+      break;
+    case 64:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint64>(std::numeric_limits<int64>::max()));
+      signed_type = S64;
+      unsigned_type = U64;
+      break;
+    default:
+      return value.builder()->ReportError(
+          InvalidArgument("Invalid bit width %lld for Comparator floating "
+                          "point parameter.",
+                          bit_width));
+  }
+  // Switch from a floating point value to a integer value in such a way that
+  // when using the integer value to compare, we get the same result for normal
+  // values, and -Nan is treated as the smallest value, and Nan is treated as
+  // the largest value.
+  // If f is a float, and
+  // x = bit_cast<int32>(f);
+  // y = x < 0 ? numeric_limits<int32>::max() - x : x;
+  // then y is ordered as an int32 such that finite values have the obvious
+  // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning
+  // and end of the ordering.
+  // Note that in order to avoid -x to overflow, we calculate
+  // numeric_limits<int32>::max() - x as unsigned, and then convert back to
+  // signed.
+  auto signed_value = BitcastConvertType(value, signed_type);
+  auto unsigned_value = BitcastConvertType(value, unsigned_type);
+  auto flipped_value =
+      BitcastConvertType(Sub(max_value, unsigned_value), signed_type);
+  auto is_negative = Lt(signed_value, Zero(value.builder(), signed_type));
+  return Select(is_negative, flipped_value, signed_value);
+}
+
+XlaComputation CreateScalarComparisonComputation(
+    const string& name, const std::vector<PrimitiveType>& operand_types,
+    XlaBuilder* builder, XlaOpGenerator generator) {
+  // Create a default computation where we compare only the first two
+  // parameters of type 'operand_types[0]'.
+  auto b = builder->CreateSubBuilder(name);
+  if (operand_types.empty()) {
+    b->ReportError(InvalidArgument("operand_types should not be empty"));
+    return b->BuildAndNoteError();
+  }
+
+  int64 parameter_count = 0;
+  XlaOp first_lhs_param;
+  XlaOp first_rhs_param;
+
+  // For each type in 'operand_types' we create two parameters of this type. The
+  // idea is that this computation can be used by n-ary Sort, and potentially
+  // should support comparing also the other operands of sort. In this default
+  // computation, however, we will not actually use any parameters except the
+  // first two.
+  for (auto operand_type : operand_types) {
+    auto scalar_shape = ShapeUtil::MakeShape(operand_type, {});
+    auto lhs_param = Parameter(b.get(), parameter_count * 2, scalar_shape,
+                               absl::StrCat("p.", parameter_count, ".lhs"));
+    auto rhs_param = Parameter(b.get(), parameter_count * 2 + 1, scalar_shape,
+                               absl::StrCat("p.", parameter_count, ".rhs"));
+    if (parameter_count == 0) {
+      first_lhs_param = lhs_param;
+      first_rhs_param = rhs_param;
+    }
+    ++parameter_count;
+  }
+  if (primitive_util::IsFloatingPointType(operand_types[0])) {
+    PrimitiveType compare_type = operand_types[0];
+    // Special-case handling for BF16. We currently do not support direct
+    // comparisons with BF16, so we convert to F32 and then use the F32
+    // comparison logic.
+    if (compare_type == BF16) {
+      compare_type = F32;
+      first_lhs_param = ConvertElementType(first_lhs_param, F32);
+      first_rhs_param = ConvertElementType(first_rhs_param, F32);
+    }
+    int64 bit_width = primitive_util::BitWidth(compare_type);
+    first_lhs_param =
+        BitcastConvertFloatingPointToIntegral(first_lhs_param, bit_width);
+    first_rhs_param =
+        BitcastConvertFloatingPointToIntegral(first_rhs_param, bit_width);
+  }
+  generator(first_lhs_param, first_rhs_param, {});
+  return b->BuildAndNoteError();
+}
+}  // namespace
+
+// Creates a scalar less-than computation and returns it.
+XlaComputation CreateScalarLtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder) {
+  return CreateScalarComparisonComputation("compare-less-than", operand_types,
+                                           builder, Lt);
+}
+
+// Creates a scalar greater-than computation and returns it.
+XlaComputation CreateScalarGtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder) {
+  return CreateScalarComparisonComputation("compare-greater-than",
+                                           operand_types, builder, Gt);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/comparators.h b/tensorflow/compiler/xla/client/lib/comparators.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbcfc227dd495537f59bf0a9090bad8ade15da62
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/comparators.h
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// Creates a scalar less-than computation and returns it. The created
+// computation has 2 * 'operand_types.size()' many parameters, where parameters
+// 2 * i and 2 * i + 1 are a scalar with primitive type 'operand_types[i]'. The
+// computation compares the first two parameters. For floating point types, a
+// total order is created where
+// -NaN < -infinity < ... < -0 < 0 < ... < infinity < NaN
+XlaComputation CreateScalarLtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder);
+
+// Creates a scalar greater-than computation and returns it. The created
+// computation has 2 * 'operand_types.size()' many parameters, where parameters
+// 2 * i and 2 * i + 1 are a scalar with primitive type 'operand_types[i]'. The
+// computation compares the first two parameters. For floating point types, a
+// total order is created where
+// NaN > infinity > ... > 0 > -0 > ... > -infinity > -NaN
+XlaComputation CreateScalarGtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_
diff --git a/tensorflow/compiler/xla/client/lib/comparators_test.cc b/tensorflow/compiler/xla/client/lib/comparators_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..598956803b34702b1e095a342648d348fa350b29
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/comparators_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+
+#include <limits>
+#include <vector>
+
+#include "absl/container/inlined_vector.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class ComparatorsTest : public ClientLibraryTestBase {
+ public:
+  ComparatorsTest() : builder_(TestName()) {}
+  XlaBuilder* builder() { return &builder_; }
+
+ private:
+  XlaBuilder builder_;
+};
+
+template <
+    PrimitiveType type,
+    typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+void BuildComparatorAndComparisons(ComparatorsTest* test,
+                                   bool compare_less_than,
+                                   absl::InlinedVector<bool, 10>* expected) {
+  auto compare = compare_less_than
+                     ? CreateScalarLtComputation({type}, test->builder())
+                     : CreateScalarGtComputation({type}, test->builder());
+
+  auto negative_nan = ConstantR0<T>(
+      test->builder(), -T(std::numeric_limits<float>::quiet_NaN()));
+  auto positive_nan = ConstantR0<T>(test->builder(),
+                                    T(std::numeric_limits<float>::quiet_NaN()));
+  auto negative_zero = ConstantR0<T>(test->builder(), T(-0.));
+  auto positive_zero = ConstantR0<T>(test->builder(), T(0.));
+  auto negative_infinity = MinValue(test->builder(), type);
+  auto positive_infinity = MaxValue(test->builder(), type);
+
+  // List the values in the expected sorting order from smallest to largest.
+  std::vector<XlaOp> all_constants{negative_nan,      negative_infinity,
+                                   negative_zero,     positive_zero,
+                                   positive_infinity, positive_nan};
+
+  // Do pairwise comparisons.
+  std::vector<XlaOp> all_comparisons;
+  for (const XlaOp& lhs_constant : all_constants) {
+    for (const XlaOp& rhs_constant : all_constants) {
+      all_comparisons.push_back(Broadcast(
+          Call(test->builder(), compare, {lhs_constant, rhs_constant}), {1}));
+    }
+  }
+
+  // Concantenate the comparison results.
+  ConcatInDim(test->builder(), all_comparisons, 0);
+
+  // If we use less-than comparisons, we expect the comparison to result in true
+  // if the lhs value to be compared appears earlier in 'all_constants' than the
+  // rhs value. Likewise, if we use greater-than comparisons, we expect the
+  // comparison to return true if the rhs value appears earlier in
+  // 'all_constants' than the lhs value.
+  expected->clear();
+  for (int i = 0; i < all_constants.size(); ++i) {
+    for (int j = 0; j < all_constants.size(); ++j) {
+      expected->push_back(compare_less_than ? i < j : i > j);
+    }
+  }
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtBF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<BF16>(this, /*compare_less_than=*/true,
+                                      &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtBF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<BF16>(this, /*compare_less_than=*/false,
+                                      &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F16>(this, /*compare_less_than=*/true,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F16>(this, /*compare_less_than=*/false,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtF32) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F32>(this, /*compare_less_than=*/true,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtF32) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F32>(this, /*compare_less_than=*/false,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtF64) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F64>(this, /*compare_less_than=*/true,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtF64) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F64>(this, /*compare_less_than=*/false,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h
index 81624614c1e3599dfe116eb61d9e2edcd5230684..4e5310a380e8bda15348dae2cbb0ea9e2c381bcb 100644
--- a/tensorflow/compiler/xla/client/lib/constants.h
+++ b/tensorflow/compiler/xla/client/lib/constants.h
@@ -56,6 +56,8 @@ XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) {
       return ConstantR0<double>(builder, static_cast<double>(value));
     case C64:
       return ConstantR0<complex64>(builder, static_cast<complex64>(value));
+    case C128:
+      return ConstantR0<complex128>(builder, static_cast<complex128>(value));
     case U8:
       return ConstantR0<uint8>(builder, static_cast<uint8>(value));
     case U32:
@@ -88,6 +90,27 @@ XlaOp ScalarLike(XlaOp prototype, T value) {
   });
 }
 
+// Returns an array or scalar containing copies of `value` cast to the same
+// run-type type as `prototype` and broadcast to the same dimensions as
+// `prototype`.
+//
+// If `prototype` is not a scalar or array, returns an error.
+template <typename T>
+XlaOp FullLike(XlaOp prototype, T value) {
+  XlaBuilder* builder = prototype.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(prototype));
+    if (ShapeUtil::IsScalar(shape) || shape.IsArray()) {
+      return Broadcast(ScalarLike(prototype, value), shape.dimensions());
+    } else {
+      return InvalidArgument(
+          "Prototype shape for BroadcastConstantLike must be a scalar or "
+          "array, but was %s",
+          shape.ToString());
+    }
+  });
+}
+
 // Returns a scalar with value '0' of 'type'.
 XlaOp Zero(XlaBuilder* builder, PrimitiveType type);
 
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 36fdda39b4124b9100c6054160f9c17bdf787d6f..253b3440e200d04e76fb64b90c1707d8a21869e8 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/math.h"
 
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
@@ -78,26 +79,79 @@ XlaOp EvaluatePolynomial(XlaOp x, absl::Span<const float> coefficients) {
 }
 
 // Compute an approximation of the error function complement (1 - erf(x)).
+//
+// TODO(jlebar): This is not particularly efficient.  The implementation in
+// Cephes that this follows was written for double precision, but our
+// coefficients are specified only to single-precision!  Cephes has a different,
+// simpler implementation for single-precision.
+//
+// Furthermore, we could simplify this further for f16 -- for example, because
+// exp(-4.2 * 4.2) = 0 (f16), the computations in service of the x < 8.0 branch
+// below are unnecessary.
+//
+// See also these alternate implementations of erf and erfc:
+//
+//   https://stackoverflow.com/questions/35148198
+//   https://stackoverflow.com/questions/35966695
+//
 XlaOp Erfc(XlaOp x) {
-  XlaOp abs_x = Abs(x);
-  XlaOp z = Exp(-x * x);
-
-  XlaOp pp = EvaluatePolynomial(abs_x, kErfcPCoefficient);
-  XlaOp pq = EvaluatePolynomial(abs_x, kErfcQCoefficient);
-  XlaOp pr = EvaluatePolynomial(abs_x, kErfcRCoefficient);
-  XlaOp ps = EvaluatePolynomial(abs_x, kErfcSCoefficient);
-
-  XlaOp y = Select(Lt(abs_x, ScalarLike(x, 8.0)), z * pp / pq, z * pr / ps);
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs.  (We could extend erfc to accept complex
+    // types, but it doesn't seem necessary at this point.)
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(x));
+    if (!ShapeUtil::ElementIsFloating(shape)) {
+      return InvalidArgument(
+          "erfc only accepts real floating-point arrays or scalars, but got %s",
+          shape.ToString());
+    }
+    XlaOp abs_x = Abs(x);
+    XlaOp z = Exp(-x * x);
+
+    XlaOp pp = EvaluatePolynomial(abs_x, kErfcPCoefficient);
+    XlaOp pq = EvaluatePolynomial(abs_x, kErfcQCoefficient);
+    XlaOp pr = EvaluatePolynomial(abs_x, kErfcRCoefficient);
+    XlaOp ps = EvaluatePolynomial(abs_x, kErfcSCoefficient);
+
+    XlaOp abs_x_small = Lt(abs_x, ScalarLike(x, 8.0));
+    XlaOp y = Select(abs_x_small, z * pp / pq, z * pr / ps);
+    XlaOp result_no_underflow =
+        Select(Lt(x, ScalarLike(x, 0.0)), ScalarLike(x, 2.0) - y, y);
+
+    // Check for edge cases, namely, exp(-x^2) is exactly 0, or the appropriate
+    // denominator (ps or pq) is inf.  (The check for exp(-x^2) == 0 is
+    // necessary only for x == +/- inf, where this check lets us avoid
+    // multiplying 0 by inf and getting nan.)
+    auto is_pos_inf = [](XlaOp op) {
+      return And(Not(IsFinite(op)), Gt(op, ScalarLike(op, 0)));
+    };
+    XlaOp underflow =
+        Or(Eq(z, ScalarLike(z, 0)), Or(And(is_pos_inf(pq), abs_x_small),
+                                       And(is_pos_inf(ps), Not(abs_x_small))));
+    XlaOp result_underflow =
+        Select(Lt(x, ScalarLike(x, 0)), FullLike(x, 2), FullLike(x, 0));
 
-  return Select(Lt(x, ScalarLike(x, 0.0)), ScalarLike(x, 2.0) - y, y);
+    return Select(underflow, result_underflow, result_no_underflow);
+  });
 }
 
 // Compute a polynomial approximation of the error function.
 XlaOp Erf(XlaOp x) {
-  XlaOp z = x * x;
-  XlaOp pt = EvaluatePolynomial(z, kErfTCoefficient);
-  XlaOp pu = EvaluatePolynomial(z, kErfUCoefficient);
-  return x * pt / pu;
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs.  (We could extend erf to accept complex
+    // types, but it doesn't seem necessary at this point.)
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(x));
+    if (!ShapeUtil::ElementIsFloating(shape)) {
+      return InvalidArgument(
+          "erf only accepts real floating-point arrays or scalars, but got %s",
+          shape.ToString());
+    }
+    XlaOp z = x * x;
+    XlaOp pt = EvaluatePolynomial(z, kErfTCoefficient);
+    XlaOp pu = EvaluatePolynomial(z, kErfUCoefficient);
+    return x * pt / pu;
+  });
 }
 
 // Approximation for the inverse error function from
@@ -113,37 +167,30 @@ XlaOp Erf(XlaOp x) {
 //   }
 //   return p*x
 XlaOp ErfInv(XlaOp x) {
-  XlaBuilder* b = x.builder();
-  return b->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, b->GetShape(x));
-    constexpr int kDegree = 9;
-    constexpr std::array<float, 9> w_less_than_5_constants = {
-        2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
-        -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
-        -0.00417768164f,  0.246640727f,    1.50140941f};
-    constexpr std::array<float, 9> w_greater_than_5_constants = {
-        -0.000200214257f, 0.000100950558f, 0.00134934322f,
-        -0.00367342844f,  0.00573950773f,  -0.0076224613f,
-        0.00943887047f,   1.00167406f,     2.83297682f};
+  constexpr int kDegree = 9;
+  constexpr std::array<float, 9> w_less_than_5_constants = {
+      2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
+      -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
+      -0.00417768164f,  0.246640727f,    1.50140941f};
+  constexpr std::array<float, 9> w_greater_than_5_constants = {
+      -0.000200214257f, 0.000100950558f, 0.00134934322f,
+      -0.00367342844f,  0.00573950773f,  -0.0076224613f,
+      0.00943887047f,   1.00167406f,     2.83297682f};
 
-    auto one = ScalarLike(x, 1.0);
-    auto w = -Log((one - x) * (one + x));
-
-    auto lt = Lt(w, ScalarLike(x, 5.0));
-    auto coefficient = [&](int i) {
-      return Select(lt,
-                    Broadcast(ScalarLike(x, w_less_than_5_constants[i]),
-                              AsInt64Slice(shape.dimensions())),
-                    Broadcast(ScalarLike(x, w_greater_than_5_constants[i]),
-                              AsInt64Slice(shape.dimensions())));
-    };
-    w = Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0));
-    auto p = coefficient(0);
-    for (int i = 1; i < kDegree; ++i) {
-      p = coefficient(i) + p * w;
-    }
-    return p * x;
-  });
+  auto one = ScalarLike(x, 1.0);
+  auto w = -Log((one - x) * (one + x));
+
+  auto lt = Lt(w, ScalarLike(x, 5.0));
+  auto coefficient = [&](int i) {
+    return Select(lt, FullLike(x, w_less_than_5_constants[i]),
+                  FullLike(x, w_greater_than_5_constants[i]));
+  };
+  w = Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0));
+  auto p = coefficient(0);
+  for (int i = 1; i < kDegree; ++i) {
+    p = coefficient(i) + p * w;
+  }
+  return p * x;
 }
 
 namespace {
@@ -170,49 +217,94 @@ static constexpr std::array<double, 8> kLanczosCoefficients = {
 // t(z) = z + kLanczosGamma + 1/2
 // A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k))
 XlaOp Lgamma(XlaOp input) {
-  XlaOp one_half = ScalarLike(input, 0.5);
-  XlaOp one = ScalarLike(input, 1);
+  auto& b = *input.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs.  (We could extend lgamma to accept complex
+    // types, but it doesn't seem necessary at this point.)
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(input));
+    if (!ShapeUtil::ElementIsFloating(shape)) {
+      return InvalidArgument(
+          "lgamma only accepts real floating-point arrays or scalars, but got "
+          "%s",
+          shape.ToString());
+    }
 
-  XlaOp pi = ScalarLike(input, M_PI);
-  XlaOp log_pi = ScalarLike(input, std::log(M_PI));
-  XlaOp log_sqrt_two_pi = ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2);
+    XlaOp one_half = ScalarLike(input, 0.5);
+    XlaOp one = ScalarLike(input, 1);
 
-  XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
-  XlaOp log_lanczos_gamma_plus_one_half =
-      ScalarLike(input, std::log(kLanczosGamma + 0.5));
+    XlaOp pi = ScalarLike(input, M_PI);
+    XlaOp log_pi = ScalarLike(input, std::log(M_PI));
+    XlaOp log_sqrt_two_pi =
+        ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2);
 
-  XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
+    XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
+    XlaOp log_lanczos_gamma_plus_one_half =
+        ScalarLike(input, std::log(kLanczosGamma + 0.5));
 
-  // If the input is less than 0.5 use Gauss's reflection formula:
-  // gamma(x) = pi / sin(pi * x) * gamma(1 - x)
-  XlaOp need_to_reflect = Lt(Real(input), one_half);
-  XlaOp z = Select(need_to_reflect, -input, input - one);
+    XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
 
-  XlaOp x = base_lanczos_coeff;
-  for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
-    XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
-    XlaOp index = ScalarLike(input, i);
-    x = x + lanczos_coefficient / (z + index + one);
-  }
+    // If the input is less than 0.5 use Euler's reflection formula:
+    // gamma(x) = pi / (sin(pi * x) * gamma(1 - x))
+    XlaOp need_to_reflect = Lt(input, one_half);
+    XlaOp z = Select(need_to_reflect, -input, input - one);
+
+    XlaOp x = base_lanczos_coeff;
+    for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
+      XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
+      XlaOp index = ScalarLike(input, i);
+      x = x + lanczos_coefficient / (z + index + one);
+    }
 
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
-  // the device.
-  // log(t) = log(kLanczosGamma + 0.5 + z)
-  //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-  XlaOp t = lanczos_gamma_plus_one_half + z;
-  XlaOp log_t =
-      log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half);
-
-  XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x);
-
-  // If z = a + 0j, the analytic continuation of log reduces to taking the
-  // absolute value of the real part.
-  // Re(log(z)) = Re(log|z| + arg(z)j)
-  //            = log|a|
-  XlaOp reflection = log_pi - Log(Abs(Sin(pi * input))) - log_y;
-  XlaOp result = Select(need_to_reflect, reflection, log_y);
-  return result;
+    // To improve accuracy on platforms with less-precise log implementations,
+    // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
+    // the device.
+    // log(t) = log(kLanczosGamma + 0.5 + z)
+    //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
+    XlaOp t = lanczos_gamma_plus_one_half + z;
+    XlaOp log_t = log_lanczos_gamma_plus_one_half +
+                  Log1p(z / lanczos_gamma_plus_one_half);
+
+    XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x);
+
+    // Compute the reflected value, used when x < 0.5:
+    //
+    //   lgamma(x) = log(pi) - lgamma(1-x) - log(abs(sin(pi * x))).
+    //
+    // (The abs is because lgamma is the log of the absolute value of the gamma
+    // function.)
+    //
+    // We have to be careful when computing the final term above. gamma(x) goes
+    // to +/-inf at every integer x < 0, and this is controlled by the
+    // sin(pi * x) term.  The slope is large, so precision is particularly
+    // important.
+    //
+    // Because abs(sin(pi * x)) has period 1, we can equivalently use
+    // abs(sin(pi * frac(x))) = sin(pi * frac(x)), where frac(x) is the
+    // fractional part of x.  This is more numerically accurate: It doesn't
+    // overflow to inf like pi * x can, and if x is an integer, it evaluates to
+    // 0 exactly, which is significant because we then take the log of this
+    // value, and log(0) is inf.
+    //
+    // We don't have a frac(x) primitive in XLA and computing it is tricky, but
+    // because abs(sin(pi * x)) = abs(sin(pi * abs(x))), it's good enough for
+    // our purposes to use abs(frac(x)) = abs(x) - floor(abs(x)).
+    //
+    XlaOp abs_input = Abs(input);
+    XlaOp reflection_denom = Log(Sin(pi * (abs_input - Floor(abs_input))));
+
+    // Avoid computing -inf - inf, which is nan.  If reflection_denom is +/-inf,
+    // then it "wins" and the result is +/-inf.
+    XlaOp reflection =
+        Select(IsFinite(reflection_denom), log_pi - reflection_denom - log_y,
+               -reflection_denom);
+    XlaOp result = Select(need_to_reflect, reflection, log_y);
+
+    // lgamma(+/-inf) = +inf.
+    XlaOp inf_bcast = FullLike(input, std::numeric_limits<float>::infinity());
+    return Select(Or(IsFinite(input),                           // is finite, or
+                     Not(Or(Lt(input, one), Ge(input, one)))),  // is nan
+                  result, inf_bcast);
+  });
 }
 
 // Compute the Digamma function using Lanczos' approximation from "A Precision
@@ -223,69 +315,96 @@ XlaOp Lgamma(XlaOp input) {
 // A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k))
 // A'(z) = sigma(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
 XlaOp Digamma(XlaOp input) {
-  XlaOp zero = ScalarLike(input, 0);
-  XlaOp one_half = ScalarLike(input, 0.5);
-  XlaOp one = ScalarLike(input, 1);
-
-  XlaOp pi = ScalarLike(input, M_PI);
-
-  XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma);
-  XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
-  XlaOp log_lanczos_gamma_plus_one_half =
-      ScalarLike(input, std::log(kLanczosGamma + 0.5));
-
-  XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
-
-  // If the input is less than 0.5 use Gauss's reflection formula:
-  // digamma(x) = digamma(1 - x) - pi * cot(pi * x)
-  XlaOp need_to_reflect = Lt(Real(input), one_half);
-  XlaOp z = Select(need_to_reflect, -input, input - one);
-
-  XlaOp num = zero;
-  XlaOp denom = base_lanczos_coeff;
-  for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
-    XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
-    XlaOp index = ScalarLike(input, i);
-    num = num - lanczos_coefficient / ((z + index + one) * (z + index + one));
-    denom = denom + lanczos_coefficient / (z + index + one);
-  }
+  auto& b = *input.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs.  (We could extend digamma to accept
+    // complex types, but it doesn't seem necessary at this point.)
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(input));
+    if (!ShapeUtil::ElementIsFloating(shape)) {
+      return InvalidArgument(
+          "digamma only accepts real floating-point arrays or scalars, but got "
+          "%s",
+          shape.ToString());
+    }
+
+    XlaOp zero = ScalarLike(input, 0);
+    XlaOp one_half = ScalarLike(input, 0.5);
+    XlaOp one = ScalarLike(input, 1);
+
+    XlaOp pi = ScalarLike(input, M_PI);
+
+    XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma);
+    XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
+    XlaOp log_lanczos_gamma_plus_one_half =
+        ScalarLike(input, std::log(kLanczosGamma + 0.5));
+
+    XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
+
+    // If the input is less than 0.5 use Euler's reflection formula:
+    // digamma(x) = digamma(1 - x) - pi * cot(pi * x)
+    XlaOp need_to_reflect = Lt(input, one_half);
+    XlaOp z = Select(need_to_reflect, -input, input - one);
+
+    XlaOp num = zero;
+    XlaOp denom = base_lanczos_coeff;
+    for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
+      XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
+      XlaOp index = ScalarLike(input, i);
+      num = num - lanczos_coefficient / ((z + index + one) * (z + index + one));
+      denom = denom + lanczos_coefficient / (z + index + one);
+    }
 
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
-  // the device.
-  // log(t) = log(kLanczosGamma + 0.5 + z)
-  //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-  XlaOp t = lanczos_gamma_plus_one_half + z;
-  XlaOp log_t =
-      log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half);
-
-  XlaOp y = log_t + num / denom - lanczos_gamma / t;
-  XlaOp reflection = y - pi * Cos(pi * input) / Sin(pi * input);
-  XlaOp result = Select(need_to_reflect, reflection, y);
-  return result;
+    // To improve accuracy on platforms with less-precise log implementations,
+    // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
+    // the device.
+    // log(t) = log(kLanczosGamma + 0.5 + z)
+    //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
+    XlaOp t = lanczos_gamma_plus_one_half + z;
+    XlaOp log_t = log_lanczos_gamma_plus_one_half +
+                  Log1p(z / lanczos_gamma_plus_one_half);
+
+    XlaOp y = log_t + num / denom - lanczos_gamma / t;
+    XlaOp reflection = y - pi * Cos(pi * input) / Sin(pi * input);
+    return Select(need_to_reflect, reflection, y);
+  });
 }
 
 // Implements Banker's rounding: numbers that are equidistant between two
 // integers are rounded towards even.
 XlaOp RoundToEven(XlaOp x) {
-  auto half = ScalarLike(x, 0.5);
-  auto one = ScalarLike(x, 1.0);
-  auto two = ScalarLike(x, 2.0);
-
-  auto round_val = Floor(x);
-  auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * Floor(half * x);
-  auto is_odd = Eq(nearest_even_int, one);
-  return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
-                round_val + one, round_val);
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs (What does it even mean to round a complex
+    // number?  Do you round each component equally?  In that case, you should
+    // just ask for that explicitly.)
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(x));
+    if (ShapeUtil::ElementIsComplex(shape)) {
+      return InvalidArgument(
+          "RoundToEven doesn't accept complex inputs, but got %s",
+          shape.ToString());
+    }
+    auto half = ScalarLike(x, 0.5);
+    auto one = ScalarLike(x, 1.0);
+    auto two = ScalarLike(x, 2.0);
+
+    auto round_val = Floor(x);
+    auto fraction = x - round_val;
+    auto nearest_even_int = round_val - two * Floor(half * x);
+    auto is_odd = Eq(nearest_even_int, one);
+    return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
+                  round_val + one, round_val);
+  });
 }
 
 // Trigonometric functions.
 
-// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
+// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1
+//           pi                                if x == -1
 XlaOp Acos(XlaOp x) {
-  return ScalarLike(x, 2.0) *
-         Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), ScalarLike(x, 1.0) + x);
+  return Select(Ne(x, FullLike(x, -1)),
+                ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x),
+                                           ScalarLike(x, 1.0) + x),
+                FullLike(x, M_PI));
 }
 
 // asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
@@ -323,9 +442,88 @@ XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    auto perform_conj = shape.element_type() == C64 && conjugate;
+    auto perform_conj =
+        primitive_util::IsComplexType(shape.element_type()) && conjugate;
     return perform_conj ? Conj(x) : x;
   });
 }
 
+XlaOp NextAfter(XlaOp from, XlaOp to) {
+  auto builder = from.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(from));
+    int bitwidth = primitive_util::BitWidth(shape.element_type());
+    auto int_type = primitive_util::UnsignedIntegralTypeForBitWidth(bitwidth);
+    auto from_as_int = BitcastConvertType(from, int_type);
+    auto to_as_int = BitcastConvertType(to, int_type);
+
+    // The result is NaN if either "from" or "to" are NaN.
+    auto from_is_nan = Ne(from, from);
+    auto to_is_nan = Ne(to, to);
+    auto nan_input = Or(from_is_nan, to_is_nan);
+    auto result_for_nan =
+        Broadcast(ScalarLike(from, std::numeric_limits<double>::quiet_NaN()),
+                  shape.dimensions());
+    result_for_nan = BitcastConvertType(result_for_nan, int_type);
+
+    // The sign bit is the MSB.
+    const int64 sign_mask = int64{1} << (bitwidth - 1);
+    // Discard the sign bit to make the result non-negative.
+    auto from_abs = And(from_as_int, ScalarLike(from_as_int, ~sign_mask));
+    auto to_abs = And(to_as_int, ScalarLike(to_as_int, ~sign_mask));
+
+    // When both "from" and "to" are equal, the result is "to".
+    // N.B. It would not make a difference if we chose the result to be "from".
+    auto from_and_to_are_equal = Eq(from_as_int, to_as_int);
+    auto result_for_equal = to_as_int;
+
+    // When both "from" and "to" are both 0, the result is "to". This ensures we
+    // get a zero signed like "to".
+    auto from_is_zero = Eq(from_abs, ZerosLike(from_abs));
+    auto to_is_zero = Eq(to_abs, ZerosLike(to_abs));
+    auto result_for_both_zero = to_as_int;
+
+    auto from_sign = And(from_as_int, ScalarLike(from_as_int, sign_mask));
+    auto to_sign = And(to_as_int, ScalarLike(to_as_int, sign_mask));
+
+    // If from == 0 && to != 0, we need to return the smallest subnormal number
+    // signed like "to".
+    auto result_for_from_zero_to_non_zero =
+        Or(to_sign, ScalarLike(from_as_int, 1));
+
+    // If the sign of "from" and "to" disagree:
+    // - we need to make the magnitude of "from" smaller so that it is closer to
+    //   zero.
+    //
+    // Otherwise the signs agree:
+    // - "from" with a magnitude larger than "to" means we need to make the
+    //   magnitude smaller.
+    // - "from" with a magnitude smaller than "to" means we need to make the
+    //   magnitude larger.
+    // - "from" with the same magnitude and sign as "to" has already been
+    //   handled.
+    auto signs_disagree = Ne(from_sign, to_sign);
+    auto from_magnitude_larger_than_to = Gt(from_abs, to_abs);
+    auto result_has_smaller_magnitude =
+        Or(from_magnitude_larger_than_to, signs_disagree);
+    auto magnitude_adjustment =
+        Select(result_has_smaller_magnitude,
+               Broadcast(ScalarLike(from_as_int, -1), shape.dimensions()),
+               Broadcast(ScalarLike(from_as_int, 1), shape.dimensions()));
+    auto result = Add(from_as_int, magnitude_adjustment);
+    // Handle from == ±0.
+    result = Select(from_is_zero,
+                    Select(to_is_zero, result_for_both_zero,
+                           result_for_from_zero_to_non_zero),
+                    result);
+    // Handle from == to.
+    result = Select(from_and_to_are_equal, result_for_equal, result);
+    // Handle isnan(from) || isnan(to).
+    result = Select(nan_input, result_for_nan, result);
+
+    // Cast back to the original type.
+    return BitcastConvertType(result, shape.element_type());
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index 17612bf9fdc0f1eabb338671c93c025c5b268872..583481c7f329fec9b7c5262e820b6796654cb7a2 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -32,7 +32,7 @@ XlaOp Square(XlaOp operand);
 // Computes the reciprocal of 'operand'.
 XlaOp Reciprocal(XlaOp operand);
 
-// Evaluates a polynomial given coefficients and `x`.
+// Evaluates a polynomial given coefficients and 'x'.
 // N.B. Coefficients should be supplied in decreasing order.
 XlaOp EvaluatePolynomial(XlaOp x, absl::Span<const float> coefficients);
 
@@ -86,10 +86,14 @@ XlaOp Cosh(XlaOp x);
 // Computes the hyperbolic sine of 'x'.
 XlaOp Sinh(XlaOp x);
 
-// Applies a complex conjugation operation if `a` is complex and `conjugate`
+// Applies a complex conjugation operation if 'a' is complex and 'conjugate'
 // is true, otherwise returns its argument.
 xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
 
+// Returns the next number after 'from' in the direction of 'to' the same way
+// std::nextafter(from, to) would.
+XlaOp NextAfter(XlaOp from, XlaOp to);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
diff --git a/tensorflow/compiler/xla/client/lib/math_exhaustive_test.cc b/tensorflow/compiler/xla/client/lib/math_exhaustive_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fb13a73b4fdce1fd92a95030135c51e13e43653
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/math_exhaustive_test.cc
@@ -0,0 +1,186 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace {
+
+using Eigen::half;
+
+struct Testcase {
+  Testcase(string name, XlaOp (*op)(XlaOp), float (*host_op)(float))
+      : name(name), op(op), host_op(host_op) {}
+
+  Testcase& set_tolerance(float abs_err, float rel_err) {
+    error.abs = abs_err;
+    error.rel = rel_err;
+    return *this;
+  }
+
+  Testcase& set_relaxed_nans() {
+    error.relaxed_nans = true;
+    return *this;
+  }
+
+  Testcase& set_fewer_infs_ok() {
+    error.fewer_infs_ok = true;
+    return *this;
+  }
+
+  Testcase& set_skip_pos_inf() {
+    skip_pos_inf = true;
+    return *this;
+  }
+
+  Testcase& set_skip_neg_inf() {
+    skip_neg_inf = true;
+    return *this;
+  }
+
+  Testcase& set_skip_infs() {
+    skip_pos_inf = true;
+    skip_neg_inf = true;
+    return *this;
+  }
+
+  Testcase& set_skip_neg_zero() {
+    skip_neg_zero = true;
+    return *this;
+  }
+
+  string name;
+  XlaOp (*op)(XlaOp);
+  float (*host_op)(float);
+
+  ErrorSpec error{0.01};
+
+  // If true, don't test +/-infinity or negative 0.
+  bool skip_pos_inf = false;
+  bool skip_neg_inf = false;
+  bool skip_neg_zero = false;
+};
+
+void PrintTo(const Testcase& tc, std::ostream* os) { *os << tc.name; }
+
+class MathExhaustiveTest : public ClientLibraryTestBase,
+                           public ::testing::WithParamInterface<Testcase> {
+ public:
+  MathExhaustiveTest() {
+    // Disable fast-math, otherwise we get the wrong results for e.g.
+    // sqrt(-inf).
+    SetFastMathDisabled(true);
+  }
+};
+
+// Checks a function's behavior on all fp16 values.
+//
+// TODO(jlebar): asin and lgamma tests fail on interpreter.
+XLA_TEST_P(MathExhaustiveTest, DISABLED_ON_INTERPRETER(F16)) {
+  const Testcase& tc = GetParam();
+  XlaBuilder b(TestName());
+
+  std::vector<half> input;
+  for (uint32 i = 0; i < 1 << 16; ++i) {
+    half h;
+    h.x = i;
+
+    // If we're not using infinity as an input, use 0 as a placeholder rather
+    // than simply skipping this element.  We do this because when the test
+    // framework reports an incorrect answer, it tells us which index failed.
+    // So long as our inputs are a simple list of all possible float16s, we can
+    // convert an index to a half with e.g. the following Python:
+    //
+    //   np.frombuffer(array('H', [12345]), dtype=np.float16)[0]
+    //
+    // but as soon as our list of inputs has any gaps, this doesn't work.
+    if (std::isinf(static_cast<float>(h)) &&
+        ((tc.skip_pos_inf && h > half{0}) ||
+         (tc.skip_neg_inf && h < half{0}))) {
+      h = half{0};
+    }
+
+    if (h == half{0} && tc.skip_neg_zero &&
+        std::signbit(static_cast<float>(h))) {
+      h = half{0};
+    }
+
+    input.push_back(h);
+  }
+
+  std::vector<half> expected_result;
+  for (const auto& h : input) {
+    expected_result.push_back(
+        static_cast<half>(tc.host_op(static_cast<float>(h))));
+  }
+
+  XlaOp param = AddParam(LiteralUtil::CreateR1<half>(input), &b);
+  tc.op(param);
+  ComputeAndCompareR1<half>(&b, expected_result, {}, tc.error);
+}
+
+// TODO(b/123355973): The following tests from math.cc are missing.
+//
+// - Many failures.
+//
+//   Testcase{"acosh", Acosh, std::acosh}.set_relaxed_nans(),
+//   Testcase{"asinh", Asinh, std::asinh},
+//   Testcase{"sinh", Sinh, std::sinh},
+//   Testcase{"cosh", Cosh, std::cosh}.set_fewer_infs_ok(),
+//   Testcase{"erf", Erf, std::erf},
+//   Testcase{"round_to_even", RoundToEven,
+//            [](float x) { return std::nearbyint(x / 2) * 2; }},
+//
+// - No equivalent std function to compare with.
+//
+//   Testcase{"erfinv", ErfInv, std::erfinv},
+//   Testcase{"digamma", Digamma, std::digamma},
+//
+// - Needs a special test (function takes two args, and simply computing in f32
+//   and downcasting to f16 doesn't give the correct answer).
+//
+//   Testcase{"nextafter", NextAfter, std::nextafter},
+//
+// TODO(b/123355973): Test math functions not from math.cc (e.g. log).
+// TODO(b/123355973): Test bf16 and f32.
+//
+INSTANTIATE_TEST_CASE_P(
+    MathExhaustiveTest_Instantiation, MathExhaustiveTest,
+    ::testing::ValuesIn(std::vector<Testcase>{
+        Testcase{"sqrt", Sqrt, std::sqrt}.set_skip_neg_inf(),
+        Testcase{"rsqrt", Rsqrt, [](float x) { return 1 / std::sqrt(x); }}
+            .set_tolerance(0.05, 0.05)
+            .set_skip_infs()
+            .set_skip_neg_zero(),
+        Testcase{"square", Square, [](float x) { return x * x; }},
+        Testcase{"reciprocal", Reciprocal, [](float x) { return 1 / x; }},
+        Testcase{"erfc", Erfc, std::erfc},
+        Testcase{"lgamma", Lgamma, std::lgamma}
+            .set_tolerance(0.1, 0.15)
+            .set_fewer_infs_ok(),
+        Testcase{"asin", Asin, std::asin}.set_skip_infs(),
+        Testcase{"acos", Acos, std::acos}.set_skip_infs(),
+        Testcase{"atan", Atan, std::atan},
+        Testcase{"tan", Tan, std::tan}.set_tolerance(0.05, 0.05),
+    }));
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index ae2ea225d1aadd7b3a794eabeca866c498f34760..c2e1251fc2fa09956b9b60d4e3e13a5d0cb61d2b 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -30,6 +30,45 @@ class MathTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.0001};
 };
 
+// Write TYPED_TESTs within the class definition so that we don't have to litter
+// "this->" everywhere.
+template <typename T>
+class MathTypedTest : public MathTest {
+ public:
+  void TestLogEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    Log(AddParam(LiteralUtil::CreateR1<T>({T{0.0}, T{-0.0}}), &b));
+    ComputeAndCompareR1<T>(&b,
+                           {-std::numeric_limits<T>::infinity(),
+                            -std::numeric_limits<T>::infinity()},
+                           {}, error_spec_);
+  }
+
+  void TestLog1pEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    Log1p(AddParam(LiteralUtil::CreateR1<T>({T{0.0}, T{-0.0}, T{-1.0}}), &b));
+    ComputeAndCompareR1<T>(
+        &b, {T{0.0}, T{-0.0}, -std::numeric_limits<T>::infinity()}, {},
+        error_spec_);
+  }
+};
+
+// TODO(b/123355973): Add bfloat16 to TestTypes once it's working.
+#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+using TestTypes = ::testing::Types<float>;
+#else
+using TestTypes = ::testing::Types<float, Eigen::half>;
+#endif
+
+TYPED_TEST_CASE(MathTypedTest, TestTypes);
+
+XLA_TYPED_TEST(MathTypedTest, LogEdgeCases) { this->TestLogEdgeCases(); }
+XLA_TYPED_TEST(MathTypedTest, Log1pEdgeCases) { this->TestLog1pEdgeCases(); }
+
 XLA_TEST_F(MathTest, SqrtF32) {
   XlaBuilder builder(TestName());
   Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F32);
@@ -106,6 +145,28 @@ XLA_TEST_F(MathTest, Lgamma) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+// TODO(jlebar): Fails on interpreter due to unimplemented operation.
+XLA_TEST_F(MathTest, DISABLED_ON_INTERPRETER(LgammaF16)) {
+  SetFastMathDisabled(true);
+
+  XlaBuilder b(TestName());
+
+  // These seemingly arbitrary inputs came from debugging the lgamma
+  // implementation against a test which tried all possible f16 values.
+  auto x = ConstantR1<half>(&b, {
+                                    half(-7360.0),
+                                    half(-4066.0),
+                                    half(-5.9605e-08),
+                                });
+  Lgamma(x);
+  std::vector<half> expected = {
+      std::numeric_limits<half>::infinity(),
+      std::numeric_limits<half>::infinity(),
+      half(16.64),
+  };
+  ComputeAndCompareR1<half>(&b, expected, {}, ErrorSpec{0.1});
+}
+
 XLA_TEST_F(MathTest, Digamma) {
   XlaBuilder builder(TestName());
   auto x = ConstantR1<float>(&builder, {1.0, 0.5, 1 / 3.0, 0.25, 1 / 6.0, 0.125,
@@ -148,5 +209,40 @@ XLA_TEST_F(MathTest, RoundToEven) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_F(MathTest, ErfRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Erf(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, ErfcRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Erfc(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, LgammaRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Lgamma(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, DigammaRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Digamma(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, RoundToEvenRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  RoundToEven(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
index ffd744d190885b8e3f4149a48a706498b3787618..a5aea96090c59c78d20cfc10a4bd6b312be592c1 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -15,24 +15,32 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 
+#include <array>
 #include <numeric>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
                      int64 n) {
-  auto a = Iota(builder, type, m);
-  auto b = Iota(builder, type, n);
+  auto a = Iota(builder, U32, m);
+  auto b = Iota(builder, U32, n);
   auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
   return ConvertElementType(indicator, type);
 }
@@ -41,7 +49,7 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
@@ -64,105 +72,251 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
   });
 }
 
-XlaOp Triangle(XlaOp x, bool lower) {
+XlaOp TriangleMask(XlaOp x, int diagonal) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
     absl::Span<const int64> major_dims =
         AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
+    auto a = Iota(builder, S32, n);
+    auto b = Iota(builder, S32, m) + ConstantR0<int32>(builder, diagonal);
     XlaOp indicator;
-    if (lower) {
-      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    } else {
-      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    }
-    auto mask = Broadcast(indicator, major_dims);
-
-    return Select(mask, x, Zeros(builder, shape));
+    indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    return Broadcast(indicator, major_dims);
   });
 }
 
+XlaOp Triangle(XlaOp x, bool lower) {
+  return lower ? Select(TriangleMask(x, 0), x, ZerosLike(x))
+               : Select(TriangleMask(x, -1), ZerosLike(x), x);
+}
+
 XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
 
 XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
 
-XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+Status ValidateEinsumNumericDimensions(absl::Span<const int64> x_config,
+                                       absl::Span<const int64> y_config,
+                                       absl::Span<const int64> output_config) {
+  for (auto dim : output_config) {
+    if (absl::c_linear_search(x_config, dim) ||
+        absl::c_linear_search(y_config, dim)) {
+      if (absl::c_count(output_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated output dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has output dimension without corresponding input dimension.");
+  }
+  for (auto dim : x_config) {
+    if (absl::c_linear_search(y_config, dim) ||
+        absl::c_linear_search(output_config, dim)) {
+      if (absl::c_count(x_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated lhs dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has lhs dimension without corresponding rhs or output "
+        "dimension.");
+  }
+  for (auto dim : y_config) {
+    if (absl::c_linear_search(x_config, dim) ||
+        absl::c_linear_search(output_config, dim)) {
+      if (absl::c_count(y_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated rhs dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has rhs dimension without corresponding lhs or output "
+        "dimension.");
+  }
+  return Status::OK();
+}
+
+xla::XlaOp Einsum(xla::XlaOp x, absl::Span<const int64> x_config, xla::XlaOp y,
+                  absl::Span<const int64> y_config,
+                  absl::Span<const int64> output_config,
+                  xla::PrecisionConfig::Precision precision) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
-    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+    TF_RETURN_IF_ERROR(
+        ValidateEinsumNumericDimensions(x_config, y_config, output_config));
+    const int64 x_rank = x_config.size();
+    const int64 y_rank = y_config.size();
+    const int64 output_rank = output_config.size();
+    absl::flat_hash_set<int64> x_map;
+    absl::flat_hash_set<int64> y_map;
+    absl::flat_hash_set<int64> output_map;
 
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) {
-      return InvalidArgument(
-          "Arguments to BatchDot have different ranks: %s vs. %s",
-          ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape));
+    auto find = [&](const absl::flat_hash_set<int64>& map, int64 d) {
+      return map.count(d) != 0;
+    };
+
+    auto insert = [&](absl::flat_hash_set<int64>& map, char d) {
+      CHECK(!find(map, d));
+      map.insert(d);
+    };
+
+    for (auto d : x_config) {
+      insert(x_map, d);
     }
-    const int ndims = ShapeUtil::Rank(x_shape);
-    if (ndims < 2) {
-      return InvalidArgument(
-          "Arguments to BatchDot must have rank >= 2: got %d", ndims);
+
+    for (auto d : y_config) {
+      insert(y_map, d);
     }
 
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> batch_dimension_numbers;
-    for (int i = 0; i < ndims - 2; ++i) {
-      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
-        return InvalidArgument(
-            "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s",
-            i, ShapeUtil::HumanString(x_shape),
-            ShapeUtil::HumanString(y_shape));
-      }
-      batch_dimension_numbers.push_back(i);
+    for (auto d : output_config) {
+      insert(output_map, d);
     }
 
-    int x_inner_dim = ndims - 1;
-    int y_inner_dim = ndims - 2;
-    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
-      return InvalidArgument(
-          "Dimensions %d and %d of arguments to BatchDot must be equal: "
-          "shapes %s vs %s",
-          x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape),
-          ShapeUtil::HumanString(y_shape));
+    DotDimensionNumbers dnums;
+    std::vector<int64> lhs_outer_dims;
+    auto is_batch_dim = [&](int64 d) {
+      return find(x_map, d) && find(y_map, d) && find(output_map, d);
+    };
+    auto is_contracting = [&](int64 d) {
+      return find(x_map, d) && find(y_map, d);
+    };
+    auto rhs_dimension_number = [&](int64 d) {
+      return absl::c_find(y_config, d) - y_config.begin();
+    };
+    for (int64 i = 0; i < x_rank; ++i) {
+      auto dim_name = x_config[i];
+      if (is_batch_dim(dim_name)) {
+        dnums.add_lhs_batch_dimensions(i);
+        dnums.add_rhs_batch_dimensions(rhs_dimension_number(dim_name));
+      } else if (is_contracting(dim_name)) {
+        dnums.add_lhs_contracting_dimensions(i);
+        dnums.add_rhs_contracting_dimensions(rhs_dimension_number(dim_name));
+      } else {
+        lhs_outer_dims.push_back(i);
+      }
     }
 
-    // Check for zero lhs/rhs dim size.
-    if (ShapeUtil::IsZeroElementArray(x_shape) ||
-        ShapeUtil::IsZeroElementArray(y_shape)) {
-      std::vector<int64> dimensions(batch_dimension_numbers.size());
-      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
-        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
+    std::vector<int64> rhs_outer_dims;
+    for (int64 i = 0; i < y_rank; ++i) {
+      auto dim_name = y_config[i];
+      if (!is_batch_dim(dim_name) && !is_contracting(dim_name)) {
+        rhs_outer_dims.push_back(i);
       }
-      int x_outer_dim = ndims - 2;
-      int y_outer_dim = ndims - 1;
-      dimensions.push_back(x_shape.dimensions(x_outer_dim));
-      dimensions.push_back(y_shape.dimensions(y_outer_dim));
-      return Broadcast(
-          ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())),
-          dimensions);
+    }
+
+    auto output_dimension_number = [&](char d) {
+      return absl::c_find(output_config, d) - output_config.begin();
+    };
+
+    std::vector<int64> output_dims;
+    output_dims.reserve(output_rank);
+    for (auto d : dnums.lhs_batch_dimensions()) {
+      output_dims.push_back(output_dimension_number(x_config[d]));
+    }
+    for (auto d : lhs_outer_dims) {
+      output_dims.push_back(output_dimension_number(x_config[d]));
+    }
+    for (auto d : rhs_outer_dims) {
+      output_dims.push_back(output_dimension_number(y_config[d]));
+    }
+
+    std::vector<int64> transpose_dims(output_rank);
+    for (int64 i = 0; i < output_rank; ++i) {
+      transpose_dims[output_dims[i]] = i;
     }
 
     PrecisionConfig precision_proto;
     precision_proto.add_operand_precision(precision);
     precision_proto.add_operand_precision(precision);
+    return Transpose(DotGeneral(x, y, dnums, &precision_proto), transpose_dims);
+  });
+}
+
+XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+
+    // The batch dimensions must be equal and the matrix dimensions must be
+    // valid.
+    std::vector<int64> batch_dimension_numbers;
+    const int ndims = x_shape.rank();
+    batch_dimension_numbers.reserve(ndims - 2);
+    for (int i = 0; i < ndims - 2; ++i) {
+      batch_dimension_numbers.push_back(i);
+    }
+    std::vector<int64> x_config = batch_dimension_numbers;
+    x_config.push_back(ndims - 2);
+    x_config.push_back(ndims);
+    std::vector<int64> y_config = batch_dimension_numbers;
+    y_config.push_back(ndims);
+    y_config.push_back(ndims - 1);
+    std::vector<int64> output_config = batch_dimension_numbers;
+    output_config.push_back(ndims - 2);
+    output_config.push_back(ndims - 1);
+    return Einsum(x, x_config, y, y_config, output_config, precision);
+  });
+}
 
-    DotDimensionNumbers dot_dnums;
-    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
-    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
-    for (auto batch_dimension_number : batch_dimension_numbers) {
-      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
-      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+StatusOr<std::array<std::vector<int64>, 3>> ParseEinsumString(
+    absl::string_view einsum_config) {
+  std::array<std::vector<int64>, 3> einsum_config_numeric;
+  std::vector<absl::string_view> main_split =
+      absl::StrSplit(einsum_config, ',');
+
+  if (main_split.size() != 2) {
+    return InvalidArgument("Expected one \",\" in einsum_config.");
+  }
+
+  auto maybe_invalid_character = [](char d) {
+    if (absl::ascii_isalpha(d)) {
+      return Status::OK();
     }
+    if (d == '.') {
+      return InvalidArgument("Unsupported \"...\" or \".\" in einsum config.");
+    }
+    return InvalidArgument("Unexpected character in einsum config.");
+  };
 
-    return DotGeneral(x, y, dot_dnums, &precision_proto);
+  auto& x_config = einsum_config_numeric[0];
+  x_config.reserve(main_split[0].size());
+  for (auto d : main_split[0]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    x_config.push_back(static_cast<int64>(d));
+  }
+  std::vector<absl::string_view> y_output_split =
+      absl::StrSplit(main_split[1], "->");
+  if (y_output_split.size() != 2) {
+    return InvalidArgument("Expected one \"->\" in einsum_config.");
+  }
+  auto& y_config = einsum_config_numeric[1];
+  y_config.reserve(y_output_split[0].size());
+  for (auto d : y_output_split[0]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    y_config.push_back(static_cast<int64>(d));
+  }
+  auto& output_config = einsum_config_numeric[2];
+  output_config.reserve(y_output_split[1].size());
+  for (auto d : y_output_split[1]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    output_config.push_back(static_cast<int64>(d));
+  }
+  return einsum_config_numeric;
+}
+
+XlaOp Einsum(XlaOp x, XlaOp y, absl::string_view einsum_config,
+             PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto einsum_config_numeric,
+                        ParseEinsumString(einsum_config));
+    return Einsum(x, einsum_config_numeric[0], y, einsum_config_numeric[1],
+                  einsum_config_numeric[2], precision);
   });
 }
 
@@ -170,7 +324,7 @@ XlaOp TransposeInMinorDims(XlaOp x) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_dims >= 2);
     std::vector<int64> permutation(n_dims);
     std::iota(permutation.begin(), permutation.end(), 0);
diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h
index 8856f99c7a0fee8f315aac11fab392cf5536f57b..491f1eab4cbffbbf9df70d4c35a61351df3e98aa 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 
+#include <array>
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -31,6 +35,10 @@ XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
 // diagonal elements (i.e., with indices [..., i, i]).
 XlaOp GetMatrixDiagonal(XlaOp x);
 
+// Returns a lower-triangular mask, i.e., true below the `diagonal`-th diagonal
+// and false above that diagonal.
+XlaOp TriangleMask(XlaOp x, int diagonal);
+
 // Get the upper or lower triangle part of the last two dimensions
 XlaOp Triangle(XlaOp x, bool lower);
 
@@ -61,6 +69,40 @@ xla::XlaOp BatchDot(
     xla::XlaOp x, xla::XlaOp y,
     xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
 
+// Parse an einsum string into dimension numbers:
+//   "ab,cb->ac"
+// becomes:
+//   {{0, 1},{2, 1},{0, 2}}
+//
+// NOTE: This function is meant for testing, there is no need to call it
+// directly.
+
+StatusOr<std::array<std::vector<int64>, 3>> ParseEinsumString(
+    absl::string_view einsum_config);
+
+// Determine if each dimension label is in at least two inputs.
+//
+// NOTE: This function is meant for testing, there is no need to call it
+// directly.
+Status ValidateEinsumNumericDimensions(absl::Span<const int64> x_config,
+                                       absl::Span<const int64> y_config,
+                                       absl::Span<const int64> output_config);
+
+// Supports two operand einsum notation like "ab,cb->ac".
+xla::XlaOp Einsum(
+    xla::XlaOp x, xla::XlaOp y, absl::string_view einsum_config,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
+// Same as above but supporting numeric labels on dimensins. So "ab,cb->ac"
+// becomes:
+//   x_config = {0, 1}
+//   y_config = {2, 1}
+//   output_config = {0, 2}
+xla::XlaOp Einsum(
+    xla::XlaOp x, absl::Span<const int64> x_config, xla::XlaOp y,
+    absl::Span<const int64> y_config, absl::Span<const int64> output_config,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
 // Transposes a stack of matrices `x` by swapping the last two dimensions.
 xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
 
diff --git a/tensorflow/compiler/xla/client/lib/matrix_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
index 0593a7517ac125ca8dc5395cee76f6bc23232cd3..79cf529ee94b044ee0af788522200cd28c778997 100644
--- a/tensorflow/compiler/xla/client/lib/matrix_test.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -15,8 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -101,5 +104,78 @@ XLA_TEST_F(MatrixTest, RowBatchDot) {
   ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
                              {a_data.get(), row_data.get(), index_data.get()});
 }
+
+XLA_TEST_F(MatrixTest, Einsum) {
+  XlaBuilder builder(TestName());
+
+  int n = 4;
+
+  XlaOp a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
+
+  auto l_index = DynamicSliceInMinorDims(
+      a, {index, ConstantR0<int32>(&builder, 0)}, {1, n});
+  Einsum(l_index, row, "abc,adc->abd");
+
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(MatrixTest, ParseEinsumString) {
+  auto to_vec = [](absl::string_view s) {
+    std::vector<int64> v;
+    v.reserve(s.size());
+    for (auto c : s) {
+      v.push_back(int64{c});
+    }
+    return v;
+  };
+
+  auto to_string = [&](absl::string_view x, absl::string_view y,
+                       absl::string_view o) {
+    return absl::StrCat(x, ",", y, "->", o);
+  };
+
+  std::vector<std::vector<string>> good_test_cases = {{"ab", "bc", "ac"},
+                                                      {"Bab", "Bbc", "Bac"},
+                                                      {"ab", "cd", "dcba"},
+                                                      {"abc", "abd", "cbd"}};
+  for (auto test_case : good_test_cases) {
+    auto parse_result_or_status =
+        ParseEinsumString(to_string(test_case[0], test_case[1], test_case[2]));
+    EXPECT_TRUE(parse_result_or_status.status().ok());
+    auto parse_result = parse_result_or_status.ValueOrDie();
+    for (int i = 0; i < 3; ++i) {
+      EXPECT_EQ(parse_result[i], to_vec(test_case[i]));
+    }
+    EXPECT_TRUE(ValidateEinsumNumericDimensions(
+                    parse_result[0], parse_result[1], parse_result[2])
+                    .ok());
+  }
+
+  std::vector<string> einsum_strings_that_fail_parsing = {
+      "", "a", "ab->ba", "ab,bc,cd->ad", "a...b,bc->a...c"};
+  for (auto test_case : einsum_strings_that_fail_parsing) {
+    auto parse_result_or_status = ParseEinsumString(test_case);
+    EXPECT_FALSE(parse_result_or_status.status().ok());
+  }
+
+  std::vector<string> einsum_strings_that_fail_numeric_validation = {
+      "a,b->c", "ab,bc->acd", "abz,bc->ac", "ab,bcz->ac"};
+  for (auto test_case : einsum_strings_that_fail_numeric_validation) {
+    auto parse_result_or_status = ParseEinsumString(test_case);
+    EXPECT_TRUE(parse_result_or_status.status().ok());
+    auto parse_result = parse_result_or_status.ValueOrDie();
+    EXPECT_FALSE(ValidateEinsumNumericDimensions(
+                     parse_result[0], parse_result[1], parse_result[2])
+                     .ok());
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc
index 72ca653173b78d9338f632c41779f2a30db1e978..640412ec8bcffd2565b11ba25b87f6bf6438d848 100644
--- a/tensorflow/compiler/xla/client/lib/qr.cc
+++ b/tensorflow/compiler/xla/client/lib/qr.cc
@@ -154,7 +154,7 @@ struct QRBlockResult {
 StatusOr<QRBlockResult> QRBlock(XlaOp a, PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
-  const int num_dims = ShapeUtil::Rank(a_shape);
+  const int num_dims = a_shape.rank();
   if (num_dims < 2) {
     return InvalidArgument("Argument to QR must have rank >= 2; got shape %s",
                            a_shape.ToString());
@@ -325,7 +325,7 @@ StatusOr<QRDecompositionResult> QRDecomposition(
     PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
-  const int num_dims = ShapeUtil::Rank(a_shape);
+  const int num_dims = a_shape.rank();
   if (num_dims < 2) {
     return InvalidArgument("Arguments to QR must have rank >= 2: got shape %s",
                            a_shape.ToString());
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
index f8c7df3ff5189c817202eaf39adb572f7e232ec2..77145ba7d4c72435450d3e33d57b2507eb84d2fc 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace xla {
 
@@ -26,7 +27,7 @@ XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
 
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
 
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_minor_dims <= n_dims);
     auto major_dims = AsInt64Slice(shape.dimensions())
                           .subspan(
@@ -51,17 +52,17 @@ XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
 XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = shape.rank();
+    TF_RET_CHECK(start.size() == n_dims);
+
     // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
     std::vector<int32> start_as_int32(start.begin(), start.end());
-    auto start_constant = ConstantR1<int32>(builder, start_as_int32);
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_ASSIGN_OR_RETURN(Shape start_constant_shape,
-                        builder->GetShape(start_constant));
-    const int64 start_length =
-        ShapeUtil::GetDimension(start_constant_shape, -1);
-    TF_RET_CHECK(start_length == n_dims);
-    return DynamicUpdateSlice(x, update, start_constant);
+    std::vector<XlaOp> start_ops(start.size());
+    for (int i = 0; i < start.size(); ++i) {
+      start_ops[i] = ConstantR0(builder, start_as_int32[i]);
+    }
+    return DynamicUpdateSlice(x, update, start_ops);
   });
 }
 
@@ -70,7 +71,7 @@ XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     const int64 n_minor_dims = start.size();
     TF_RET_CHECK(n_minor_dims <= n_dims);
     std::vector<int64> padded_start(n_dims, 0);
@@ -90,18 +91,17 @@ std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
   return output;
 }
 
-XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span<const XlaOp> starts) {
+StatusOr<std::vector<XlaOp>> PrependZerosInMajorDims(
+    XlaOp x, absl::Span<const XlaOp> starts) {
   XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    auto zero = Reshape(ConstantR0<int32>(builder, 0), {1});
-    std::vector<XlaOp> padded_starts(n_dims, zero);
-    for (int i = 0; i < starts.size(); ++i) {
-      padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1});
-    }
-    return ConcatInDim(builder, padded_starts, 0);
-  });
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+  const int64 n_dims = shape.rank();
+  auto zero = ConstantR0<int32>(builder, 0);
+  std::vector<XlaOp> padded_starts(n_dims, zero);
+  for (int i = 0; i < starts.size(); ++i) {
+    padded_starts[n_dims - starts.size() + i] = starts[i];
+  }
+  return padded_starts;
 }
 
 }  // namespace
@@ -111,7 +111,7 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     int64 n_minor_dims = starts.size();
     TF_RET_CHECK(n_minor_dims == sizes.size());
     TF_RET_CHECK(n_minor_dims <= n_dims);
@@ -119,7 +119,7 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
                           .subspan(
                               /*pos=*/0,
                               /*len=*/n_dims - sizes.size());
-    auto padded_starts = PrependZerosInMajorDims(x, starts);
+    TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(x, starts));
     auto padded_sizes = ConcatVectors(major_dims, sizes);
     return DynamicSlice(x, padded_starts, padded_sizes);
   });
@@ -127,8 +127,11 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
 
 XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
                                     absl::Span<const XlaOp> starts) {
-  auto padded_starts = PrependZerosInMajorDims(x, starts);
-  return DynamicUpdateSlice(x, update, padded_starts);
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(x, starts));
+    return DynamicUpdateSlice(x, update, padded_starts);
+  });
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index 27ff36c7491ab8397d46f3a49493ff2b904deb2d..0fbd138aca1e86f219d0459086fc09d20844f135 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -77,7 +77,7 @@ XLA_TEST_F(SortingTest, TopKFullSort) {
   auto x = ConstantR1<float>(&builder, inputs);
   xla::GetTupleElement(xla::TopK(x, kSize), 0);
 
-  std::sort(inputs.begin(), inputs.end(), std::greater<float>());
+  absl::c_sort(inputs, std::greater<float>());
   ComputeAndCompareR1<float>(&builder, inputs, {});
 }
 
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 5db9d10dff4c50d71cde934b3f3c345bee571f29..9f520bcdadfabc8ca9f9ee82b20804fd2c50d1db 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -34,7 +34,7 @@ namespace {
 // specified shape. In case of a (nested) tuple shape this is the total byte
 // size of all sub-shapes within the tuple.
 int64 DataSizeOfShape(const Shape& shape) {
-  if (ShapeUtil::IsArray(shape)) {
+  if (shape.IsArray()) {
     return ShapeUtil::ByteSizeOf(shape);
   }
 
@@ -47,7 +47,7 @@ int64 DataSizeOfShape(const Shape& shape) {
 
 // Creates a XlaOp for an op what generates fake data with the given shape.
 XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) {
-  if (ShapeUtil::IsArray(shape)) {
+  if (shape.IsArray()) {
     return Broadcast(
         ConstantLiteral(builder, LiteralUtil::One(shape.element_type())),
         AsInt64Slice(shape.dimensions()));
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.cc b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
index 159e0c82dc4ff123533b65baac99388591c400d7..ba7fde118fde990fbb4aa9a34dd0f0e67ff5a93b 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
@@ -38,7 +38,7 @@ XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(a));
-    int ndims = ShapeUtil::Rank(shape);
+    int ndims = shape.rank();
     int64 n = ShapeUtil::GetDimension(shape, -1);
     int64 num_blocks = n / block_size;
 
@@ -140,9 +140,7 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
     // zero (which can happen if the last block was padded) otherwise it will
     // introduce nans which will propagate
     auto diags = GetMatrixDiagonal(diag_blocks);
-    TF_ASSIGN_OR_RETURN(Shape diags_shape, builder->GetShape(diags));
-    auto one = ScalarLike(diags, 1);
-    auto ones = Broadcast(one, AsInt64Slice(diags_shape.dimensions()));
+    auto ones = FullLike(diags, 1);
     diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
     auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
 
@@ -165,10 +163,10 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
     // The first or last  diagonal element should be set to 1 instead of -1
     // though, since we never update it
     auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
-    auto start_index = (lower) ? 0 : block_size - 1;
-    auto output_block = DynamicUpdateSlice(
-        neg_identity, pos_one,
-        /*start_indices=*/ConstantR1<int>(builder, 2, start_index));
+    auto start_index = ConstantR0<int>(builder, (lower) ? 0 : block_size - 1);
+    auto output_block =
+        DynamicUpdateSlice(neg_identity, pos_one,
+                           /*start_indices=*/{start_index, start_index});
 
     // Broadcast diag([1, -1, -1, ...]) to every block
     XlaOp output = Broadcast(output_block,
@@ -211,12 +209,10 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
       auto body_out = GetTupleElement(input_tuple, 1);
       auto body_input = GetTupleElement(input_tuple, 2);
 
-      auto zero = ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto zero = ConstantR0<int32>(bodyb.get(), 0);
       auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
-      auto start_indices =
-          ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
       auto input_row =
-          DynamicSlice(body_input, start_indices,
+          DynamicSlice(body_input, {zero, j, zero},
                        /*slice_sizes=*/{num_blocks, 1, block_size});
 
       // We want -L21 L11^{-1}
@@ -230,7 +226,7 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
       precision_proto.add_operand_precision(precision);
       auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
 
-      body_out = DynamicUpdateSlice(body_out, update, start_indices);
+      body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero});
 
       auto next_i = i + ScalarLike(i, 1);
       Tuple(bodyb.get(), {next_i, body_out, body_input});
@@ -262,7 +258,7 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
     int64 block_size = ShapeUtil::GetDimension(blocks_shape, -1);
 
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
-    int64 ndims = ShapeUtil::Rank(a_shape);
+    int64 ndims = a_shape.rank();
     int64 n = ShapeUtil::GetDimension(a_shape, -1);
     int64 num_blocks = n / block_size + (n % block_size != 0);
     int64 m_dim = (left_side) ? -1 : -2;
@@ -356,13 +352,13 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
     TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
-    if (ShapeUtil::Rank(a_shape) != ShapeUtil::Rank(b_shape)) {
+    if (a_shape.rank() != b_shape.rank()) {
       return InvalidArgument(
           "Arguments to TriangularSolve have shapes with different ranks: "
           "%s vs. %s",
           ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
     }
-    const int64 ndims = ShapeUtil::Rank(a_shape);
+    const int64 ndims = a_shape.rank();
     if (ndims < 2) {
       return InvalidArgument(
           "Arguments to TriangularSolve was rank %d but must have rank >= 2.",
@@ -417,6 +413,11 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
     auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a,
                                                 conjugate_a, precision);
 
+    // Mask off the ignored elements of the triangular matrix a.
+    // TODO(phawkins): it would probably be preferable to perform this masking
+    // block by block inside SolveWithInvertedDiagonalBlocks.
+    a = Triangle(a, lower);
+
     // We now find the solution using GEMMs
     auto x =
         SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower,
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
index 3fea627e6a8c30b6f06fa61751aad386ec543843..284a2e9d183a6a7923fb59ac134ce3b3a3a96e35 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -37,12 +38,20 @@ namespace {
 using TriangularSolveTest = ClientLibraryTestBase;
 using TriangularSolveLeftLookingTest = ClientLibraryTestBase;
 
+static constexpr float kNan = std::numeric_limits<float>::quiet_NaN();
+
 Array2D<float> AValsLower() {
-  return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+  return {{2, kNan, kNan, kNan},
+          {3, 6, kNan, kNan},
+          {4, 7, 9, kNan},
+          {5, 8, 10, 11}};
 }
 
 Array2D<float> AValsUpper() {
-  return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}};
+  return {{2, 3, 4, 5},
+          {kNan, 6, 7, 8},
+          {kNan, kNan, 9, 10},
+          {kNan, kNan, kNan, 11}};
 }
 
 Array2D<float> BValsRight() {
@@ -53,18 +62,20 @@ Array2D<float> BValsLeft() {
   return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
 }
 
+static constexpr complex64 kNanC64 = complex64(kNan, kNan);
+
 Array2D<complex64> AValsLowerComplex() {
-  return {{2, 0, 0, 0},
-          {complex64(3, 1), 6, 0, 0},
-          {4, complex64(7, 2), 9, 0},
+  return {{2, kNanC64, kNanC64, kNanC64},
+          {complex64(3, 1), 6, kNanC64, kNanC64},
+          {4, complex64(7, 2), 9, kNanC64},
           {5, 8, complex64(10, 3), 11}};
 }
 
 Array2D<complex64> AValsUpperComplex() {
   return {{2, 3, complex64(4, 3), 5},
-          {0, 6, complex64(7, 2), 8},
-          {0, 0, complex64(9, 1), 10},
-          {0, 0, 0, 11}};
+          {kNanC64, 6, complex64(7, 2), 8},
+          {kNanC64, kNanC64, complex64(9, 1), 10},
+          {kNanC64, kNanC64, kNanC64, 11}};
 }
 
 Array2D<complex64> BValsRightComplex() {
@@ -367,5 +378,70 @@ XLA_TEST_F(TriangularSolveTest, BatchedLeftUpper) {
                              ErrorSpec(1e-2, 1e-2));
 }
 
+struct TriangularSolveTestSpec {
+  int m, n;  // A is mxm, B is mxn
+  bool left_side;
+  bool lower;
+  bool transpose_a;
+};
+
+class TriangularSolveParametricTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<TriangularSolveTestSpec> {};
+
+XLA_TEST_P(TriangularSolveParametricTest, Random) {
+  TriangularSolveTestSpec spec = GetParam();
+
+  XlaBuilder builder(TestName());
+
+  Array2D<float> avals(spec.m, spec.m);
+  avals.FillRandom(1.0);
+  for (int i = 0; i < spec.m; ++i) {
+    avals(i, i) += 10;
+  }
+
+  std::pair<int, int> bdims = spec.left_side ? std::make_pair(spec.m, spec.n)
+                                             : std::make_pair(spec.n, spec.m);
+  Array2D<float> bvals(bdims.first, bdims.second);
+  bvals.FillRandom(1.0);
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(avals, 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(bvals, 1, "b", &builder, &b);
+  auto x = TriangularSolve(a, b, spec.left_side, spec.lower, spec.transpose_a,
+                           /*conjugate_a=*/false,
+                           /*block_size=*/3);
+  auto a_tri = Triangle(a, spec.lower);
+  a_tri = MaybeTransposeInMinorDims(a_tri, spec.transpose_a);
+  if (spec.left_side) {
+    BatchDot(a_tri, x);
+  } else {
+    BatchDot(x, a_tri);
+  }
+
+  ComputeAndCompareR2<float>(&builder, bvals, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+std::vector<TriangularSolveTestSpec> TriangularSolveTests() {
+  std::vector<TriangularSolveTestSpec> specs;
+  for (int m : {5, 10}) {
+    for (int n : {5, 10}) {
+      for (bool left_side : {false, true}) {
+        for (bool lower : {false, true}) {
+          for (bool transpose_a : {false, true}) {
+            specs.push_back({m, n, left_side, lower, transpose_a});
+          }
+        }
+      }
+    }
+  }
+  return specs;
+}
+
+INSTANTIATE_TEST_SUITE_P(TriangularSolveParametricTestInstantiation,
+                         TriangularSolveParametricTest,
+                         ::testing::ValuesIn(TriangularSolveTests()));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 049cd15738a619294b19d5cf74ca514d7b4a00ad..48b5f94538f453785194bc434a91ee0a10c020c2 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -164,9 +164,8 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
   //    ExecutableRunOptions.eigen_intra_op_thread_pool.
   // *) The thread pool used for XLA CPU ops is from
   //    backend_->eigen_intra_op_thread_pool().
-  ServiceExecutableRunOptions service_options(
-      run_options, backend_->StreamBorrower(),
-      backend_->eigen_intra_op_thread_pool());
+  ServiceExecutableRunOptions service_options(run_options,
+                                              backend_->StreamBorrower());
 
   if (executable_->dumping_snapshot()) {
     return ExecuteAndDump(&service_options, arguments);
diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc
index fb9ea6ec3fc41d5e04ca125798a8199350470a44..b9bff06cbdbc3525eb19d5df885952c3971d9d6a 100644
--- a/tensorflow/compiler/xla/client/sharding_builder.cc
+++ b/tensorflow/compiler/xla/client/sharding_builder.cc
@@ -50,7 +50,7 @@ OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles) {
   OpSharding result;
   result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
 
-  CHECK_EQ(ShapeUtil::Rank(tile_shape), 1);
+  CHECK_EQ(tile_shape.rank(), 1);
   std::vector<int64> dimensions(1, num_tiles);
   *result.mutable_tile_shape() = tile_shape.ToProto();
   auto& tile_dimension =
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 622fc158e11161b5b1167ccb432f51775767e3a1..5c9f9f708883f458b67205058fc7c1e1e2ad02f5 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -29,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/sharding_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -192,9 +195,9 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(XlaOp root) const {
 }
 
 void XlaBuilder::IsConstantVisitor(const int64 op_handle,
-                                   std::set<int64>* visited,
+                                   absl::flat_hash_set<int64>* visited,
                                    bool* is_constant) const {
-  if (visited->count(op_handle) != 0 || !*is_constant) {
+  if (visited->contains(op_handle) || !*is_constant) {
     return;
   }
 
@@ -208,11 +211,21 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
       }
       // TODO(b/32495713): We aren't checking the called computations.
       break;
+    case HloOpcode::kGetDimensionSize: {
+      int64 dimension_number = instr.dimensions(0);
+      const HloInstructionProto& operand =
+          *(LookUpInstructionByHandle(instr.operand_ids(0)).ValueOrDie());
+      Shape operand_shape(operand.shape());
+      if (operand_shape.is_dynamic_dimension(dimension_number)) {
+        *is_constant = false;
+      }
+      break;
+    }
 
     // Non functional ops.
     case HloOpcode::kRng:
     case HloOpcode::kAllReduce:
-      // TODO(b/33009255): Implmement constant folding for cross replica sum.
+      // TODO(b/33009255): Implement constant folding for cross replica sum.
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kCall:
@@ -244,6 +257,29 @@ Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
                                      int64 target_param_num,
                                      ShapeIndex target_param_index,
                                      int64 target_dim_num) {
+  bool param_exists = false;
+  for (HloInstructionProto& instr : instructions_) {
+    if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
+        instr.parameter_number() == target_param_num) {
+      param_exists = true;
+      Shape param_shape(instr.shape());
+      Shape* param_shape_ptr = &param_shape;
+      for (int64 index : target_param_index) {
+        param_shape_ptr = param_shape_ptr->mutable_tuple_shapes(index);
+      }
+      param_shape_ptr->set_dynamic_dimension(target_dim_num,
+                                             /*is_dynamic=*/true);
+      *instr.mutable_shape() = param_shape.ToProto();
+    }
+  }
+
+  if (!param_exists) {
+    return InvalidArgument(
+        "Asked to mark parameter %lld as dynamic sized parameter, but the "
+        "doesn't exists",
+        target_param_num);
+  }
+
   TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind(
       DynamicParameterBinding::DynamicParameter{dynamic_size_param_num,
                                                 dynamic_size_param_index},
@@ -263,29 +299,52 @@ XlaComputation XlaBuilder::BuildAndNoteError() {
   return build_status.ConsumeValueOrDie();
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build() {
+StatusOr<XlaComputation> XlaBuilder::Build(bool remove_dynamic_dimensions) {
   if (!first_error_.ok()) {
     string backtrace;
     first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
     return AppendStatus(first_error_, backtrace);
   }
-  return Build(instructions_.back().id());
+  return Build(instructions_.back().id(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root) {
+StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root,
+                                           bool remove_dynamic_dimensions) {
   if (root.builder_ != this) {
     return InvalidArgument("Given root operation is not in this computation.");
   }
-  return Build(root.handle());
+  return Build(root.handle(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
+StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id,
+                                           bool remove_dynamic_dimensions) {
   if (!first_error_.ok()) {
     string backtrace;
     first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
     return AppendStatus(first_error_, backtrace);
   }
 
+  // TODO(b/121223198): XLA backend cannot handle dynamic dimensions yet, remove
+  // all dynamic dimensions before building xla program until we have support in
+  // the backend.
+  if (remove_dynamic_dimensions) {
+    std::function<void(ShapeProto*)> remove_dynamic_dimension =
+        [&](ShapeProto* shape) {
+          if (shape->tuple_shapes_size() != 0) {
+            for (int64 i = 0; i < shape->tuple_shapes_size(); ++i) {
+              remove_dynamic_dimension(shape->mutable_tuple_shapes(i));
+            }
+          }
+          for (int64 i = 0; i < shape->dimensions_size(); ++i) {
+            shape->set_is_dynamic_dimension(i, false);
+          }
+        };
+
+    for (auto& instruction : instructions_) {
+      remove_dynamic_dimension(instruction.mutable_shape());
+    }
+  }
+
   HloComputationProto entry;
   SetProtoIdAndName(&entry, name_, kNameSeparator, GetNextId());
   TF_ASSIGN_OR_RETURN(ProgramShape program_shape, GetProgramShape(root_id));
@@ -310,7 +369,10 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
     module->add_computations()->Swap(&e.second);
   }
   module->add_computations()->Swap(&entry);
-
+  if (!input_output_aliases_.empty()) {
+    TF_RETURN_IF_ERROR(
+        PopulateInputOutputAlias(module, program_shape, input_output_aliases_));
+  }
   *(module->mutable_dynamic_parameter_binding()) =
       dynamic_parameter_binding_.ToProto();
 
@@ -323,6 +385,35 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   return std::move(computation);
 }
 
+/* static */ Status XlaBuilder::PopulateInputOutputAlias(
+    HloModuleProto* module, const ProgramShape& program_shape,
+    const std::vector<InputOutputAlias>& input_output_aliases) {
+  HloInputOutputAliasConfig config(program_shape.result());
+  for (auto& alias : input_output_aliases) {
+    // The HloInputOutputAliasConfig does not do parameter validation as it only
+    // carries the result shape. Maybe it should be constructed with a
+    // ProgramShape to allow full validation. We will still get an error when
+    // trying to compile the HLO module, but would be better to have validation
+    // at this stage.
+    if (alias.param_number >= program_shape.parameters_size()) {
+      return InvalidArgument("Invalid parameter number %ld (total %ld)",
+                             alias.param_number,
+                             program_shape.parameters_size());
+    }
+    const Shape& parameter_shape = program_shape.parameters(alias.param_number);
+    if (!ShapeUtil::IndexIsValid(parameter_shape, alias.param_index)) {
+      return InvalidArgument("Invalid parameter %ld index: %s",
+                             alias.param_number,
+                             alias.param_index.ToString().c_str());
+    }
+    TF_RETURN_IF_ERROR(config.SetUpAlias(
+        alias.output_index, alias.param_number, alias.param_index,
+        HloInputOutputAliasConfig::AliasKind::kUserAlias));
+  }
+  *module->mutable_input_output_alias() = config.ToProto();
+  return Status::OK();
+}
+
 StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
     const Shape& shape, const XlaOp& operand,
     absl::Span<const int64> broadcast_dimensions) {
@@ -343,7 +434,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
   TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
 
   CHECK(ShapeUtil::IsScalar(operand_shape) ||
-        ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape));
+        operand_shape.rank() == output_shape.rank());
   Shape broadcast_shape =
       ShapeUtil::ChangeElementType(output_shape, operand_shape.element_type());
 
@@ -355,7 +446,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
   // Do explicit broadcast for degenerate broadcast.
   std::vector<int64> broadcast_dimensions;
   std::vector<int64> reshaped_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(operand_shape); i++) {
+  for (int i = 0; i < operand_shape.rank(); i++) {
     if (operand_shape.dimensions(i) == output_shape.dimensions(i)) {
       broadcast_dimensions.push_back(i);
       reshaped_dimensions.push_back(operand_shape.dimensions(i));
@@ -398,8 +489,8 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
                             binop, lhs_shape, rhs_shape, broadcast_dimensions));
     *instr.mutable_shape() = shape.ToProto();
 
-    const int64 lhs_rank = ShapeUtil::Rank(lhs_shape);
-    const int64 rhs_rank = ShapeUtil::Rank(rhs_shape);
+    const int64 lhs_rank = lhs_shape.rank();
+    const int64 rhs_rank = rhs_shape.rank();
 
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
@@ -410,17 +501,19 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
       const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
 
       std::vector<int64> to_size;
-      for (int64 size : shape.dimensions()) {
-        to_size.push_back(size);
+      std::vector<bool> to_size_is_dynamic;
+      for (int i = 0; i < shape.rank(); i++) {
+        to_size.push_back(shape.dimensions(i));
+        to_size_is_dynamic.push_back(shape.is_dynamic_dimension(i));
       }
-      for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape);
-           from_dim++) {
+      for (int64 from_dim = 0; from_dim < from_shape.rank(); from_dim++) {
         int64 to_dim = broadcast_dimensions[from_dim];
         to_size[to_dim] = from_shape.dimensions(from_dim);
+        to_size_is_dynamic[to_dim] = from_shape.is_dynamic_dimension(from_dim);
       }
 
-      const Shape& broadcasted_shape =
-          ShapeUtil::MakeShape(from_shape.element_type(), to_size);
+      const Shape& broadcasted_shape = ShapeUtil::MakeShape(
+          from_shape.element_type(), to_size, to_size_is_dynamic);
       TF_ASSIGN_OR_RETURN(
           XlaOp broadcasted_operand,
           InDimBroadcast(broadcasted_shape, from, broadcast_dimensions));
@@ -458,18 +551,18 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
     XlaOp updated_ehs = ehs;
-    if (!ShapeUtil::IsTuple(shape)) {
-      if (!ShapeUtil::IsTuple(lhs_shape) &&
+    if (!shape.IsTuple()) {
+      if (!lhs_shape.IsTuple() &&
           !ShapeUtil::SameDimensions(shape, lhs_shape)) {
         // lhs is being implicitly broadcasted. Change to explicit.
         TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(shape, lhs));
       }
-      if (!ShapeUtil::IsTuple(rhs_shape) &&
+      if (!rhs_shape.IsTuple() &&
           !ShapeUtil::SameDimensions(shape, rhs_shape)) {
         // rhs is being implicitly broadcasted. Change to explicit.
         TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(shape, rhs));
       }
-      if (!ShapeUtil::IsTuple(ehs_shape) &&
+      if (!ehs_shape.IsTuple() &&
           !ShapeUtil::SameDimensions(shape, ehs_shape)) {
         // ehs is being implicitly broadcasted. Change to explicit.
         TF_ASSIGN_OR_RETURN(updated_ehs, AddBroadcastSequence(shape, ehs));
@@ -563,10 +656,10 @@ XlaOp XlaBuilder::Broadcast(const XlaOp& operand,
     // output, so to append dimensions on the left the instruction's dimensions
     // should just be the n highest dimension numbers of the output shape where
     // n is the number of input dimensions.
-    const int64 operand_rank = ShapeUtil::Rank(operand_shape);
+    const int64 operand_rank = operand_shape.rank();
     std::vector<int64> dimensions(operand_rank);
     for (int i = 0; i < operand_rank; ++i) {
-      dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank;
+      dimensions[i] = i + shape.rank() - operand_rank;
     }
     return InDimBroadcast(shape, operand, dimensions);
   });
@@ -579,8 +672,17 @@ XlaOp XlaBuilder::BroadcastInDim(
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     // Output shape, in the case of degenerate broadcast, the out_dim_size is
     // not necessarily the same as the dimension sizes of the output shape.
-    const auto& output_shape =
+    auto output_shape =
         ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size);
+    for (int i = 0; i < broadcast_dimensions.size(); i++) {
+      if (broadcast_dimensions[i] < 0 ||
+          broadcast_dimensions[i] > out_dim_size.size()) {
+        return InvalidArgument("Broadcast dimension %lld is out of bound",
+                               broadcast_dimensions[i]);
+      }
+      output_shape.set_dynamic_dimension(broadcast_dimensions[i],
+                                         operand_shape.is_dynamic_dimension(i));
+    }
 
     TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(
                            operand_shape, output_shape, broadcast_dimensions)
@@ -639,10 +741,10 @@ XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
                              int64 limit_index, int64 stride, int64 dimno) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
-    std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
+    std::vector<int64> starts(shape.rank(), 0);
     std::vector<int64> limits(shape.dimensions().begin(),
                               shape.dimensions().end());
-    std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
+    std::vector<int64> strides(shape.rank(), 1);
     starts[dimno] = start_index;
     limits[dimno] = limit_index;
     strides[dimno] = stride;
@@ -660,7 +762,7 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                         GetShape(start_indices));
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicSliceShape(
-                            operand_shape, start_indices_shape, slice_sizes));
+                            operand_shape, {start_indices_shape}, slice_sizes));
     *instr.mutable_shape() = shape.ToProto();
 
     for (int64 size : slice_sizes) {
@@ -672,6 +774,34 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
   });
 }
 
+XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand,
+                               absl::Span<const XlaOp> start_indices,
+                               absl::Span<const int64> slice_sizes) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<const Shape*> start_indices_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
+                        GetOperandShapes(start_indices));
+    absl::c_transform(start_indices_shapes,
+                      std::back_inserter(start_indices_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(Shape shape,
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shapes, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    std::vector<XlaOp> operands = {operand};
+    operands.insert(operands.end(), start_indices.begin(), start_indices.end());
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands);
+  });
+}
+
 XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                      const XlaOp& start_indices) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -681,13 +811,38 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
     TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferDynamicUpdateSliceShape(
+                         operand_shape, update_shape, {start_indices_shape}));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                     absl::Span<const XlaOp> start_indices) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    std::vector<const Shape*> start_indices_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
+                        GetOperandShapes(start_indices));
+    absl::c_transform(start_indices_shapes,
+                      std::back_inserter(start_indices_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicUpdateSliceShape(
-                            operand_shape, update_shape, start_indices_shape));
+                            operand_shape, update_shape, start_indices_shapes));
     *instr.mutable_shape() = shape.ToProto();
 
+    std::vector<XlaOp> operands = {operand, update};
+    operands.insert(operands.end(), start_indices.begin(), start_indices.end());
     return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
-                          {operand, update, start_indices});
+                          operands);
   });
 }
 
@@ -780,7 +935,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
     VLOG(3) << "dims to collapse: " << absl::StrJoin(dimensions, ",");
 
     std::vector<int64> new_sizes;
-    for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
+    for (int i = 0; i < original_shape.rank(); ++i) {
       if (i <= dimensions.front() || i > dimensions.back()) {
         new_sizes.push_back(original_shape.dimensions(i));
       } else {
@@ -808,10 +963,9 @@ XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& true_shape, GetShape(on_true));
     TF_ASSIGN_OR_RETURN(const Shape& false_shape, GetShape(on_false));
-    TF_RET_CHECK(ShapeUtil::IsTuple(true_shape) ==
-                 ShapeUtil::IsTuple(false_shape));
-    HloOpcode opcode = ShapeUtil::IsTuple(true_shape) ? HloOpcode::kTupleSelect
-                                                      : HloOpcode::kSelect;
+    TF_RET_CHECK(true_shape.IsTuple() == false_shape.IsTuple());
+    HloOpcode opcode =
+        true_shape.IsTuple() ? HloOpcode::kTupleSelect : HloOpcode::kSelect;
     return TernaryOp(opcode, pred, on_true, on_false);
   });
 }
@@ -835,7 +989,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data));
-    if (!ShapeUtil::IsTuple(tuple_shape)) {
+    if (!tuple_shape.IsTuple()) {
       return InvalidArgument(
           "Operand to GetTupleElement() is not a tuple; got %s",
           ShapeUtil::HumanString(tuple_shape));
@@ -915,13 +1069,13 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
 Status XlaBuilder::VerifyConvolution(
     const Shape& lhs_shape, const Shape& rhs_shape,
     const ConvolutionDimensionNumbers& dimension_numbers) const {
-  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
+  if (lhs_shape.rank() != rhs_shape.rank()) {
     return InvalidArgument(
         "Convolution arguments must have same number of "
         "dimensions. Got: %s and %s",
         ShapeUtil::HumanString(lhs_shape), ShapeUtil::HumanString(rhs_shape));
   }
-  int num_dims = ShapeUtil::Rank(lhs_shape);
+  int num_dims = lhs_shape.rank();
   if (num_dims < 2) {
     return InvalidArgument(
         "Convolution expects argument arrays with >= 3 dimensions. "
@@ -1150,7 +1304,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
-    if (ShapeUtil::IsArray(shape) && sharding() &&
+    if (shape.IsArray() && sharding() &&
         sharding()->type() == OpSharding::Type::OpSharding_Type_OTHER) {
       // TODO(b/110793772): Support tiled array-shaped infeeds.
       return InvalidArgument(
@@ -1226,7 +1380,7 @@ XlaOp XlaBuilder::InfeedWithToken(const XlaOp& token, const Shape& shape,
     *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
-    if (ShapeUtil::IsArray(shape) && sharding() &&
+    if (shape.IsArray() && sharding() &&
         sharding()->type() == OpSharding::Type::OpSharding_Type_OTHER) {
       // TODO(b/110793772): Support tiled array-shaped infeeds.
       return InvalidArgument(
@@ -1339,7 +1493,7 @@ XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
     for (int i = 0; i < tokens.size(); ++i) {
       const XlaOp& operand = tokens[i];
       TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-      if (!ShapeUtil::IsToken(operand_shape)) {
+      if (!operand_shape.IsToken()) {
         return InvalidArgument(
             "All operands to AfterAll must be tokens; operand %d has shape %s",
             i, ShapeUtil::HumanString(operand_shape));
@@ -1582,7 +1736,7 @@ XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
     *instr.mutable_shape() = shape.ToProto();
     if (dimension == -1) {
       TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
-      dimension = ShapeUtil::Rank(keys_shape) - 1;
+      dimension = keys_shape.rank() - 1;
     }
     instr.add_dimensions(dimension);
     std::vector<XlaOp> operands{keys};
@@ -1652,12 +1806,12 @@ XlaOp XlaBuilder::Map(absl::Span<const XlaOp> operands,
     *instr.mutable_shape() = shape.ToProto();
 
     Shape output_shape(instr.shape());
-    const int64 output_rank = ShapeUtil::Rank(output_shape);
+    const int64 output_rank = output_shape.rank();
     AddCalledComputation(computation, &instr);
     std::vector<XlaOp> new_operands(operands.begin(), operands.end());
     for (XlaOp& new_operand : new_operands) {
       TF_ASSIGN_OR_RETURN(Shape shape, GetShape(new_operand));
-      const int64 rank = ShapeUtil::Rank(shape);
+      const int64 rank = shape.rank();
       if (rank != output_rank) {
         TF_ASSIGN_OR_RETURN(new_operand,
                             InDimBroadcast(output_shape, new_operand, {}));
@@ -1866,7 +2020,7 @@ XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                             const XlaComputation& computation) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    std::vector<int64> all_dimnos(ShapeUtil::Rank(operand_shape));
+    std::vector<int64> all_dimnos(operand_shape.rank());
     std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
     return Reduce(operand, init_value, computation, all_dimnos);
   });
@@ -2292,7 +2446,7 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token,
           ShapeUtil::HumanStringWithLayout(operand_shape));
     }
     // TODO(b/111544877): Support tuple shapes.
-    if (!ShapeUtil::IsArray(operand_shape)) {
+    if (!operand_shape.IsArray()) {
       return InvalidArgument("SendToHost only supports array shapes, shape: %s",
                              ShapeUtil::HumanString(operand_shape));
     }
@@ -2332,7 +2486,7 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
     }
 
     // TODO(b/111544877): Support tuple shapes.
-    if (!ShapeUtil::IsArray(shape)) {
+    if (!shape.IsArray()) {
       return InvalidArgument(
           "RecvFromHost only supports array shapes, shape: %s",
           ShapeUtil::HumanString(shape));
@@ -2385,7 +2539,7 @@ StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
   TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
 
   bool is_constant = true;
-  std::set<int64> visited;
+  absl::flat_hash_set<int64> visited;
   IsConstantVisitor(operand.handle(), &visited, &is_constant);
   return is_constant;
 }
@@ -2432,21 +2586,58 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
     worklist.pop();
     TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_proto,
                         LookUpInstructionByHandle(handle));
-    for (int64 id : instr_proto->operand_ids()) {
-      if (related_ops.insert(id).second) {
-        worklist.push(id);
+
+    if (instr_proto->opcode() ==
+        HloOpcodeString(HloOpcode::kGetDimensionSize)) {
+      // At this point, BuildConstantSubGraph should never encounter a
+      // GetDimensionSize with a dynamic dimension. IsConstant check would have
+      // failed at the beginning of this function.
+      //
+      // Replace GetDimensionSize with a Constant representing the static bound
+      // of the shape.
+      int64 dimension = instr_proto->dimensions(0);
+      int64 operand_handle = instr_proto->operand_ids(0);
+      TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto,
+                          LookUpInstructionByHandle(operand_handle));
+
+      TF_RET_CHECK(!operand_proto->shape().is_dynamic_dimension(dimension));
+      auto constant_dimension_size =
+          static_cast<uint32>(operand_proto->shape().dimensions(dimension));
+
+      Literal literal = LiteralUtil::CreateR0(constant_dimension_size);
+
+      HloInstructionProto const_instr;
+      *const_instr.mutable_shape() = literal.shape().ToProto();
+      *const_instr.mutable_literal() = literal.ToProto();
+      *const_instr.mutable_opcode() = HloOpcodeString(HloOpcode::kConstant);
+
+      const_instr.set_id(handle);
+      *const_instr.mutable_name() =
+          GetFullName(const_instr.opcode(), kNameSeparator, const_instr.id());
+      *entry.add_instructions() =
+          const_instr;  // Add to the result constant graph.
+    } else {
+      for (int64 id : instr_proto->operand_ids()) {
+        if (related_ops.insert(id).second) {
+          worklist.push(id);
+        }
+      }
+      for (int64 called_id : instr_proto->called_computation_ids()) {
+        related_calls.insert(called_id);
       }
-    }
-    for (int64 called_id : instr_proto->called_computation_ids()) {
-      related_calls.insert(called_id);
     }
   }
 
   // Add related ops to the computation.
   for (int64 id : related_ops) {
-    auto* instr = entry.add_instructions();
     TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_src,
                         LookUpInstructionByHandle(id));
+
+    if (instr_src->opcode() == HloOpcodeString(HloOpcode::kGetDimensionSize)) {
+      continue;
+    }
+    auto* instr = entry.add_instructions();
+
     *instr = *instr_src;
     // Ensures that the instruction names are unique among the graph.
     const string& new_name =
@@ -2719,12 +2910,21 @@ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                    absl::Span<const int64> slice_sizes) {
   return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
 }
+XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+                   absl::Span<const int64> slice_sizes) {
+  return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
+}
 
 XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                          const XlaOp& start_indices) {
   return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
 }
 
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         absl::Span<const XlaOp> start_indices) {
+  return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
+}
+
 XlaOp ConcatInDim(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                   int64 dimension) {
   return builder->ConcatInDim(operands, dimension);
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 6e9b025e5d70c03e9f4c7e7fbc89976f314d48d7..3bd6d42363664721ee4c15c8dc4fc75a42d0591b 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -197,11 +197,19 @@ class XlaBuilder {
   // status. Note that all ops that have been enqueued will be moved to the
   // computation being returned. The root of the computation will be the last
   // added operation.
-  StatusOr<XlaComputation> Build();
+  //
+  // `remove_dynamic_dimensions` tells the builder whether to remove the
+  // dyanmic dimensions information in all ops.
+  //
+  // TODO(b/121223198): Delete `remove_dynamic_dimensions` and keeps the
+  // dynamic dimensions information when XLA backend can handle dynamic
+  // dimensions.
+  StatusOr<XlaComputation> Build(bool remove_dynamic_dimensions = true);
 
   // Overload of Build which specifies a particular root instruction for the
   // computation.
-  StatusOr<XlaComputation> Build(XlaOp root);
+  StatusOr<XlaComputation> Build(XlaOp root,
+                                 bool remove_dynamic_dimensions = true);
 
   // Builds the computation with the requested operations, or notes an error in
   // the parent XlaBuilder and returns an empty computation if building failed.
@@ -269,6 +277,10 @@ class XlaBuilder {
   // and its real dynamic size is represented by `dynamic_param_index` in
   // parameter `dynamic_param_num`.
   //
+  // Note that this should be called before the dynamic parameters are used to
+  // create other operations, otherwise created operations won't have the
+  // dynamic dimensions information.
+  //
   // TODO(b/119520625): Remove this API once we have more dynamic shape infra
   // ready.
   Status SetDynamicBinding(int64 dynamic_size_param_num,
@@ -276,9 +288,24 @@ class XlaBuilder {
                            int64 target_param_num,
                            ShapeIndex target_param_index, int64 target_dim_num);
 
+  // Adds a new input/output alias. Since the input/ouput shape information are
+  // not available until the computation is built, and eventual error in the
+  // arguments of this API will be detected only at computation Build() time.
+  void SetUpAlias(const ShapeIndex& output_index, int64 param_number,
+                  const ShapeIndex& param_index) {
+    input_output_aliases_.push_back({output_index, param_number, param_index});
+  }
+
  private:
+  // Describes an input/output alias as inserted by the SetUpAlias() API.
+  struct InputOutputAlias {
+    ShapeIndex output_index;
+    int64 param_number;
+    ShapeIndex param_index;
+  };
+
   // Build helper which takes the id of the root operation..
-  StatusOr<XlaComputation> Build(int64 root_id);
+  StatusOr<XlaComputation> Build(int64 root_id, bool remove_dynamic_dimensions);
 
   // Description for the methods below can be found in the corresponding public
   // functions section in this file.
@@ -344,11 +371,18 @@ class XlaBuilder {
   XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
                    int64 stride, int64 dimno);
 
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                      absl::Span<const int64> slice_sizes);
+  XlaOp DynamicSlice(const XlaOp& operand,
+                     absl::Span<const XlaOp> start_indices,
+                     absl::Span<const int64> slice_sizes);
 
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                            const XlaOp& start_indices);
+  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                           absl::Span<const XlaOp> start_indices);
 
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64 dimension);
 
@@ -712,7 +746,8 @@ class XlaBuilder {
   // operation such as `RngNormal` or `Infeed`. The visitor walks the
   // computation starting at a given operation and sets is_constant to false iff
   // a parameter or stateful operation is encountered.
-  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+  void IsConstantVisitor(const int64 op_handle,
+                         absl::flat_hash_set<int64>* visited,
                          bool* is_constant) const;
 
   // Checks bounds for convolution parameters.
@@ -730,6 +765,12 @@ class XlaBuilder {
 
   int64 GetNextId() { return ++next_id_; }
 
+  // Populates the module with the input/output alias information stored within
+  // the input_output_aliases vector.
+  static Status PopulateInputOutputAlias(
+      HloModuleProto* module, const ProgramShape& program_shape,
+      const std::vector<InputOutputAlias>& input_output_aliases);
+
   string name_;  // Name to use for the built computation.
 
   // The next sequential ID for every instruction/computation contained within
@@ -749,6 +790,9 @@ class XlaBuilder {
   // Dynamic parameter configuration of this computation.
   DynamicParameterBinding dynamic_parameter_binding_;
 
+  // Holds the input/output alias information populated by the SetUpAlias() API.
+  std::vector<InputOutputAlias> input_output_aliases_;
+
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
   absl::flat_hash_map<int64, int64> handle_to_index_;
@@ -850,9 +894,14 @@ class XlaBuilder {
 
   friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                             absl::Span<const int64> slice_sizes);
+  friend XlaOp DynamicSlice(const XlaOp& operand,
+                            absl::Span<const XlaOp> start_indices,
+                            absl::Span<const int64> slice_sizes);
 
   friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                   const XlaOp& start_indices);
+  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                  absl::Span<const XlaOp> start_indices);
 
   friend XlaOp ConcatInDim(XlaBuilder* builder,
                            absl::Span<const XlaOp> operands, int64 dimension);
@@ -1294,10 +1343,15 @@ XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
 // The size of the slice in each dimension is passed in 'slice_sizes',
 // which specify the end point of exclusive slice intervals in each
 // dimension [start, start + size).
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
+// The shape of each element of 'start_indices' must be scalar, with the span
+// size equal to the rank of the 'operand'. All elements of 'start_indices' must
+// have the same shape.
 // Slice index calculations are computed modulo input dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+                   absl::Span<const int64> slice_sizes);
+
+ABSL_DEPRECATED("Use span-of-indices form instead")
 XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                    absl::Span<const int64> slice_sizes);
 
@@ -1313,10 +1367,15 @@ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
 //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
 //   [7 8 9]                                                  [7 8  9 ]
 //
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
+// The shape of each element of 'start_indices' must be scalar, with the span
+// size equal to the rank of the 'operand'. All elements of 'start_indices' must
+// have the same shape.
 // Slice index calculations are computed modulo update dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         absl::Span<const XlaOp> start_indices);
+
+ABSL_DEPRECATED("Use span-of-indices form instead")
 XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                          const XlaOp& start_indices);
 
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index b3f5be300d3f15397ad33858a6a9cab5f6029688..098165000a29cb28cb0ef906dbdb1ff9ae2f24e8 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -39,7 +40,8 @@ using ::testing::HasSubstr;
 class XlaBuilderTest : public ::testing::Test {
  protected:
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b) {
-    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build());
+    TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                        b->Build(/*remove_dynamic_dimensions=*/false));
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
@@ -50,7 +52,8 @@ class XlaBuilderTest : public ::testing::Test {
   // Overload which explicitly specifies the root instruction.
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b,
                                                       XlaOp root) {
-    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build(root));
+    TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                        b->Build(root, /*remove_dynamic_dimensions=*/false));
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
@@ -446,6 +449,417 @@ TEST_F(XlaBuilderTest, ProtoMatches) {
   EXPECT_EQ(c0_string, c1_string);
 }
 
+TEST_F(XlaBuilderTest, DynamicParameter) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  Parameter(&b, 1, ShapeUtil::MakeShape(U32, {}), "p1");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/1,
+                                   /*dynamic_size_param_index=*/{},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/p0));
+  const Shape& param_shape = module->entry_computation()
+                                 ->parameter_instruction(0)
+                                 ->shape()
+                                 .tuple_shapes(1);
+  EXPECT_TRUE(param_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicUnary) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  Neg(gte);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicBinary) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {5}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicBinaryHasBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}), ShapeUtil::MakeShape(F32, {5}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1, {0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  BroadcastInDim(gte, /*out_dim_size=*/{3, 5, 4},
+                 /*broadcast_dimensions=*/{1, 2});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {false, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicPad) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pad_val = ConstantR0<float>(&b, -1);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  PaddingConfig padding_config;
+  for (int i = 0; i < 2; i++) {
+    auto dimension = padding_config.add_dimensions();
+    dimension->set_edge_padding_low(0);
+    dimension->set_edge_padding_high(0);
+    dimension->set_interior_padding(0);
+  }
+  Pad(gte, pad_val, padding_config);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicConvolution) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {1, 2, 2, 128}),
+       ShapeUtil::MakeShape(F32, {2, 2, 128, 8}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/2));
+  auto input = GetTupleElement(p0, 0);
+  auto filter = GetTupleElement(p0, 1);
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.set_input_feature_dimension(3);
+  dnums.set_output_feature_dimension(3);
+  dnums.add_kernel_spatial_dimensions(0);
+  dnums.add_kernel_spatial_dimensions(1);
+  dnums.set_kernel_input_feature_dimension(2);
+  dnums.set_kernel_output_feature_dimension(3);
+  ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                            /*feature_group_count=*/1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
+                              {true, false, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicDot) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 3, 4}),
+       ShapeUtil::MakeShape(F32, {2, 4, 5}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+
+  auto lhs = GetTupleElement(p0, 0);
+  auto rhs = GetTupleElement(p0, 1);
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+  DotGeneral(lhs, rhs, dnums);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReduce) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4, 3}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  auto gte = GetTupleElement(p0, 0);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  Reduce(gte, init, sum, {0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReduceWindow) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 4, 8}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0.f);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  ReduceWindow(gte, init, sum, /*window_dimensions=*/{1, 2, 4},
+               /*window_strides=*/{1, 1, 1}, Padding::kValid);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectAndScatter) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 4, 8}),
+       ShapeUtil::MakeShape(F32, {2, 2, 2}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0.f);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  XlaBuilder bge(TestName());
+  Ge(Parameter(&bge, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+     Parameter(&bge, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto ge, bge.Build());
+
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto source = GetTupleElement(p0, 1);
+  SelectAndScatter(gte0, ge, {1, 2, 4}, {1, 2, 4}, Padding::kValid, source,
+                   init, sum);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReshape) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/2));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/3));
+  auto gte = GetTupleElement(p0, 0);  // f32[2, 3, <=4, <=5, 6]
+  Reshape(gte, /*new_sizes=*/{6, 4, 1, 5, 2, 3});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(3));
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
+                              {false, true, false, true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelect) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {4, 5, 6}),
+       ShapeUtil::MakeShape(F32, {4, 5, 6}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pred = Parameter(&b, 1, ShapeUtil::MakeShape(PRED, {}), "pred");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/1));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Select(pred, gte0, gte1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
+  EXPECT_FALSE(result_shape.is_dynamic_dimension(2));
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {false, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectNotCompatible) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {4, 5, 6}),
+       ShapeUtil::MakeShape(F32, {4, 5, 6}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pred = Parameter(&b, 1, ShapeUtil::MakeShape(PRED, {}), "pred");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/2));
+  auto gte0 = GetTupleElement(p0, 0);  // f32[4,<=5,6]
+  auto gte1 = GetTupleElement(p0, 1);  // f32[4,5,<=6]
+  Select(pred, gte0, gte1);
+  Status status = BuildHloModule(&b).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("Operands to select must be the same shape; "
+                                   "got f32[4,<=5,6] and f32[4,5,<=6]"));
+}
+
+TEST_F(XlaBuilderTest, DynamicTranspose) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {3, 5}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  Transpose(gte, /*permutation=*/{1, 0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {false, true}))
+      << result_shape;
+}
+
 TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
   XlaBuilder b(TestName());
   AfterAll(&b, {CreateToken(&b), ConstantR0<float>(&b, 1.0)});
@@ -455,5 +869,31 @@ TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
               ::testing::HasSubstr("All operands to AfterAll must be tokens"));
 }
 
+TEST_F(XlaBuilderTest, CheckInputOutputAlias) {
+  XlaBuilder b(TestName());
+  auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {8, 4}), "p0");
+  auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {8, 4}), "p1");
+  auto add = Add(p0, p1);
+  auto sub = Sub(p0, p1);
+  auto root = Tuple(&b, {add, sub});
+
+  b.SetUpAlias({1}, 0, {});
+  b.SetUpAlias({0}, 1, {});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, root));
+
+  const HloInputOutputAliasConfig& config = module->input_output_alias_config();
+  EXPECT_TRUE(config.ParameterHasAlias(0, {}));
+  EXPECT_TRUE(config.ParameterHasAlias(1, {}));
+
+  auto alias_p0 = config.GetAliasedOutput(0, {});
+  ASSERT_TRUE(alias_p0.has_value());
+  EXPECT_EQ(*alias_p0, ShapeIndex({1}));
+
+  auto alias_p1 = config.GetAliasedOutput(1, {});
+  ASSERT_TRUE(alias_p1.has_value());
+  EXPECT_EQ(*alias_p1, ShapeIndex({0}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/error_spec.h b/tensorflow/compiler/xla/error_spec.h
index a1463aa15941b9c265db94e2eb3cc176fab6695b..4359f3b7deb8e585494cb2a9c7115eac6a312c8e 100644
--- a/tensorflow/compiler/xla/error_spec.h
+++ b/tensorflow/compiler/xla/error_spec.h
@@ -30,6 +30,19 @@ struct ErrorSpec {
   // In effect, this allows the tested operation to produce incorrect results
   // for inputs outside its mathematical domain.
   bool relaxed_nans;
+
+  // If this is true, then we treat each +/-inf in the actual result as
+  // equivalent to our choice of either +/-inf or the min/max floating-point
+  // value.
+  //
+  // If the expected result is +/-inf, the actual result must still be +/-inf.
+  //
+  // In effect, this allows the tested operation to overflow, so long as it's
+  // overflowing on "large" values.
+  //
+  // (We could have a symmetric more_infs_ok flag if necessary; right now it
+  // appears not to be.)
+  bool fewer_infs_ok = false;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index ba3217f31b55bd1428f67da6154a46c8bc304053..6f36d11dfb34eb27e79ea4ff797d35f80fb44b27 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,9 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-// Pulls in the ::stream_executor -> ::xla::se namespace alias.
-#include "tensorflow/compiler/xla/types.h"
-
 // These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -28,12 +25,6 @@ class Stream;
 class Platform;
 }  // namespace stream_executor
 
-namespace tensorflow {
-namespace thread {
-class ThreadPool;
-}  // namespace thread
-}  // namespace tensorflow
-
 namespace Eigen {
 struct ThreadPoolDevice;
 }  // namespace Eigen
diff --git a/tensorflow/compiler/xla/g3doc/broadcasting.md b/tensorflow/compiler/xla/g3doc/broadcasting.md
index 2870869a2cef13a9105b9dc9fa4d657834288f86..5c0525c1e9adf9f37d945170d05e7c18fa3d8852 100644
--- a/tensorflow/compiler/xla/g3doc/broadcasting.md
+++ b/tensorflow/compiler/xla/g3doc/broadcasting.md
@@ -168,7 +168,7 @@ consult the
 
 Broadcasting of a lower-rank array to a higher-rank array **and** broadcasting
 using degenerate dimensions can both be performed in the same binary operation.
-For example, a vector of size 4 and an matrix of size 1x2 can be added together
+For example, a vector of size 4 and a matrix of size 1x2 can be added together
 using broadcast dimensions value of (0):
 
     |1 2 3 4| + [5 6]    // [5 6] is a 1x2 matrix, not a vector.
@@ -176,7 +176,7 @@ using broadcast dimensions value of (0):
 First the vector is broadcast up to rank 2 (matrix) using the broadcast
 dimensions. The single value (0) in the broadcast dimensions indicates that
 dimension zero of the vector matches to dimension zero of the matrix. This
-produces an matrix of size 4xM where the value M is chosen to match the
+produces a matrix of size 4xM where the value M is chosen to match the
 corresponding dimension size in the 1x2 array. Therefore, a 4x2 matrix is
 produced:
 
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index 9a9cd08c301502cbda8858225182d95fca4bf7ae..c5f9377f98868cdf6d5c711cf80ede5d41fd8305 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -636,11 +636,15 @@ details, see `tf.nn.depthwise_conv2d`.
 
 The `batch_group_count` (default value 1) argument can be used for depthwise
 filters during backpropagation. `batch_group_count` needs to be a divisor of the
-size of the `lhs` batch dimension. If `batch_group_count` is greater than 1, it
-means that conceptually the output batch dimension is split evenely in
-`batch_group_count` groups, such that each group consists of a consecutive
-subsequence of batches. Each output batch element is the reduced value of the
-batch group size.
+size of the `lhs` (input) batch dimension. If `batch_group_count` is greater
+than 1, it means that the output batch dimension should be of size
+`batch_group_size` where `batch_group_size = input batch / batch_group_count`.
+For convolutions with `batch_group_count` greater than 1, the input batch size
+must evenly divide into batch_group_size and output feature size, which implies
+that the output feature size must be equal to batch_group_count. Conceptually,
+this can be achieved by performing the usual convolution, and then scraping
+`batch_group_size` number of elements on the diagonal of the matrix formed by
+output batch and output feature.
 
 The output shape has these dimensions, in this order:
 
@@ -871,9 +875,7 @@ DotGeneral performs the sum of products over contracting dimensions specified
 in 'dimension_numbers'.
 
 Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
-to be the same, but must be listed in the same order in both
-'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
-There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
+to be the same and but must have the same dimension sizes.
 
 Example with contracting dimension numbers:
 
@@ -892,10 +894,8 @@ DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
 {15.0, 30.0} }
 ```
 
-Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
-dimension number, must be listed in the same order in both arrays, must
-have the same dimension sizes, and must be ordered before contracting and
-non-contracting/non-batch dimension numbers.
+Associated batch dimension numbers from the 'lhs' and 'rhs' must
+have the same dimension sizes.
 
 Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
@@ -944,21 +944,21 @@ dimension: [start, start + size). The shape of `start_indices` must be rank ==
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
-| Arguments       | Type                | Semantics                           |
-| --------------- | ------------------- | ----------------------------------- |
-| `operand`       | `XlaOp`             | N dimensional array of type T       |
-| `start_indices` | `XlaOp`             | Rank 1 array of N integers          |
-:                 :                     : containing the starting indices of  :
-:                 :                     : the slice for each dimension. Value :
-:                 :                     : must be greater than or equal to    :
-:                 :                     : zero.                               :
-| `size_indices`  | `ArraySlice<int64>` | List of N integers containing the   |
-:                 :                     : slice size for each dimension. Each :
-:                 :                     : value must be strictly greater than :
-:                 :                     : zero, and start + size must be less :
-:                 :                     : than or equal to the size of the    :
-:                 :                     : dimension to avoid wrapping modulo  :
-:                 :                     : dimension size.                     :
+| Arguments       | Type                  | Semantics                          |
+| --------------- | --------------------- | ---------------------------------- |
+| `operand`       | `XlaOp`               | N dimensional array of type T      |
+| `start_indices` | sequence of N `XlaOp` | List of N scalar integers          |
+:                 :                       : containing the starting indices of :
+:                 :                       : the slice for each dimension.      :
+:                 :                       : Value must be greater than or      :
+:                 :                       : equal to zero.                     :
+| `size_indices`  | `ArraySlice<int64>`   | List of N integers containing the  |
+:                 :                       : slice size for each dimension.     :
+:                 :                       : Each value must be strictly        :
+:                 :                       : greater than zero, and start +     :
+:                 :                       : size must be less than or equal to :
+:                 :                       : the size of the dimension to avoid :
+:                 :                       : wrapping modulo dimension size.    :
 
 The effective slice indices are computed by applying the following
 transformation for each index `i` in `[1, N)` before performing the slice:
@@ -1009,19 +1009,22 @@ the rank of `operand`.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
-| Arguments       | Type    | Semantics                                        |
-| --------------- | ------- | ------------------------------------------------ |
-| `operand`       | `XlaOp` | N dimensional array of type T                    |
-| `update`        | `XlaOp` | N dimensional array of type T containing the     |
-:                 :         : slice update. Each dimension of update shape     :
-:                 :         : must be strictly greater than zero, and start +  :
-:                 :         : update must be less than or equal to the operand :
-:                 :         : size for each dimension to avoid generating      :
-:                 :         : out-of-bounds update indices.                    :
-| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the        |
-:                 :         : starting indices of the slice for each           :
-:                 :         : dimension. Value must be greater than or equal   :
-:                 :         : to zero.                                         :
+| Arguments       | Type                  | Semantics                          |
+| --------------- | --------------------- | ---------------------------------- |
+| `operand`       | `XlaOp`               | N dimensional array of type T      |
+| `update`        | `XlaOp`               | N dimensional array of type T      |
+:                 :                       : containing the slice update. Each  :
+:                 :                       : dimension of update shape must be  :
+:                 :                       : strictly greater than zero, and    :
+:                 :                       : start + update must be less than   :
+:                 :                       : or equal to the operand size for   :
+:                 :                       : each dimension to avoid generating :
+:                 :                       : out-of-bounds update indices.      :
+| `start_indices` | sequence of N `XlaOp` | List of N scalar integers          |
+:                 :                       : containing the starting indices of :
+:                 :                       : the slice for each dimension.      :
+:                 :                       : Value must be greater than or      :
+:                 :                       : equal to zero.                     :
 
 The effective slice indices are computed by applying the following
 transformation for each index `i` in `[1, N)` before performing the slice:
@@ -1095,7 +1098,7 @@ When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
 absolute value of the result is always less than the divisor's absolute value.
 
 Integer division overflow (signed/unsigned division/remainder by zero or signed
-divison/remainder of `INT_SMIN` with `-1`) produces an implementation defined
+division/remainder of `INT_SMIN` with `-1`) produces an implementation defined
 value.
 
 An alternative variant with different-rank broadcasting support exists for these
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 2a0241af3ef359c4d1c6c1ab9319b5b293110f7a..7e22a32e545e4155545ffcfb9582187eadec3a82 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -141,7 +141,7 @@ namespace xla {
 
 /* static */ bool IndexUtil::IndexInBounds(const Shape& shape,
                                            absl::Span<const int64> index) {
-  int64 rank = ShapeUtil::Rank(shape);
+  int64 rank = shape.rank();
   if (rank != index.size()) {
     return false;
   }
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index ddccd8c798df5b926d2e5aea8975cb6cb6640824..2fe9b56c6bdffb931726f60ab75081361b43ebb4 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -101,13 +101,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }  // namespace
 
 /* static */ Layout LayoutUtil::GetDefaultLayoutForShape(const Shape& shape) {
-  if (ShapeUtil::IsOpaque(shape) || ShapeUtil::IsToken(shape)) {
+  if (shape.IsOpaque() || shape.IsToken()) {
     // Opaque and token types have empty layouts.
     return Layout();
   }
 
   // A Layout proto corresponds to a single array, not a tuple.
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   return CreateDefaultLayoutForRank(shape.dimensions_size());
 }
 
@@ -128,13 +128,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ void LayoutUtil::SetToDefaultLayout(Shape* shape) {
-  if (ShapeUtil::IsTuple(*shape)) {
+  if (shape->IsTuple()) {
     // Tuple shape.
     for (auto& element_shape : *shape->mutable_tuple_shapes()) {
       SetToDefaultLayout(&element_shape);
     }
     shape->clear_layout();
-  } else if (ShapeUtil::IsArray(*shape)) {
+  } else if (shape->IsArray()) {
     shape->mutable_layout()->set_format(DENSE);
     auto* minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
     minor_to_major->resize(shape->dimensions_size(), 0);
@@ -160,7 +160,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ Status LayoutUtil::ValidateLayoutInShape(
     const Shape& shape, bool allow_missing_layouts) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     // Tuple shape.
     if (shape.has_layout()) {
       return InvalidArgument("tuple should not have a layout field");
@@ -170,7 +170,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
           ValidateLayoutInShape(element_shape, allow_missing_layouts));
     }
     return Status::OK();
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     if (!shape.has_layout()) {
       if (allow_missing_layouts) {
         return Status::OK();
@@ -192,11 +192,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ Status LayoutUtil::ValidateLayoutForShape(const Layout& layout,
                                                        const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     return InvalidArgument("a single Layout is not valid for tuple shapes");
   }
 
-  if (!ShapeUtil::IsArray(shape)) {
+  if (!shape.IsArray()) {
     if (layout.minor_to_major_size() != 0) {
       return InvalidArgument(
           "shape of primitive type %s should not have a non-trivial layout",
@@ -211,19 +211,19 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   }
 
   if (layout.format() == DENSE) {
-    if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) {
+    if (layout.minor_to_major_size() != shape.rank()) {
       return InvalidArgument(
           "layout minor_to_major field contains %d elements, "
           "but shape is rank %d: {%s}; shape: %s",
-          layout.minor_to_major_size(), ShapeUtil::Rank(shape),
+          layout.minor_to_major_size(), shape.rank(),
           absl::StrJoin(layout.minor_to_major(), ", "),
           shape.ShortDebugString());
     }
 
-    std::vector<bool> dimensions_in_layout(ShapeUtil::Rank(shape), false);
-    for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
+    std::vector<bool> dimensions_in_layout(shape.rank(), false);
+    for (int64 i = 0; i < shape.rank(); ++i) {
       int64 dim = layout.minor_to_major(i);
-      if (dim < 0 || dim >= ShapeUtil::Rank(shape)) {
+      if (dim < 0 || dim >= shape.rank()) {
         return InvalidArgument(
             "layout minor_to_major field has out-of-bounds value: %s",
             HumanString(layout));
@@ -255,8 +255,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::IsDenseArray(const Shape& shape) {
-  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
-         IsDense(shape.layout());
+  return shape.IsArray() && shape.has_layout() && IsDense(shape.layout());
 }
 
 /* static */ bool LayoutUtil::IsDense(const Layout& layout) {
@@ -276,8 +275,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::IsSparseArray(const Shape& shape) {
-  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
-         IsSparse(shape.layout());
+  return shape.IsArray() && shape.has_layout() && IsSparse(shape.layout());
 }
 
 /* static */ bool LayoutUtil::IsSparse(const Layout& layout) {
@@ -290,11 +288,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::HasLayout(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     // Tuple shape: all subshapes must have a layout.
-    return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(),
-                       [](const Shape& s) { return HasLayout(s); });
-  } else if (!ShapeUtil::IsArray(shape)) {
+    return absl::c_all_of(shape.tuple_shapes(),
+                          [](const Shape& s) { return HasLayout(s); });
+  } else if (!shape.IsArray()) {
     // Opaque, token types etc. ignore layout.
     return true;
   }
@@ -360,11 +358,11 @@ namespace {
 
 // Internal helper for recursively copying layouts.
 Status CopyLayoutInternal(const Shape& src, Shape* dst) {
-  if (ShapeUtil::IsTuple(src) != ShapeUtil::IsTuple(*dst)) {
+  if (src.IsTuple() != dst->IsTuple()) {
     return InvalidArgument(
         "cannot copy layout from shape: shape structure differs");
   }
-  if (ShapeUtil::IsTuple(src)) {
+  if (src.IsTuple()) {
     if (ShapeUtil::TupleElementCount(src) !=
         ShapeUtil::TupleElementCount(*dst)) {
       return InvalidArgument(
@@ -376,7 +374,7 @@ Status CopyLayoutInternal(const Shape& src, Shape* dst) {
     }
   } else {
     if (src.has_layout()) {
-      if (ShapeUtil::Rank(src) != ShapeUtil::Rank(*dst)) {
+      if (src.rank() != dst->rank()) {
         return InvalidArgument("cannot copy layout from shape: ranks differs");
       }
       TF_RETURN_IF_ERROR(
@@ -398,9 +396,9 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
 
 /* static */ bool LayoutUtil::LayoutsInShapesEqual(const Shape& lhs,
                                                    const Shape& rhs) {
-  if (ShapeUtil::IsTuple(lhs)) {
-    if (!ShapeUtil::IsTuple(rhs) || ShapeUtil::TupleElementCount(lhs) !=
-                                        ShapeUtil::TupleElementCount(rhs)) {
+  if (lhs.IsTuple()) {
+    if (!rhs.IsTuple() || ShapeUtil::TupleElementCount(lhs) !=
+                              ShapeUtil::TupleElementCount(rhs)) {
       return false;
     }
     for (int i = 0; i < ShapeUtil::TupleElementCount(lhs); ++i) {
@@ -409,8 +407,8 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
       }
     }
     return true;
-  } else if (ShapeUtil::IsArray(lhs)) {
-    return ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs) &&
+  } else if (lhs.IsArray()) {
+    return lhs.rank() == rhs.rank() &&
            LayoutUtil::Equal(lhs.layout(), rhs.layout());
   } else {
     // Layouts of non-array and non-tuple shapes is ignored.
@@ -426,7 +424,7 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
     positions_in_layout.push_back(
         PositionInContainer(layout.minor_to_major(), dim));
   }
-  std::sort(positions_in_layout.begin(), positions_in_layout.end());
+  absl::c_sort(positions_in_layout);
   for (size_t i = 1; i < positions_in_layout.size(); ++i) {
     if (1 != positions_in_layout[i] - positions_in_layout[i - 1]) {
       return false;
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 277c98721e59ac12965392500fdfdc3d91e59a8b..8600e8752cfbe072407391559d210d0b49bea511 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -29,10 +29,12 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/index_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -107,7 +109,7 @@ Literal::Literal(const Shape& shape)
     : Literal(shape, /*allocate_arrays=*/true) {}
 
 void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
       const Shape& subshape = shape.tuple_shapes(i);
 
@@ -118,7 +120,7 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
 
       piece->emplace_back(std::move(child_piece));
     }
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     if (allocate_arrays) {
       if (LayoutUtil::IsSparseArray(shape)) {
         // For sparse arrays, the buffer must be of the size of the maximum
@@ -129,7 +131,7 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
             new char[max_sparse_elements *
                      ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]);
         piece->set_sparse_indices(
-            new SparseIndexArray(max_sparse_elements, ShapeUtil::Rank(shape)));
+            new SparseIndexArray(max_sparse_elements, shape.rank()));
       } else {
         piece->set_buffer(new char[piece->size_bytes()]);
       }
@@ -187,7 +189,7 @@ Literal LiteralBase::CreateFromShape(const Shape& shape) {
   Literal literal(shape);
   literal.root_piece_->ForEachMutableSubpiece(
       [&](const ShapeIndex& index, Piece* piece) {
-        if (ShapeUtil::IsArray(piece->subshape())) {
+        if (piece->subshape().IsArray()) {
           memset(piece->untyped_data(), 0, piece->size_bytes());
         }
       });
@@ -208,16 +210,15 @@ template <typename NativeT>
 Status MutableLiteralBase::CopySliceFromInternal(
     const LiteralBase& src_literal, absl::Span<const int64> src_base,
     absl::Span<const int64> dest_base, absl::Span<const int64> copy_size) {
-  TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size());
-  TF_RET_CHECK(ShapeUtil::Rank(shape()) == dest_base.size());
+  TF_RET_CHECK(src_literal.shape().rank() == src_base.size());
+  TF_RET_CHECK(shape().rank() == dest_base.size());
 
   auto linear_index = [](const Shape& shape,
                          absl::Span<const int64> multi_index) {
     return IndexUtil::MultidimensionalIndexToLinearIndex(shape, multi_index);
   };
 
-  if (ShapeUtil::Rank(src_literal.shape()) == 0 ||
-      ShapeUtil::Rank(shape()) == 0) {
+  if (src_literal.shape().rank() == 0 || shape().rank() == 0) {
     // If any of the two shapes are scalars, we can just call the StridedCopy()
     // directly, and we know we will be copying only one value.
     TF_RET_CHECK(copy_size.empty());
@@ -312,7 +313,7 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
           proto_element = &proto_element->tuple_literals(i);
         }
 
-        if (ShapeUtil::IsTuple(piece->subshape())) {
+        if (piece->subshape().IsTuple()) {
           if (proto_element->tuple_literals_size() !=
               ShapeUtil::TupleElementCount(piece->subshape())) {
             return InvalidArgument(
@@ -326,7 +327,7 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
           return Status::OK();
         }
 
-        CHECK(ShapeUtil::IsArray(piece->subshape()));
+        CHECK(piece->subshape().IsArray());
         TF_RETURN_IF_ERROR(piece->CopyFromProto(*proto_element));
 
         return Status::OK();
@@ -336,7 +337,7 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
 }
 
 std::vector<Literal> Literal::DecomposeTuple() {
-  CHECK(ShapeUtil::IsTuple(shape()));
+  CHECK(shape().IsTuple());
   std::vector<Literal> elements;
   for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) {
     elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}),
@@ -375,7 +376,7 @@ void CopyElementsBetween(absl::Span<NativeT> dest,
   if (ShapeUtil::IsZeroElementArray(dest_shape)) {
     return;
   }
-  std::vector<int64> index(ShapeUtil::Rank(dest_shape));
+  std::vector<int64> index(dest_shape.rank());
   do {
     dest[IndexUtil::MultidimensionalIndexToLinearIndex(dest_shape, index)] =
         src[IndexUtil::MultidimensionalIndexToLinearIndex(src_shape, index)];
@@ -392,7 +393,7 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) {
     memcpy(buffer(), src.buffer(), src.size_bytes());
   } else {
     TF_RET_CHECK(ShapeUtil::Compatible(src.subshape(), subshape()));
-    std::vector<int64> origin(ShapeUtil::Rank(subshape()), 0);
+    std::vector<int64> origin(subshape().rank(), 0);
     switch (subshape().element_type()) {
 #define COPY_ELEMENTS(XLA_T, NATIVE_T)                                    \
   case (XLA_T):                                                           \
@@ -412,6 +413,7 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) {
       COPY_ELEMENTS(F32, float);
       COPY_ELEMENTS(F64, double);
       COPY_ELEMENTS(C64, complex64);
+      COPY_ELEMENTS(C128, complex128);
       COPY_ELEMENTS(PRED, bool);
 #undef COPY_ELEMENTS
       default:
@@ -438,7 +440,7 @@ Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal,
   }
   return root_piece_->ForEachMutableSubpieceWithStatus(
       [&](const ShapeIndex& index, Piece* piece) {
-        if (!ShapeUtil::IsArray(piece->subshape())) {
+        if (!piece->subshape().IsArray()) {
           return Status::OK();
         }
 
@@ -477,7 +479,7 @@ Status Literal::MoveFrom(Literal&& src_literal,
 
   src_literal.root_piece_->ForEachSubpiece(
       [&](const ShapeIndex& src_index, const Piece& src_piece) {
-        if (!ShapeUtil::IsArray(src_piece.subshape())) {
+        if (!src_piece.subshape().IsArray()) {
           return;
         }
 
@@ -504,8 +506,8 @@ Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal,
                                          absl::Span<const int64> src_base,
                                          absl::Span<const int64> dest_base,
                                          absl::Span<const int64> copy_size) {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape());
-  TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape()))
+  TF_RET_CHECK(shape().IsArray()) << ShapeUtil::HumanString(shape());
+  TF_RET_CHECK(src_literal.shape().IsArray())
       << ShapeUtil::HumanString(src_literal.shape());
   TF_RET_CHECK(ShapeUtil::SameElementType(src_literal.shape(), shape()));
 
@@ -549,6 +551,9 @@ Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal,
     case C64:
       return CopySliceFromInternal<complex64>(src_literal, src_base, dest_base,
                                               copy_size);
+    case C128:
+      return CopySliceFromInternal<complex128>(src_literal, src_base, dest_base,
+                                               copy_size);
     case PRED:
       return CopySliceFromInternal<bool>(src_literal, src_base, dest_base,
                                          copy_size);
@@ -562,8 +567,8 @@ Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal,
 }
 
 void MutableLiteralBase::PopulateR1(const tensorflow::core::Bitmap& values) {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 1);
   CHECK_EQ(element_count(), values.bits());
   CHECK_EQ(shape().element_type(), PRED);
   for (int64 i = 0; i < static_cast<int64>(values.bits()); ++i) {
@@ -592,7 +597,7 @@ Literal LiteralBase::Relayout(const Shape& shape_with_layout) const {
   ShapeUtil::ForEachSubshape(
       result.shape(),
       [this, &result](const Shape& subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           TF_CHECK_OK(result.CopyFrom(*this,
                                       /*dest_shape_index=*/index,
                                       /*src_shape_index=*/index));
@@ -603,7 +608,7 @@ Literal LiteralBase::Relayout(const Shape& shape_with_layout) const {
 
 StatusOr<Literal> LiteralBase::Broadcast(
     const Shape& result_shape, absl::Span<const int64> dimensions) const {
-  if (!ShapeUtil::IsArray(shape())) {
+  if (!shape().IsArray()) {
     return InvalidArgument("Broadcast only supports arrays.");
   }
 
@@ -643,13 +648,12 @@ StatusOr<Literal> LiteralBase::Broadcast(
 
 StatusOr<Literal> LiteralBase::Reshape(
     absl::Span<const int64> dimensions) const {
-  if (!ShapeUtil::IsArray(shape())) {
+  if (!shape().IsArray()) {
     return InvalidArgument("Reshape does not support tuples.");
   }
   Literal output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) {
-    output =
-        Relayout(LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(shape())));
+    output = Relayout(LayoutUtil::GetDefaultLayoutForRank(shape().rank()));
   } else {
     output = Clone();
   }
@@ -671,8 +675,8 @@ StatusOr<Literal> LiteralBase::Reshape(
 }
 
 Literal LiteralBase::Transpose(absl::Span<const int64> permutation) const {
-  CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose";
-  CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape())))
+  CHECK(shape().IsArray()) << "Tuple is not supported for transpose";
+  CHECK(IsPermutation(permutation, shape().rank()))
       << "Given permutation is not a permutation of dimension numbers";
   // To transpose the array, we just permute the dimensions and layout, and
   // do a straight memory copy of the raw data set.
@@ -711,10 +715,10 @@ template <typename NativeT>
 Literal LiteralBase::SliceInternal(
     const Shape& result_shape, absl::Span<const int64> start_indices) const {
   Literal result_literal(result_shape);
-  DimensionVector new_indices(ShapeUtil::Rank(result_shape));
+  DimensionVector new_indices(result_shape.rank());
   result_literal.EachCell<NativeT>(
       [&](absl::Span<const int64> indices, NativeT /*value*/) {
-        for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
+        for (int64 i = 0; i < result_shape.rank(); ++i) {
           new_indices[i] = indices[i] + start_indices[i];
         }
         NativeT value = Get<NativeT>(new_indices);
@@ -725,10 +729,10 @@ Literal LiteralBase::SliceInternal(
 
 Literal LiteralBase::Slice(absl::Span<const int64> start_indices,
                            absl::Span<const int64> limit_indices) const {
-  CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice";
+  CHECK(shape().IsArray()) << "tuple is not supported for slice";
 
   DimensionVector result_dimensions;
-  for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) {
+  for (int64 dnum = 0; dnum < shape().rank(); ++dnum) {
     CHECK_GE(start_indices[dnum], 0);
     CHECK_LE(limit_indices[dnum], shape().dimensions(dnum))
         << "dnum = " << dnum;
@@ -768,6 +772,8 @@ Literal LiteralBase::Slice(absl::Span<const int64> start_indices,
       return SliceInternal<double>(result_shape, start_indices);
     case C64:
       return SliceInternal<complex64>(result_shape, start_indices);
+    case C128:
+      return SliceInternal<complex128>(result_shape, start_indices);
     default:
       LOG(FATAL) << "not yet implemented: "
                  << PrimitiveType_Name(result_shape.element_type());
@@ -816,6 +822,10 @@ string LiteralBase::GetAsString(absl::Span<const int64> multi_index,
       complex64 c = Get<complex64>(multi_index, shape_index);
       return StrCat("(", c.real(), ", ", c.imag(), ")");
     }
+    case C128: {
+      complex128 c = Get<complex128>(multi_index, shape_index);
+      return StrCat("(", c.real(), ", ", c.imag(), ")");
+    }
     default:
       LOG(FATAL) << PrimitiveType_Name(subshape.element_type());
   }
@@ -870,6 +880,11 @@ string LiteralBase::GetSparseElementAsString(
           GetSparseElement<complex64>(sparse_element_number, shape_index);
       return StrCat("(", c.real(), ", ", c.imag(), ")");
     }
+    case C128: {
+      complex128 c =
+          GetSparseElement<complex128>(sparse_element_number, shape_index);
+      return StrCat("(", c.real(), ", ", c.imag(), ")");
+    }
     default:
       LOG(FATAL) << "Invalid element type for sparse arrays: "
                  << PrimitiveType_Name(subshape.element_type());
@@ -906,7 +921,7 @@ size_t LiteralBase::Hash() const {
 
   ShapeUtil::ForEachSubshape(
       shape(), [&](const Shape& subshape, const ShapeIndex& index) {
-        if (!ShapeUtil::IsArray(subshape)) {
+        if (!subshape.IsArray()) {
           return;
         }
 
@@ -998,6 +1013,9 @@ void LiteralBase::Piece::SortSparseElements() {
     case C64:
       SortSparseElementsInternal<complex64>();
       break;
+    case C128:
+      SortSparseElementsInternal<complex128>();
+      break;
     case F16:
       SortSparseElementsInternal<half>();
       break;
@@ -1056,7 +1074,7 @@ void SparseArrayToStringHelper(const LiteralBase& literal,
     pieces->push_back(ShapeToString(print_layout, subshape));
   }
   pieces->push_back("{");
-  int64 rank = ShapeUtil::Rank(subshape);
+  int64 rank = subshape.rank();
   int64 num_elements = literal.sparse_element_count();
   for (int64 i = 0; i < num_elements; ++i) {
     if (i > 0) {
@@ -1079,7 +1097,7 @@ void DenseArrayToStringHelper(const LiteralBase& literal,
                               const ShapeIndex& shape_index, bool print_shape,
                               bool print_layout, std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  int64 rank = ShapeUtil::Rank(subshape);
+  int64 rank = subshape.rank();
 
   std::function<void(absl::Span<const int64> dimensions, std::vector<int64>*)>
       to_string_recursive = [&](absl::Span<const int64> dimensions,
@@ -1154,10 +1172,10 @@ void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
   CHECK(LayoutUtil::HasLayout(literal.shape()));
   CHECK(LayoutUtil::HasLayout(subshape));
-  if (ShapeUtil::IsTuple(subshape)) {
+  if (subshape.IsTuple()) {
     TupleToStringHelper(literal, shape_index, print_shape, print_layout,
                         pieces);
-  } else if (ShapeUtil::IsToken(subshape)) {
+  } else if (subshape.IsToken()) {
     pieces->push_back("token");
   } else if (LayoutUtil::IsSparseArray(subshape)) {
     SparseArrayToStringHelper(literal, subshape, print_shape, print_layout,
@@ -1217,7 +1235,7 @@ namespace {
 template <typename NativeSrcT, typename NativeDestT, typename ConverterType>
 Literal ConvertBetweenNativeTypesWithConverter(const LiteralBase& src_literal,
                                                const ConverterType& converter) {
-  CHECK(ShapeUtil::IsArray(src_literal.shape()));
+  CHECK(src_literal.shape().IsArray());
   Literal result_literal(ShapeUtil::ChangeElementType(
       src_literal.shape(),
       primitive_util::NativeToPrimitiveType<NativeDestT>()));
@@ -1232,7 +1250,24 @@ Literal ConvertBetweenNativeTypesWithConverter(const LiteralBase& src_literal,
 }
 
 template <typename NativeSrcT, typename NativeDestT>
-Literal ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
+typename std::enable_if<(std::is_same<NativeSrcT, Eigen::half>::value) &&
+                            (std::is_same<NativeDestT, complex64>::value ||
+                             std::is_same<NativeDestT, complex128>::value),
+                        Literal>::type
+ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
+  auto converter = [](NativeSrcT src) {
+    return NativeDestT(static_cast<typename NativeDestT::value_type>(src));
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(!std::is_same<NativeSrcT, Eigen::half>::value) ||
+                            (!std::is_same<NativeDestT, complex64>::value &&
+                             !std::is_same<NativeDestT, complex128>::value),
+                        Literal>::type
+ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
   auto converter = [](NativeSrcT src) { return static_cast<NativeDestT>(src); };
   return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
       src_literal, converter);
@@ -1276,22 +1311,6 @@ BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   LOG(FATAL) << "Invalid bitcast between types of different sizes.";
 }
 
-template <PrimitiveType primitive_src_type>
-Literal ConvertToC64(const LiteralBase& src_literal) {
-  CHECK(ShapeUtil::IsArray(src_literal.shape()));
-  Literal result_literal(
-      ShapeUtil::ChangeElementType(src_literal.shape(), C64));
-  using NativeSrcT =
-      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type;
-  absl::Span<const NativeSrcT> src_data = src_literal.data<NativeSrcT>();
-  absl::Span<complex64> dest_data = result_literal.data<complex64>();
-  int64 num_elements = src_literal.element_count();
-  for (int64 i = 0; i < num_elements; ++i) {
-    dest_data[i] = complex64(static_cast<float>(src_data[i]), 0);
-  }
-  return result_literal;
-}
-
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
 Literal ConvertIfTypesMatch(const LiteralBase& src_literal, bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
@@ -1321,9 +1340,11 @@ StatusOr<Literal> ConvertIfDestTypeMatches(const LiteralBase& src_literal,
                                                            bitcast);
     CONVERT_IF_TYPES_MATCH(PRED)
     CONVERT_IF_TYPES_MATCH(S8)
+    CONVERT_IF_TYPES_MATCH(S16)
     CONVERT_IF_TYPES_MATCH(S32)
     CONVERT_IF_TYPES_MATCH(S64)
     CONVERT_IF_TYPES_MATCH(U8)
+    CONVERT_IF_TYPES_MATCH(U16)
     CONVERT_IF_TYPES_MATCH(U32)
     CONVERT_IF_TYPES_MATCH(U64)
     CONVERT_IF_TYPES_MATCH(F16)
@@ -1332,10 +1353,15 @@ StatusOr<Literal> ConvertIfDestTypeMatches(const LiteralBase& src_literal,
     CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
-      if (!bitcast) {
-        return ConvertToC64<primitive_src_type>(src_literal);
+      if (bitcast) {
+        break;
       }
-      break;
+      return ConvertIfTypesMatch<primitive_src_type, C64>(src_literal, false);
+    case C128:
+      if (bitcast) {
+        break;
+      }
+      return ConvertIfTypesMatch<primitive_src_type, C128>(src_literal, false);
     // Other types are not yet supported.
     default:
       break;
@@ -1348,7 +1374,7 @@ StatusOr<Literal> ConvertIfDestTypeMatches(const LiteralBase& src_literal,
 StatusOr<Literal> ConvertSwitch(const LiteralBase& literal,
                                 PrimitiveType primitive_dest_type,
                                 bool bitcast) {
-  TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
+  TF_RET_CHECK(literal.shape().IsArray());
   if (literal.shape().element_type() == primitive_dest_type) {
     return literal.Clone();
   }
@@ -1359,9 +1385,11 @@ StatusOr<Literal> ConvertSwitch(const LiteralBase& literal,
                                             bitcast);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
+    CONVERT_IF_DEST_TYPE_MATCHES(S16)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
     CONVERT_IF_DEST_TYPE_MATCHES(S64)
     CONVERT_IF_DEST_TYPE_MATCHES(U8)
+    CONVERT_IF_DEST_TYPE_MATCHES(U16)
     CONVERT_IF_DEST_TYPE_MATCHES(U32)
     CONVERT_IF_DEST_TYPE_MATCHES(U64)
     CONVERT_IF_DEST_TYPE_MATCHES(F16)
@@ -1401,7 +1429,7 @@ StatusOr<Literal> LiteralBase::BitcastConvert(
 }
 
 StatusOr<Literal> LiteralBase::ConvertToShape(const Shape& dest_shape) const {
-  if (!ShapeUtil::IsTuple(dest_shape)) {
+  if (!dest_shape.IsTuple()) {
     return Convert(dest_shape.element_type());
   }
   std::vector<Literal> elements;
@@ -1433,7 +1461,7 @@ StatusOr<Literal> LiteralBase::ConvertToShape(const Shape& dest_shape) const {
 template <typename NativeT>
 bool LiteralBase::Piece::EqualElementsInternal(
     const LiteralBase::Piece& other, std::vector<int64>* multi_index) const {
-  if (multi_index->size() == ShapeUtil::Rank(subshape())) {
+  if (multi_index->size() == subshape().rank()) {
     return (Get<NativeT>(*multi_index) == other.Get<NativeT>(*multi_index));
   }
   for (int64 i = 0; i < subshape().dimensions(multi_index->size()); ++i) {
@@ -1483,6 +1511,8 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const {
       return EqualElementsInternal<bfloat16>(other, &multi_index);
     case C64:
       return EqualElementsInternal<complex64>(other, &multi_index);
+    case C128:
+      return EqualElementsInternal<complex128>(other, &multi_index);
     default:
       LOG(FATAL) << "Unimplemented: LiteralBase::Piece::EqualElements for type "
                  << PrimitiveType_Name(subshape().element_type());
@@ -1496,7 +1526,7 @@ bool LiteralBase::operator==(const LiteralBase& other) const {
 
   return root_piece().ForEachSubpieceWithBool(
       [&](const ShapeIndex& index, const Piece& piece) {
-        if (!ShapeUtil::IsArray(piece.subshape())) {
+        if (!piece.subshape().IsArray()) {
           return true;
         }
 
@@ -1526,7 +1556,7 @@ static bool AllElementsEqualValue(absl::Span<const NativeT> data,
 bool LiteralBase::IsAll(int8 value) const {
   return root_piece().ForEachSubpieceWithBool([&](const ShapeIndex& index,
                                                   const Piece& piece) {
-    if (!ShapeUtil::IsArray(piece.subshape())) {
+    if (!piece.subshape().IsArray()) {
       return true;
     }
 
@@ -1594,7 +1624,7 @@ bool LiteralBase::IsAll(int8 value) const {
 bool LiteralBase::IsAllFloat(float value) const {
   return root_piece().ForEachSubpieceWithBool(
       [&](const ShapeIndex& index, const Piece& piece) {
-        if (!ShapeUtil::IsArray(piece.subshape())) {
+        if (!piece.subshape().IsArray()) {
           return true;
         }
 
@@ -1626,6 +1656,9 @@ bool LiteralBase::IsAllComplex(complex64 value) const {
     case C64:
       return AllElementsEqualValue<complex64>(root_piece().data<complex64>(),
                                               value);
+    case C128:
+      return AllElementsEqualValue<complex128>(root_piece().data<complex128>(),
+                                               value);
     default:
       return false;
   }
@@ -1634,7 +1667,7 @@ bool LiteralBase::IsAllComplex(complex64 value) const {
 bool LiteralBase::IsAllFirst() const {
   return root_piece().ForEachSubpieceWithBool(
       [&](const ShapeIndex& index, const Piece& piece) {
-        if (!ShapeUtil::IsArray(piece.subshape())) {
+        if (!piece.subshape().IsArray()) {
           return true;
         }
 
@@ -1705,6 +1738,11 @@ bool LiteralBase::IsAllFirst() const {
               auto data = piece.data<uint64>();
               return AllElementsEqualValue<uint64>(data, data[0]);
             }
+
+            case C128: {
+              auto data = piece.data<complex128>();
+              return AllElementsEqualValue<complex128>(data, data[0]);
+            }
             default:
               return false;
           }
@@ -1718,11 +1756,11 @@ bool LiteralBase::IsAllFirst() const {
 }
 
 bool LiteralBase::IsR1Iota() const {
-  if (!ShapeUtil::IsArray(shape())) {
+  if (!shape().IsArray()) {
     return false;
   }
 
-  if (ShapeUtil::Rank(shape()) != 1) {
+  if (shape().rank() != 1) {
     return false;
   }
 
@@ -1754,6 +1792,8 @@ bool LiteralBase::IsR1Iota() const {
         return Get<bfloat16>({idx}) == static_cast<bfloat16>(idx);
       case C64:
         return Get<complex64>({idx}) == complex64(idx, 0.0f);
+      case C128:
+        return Get<complex128>({idx}) == complex128(idx, 0.0f);
       case PRED:
         return Get<bool>({idx}) == idx;
       // token, opaque, tuple, etc. are all not iota.
@@ -1773,7 +1813,7 @@ bool LiteralBase::IsR1Iota() const {
 }
 
 bool LiteralBase::IsZero(absl::Span<const int64> indices) const {
-  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK(shape().IsArray());
   switch (shape().element_type()) {
     case U8:
       return Get<uint8>(indices) == 0;
@@ -1797,6 +1837,8 @@ bool LiteralBase::IsZero(absl::Span<const int64> indices) const {
       return Get<double>(indices) == 0.0;
     case C64:
       return Get<complex64>(indices) == complex64(0.0f, 0.0f);
+    case C128:
+      return Get<complex128>(indices) == complex128(0.0f, 0.0f);
     case F16:
       return Get<half>(indices) == static_cast<half>(0.0f);
     case BF16:
@@ -1884,6 +1926,12 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
         proto->add_c64s(value.imag());
       }
       break;
+    case C128:
+      for (complex128 value : data<complex128>()) {
+        proto->add_c128s(value.real());
+        proto->add_c128s(value.imag());
+      }
+      break;
     case TUPLE:
     case TOKEN:
       // Nothing to do but assign the shape which is done above.
@@ -1896,12 +1944,12 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
 }
 
 const void* LiteralBase::Piece::untyped_data() const {
-  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  CHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   return buffer();
 }
 
 void* LiteralBase::Piece::untyped_data() {
-  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  CHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   return buffer();
 }
 
@@ -1932,14 +1980,12 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
   if (LayoutUtil::IsSparseArray(subshape())) {
     // Compute the number of elements (indices) in the sparse shape and reserve
     // the necessary space in spare_indices.
-    TF_RET_CHECK(ShapeUtil::Rank(subshape()) != 0)
-        << "Scalar shapes cannot be sparse";
-    TF_RET_CHECK(proto.sparse_indices_size() % ShapeUtil::Rank(subshape()) == 0)
+    TF_RET_CHECK(subshape().rank() != 0) << "Scalar shapes cannot be sparse";
+    TF_RET_CHECK(proto.sparse_indices_size() % subshape().rank() == 0)
         << "Unexpected number of indices in proto ("
         << proto.sparse_indices_size() << ") for shape of rank "
-        << ShapeUtil::Rank(subshape());
-    const int64 index_count =
-        proto.sparse_indices_size() / ShapeUtil::Rank(subshape());
+        << subshape().rank();
+    const int64 index_count = proto.sparse_indices_size() / subshape().rank();
     sparse_indices()->Resize(index_count);
 
     // Copy the indices from the proto into the SparseIndexArray object.
@@ -2018,7 +2064,17 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
       for (int64 i = 0; i < complex_data.size(); ++i) {
         complex_data[i] = complex64{proto.c64s(i * 2), proto.c64s(i * 2 + 1)};
       }
-    } break;
+      break;
+    }
+    case C128: {
+      auto complex_data = data<complex128>();
+      TF_RET_CHECK(proto.c128s_size() == complex_data.size() * 2);
+      for (int64 i = 0; i < complex_data.size(); ++i) {
+        complex_data[i] =
+            complex128{proto.c128s(i * 2), proto.c128s(i * 2 + 1)};
+      }
+      break;
+    }
     case TUPLE:
       return InvalidArgument("Should not be called on tuple shapes: %s",
                              ShapeUtil::HumanString(subshape()));
@@ -2064,8 +2120,8 @@ int64 LiteralBase::size_bytes(const ShapeIndex& shape_index) const {
 }
 
 string LiteralBase::GetR1U8AsString() const {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 1);
   CHECK_EQ(shape().element_type(), U8);
   return string(absl::bit_cast<const char*>(data<uint8>().data()),
                 ShapeUtil::ElementsIn(shape()));
@@ -2079,7 +2135,7 @@ void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape,
       << ShapeUtil::HumanString(src_piece->subshape())
       << "dest_piece has shape: "
       << ShapeUtil::HumanString(dest_piece->subshape());
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
       const Shape& subshape = shape.tuple_shapes(i);
 
@@ -2090,7 +2146,7 @@ void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape,
 
       dest_piece->emplace_back(std::move(child_piece));
     }
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     dest_piece->set_buffer(src_piece->buffer());
   } else {
     // If the shape is neither an array nor tuple, then it must be
@@ -2166,7 +2222,7 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr,
     : MutableLiteralBase() {
   shape_ = absl::make_unique<Shape>(shape);
   CHECK(LayoutUtil::HasLayout(*shape_));
-  CHECK(!ShapeUtil::IsTuple(*shape_));
+  CHECK(!shape_->IsTuple());
 
   root_piece_ = new Piece();
   root_piece_->set_buffer(const_cast<char*>(src_buf_ptr));
@@ -2193,14 +2249,14 @@ LiteralSlice::LiteralSlice(const LiteralBase& literal,
     : LiteralBase(), root_piece_(&literal.piece(view_root)) {}
 
 void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
-  CHECK(ShapeUtil::IsTuple(shape));
+  CHECK(shape.IsTuple());
   for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
     const Shape& subshape = shape.tuple_shapes(i);
 
     auto child_piece = Piece();
     child_piece.set_subshape(&subshape);
 
-    if (ShapeUtil::IsTuple(subshape)) {
+    if (subshape.IsTuple()) {
       BuildPieceSubtree(subshape, &child_piece);
     }
 
@@ -2210,7 +2266,7 @@ void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
 
 BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
     : LiteralBase(), shape_(absl::make_unique<Shape>(shape)) {
-  CHECK(ShapeUtil::IsArray(*shape_));
+  CHECK(shape_->IsArray());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = Piece();
@@ -2221,7 +2277,7 @@ BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
 BorrowingLiteral::BorrowingLiteral(absl::Span<const char* const> src_buf_ptrs,
                                    const Shape& shape)
     : LiteralBase(), shape_(absl::make_unique<Shape>(shape)) {
-  CHECK(ShapeUtil::IsTuple(*shape_));
+  CHECK(shape_->IsTuple());
   CHECK(!ShapeUtil::IsNestedTuple(*shape_));
   CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_));
   root_piece_ = Piece();
@@ -2230,7 +2286,7 @@ BorrowingLiteral::BorrowingLiteral(absl::Span<const char* const> src_buf_ptrs,
 
   for (int i = 0; i < src_buf_ptrs.size(); ++i) {
     const auto& src_shape = shape_->tuple_shapes(i);
-    CHECK(ShapeUtil::IsArray(src_shape));
+    CHECK(src_shape.IsArray());
     root_piece_.child(i).set_buffer(const_cast<char*>(src_buf_ptrs[i]));
   }
 }
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index 67e908e7ec4d4346f4e26a99a42aac26928ec0c2..041151fda1280d6ae7b35d5857ca79788d4f7203 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -867,7 +867,7 @@ class BorrowingLiteral : public LiteralBase {
 
 template <typename NativeT>
 absl::Span<const NativeT> LiteralBase::Piece::data() const {
-  DCHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  DCHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   DCHECK_EQ(subshape().element_type(),
             primitive_util::NativeToPrimitiveType<NativeT>())
       << "Attempting to access "
@@ -880,7 +880,7 @@ absl::Span<const NativeT> LiteralBase::Piece::data() const {
 
 template <typename NativeT>
 absl::Span<NativeT> LiteralBase::Piece::data() {
-  DCHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  DCHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   DCHECK_EQ(subshape().element_type(),
             primitive_util::NativeToPrimitiveType<NativeT>())
       << "Attempting to access "
@@ -961,7 +961,7 @@ void MutableLiteralBase::AppendSparseElement(
   Piece& p = piece(shape_index);
   const Shape& subshape = p.subshape();
   CHECK(LayoutUtil::IsSparseArray(subshape));
-  int64 rank = ShapeUtil::Rank(subshape);
+  int64 rank = subshape.rank();
   CHECK_EQ(multi_index.size(), rank);
   int64 last_element = p.sparse_indices()->index_count();
   CHECK_LT(last_element, LayoutUtil::MaxSparseElements(subshape.layout()));
@@ -977,7 +977,7 @@ void LiteralBase::EachCell(
   if (ShapeUtil::IsZeroElementArray(shape())) {
     return;
   }
-  std::vector<int64> indices(ShapeUtil::Rank(shape()), 0);
+  std::vector<int64> indices(shape().rank(), 0);
   do {
     per_cell(indices, Get<NativeT>(indices));
   } while (IndexUtil::BumpIndices(shape(), absl::MakeSpan(indices)));
@@ -985,8 +985,8 @@ void LiteralBase::EachCell(
 
 template <typename NativeT>
 inline void MutableLiteralBase::PopulateR1(absl::Span<const NativeT> values) {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 1);
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
@@ -997,8 +997,8 @@ inline void MutableLiteralBase::PopulateR1(absl::Span<const NativeT> values) {
 template <typename NativeT>
 void MutableLiteralBase::PopulateR2(
     std::initializer_list<std::initializer_list<NativeT>> values) {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 2);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 2);
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
 
@@ -1021,10 +1021,10 @@ void MutableLiteralBase::PopulateR2(
 
 template <typename NativeT>
 void MutableLiteralBase::PopulateFromArray(const Array<NativeT>& values) {
-  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK(shape().IsArray());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
-  CHECK_EQ(ShapeUtil::Rank(shape()), values.num_dimensions());
+  CHECK_EQ(shape().rank(), values.num_dimensions());
   for (int dim = 0; dim < values.num_dimensions(); ++dim) {
     CHECK_EQ(values.dim(dim), shape().dimensions(dim));
   }
@@ -1053,7 +1053,7 @@ void MutableLiteralBase::PopulateSparse(SparseIndexArray indices,
                                         absl::Span<const NativeT> values,
                                         bool sort) {
   CHECK(LayoutUtil::IsSparseArray(shape()));
-  int rank = ShapeUtil::Rank(shape());
+  int rank = shape().rank();
   CHECK_EQ(indices.rank(), rank);
   int64 max_elements = LayoutUtil::MaxSparseElements(shape().layout());
   CHECK_LE(indices.max_indices(), max_elements);
@@ -1077,7 +1077,7 @@ template <typename NativeT, typename FnType>
 Status MutableLiteralBase::PopulateInternal(const FnType& generator,
                                             bool parallel) {
   const Shape& this_shape = shape();
-  const int64 rank = ShapeUtil::Rank(this_shape);
+  const int64 rank = this_shape.rank();
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
   TF_RET_CHECK(this_shape.element_type() ==
                primitive_util::NativeToPrimitiveType<NativeT>());
@@ -1129,7 +1129,7 @@ Status MutableLiteralBase::PopulateParallel(const FnType& generator) {
 
 template <typename NativeT>
 void MutableLiteralBase::PopulateWithValue(NativeT value) {
-  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK(shape().IsArray());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
   for (NativeT& element : data<NativeT>()) {
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index 1ac9a48e805daa86f0dc65b54626195c89241020..9b3de75dd4e9d495778af86fb8fc07909ab4ba81 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -90,6 +90,12 @@ bool CompareEqual<complex64>(complex64 lhs, complex64 rhs,
   return CompareEqual<float>(lhs.real(), rhs.real(), multi_index) &&
          CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
 }
+template <>
+bool CompareEqual<complex128>(complex128 lhs, complex128 rhs,
+                              absl::Span<const int64> multi_index) {
+  return CompareEqual<double>(lhs.real(), rhs.real(), multi_index) &&
+         CompareEqual<double>(lhs.imag(), rhs.imag(), multi_index);
+}
 
 template <typename NativeT, typename UnsignedT>
 Status MakeBitwiseErrorStatus(NativeT lhs, NativeT rhs,
@@ -143,6 +149,14 @@ Status MakeErrorStatus(complex64 lhs, complex64 rhs,
   }
   return MakeErrorStatus(lhs.imag(), rhs.imag(), multi_index);
 }
+template <>
+Status MakeErrorStatus(complex128 lhs, complex128 rhs,
+                       absl::Span<const int64> multi_index) {
+  if (!CompareEqual<double>(lhs.real(), rhs.real(), multi_index)) {
+    return MakeErrorStatus(lhs.real(), rhs.real(), multi_index);
+  }
+  return MakeErrorStatus(lhs.imag(), rhs.imag(), multi_index);
+}
 
 // A recursive function which iterates through every index of expected and
 // actual literal and compares their values elementwise. Returns true if all
@@ -172,53 +186,40 @@ Status Equal(LiteralSlice expected, LiteralSlice actual,
 // Gets the total element count.  For tuples, this is not the count of tuple
 // elements, but the sum of elements of each tuple element.
 int64 RecursiveElementCount(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
     int64 total = 0;
     for (int64 i = 0; i < tuple_elements; ++i) {
       total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
     }
     return total;
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     return ShapeUtil::ElementsIn(shape);
   } else {
     return 0;
   }
 }
 
-// Returns whether the actual and expected values are mismatched with respect to
-// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+// Returns whether the given value is infinity.
 template <typename NativeT>
-bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-  if (relaxed_nans) {
-    return !std::isnan(expected) && std::isnan(actual);
-  } else {
-    return std::isnan(expected) != std::isnan(actual);
-  }
-}
-
-template <>
-bool NanMismatch<complex64>(complex64 expected, complex64 actual,
-                            bool relaxed_nans) {
-  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
+bool IsInf(NativeT val) {
+  return std::isinf(val);
 }
 
 template <>
-bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
-  return NanMismatch<float>(static_cast<float>(expected),
-                            static_cast<float>(actual), relaxed_nans);
+bool IsInf<half>(half val) {
+  return std::isinf(static_cast<float>(val));
 }
 
-// Returns whether the given value is infinity.
+// Returns whether the given value is nan.
 template <typename NativeT>
-bool IsInf(NativeT val) {
-  return std::isinf(val);
+float IsNan(NativeT value) {
+  return std::isnan(value);
 }
 
 template <>
-bool IsInf<half>(half val) {
-  return std::isinf(static_cast<float>(val));
+float IsNan(half value) {
+  return IsNan<float>(static_cast<float>(value));
 }
 
 // Converts the given floating-point value to a string.
@@ -232,6 +233,11 @@ string FpValueToString<complex64>(complex64 value) {
   return absl::StrFormat("%8.4g + %8.4fi", value.real(), value.imag());
 }
 
+template <>
+string FpValueToString<complex128>(complex128 value) {
+  return absl::StrFormat("%8.4g + %8.4fi", value.real(), value.imag());
+}
+
 // Returns the absolute value of the given floating point value. This function
 // is used instead of std::abs directly in order to allow type-dependent
 // implementations for NearComparator.
@@ -311,7 +317,7 @@ class NearComparator {
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
     TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape()));
-    if (!ShapeUtil::IsArray(expected_.shape())) {
+    if (!expected_.shape().IsArray()) {
       return InvalidArgument("Expected array shape; got %s.",
                              ShapeUtil::HumanString(expected_.shape()));
     }
@@ -364,21 +370,39 @@ class NearComparator {
   // the given literal_index and keeps track of various mismatch statistics.
   template <typename T>
   void CompareValues(T expected, T actual, int64 linear_index) {
-    const bool is_nan_mismatch =
-        NanMismatch(expected, actual, error_.relaxed_nans);
     float abs_error;
     float rel_error;
     if (CompareEqual<T>(expected, actual, {linear_index})) {
       abs_error = 0;
       rel_error = 0;
-    } else if (is_nan_mismatch) {
-      num_nan_mismatches_++;
-      // A nan mismatch is considered to have infinite error. rel_error is used
-      // for sorting a std::set of the top mismatchs, and a nan value here will
-      // result in undefined behavior because nan's do not satisfy the strict
-      // weak ordering requirement of std containers.
-      abs_error = std::numeric_limits<float>::infinity();
-      rel_error = std::numeric_limits<float>::infinity();
+    } else if (IsNan(expected) || IsNan(actual)) {
+      if ((!error_.relaxed_nans && IsNan(expected) != IsNan(actual)) ||
+          (error_.relaxed_nans && !IsNan(expected) && IsNan(actual))) {
+        num_nan_mismatches_++;
+        // A nan mismatch is considered to have infinite error. rel_error is
+        // used for sorting a std::set of the top mismatchs, and a nan value
+        // here will result in undefined behavior because nan's do not satisfy
+        // the strict weak ordering requirement of std containers.
+        abs_error = std::numeric_limits<float>::infinity();
+        rel_error = std::numeric_limits<float>::infinity();
+      } else {
+        abs_error = 0;
+        rel_error = 0;
+      }
+    } else if (IsInf(actual) && !IsInf(expected) && error_.fewer_infs_ok) {
+      // `fewer_infs_ok` gives us the option of comparing as though `actual`
+      // were float_max/min rather than inf.
+      T actual_finite = actual > T{0} ? std::numeric_limits<T>::max()
+                                      : std::numeric_limits<T>::lowest();
+      abs_error = FpAbsoluteValue(actual_finite - expected);
+
+      // Avoid division by 0 even though it's well-defined because ubsan can be
+      // configured to treat this as a fatal error.
+      if (expected != T{0}) {
+        rel_error = abs_error / FpAbsoluteValue(expected);
+      } else {
+        rel_error = std::numeric_limits<float>::infinity();
+      }
     } else if (IsInf(expected) || IsInf(actual)) {
       // If either the expected or actual value is infinity but not both,
       // then both absolute and relative error are regarded as inifity.
@@ -387,12 +411,18 @@ class NearComparator {
       rel_error = std::numeric_limits<float>::infinity();
     } else {
       abs_error = FpAbsoluteValue(actual - expected);
-      rel_error = abs_error / FpAbsoluteValue(expected);
+
+      // Avoid division by 0 even though it's well-defined because ubsan can be
+      // configured to treat this as a fatal error.
+      if (expected != T{0}) {
+        rel_error = abs_error / FpAbsoluteValue(expected);
+      } else {
+        rel_error = std::numeric_limits<float>::infinity();
+      }
     }
     const bool is_abs_mismatch = abs_error > error_.abs;
     const bool is_rel_mismatch = rel_error > error_.rel;
-    const bool is_mismatch =
-        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+    const bool is_mismatch = is_abs_mismatch && is_rel_mismatch;
 
     // Update the error of the relative bucket only if the *absolute* error
     // bound is exceeded and vice versa.
@@ -427,7 +457,7 @@ class NearComparator {
     mismatches_.data<bool>()[linear_index] = true;
   }
 
-  // For complex64 types, we compare real and imaginary parts individually.
+  // For complex types, we compare real and imaginary parts individually.
   void CompareValues(complex64 expected, complex64 actual, int64 linear_index) {
     bool mismatch = false;
     CompareValues<float>(expected.real(), actual.real(), linear_index);
@@ -450,6 +480,29 @@ class NearComparator {
     mismatches_.data<bool>()[linear_index] = mismatch;
   }
 
+  void CompareValues(complex128 expected, complex128 actual,
+                     int64 linear_index) {
+    bool mismatch = false;
+    CompareValues<double>(expected.real(), actual.real(), linear_index);
+    if (mismatches_.data<bool>()[linear_index] == true) {
+      mismatch = true;
+      // Delay the mismatch count increase for real part, instead increase
+      // mismatch by 1 for the entire complex number.
+      num_mismatches_--;
+    }
+    CompareValues<double>(expected.imag(), actual.imag(), linear_index);
+    if (mismatches_.data<bool>()[linear_index] == true) {
+      mismatch = true;
+      // Delay the mismatch count increase for imag part, instead increase
+      // mismatch by 1 for the entire complex number.
+      num_mismatches_--;
+    }
+    if (mismatch == true) {
+      num_mismatches_++;
+    }
+    mismatches_.data<bool>()[linear_index] = mismatch;
+  }
+
   // Compares the two literals elementwise.
   void CompareLiterals() {
     // Fast path optimization for the case were layouts match.
@@ -463,7 +516,7 @@ class NearComparator {
       }
       return;
     }
-    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    std::vector<int64> multi_index(actual_.shape().rank(), 0);
     CompareLiteralsSlow(0, &multi_index);
   }
 
@@ -658,6 +711,9 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) {
     case C64:
       result = Equal<complex64>(expected, actual, index, 0);
       break;
+    case C128:
+      result = Equal<complex128>(expected, actual, index, 0);
+      break;
     case TUPLE: {
       for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
         result.Update(EqualHelper(LiteralSlice(expected, {i}),
@@ -680,12 +736,12 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) {
 // via recursion. shape_index is the ShapeIndex of expected (or actual)
 // currently being compared.
 Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
-                  const ErrorSpec& error, bool detailed_message,
+                  const ErrorSpec& error, absl::optional<bool> detailed_message,
                   const MiscompareCallback& miscompare_callback,
                   const ShapeIndex& shape_index) {
   TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
 
-  if (ShapeUtil::IsTuple(expected.shape())) {
+  if (expected.shape().IsTuple()) {
     Status return_status;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
       const auto expected_element = LiteralSlice(expected, {i});
@@ -721,26 +777,32 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
 
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
+    bool use_detailed_message = detailed_message.value_or(
+        ShapeUtil::ElementsIn(expected.shape()) >= 64);
     switch (expected.shape().element_type()) {
       case BF16:
         return NearComparator<bfloat16>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F16:
         return NearComparator<half>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F32:
         return NearComparator<float>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F64:
         return NearComparator<double>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case C64:
         return NearComparator<complex64>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
+        break;
+      case C128:
+        return NearComparator<complex128>::Compare(
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       default:
         LOG(FATAL) << "Unsupported primitive type in near comparator: "
@@ -761,7 +823,7 @@ Status EqualShapes(const Shape& expected, const Shape& actual) {
                            ShapeUtil::HumanString(expected),
                            ShapeUtil::HumanString(actual));
   }
-  if (ShapeUtil::IsTuple(expected)) {
+  if (expected.IsTuple()) {
     if (ShapeUtil::TupleElementCount(expected) !=
         ShapeUtil::TupleElementCount(actual)) {
       return InvalidArgument(
@@ -776,8 +838,8 @@ Status EqualShapes(const Shape& expected, const Shape& actual) {
         return AppendStatus(result, StrCat("mismatch in tuple index", i));
       }
     }
-  } else if (ShapeUtil::IsArray(expected)) {
-    if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) {
+  } else if (expected.IsArray()) {
+    if (expected.rank() != actual.rank()) {
       return InvalidArgument("want rank of %s got rank of %s",
                              ShapeUtil::HumanString(expected),
                              ShapeUtil::HumanString(actual));
@@ -831,7 +893,7 @@ Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) {
 }
 
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
-            const ErrorSpec& error, bool detailed_message,
+            const ErrorSpec& error, absl::optional<bool> detailed_message,
             const MiscompareCallback& miscompare_callback) {
   VLOG(1) << "Expected literal:";
   XLA_VLOG_LINES(1, expected.ToString());
diff --git a/tensorflow/compiler/xla/literal_comparison.h b/tensorflow/compiler/xla/literal_comparison.h
index 9e5bf7c1d062ef0f25d07a80d6ded8106df5dacc..23fff3fa348f1652eaec344da4c40ccf3ad1079a 100644
--- a/tensorflow/compiler/xla/literal_comparison.h
+++ b/tensorflow/compiler/xla/literal_comparison.h
@@ -55,9 +55,10 @@ using MiscompareCallback =
 // being compared.
 //
 // If detailed_message is true, then the error message in the assertion result
-// will contain a more detailed breakdown of mismatches.
+// will contain a more detailed breakdown of mismatches.  By default, we display
+// a detailed message only for "large" inputs.
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
-            const ErrorSpec& error, bool detailed_message,
+            const ErrorSpec& error, absl::optional<bool> detailed_message,
             const MiscompareCallback& miscompare_callback);
 
 // Calling ToString on a literal with over 100 million elements takes around
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index d8c7141cacb8f60cb4ce56d07ac5827a8dbf9b20..b54a71ae68218ef578535a913f5867d843236e32 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -118,6 +118,9 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
   auto c64_lit = LiteralUtil::CreateR0<complex64>({3.14f, 2.78f});
   EXPECT_EQ("c64[] (3.14, 2.78)", c64_lit.ToString());
 
+  auto c128_lit = LiteralUtil::CreateR0<complex128>({3.14f, 2.78f});
+  EXPECT_EQ("c128[] (3.14, 2.78)", c128_lit.ToString());
+
   auto bf16_lit = LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(0.5f));
   EXPECT_EQ("bf16[] 0.5", bf16_lit.ToString());
 
@@ -469,6 +472,21 @@ TEST_F(LiteralUtilTest, C64Equality) {
   EXPECT_NE(vector, vector_reversed);
 }
 
+TEST_F(LiteralUtilTest, C128Equality) {
+  // Test equality with tuples.
+  auto vector = LiteralUtil::CreateR1<complex128>({{1.0, 2.0}, {3.0, 4.0}});
+
+  // Tuple with the same elements. One element is shared with the original
+  // tuple, the other is a clone of the element in the original tuple.
+  auto vector_clone =
+      LiteralUtil::CreateR1<complex128>({{1.0, 2.0}, {3.0, 4.0}});
+  EXPECT_EQ(vector, vector_clone);
+
+  auto vector_reversed =
+      LiteralUtil::CreateR1<complex128>({{3.0, 4.0}, {1.0, 2.0}});
+  EXPECT_NE(vector, vector_reversed);
+}
+
 TEST_F(LiteralUtilTest, IsAllTuple) {
   auto element1 = LiteralUtil::CreateR0<float>(0.0);
   auto element2 = LiteralUtil::CreateR2<float>({{0.0, 0.0}, {0.0, 0.0}});
@@ -623,7 +641,7 @@ template <typename T>
 class LiteralUtilTestTemplated : public ::testing::Test {};
 
 using TestedTypes = ::testing::Types<float, int32, uint32, complex64>;
-TYPED_TEST_CASE(LiteralUtilTestTemplated, TestedTypes);
+TYPED_TEST_SUITE(LiteralUtilTestTemplated, TestedTypes);
 
 TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) {
   // Make a non-integer for floating point types.
@@ -836,6 +854,13 @@ TEST_F(LiteralUtilTest, PopulateR1C64) {
   EXPECT_EQ(output, expected);
 }
 
+TEST_F(LiteralUtilTest, PopulateR1C128) {
+  Literal output(ShapeUtil::MakeShape(C128, {1}));
+  output.PopulateR1<complex128>({{77, 88}});
+  auto expected = LiteralUtil::CreateR1<complex128>({{77, 88}});
+  EXPECT_EQ(output, expected);
+}
+
 TEST_F(LiteralUtilTest, PopulateR2C64) {
   Literal output(ShapeUtil::MakeShape(C64, {2, 2}));
   output.PopulateR2<complex64>({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}});
@@ -897,6 +922,14 @@ TEST_F(LiteralUtilTest, PopulateWithValueR2C64) {
   EXPECT_EQ(output, expected);
 }
 
+TEST_F(LiteralUtilTest, PopulateWithValueR2C128) {
+  Literal output(ShapeUtil::MakeShape(C128, {2, 2}));
+  output.PopulateWithValue<complex128>({4, 2});
+  auto expected =
+      LiteralUtil::CreateR2<complex128>({{{4, 2}, {4, 2}}, {{4, 2}, {4, 2}}});
+  EXPECT_EQ(output, expected);
+}
+
 TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
   Literal output(ShapeUtil::MakeShape(F16, {}));
   half h(0.25f);
@@ -1237,11 +1270,21 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
+  auto s16 = LiteralUtil::CreateR4WithLayout<int16>({{
+    {{10, 0, 12, 0}, {0, 15, 0, 17}},
+    {{0, 19, 0, 21}, {22, 0, 24, 0}},
+    {{26, 0, 28, 0}, {0, 31, 0, 33}},
+  }}, layout_r4_dim0major_);
   auto s32 = LiteralUtil::CreateR4WithLayout<int32>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
+  auto u16 = LiteralUtil::CreateR4WithLayout<uint16>({{
+    {{10, 0, 12, 0}, {0, 15, 0, 17}},
+    {{0, 19, 0, 21}, {22, 0, 24, 0}},
+    {{26, 0, 28, 0}, {0, 31, 0, 33}},
+  }}, layout_r4_dim0major_);
   auto u32 = LiteralUtil::CreateR4WithLayout<uint32>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
@@ -1298,9 +1341,19 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
     {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}},
     {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}},
   }}, layout_r4_dim0major_);
-  // clang-format on
+  auto c128 = LiteralUtil::CreateR4WithLayout<complex128>({{
+    {{10.0, 0.0, 12.0, 0.0}, {0.0, 15.0, 0.0, 17.0}},
+    {{0.0, 19.0, 0.0, 21.0}, {22.0, 0.0, 24.0, 0.0}},
+    {{26.0, 0.0, 28.0, 0.0}, {0.0, 31.0, 0.0, 33.0}},
+  }}, layout_r4_dim0major_);  // clang-format on
   Literal conv;
 
+  conv = s8.Convert(U16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, u16);
+
+  conv = s8.Convert(S16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, s16);
+
   conv = s8.Convert(U32).ConsumeValueOrDie();
   EXPECT_EQ(conv, u32);
 
@@ -1352,12 +1405,26 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   conv = f16.Convert(C64).ConsumeValueOrDie();
   EXPECT_EQ(conv, c64);
 
+  conv = s32.Convert(S16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, s16);
+
+  conv = s32.Convert(U16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, u16);
+
+  conv = s32.Convert(C128).ConsumeValueOrDie();
+  EXPECT_EQ(conv, c128);
+
+  conv = f16.Convert(C128).ConsumeValueOrDie();
+  EXPECT_EQ(conv, c128);
+
   EXPECT_EQ(s32.Convert(TUPLE).status().code(),
             tensorflow::error::UNIMPLEMENTED);
-  EXPECT_EQ(s32.Convert(S16).status().code(), tensorflow::error::UNIMPLEMENTED);
-  EXPECT_EQ(s32.Convert(U16).status().code(), tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(c64.Convert(F32).status().code(), tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(c64.Convert(S32).status().code(), tensorflow::error::UNIMPLEMENTED);
+  EXPECT_EQ(c128.Convert(F32).status().code(),
+            tensorflow::error::UNIMPLEMENTED);
+  EXPECT_EQ(c128.Convert(S32).status().code(),
+            tensorflow::error::UNIMPLEMENTED);
 }
 
 TEST_F(LiteralUtilTest, BitcastConvert) {
@@ -1642,7 +1709,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
       LiteralUtil::MakeTuple({&inner_elements[0], &inner_elements[1]}));
 
   Literal literal = Literal::MoveIntoTuple(absl::MakeSpan(elements));
-  ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape()));
+  ASSERT_TRUE(literal.shape().IsTuple());
   ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 3);
 
   EXPECT_EQ(literal.Get<float>({}, /*shape_index=*/{0}), 1.0);
@@ -1659,7 +1726,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
 
 TEST_F(LiteralUtilTest, MoveIntoEmptyTuple) {
   Literal literal = Literal::MoveIntoTuple({});
-  ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape()));
+  ASSERT_TRUE(literal.shape().IsTuple());
   EXPECT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0);
 }
 
@@ -1719,7 +1786,8 @@ TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) {
 
   Literal tuple = Literal::CreateFromShape(ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F64, {}), ShapeUtil::MakeShape(PRED, {2}),
-       ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {})}));
+       ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {}),
+       ShapeUtil::MakeShape(C128, {})}));
 
   EXPECT_EQ(tuple.Get<double>({}, {0}), 0.0);
   EXPECT_EQ(tuple.Get<bool>({0}, {1}), false);
@@ -1727,6 +1795,7 @@ TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) {
   EXPECT_EQ(tuple.Get<uint64>({0, 0}, {2}), 0);
   EXPECT_EQ(tuple.Get<uint64>({1, 0}, {2}), 0);
   EXPECT_EQ(tuple.Get<complex64>({}, {3}), complex64(0.0f, 0.0f));
+  EXPECT_EQ(tuple.Get<complex128>({}, {4}), complex128(0.0, 0.0));
 }
 
 TEST_F(LiteralUtilTest, ProtoRoundTrip) {
@@ -1736,6 +1805,8 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
   auto vector_int8 = LiteralUtil::CreateR1<int8>({-128, 0, 2, 4, 7, 56, 127});
   auto vector_uint8 = LiteralUtil::CreateR1<uint8>({128, 0, 2, 56, 127, 255});
   auto vector_c64 = LiteralUtil::CreateR1<complex64>({{1.0, 2.0}, {3.0, 4.0}});
+  auto vector_c128 =
+      LiteralUtil::CreateR1<complex128>({{1.0, 2.0}, {3.0, 4.0}});
   auto vector_bfloat16 = LiteralUtil::CreateR1<bfloat16>(
       {bfloat16{-1.0}, bfloat16{2.0}, bfloat16{-3.0}});
   auto vector_half =
@@ -1756,6 +1827,7 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
   EXPECT_EQ(vector_int8, to_from_proto(vector_int8));
   EXPECT_EQ(vector_uint8, to_from_proto(vector_uint8));
   EXPECT_EQ(vector_c64, to_from_proto(vector_c64));
+  EXPECT_EQ(vector_c128, to_from_proto(vector_c128));
   EXPECT_EQ(vector_bfloat16, to_from_proto(vector_bfloat16));
   EXPECT_EQ(matrix_pred, to_from_proto(matrix_pred));
   EXPECT_EQ(tuple, to_from_proto(tuple));
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index bb5e5e61000d0aca6ab052ac87d2fbcd96e55f70..26b029c8d0c52e38510f9279def7c4af2904931d 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -62,7 +62,7 @@ Literal ConvertType(LiteralSlice literal) {
   ShapeUtil::ForEachSubshape(
       literal.shape(),
       [&](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           if (subshape.element_type() ==
               primitive_util::NativeToPrimitiveType<FromNativeT>()) {
             auto src = literal.data<FromNativeT>(shape_index);
@@ -106,12 +106,16 @@ Literal ConvertType(LiteralSlice literal) {
   switch (primitive_type) {
     case U8:
       return LiteralUtil::CreateR0<uint8>(0);
+    case U16:
+      return LiteralUtil::CreateR0<uint16>(0);
     case U32:
       return LiteralUtil::CreateR0<uint32>(0);
     case U64:
       return LiteralUtil::CreateR0<uint64>(0);
     case S8:
       return LiteralUtil::CreateR0<int8>(0);
+    case S16:
+      return LiteralUtil::CreateR0<int16>(0);
     case S32:
       return LiteralUtil::CreateR0<int32>(0);
     case S64:
@@ -126,11 +130,10 @@ Literal ConvertType(LiteralSlice literal) {
       return LiteralUtil::CreateR0<double>(0);
     case C64:
       return LiteralUtil::CreateR0<complex64>(0);
+    case C128:
+      return LiteralUtil::CreateR0<complex128>(0);
     case PRED:
       return LiteralUtil::CreateR0<bool>(false);
-    case S16:
-    case U16:
-      LOG(FATAL) << "u16/s16 literals not yet implemented";
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 0";
     case OPAQUE:
@@ -164,6 +167,8 @@ Literal ConvertType(LiteralSlice literal) {
       return LiteralUtil::CreateR0<double>(1);
     case C64:
       return LiteralUtil::CreateR0<complex64>(1);
+    case C128:
+      return LiteralUtil::CreateR0<complex128>(1);
     case PRED:
       return LiteralUtil::CreateR0<bool>(true);
     case S16:
@@ -200,6 +205,8 @@ Literal ConvertType(LiteralSlice literal) {
           -std::numeric_limits<double>::infinity());
     case C64:
       LOG(FATAL) << "C64 element type has no minimum value";
+    case C128:
+      LOG(FATAL) << "C128 element type has no minimum value";
     case PRED:
       return LiteralUtil::CreateR0<bool>(false);
     case S16:
@@ -344,6 +351,10 @@ Literal ConvertType(LiteralSlice literal) {
         new_literal.Set<complex64>(to_multi_index,
                                    literal.Get<complex64>(from_multi_index));
         break;
+      case C128:
+        new_literal.Set<complex128>(to_multi_index,
+                                    literal.Get<complex128>(from_multi_index));
+        break;
       default:
         LOG(FATAL) << "Unhandled primitive element type: "
                    << PrimitiveType_Name(literal.shape().element_type());
@@ -355,7 +366,7 @@ Literal ConvertType(LiteralSlice literal) {
 
 /* static */ Literal LiteralUtil::GetFirstScalarLiteral(
     const LiteralSlice& literal) {
-  CHECK(ShapeUtil::IsArray(literal.shape()));
+  CHECK(literal.shape().IsArray());
   CHECK_GT(ShapeUtil::ElementsIn(literal.shape()), 0);
   switch (literal.shape().element_type()) {
     case PRED:
@@ -392,6 +403,10 @@ Literal ConvertType(LiteralSlice literal) {
       return LiteralUtil::CreateR0<int64>(literal.GetFirstElement<int64>());
     case U64:
       return LiteralUtil::CreateR0<uint64>(literal.GetFirstElement<uint64>());
+
+    case C128:
+      return LiteralUtil::CreateR0<complex128>(
+          literal.GetFirstElement<complex128>());
     default:
       LOG(FATAL) << "Unhandled primitive type "
                  << literal.shape().element_type();
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index 4eab4fa4290c270697c00be20840cf4e85459183..bad65ac32018fafcc7634b989f1b4b0867aa5c0d 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/metric_table_report.h"
 
-#include <cctype>
 #include <unordered_map>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/core/platform/logging.h"
@@ -55,7 +55,7 @@ string MetricTableReport::MakeReport(double expected_metric_sum) {
   const auto metric_greater = [](const Entry& a, const Entry& b) {
     return a.metric > b.metric;
   };
-  std::sort(entries_.begin(), entries_.end(), metric_greater);
+  absl::c_sort(entries_, metric_greater);
 
   // Create the report
   AppendLine();
@@ -117,7 +117,7 @@ std::vector<MetricTableReport::Category> MetricTableReport::MakeCategories(
   auto metric_sum_greater = [](const Category& a, const Category& b) {
     return a.metric_sum > b.metric_sum;
   };
-  std::sort(categories.begin(), categories.end(), metric_sum_greater);
+  absl::c_sort(categories, metric_sum_greater);
 
   return categories;
 }
@@ -249,7 +249,7 @@ string MetricTableReport::MetricString(double metric) {
   string output;
   // Copy leading non-digit characters unconditionally.
   // This picks up the leading sign.
-  while (!sp1.empty() && !isdigit(sp1[0])) {
+  while (!sp1.empty() && !absl::ascii_isdigit(sp1[0])) {
     output.push_back(sp1[0]);
     sp1.remove_prefix(1);
   }
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
index 5b568888d14f21c1330556d017eafba6c8dd2228..91e71f5d1d02d135158d0dffc140c21cf8ea5e3a 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
@@ -37,7 +38,7 @@ limitations under the License.
 
 namespace xla {
 
-static const char kWS[] = " \t\r\n";           // whitespace
+static const char kWS[] = " \t\r\n";  // whitespace
 
 // The following struct represents an argv[]-style array, parsed
 // from data gleaned from the environment.
@@ -104,7 +105,8 @@ static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
     // Set e to the index just past the end of the flag.
     size_t e = b;
     while (e != flag_str.size() && isascii(flag_str[e]) &&
-           (strchr("-_", flag_str[e]) != nullptr || isalnum(flag_str[e]))) {
+           (strchr("-_", flag_str[e]) != nullptr ||
+            absl::ascii_isalnum(flag_str[e]))) {
       e++;
     }
     if (e != flag_str.size() && flag_str[e] == '=' &&
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index 00ad01fc407017624a9183d69e61cb0d382e3f11..1eedddf72c1d393cb1b88e589881e24de02ad802 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -18,16 +18,32 @@ limitations under the License.
 #include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 namespace primitive_util {
 
+int SignificandWidth(PrimitiveType type) {
+  switch (type) {
+    case F32:
+      return std::numeric_limits<float>::digits;
+    case F64:
+      return std::numeric_limits<double>::digits;
+    case BF16:
+      return kBFloat16MantissaBits + 1;
+    case F16:
+      return 11;
+    default:
+      LOG(FATAL) << "Not a floating data type " << type;
+  }
+}
+
 bool IsFloatingPointType(PrimitiveType type) {
   return type == F16 || type == F32 || type == F64 || type == BF16;
 }
 
-bool IsComplexType(PrimitiveType type) { return type == C64; }
+bool IsComplexType(PrimitiveType type) { return type == C64 || type == C128; }
 
 bool IsSignedIntegralType(PrimitiveType type) {
   return type == S8 || type == S16 || type == S32 || type == S64;
@@ -67,6 +83,9 @@ int BitWidth(PrimitiveType type) {
     case C64:
       return 64;
 
+    case C128:
+      return 128;
+
     case TUPLE:
       LOG(FATAL) << "TUPLE is an invalid type for BitWidth";
 
@@ -78,10 +97,27 @@ int BitWidth(PrimitiveType type) {
   }
 }
 
+xla::PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth) {
+  switch (src_bitwidth) {
+    case 8:
+      return xla::U8;
+    case 16:
+      return xla::U16;
+    case 32:
+      return xla::U32;
+    case 64:
+      return xla::U64;
+    default:
+      return xla::PRIMITIVE_TYPE_INVALID;
+  }
+}
+
 PrimitiveType ComplexComponentType(PrimitiveType complex_type) {
   switch (complex_type) {
     case C64:
       return F32;
+    case C128:
+      return F64;
     default:
       LOG(FATAL) << "Primitive type is not complex: "
                  << PrimitiveType_Name(complex_type);
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 70603b6fed1be50c427799e6dce7b8bf9631a6f4..295d353003276b4c1731f7d6a378fd1ae0288d3c 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -29,6 +29,10 @@ limitations under the License.
 namespace xla {
 namespace primitive_util {
 
+// Returns the count of significand (mantissa) bits for float datatypes.
+// For non-float datatypes, results in a LOG(FATAL).
+int SignificandWidth(PrimitiveType type);
+
 // The number of exponent bits in a BF16 value.
 const int kBFloat16ExponentBits = 8;
 
@@ -126,6 +130,11 @@ inline PrimitiveType NativeToPrimitiveType<complex64>() {
   return C64;
 }
 
+template <>
+inline PrimitiveType NativeToPrimitiveType<complex128>() {
+  return C128;
+}
+
 bool IsFloatingPointType(PrimitiveType type);
 
 bool IsComplexType(PrimitiveType type);
@@ -142,6 +151,8 @@ bool IsArrayType(PrimitiveType primitive_type);
 // Returns the number of bits in the representation for a given type.
 int BitWidth(PrimitiveType type);
 
+PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth);
+
 // Returns the real, imag component type underlying the given complex type.
 // LOG(FATAL)'s if complex_type is not complex.
 PrimitiveType ComplexComponentType(PrimitiveType complex_type);
@@ -225,6 +236,11 @@ struct PrimitiveTypeToNative<C64> {
   using type = complex64;
 };
 
+template <>
+struct PrimitiveTypeToNative<C128> {
+  using type = complex128;
+};
+
 // Returns the lower-case name of the given primitive type.
 const string& LowercasePrimitiveTypeName(PrimitiveType s);
 
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index ddffafa9017a565f01c3214360a958e6840e9148..4afb21d5c8864c2974114af2de08df4106a13a8c 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -3,8 +3,8 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_python_default_plugins")
 
 py_library(
     name = "xla_client",
@@ -98,6 +98,11 @@ tf_py_wrap_cc(
         "local_computation_builder.i",
         "//tensorflow/python:platform/base.i",
     ],
+    version_script = select({
+        "//tensorflow:darwin": "pywrap_xla_exported_symbols.lds",
+        "//tensorflow:windows": None,
+        "//conditions:default": "pywrap_xla_version_script.lds",
+    }),
     deps = [
         ":local_computation_builder",
         ":numpy_bridge",
@@ -105,7 +110,5 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:cpu_plugin",
-    ] + if_cuda_is_configured([
-        "//tensorflow/compiler/xla/service:gpu_plugin",
-    ]),
+    ] + xla_python_default_plugins(),
 )
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 657a09f92ad14d959416c768b09c392ff17f96eb..0e898d494e044509a41209891c28d929dff11b9a 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -102,7 +102,7 @@ int GetReplicaCount() {
   return g_replica_count;
 }
 
-LocalClient* GetOrCreateLocalClient() {
+StatusOr<LocalClient*> GetOrCreateLocalClient() {
   string* platform_name = GetPlatformNameString();
   tensorflow::mutex_lock lock(g_local_client_mutex);
   if (g_local_client != nullptr) {
@@ -111,7 +111,8 @@ LocalClient* GetOrCreateLocalClient() {
   LocalClientOptions options;
   options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie());
   options.set_number_of_replicas(g_replica_count);
-  g_local_client = ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie();
+  TF_ASSIGN_OR_RETURN(g_local_client,
+                      ClientLibrary::GetOrCreateLocalClient(options));
   CHECK(g_local_client != nullptr);
   return g_local_client;
 }
@@ -133,7 +134,7 @@ Status RegisterCpuCustomCallTarget(const string& fn_name, PyObject* capsule) {
 Status TransferToInfeedLocal(const Literal& literal) {
   VLOG(1) << "Infeeding literal without replica number; shape: "
           << literal.shape();
-  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   return client->TransferToInfeedLocal(literal, /*device_ordinal=*/0);
 }
 
@@ -141,7 +142,7 @@ Status TransferToInfeedLocalReplica(const Literal& literal,
                                     int replica_number) {
   VLOG(1) << "Infeeding shape " << literal.shape()
           << " to replica number: " << replica_number;
-  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   TF_ASSIGN_OR_RETURN(int device_ordinal,
                       client->ReplicaNumberToDeviceOrdinal(replica_number));
   return client->TransferToInfeedLocal(literal, device_ordinal);
@@ -151,7 +152,7 @@ StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
                                                   int replica_number) {
   VLOG(1) << "Outfeeding literal from replica number: " << replica_number
           << " shape: " << shape;
-  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   TF_ASSIGN_OR_RETURN(int device_ordinal,
                       client->ReplicaNumberToDeviceOrdinal(replica_number));
   return client->TransferFromOutfeedLocal(shape, device_ordinal);
@@ -168,7 +169,7 @@ static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
     const Literal& argument, const absl::optional<Shape>& shape_with_layout,
     int replica_number) {
-  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   TF_ASSIGN_OR_RETURN(int device_ordinal,
                       client->ReplicaNumberToDeviceOrdinal(replica_number));
   VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
@@ -198,7 +199,7 @@ const Shape& LocalShapedBuffer::shape() const {
 }
 
 StatusOr<Literal> LocalShapedBuffer::ToLiteral() const {
-  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   return client->ShapedBufferToLiteral(*shaped_buffer());
 }
 
@@ -333,37 +334,34 @@ CompiledLocalComputation::CompiledLocalComputation(
 
 StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
     absl::Span<LocalShapedBuffer* const> argument_handles) {
-  LocalClient* client = GetOrCreateLocalClient();
-  StatusOr<int> device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0);
+  if (num_replicas() != 1) {
+    return InvalidArgument(
+        "Attempted to execute computation with %d replicas using Execute()",
+        num_replicas());
+  }
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
+  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
+                      client->backend().computation_placer()->AssignDevices(
+                          1, /*computation_count=*/1));
   StatusOr<ScopedShapedBuffer> result_buffer_status;
-  if (!device_ordinal_status.ok()) {
-    result_buffer_status = device_ordinal_status.status();
-  } else {
-    const int device_ordinal = device_ordinal_status.ValueOrDie();
-    VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
-            << device_ordinal;
-
-    std::vector<const ShapedBuffer*> argument_buffers;
-    argument_buffers.reserve(argument_handles.size());
-    for (auto& handle : argument_handles) {
-      argument_buffers.push_back(handle->shaped_buffer());
-    }
-
-    DeviceAssignment device_assignment =
-        client->backend()
-            .computation_placer()
-            ->AssignDevices(1, /*computation_count=*/1)
-            .ConsumeValueOrDie();
+  const int device_ordinal = device_assignment(0, 0);
+  VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
+          << device_ordinal;
+
+  std::vector<const ShapedBuffer*> argument_buffers;
+  argument_buffers.reserve(argument_handles.size());
+  for (auto& handle : argument_handles) {
+    argument_buffers.push_back(handle->shaped_buffer());
+  }
 
-    ExecutableRunOptions options;
-    options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client->backend().memory_allocator());
-    options.set_intra_op_thread_pool(
-        client->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment);
+  ExecutableRunOptions options;
+  options.set_device_ordinal(device_ordinal);
+  options.set_allocator(client->backend().memory_allocator());
+  options.set_intra_op_thread_pool(
+      client->backend().eigen_intra_op_thread_pool_device());
+  options.set_device_assignment(&device_assignment);
 
-    result_buffer_status = executable_->Run(argument_buffers, options);
-  }
+  result_buffer_status = executable_->Run(argument_buffers, options);
 
   if (!result_buffer_status.ok()) {
     return InternalError(
@@ -376,29 +374,30 @@ StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
 
 StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
     absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
-  LocalClient* client = GetOrCreateLocalClient();
-  const int num_replicas = GetReplicaCount();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
+  const int num_devices = client->device_count();
 
-  if (argument_handles.size() != num_replicas) {
+  if (argument_handles.size() != num_replicas()) {
     return InvalidArgument(
         "Attempted to execute with %d replicas when replica count is %d",
-        argument_handles.size(), num_replicas);
+        argument_handles.size(), num_devices);
   }
+  if (argument_handles.size() > num_devices) {
+    return InvalidArgument(
+        "Attempted to execute with %d replicas when device count is %d",
+        argument_handles.size(), num_devices);
+  }
+
+  VLOG(1) << "Executing with " << num_replicas() << " replicas.";
 
-  VLOG(1) << "Executing with " << num_replicas << " replicas.";
+  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
+                      client->backend().computation_placer()->AssignDevices(
+                          num_replicas(), /*computation_count=*/1));
 
-  // Each replica populates a StatusOr result, but only the output value of
-  // replica zero is returned.
-  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas);
-  auto execute = [this, client, num_replicas, &argument_handles,
+  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas());
+  auto execute = [this, client, &device_assignment, &argument_handles,
                   &results](int replica) {
-    StatusOr<int> device_ordinal_status =
-        client->ReplicaNumberToDeviceOrdinal(replica);
-    if (!device_ordinal_status.ok()) {
-      results[replica] = device_ordinal_status.status();
-      return;
-    }
-    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    const int device_ordinal = device_assignment(replica, 0);
     VLOG(3) << "Replica " << replica
             << " mapped to device ordinal for execution: " << device_ordinal;
 
@@ -408,12 +407,6 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
       argument_buffers.push_back(handle->shaped_buffer());
     }
 
-    DeviceAssignment device_assignment =
-        client->backend()
-            .computation_placer()
-            ->AssignDevices(num_replicas, /*computation_count=*/1)
-            .ConsumeValueOrDie();
-
     ExecutableRunOptions options;
     options.set_device_ordinal(device_ordinal);
     options.set_allocator(client->backend().memory_allocator());
@@ -426,23 +419,23 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
     results[replica] = std::move(result_buffer_status);
   };
 
-  if (num_replicas == 1) {
+  if (num_replicas() == 1) {
     // Fast-path if there is only one replica — run the computation on the
     // current thread.
     execute(0);
   } else {
     // TODO(phawkins): don't recreate the threadpool for each execution.
     tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
-                                        num_replicas - 1);
+                                        num_replicas() - 1);
 
-    for (int replica = 0; replica < num_replicas - 1; ++replica) {
+    for (int replica = 0; replica < num_replicas() - 1; ++replica) {
       pool.Schedule([&execute, replica] { execute(replica); });
     }
-    execute(num_replicas - 1);
+    execute(num_replicas() - 1);
   }
 
-  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas);
-  for (int replica = 0; replica < num_replicas; ++replica) {
+  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas());
+  for (int replica = 0; replica < num_replicas(); ++replica) {
     auto& statusor = results[replica];
     if (!statusor.ok()) {
       return InternalError(
@@ -549,7 +542,7 @@ StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
     argument_shape_pointers.push_back(&argument_shape);
   }
 
-  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   ExecutableBuildOptions options;
   if (build_options != nullptr) {
     options = *build_options;
@@ -698,8 +691,9 @@ LocalOp LocalComputationBuilder::Collapse(const LocalOp& operand,
   return xla::Collapse(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::CrossReplicaSum(const LocalOp& operand) {
-  return xla::CrossReplicaSum(operand.op());
+LocalOp LocalComputationBuilder::CrossReplicaSum(
+    const LocalOp& operand, absl::Span<const ReplicaGroup> replica_groups) {
+  return xla::CrossReplicaSum(operand.op(), replica_groups);
 }
 
 LocalOp LocalComputationBuilder::Slice(const LocalOp& operand,
@@ -927,6 +921,22 @@ LocalOp LocalComputationBuilder::TriangularSolve(const LocalOp& a,
                               conjugate_a);
 }
 
+LocalOp LocalComputationBuilder::Gather(
+    const LocalOp& input, const LocalOp& start_indices,
+    const GatherDimensionNumbers& dimension_numbers,
+    absl::Span<const int64> slice_sizes) {
+  return xla::Gather(input.op(), start_indices.op(), dimension_numbers,
+                     slice_sizes);
+}
+
+LocalOp LocalComputationBuilder::Scatter(
+    const LocalOp& input, const LocalOp& scatter_indices,
+    const LocalOp& updates, const LocalComputation& update_computation,
+    const ScatterDimensionNumbers& dimension_numbers) {
+  return xla::Scatter(input.op(), scatter_indices.op(), updates.op(),
+                      update_computation.computation(), dimension_numbers);
+}
+
 StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
@@ -1041,7 +1051,7 @@ StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
     LocalShapedBuffer* local_shaped_buffer) {
   const Shape tuple_shape = local_shaped_buffer->shape();
 
-  if (!ShapeUtil::IsTuple(tuple_shape)) {
+  if (!tuple_shape.IsTuple()) {
     return InvalidArgument(
         "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
         "shape; shape: %s",
@@ -1088,7 +1098,7 @@ StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
     XrtAllocation* allocation, const string& session_target) {
   const Shape& tuple_shape = allocation->shape();
 
-  if (!ShapeUtil::IsTuple(tuple_shape)) {
+  if (!tuple_shape.IsTuple()) {
     return InvalidArgument(
         "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
         "shape; shape: %s",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 5e8341592100bc1eba4d1c17b0c2dd0e0888fdb1..6170567f9ff8f5a062f47d148900fe3676a74542 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -180,6 +180,10 @@ class CompiledLocalComputation {
  public:
   CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
 
+  int num_replicas() const {
+    return executable_->build_options().num_replicas();
+  }
+
   StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
 
@@ -312,7 +316,8 @@ class LocalComputationBuilder {
 
   LocalOp Collapse(const LocalOp& operand, absl::Span<const int64> dimensions);
 
-  LocalOp CrossReplicaSum(const LocalOp& operand);
+  LocalOp CrossReplicaSum(const LocalOp& operand,
+                          absl::Span<const ReplicaGroup> replica_groups);
 
   LocalOp Slice(const LocalOp& operand, absl::Span<const int64> start_indices,
                 absl::Span<const int64> limit_indices,
@@ -418,6 +423,15 @@ class LocalComputationBuilder {
   LocalOp TriangularSolve(const LocalOp& a, const LocalOp& b, bool left_side,
                           bool lower, bool transpose_a, bool conjugate_a);
 
+  LocalOp Gather(const LocalOp& input, const LocalOp& start_indices,
+                 const GatherDimensionNumbers& dimension_numbers,
+                 absl::Span<const int64> slice_sizes);
+
+  LocalOp Scatter(const LocalOp& input, const LocalOp& scatter_indices,
+                  const LocalOp& updates,
+                  const LocalComputation& update_computation,
+                  const ScatterDimensionNumbers& dimension_numbers);
+
   StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index bf5d667c6a12972845735983a74264ea05675971..6a85ed62dea3dbdbb25a990e6d774a0152439673 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -34,6 +34,9 @@ limitations under the License.
 //  PaddingConfig proto                <-  corresponding Python proto
 //  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
 //  DotDimensionNumbers proto          <-  corresponding Python proto
+//  GatherDimensionNumbers proto       <-  corresponding Python proto
+//  ScatterDimensionNumbers proto      <-  corresponding Python proto
+//  Span<ReplicaGroup proto>           <-  sequence of ReplicaGroup Python proto
 //
 // Arrows indicate whether a conversion only ever occurs in one
 // direction, or whether it is maintained bidirectionally.
@@ -167,8 +170,41 @@ bool HandleStringAttribute(PyObject* o,
   return true;  // Handled string attribute, ok!
 }
 
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
+  PyObject* seq = PyObject_GetAttrString(o, attr_name);
+  if (!seq) {
+    return false;
+  }
+
+  int length = PySequence_Size(seq);
+  if (length == -1) {
+    Py_DECREF(seq);
+    return false;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(seq, i);
+    if (!item) {
+      Py_DECREF(seq);
+      return false;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(seq);
+      return false;
+    }
+    *field->Add() = dimension;
+    Py_DECREF(item);
+  }
+  Py_DECREF(seq);
+  return true;
 }
-}
+
+}  // namespace swig
+}  // namespace xla
 %}
 
 // Required to use PyArray_* functions.
@@ -657,128 +693,27 @@ tensorflow::ImportNumpy();
 
 %typemap(in) const DotDimensionNumbers&
     (DotDimensionNumbers dimension_numbers) {
-  int length;
-
-  /* lhs_contracting_dimensions */
-  PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
-      $input, "lhs_contracting_dimensions");
-  if (!lhs_contracting_dimensions) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_contracting_dimensions",
+        dimension_numbers.mutable_lhs_contracting_dimensions())) {
     SWIG_fail;
   }
-
-  length = PySequence_Size(lhs_contracting_dimensions);
-  if (length == -1) {
-    Py_DECREF(lhs_contracting_dimensions);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_contracting_dimensions",
+        dimension_numbers.mutable_rhs_contracting_dimensions())) {
     SWIG_fail;
   }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
-    if (!item) {
-      Py_DECREF(lhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(lhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_lhs_contracting_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(lhs_contracting_dimensions);
-
-  /* rhs_contracting_dimensions */
-  PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
-      $input, "rhs_contracting_dimensions");
-  if (!lhs_contracting_dimensions) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_batch_dimensions",
+        dimension_numbers.mutable_lhs_batch_dimensions())) {
     SWIG_fail;
   }
-
-  length = PySequence_Size(rhs_contracting_dimensions);
-  if (length == -1) {
-    Py_DECREF(rhs_contracting_dimensions);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_batch_dimensions",
+        dimension_numbers.mutable_rhs_batch_dimensions())) {
     SWIG_fail;
   }
 
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
-    if (!item) {
-      Py_DECREF(rhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(rhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_rhs_contracting_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(rhs_contracting_dimensions);
-
-  /* lhs_batch_dimensions */
-  PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
-      $input, "lhs_batch_dimensions");
-  if (!lhs_batch_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(lhs_batch_dimensions);
-  if (length == -1) {
-    Py_DECREF(lhs_batch_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
-    if (!item) {
-      Py_DECREF(lhs_batch_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(lhs_batch_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_lhs_batch_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(lhs_batch_dimensions);
-
-  /* rhs_batch_dimensions */
-  PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
-      $input, "rhs_batch_dimensions");
-  if (!rhs_batch_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(rhs_batch_dimensions);
-  if (length == -1) {
-    Py_DECREF(rhs_batch_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
-    if (!item) {
-      Py_DECREF(rhs_batch_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(rhs_batch_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_rhs_batch_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(rhs_batch_dimensions);
-
   $1 = &dimension_numbers;
 }
 
@@ -860,90 +795,108 @@ tensorflow::ImportNumpy();
   }
   dimension_numbers.set_kernel_input_feature_dimension(value);
 
-  PyObject* o;
-  int length;
-
-  o = PyObject_GetAttrString($input, "input_spatial_dimensions");
-  if (!o) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "input_spatial_dimensions",
+        dimension_numbers.mutable_input_spatial_dimensions())) {
     SWIG_fail;
   }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "kernel_spatial_dimensions",
+        dimension_numbers.mutable_kernel_spatial_dimensions())) {
     SWIG_fail;
   }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_input_spatial_dimensions(dimension);
-    Py_DECREF(item);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "output_spatial_dimensions",
+        dimension_numbers.mutable_output_spatial_dimensions())) {
+    SWIG_fail;
   }
-  Py_DECREF(o);
 
-  o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
-  if (!o) {
+  $1 = &dimension_numbers;
+}
+
+// GatherDimensionNumbers
+
+%typemap(in) const GatherDimensionNumbers&
+    (GatherDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "offset_dims",
+        dimension_numbers.mutable_offset_dims())) {
     SWIG_fail;
   }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "collapsed_slice_dims",
+        dimension_numbers.mutable_collapsed_slice_dims())) {
     SWIG_fail;
   }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_kernel_spatial_dimensions(dimension);
-    Py_DECREF(item);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "start_index_map",
+        dimension_numbers.mutable_start_index_map())) {
+    SWIG_fail;
   }
-  Py_DECREF(o);
 
-  o = PyObject_GetAttrString($input, "output_spatial_dimensions");
-  if (!o) {
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
     SWIG_fail;
   }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// ScatterDimensionNumbers
+
+%typemap(in) const ScatterDimensionNumbers&
+    (ScatterDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "update_window_dims",
+        dimension_numbers.mutable_update_window_dims())) {
     SWIG_fail;
   }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_output_spatial_dimensions(dimension);
-    Py_DECREF(item);
+  if (!HandleRepeatedInt64Attribute(
+        $input, "inserted_window_dims",
+        dimension_numbers.mutable_inserted_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "scatter_dims_to_operand_dims",
+        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
   }
-  Py_DECREF(o);
+  dimension_numbers.set_index_vector_dim(value);
 
   $1 = &dimension_numbers;
 }
 
+// Span<const ReplicaGroup>
+
+%typemap(in) absl::Span<const ReplicaGroup >
+    (std::vector<ReplicaGroup > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    ReplicaGroup rgrp;
+    if (!HandleRepeatedInt64Attribute(
+            o, "replica_ids",
+            rgrp.mutable_replica_ids())) {
+        SWIG_fail;
+    }
+    temps.push_back(rgrp);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+
 // ExecutableBuildOptions
 
 %typemap(in) const ExecutableBuildOptions*
@@ -1000,6 +953,12 @@ tensorflow::ImportNumpy();
     }
     Py_DECREF(o);
 
+    int64 num_replicas;
+    if (!GetIntAttr($input, "num_replicas", &num_replicas)) {
+      SWIG_fail;
+    }
+    build_options.set_num_replicas(num_replicas);
+
     $1 = &build_options;
   }
 }
@@ -1151,6 +1110,8 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::QR;
 %unignore xla::swig::LocalComputationBuilder::TriangularSolve;
 %unignore xla::swig::LocalComputationBuilder::CustomCall;
+%unignore xla::swig::LocalComputationBuilder::Gather;
+%unignore xla::swig::LocalComputationBuilder::Scatter;
 %unignore xla::swig::DeleteLocalComputation;
 %unignore xla::swig::DestructureLocalShapedBufferTuple;
 %unignore xla::swig::DestructureXrtAllocationTuple;
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index b0aa024c7474cf8e6934432b2f364be464714999..52c5c621f7294c5da341879d15b77559fe870551 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -54,6 +54,8 @@ int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
       return NPY_FLOAT64;
     case C64:
       return NPY_COMPLEX64;
+    case C128:
+      return NPY_COMPLEX128;
     case TUPLE:
       return NPY_OBJECT;
     default:
@@ -89,6 +91,8 @@ PrimitiveType NumpyTypeToPrimitiveType(int np_type) {
       return F64;
     case NPY_COMPLEX64:
       return C64;
+    case NPY_COMPLEX128:
+      return C128;
     case NPY_OBJECT:
       return TUPLE;
     default:
@@ -111,6 +115,7 @@ bool NumpyTypeIsValid(int np_type) {
     case NPY_FLOAT32:
     case NPY_FLOAT64:
     case NPY_COMPLEX64:
+    case NPY_COMPLEX128:
     case NPY_OBJECT:
       return true;
     default:
@@ -123,7 +128,7 @@ PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
   PyArray_Descr* np_dtype = PyArray_DescrFromType(np_typenum);
 
   PyObject* dimensions;
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(shape);
     dimensions = PyTuple_New(ShapeUtil::TupleElementCount(shape));
     for (int i = 0; i < num_elements; ++i) {
@@ -132,7 +137,7 @@ PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
           PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i)));
     }
   } else {
-    int rank = ShapeUtil::Rank(shape);
+    int rank = shape.rank();
     dimensions = PyTuple_New(rank);
     for (int i = 0; i < rank; ++i) {
       PyTuple_SET_ITEM(dimensions, i,
@@ -345,7 +350,7 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
 }
 
 PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
-  if (ShapeUtil::IsTuple(literal.shape())) {
+  if (literal.shape().IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(literal.shape());
     PyObject* tuple = PyTuple_New(num_elements);
     for (int i = 0; i < num_elements; i++) {
@@ -354,7 +359,7 @@ PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
     }
     return tuple;
   } else {
-    int rank = ShapeUtil::Rank(literal.shape());
+    int rank = literal.shape().rank();
     std::vector<long> dimensions(rank);  // NOLINT - PyArray requires a long*
     for (int i = 0; i < rank; i++) {
       dimensions[i] = ShapeUtil::GetDimension(literal.shape(), i);
@@ -430,6 +435,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_COMPLEX64:
       CopyNumpyArrayToLiteral<complex64>(py_array, literal);
       break;
+    case NPY_COMPLEX128:
+      CopyNumpyArrayToLiteral<complex128>(py_array, literal);
+      break;
     default:
       return InvalidArgument(
           "No XLA literal container for Numpy type number: %d", np_type);
@@ -470,6 +478,9 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
     case NPY_COMPLEX64:
       CopyLiteralToNumpyArray<complex64>(literal, py_array);
       break;
+    case NPY_COMPLEX128:
+      CopyLiteralToNumpyArray<complex128>(literal, py_array);
+      break;
     default:
       LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
   }
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..bce6c1acf8a1cc0005ca93e0466c5a0e29d880de
--- /dev/null
+++ b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
@@ -0,0 +1 @@
+_PyInit__pywrap_xla
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds b/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..d31cfce7be7b6accf05ef77f3485904099965afc
--- /dev/null
+++ b/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds
@@ -0,0 +1,6 @@
+xla {
+  global:
+    PyInit_*;
+  local:
+    *;
+};
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 378bbdcb175f10d73da87f5286cf5129477a124c..8964b158292371d662368cfb0b644667985f719e 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -199,6 +199,7 @@ XLA_ELEMENT_TYPE_TO_DTYPE = {
     xla_data_pb2.F32: np.dtype('float32'),
     xla_data_pb2.F64: np.dtype('float64'),
     xla_data_pb2.C64: np.dtype('complex64'),
+    xla_data_pb2.C128: np.dtype('complex128'),
     xla_data_pb2.TUPLE: np.dtype(np.object),
 }
 
@@ -458,6 +459,7 @@ class CompileOptions(object):
     self.dump_unoptimized_hlo_proto_to = None
     self.dump_per_pass_hlo_proto_to = None
     self.hlo_profile = False
+    self.num_replicas = get_replica_count()
 
 
 def transfer_to_infeed(value, replica_number=None):
@@ -963,16 +965,30 @@ class ComputationBuilder(object):
       dimensions = tuple(range(ndim))
     return self._client.Reshape(operand, dimensions, new_sizes)
 
-  def CrossReplicaSum(self, operand):
+  def CrossReplicaSum(self, operand, replica_groups=None):
     """CrossReplicaSum op.
 
     Args:
       operand: the operand to sum across replica instances.
+      replica_groups: optional, list of lists of ints encoding a partition of
+        the set {0, 1, ..., num_replicas} into equally-sized replica groups
+        within which the cross-replica sum is performed. If not supplied or None
+        (the default), all replicas belong to the same group.
 
     Returns:
-      A LocalOp that has the sum of the value among all replicas.
+      A LocalOp that represents on each replica the sum of its group's values.
     """
-    return self._client.CrossReplicaSum(operand)
+
+    def make_proto(replica_group):
+      replica_group_proto = xla_data_pb2.ReplicaGroup()
+      replica_group_proto.replica_ids.extend(replica_group)
+      return replica_group_proto
+
+    if replica_groups is None:
+      replica_groups = []  # special value for XLA API
+    else:
+      replica_groups = [make_proto(group) for group in replica_groups]
+    return self._client.CrossReplicaSum(operand, replica_groups)
 
   def Collapse(self, operand, dimensions):
     """Collapse op."""
@@ -1477,6 +1493,18 @@ class ComputationBuilder(object):
     return self._client.TriangularSolve(
         a, b, left_side, lower, transpose_a, conjugate_a)
 
+  def Gather(self, a, start_indices, dimension_numbers, slice_sizes):
+    """Enqueues a Gather operation onto the computation."""
+    return self._client.Gather(a, start_indices, dimension_numbers,
+                               slice_sizes)
+
+  def Scatter(self, a, scatter_indices, updates, update_computation,
+              dimension_numbers):
+    """Enqueues a Scatter operation onto the computation."""
+    return self._client.Scatter(
+        a, scatter_indices, updates, update_computation.computation,
+        dimension_numbers,)
+
 
 def _forward_methods_to_local_builder():
   """Forward remaining ComputationBuilder methods to the C API.
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 002a20e60a9fbe117af991731a555e60eef9397a..54c76241b9929fb39a6d63648f8ff35d78534b28 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -565,6 +565,18 @@ class SingleOpTest(LocalComputationTest):
       c.CrossReplicaSum(c.Constant(lhs))
       self._ExecuteAndCompareExact(c, expected=lhs)
 
+  def testCrossReplicaSumOneReplicaWithSingletonGroup(self):
+    samples = [
+        NumpyArrayF32(42.0),
+        NumpyArrayF32([97.0]),
+        NumpyArrayF32([64.0, 117.0]),
+        NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]),
+    ]
+    for lhs in samples:
+      c = self._NewComputation()
+      c.CrossReplicaSum(c.Constant(lhs), [[0]])
+      self._ExecuteAndCompareExact(c, expected=lhs)
+
   def testDotMatrixVectorF32(self):
     c = self._NewComputation()
     lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
@@ -1129,6 +1141,21 @@ class SingleOpTest(LocalComputationTest):
     self.assertFalse(c.IsConstant(non_const_expr))
     # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
 
+  def testGather(self):
+    a = np.arange(9).astype(np.int32).reshape((3, 3))
+    indices = np.array([[[0, 2], [2, 1]], [[1, 2], [2, 0]]], dtype=np.int32)
+    dnums = xla_client.xla_data_pb2.GatherDimensionNumbers()
+    dnums.offset_dims.append(1)
+    dnums.offset_dims.append(2)
+    dnums.start_index_map.append(0)
+    dnums.start_index_map.append(1)
+    dnums.index_vector_dim = 2
+    c = self._NewComputation()
+    c.Gather(c.Constant(a), c.Constant(indices), dnums, slice_sizes=[1, 1])
+    g = self._Execute(c, ())
+    expected = np.array([[[[2, 7]]], [[[5, 6]]]], dtype=np.int32)
+    np.testing.assert_allclose(g, expected, rtol=1e-4)
+
 
 class EmbeddedComputationsTest(LocalComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
@@ -1186,6 +1213,14 @@ class EmbeddedComputationsTest(LocalComputationTest):
     c.Mul(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(2.0))
     return c.Build()
 
+  def _CreateBinaryAddS32Computation(self):
+    """Computation (s32, s32) -> s32 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayS32(0)),
+        c.ParameterFromNumpy(NumpyArrayS32(0)))
+    return c.Build()
+
   def _CreateBinaryAddF32Computation(self):
     """Computation (f32, f32) -> f32 that adds its two parameters."""
     c = self._NewComputation("add_param0_by_param1")
@@ -1568,6 +1603,23 @@ class EmbeddedComputationsTest(LocalComputationTest):
       execution.join()
       self.assertEqual(want, got)
 
+  def testScatter(self):
+    a = np.arange(9).astype(np.int32).reshape((3, 3))
+    scatter_indices = np.array([0, 2], dtype=np.int32)
+    updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32)
+
+    dnums = xla_client.xla_data_pb2.ScatterDimensionNumbers()
+    dnums.update_window_dims.append(1)
+    dnums.inserted_window_dims.append(0)
+    dnums.scatter_dims_to_operand_dims.append(0)
+    dnums.index_vector_dim = 1
+
+    c = self._NewComputation()
+    c.Scatter(c.Constant(a), c.Constant(scatter_indices), c.Constant(updates),
+              self._CreateBinaryAddS32Computation(), dnums)
+    expected = np.array([[10, 21, 32], [3, 4, 5], [76, 87, 98]], dtype=np.int32)
+    self._ExecuteAndCompareClose(c, expected=expected)
+
 
 class ErrorTest(LocalComputationTest):
 
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 92f28a9f8aaa3106b9a58ae1ee93ef8841ab58ef..08b78ee244844f41d551d7e249cec0cbf157d639 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <array>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -550,9 +551,9 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
 
   HloEvaluator evaluator;
   Literal result_literal =
-      evaluator.Evaluate<const Literal*>(*computation, {}).ConsumeValueOrDie();
+      evaluator.Evaluate(*computation, {}).ConsumeValueOrDie();
 
-  CHECK_EQ(ShapeUtil::Rank(result_literal.shape()), 4);
+  CHECK_EQ(result_literal.shape().rank(), 4);
   auto result =
       absl::make_unique<Array4D<float>>(result_literal.shape().dimensions(0),
                                         result_literal.shape().dimensions(1),
@@ -605,24 +606,26 @@ ReferenceUtil::ReduceToRowArray2D(
     const std::function<float(float, float)>& reduce_function) {
   std::vector<float> result;
   CHECK_EQ(dims.size(), 3);
-  const std::set<int64> dim_set(dims.begin(), dims.end());
+  const absl::flat_hash_set<int64> dim_set(dims.begin(), dims.end());
   CHECK_EQ(dim_set.size(), 3);
-  for (int64 a0 = 0; a0 == 0 || (!dim_set.count(0) && a0 < array.n1()); ++a0) {
-    for (int64 a1 = 0; a1 == 0 || (!dim_set.count(1) && a1 < array.n2());
+  for (int64 a0 = 0; a0 == 0 || (!dim_set.contains(0) && a0 < array.n1());
+       ++a0) {
+    for (int64 a1 = 0; a1 == 0 || (!dim_set.contains(1) && a1 < array.n2());
          ++a1) {
-      for (int64 a2 = 0; a2 == 0 || (!dim_set.count(2) && a2 < array.n3());
+      for (int64 a2 = 0; a2 == 0 || (!dim_set.contains(2) && a2 < array.n3());
            ++a2) {
-        for (int64 a3 = 0; a3 == 0 || (!dim_set.count(3) && a3 < array.n4());
+        for (int64 a3 = 0; a3 == 0 || (!dim_set.contains(3) && a3 < array.n4());
              ++a3) {
           float accumulator = init;
-          for (int64 i0 = 0; i0 == 0 || (dim_set.count(0) && i0 < array.n1());
-               ++i0) {
-            for (int64 i1 = 0; i1 == 0 || (dim_set.count(1) && i1 < array.n2());
-                 ++i1) {
+          for (int64 i0 = 0;
+               i0 == 0 || (dim_set.contains(0) && i0 < array.n1()); ++i0) {
+            for (int64 i1 = 0;
+                 i1 == 0 || (dim_set.contains(1) && i1 < array.n2()); ++i1) {
               for (int64 i2 = 0;
-                   i2 == 0 || (dim_set.count(2) && i2 < array.n3()); ++i2) {
+                   i2 == 0 || (dim_set.contains(2) && i2 < array.n3()); ++i2) {
                 for (int64 i3 = 0;
-                     i3 == 0 || (dim_set.count(3) && i3 < array.n4()); ++i3) {
+                     i3 == 0 || (dim_set.contains(3) && i3 < array.n4());
+                     ++i3) {
                   // Handle zero-sized arrays.
                   if (array.n1() > 0 && array.n2() > 0 && array.n3() > 0 &&
                       array.n4() > 0) {
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index d8123a6de28ca532819ece4a75cd0b725f8c1bbd..22b4218fbd5e9bc59a0de22735eb51db46670f09 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -47,6 +47,14 @@ namespace xla {
   });
 }
 
+::grpc::Status GRPCService::GetDeviceHandles(::grpc::ServerContext* context,
+                                             const GetDeviceHandlesRequest* arg,
+                                             GetDeviceHandlesResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetDeviceHandles(arg, result);
+  });
+}
+
 ::grpc::Status GRPCService::Compile(::grpc::ServerContext* /*context*/,
                                     const CompileRequest* arg,
                                     CompileResponse* result) {
@@ -61,6 +69,14 @@ namespace xla {
       [this, arg, result]() { return service_->Execute(arg, result); });
 }
 
+::grpc::Status GRPCService::ExecuteGraphParallel(
+    ::grpc::ServerContext* /*context*/, const ExecuteGraphParallelRequest* arg,
+    ExecuteParallelResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->ExecuteGraphParallel(arg, result);
+  });
+}
+
 ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
                                              const WaitForExecutionRequest* arg,
                                              WaitForExecutionResponse* result) {
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index 3e586b288a56a22573d0c3b9ae7b2f25fdbf851a..b546704f73e34941cbf7bc2fe08062aa438039f7 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -39,6 +39,10 @@ class GRPCService : public grpc::XlaService::Service {
                                   const DeconstructTupleRequest* arg,
                                   DeconstructTupleResponse* result) override;
 
+  ::grpc::Status GetDeviceHandles(::grpc::ServerContext* context,
+                                  const GetDeviceHandlesRequest* arg,
+                                  GetDeviceHandlesResponse* result) override;
+
   ::grpc::Status Compile(::grpc::ServerContext* context,
                          const CompileRequest* arg,
                          CompileResponse* result) override;
@@ -46,6 +50,9 @@ class GRPCService : public grpc::XlaService::Service {
   ::grpc::Status Execute(::grpc::ServerContext* context,
                          const ExecuteRequest* arg,
                          ExecuteResponse* result) override;
+  ::grpc::Status ExecuteGraphParallel(::grpc::ServerContext* context,
+                                      const ExecuteGraphParallelRequest* arg,
+                                      ExecuteParallelResponse* result) override;
 
   ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
                                   const WaitForExecutionRequest* arg,
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index d8736c819687482a9dead57bdeacff8e75dce105..34af6b35972e8e484eee3d5419da17095556aebc 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1,6 +1,14 @@
 # Description:
 #   XLA service implementation.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = [":friends"])
@@ -12,15 +20,6 @@ package_group(
     ],
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load(
-    "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library_py",
-)
-
 xla_proto_library(
     name = "hlo_proto",
     srcs = ["hlo.proto"],
@@ -224,19 +223,23 @@ cc_library(
         "hlo_evaluator_typed_visitor.h",
         "hlo_evaluator_typed_visitor_bfloat16.cc",
         "hlo_evaluator_typed_visitor_bool.cc",
+        "hlo_evaluator_typed_visitor_complex128.cc",
         "hlo_evaluator_typed_visitor_complex64.cc",
         "hlo_evaluator_typed_visitor_double.cc",
         "hlo_evaluator_typed_visitor_float.cc",
         "hlo_evaluator_typed_visitor_half.cc",
+        "hlo_evaluator_typed_visitor_int16.cc",
         "hlo_evaluator_typed_visitor_int32.cc",
         "hlo_evaluator_typed_visitor_int64.cc",
         "hlo_evaluator_typed_visitor_int8.cc",
+        "hlo_evaluator_typed_visitor_uint16.cc",
         "hlo_evaluator_typed_visitor_uint32.cc",
         "hlo_evaluator_typed_visitor_uint64.cc",
         "hlo_evaluator_typed_visitor_uint8.cc",
     ],
     hdrs = ["hlo_evaluator.h"],
     deps = [
+        ":dynamic_dimension_inference",
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_query",
@@ -257,6 +260,7 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/meta:type_traits",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
@@ -268,6 +272,7 @@ tf_cc_test(
     srcs = ["hlo_evaluator_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_element_type_converter",
         ":hlo_evaluator",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:reference_util",
@@ -280,7 +285,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -516,6 +520,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -678,6 +683,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -696,6 +702,7 @@ cc_library(
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
+        ":dynamic_dimension_inference",
         ":executable",
         ":execution_tracker",
         ":hlo",
@@ -1003,6 +1010,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -1054,7 +1062,6 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1092,7 +1099,6 @@ cc_library(
         ":buffer_value_containers",
         ":heap_simulator",
         ":hlo",
-        ":hlo_memory_scheduler",
         ":hlo_proto",
         ":logical_buffer",
         ":tuple_points_to_analysis",
@@ -1137,6 +1143,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1231,7 +1238,6 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_proto",
-        "//tensorflow/compiler/xla:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -1499,7 +1505,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -1580,6 +1585,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1720,9 +1727,9 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "convolution_feature_group_converter_test",
+    name = "convolution_group_converter_test",
     size = "small",
-    srcs = ["convolution_feature_group_converter_test.cc"],
+    srcs = ["convolution_group_converter_test.cc"],
     deps = [
         ":convolution_group_converter",
         ":hlo",
@@ -1866,8 +1873,9 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1931,6 +1939,46 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_padder",
+    srcs = ["dynamic_padder.cc"],
+    hdrs = ["dynamic_padder.h"],
+    deps = [
+        ":dynamic_dimension_inference",
+        ":hlo_dce",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_padder_test",
+    srcs = ["dynamic_padder_test.cc"],
+    deps = [
+        ":dynamic_padder",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "dynamic_dimension_inference_test",
     srcs = ["dynamic_dimension_inference_test.cc"],
@@ -2017,7 +2065,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -2058,6 +2105,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -2116,6 +2164,8 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2288,6 +2338,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -2548,6 +2599,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2592,6 +2644,7 @@ tf_cc_test(
     srcs = ["hlo_verifier_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":hlo_parser",
         ":hlo_verifier",
         ":layout_assignment",
@@ -2599,6 +2652,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -2969,13 +3023,11 @@ cc_library(
     srcs = ["hlo_get_dimension_size_rewriter.cc"],
     hdrs = ["hlo_get_dimension_size_rewriter.h"],
     deps = [
+        ":dynamic_dimension_inference",
         ":hlo",
         ":hlo_pass",
         ":shape_inference",
-        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
@@ -3186,6 +3238,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -3346,7 +3399,6 @@ cc_library(
         ":hlo_pass_pipeline",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -3403,10 +3455,39 @@ cc_library(
         ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
     ],
 )
 
+cc_library(
+    name = "sort_simplifier",
+    srcs = ["sort_simplifier.cc"],
+    hdrs = ["sort_simplifier.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "sort_simplifier_test",
+    srcs = ["sort_simplifier_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        ":sort_simplifier",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "tuple_util",
     srcs = ["tuple_util.cc"],
@@ -3505,7 +3586,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
     ],
 )
@@ -3574,14 +3654,16 @@ cc_library(
 tf_cc_test(
     name = "indexed_array_analysis_test",
     srcs = ["indexed_array_analysis_test.cc"],
+    extra_copts = ["-Wno-string-plus-int"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":indexed_array_analysis",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3675,6 +3757,7 @@ cc_library(
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -3686,6 +3769,38 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_index_splitter",
+    srcs = ["dynamic_index_splitter.cc"],
+    hdrs = ["dynamic_index_splitter.h"],
+    deps = [
+        ":hlo_casting_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_index_splitter_test",
+    srcs = ["dynamic_index_splitter_test.cc"],
+    deps = [
+        ":dynamic_index_splitter",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 tf_cc_test(
     name = "ar_crs_combiner_test",
     srcs = ["ar_crs_combiner_test.cc"],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 9e453203ce17cceb606cac06d0ebfaccbf912126..da15ff7d7a2bee8f142bacc996f7fcd063598f77 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -26,6 +26,8 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -34,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -50,6 +53,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -120,23 +124,37 @@ bool TransposeIsBitcast(const HloInstruction* transpose) {
                                        transpose->dimensions());
 }
 
-// Returns true if the given reshape/copy produces a result which is bit-wise
-// identical to its operand and thus may be replaced with a bitcast.
-//
-// This function is conservative -- even if this function returns false, the
-// reshape may still be a bitcast. For example, a reshape from [28x28] to [784].
-bool ReshapeOrCopyIsBitcast(
-    const HloInstruction* instr,
-    const AlgebraicSimplifierOptions::ValidBitcastCallback&
-        valid_bitcast_callback) {
+// Recursive helper for method below.
+HloInstruction* BitcastingOperandOfReshapeOrCopyChainHelper(
+    HloInstruction* instr, HloInstruction* operand,
+    const AlgebraicSimplifierOptions& options) {
+  // Can't replace chain of copies and reshapes with bitcasts if the compiler
+  // used a memory layout which isn't compatible.
+  if (options.ReshapeIsBitcast(operand->shape(), instr->shape())) {
+    return operand;
+  }
+
+  // If the operand is a copy or reshape try to see if the operand's operand
+  // would produce a bitcast with initial instruction.
+  if (HloOpcode::kReshape == operand->opcode() ||
+      HloOpcode::kCopy == operand->opcode()) {
+    return BitcastingOperandOfReshapeOrCopyChainHelper(
+        instr, operand->mutable_operand(0), options);
+  }
+  return nullptr;
+}
+
+// Returns an operand of a chain of reshapes and copies that is bit-wise
+// identical to first reshape or copy in the chain.
+HloInstruction* BitcastingOperandOfReshapeOrCopyChain(
+    HloInstruction* instr, const AlgebraicSimplifierOptions& options) {
+  if (!options.is_layout_sensitive()) {
+    return nullptr;
+  }
   CHECK(HloOpcode::kReshape == instr->opcode() ||
         HloOpcode::kCopy == instr->opcode());
-
-  const HloInstruction* operand = instr->operand(0);
-  // Can't insert bitcasts if the compiler used a memory layout which isn't
-  // compatible.
-  return ShapeUtil::ReshapeIsBitcast(operand->shape(), instr->shape()) &&
-         valid_bitcast_callback(operand->shape(), instr->shape());
+  return BitcastingOperandOfReshapeOrCopyChainHelper(
+      instr, instr->mutable_operand(0), options);
 }
 
 bool IsUnstridedSlice(const HloInstruction* hlo) {
@@ -203,6 +221,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandlePower(HloInstruction* power) override;
 
+  Status HandleRemainder(HloInstruction* remainder) override;
+
   Status HandleReshape(HloInstruction* reshape) override;
 
   Status HandleReduce(HloInstruction* reduce) override;
@@ -251,7 +271,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   // Reshapes an instruction to rank 1 if it is not already rank 1.
   HloInstruction* Flatten(HloInstruction* hlo) {
-    if (ShapeUtil::Rank(hlo->shape()) == 1) {
+    if (hlo->shape().rank() == 1) {
       return hlo;
     }
     return computation_->AddInstruction(HloInstruction::CreateReshape(
@@ -271,8 +291,11 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
         shape, hlo, zero, {dim}, AddReduce_computation));
   }
 
-  // Convenience method for replacing an instruction with a bitcast.
-  void ReplaceWithBitcast(HloInstruction* instruction);
+  // Convenience method for replacing an instruction with a bitcast. If operand
+  // is not null, then the bitcast will use the specified operand instead of the
+  // operand of the instruction.
+  void ReplaceWithBitcast(HloInstruction* instruction,
+                          HloInstruction* operand = nullptr);
 
   // Replace old instruction with new instruction if old and new instructions
   // have the same shape. Updates uses and root instruction. Returns whether a
@@ -401,17 +424,19 @@ bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs,
   }
 }
 
-void AlgebraicSimplifierVisitor::ReplaceWithBitcast(
-    HloInstruction* instruction) {
+void AlgebraicSimplifierVisitor::ReplaceWithBitcast(HloInstruction* instruction,
+                                                    HloInstruction* operand) {
   CHECK_EQ(1, instruction->operand_count());
+  if (operand == nullptr) {
+    operand = instruction->mutable_operand(0);
+  }
   CHECK_EQ(ShapeUtil::ElementsIn(instruction->shape()),
-           ShapeUtil::ElementsIn(instruction->operand(0)->shape()));
+           ShapeUtil::ElementsIn(operand->shape()));
   CHECK_EQ(ShapeUtil::ByteSizeOf(instruction->shape()),
-           ShapeUtil::ByteSizeOf(instruction->operand(0)->shape()));
+           ShapeUtil::ByteSizeOf(operand->shape()));
 
-  auto bitcast = computation_->AddInstruction(
-      HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kBitcast,
-                                  instruction->mutable_operand(0)));
+  auto bitcast = computation_->AddInstruction(HloInstruction::CreateUnary(
+      instruction->shape(), HloOpcode::kBitcast, operand));
   TF_CHECK_OK(ReplaceInstruction(instruction, bitcast));
 }
 
@@ -572,9 +597,9 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
     return Status::OK();
   }
 
-  if (options_.is_layout_sensitive() &&
-      ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) {
-    ReplaceWithBitcast(copy);
+  if (HloInstruction* bitcast_operand =
+          BitcastingOperandOfReshapeOrCopyChain(copy, options_)) {
+    ReplaceWithBitcast(copy, bitcast_operand);
   }
 
   return Status::OK();
@@ -687,7 +712,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
       return Status::OK();
     }
     PaddingConfig padding_config;
-    for (int64 dim = 0; dim < ShapeUtil::Rank(operands[0]->shape()); ++dim) {
+    for (int64 dim = 0; dim < operands[0]->shape().rank(); ++dim) {
       auto padding_config_dim = padding_config.add_dimensions();
       padding_config_dim->set_edge_padding_high(0);
       padding_config_dim->set_edge_padding_low(0);
@@ -715,7 +740,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
 
 static HloInstruction* BuildTupleConstant(HloComputation* computation,
                                           const LiteralSlice& literal) {
-  if (ShapeUtil::IsTuple(literal.shape())) {
+  if (literal.shape().IsTuple()) {
     std::vector<HloInstruction*> elems;
     elems.reserve(ShapeUtil::TupleElementCount(literal.shape()));
     for (int i = 0; i < ShapeUtil::TupleElementCount(literal.shape()); ++i) {
@@ -732,7 +757,7 @@ static HloInstruction* BuildTupleConstant(HloComputation* computation,
 Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
   // Tuple constants aren't directly supported by any backend. Expand them into
   // explicit Tuple instructions.
-  if (ShapeUtil::IsTuple(constant->shape())) {
+  if (constant->shape().IsTuple()) {
     return ReplaceInstruction(
         constant, BuildTupleConstant(computation_, constant->literal()));
   }
@@ -754,7 +779,7 @@ Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
   }
 
   // If a literal is an increasing sequence from zero, replace it with an iota.
-  if (ShapeUtil::Rank(constant->shape()) == 1 &&
+  if (constant->shape().rank() == 1 &&
       ShapeUtil::ElementsIn(constant->shape()) > 1 &&
       constant->literal().IsR1Iota()) {
     return ReplaceWithNewInstruction(
@@ -791,6 +816,79 @@ Status InvertConstant(const HloInstruction& constant, Literal* result) {
     return T{1.0} / constant.literal().Get<T>(indices);
   });
 }
+
+template <typename T>
+std::unique_ptr<HloInstruction> TryDivideToShift(HloInstruction* divide,
+                                                 HloComputation* computation) {
+  HloInstruction *a, *b, *c;
+  CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
+
+  if (ShapeUtil::ElementIsIntegral(divide->shape()) &&
+      !Match(b, m::ConstantEffectiveScalar(&c)) &&
+      !Match(b, m::Broadcast(m::ConstantEffectiveScalar(&c)))) {
+    return nullptr;
+  }
+
+  if (ShapeUtil::ElementIsSigned(divide->shape())) {
+    int64 b_value = c->literal().GetFirstElement<T>();
+    if (b_value > 0 && IsPowerOfTwo(static_cast<uint64>(b_value))) {
+      // Handle negative dividends by negating the result of the division.
+      HloInstruction* zero_like_a = BroadcastZeros(
+          computation, a->shape().element_type(), a->shape().dimensions());
+
+      auto* dividend_is_negative =
+          computation->AddInstruction(HloInstruction::CreateBinary(
+              ShapeUtil::ChangeElementType(a->shape(), PRED), HloOpcode::kLt, a,
+              zero_like_a));
+
+      auto* negated_dividend = computation->AddInstruction(
+          HloInstruction::CreateUnary(a->shape(), HloOpcode::kNegate, a));
+
+      auto* abs_dividend =
+          computation->AddInstruction(HloInstruction::CreateTernary(
+              a->shape(), HloOpcode::kSelect, dividend_is_negative,
+              negated_dividend, a));
+
+      int log2_abs_b_value = tensorflow::Log2Floor64(b_value);
+
+      auto* shift_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(log2_abs_b_value)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        shift_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), shift_amount, {}));
+      }
+
+      auto* quotient = computation->AddInstruction(HloInstruction::CreateBinary(
+          divide->shape(), HloOpcode::kShiftRightLogical, abs_dividend,
+          shift_amount));
+
+      auto* neqated_quotient =
+          computation->AddInstruction(HloInstruction::CreateUnary(
+              quotient->shape(), HloOpcode::kNegate, quotient));
+
+      return HloInstruction::CreateTernary(divide->shape(), HloOpcode::kSelect,
+                                           dividend_is_negative,
+                                           neqated_quotient, quotient);
+    }
+  } else {
+    uint64 b_value = c->literal().GetFirstElement<T>();
+    if (IsPowerOfTwo(b_value)) {
+      int log2_abs_b_value = tensorflow::Log2Floor64(b_value);
+      HloInstruction* shift_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(log2_abs_b_value)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        shift_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), shift_amount, {}));
+      }
+      return HloInstruction::CreateBinary(
+          divide->shape(), HloOpcode::kShiftRightLogical, a, shift_amount);
+    }
+  }
+
+  return nullptr;
+}
 }  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
@@ -803,6 +901,60 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
     return Status::OK();
   }
 
+  // A / B => A >> log2(B) if B is a power of 2.
+  switch (divide->shape().element_type()) {
+    case S8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int8>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int16>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int32>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int64>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint8>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint16>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint32>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint64>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    default:
+      break;
+  }
+
   // exp(A)/exp(B) => exp(A-B)
   if (Match(divide, m::Divide(m::Exp(m::Op(&a)), m::Exp(m::Op(&b)))
                         .WithShape(m::Shape(&shape)))) {
@@ -870,6 +1022,9 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
       case C64:
         TF_RETURN_IF_ERROR(InvertConstant<complex64>(*b, &new_literal));
         break;
+      case C128:
+        TF_RETURN_IF_ERROR(InvertConstant<complex128>(*b, &new_literal));
+        break;
       default:
         return Status::OK();
     }
@@ -930,9 +1085,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     return -1;
   };
 
-  const int64 dot_rank = ShapeUtil::Rank(dot->shape());
-  const int64 rhs_rank = ShapeUtil::Rank(rhs->shape());
-  const int64 lhs_rank = ShapeUtil::Rank(lhs->shape());
+  const int64 dot_rank = dot->shape().rank();
+  const int64 rhs_rank = rhs->shape().rank();
+  const int64 lhs_rank = lhs->shape().rank();
   const auto& dnums = dot->dot_dimension_numbers();
   if (dnums.rhs_contracting_dimensions_size() > 1) {
     return false;
@@ -1036,7 +1191,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   //    )
   if (lhs_rank == 1 ||
       (lhs_rank == 2 && lhs->shape().dimensions(lhs_kept_dim) == 1)) {
-    if (ShapeUtil::Rank(rhs->shape()) == 1) {
+    if (rhs->shape().rank() == 1) {
       TF_RETURN_IF_ERROR(
           ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
                                       multiply(Flatten(lhs), rhs), 0))));
@@ -1373,6 +1528,9 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   // => output dimensions: DS ({M x N}, {0, start}, {M, 1}) => {M x 1}.
 
   bool lhs_is_dynamic_slice = lhs->opcode() == HloOpcode::kDynamicSlice;
+  HloDynamicSliceInstruction* dynamic_slice =
+      lhs_is_dynamic_slice ? Cast<HloDynamicSliceInstruction>(lhs)
+                           : Cast<HloDynamicSliceInstruction>(rhs);
 
   // ctA:
   HloInstruction* left_operand =
@@ -1390,8 +1548,6 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
       HloInstruction::CreateDot(memoized_shape, left_operand, right_operand,
                                 dnums, dot->precision_config()));
   // Get pair {start, 0} or {0, start}.
-  HloInstruction* original_start_indices =
-      lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1);
   // Position of start:
   int index_of_non_zero_start = lhs_is_dynamic_slice
                                     ? 1 - lhs_contracting_dimension
@@ -1400,23 +1556,19 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   int index_of_zero_start = 1 - index_of_non_zero_start;
 
   // Slice out start and 0 components and reorder if necessary.
-  auto indices_type = original_start_indices->shape().element_type();
+  auto indices_type = dynamic_slice->operand(1)->shape().element_type();
   Shape s_shape = ShapeUtil::MakeShape(indices_type, {1});
   Shape d_shape = ShapeUtil::MakeShape(indices_type, {2});
   HloInstruction* non_zero_start =
-      computation_->AddInstruction(HloInstruction::CreateSlice(
-          s_shape, original_start_indices, {index_of_non_zero_start},
-          {index_of_non_zero_start + 1}, {1}));
+      dynamic_slice->mutable_operand(1 + index_of_non_zero_start);
   HloInstruction* zero_start =
-      computation_->AddInstruction(HloInstruction::CreateSlice(
-          s_shape, original_start_indices, {index_of_zero_start},
-          {index_of_zero_start + 1}, {1}));
-  HloInstruction* new_start_indices =
-      lhs_is_dynamic_slice
-          ? computation_->AddInstruction(HloInstruction::CreateConcatenate(
-                d_shape, {non_zero_start, zero_start}, 0))
-          : computation_->AddInstruction(HloInstruction::CreateConcatenate(
-                d_shape, {zero_start, non_zero_start}, 0));
+      dynamic_slice->mutable_operand(1 + index_of_zero_start);
+  std::vector<HloInstruction*> new_start_indices;
+  if (lhs_is_dynamic_slice) {
+    new_start_indices = {non_zero_start, zero_start};
+  } else {
+    new_start_indices = {zero_start, non_zero_start};
+  }
 
   // Build DynamicSlice(ctA x ctB).
   const int new_slice_m = lhs_is_dynamic_slice ? 1 : m;
@@ -1449,8 +1601,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
       dot->shape().element_type() != BF16) {
     return Status::OK();
   }
-  if (ShapeUtil::Rank(lhs->shape()) > 2 || ShapeUtil::Rank(rhs->shape()) > 2 ||
-      ShapeUtil::Rank(dot->shape()) > 2) {
+  if (lhs->shape().rank() > 2 || rhs->shape().rank() > 2 ||
+      dot->shape().rank() > 2) {
     if (options_.enable_dot_strength_reduction() &&
         !options_.is_layout_sensitive()) {
       TF_RETURN_IF_ERROR(HandleDotStrengthReduction(dot).status());
@@ -1686,7 +1838,7 @@ bool OutputIsPermutationOfOperandElements(HloInstruction* instruction,
     case HloOpcode::kTranspose:
       return true;
     case HloOpcode::kSort:
-      return (!ShapeUtil::IsTuple(instruction->shape()));
+      return (!instruction->shape().IsTuple());
     default:
       return false;
   }
@@ -1732,8 +1884,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
 
   // A degenerate broadcast that has the same input and output rank can be
   // converted into a transpose.
-  if (ShapeUtil::Rank(broadcast->shape()) ==
-          ShapeUtil::Rank(operand->shape()) &&
+  if (broadcast->shape().rank() == operand->shape().rank() &&
       ShapeUtil::ElementsIn(broadcast->shape()) ==
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> transpose(X) where "
@@ -1888,7 +2039,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
   if (HasInteriorPadding(pad->padding_config())) {
     PaddingConfig padding_config = pad->padding_config();
     bool cleared_interior_padding = false;
-    for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    for (int64 i = 0; i < pad->shape().rank(); ++i) {
       if (padding_config.dimensions(i).interior_padding() > 0 &&
           pad->operand(0)->shape().dimensions(i) == 1) {
         cleared_interior_padding = true;
@@ -2139,6 +2290,137 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(
   return changed;
 }
 
+namespace {
+template <typename T>
+std::unique_ptr<HloInstruction> TryRemainderToAnd(HloInstruction* remainder,
+                                                  HloComputation* computation) {
+  HloInstruction *a, *b, *c;
+  CHECK(Match(remainder, m::Remainder(m::Op(&a), m::Op(&b))));
+
+  if (ShapeUtil::ElementIsIntegral(remainder->shape()) &&
+      !Match(b, m::ConstantEffectiveScalar(&c)) &&
+      !Match(b, m::Broadcast(m::ConstantEffectiveScalar(&c)))) {
+    return nullptr;
+  }
+
+  if (ShapeUtil::ElementIsSigned(remainder->shape())) {
+    int64 b_value = c->literal().GetFirstElement<T>();
+    if (b_value > 0 && IsPowerOfTwo(static_cast<uint64>(b_value))) {
+      // Handle negative dividends by negating the result of the division.
+      HloInstruction* zero_like_a = BroadcastZeros(
+          computation, a->shape().element_type(), a->shape().dimensions());
+
+      auto* dividend_is_negative =
+          computation->AddInstruction(HloInstruction::CreateBinary(
+              ShapeUtil::ChangeElementType(a->shape(), PRED), HloOpcode::kLt, a,
+              zero_like_a));
+
+      auto* negated_dividend = computation->AddInstruction(
+          HloInstruction::CreateUnary(a->shape(), HloOpcode::kNegate, a));
+
+      auto* abs_dividend =
+          computation->AddInstruction(HloInstruction::CreateTernary(
+              a->shape(), HloOpcode::kSelect, dividend_is_negative,
+              negated_dividend, a));
+
+      auto* mask_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(b_value - 1)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        mask_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), mask_amount, {}));
+      }
+
+      auto* quotient = computation->AddInstruction(HloInstruction::CreateBinary(
+          remainder->shape(), HloOpcode::kAnd, abs_dividend, mask_amount));
+
+      auto* neqated_quotient =
+          computation->AddInstruction(HloInstruction::CreateUnary(
+              quotient->shape(), HloOpcode::kNegate, quotient));
+
+      return HloInstruction::CreateTernary(
+          remainder->shape(), HloOpcode::kSelect, dividend_is_negative,
+          neqated_quotient, quotient);
+    }
+  } else {
+    uint64 b_value = c->literal().GetFirstElement<T>();
+    if (IsPowerOfTwo(b_value)) {
+      HloInstruction* mask_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(b_value - 1)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        mask_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), mask_amount, {}));
+      }
+      return HloInstruction::CreateBinary(remainder->shape(), HloOpcode::kAnd,
+                                          a, mask_amount);
+    }
+  }
+  return nullptr;
+}
+}  // namespace
+
+Status AlgebraicSimplifierVisitor::HandleRemainder(HloInstruction* remainder) {
+  HloInstruction *a, *b;
+  CHECK(Match(remainder, m::Remainder(m::Op(&a), m::Op(&b))));
+
+  // A % B => A & (B - 1) if B is a power of 2.
+  switch (remainder->shape().element_type()) {
+    case S8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int8>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int16>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int32>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int64>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint8>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint16>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint32>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint64>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    default:
+      break;
+  }
+
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   auto operand = reshape->mutable_operand(0);
 
@@ -2195,12 +2477,10 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   }
 
   // Make this a bitcast if possible.
-  if (options_.is_layout_sensitive() &&
-      ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) {
-    ReplaceWithBitcast(reshape);
-    return Status::OK();
+  if (HloInstruction* bitcast_operand =
+          BitcastingOperandOfReshapeOrCopyChain(reshape, options_)) {
+    ReplaceWithBitcast(reshape, bitcast_operand);
   }
-
   return Status::OK();
 }
 
@@ -2210,8 +2490,7 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) {
   auto dim_is_one = [&](int64 i) -> bool {
     return reverse->shape().dimensions(i) == 1;
   };
-  if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(),
-                  dim_is_one)) {
+  if (absl::c_all_of(reverse->dimensions(), dim_is_one)) {
     return ReplaceInstruction(reverse, reverse->mutable_operand(0));
   }
   return Status::OK();
@@ -2276,7 +2555,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
   if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) {
     VLOG(10) << "Trying to simplify scalar slice of concat";
     // Only do this for R1, there's no chance of this being useful otherwise.
-    if (ShapeUtil::Rank(slice->shape()) != 1) {
+    if (slice->shape().rank() != 1) {
       VLOG(10) << "Not folding, slice is not rank 1";
       return false;
     }
@@ -2326,7 +2605,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
     return false;
   }
   HloInstruction* new_slice_operand = reshape->mutable_operand(0);
-  int64 slice_rank = ShapeUtil::Rank(slice->shape());
+  int64 slice_rank = slice->shape().rank();
   std::vector<int64> sliced_dims;
   for (int64 i = 0; i < slice_rank; ++i) {
     if (slice->slice_starts(i) != 0 ||
@@ -2338,7 +2617,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
   if (sliced_dims.size() == 1 && sliced_dims[0] == 0 &&
       slice->slice_starts(0) == 0) {
     const Shape& new_slice_shape = new_slice_operand->shape();
-    const int64 rank = ShapeUtil::Rank(new_slice_shape);
+    const int64 rank = new_slice_shape.rank();
     std::vector<int64> new_slice_starts(rank, 0);
     std::vector<int64> new_slice_stides(rank, 1);
     std::vector<int64> new_slice_limits(new_slice_shape.dimensions().begin(),
@@ -2438,7 +2717,7 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
 Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
   // TODO(b/112040122): Most of those optimizations can be done for multi-output
   // reduces.
-  if (ShapeUtil::IsTuple(reduce->shape())) {
+  if (reduce->shape().IsTuple()) {
     return Status::OK();
   }
 
@@ -2456,8 +2735,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
   // A Transpose feeding a reduce can simply permute the reduction dimensions
   // field if the output of the reduce is a vector or scalar. Higher ranked
   // result may require a transpose of the output.
-  if (ShapeUtil::Rank(reduce->shape()) <= 1 &&
-      arg->opcode() == HloOpcode::kTranspose) {
+  if (reduce->shape().rank() <= 1 && arg->opcode() == HloOpcode::kTranspose) {
     auto transpose_dimensions = arg->dimensions();
     std::vector<int64> new_reduce_dimensions;
     for (auto dim : dimensions) {
@@ -2487,9 +2765,9 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     // Create a new reduce with the combined reduction dimensions of both
     // reduces.
     std::vector<int64> arg_dims = arg->dimensions();
-    std::sort(arg_dims.begin(), arg_dims.end());
+    absl::c_sort(arg_dims);
     std::vector<int64> reduce_dims = reduce->dimensions();
-    std::sort(reduce_dims.begin(), reduce_dims.end());
+    absl::c_sort(reduce_dims);
     // Transform reduce_dims to the same rank as the operand of the operand.
     for (int64 arg_dim : arg_dims) {
       for (int64& dim : reduce_dims) {
@@ -2516,8 +2794,8 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     std::vector<std::pair<int64, int64>> unmodified_dims =
         ShapeUtil::DimensionsUnmodifiedByReshape(arg->operand(0)->shape(),
                                                  arg->shape());
-    std::vector<bool> arg_dim_in_output(ShapeUtil::Rank(arg->shape()), true);
-    std::vector<bool> arg_dim_unmodified(ShapeUtil::Rank(arg->shape()), false);
+    std::vector<bool> arg_dim_in_output(arg->shape().rank(), true);
+    std::vector<bool> arg_dim_unmodified(arg->shape().rank(), false);
     for (auto dim : dimensions) {
       arg_dim_in_output[dim] = false;
     }
@@ -2535,15 +2813,15 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     }
     if (can_move_reshape_into_reduce) {
       changed_ = true;
-      std::unordered_set<int64> dimensions_not_to_reduce;
+      absl::flat_hash_set<int64> dimensions_not_to_reduce;
       for (auto dim_pair : unmodified_dims) {
         if (arg_dim_in_output[dim_pair.second]) {
           dimensions_not_to_reduce.insert(dim_pair.first);
         }
       }
       std::vector<int64> new_reduce_dimensions;
-      for (int64 i = 0; i < ShapeUtil::Rank(arg->operand(0)->shape()); ++i) {
-        if (dimensions_not_to_reduce.count(i) == 0) {
+      for (int64 i = 0; i < arg->operand(0)->shape().rank(); ++i) {
+        if (!dimensions_not_to_reduce.contains(i)) {
           new_reduce_dimensions.push_back(i);
         }
       }
@@ -2597,51 +2875,53 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                                   function));
   }
 
-  // A reduce window can be expressed as a reduce and a reshape if all
-  // dimensions either have a window size of one or the entire dimension. If
-  // there is no stride, dilation, or padding, this is as easy as checking the
-  // size of the output shape and window dimension.
-  //
-  // The reshape is a bitcast since it adds one-sized dimensions. Often these
-  // ones are immediately removed as well with another reshape. The
-  // implementation of reduce tends to be slightly more efficient at reducing
-  // entire dimensions compared to reduce window.
-  auto effective_reduce_dims = [&] {
-    if (window_util::HasStride(window) || window_util::HasDilation(window) ||
-        window_util::HasPadding(window)) {
-      return absl::InlinedVector<int64, 8>{};
-    }
-    absl::InlinedVector<int64, 8> reduce_dims;
-    for (int64 i = 0; i < window.dimensions_size(); ++i) {
-      if (window.dimensions(i).size() == 1) {
-        continue;
-      } else if (reduce_window->shape().dimensions(i) == 1) {
-        reduce_dims.push_back(i);
-      } else {
+  if (options_.enable_window_reduce_to_reduce_replacement()) {
+    // A reduce window can be expressed as a reduce and a reshape if all
+    // dimensions either have a window size of one or the entire dimension. If
+    // there is no stride, dilation, or padding, this is as easy as checking the
+    // size of the output shape and window dimension.
+    //
+    // The reshape is a bitcast since it adds one-sized dimensions. Often these
+    // ones are immediately removed as well with another reshape. The
+    // implementation of reduce tends to be slightly more efficient at reducing
+    // entire dimensions compared to reduce window.
+    auto effective_reduce_dims = [&] {
+      if (window_util::HasStride(window) || window_util::HasDilation(window) ||
+          window_util::HasPadding(window)) {
         return absl::InlinedVector<int64, 8>{};
       }
-    }
-    return reduce_dims;
-  }();
+      absl::InlinedVector<int64, 8> reduce_dims;
+      for (int64 i = 0; i < window.dimensions_size(); ++i) {
+        if (window.dimensions(i).size() == 1) {
+          continue;
+        } else if (reduce_window->shape().dimensions(i) == 1) {
+          reduce_dims.push_back(i);
+        } else {
+          return absl::InlinedVector<int64, 8>{};
+        }
+      }
+      return reduce_dims;
+    }();
 
-  // If a reduce window can be expressed as a reduce, do so and reshape the
-  // output.
-  if (!effective_reduce_dims.empty()) {
-    Shape reduce_shape = ShapeUtil::FilterDimensions(
-        [&](int64 dim) {
-          return !absl::c_linear_search(effective_reduce_dims, dim);
-        },
-        reduce_window->shape());
-    HloInstruction* reduce =
-        computation_->AddInstruction(HloInstruction::CreateReduce(
-            /*shape=*/reduce_shape,
-            /*operand=*/operand,
-            /*init_value=*/reduce_window->mutable_operand(1),
-            /*dimensions_to_reduce=*/effective_reduce_dims,
-            /*reduce_computation=*/function));
-    return ReplaceWithNewInstruction(
-        reduce_window,
-        HloInstruction::CreateReshape(reduce_window->shape(), reduce));
+    // If a reduce window can be expressed as a reduce, do so and reshape the
+    // output.
+    if (!effective_reduce_dims.empty()) {
+      Shape reduce_shape = ShapeUtil::FilterDimensions(
+          [&](int64 dim) {
+            return !absl::c_linear_search(effective_reduce_dims, dim);
+          },
+          reduce_window->shape());
+      HloInstruction* reduce =
+          computation_->AddInstruction(HloInstruction::CreateReduce(
+              /*shape=*/reduce_shape,
+              /*operand=*/operand,
+              /*init_value=*/reduce_window->mutable_operand(1),
+              /*dimensions_to_reduce=*/effective_reduce_dims,
+              /*reduce_computation=*/function));
+      return ReplaceWithNewInstruction(
+          reduce_window,
+          HloInstruction::CreateReshape(reduce_window->shape(), reduce));
+    }
   }
 
   // This optimization folds a pad op into reduce_window.
@@ -2779,7 +3059,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
   // Carry out the folding of the pad into reduce_window.
   VLOG(10) << "Folding pad into reduce-window.";
   Window new_window = window;
-  const int64 rank = ShapeUtil::Rank(reduce_window->shape());
+  const int64 rank = reduce_window->shape().rank();
   TF_RET_CHECK(pad_config.dimensions_size() == rank);
   TF_RET_CHECK(window.dimensions_size() == rank);
   for (int64 i = 0; i < rank; ++i) {
@@ -2828,6 +3108,7 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     return ReplaceWithNewInstruction(
         sort, HloInstruction::CreateTuple(sort->operands()));
   }
+
   if (!options_.enable_permutation_sort_replacement()) {
     return Status::OK();
   }
@@ -2862,7 +3143,7 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
         // - Use this as the indices parameter of scatter, and set updates
         //   of the scatter to be a reshaped 'values' parameter of sort (adding
         //   'rank' many 1 dimensions at the end).
-        int64 rank = ShapeUtil::Rank(operand->shape());
+        int64 rank = operand->shape().rank();
         Shape extended_shape = operand->shape();
         extended_shape.add_dimensions(1);
         extended_shape.mutable_layout()->add_minor_to_major(rank);
@@ -3221,15 +3502,6 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   const Shape dot_output_shape = ShapeUtil::MakeShapeWithDescendingLayout(
       convolution_shape.element_type(), {conv_width, output_channels});
 
-  // We cannot insert bitcasts if the layouts will not be compatible.
-  // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
-  // invalid.
-  if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) ||
-      !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) ||
-      !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) {
-    return false;
-  }
-
   auto new_lhs = add_bitcast(new_input_shape, lhs);
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
   DotDimensionNumbers dot_dimension_numbers;
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index d2775b9fafa7e4c625f5d181114e80e7369f9c78..ff3f638b22e290f6f6237a5a72a257aa23ecd78b 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -25,21 +25,25 @@ namespace xla {
 
 class AlgebraicSimplifierOptions {
  public:
-  // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
-  // bitcast from 'from_shape' to 'to_shape' after considering platform
-  // dependent effects on layout like alignment restrictions. Precondition: the
-  // two shapes have layouts, the same number of elements and
-  // ShapeUtil::ReshapeIsBitcast returns true.
-  using ValidBitcastCallback =
+  AlgebraicSimplifierOptions() {}
+  // Platform dependent callback to determine if a reshape `from_shape` to
+  // `to_shape` is a bitcast.
+  using ReshapeIsBitcastCallback =
       std::function<bool(const Shape& from_shape, const Shape& to_shape)>;
-
   explicit AlgebraicSimplifierOptions(
-      ValidBitcastCallback valid_bitcast_callback)
-      : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
-  // If valid_bitcast_callback returns true, then the pass will replace reshapes
-  // and transposes with bitcasts.
-  const ValidBitcastCallback& valid_bitcast_callback() const {
-    return valid_bitcast_callback_;
+      ReshapeIsBitcastCallback reshape_is_bitcast_callback)
+      : reshape_is_bitcast_callback_(std::move(reshape_is_bitcast_callback)) {}
+
+  // Use the platform specific callback if set. It is not sensible to return
+  // true here if the options are not layout sensitive.
+  bool ReshapeIsBitcast(const Shape& from_shape, const Shape& to_shape) const {
+    if (!is_layout_sensitive_) {
+      return false;
+    }
+    if (!reshape_is_bitcast_callback_) {
+      return ShapeUtil::ReshapeIsBitcast(from_shape, to_shape);
+    }
+    return reshape_is_bitcast_callback_(from_shape, to_shape);
   }
 
   // If is_layout_sensitive is true, then the simplifier preserves layout during
@@ -47,12 +51,14 @@ class AlgebraicSimplifierOptions {
   void set_is_layout_sensitive(bool is_layout_sensitive) {
     is_layout_sensitive_ = is_layout_sensitive;
   }
+
   bool is_layout_sensitive() const { return is_layout_sensitive_; }
 
   // Enable dot simplification on platforms where it is profitable.
   void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) {
     enable_dot_strength_reduction_ = enable_dot_strength_reduction;
   }
+
   bool enable_dot_strength_reduction() const {
     return enable_dot_strength_reduction_;
   }
@@ -71,16 +77,30 @@ class AlgebraicSimplifierOptions {
       bool enable_permutation_sort_replacement) {
     enable_permutation_sort_replacement_ = enable_permutation_sort_replacement;
   }
+
   bool enable_permutation_sort_replacement() const {
     return enable_permutation_sort_replacement_;
   }
 
+  // If enable_window_reduce_replacement is true, the kReduceWindow instruction
+  // can be optimized by replacement with simpler operations.
+  void set_enable_window_reduce_to_reduce_replacement(
+      bool enable_window_reduce_to_reduce_replacement) {
+    enable_window_reduce_to_reduce_replacement_ =
+        enable_window_reduce_to_reduce_replacement;
+  }
+
+  bool enable_window_reduce_to_reduce_replacement() const {
+    return enable_window_reduce_to_reduce_replacement_;
+  }
+
  private:
-  ValidBitcastCallback valid_bitcast_callback_;
+  ReshapeIsBitcastCallback reshape_is_bitcast_callback_;
   bool is_layout_sensitive_{false};
   bool enable_dot_strength_reduction_{true};
   bool enable_conv_simplification_{true};
   bool enable_permutation_sort_replacement_{false};
+  bool enable_window_reduce_to_reduce_replacement_{true};
 };
 
 // A pass which performs algebraic simplifications.
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index a9d617cbf6dcd02283d5d66655c0fa6ddf6dc27f..3602ab82b248bb3d7cd8203ed7664e3c460374d2 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -46,17 +46,9 @@ namespace {
 using ::testing::ElementsAre;
 namespace m = match;
 
-AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() {
-  return [](const Shape&, const Shape&) { return true; };
-}
-
-AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() {
-  return [](const Shape&, const Shape&) { return false; };
-}
-
 class AlgebraicSimplifierTest : public HloTestBase {
  protected:
-  AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()};
+  AlgebraicSimplifierOptions default_options_;
 };
 
 // Test that A + 0 is simplified to A
@@ -202,6 +194,86 @@ TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) {
                   m::Broadcast(m::ConstantScalar(0.125)))));
 }
 
+TEST_F(AlgebraicSimplifierTest, UnsignedDivideByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = u32[4] parameter(0)
+      c = u32[] constant(8)
+      b = u32[4] broadcast(c), dimensions={}
+      ROOT d = u32[4] divide(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::ShiftRightLogical(
+                  m::Parameter(0), m::Broadcast(m::ConstantScalar(3)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SignedDivideByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = s32[4] parameter(0)
+      c = s32[] constant(8)
+      b = s32[4] broadcast(c), dimensions={}
+      ROOT d = s32[4] divide(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto match_dividend_is_negative =
+      m::Lt(m::Parameter(0), m::Broadcast(m::ConstantScalar(0)));
+  auto match_abs = m::Select(match_dividend_is_negative,
+                             m::Negate(m::Parameter(0)), m::Parameter(0));
+  auto match_shift =
+      m::ShiftRightLogical(match_abs, m::Broadcast(m::ConstantScalar(3)));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Select(match_dividend_is_negative,
+                                   m::Negate(match_shift), match_shift)));
+}
+
+TEST_F(AlgebraicSimplifierTest, UnsignedRemainderByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = u32[4] parameter(0)
+      c = u32[] constant(8)
+      b = u32[4] broadcast(c), dimensions={}
+      ROOT r = u32[4] remainder(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::AndAnyOrder(m::Parameter(0),
+                                        m::Broadcast(m::ConstantScalar(7)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SignedRemainderByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = s32[4] parameter(0)
+      c = s32[] constant(8)
+      b = s32[4] broadcast(c), dimensions={}
+      ROOT r = s32[4] remainder(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto match_dividend_is_negative =
+      m::Lt(m::Parameter(0), m::Broadcast(m::ConstantScalar(0)));
+  auto match_abs = m::Select(match_dividend_is_negative,
+                             m::Negate(m::Parameter(0)), m::Parameter(0));
+  auto match_and =
+      m::AndAnyOrder(match_abs, m::Broadcast(m::ConstantScalar(7)));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Select(match_dividend_is_negative,
+                                   m::Negate(match_and), match_and)));
+}
+
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
   auto m = CreateNewVerifiedModule();
@@ -1464,23 +1536,77 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   EXPECT_THAT(computation->root_instruction(), param0);
 }
 
-TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
+TEST_F(AlgebraicSimplifierTest, CopyOfReshapeOfCopyEqualsBitcast) {
   auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), "param"));
-  *param->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout({0, 1, 2, 3});
+          0, ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {3, 2, 1, 0}),
+          "param"));
   HloInstruction* copy = builder.AddInstruction(HloInstruction::CreateUnary(
-      ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), HloOpcode::kCopy, param));
-  *copy->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout({1, 2, 0, 3});
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {0, 1, 2, 3}),
+      HloOpcode::kCopy, param));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShapeWithLayout(F32, {14 * 14, 64}, {0, 1}), copy));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {14 * 14, 64}, {1, 0}),
+      HloOpcode::kCopy, reshape));
+  auto computation = m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Reshape(m::Copy(m::Parameter(0))))));
+
+  AlgebraicSimplifierOptions options;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  // Verify that the copy of reshape of copy is replaced.
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
+}
+
+TEST_F(AlgebraicSimplifierTest, ReshapeOfCopyEqualsBitcast) {
+  auto m = CreateNewVerifiedModule();
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {3, 2, 1, 0}),
+          "param"));
+  HloInstruction* copy = builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {0, 1, 2, 3}),
+      HloOpcode::kCopy, param));
+  builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShapeWithLayout(F32, {14 * 14, 64}, {1, 0}), copy));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Copy(m::Parameter(0)))));
+
+  AlgebraicSimplifierOptions options;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  // Verify that the copy of reshape of copy is replaced.
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
+}
+
+TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
+  auto m = CreateNewVerifiedModule();
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {0, 1, 2, 3}),
+          "param"));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {1, 2, 0, 3}),
+      HloOpcode::kCopy, param));
   auto computation = m->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier1(options);
   ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie());
@@ -1488,10 +1614,10 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options2(bitcasting_callback());
+  AlgebraicSimplifierOptions options2;
   options2.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier2(options2);
-  ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
+  EXPECT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
   // Verify that the copy is replaced.
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Bitcast(m::Parameter(0))));
@@ -1744,7 +1870,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
@@ -1774,7 +1900,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1804,7 +1930,8 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Reshape(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
@@ -1835,8 +1962,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(
-      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions{});
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that reshape(transpose(rng)) is replace by a single rng of the
@@ -1887,7 +2013,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
                                   m::Op().Is(dimensions_wrong_reshape),
                                   m::Op().Is(layout_wrong_reshape))));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   simplifier.Run(m.get()).ValueOrDie();
@@ -1917,8 +2043,7 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add));
 
-  AlgebraicSimplifier simplifier(
-      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions{});
   m->AddEntryComputation(builder.Build());
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
@@ -1942,8 +2067,7 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
                                       /*broadcast_dimensions=*/{0, 1}));
 
-  AlgebraicSimplifier simplifier(
-      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions{});
   m->AddEntryComputation(builder.Build());
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
@@ -1968,7 +2092,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1998,7 +2122,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -2055,7 +2179,7 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Copy(m::Parameter(0)))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -2103,9 +2227,8 @@ TEST_F(AlgebraicSimplifierTest, TransposeIsReshape) {
       ROOT reshaped_again = f32[10] reshape(f32[10,1,1] transposed)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2651,7 +2774,7 @@ TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_enable_permutation_sort_replacement(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2680,7 +2803,7 @@ TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_enable_permutation_sort_replacement(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
@@ -2703,7 +2826,7 @@ TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_enable_permutation_sort_replacement(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
@@ -2945,7 +3068,7 @@ class ConvInputPaddingTest
     : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<ConvPaddingTestcase> {};
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     ConvInputPaddingTestCases, ConvInputPaddingTest,
     ::testing::ValuesIn(std::vector<ConvPaddingTestcase>{
         // Merge this edge padding into the conv.
@@ -3053,7 +3176,7 @@ class ConvFilterPaddingTest
     : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<ConvPaddingTestcase> {};
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     ConvFilterPaddingTestCases, ConvFilterPaddingTest,
     ::testing::ValuesIn(std::vector<ConvPaddingTestcase>{
         // Can only merge interior padding on the filter's spatial dimensions;
@@ -3292,7 +3415,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     auto module = CreateNewUnverifiedModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
-    AlgebraicSimplifierOptions simplifier_options(bitcasting_callback());
+    AlgebraicSimplifierOptions simplifier_options;
     simplifier_options.set_is_layout_sensitive(true);
     AlgebraicSimplifier simplifier(simplifier_options);
     if (!simplifier.Run(module.get()).ValueOrDie()) {
@@ -3498,7 +3621,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 
   // Create the reduce-window.
   Window window;
-  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+  for (int64 i = 0; i < pad->shape().rank(); ++i) {
     auto* dim = window.add_dimensions();
     dim->set_size(1);
     dim->set_padding_low(10);
@@ -3584,7 +3707,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
 
   // Create the reduce-window.
   Window window;
-  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+  for (int64 i = 0; i < pad->shape().rank(); ++i) {
     auto* dim = window.add_dimensions();
     dim->set_size(1);
     dim->set_padding_low(10);
@@ -3706,12 +3829,16 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
   HloComputation::Builder builder(TestName());
 
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
+  std::vector<HloInstruction*> params;
+  for (int i = 0; i < 3; ++i) {
+    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
+        i + 1, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+  }
   builder.AddInstruction(HloInstruction::CreateDynamicSlice(
       shape,
       builder.AddInstruction(
           HloInstruction::CreateParameter(0, shape, "slice_from")),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+      params,
       /*slice_sizes=*/{10, 100, 1000}));
 
   auto computation = m->AddEntryComputation(builder.Build());
@@ -3730,28 +3857,35 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
   Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 1, 1000});
 
+  std::vector<HloInstruction*> slice_indices, update_indices;
+  for (int i = 0; i < 3; ++i) {
+    slice_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i + 1, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+    update_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i + 5, ShapeUtil::MakeShape(U32, {}), "update_indices")));
+  }
   HloInstruction* slice =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
           slice_shape,
           builder.AddInstruction(
               HloInstruction::CreateParameter(0, full_shape, "slice_from")),
-          builder.AddInstruction(HloInstruction::CreateParameter(
-              1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+          slice_indices,
           /*slice_sizes=*/{10, 1, 1000}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       slice_shape,
       builder.AddInstruction(
-          HloInstruction::CreateParameter(2, slice_shape, "to_update")),
-      slice,
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
+          HloInstruction::CreateParameter(4, slice_shape, "to_update")),
+      slice, update_indices));
 
   auto computation = m->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter())));
+              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter(),
+                                         m::Parameter(), m::Parameter())));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
@@ -3858,7 +3992,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3879,7 +4013,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3900,7 +4034,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
@@ -3919,7 +4053,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3941,7 +4075,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3963,7 +4097,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3985,7 +4119,7 @@ TEST_F(AlgebraicSimplifierTest, NegateNegate) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -4005,7 +4139,7 @@ TEST_F(AlgebraicSimplifierTest, NotNot) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -4142,7 +4276,7 @@ PadReduceWindowEffectiveBroadcastCases() {
   return *cases;
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     PadReduceWindowEffectiveBroadcastInstantiation,
     PadReduceWindowEffectiveBroadcastTest,
     ::testing::ValuesIn(PadReduceWindowEffectiveBroadcastCases()));
@@ -4193,7 +4327,7 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   EXPECT_EQ(has_no_dot, dot_should_be_transformed);
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     BatchDotStrengthReductionTestInstantiation, BatchDotStrengthReductionTest,
     ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
                        ::testing::Values(1, 2), ::testing::Values(F32, BF16)));
@@ -4250,7 +4384,7 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
   EXPECT_EQ(has_no_dot, dot_should_be_transformed);
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DotStrengthReductionTestInstantiation, DotStrengthReductionTest,
     ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
                        ::testing::Values(1, 2), ::testing::Bool(),
@@ -4412,9 +4546,10 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   HloInstruction* const update = builder.AddInstruction(
       HloInstruction::CreateParameter(1, update_shape, "update"));
   HloInstruction* const start_indices = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int>({0})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>({})));
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      dslice_shape, operand, update, start_indices));
+      dslice_shape, operand, update,
+      std::initializer_list<HloInstruction*>({start_indices})));
   const HloComputation* const computation =
       m->AddEntryComputation(builder.Build());
 
@@ -4423,9 +4558,9 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   EXPECT_THAT(computation->root_instruction(), operand);
 }
 
-INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation,
-                        DotOfConcatSimplificationTest,
-                        ::testing::ValuesIn(kDotOfConcatTestSpecs));
+INSTANTIATE_TEST_SUITE_P(DotOfConcatSimplificationTestInstantiation,
+                         DotOfConcatSimplificationTest,
+                         ::testing::ValuesIn(kDotOfConcatTestSpecs));
 
 struct DotOfGatherTestSpec {
   int64 m;
@@ -4467,14 +4602,17 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
 
   int32 start_row = (spec.lcd == 0) ? 0 : spec.s;
   int32 start_col = (spec.lcd == 0) ? spec.s : 0;
-  const auto start_indices =
+  std::vector<HloInstruction*> start_indices = {
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int32>({start_row, start_col})));
+          LiteralUtil::CreateR0<int32>(start_row))),
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR0<int32>(start_col)))};
   int64 slice_row_size = (spec.lcd == 0) ? spec.k : 1;
   int64 slice_col_size = (spec.lcd == 0) ? 1 : spec.k;
-  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  std::vector<int64> slice_sizes = {slice_row_size, slice_col_size};
+  Shape ds_shape = ShapeUtil::MakeShape(F32, slice_sizes);
   auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ds_shape, lhs, start_indices, {slice_row_size, slice_col_size}));
+      ds_shape, lhs, start_indices, slice_sizes));
 
   int64 rhs_rows = (spec.rcd == 0) ? spec.k : spec.n;
   int64 rhs_cols = (spec.rcd == 0) ? spec.n : spec.k;
@@ -4507,7 +4645,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
   } else {
     EXPECT_THAT(computation->root_instruction(),
                 GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
-                                           m::Concatenate())));
+                                           m::Constant(), m::Constant())));
   }
 }
 
@@ -4545,14 +4683,17 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
 
   int32 start_row = (spec.rcd == 0) ? 0 : spec.s;
   int32 start_col = (spec.rcd == 0) ? spec.s : 0;
-  const auto start_indices =
+  std::vector<HloInstruction*> start_indices = {
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR0<int32>(start_row))),
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int32>({start_row, start_col})));
+          LiteralUtil::CreateR0<int32>(start_col)))};
   int64 slice_row_size = (spec.rcd == 0) ? spec.k : 1;
   int64 slice_col_size = (spec.rcd == 0) ? 1 : spec.k;
-  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  std::vector<int64> slice_sizes = {slice_row_size, slice_col_size};
+  Shape ds_shape = ShapeUtil::MakeShape(F32, slice_sizes);
   auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ds_shape, rhs, start_indices, {slice_row_size, slice_col_size}));
+      ds_shape, rhs, start_indices, slice_sizes));
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(spec.lcd);
@@ -4577,7 +4718,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
   } else {
     EXPECT_THAT(computation->root_instruction(),
                 GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
-                                           m::Concatenate())));
+                                           m::Constant(), m::Constant())));
   }
 }
 
@@ -4625,7 +4766,7 @@ std::vector<DotOfGatherTestSpec> DotOfGatherPositiveNegativeTests() {
   return all;
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest,
     ::testing::ValuesIn(DotOfGatherPositiveNegativeTests()));
 
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index ef5e211646e7b0b66b8e6c09948be58063422943..6cb0e985e57016e5a22fba50c3e3ad6970f1b178 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -142,13 +142,13 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
   // We only need to care about replica id 0 here, since the GlobalDataHandle is
   // the same for all buffers across replicas.
   const ShapedBuffer* shaped_buffer = replicated_buffers[0];
-  if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) {
+  if (!shaped_buffer->on_host_shape().IsTuple()) {
     return InvalidArgument("global data handle %d is not a tuple",
                            data.handle());
   }
   // If the on-host representation is a tuple, then the on-device one should be
   // as well.
-  TF_RET_CHECK(ShapeUtil::IsTuple(shaped_buffer->on_device_shape()));
+  TF_RET_CHECK(shaped_buffer->on_device_shape().IsTuple());
 
   if (ShapeUtil::IsNestedTuple(shaped_buffer->on_device_shape())) {
     return Unimplemented("Deconstructing nested tuples is not implemented.");
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index 47d2c7e35705698d49950c2fa042af1c6327d521..f8dff6a700cc9d5843053e3d451a7b005539ca26 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -36,19 +37,34 @@ namespace {
 
 namespace m = match;
 
-// Returns true iff the argument instruction is an AllReduce, followed by a
-// certain sequence of instructions and then a CRS. It must be possible to move
-// the AR past each instruction in the sequence.
-bool MatchesArCrsPattern(HloInstruction* instruction) {
+// Checks if the argument instruction is an AllReduce, followed by a certain
+// sequence of instructions and then a CRS. It must be possible to move
+// the AR past each instruction in the sequence. Returns the CRS, which is the
+// last instruction in the sequence.
+absl::optional<HloInstruction*> MatchesArCrsPattern(
+    HloInstruction* instruction) {
   auto can_ar_move_past_instruction = [](HloInstruction* instruction) -> bool {
     if (instruction->user_count() != 1) {
       return false;
     }
-    auto opcode = instruction->opcode();
-    return opcode == HloOpcode::kBitcast || opcode == HloOpcode::kTranspose ||
-           opcode == HloOpcode::kReshape || opcode == HloOpcode::kConvert ||
-           opcode == HloOpcode::kAdd || opcode == HloOpcode::kSubtract ||
-           opcode == HloOpcode::kMultiply;
+    switch (instruction->opcode()) {
+      case HloOpcode::kBitcast:
+      case HloOpcode::kTranspose:
+      case HloOpcode::kReshape:
+        return true;
+      case HloOpcode::kConvert:
+        // Can be moved across if both input and output is either float or
+        // integer (e.g. S32<->U32 or F32<->BF16)
+        return ShapeUtil::ElementIsFloating(instruction->shape()) ==
+               ShapeUtil::ElementIsFloating(instruction->operand(0)->shape());
+      case HloOpcode::kAdd:
+      case HloOpcode::kSubtract:
+      case HloOpcode::kMultiply:
+        // Only supported for floating point operands.
+        return ShapeUtil::ElementIsFloating(instruction->shape());
+      default:
+        return false;
+    }
   };
 
   auto computation_is_addition = [](HloComputation* c) {
@@ -59,17 +75,22 @@ bool MatchesArCrsPattern(HloInstruction* instruction) {
   if (!instruction->IsCrossModuleAllReduce() ||
       !computation_is_addition(instruction->called_computations()[0]) ||
       instruction->user_count() != 1) {
-    return false;
+    return absl::nullopt;
   }
   auto next = instruction->users()[0];
   while (!next->IsCrossReplicaAllReduce()) {
     if (can_ar_move_past_instruction(next)) {
       next = next->users()[0];
     } else {
-      return false;
+      return absl::nullopt;
     }
   }
-  return computation_is_addition(next->called_computations()[0]);
+  if (!Cast<HloAllReduceInstruction>(next)->IsNoop() &&
+      computation_is_addition(next->called_computations()[0])) {
+    return absl::optional<HloInstruction*>(next);
+  } else {
+    return absl::nullopt;
+  }
 }
 
 }  // namespace
@@ -85,7 +106,7 @@ absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
       return caller_instruction;
     }
   }
-  return absl::optional<HloInstruction*>();
+  return absl::nullopt;
 }
 
 std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
@@ -176,6 +197,15 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
     return false;
   }
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+  if (i1->IsCrossModuleAllReduce()) {
+    return i1->Identical(*i2,
+                         /*eq_operands=*/std::equal_to<const HloInstruction*>(),
+                         eq_computations,
+                         /*layout_sensitive=*/false);
+  }
   visited_pairs->emplace(min_uid, max_uid);
   for (int i = 0; i < operands1.size(); ++i) {
     auto operand1 = operands1[i];
@@ -201,9 +231,6 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   // InstructionsComputeSameValue earlier.
   auto eq_instructions = [](const HloInstruction* i1,
                             const HloInstruction* i2) -> bool { return true; };
-  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
-    return *a == *b;
-  };
   return i1->Identical(*i2, eq_instructions, eq_computations,
                        /*layout_sensitive=*/false);
 }
@@ -211,8 +238,14 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
 void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (MatchesArCrsPattern(instruction)) {
-        all_reduce_map_[*(instruction->all_reduce_id())].push_back(instruction);
+      auto maybe_crs = MatchesArCrsPattern(instruction);
+      if (maybe_crs) {
+        auto crs = *maybe_crs;
+        int64 ar_id = *(instruction->all_reduce_id());
+        if (crs_reserved_map_.find(crs) == crs_reserved_map_.end()) {
+          all_reduce_map_[ar_id].push_back(instruction);
+          crs_reserved_map_[crs] = ar_id;
+        }
       }
     }
   }
@@ -229,14 +262,17 @@ void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
       auto next_0 = instr_0->users()[0];
       auto next_i = instr_i->users()[0];
       absl::flat_hash_map<int64, int64> visited_pairs;
-      do {
+      while (true) {
         if (!InstructionsComputeSameValue(next_0, next_i, &visited_pairs)) {
           all_reduce_map_.erase(all_reduce_id);
           break;
         }
+        if (next_0->IsCrossReplicaAllReduce()) {
+          break;
+        }
         next_0 = next_0->users()[0];
         next_i = next_i->users()[0];
-      } while (!next_0->IsCrossReplicaAllReduce());
+      }
     }
   }
 }
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
index 6f54b97615b270bc6b180dd47d9aff6473752b47..e61ef5d4f9072979a6c356a9456c91e19405b01e 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.h
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -83,6 +83,11 @@ class ArCrsCombiner : public HloModulePass {
   // Map from all-reduce ids to the all reduce instructions.
   absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
 
+  // Map from a CRS instruction to the all-reduce ID of the AR paired with the
+  // CRS. Sometimes, several ARs in the code could be paired with the same CRS.
+  // We use this map to pick a single AR/CRS path to rewrite.
+  absl::flat_hash_map<HloInstruction*, int64> crs_reserved_map_;
+
   std::unique_ptr<CallGraph> call_graph_;
 };
 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index caa57296f465698eb70d7cb8327d4678f394b323..5152f0dc884a153f9b0ade06acd479832d87ff25 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -360,6 +360,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
   %p = bf16[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
 
   %all-reduce.ar.1 = bf16[]
       all-reduce(%p),
@@ -377,7 +378,7 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
       sharding={maximal device=0}
 
   %all-reduce.ar.2 = bf16[]
-      all-reduce(%p),
+      all-reduce(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
       to_apply=%sum.bf16,
@@ -407,7 +408,7 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
   EXPECT_TRUE(changed);
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Tuple(op::AllReduce(op::Convert(op::Parameter())),
-                        op::AllReduce(op::Convert(op::Parameter()))));
+                        op::AllReduce(op::Convert(op::Constant()))));
   auto crs_after =
       module->entry_computation()->root_instruction()->operands()[0];
   auto replica_groups_after = crs_after->replica_groups();
@@ -705,5 +706,470 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
   EXPECT_FALSE(changed);
 }
 
+TEST_F(ArCrsCombinerTest, ArThenCrsDontCrash) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.1 (a: f32[], b: f32[]) -> f32[] {
+  %a = f32[] parameter(0)
+  %b = f32[] parameter(1)
+  ROOT %add = f32[] add(%a, %b)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%all-reduce.ar.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.1,
+      sharding={maximal device=0}
+  %multiply.1 = f32[]
+      multiply(%all-reduce.1, %constant.f32),
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%all-reduce.ar.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.1,
+      sharding={maximal device=1}
+  %multiply.2 = f32[]
+      multiply(%all-reduce.2, %constant.f32),
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Parameter()),
+                        op::AllReduce(op::Parameter())));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleAdds) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.1 = f32[] constant(1)
+  %constant.2 = f32[] constant(2)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add.11 = f32[]
+      add(%constant.1, %all-reduce.ar.1),
+      sharding={maximal device=0}
+  %add.12 = f32[]
+      add(%constant.2, %add.11),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%add.12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add.21 = f32[]
+      add(%constant.1, %all-reduce.ar.2),
+      sharding={maximal device=0}
+  %add.22 = f32[]
+      add(%constant.2, %add.21),
+      sharding={maximal device=0}
+  %all-reduce.2 = f32[]
+      all-reduce(%add.22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Divide(op::Constant(), op::Constant()),
+                            op::Add(op::Divide(op::Constant(), op::Constant()),
+                                    op::Parameter()))),
+                        op::AllReduce(op::Add(
+                            op::Divide(op::Constant(), op::Constant()),
+                            op::Add(op::Divide(op::Constant(), op::Constant()),
+                                    op::Parameter())))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArSubtractCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+  %sub.1 = f32[]
+      subtract(%constant.f32, %all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%sub.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+  %sub.2 = f32[]
+      subtract(%constant.f32, %all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%sub.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::AllReduce(op::Subtract(op::Divide(op::Constant(), op::Constant()),
+                                     op::Parameter())),
+          op::AllReduce(op::Subtract(op::Divide(op::Constant(), op::Constant()),
+                                     op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleARsLeft) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %const1 = f32[] constant(1)
+  %const2 = f32[] constant(2)
+
+  %ar11 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add11 = f32[]
+      add(%ar11, %const1),
+      sharding={maximal device=0}
+  %ar12 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add12 = f32[]
+      add(%add11, %ar12),
+      sharding={maximal device=0}
+  %crs1 = f32[]
+      all-reduce(%add12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %ar21 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add21 = f32[]
+      add(%ar21, %const1),
+      sharding={maximal device=1}
+  %ar22 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add22 = f32[]
+      add(%add21, %ar22),
+      sharding={maximal device=1}
+  %crs2 = f32[]
+      all-reduce(%add22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%crs1, %crs2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Add(op::Parameter(),
+                                    op::Divide(op::Constant(), op::Constant())),
+                            op::Divide(op::AllReduce(), op::Constant()))),
+                        op::AllReduce(op::Add(
+                            op::Add(op::Parameter(),
+                                    op::Divide(op::Constant(), op::Constant())),
+                            op::Divide(op::AllReduce(), op::Constant())))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleARsRight) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %const1 = f32[] constant(1)
+  %const2 = f32[] constant(2)
+
+  %ar11 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %ar12 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add11 = f32[]
+      add(%ar12, %const1),
+      sharding={maximal device=0}
+  %add12 = f32[]
+      add(%ar11, %add11),
+      sharding={maximal device=0}
+  %crs1 = f32[]
+      all-reduce(%add12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %ar21 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %ar22 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add21 = f32[]
+      add(%ar22, %const1),
+      sharding={maximal device=1}
+  %add22 = f32[]
+      add(%ar21, %add21),
+      sharding={maximal device=1}
+  %crs2 = f32[]
+      all-reduce(%add22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%crs1, %crs2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Parameter(),
+                            op::Divide(op::Add(op::AllReduce(), op::Constant()),
+                                       op::Constant()))),
+                        op::AllReduce(op::Add(
+                            op::Parameter(),
+                            op::Divide(op::Add(op::AllReduce(), op::Constant()),
+                                       op::Constant())))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, OneReplicaDontRewrite) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
+  %p = bf16[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+
+  %all-reduce.ar.1 = bf16[]
+      all-reduce(%p),
+      replica_groups={{0}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=0}
+  %convert.1 = f32[]
+      convert(%all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%convert.1),
+      replica_groups={{0}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = bf16[]
+      all-reduce(%constant.bf16),
+      replica_groups={{0}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=1}
+  %convert.2 = f32[]
+      convert(%all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%convert.2),
+      replica_groups={{0}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_FALSE(changed);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 2cf24a9dd5fa18abe9dde4eb49b03c6586bfef03..215e8ced4bb3f98a26ac4eb9912a7fd4d917852f 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -115,12 +115,10 @@ StatusOr<StreamPool::Ptr> Backend::BorrowStream(int device_ordinal) {
 
 StatusOr<StreamPool::Ptr> Backend::BorrowStream(se::StreamExecutor* executor) {
   tensorflow::mutex_lock l(mu_);
-  if (0 == stream_pools_.count(executor)) {
-    stream_pools_.emplace(std::piecewise_construct,
-                          std::forward_as_tuple(executor),
-                          std::forward_as_tuple());
+  if (!stream_pools_.contains(executor)) {
+    stream_pools_.emplace(executor, absl::make_unique<StreamPool>());
   }
-  return stream_pools_.at(executor).BorrowStream(executor);
+  return stream_pools_.at(executor)->BorrowStream(executor);
 }
 
 Backend::Backend(se::Platform* platform, Compiler* compiler,
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 7ca993fb2656037951d98d9c4459a3c3e4c64c61..c35f033dc0180409ae3888c2050021da83f5c72a 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -175,7 +176,8 @@ class Backend {
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<se::StreamExecutor*, StreamPool> stream_pools_ GUARDED_BY(mu_);
+  absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<StreamPool>>
+      stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 0e6ca1871b379a2f55b92207133822fc6258b007..e5f5c3edb2ac0c217317fbf809463aa31af9af59 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -123,7 +123,7 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
     auto elements_per_feature_u32 = add_instruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(1)));
 
-    for (int64 i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+    for (int64 i = 0; i < operand->shape().rank(); ++i) {
       if (i == feature_index) {
         continue;
       }
@@ -229,7 +229,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       add(HloInstruction::CreateConstant(std::move(epsilon_literal))), {}));
   std::vector<int64> dimensions_without_feature;
 
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+  for (int64 i = 0; i < operand_shape.rank(); ++i) {
     if (i != feature_index) {
       dimensions_without_feature.push_back(i);
     }
@@ -357,7 +357,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
 
   std::vector<int64> dimensions_without_feature;
 
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+  for (int64 i = 0; i < operand_shape.rank(); ++i) {
     if (i != feature_index) {
       dimensions_without_feature.push_back(i);
     }
@@ -494,7 +494,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   std::vector<int64> dimensions_without_feature;
 
-  for (int64 i = 0; i < ShapeUtil::Rank(activation_shape); ++i) {
+  for (int64 i = 0; i < activation_shape.rank(); ++i) {
     if (i != feature_index) {
       dimensions_without_feature.push_back(i);
     }
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
index 6caef77ed00909040a54e65651cc6fb7ca74eb90..e62d72b323bd1d113e9d87bf8602bfb434c40d61 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
@@ -190,7 +190,7 @@ Status BFloat16ConversionFoldingVisitor::HandleAllReduce(HloInstruction* crs) {
   }
 
   // If the output is not a tuple, we don't need special handling.
-  if (!ShapeUtil::IsTuple(crs->shape())) {
+  if (!crs->shape().IsTuple()) {
     return Status::OK();
   }
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
index e3aefe906739b74e887f33d2ffc3ad7a60510b5b..d1b14d604f0559b6b18f7d1fba127669c241c8a3 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -363,7 +363,7 @@ Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
   // TODO(b/112040122): Correctly normalize variadic reduce.
   if ((hlo->opcode() == HloOpcode::kSort ||
        hlo->opcode() == HloOpcode::kAllReduce) &&
-      ShapeUtil::IsTuple(hlo->shape())) {
+      hlo->shape().IsTuple()) {
     return HandleMultipleOutputs(hlo);
   }
   return HandleInstruction(hlo);
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index 05dd4b3e914f5563a33d534829ffb01668279064..bab63f66d83b712d756078bef84926eed235f6b5 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -277,7 +277,7 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo,
               *use.instruction, use.operand_number)) {
         if (use.instruction->opcode() == HloOpcode::kTuple ||
             (use.instruction->opcode() == HloOpcode::kAllReduce &&
-             ShapeUtil::IsTuple(use.instruction->shape()))) {
+             use.instruction->shape().IsTuple())) {
           ShapeIndex use_output_index{use.operand_number};
           for (int64 i : use.operand_index) {
             use_output_index.push_back(i);
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 202e45e181d13621f79e3bf95e33091b54e8b779..e1b91b500191c7756f3d1a4b160a0dd1e09cfe7d 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -86,10 +86,9 @@ std::vector<int64> ColorInterferenceGraph(
   // first, but it would be good to investigate other ordering heuristics too.
   std::vector<int64> nodes(node_count);
   std::iota(nodes.begin(), nodes.end(), 0);
-  std::sort(nodes.begin(), nodes.end(),
-            [&interference_map](const int64 i, const int64 j) {
-              return interference_map[i].size() > interference_map[j].size();
-            });
+  absl::c_sort(nodes, [&interference_map](const int64 i, const int64 j) {
+    return interference_map[i].size() > interference_map[j].size();
+  });
 
   const int64 kColorUnassigned = -1;
   std::vector<int64> assigned_colors(node_count, kColorUnassigned);
@@ -138,8 +137,8 @@ Status GatherComputationsByAllocationType(
     worklist.pop_front();
     const HloComputation* computation = worklist_front.first;
     bool is_thread_local = worklist_front.second;
-    bool in_thread_local_set = thread_local_set.count(computation) > 0;
-    bool in_global_set = global_set.count(computation) > 0;
+    bool in_thread_local_set = thread_local_set.contains(computation);
+    bool in_global_set = global_set.contains(computation);
 
     // If the computation has already been added to the respective set, then
     // nothing to do.
@@ -207,9 +206,9 @@ Status GatherComputationsByAllocationType(
 
   // Add the computations to the vectors in post order.
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (thread_local_set.count(computation) > 0) {
+    if (thread_local_set.contains(computation)) {
       thread_local_computations->push_back(computation);
-    } else if (global_set.count(computation) > 0) {
+    } else if (global_set.contains(computation)) {
       global_computations->push_back(computation);
     }
     // If the computation is not reachable from the entry computation, then it
@@ -219,13 +218,6 @@ Status GatherComputationsByAllocationType(
   return Status::OK();
 }
 
-size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const {
-  uint64 h = std::hash<int64>()(s.index());
-  h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.offset()));
-  h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.size()));
-  return h;
-}
-
 string BufferAllocation::Slice::ToString() const {
   return absl::StrCat("{index:", index(), ", offset:", offset_,
                       ", size:", size_, "}");
@@ -240,7 +232,7 @@ BufferAllocation::Slice BufferAllocation::GetSlice(
 void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
   VLOG(4) << "Trying to add " << buffer << " to allocation #" << index();
-  CHECK(assigned_buffers_.count(&buffer) == 0)
+  CHECK(!assigned_buffers_.contains(&buffer))
       << "LogicalBuffer " << buffer << " already assigned to allocation "
       << index_;
   CHECK_LE(offset, size_) << "LogicalBuffer " << buffer
@@ -279,11 +271,12 @@ BufferAllocationProto BufferAllocation::ToProto() const {
     proto_assigned->set_offset(buffer_offset_size.second.offset);
     proto_assigned->set_size(buffer_offset_size.second.size);
   }
-  std::sort(proto.mutable_assigned()->begin(), proto.mutable_assigned()->end(),
-            [](const BufferAllocationProto::Assigned& assign1,
-               const BufferAllocationProto::Assigned& assign2) {
-              return assign1.logical_buffer_id() < assign2.logical_buffer_id();
-            });
+  absl::c_sort(*proto.mutable_assigned(),
+               [](const BufferAllocationProto::Assigned& assign1,
+                  const BufferAllocationProto::Assigned& assign2) {
+                 return assign1.logical_buffer_id() <
+                        assign2.logical_buffer_id();
+               });
   return proto;
 }
 
@@ -315,10 +308,10 @@ string BufferAllocation::ToString() const {
   for (const auto& buffer_offset_size : assigned_buffers_) {
     sorted_buffers.push_back(buffer_offset_size.first);
   }
-  std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
+  absl::c_sort(sorted_buffers,
+               [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 return a->id() < b->id();
+               });
   for (const LogicalBuffer* buffer : sorted_buffers) {
     const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer);
     StrAppend(&output, absl::StrFormat(
@@ -346,7 +339,7 @@ const PointsToSet& BufferAssignment::GetPointsToSet(
 
 bool BufferAssignment::HasAllocation(const LogicalBuffer& buffer) const {
   TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer));
-  return allocation_index_for_buffer_.count(&buffer) > 0;
+  return allocation_index_for_buffer_.contains(&buffer);
 }
 
 const BufferAllocation& BufferAssignment::GetAssignedAllocation(
@@ -401,7 +394,7 @@ bool BufferAssignment::HasAllocationAt(const HloInstruction* instruction,
                                        const ShapeIndex& index) const {
   for (const LogicalBuffer* buffer :
        GetPointsToSet(instruction).element(index)) {
-    if (allocation_index_for_buffer_.count(buffer) > 0) {
+    if (allocation_index_for_buffer_.contains(buffer)) {
       return true;
     }
   }
@@ -459,8 +452,7 @@ bool BufferAssignment::SharesSliceAtIndex(
 
 bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
                                           const HloInstruction* hlo_b) const {
-  using SliceSet =
-      flat_hash_set<BufferAllocation::Slice, BufferAllocation::Slice::Hasher>;
+  using SliceSet = flat_hash_set<BufferAllocation::Slice>;
   // Gets the slices all of instr's subshapes.  If any subshape doesn't have an
   // assigned slice, returns the empty set.
   auto collect_slices = [&](const HloInstruction* instr) -> SliceSet {
@@ -487,10 +479,9 @@ bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
   // didn't return the empty set) for both HLOs, and the two resulting sets of
   // slices are disjoint.
   return !slices_a.empty() && !slices_b.empty() &&
-         std::none_of(slices_a.begin(), slices_a.end(),
-                      [&](const BufferAllocation::Slice& slice) {
-                        return slices_b.count(slice) > 0;
-                      });
+         absl::c_none_of(slices_a, [&](const BufferAllocation::Slice& slice) {
+           return slices_b.contains(slice);
+         });
 }
 
 StatusOr<BufferAllocation::Slice>
@@ -519,7 +510,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer,
 void BufferAssignment::AddAssignment(BufferAllocation* allocation,
                                      const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
-  CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer))
+  CHECK(!allocation_index_for_buffer_.contains(&buffer))
       << "LogicalBuffer " << buffer << " already has an allocation.";
   CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty())
       << "Non-reusable allocation already assigned a buffer: "
@@ -761,7 +752,8 @@ namespace {
 bool MayInterfereAcrossSubcomputations(BufferAssignment* assignment,
                                        const LogicalBuffer& a_buffer,
                                        const LogicalBuffer& b_buffer) {
-  auto call_graph = assignment->liveness().hlo_ordering().call_graph();
+  const CallGraph& call_graph =
+      assignment->liveness().hlo_ordering().call_graph();
   const HloInstruction* a_ancestor;
   const HloInstruction* b_ancestor;
   std::tie(a_ancestor, b_ancestor) =
@@ -960,35 +952,35 @@ Status BufferAssigner::AssignBuffersForComputation(
   // operands (assuming operands are the same/larger size) enabling the
   // important reuse case where an elementwise instruction reuses one of its
   // operand's buffer. This improves locality.
-  std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [has_sequential_order, &liveness, &post_order_position, assignment](
-                const LogicalBuffer* a, const LogicalBuffer* b) {
-              // Primary sort is by decreasing buffer size.
-              const int64 a_size = assignment->buffer_size_(*a);
-              const int64 b_size = assignment->buffer_size_(*b);
-              if (a_size != b_size) {
-                return a_size > b_size;  // use ">" for decreasing size.
-              }
-              // Otherwise live out buffers come before others, if the
-              // instructions are sequentially ordered.
-              if (has_sequential_order) {
-                const bool a_live_out = liveness.MaybeLiveOut(*a);
-                const bool b_live_out = liveness.MaybeLiveOut(*b);
-                if (a_live_out != b_live_out) {
-                  return a_live_out;
-                }
-              }
-              // Final tiebreaker is in instruction post order.
-              return post_order_position.at(a->instruction()) <
-                     post_order_position.at(b->instruction());
-            });
+  absl::c_sort(sorted_buffers,
+               [has_sequential_order, &liveness, &post_order_position,
+                assignment](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 // Primary sort is by decreasing buffer size.
+                 const int64 a_size = assignment->buffer_size_(*a);
+                 const int64 b_size = assignment->buffer_size_(*b);
+                 if (a_size != b_size) {
+                   return a_size > b_size;  // use ">" for decreasing size.
+                 }
+                 // Otherwise live out buffers come before others, if the
+                 // instructions are sequentially ordered.
+                 if (has_sequential_order) {
+                   const bool a_live_out = liveness.MaybeLiveOut(*a);
+                   const bool b_live_out = liveness.MaybeLiveOut(*b);
+                   if (a_live_out != b_live_out) {
+                     return a_live_out;
+                   }
+                 }
+                 // Final tiebreaker is in instruction post order.
+                 return post_order_position.at(a->instruction()) <
+                        post_order_position.at(b->instruction());
+               });
 
   // BufferAllocations are necessarily created in decreasing size order. Keep
   // indices of previously created BufferAllocations in allocation_indices.
   std::vector<BufferAllocation::Index> allocation_indices;
   for (const LogicalBuffer* buffer : sorted_buffers) {
     VLOG(3) << "Assigning allocation to: " << *buffer;
-    if (colocated_buffers.count(buffer) > 0) {
+    if (colocated_buffers.contains(buffer)) {
       // Colocated buffers are currently assigned in an earlier pass.
       VLOG(3) << "Skipping colocated buffer: " << *buffer;
       continue;
@@ -1020,10 +1012,14 @@ Status BufferAssigner::AssignBuffersForComputation(
       // callers.
       BufferAllocation* allocation =
           assignment->NewAllocation(*buffer, buffer_size);
+      bool parameter_has_alias =
+          assignment->module().input_output_alias_config().ParameterHasAlias(
+              instruction->parameter_number(), buffer->index());
       allocation->set_entry_computation_parameter(
-          instruction->parameter_number(), buffer->index());
-      VLOG(3) << "New allocation #" << allocation->index()
-              << " for entry computation parameter: " << *buffer;
+          instruction->parameter_number(), buffer->index(),
+          parameter_has_alias);
+      VLOG(3) << "Mark allocation #" << allocation->index()
+              << " as entry computation parameter: " << *buffer;
       continue;
     }
 
@@ -1036,7 +1032,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       continue;
     }
 
-    if (ShapeUtil::IsTuple(buffer->shape())) {
+    if (buffer->shape().IsTuple()) {
       BufferAllocation* allocation =
           assignment->NewAllocation(*buffer, buffer_size);
       allocation->set_is_tuple(true);
@@ -1056,7 +1052,7 @@ Status BufferAssigner::AssignBuffersForComputation(
              assignment->GetAllSlices(operand, /*index=*/{})) {
           BufferAllocation* allocation =
               assignment->GetMutableAllocation(operand_slice.index());
-          if (colocated_allocations.count(allocation->index()) == 0) {
+          if (!colocated_allocations.contains(allocation->index())) {
             // TODO(b/32491382) Colocated buffers are currently assigned in an
             // earlier pass, and so can break the "increasing allocation size"
             // invariant in this function (causing this CHECK to fail). However,
@@ -1087,7 +1083,7 @@ Status BufferAssigner::AssignBuffersForComputation(
         // Instructions are iterated in increasing buffer size, so any
         // previously create allocation must be large enough to hold this
         // instruction's output (with the exception of colocated buffers).
-        if (colocated_allocations.count(allocation->index()) == 0) {
+        if (!colocated_allocations.contains(allocation->index())) {
           // TODO(b/32491382) Colocated buffers are currently assigned in an
           // earlier pass, and so can break the "increasing allocation size"
           // invariant in this function (causing this CHECK to fail). However,
@@ -1313,10 +1309,10 @@ std::vector<const LogicalBuffer*> ComputePeakMemoryLogicalBuffers(
                              live_buffers.end());
 
   // Stabily sort the live buffers.
-  std::sort(live_buffers_vector.begin(), live_buffers_vector.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
+  absl::c_sort(live_buffers_vector,
+               [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 return a->id() < b->id();
+               });
   return live_buffers_vector;
 }
 
@@ -1376,7 +1372,7 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   std::vector<size_t> overlap_set_indices;
   for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) {
     for (const LogicalBuffer* buffer : colocated_set) {
-      if ((*colocated_buffer_sets)[index].count(buffer) > 0) {
+      if ((*colocated_buffer_sets)[index].contains(buffer)) {
         VLOG(5) << "Found overlap with existing set on buffer "
                 << buffer->ToString() << "\n"
                 << ColocatedBufferSetsToString((*colocated_buffer_sets)[index],
@@ -1425,12 +1421,14 @@ BufferAssigner::MergeColocatedBufferSets(
           << colocated_buffer_sets.size();
 
   // Returns true if the given buffer is for the entry parameter.
-  auto is_entry_parameter = [](const LogicalBuffer& buffer) {
+  auto is_readonly_entry_parameter = [](const LogicalBuffer& buffer) {
     auto* instruction = buffer.instruction();
     auto* computation = instruction->parent();
     auto* module = computation->parent();
     return instruction->opcode() == HloOpcode::kParameter &&
-           computation == module->entry_computation();
+           computation == module->entry_computation() &&
+           !module->input_output_alias_config().ParameterHasAlias(
+               instruction->parameter_number(), buffer.index());
   };
 
   std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
@@ -1452,7 +1450,7 @@ BufferAssigner::MergeColocatedBufferSets(
   for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
     for (auto& buffer : colocated_buffer_sets[i]) {
       if (buffer_liveness.MaybeLiveOut(*buffer) ||
-          is_entry_parameter(*buffer) ||
+          is_readonly_entry_parameter(*buffer) ||
           buffer->instruction()->opcode() == HloOpcode::kConstant) {
         set_can_be_merged[i] = false;
         break;
@@ -1539,15 +1537,16 @@ void BufferAssigner::BuildColocatedBufferSets(
   VLOG(4) << "Input/Output Alias Config: ";
   VLOG(4) << module->input_output_alias_config();
   module->input_output_alias_config().ForEachAlias(
-      [&](const ShapeIndex& output_index, int64 param_number,
-          const ShapeIndex& param_index) {
+      [&](const ShapeIndex& output_index,
+          const HloInputOutputAliasConfig::Alias& alias) {
         std::vector<const LogicalBuffer*> colocated_set;
         AddBufferToColocatedSet(module->entry_computation()->root_instruction(),
                                 output_index, points_to_analysis,
                                 &colocated_set);
         AddBufferToColocatedSet(
-            module->entry_computation()->parameter_instruction(param_number),
-            param_index, points_to_analysis, &colocated_set);
+            module->entry_computation()->parameter_instruction(
+                alias.parameter_number),
+            alias.parameter_index, points_to_analysis, &colocated_set);
         AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
       });
 
@@ -1741,10 +1740,6 @@ void BufferAssigner::AssignColocatedBufferSets(
         // module-level scope, we can allow buffers to be shared across
         // computations (in some cases).
         allocation = assignment->NewAllocation(*buffer, buffer_size);
-        if (entry_parameter_number >= 0) {
-          allocation->set_entry_computation_parameter(
-              entry_parameter_number, *entry_parameter_shape_idx);
-        }
         if (is_constant) {
           allocation->set_constant(true);
         }
@@ -1758,6 +1753,16 @@ void BufferAssigner::AssignColocatedBufferSets(
       }
       colocated_buffers->insert(buffer);
     }
+
+    // If an allocation contains a parameter, set corresponding fields.
+    if (entry_parameter_number >= 0) {
+      bool parameter_has_alias =
+          assignment->module().input_output_alias_config().ParameterHasAlias(
+              entry_parameter_number, *entry_parameter_shape_idx);
+      allocation->set_entry_computation_parameter(entry_parameter_number,
+                                                  *entry_parameter_shape_idx,
+                                                  parameter_has_alias);
+    }
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 0a9fdede803e84ca42472259084615c031b206eb..448dec3b1aa0c0f85e1060a70e965fcf3952c320 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -96,7 +96,11 @@ class BufferAllocation {
   // Whether this allocation is readonly i.e. backed by memory we cannot write
   // to.
   bool is_readonly() const {
-    return is_entry_computation_parameter() || is_constant();
+    // Entry parameters are generally readonly, except when they are aliased
+    // with any output.
+    return (is_entry_computation_parameter() &&
+            !is_parameter_aliased_with_output_) ||
+           is_constant();
   }
 
   bool is_tuple() const { return is_tuple_; }
@@ -186,9 +190,10 @@ class BufferAllocation {
              end > other.offset_;
     }
 
-    struct Hasher {
-      size_t operator()(Slice s) const;
-    };
+    template <typename H>
+    friend H AbslHashValue(H h, const Slice& s) {
+      return H::combine(std::move(h), s.index(), s.offset(), s.size());
+    }
 
     string ToString() const;
 
@@ -273,8 +278,10 @@ class BufferAllocation {
   void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size);
 
   void set_entry_computation_parameter(int64 parameter_number,
-                                       ShapeIndex param_shape_index) {
+                                       ShapeIndex param_shape_index,
+                                       bool parameter_aliased_with_output) {
     is_entry_computation_parameter_ = true;
+    is_parameter_aliased_with_output_ = parameter_aliased_with_output;
     parameter_number_ = parameter_number;
     param_shape_index_ = std::move(param_shape_index);
   }
@@ -304,6 +311,9 @@ class BufferAllocation {
   // outlast the computation.
   bool is_entry_computation_parameter_ = false;
 
+  // Whether this entry computation parameter is aliased with output.
+  bool is_parameter_aliased_with_output_ = false;
+
   // If this allocation holds an entry computation parameter, this field
   // indicates the index (starting from 0) of the parameter.
   int64 parameter_number_ = 0;
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29..580bc2f43384006eab8711490689a200fc887d37 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
@@ -309,7 +310,7 @@ class BufferAssignmentTest : public HloTestBase {
 static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
                             const std::vector<const HloInstruction*>& b,
                             const BufferAssignment& assignment) {
-  std::set<BufferAllocation::Slice> a_slices;
+  absl::flat_hash_set<BufferAllocation::Slice> a_slices;
   for (const HloInstruction* instruction : a) {
     if (assignment.HasTopLevelAllocation(instruction)) {
       a_slices.insert(
@@ -319,8 +320,8 @@ static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
 
   for (const HloInstruction* instruction : b) {
     if (assignment.HasTopLevelAllocation(instruction)) {
-      if (a_slices.count(assignment.GetUniqueTopLevelSlice(instruction)
-                             .ConsumeValueOrDie())) {
+      if (a_slices.contains(assignment.GetUniqueTopLevelSlice(instruction)
+                                .ConsumeValueOrDie())) {
         return false;
       }
     }
@@ -464,6 +465,40 @@ TEST_F(BufferAssignmentTest, Basic) {
   GetAssignedOutputAllocation(*buffers, sub);
 }
 
+TEST_F(BufferAssignmentTest, AliasedParamCanBeReused) {
+  // If an input buffer and output buffer aliases, the input buffer can be
+  // reused for other intermediate results.
+  //
+  // param0[100] ----- (neg1) -- (neg2)
+  //    |                           |
+  //    + -------- Aliased ---------+
+
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32vec100_, "p0"));
+  auto neg_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param));
+  auto neg_2 = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, neg_1));
+
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias(
+      {}, 0, {}, HloInputOutputAliasConfig::kUserAlias));
+
+  auto buffers = RunBufferAssignment(module.get());
+
+  BufferAllocation param_buffer = GetAssignedInputAllocation(*buffers, param);
+  BufferAllocation neg_1_buffer = GetAllocation(*buffers, neg_1, {});
+  BufferAllocation neg_2_buffer = GetAllocation(*buffers, neg_2, {});
+
+  // Everything use one buffer.
+  EXPECT_EQ(param_buffer.index(), neg_1_buffer.index());
+  EXPECT_EQ(neg_2_buffer.index(), neg_1_buffer.index());
+}
+
 TEST_F(BufferAssignmentTest, AddCannotReuse) {
   // Pass in a special rule to indicate that "add" cannot reuse any buffer.
   //
@@ -2485,9 +2520,9 @@ while_body {
   get-tuple-element.3 = s32[] get-tuple-element(state), index=0
   constant.2 = s32[] constant(128)
   add.5 = s32[] add(get-tuple-element.3, constant.2)
-  constant.3 = s32[3]{0} constant({0, 0, 0})
-  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3)
-  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3)
+  constant.3 = s32[] constant(0)
+  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3, constant.3, constant.3)
+  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3)
   ROOT tuple.85 = (s32[], s32[], s32[2]{0}, f32[1280,1,128]{2,1,0}) tuple(add.5, dynamic-update-slice.9)
 }
 
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 40825a78716b1c0b9fb0121787977d275891c0f8..23b9af0281b0d5ee1ef6ca2315f0cc1042285609 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -52,8 +52,8 @@ class BufferLivenessTest : public HloTestBase {
   // interfere. Precondition: 'a' and 'b' are array-shaped.
   bool InstructionsMayInterfere(const BufferLiveness& liveness,
                                 HloInstruction* a, HloInstruction* b) {
-    EXPECT_FALSE(ShapeUtil::IsTuple(a->shape()));
-    EXPECT_FALSE(ShapeUtil::IsTuple(b->shape()));
+    EXPECT_FALSE(a->shape().IsTuple());
+    EXPECT_FALSE(b->shape().IsTuple());
     return liveness.MayInterfere(
         GetBuffer(liveness, /*instruction=*/a, /*index=*/{}),
         GetBuffer(liveness, /*instruction=*/b, /*index=*/{}));
@@ -66,8 +66,8 @@ class BufferLivenessTest : public HloTestBase {
                                  HloInstruction* a, HloInstruction* b,
                                  const ShapeIndex& index) {
     // Check that top-level shapes are tuple and tuple element shapes are equal.
-    EXPECT_TRUE(ShapeUtil::IsTuple(a->shape()));
-    EXPECT_TRUE(ShapeUtil::IsTuple(b->shape()));
+    EXPECT_TRUE(a->shape().IsTuple());
+    EXPECT_TRUE(b->shape().IsTuple());
     EXPECT_TRUE(
         ShapeUtil::Compatible(ShapeUtil::GetSubshape(a->shape(), index),
                               ShapeUtil::GetSubshape(b->shape(), index)));
@@ -638,10 +638,10 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     }
     // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
     auto starts = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
     auto dynamic_update_slice =
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-            data_shape, gte1, update, starts));
+            data_shape, gte1, update, {starts}));
     // Create output tuple.
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
@@ -794,10 +794,10 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     }
     // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
     auto starts = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
     auto dynamic_update_slice =
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-            data_shape, gte1, update, starts));
+            data_shape, gte1, update, {starts}));
     // Create output tuple.
     auto tuple_root = builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc
index fdf822c666b15afbc7553ca89d4f92ab08201869..b1abba20689915b03304aacd7a5fcca5443c2c60 100644
--- a/tensorflow/compiler/xla/service/buffer_value.cc
+++ b/tensorflow/compiler/xla/service/buffer_value.cc
@@ -29,8 +29,8 @@ BufferValue::BufferValue(HloInstruction* instruction, const ShapeIndex& index,
                          Id id)
     : id_(id) {
   const Shape& shape = ShapeUtil::GetSubshape(instruction->shape(), index);
-  is_array_ = ShapeUtil::IsArray(shape);
-  is_tuple_ = ShapeUtil::IsTuple(shape);
+  is_array_ = shape.IsArray();
+  is_tuple_ = shape.IsTuple();
 }
 
 BufferValue::~BufferValue() {}
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 173b3fc05f53d523fb07ef9b14be884fd5f8aeb1..94af788c54f6c722997311bec50da3ed93aa3cee 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -236,6 +236,41 @@ void CallGraph::SetCallContexts() {
   }
 }
 
+void CallGraph::SetNodeDepths() {
+  std::queue<CallGraphNode*> worklist;
+
+  // Initialize node depths to -1.
+  for (CallGraphNode& node : nodes_) {
+    node.set_depth(-1);
+  }
+
+  // Initialize worklist with all roots of the call graph (computations without
+  // callers).
+  for (const HloComputation* computation : module_->computations()) {
+    CallGraphNode& node = GetNode(computation);
+    if (node.callers().empty()) {
+      node.set_depth(0);
+      worklist.push(&node);
+    }
+  }
+
+  while (!worklist.empty()) {
+    CallGraphNode* node = worklist.front();
+    worklist.pop();
+    for (const HloComputation* callee : node->callees()) {
+      CallGraphNode& callee_node = GetNode(callee);
+      if (callee_node.depth() < node->depth() + 1) {
+        callee_node.set_depth(node->depth() + 1);
+        worklist.push(&callee_node);
+      }
+    }
+  }
+
+  for (CallGraphNode& node : nodes_) {
+    CHECK_NE(node.depth(), -1);
+  }
+}
+
 /* static */
 std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
   // Constructor for CallGraph is private so absl::make_unique can't be used.
@@ -271,6 +306,8 @@ std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
   }
 
   call_graph->SetCallContexts();
+  call_graph->SetNodeDepths();
+
   XLA_VLOG_LINES(1, call_graph->ToString());
 
   return call_graph;
@@ -352,15 +389,38 @@ CallGraph::NearestAncestorsInSameComputation(HloInstruction* a,
 
   // Iterate through the callee->caller chains and find the earliest common
   // element.
-  for (HloInstruction* a_ancestor = a; a_ancestor != nullptr;
-       a_ancestor = next_caller(a_ancestor)) {
-    for (HloInstruction* b_ancestor = b; b_ancestor != nullptr;
-         b_ancestor = next_caller(b_ancestor)) {
-      if (a_ancestor->parent() == b_ancestor->parent()) {
-        return {a_ancestor, b_ancestor};
+  HloInstruction* a_ancestor = a;
+  HloInstruction* b_ancestor = b;
+  int a_depth = GetNode(a->parent()).depth();
+  int b_depth = GetNode(b->parent()).depth();
+
+  // Advance a_ancestor (b_ancestor) up the call chain until the call depth of
+  // a_ancestor or b_ancestor are the same. Necessarily each call to next_caller
+  // reduces the depth by exactly one.
+  if (a_depth > b_depth) {
+    for (int i = 0; i < a_depth - b_depth; ++i) {
+      a_ancestor = next_caller(a_ancestor);
+      if (a_ancestor == nullptr) {
+        return {nullptr, nullptr};
+      }
+    }
+  } else if (b_depth > a_depth) {
+    for (int i = 0; i < b_depth - a_depth; ++i) {
+      b_ancestor = next_caller(b_ancestor);
+      if (b_ancestor == nullptr) {
+        return {nullptr, nullptr};
       }
     }
   }
+
+  while ((a_ancestor != nullptr) && (b_ancestor != nullptr)) {
+    if (a_ancestor->parent() == b_ancestor->parent()) {
+      return {a_ancestor, b_ancestor};
+    }
+
+    a_ancestor = next_caller(a_ancestor);
+    b_ancestor = next_caller(b_ancestor);
+  }
   return {nullptr, nullptr};
 }
 
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 05c7c998738f861ee804d1ec87bfa5fb17ddfb74..c02ffda575278905f6549b362e5e7d94f5713b36 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -121,6 +121,11 @@ class CallGraphNode {
   // Returns the context in which this computation is called.
   CallContext context() const { return context_; }
 
+  // Returns the depth of this node in the call graph. The depth is defined as
+  // the length of the longest call chain from a computation with no callers
+  // (usually the entry computation node) to this node.
+  int depth() const { return depth_; }
+
   string ToString() const;
 
  private:
@@ -130,6 +135,9 @@ class CallGraphNode {
   // Sets the context in which this computation is called.
   void set_context(CallContext value) { context_ = value; }
 
+  // Sets the depth of this node in the graph.
+  void set_depth(int value) { depth_ = value; }
+
   // Adds a callsite which calls this computation. Updates callers to include
   // the calling computation.
   void AddCallerCallSite(const CallSite& caller_callsite);
@@ -164,6 +172,9 @@ class CallGraphNode {
 
   // The context in which this computation is called.
   CallContext context_ = CallContext::kNone;
+
+  // The depth of this node in the call graph.
+  int depth_ = 0;
 };
 
 // The call graph for an HLO module. The graph includes a node for each
@@ -245,9 +256,16 @@ class CallGraph {
  private:
   CallGraph(const HloModule* module);
 
+  // Not copyable.
+  CallGraph(const CallGraph&) = delete;
+  CallGraph& operator=(const CallGraph&) = delete;
+
   // Sets the call contexts for every node in the graph.
   void SetCallContexts();
 
+  // Sets the call node depths for every node in the graph.
+  void SetNodeDepths();
+
   // Helper method for VisitNodes(). Traverses the call graph from 'node' in DFS
   // post order (callee before caller) calling visitor_func on each node. Adds
   // nodes to 'visited' as each node is visited. Skips nodes already in
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index a3ac2568b0f3eec8556a42dbe3c2c64bd8564468..5de724f8924b78008ba4c56603b61bf93fbc5e7c 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -102,6 +102,7 @@ TEST_F(CallGraphTest, SingletonComputation) {
 
   const CallGraphNode& node = call_graph->GetNode(computation);
   EXPECT_EQ(computation, node.computation());
+  EXPECT_EQ(node.depth(), 0);
   EXPECT_TRUE(node.callsites().empty());
   EXPECT_TRUE(node.callees().empty());
   EXPECT_TRUE(node.caller_callsites().empty());
@@ -122,11 +123,13 @@ TEST_F(CallGraphTest, UnreachableComputation) {
   EXPECT_EQ(2, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_node.depth(), 0);
   EXPECT_EQ(entry_computation, entry_node.computation());
   EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
   const CallGraphNode& unreachable_node =
       call_graph->GetNode(unreachable_computation);
+  EXPECT_EQ(unreachable_node.depth(), 0);
   EXPECT_EQ(unreachable_computation, unreachable_node.computation());
   EXPECT_EQ(CallContext::kSequential, unreachable_node.context());
 }
@@ -145,6 +148,7 @@ TEST_F(CallGraphTest, ParallelComputation) {
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
   EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(entry_node.depth(), 0);
   EXPECT_EQ(CallContext::kSequential, entry_node.context());
   EXPECT_EQ(5, entry_node.callsites().size());
   EXPECT_EQ(1, entry_node.callees().size());
@@ -153,6 +157,7 @@ TEST_F(CallGraphTest, ParallelComputation) {
 
   const CallGraphNode& map_node = call_graph->GetNode(map_computation);
   EXPECT_EQ(map_computation, map_node.computation());
+  EXPECT_EQ(map_node.depth(), 1);
   EXPECT_EQ(CallContext::kParallel, map_node.context());
   EXPECT_TRUE(map_node.callsites().empty());
   EXPECT_TRUE(map_node.callees().empty());
@@ -234,6 +239,7 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   EXPECT_EQ(entry_node.GetCallSite(map), &map_callsite);
 
   const CallGraphNode& sub_node = call_graph->GetNode(subcomputation);
+  EXPECT_EQ(sub_node.depth(), 1);
   EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
@@ -264,6 +270,7 @@ TEST_F(CallGraphTest, ComputationWithConditional) {
   EXPECT_EQ(3, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_node.depth(), 0);
   EXPECT_EQ(entry_computation, entry_node.computation());
   EXPECT_EQ(1, entry_node.callsites().size());
 
@@ -275,11 +282,13 @@ TEST_F(CallGraphTest, ComputationWithConditional) {
   EXPECT_EQ(entry_node.GetCallSite(conditional), &conditional_callsite);
 
   const CallGraphNode& true_node = call_graph->GetNode(true_computation);
+  EXPECT_EQ(true_node.depth(), 1);
   EXPECT_TRUE(true_node.callees().empty());
   EXPECT_EQ(1, true_node.callers().size());
   EXPECT_EQ(entry_computation, true_node.callers()[0]);
 
   const CallGraphNode& false_node = call_graph->GetNode(false_computation);
+  EXPECT_EQ(false_node.depth(), 1);
   EXPECT_TRUE(false_node.callees().empty());
   EXPECT_EQ(1, false_node.callers().size());
   EXPECT_EQ(entry_computation, false_node.callers()[0]);
@@ -332,9 +341,21 @@ TEST_F(CallGraphTest, ComplexGraph) {
   EXPECT_EQ(5, call_graph->nodes().size());
   EXPECT_FALSE(call_graph->IsFlattened());
 
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  const CallGraphNode& a_node = call_graph->GetNode(a_computation);
+  const CallGraphNode& b_node = call_graph->GetNode(b_computation);
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+
+  // Verify depths.
+  EXPECT_EQ(entry_node.depth(), 0);
+  EXPECT_EQ(a_node.depth(), 1);
+  EXPECT_EQ(b_node.depth(), 2);
+  EXPECT_EQ(c_node.depth(), 3);
+  EXPECT_EQ(cond_node.depth(), 2);
+
   // Entry computation has one while instruction calling two computations
   // (cond_computation and a_computation).
-  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
   ASSERT_EQ(1, entry_node.callsites().size());
   const std::vector<HloComputation*>& called_computations =
       entry_node.callsites()[0].called_computations();
@@ -342,7 +363,6 @@ TEST_F(CallGraphTest, ComplexGraph) {
               UnorderedElementsAre(cond_computation, a_computation));
   EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
-  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
   EXPECT_TRUE(c_node.callsites().empty());
   EXPECT_THAT(c_node.callers(),
               UnorderedElementsAre(a_computation, b_computation));
@@ -364,7 +384,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
 
   // Verify visitation order of some computations in the graph.
   auto index_of = [&visited](const HloComputation* comp) {
-    auto it = std::find(visited.begin(), visited.end(), comp);
+    auto it = absl::c_find(visited, comp);
     EXPECT_NE(it, visited.end());
     return std::distance(visited.begin(), it);
   };
diff --git a/tensorflow/compiler/xla/service/channel_tracker.cc b/tensorflow/compiler/xla/service/channel_tracker.cc
index 3c2d1ae6d82ebc6c10d52194fd1cec5e291025f7..b517495f2ea0c75679685c67f757ff586f8c79e3 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.cc
+++ b/tensorflow/compiler/xla/service/channel_tracker.cc
@@ -72,7 +72,7 @@ ChannelHandle ChannelTracker::AllocateHandle(ChannelHandle::ChannelType type) {
 }
 
 Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) {
-  if (opaque_to_channel_.count(handle.handle()) == 0) {
+  if (!opaque_to_channel_.contains(handle.handle())) {
     return NotFound("channel handle not found: %d", handle.handle());
   }
   Channel& channel = opaque_to_channel_[handle.handle()];
@@ -94,7 +94,7 @@ Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) {
 }
 
 Status ChannelTracker::RegisterRecvInternal(const ChannelHandle& handle) {
-  if (opaque_to_channel_.count(handle.handle()) == 0) {
+  if (!opaque_to_channel_.contains(handle.handle())) {
     return NotFound("channel handle not found: %d", handle.handle());
   }
   Channel& channel = opaque_to_channel_[handle.handle()];
diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h
index 52037bf9b52556c6aa2e66dd3209e25cf085cfe3..89e17eba36f23077ce4cf0704e7455b76bee68d1 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.h
+++ b/tensorflow/compiler/xla/service/channel_tracker.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -83,7 +84,8 @@ class ChannelTracker {
 
   // Mapping from ChannelHandle value to the corresponding registered
   // Channel object.
-  std::map<int64, Channel> opaque_to_channel_ GUARDED_BY(channel_mutex_);
+  absl::flat_hash_map<int64, Channel> opaque_to_channel_
+      GUARDED_BY(channel_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ChannelTracker);
 };
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 8f08c244908efb823b3870c19bdc3491fa87d44f..653f4555a77cc82e91fb1cd26206b93826375732 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -98,10 +98,17 @@ Compiler::GetPlatformCompilers() {
   auto* factories = GetPlatformCompilerFactories();
   auto it = factories->find(platform->id());
   if (it == factories->end()) {
+    string hint;
+    if (platform->Name() == "Host") {
+      hint = " (hint: try linking in tensorflow/compiler/jit:xla_cpu_jit)";
+    } else if (platform->Name() == "CUDA") {
+      hint = " (hint: try linking in tensorflow/compiler/jit:xla_gpu_jit)";
+    }
+
     return NotFound(
         "could not find registered compiler for platform %s -- check "
-        "target linkage",
-        platform->Name());
+        "target linkage%s",
+        platform->Name(), hint);
   }
 
   // And then we invoke the factory, placing the result into the mapping.
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index efc893818d03a20d6bd65b7dc1da72ea5da5ceb0..92d1ca4ba5da802a5f1c544017ac52dda38e9b1d 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -42,8 +42,8 @@ void ComputationLayout::SetToDefaultLayout() {
 }
 
 bool ComputationLayout::LayoutIsSet() const {
-  return std::all_of(parameter_layouts_.begin(), parameter_layouts_.end(),
-                     [](const ShapeLayout& s) { return s.LayoutIsSet(); }) &&
+  return absl::c_all_of(parameter_layouts_,
+                        [](const ShapeLayout& s) { return s.LayoutIsSet(); }) &&
          result_layout_.LayoutIsSet();
 }
 
diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc
index 7a24faec17f0c4f0a57406328b1c21cd73506d82..1c1f5431700f4ee0cebc3146654feff620ee978c 100644
--- a/tensorflow/compiler/xla/service/convolution_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc
@@ -207,7 +207,8 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
     return Status::OK();
   }
 
-  VLOG(2) << "Dealing with batch_group_count " << batch_group_count << "\n";
+  VLOG(2) << "Dealing with batch_group_count " << batch_group_count
+          << " for convolution " << convolution->ToString() << "\n";
 
   auto add = [&](std::unique_ptr<HloInstruction> inst) {
     return computation_->AddInstruction(std::move(inst));
@@ -315,14 +316,27 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
         expanded_filter_shape, HloOpcode::kSelect, filter_mask, new_convolution,
         zero_filter));
 
-    auto zero_literal = LiteralUtil::CreateR0(0.0f);
-    TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(F32));
+    PrimitiveType reduce_type = new_filter->shape().element_type();
+    auto reduce_window_shape = new_convolution->shape();
+    reduce_window_shape.set_dimensions(output_batch_dimension, 1);
+
+    // Ensure that data input to reduce window uses at least 32 bits.
+    if (primitive_util::BitWidth(reduce_type) < primitive_util::BitWidth(F32)) {
+      reduce_type = F32;
+      reduce_window_shape.set_element_type(F32);
+      Shape convert_shape = new_filter->shape();
+      convert_shape.set_element_type(F32);
+      new_filter =
+          add(HloInstruction::CreateConvert(convert_shape, new_filter));
+    }
+
+    auto zero_literal = LiteralUtil::Zero(reduce_type);
     auto zero_scalar =
         add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
     auto reduce_function = [&]() -> HloComputation* {
       HloComputation::Builder b("add_computation");
-      Shape shape = ShapeUtil::MakeShape(F32, {});
+      Shape shape = ShapeUtil::MakeShape(reduce_type, {});
       auto lhs =
           b.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
       auto rhs =
@@ -332,18 +346,6 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
       return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
     };
 
-    // Ensure that data input to reduce window is of type F32.
-    if (primitive_util::BitWidth(new_filter->shape().element_type()) <
-        primitive_util::BitWidth(F32)) {
-      Shape convert_shape = new_filter->shape();
-      convert_shape.set_element_type(F32);
-      new_filter =
-          add(HloInstruction::CreateBitcastConvert(convert_shape, new_filter));
-    }
-
-    auto reduce_window_shape = new_convolution->shape();
-    reduce_window_shape.set_dimensions(output_batch_dimension, 1);
-
     // Create the reduce window.
     Window window;
     for (int64 i = 0; i < new_convolution->shape().dimensions_size(); ++i) {
@@ -369,7 +371,7 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
 
     // Convert reduced data back to the original data type.
     auto reduce_window_converted =
-        HloInstruction::CreateBitcastConvert(convert_back_shape, reduce_window);
+        HloInstruction::CreateConvert(convert_back_shape, reduce_window);
 
     TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
         convolution, std::move(reduce_window_converted)));
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
similarity index 75%
rename from tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
rename to tensorflow/compiler/xla/service/convolution_group_converter_test.cc
index d58f157242f5fb9690f7fda3e7d8f71ca6c8db84..585b81a5db632901be863893bf723fcba19388ea 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
@@ -94,5 +94,32 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2
   EXPECT_EQ(root->operand(1)->feature_group_count(), 1);
 }
 
+TEST_F(ConvolutionGroupConverterTest,
+       ConvertBatchGroupCountEqualToInputBatchDim) {
+  string hlo_string = R"(HloModule Convolve1D1Window_0_module
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[16,19,19,512]{3,2,1,0}, filter: f32[16,19,19,512]{3,2,1,0}) -> f32[3,3,512,1]{3,2,1,0} {
+  %input = f32[16,19,19,512]{3,2,1,0} parameter(0)
+  %filter = f32[16,19,19,512]{3,2,1,0} parameter(1)
+  ROOT %convolution = f32[3,3,512,1]{3,2,1,0} convolution(f32[16,19,19,512]{3,2,1,0} %input, f32[16,19,19,512]{3,2,1,0} %filter), window={size=19x19 pad=1_1x1_1}, dim_labels=f01b_i01o->01fb, batch_group_count=512
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  auto computation = module->entry_computation();
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  auto cost_model = [](HloInstruction* conv) { return true; };
+  ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
+                                      true);
+  ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  // Make sure the convolution is converted to one with batch_group_count = 1.
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->operand(0)->batch_group_count(), 1);
+  // Verify that the convolution is replaced by a reshape.
+  EXPECT_EQ(root->opcode(), HloOpcode::kReshape);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index df6059663876dfde71f4c75d3931b3d2de72c1df..5e26a63cebfa9b2e50f4b13335c10c246999d4df 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -349,11 +349,12 @@ Status AddCopiesForAliasedInputOutputs(HloModule* module) {
     ShapeTree<bool> param_indices_to_copy(param->shape());
 
     module->input_output_alias_config().ForEachAlias(
-        [&](const ShapeIndex& output_index, int64 param_number,
-            const ShapeIndex& param_index) {
-          if (param_number == param->parameter_number()) {
+        [&](const ShapeIndex& output_index,
+            const HloInputOutputAliasConfig::Alias& alias) {
+          if (alias.parameter_number == param->parameter_number()) {
             param_has_alias = true;
-            *(param_indices_to_copy.mutable_element(param_index)) = true;
+            *(param_indices_to_copy.mutable_element(alias.parameter_index)) =
+                true;
             *(output_indices_to_copy.mutable_element(output_index)) = true;
           }
         });
@@ -395,13 +396,14 @@ Status AddCopiesForAliasedInputOutputs(HloModule* module) {
 
   // Add control dependencies between the input/output copies.
   TF_RETURN_IF_ERROR(module->input_output_alias_config().ForEachAliasWithStatus(
-      [&](const ShapeIndex& output_index, int64 param_number,
-          const ShapeIndex& input_index) -> Status {
-        if (!copied_parameters[param_number]) {
+      [&](const ShapeIndex& output_index,
+          const HloInputOutputAliasConfig::Alias& alias) -> Status {
+        if (!copied_parameters[alias.parameter_number]) {
           return Status::OK();
         }
         HloInstruction* from =
-            copied_parameters[param_number]->element(input_index);
+            copied_parameters[alias.parameter_number]->element(
+                alias.parameter_index);
         HloInstruction* to = output_copy_tree.element(output_index);
 
         TF_RET_CHECK(from != nullptr);
@@ -522,7 +524,7 @@ class CopyRemover {
         // between copies added around aliased operations (kWhile) guarantees
         // this strict order.
         for (const HloValue* value_a : buffer.values()) {
-          if (ShapeUtil::IsToken(value_a->shape())) {
+          if (value_a->shape().IsToken()) {
             // Token values have no representation and cannot interfere.
             continue;
           }
@@ -539,10 +541,9 @@ class CopyRemover {
         }
 
         std::vector<const HloValue*> values = buffer.values();
-        std::sort(values.begin(), values.end(),
-                  [this](const HloValue* a, const HloValue* b) {
-                    return ordering_.IsDefinedBefore(*a, *b);
-                  });
+        absl::c_sort(values, [this](const HloValue* a, const HloValue* b) {
+          return ordering_.IsDefinedBefore(*a, *b);
+        });
 
         // Create a list containing all of the values in the buffer.
         AddValueList(values, &value_to_node);
@@ -842,12 +843,11 @@ class CopyRemover {
       copy_value_node->next->prev = operand_node;
 
       // Patch up uses. Remove use of copy from operand_node uses.
-      auto it =
-          std::find_if(operand_node->uses.begin(), operand_node->uses.end(),
-                       [copy_value_node](const HloUse* use) {
-                         return use->instruction ==
-                                copy_value_node->value->defining_instruction();
-                       });
+      auto it = absl::c_find_if(
+          operand_node->uses, [copy_value_node](const HloUse* use) {
+            return use->instruction ==
+                   copy_value_node->value->defining_instruction();
+          });
       CHECK(it != operand_node->uses.end());
       operand_node->uses.erase(it);
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index e4e9d7ba05c115be9dd0eb53ebd7de208d514efb..4391bdcba532661a0fde789e2c4ed324c40bcd32 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -1376,9 +1376,11 @@ TEST_F(CopyInsertionTest, CrossingParameters) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte1, gte0}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 4);
@@ -1409,9 +1411,11 @@ TEST_F(CopyInsertionTest, ParametersAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1475,7 +1479,8 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -1516,7 +1521,8 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1557,7 +1563,8 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({add, negate1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1848,8 +1855,7 @@ ENTRY %TokensShouldNotBeCopied () -> s32[] {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          HloRunner::CreateModuleFromString(
-                              module_string, GetDebugOptionsForTest()));
+                          ParseAndReturnVerifiedModule(module_string));
   InsertCopies(module.get());
 
   // There should be no copies added because tokens should not be copied.
@@ -2112,8 +2118,7 @@ ENTRY TestComputation {
   ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
 }
 )";
-  auto module_or_status =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module_or_status = ParseAndReturnVerifiedModule(hlo_string);
   auto module = module_or_status.ConsumeValueOrDie();
   InsertCopies(module.get());
 }
@@ -2213,8 +2218,7 @@ ENTRY TestComputation {
   ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
 }
 )";
-  auto module_or_status =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module_or_status = ParseAndReturnVerifiedModule(hlo_string);
   auto module = module_or_status.ConsumeValueOrDie();
   InsertCopies(module.get());
 }
@@ -2231,7 +2235,7 @@ cond.inner {
 
 body.inner {
   param.body.inner = pred[] parameter(0)
-  ROOT neg = pred[] negate(param.body.inner)
+  ROOT not = pred[] not(param.body.inner)
 }
 
 cond.outer {
@@ -2248,9 +2252,8 @@ ENTRY TestComputation {
   ROOT while = pred[] while(entry_param), condition=cond.outer, body=body.outer
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   InsertCopies(module.get());
 
   // There should only be a single copy inserted, and it's in the entry
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index f49b5110be5c4bab63b423e5ed2e67bc1828f6e3..d4535b204d7f3ad8d4e24beea5d0dd79e7a15ab0 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -1,6 +1,14 @@
 # Description:
 #    LLVM-based CPU backend for XLA.
 
+load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "mkl_deps",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load(":build_defs.bzl", "runtime_copts")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
@@ -14,15 +22,6 @@ package_group(
     ],
 )
 
-load(":build_defs.bzl", "runtime_copts")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
-load(
-    "//third_party/mkl:build_defs.bzl",
-    "mkl_deps",
-)
-
 # Filegroup used to collect source files for dependency checking.
 filegroup(
     name = "c_srcs",
@@ -95,6 +94,7 @@ cc_library(
         ":target_machine_features",
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
@@ -114,6 +114,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -133,6 +134,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:sort_simplifier",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -241,6 +243,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/host:host_stream",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -364,15 +367,33 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tiled_dot_emitter",
+    srcs = ["tiled_dot_emitter.cc"],
+    hdrs = ["tiled_dot_emitter.h"],
+    deps = [
+        ":vector_support_library",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "dot_op_emitter",
     srcs = ["dot_op_emitter.cc"],
-    hdrs = ["dot_op_emitter.h"],
+    hdrs = [
+        "dot_op_emitter.h",
+    ],
     deps = [
         ":cpu_options",
         ":cpu_runtime",
         ":ir_emission_utils",
         ":target_machine_features",
+        ":tiled_dot_emitter",
         ":vector_support_library",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -380,6 +401,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
         "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
@@ -631,6 +653,7 @@ cc_library(
     deps = [
         ":runtime_matvec",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//third_party/eigen3",
     ],
 )
@@ -767,8 +790,6 @@ cc_library(
         ":target_machine_features",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:computation_layout",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:layout_assignment",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -1008,7 +1029,6 @@ tf_cc_test(
     size = "small",
     srcs = ["cpu_eigen_tensor_alignment_test.cc"],
     deps = [
-        ":dot_op_emitter",
         ":ir_emission_utils",
         ":target_machine_features_fake",
         "//tensorflow/compiler/xla:test",
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 796a7cf94d02b0ad42366387a9d3f8d589b8840a..414eacddfc7ba3c295c027c64c445a2046235d36 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -66,9 +66,14 @@ class FilteredPassManager : public llvm::legacy::PassManager {
   explicit FilteredPassManager(bool disable_expensive_passes)
       : disable_expensive_passes_(disable_expensive_passes) {}
   void add(llvm::Pass* p) override {
+    llvm::StringRef PassName = p->getPassName();
+    if (PassName.contains("Warn about non-applied transformations")) {
+      delete p;
+      return;
+    }
     if (disable_expensive_passes_) {
-      llvm::StringRef PassName = p->getPassName();
       if (PassName.contains("Unroll loops")) {
+        delete p;
         return;
       }
     }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index ba7dcde5c3d7e0406f46d642632f780d6d7db54f..eafda68510d93ee54f2aead60a84f3e97b3fe1f4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -69,6 +69,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -92,6 +93,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/scatter_expander.h"
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -244,6 +246,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   HloPassPipeline pipeline("HLO passes through layout assignment");
   pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                             /*allow_mixed_precision=*/false);
+  pipeline.AddPass<DynamicIndexSplitter>();
   pipeline.AddPass<CpuHloSupportChecker>();
 
   ReducePrecisionInsertion::AddPasses(
@@ -256,7 +259,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   // pass.
   pipeline.AddPass<CallInliner>();
   pipeline.AddPass<BatchDotSimplification>();
-  pipeline.AddPass<DotDecomposer>();
+  pipeline.AddPass<DotDecomposer>(/*decompose_batch_dot=*/false);
   auto cost_model = [](HloInstruction* conv) {
     // We need a cost model for CPUs. Currently, do nothing.
     return false;
@@ -279,10 +282,10 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true);
     pipeline.AddPass<HloGetDimensionSizeRewriter>();
-    AlgebraicSimplifierOptions options(
-        [](const Shape&, const Shape&) { return false; });
+    AlgebraicSimplifierOptions options;
     options.set_enable_dot_strength_reduction(false);
     pass.AddPass<AlgebraicSimplifier>(options);
+    pass.AddPass<SortSimplifier>();
     pass.AddPass<HloDCE>();
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
@@ -302,7 +305,8 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   pipeline.AddPass<TransposeFolding>(
       [&](const HloInstruction& dot,
           const TransposeFolding::OperandIndices& candidate_operands) {
-        return PotentiallyImplementedAsEigenDot(dot, *target_machine_features)
+        return DotImplementationCanHandleTranspose(dot,
+                                                   *target_machine_features)
                    ? candidate_operands
                    : TransposeFolding::OperandIndices{};
       },
@@ -345,8 +349,7 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
     pass.AddInvariantChecker<HloVerifier>(
         /*layout_sensitive=*/true,
         /*allow_mixed_precision=*/false);
-    AlgebraicSimplifierOptions options(
-        [](const Shape&, const Shape&) { return true; });
+    AlgebraicSimplifierOptions options;
     options.set_is_layout_sensitive(true);
     options.set_enable_dot_strength_reduction(false);
     pass.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
@@ -506,7 +509,7 @@ Status CreateHloProfilingArtifacts(
 
   auto shape_size_bytes = [](const Shape& shape) {
     // On the cpu, opaques are pointers.
-    if (ShapeUtil::IsOpaque(shape)) {
+    if (shape.IsOpaque()) {
       return static_cast<int64>(sizeof(void*));
     }
     return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
index 8727c72b6e42517b1859e98ecadb41bbceed761c..485769a373acf5ae70c471b1a5dfcfb20ff772ef 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
@@ -28,37 +27,6 @@ namespace {
 
 class CpuEigenTensorAlignmentTest : public ::testing::Test {};
 
-TEST_F(CpuEigenTensorAlignmentTest, EigenDotAlignment) {
-  string hlo_string = R"(
-HloModule DotOperation
-
-ENTRY DotOperation {
-  arg0 = f32[5,256] parameter(0)
-  arg1 = f32[256,1024] parameter(1)
-  ROOT dot = f32[5,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_string));
-
-  HloInstruction* dot = module->entry_computation()->root_instruction();
-
-  TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment(
-      [](int64 size) { return 1; });
-
-  EXPECT_FALSE(
-      PotentiallyImplementedAsEigenDot(*dot, target_machine_with_no_alignment));
-
-  TargetMachineFeaturesWithFakeAlignmentLogic
-      target_machine_with_full_alignment([](int64 size) {
-        return TargetMachineFeatures::kEigenExpectedTensorAlignment;
-      });
-
-  EXPECT_TRUE(PotentiallyImplementedAsEigenDot(
-      *dot, target_machine_with_full_alignment));
-}
-
 TEST_F(CpuEigenTensorAlignmentTest, EigenConvAlignment) {
   string hlo_string = R"(
 HloModule ConvOperation
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 818b2b0d0db2893e11fa46c7867e6c74bbbb6905..23d0af34233858515af21df5e92346742a5b5dc3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -213,6 +213,8 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
       /*on_host_shape=*/result_shape(),
       /*on_device_shape=*/result_shape(), run_options->allocator(),
       stream->parent()->device_ordinal());
+  const HloInputOutputAliasConfig& input_output_alias =
+      module().input_output_alias_config();
 
   // Move OwningDeviceMemory values which contain the array(s) of the result
   // into the respective location in ScopedShapedBuffer which is returned to the
@@ -232,12 +234,31 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
         TF_ASSIGN_OR_RETURN(
             const BufferAllocation::Slice slice,
             this->assignment_->GetUniqueSlice(src, buffer_source->index()));
-        CHECK(!slice.allocation()->is_entry_computation_parameter());
-
         const BufferAllocation::Index buffer_index = slice.index();
         OwningDeviceMemory& buffer = buffers[buffer_index];
-        CHECK(!buffer.is_null() || buffer.size() == 0);
-        *device_memory = buffer.Forget();
+        if (!slice.allocation()->is_entry_computation_parameter()) {
+          // If the buffer coming out of the result is from a parameter, the
+          // owning buffer will be null, and that means the caller aliased some
+          // parameter buffer to an output one (via the
+          // HloInputOutputAliasConfig API). If that is the case, the caller
+          // will receive a partially complete scoped shaped buffer, which they
+          // will have to fill up on return. Unfortunately the interface to the
+          // execute APIs are ShapedBuffer pointer based, which assumes caller
+          // ownership, and hence a buffer coming from there cannot be part of
+          // the new ScopedShapedBuffer we create for the result (which assumes
+          // ownership).
+          *device_memory = buffer.Forget();
+        } else {
+          auto output_alias = input_output_alias.GetAliasedOutput(
+              slice.allocation()->parameter_number(),
+              slice.allocation()->param_shape_index());
+          CHECK(output_alias)
+              << "Ouput buffer is coming from parameter "
+              << slice.allocation()->parameter_number() << " at index "
+              << slice.allocation()->param_shape_index()
+              << ", but no alias exists";
+          CHECK_EQ(*output_alias, index);
+        }
         return Status::OK();
       }));
   return std::move(result_buffer);
@@ -326,7 +347,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
 
 /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
   // On the cpu, opaques are pointers.
-  if (ShapeUtil::IsOpaque(shape)) {
+  if (shape.IsOpaque()) {
     return sizeof(void*);
   }
   return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 527df0bd1c23bba74f32226e5622fed32f7dcf84..c4bde837e57e82584c2a007858ed8d55608acd3c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -332,7 +332,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
 TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {8});
-  Shape starts_shape = ShapeUtil::MakeShape(F32, {2});
+  Shape starts_shape = ShapeUtil::MakeShape(F32, {});
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {1, 8, 8});
   Shape reshape_shape = ShapeUtil::MakeShape(F32, {8, 8});
   Shape dynamic_slice_shape = ShapeUtil::MakeShape(F32, {4, 4});
@@ -340,13 +340,15 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
       HloInstruction::CreateParameter(0, param_shape, "param"));
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, starts_shape, "starts"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "starts"));
   HloInstruction* broadcast2 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(broadcast_shape, param0, {1}));
   HloInstruction* reshape3 = builder.AddInstruction(
       HloInstruction::CreateReshape(reshape_shape, broadcast2));
   HloInstruction* dynamic_slice4 =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-          dynamic_slice_shape, reshape3, param1, {4, 4}));
+          dynamic_slice_shape, reshape3, {param1, param2}, {4, 4}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4));
 
@@ -356,7 +358,8 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
       {HloOpcode::kTanh, HloOpcode::kDynamicSlice, HloOpcode::kReshape,
-       HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter});
+       HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter});
 }
 
 TEST_F(OpcodeFusionTest, Broadcast_Negate) {
@@ -381,14 +384,14 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) {
 TEST_F(OpcodeFusionTest, DynamicSlice_Negate) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {4});
-  Shape slice_shape = ShapeUtil::MakeShape(F32, {1});
+  Shape slice_shape = ShapeUtil::MakeShape(F32, {});
   Shape result_shape = ShapeUtil::MakeShape(F32, {2});
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "param"));
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, slice_shape, "starts"));
   HloInstruction* dynamic_slice2 = builder.AddInstruction(
-      HloInstruction::CreateDynamicSlice(result_shape, param0, param1, {2}));
+      HloInstruction::CreateDynamicSlice(result_shape, param0, {param1}, {2}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, dynamic_slice2));
 
@@ -548,28 +551,36 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
   Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 1, 1000});
 
+  std::vector<HloInstruction*> slice_indices, update_indices;
+  for (int i = 0; i < 3; ++i) {
+    slice_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            1 + i, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+    update_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            5 + i, ShapeUtil::MakeShape(U32, {}), "update_indices")));
+  }
   HloInstruction* slice =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
           slice_shape,
           builder.AddInstruction(
               HloInstruction::CreateParameter(0, full_shape, "slice_from")),
-          builder.AddInstruction(HloInstruction::CreateParameter(
-              1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+          slice_indices,
           /*slice_sizes=*/{10, 1, 1000}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       full_shape,
       builder.AddInstruction(
-          HloInstruction::CreateParameter(2, full_shape, "to_update")),
-      slice,
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
+          HloInstruction::CreateParameter(4, full_shape, "to_update")),
+      slice, update_indices));
 
   module->AddEntryComputation(builder.Build());
   RunFusionAndCheckOpcodesWereFused(
-      module.get(), {HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
-                     HloOpcode::kParameter, HloOpcode::kParameter,
-                     HloOpcode::kParameter, HloOpcode::kParameter});
+      module.get(),
+      {HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter});
 }
 
 TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
@@ -578,49 +589,40 @@ TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50});
 
-  auto loop_idx = builder.AddInstruction(HloInstruction::CreateReshape(
-      ShapeUtil::MakeShape(S32, {1}),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(S32, {}), "param0"))));
-
+  auto loop_idx = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(S32, {}), "param0"));
   auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(S32, {1}), "param1"));
-  auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
-      ShapeUtil::MakeShape(S32, {5}),
-      {loop_idx, param1, param1, param1, param1}, /*dimension=*/0));
+      1, ShapeUtil::MakeShape(S32, {}), "param1"));
 
-  auto idx_choice = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ShapeUtil::MakeShape(S32, {1}),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          2, ShapeUtil::MakeShape(S32, {4}), "param2")),
-      loop_idx,
-      /*slice_sizes=*/{1}));
-
-  PaddingConfig padding_config;
-  padding_config.add_dimensions()->set_edge_padding_high(4);
-  auto pad = builder.AddInstruction(HloInstruction::CreatePad(
-      ShapeUtil::MakeShape(S32, {5}), idx_choice,
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))),
-      padding_config));
+  auto idx_choice = builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(S32, {}),
+      builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+          ShapeUtil::MakeShape(S32, {1}),
+          builder.AddInstruction(HloInstruction::CreateParameter(
+              2, ShapeUtil::MakeShape(S32, {4}), "param2")),
+          {loop_idx},
+          /*slice_sizes=*/{1}))));
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0(0)));
 
   auto slice = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
       ShapeUtil::MakeShape(F32, {1, 100, 10, 100, 50}),
       builder.AddInstruction(HloInstruction::CreateParameter(
           3, ShapeUtil::MakeShape(F32, {100, 100, 10, 100, 50}), "param3")),
-      pad, /*slice_sizes=*/{1, 100, 10, 100, 50}));
+      {idx_choice, zero, zero, zero, zero},
+      /*slice_sizes=*/{1, 100, 10, 100, 50}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       full_shape,
       builder.AddInstruction(
           HloInstruction::CreateParameter(4, full_shape, "param4")),
-      slice, concat));
+      slice, {loop_idx, param1, param1, param1, param1}));
 
   module->AddEntryComputation(builder.Build());
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
-      {HloOpcode::kConcatenate, HloOpcode::kPad, HloOpcode::kDynamicSlice,
-       HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
+      {HloOpcode::kDynamicSlice, HloOpcode::kDynamicSlice,
+       HloOpcode::kDynamicUpdateSlice, HloOpcode::kReshape,
        HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
        HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter});
 }
@@ -930,9 +932,10 @@ ENTRY main {
   return result;
 }
 
-INSTANTIATE_TEST_CASE_P(GatherLoopFusionTestInstantiation, GatherLoopFusionTest,
-                        ::testing::ValuesIn(GetGatherLoopFusionTestSpecs()),
-                        GatherLoopFusionTestSpec::Name);
+INSTANTIATE_TEST_SUITE_P(GatherLoopFusionTestInstantiation,
+                         GatherLoopFusionTest,
+                         ::testing::ValuesIn(GetGatherLoopFusionTestSpecs()),
+                         GatherLoopFusionTestSpec::Name);
 }  // namespace
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index c291bf2d1ba2eaff4192051840768c037bece86f..95b8025f873c56bea063ff258d4abd6614257d85 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -46,8 +46,7 @@ static bool ShouldMakeAllUsersColMajor(const HloInstruction* instruction) {
   for (auto* user : instruction->users()) {
     optional<int64> operand_idx = ProfitableToMakeDotOperandColumnMajor(*user);
     if (!operand_idx || user->operand(*operand_idx) != instruction ||
-        std::count(user->operands().begin(), user->operands().end(),
-                   instruction) != 1) {
+        absl::c_count(user->operands(), instruction) != 1) {
       return false;
     }
   }
@@ -94,60 +93,38 @@ static Shape ColMajorShape(const Shape& old_shape) {
   return new_shape;
 }
 
+static bool OperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& instr,
+    const TargetMachineFeatures& target_machine_features) {
+  if (instr.opcode() == HloOpcode::kConvolution) {
+    return PotentiallyImplementedAsEigenConvolution(instr,
+                                                    target_machine_features);
+  } else if (instr.opcode() == HloOpcode::kDot) {
+    return DotOperandsAndResultMustHaveRowMajorLayout(instr,
+                                                      target_machine_features);
+  }
+  return false;
+}
+
 Status CpuLayoutAssignment::AddBackendConstraints(
     LayoutConstraints* constraints) {
   ShouldMakeOperandColMajorCache cache;
 
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kConvolution &&
-        PotentiallyImplementedAsEigenConvolution(*instruction,
-                                                 target_machine_features_)) {
-      const HloInstruction* convolution = instruction;
-      const HloInstruction* lhs_instruction = convolution->operand(0);
-      const HloInstruction* rhs_instruction = convolution->operand(1);
-
-      // In order to implement `convolution` with Eigen convolution, the layouts
-      // of the input, filter, and output need to be row-major.
-      //
-      // These constraints are not hard constraints. Ideally, we should decide
-      // which layouts to choose according to some cost model.
-      Shape output_shape(RowMajorShape(convolution->shape()));
-      Shape input_shape(RowMajorShape(lhs_instruction->shape()));
-      Shape filter_shape(RowMajorShape(rhs_instruction->shape()));
-
-      // Set layouts of the instructions' shapes.
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(input_shape, convolution, 0));
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(filter_shape, convolution, 1));
-      TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(output_shape, convolution));
+    if (OperandsAndResultMustHaveRowMajorLayout(*instruction,
+                                                target_machine_features_)) {
+      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+          RowMajorShape(instruction->shape()), instruction));
+      for (int i = 0; i < instruction->operand_count(); i++) {
+        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+            RowMajorShape(instruction->operand(i)->shape()), instruction, i));
+      }
     } else if (optional<int64> op_idx =
                    ShouldMakeOperandColumnMajor(&cache, *instruction)) {
       const HloInstruction* op = instruction->operand(*op_idx);
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           ColMajorShape(op->shape()), instruction, *op_idx));
-    } else if (PotentiallyImplementedAsEigenDot(*instruction,
-                                                target_machine_features_)) {
-      const HloInstruction* dot = instruction;
-      // In order to implement `dot` with Eigen dot, the layouts of the lhs,
-      // rhs, and output need to be row-major.
-      //
-      // These constraints are not hard constraints. Ideally, we should decide
-      // which layouts to choose according to some cost model.
-      Shape output_shape(RowMajorShape(dot->shape()));
-
-      const HloInstruction* lhs_instruction = dot->operand(0);
-      Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
-
-      const HloInstruction* rhs_instruction = dot->operand(1);
-      Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
-
-      // Set layouts of the instructions' shapes.
-      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(output_shape, dot));
     } else {
       for (int64 operand_no = 0; operand_no < instruction->operand_count();
            ++operand_no) {
@@ -160,7 +137,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
           continue;
         }
         // Skip operands with non-array shapes.
-        if (!ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+        if (!instruction->operand(operand_no)->shape().IsArray()) {
           continue;
         }
         Shape operand_shape(
@@ -175,7 +152,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       }
       // Skip instructions which don't produce array shapes (tuples, opaque,
       // etc.).
-      if (!ShapeUtil::IsArray(instruction->shape())) {
+      if (!instruction->shape().IsArray()) {
         continue;
       }
     }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 92debb83e33b1400a59e5eef0f90971392ab7b22..ff654c83d61e7cc09ac7839feccaf2bc9cb3c63c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -23,8 +23,8 @@ namespace {
 
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
-const char* const kXlaEnableExperimentalLlvmIrGemm =
-    "xla_enable_experimental_llvm_ir_gemm";
+const char* const kXlaForceEnableExperimentalLlvmIrGemm =
+    "xla_force_enable_experimental_llvm_ir_gemm";
 const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size";
 
 }  // namespace
@@ -57,10 +57,10 @@ absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config) {
   return absl::nullopt;
 }
 
-bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
+bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
-  return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
+  return extra_options_map.count(kXlaForceEnableExperimentalLlvmIrGemm) > 0;
 }
 
 static absl::string_view RemoveSuffix(absl::string_view str,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 47c7eb13b6e4cc05a23f82b8d2a25249f4b82ac0..99e6702d14aed8ffb148adec2bdd02dbc7c3c7e3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -26,7 +26,7 @@ namespace options {
 
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
-bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
+bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
 absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config);
 absl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
     const HloModuleConfig& config);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index a9febe891b5e9d1eb9e6b297952b50d1d26a3396..d8878e622c0500fc5328aa6c295a9e24a3a037f7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -84,31 +84,8 @@ extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName =
     "__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation";
 extern const char* const kParallelForkJoinSymbolName =
     "__xla_cpu_runtime_ParallelForkJoin";
-extern const char* const kKeyValueSortPREDSymbolName =
-    "__xla_cpu_runtime_KeyValueSortPRED";
-extern const char* const kKeyValueSortS8SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS8";
-extern const char* const kKeyValueSortU8SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU8";
-extern const char* const kKeyValueSortS16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS16";
-extern const char* const kKeyValueSortU16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU16";
-extern const char* const kKeyValueSortF16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF16";
-extern const char* const kKeyValueSortS32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS32";
-extern const char* const kKeyValueSortU32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU32";
-extern const char* const kKeyValueSortF32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF32";
-extern const char* const kKeyValueSortS64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS64";
-extern const char* const kKeyValueSortU64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU64";
-extern const char* const kKeyValueSortF64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF64";
-
+extern const char* const kKeyValueSortSymbolName =
+    "__xla_cpu_runtime_KeyValueSort";
 extern const char* const kXlaCpuRuntimeSymbolNamePrefix = "__xla_cpu_runtime_";
 }  // namespace runtime
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index b2e760a224ad8eaa61dae57b0f9cece04a7e54ae..3a2b44d8c1a80128d3577c374e751e73a89e9d59 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -64,18 +64,7 @@ extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName;
 extern const char* const kAcquireOutfeedBufferForPopulationSymbolName;
 extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName;
 extern const char* const kParallelForkJoinSymbolName;
-extern const char* const kKeyValueSortPREDSymbolName;
-extern const char* const kKeyValueSortS8SymbolName;
-extern const char* const kKeyValueSortU8SymbolName;
-extern const char* const kKeyValueSortS16SymbolName;
-extern const char* const kKeyValueSortU16SymbolName;
-extern const char* const kKeyValueSortF16SymbolName;
-extern const char* const kKeyValueSortS32SymbolName;
-extern const char* const kKeyValueSortU32SymbolName;
-extern const char* const kKeyValueSortF32SymbolName;
-extern const char* const kKeyValueSortS64SymbolName;
-extern const char* const kKeyValueSortU64SymbolName;
-extern const char* const kKeyValueSortF64SymbolName;
+extern const char* const kKeyValueSortSymbolName;
 
 // All symbol names for XLA CPU runtime functions need to start with this
 // prefix.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index 1ae3aa57111e3a3b7ac18b4907c5c282edf89b7e..4e8c98678309fa4d573f1aac1290c9afc87643a4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -162,11 +162,12 @@ TEST_P(EigenMatMulTest, DoIt) {
   CheckMatrixMultiply(*a, *b, *c);
 }
 
-INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
-                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
-                                           ::testing::Bool(), ::testing::Bool(),
-                                           ::testing::Bool()),
-                        EigenMatMulTest::Name);
+INSTANTIATE_TEST_SUITE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
+                         ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                            ::testing::Bool(),
+                                            ::testing::Bool(),
+                                            ::testing::Bool()),
+                         EigenMatMulTest::Name);
 
 #ifdef INTEL_MKL
 class MKLMatMulTest : public CpuRuntimeTest,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 1457582ac19c27e5c3150b4667e6af505345a6bd..3361a5973f5e8c91802b26d68477347b196d3cac 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -97,7 +97,7 @@ Status CpuTransferManager::TransferLiteralToInfeed(
   VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
 
-  if (!ShapeUtil::IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     int64 size = GetByteSizeRequirement(shape);
     return TransferBufferToInfeed(executor, size, literal.untyped_data());
   }
@@ -178,7 +178,7 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor,
 Status CpuTransferManager::TransferLiteralFromOutfeed(
     se::StreamExecutor* executor, const Shape& literal_shape,
     MutableBorrowingLiteral literal) {
-  if (!ShapeUtil::IsTuple(literal_shape)) {
+  if (!literal_shape.IsTuple()) {
     int64 size = GetByteSizeRequirement(literal_shape);
     // Note: OSS build didn't like implicit conversion from
     // literal_shape.dimensions() to the array slice on 2017-07-10.
diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.cc b/tensorflow/compiler/xla/service/cpu/disassembler.cc
index 3ae64142cd7e32d3aa8d50870efaf94698c06440..c3c6847b7b77e2fb0470630815de9f5d7a6c5b9c 100644
--- a/tensorflow/compiler/xla/service/cpu/disassembler.cc
+++ b/tensorflow/compiler/xla/service/cpu/disassembler.cc
@@ -77,17 +77,16 @@ StatusOr<DisassemblerResult> Disassembler::DisassembleObjectFile(
     }
 
     // Sort the symbols in increasing address order.
-    std::sort(
-        symbols.begin(), symbols.end(),
-        [](const llvm::object::SymbolRef& a, const llvm::object::SymbolRef& b) {
-          // getAddress returns a Expected object. Assert there is no error
-          // before extracting the address.
-          llvm::Expected<uint64_t> a_address_or_error = a.getAddress();
-          CHECK(a_address_or_error);
-          llvm::Expected<uint64_t> b_address_or_error = b.getAddress();
-          CHECK(b_address_or_error);
-          return a_address_or_error.get() < b_address_or_error.get();
-        });
+    absl::c_sort(symbols, [](const llvm::object::SymbolRef& a,
+                             const llvm::object::SymbolRef& b) {
+      // getAddress returns a Expected object. Assert there is no error
+      // before extracting the address.
+      llvm::Expected<uint64_t> a_address_or_error = a.getAddress();
+      CHECK(a_address_or_error);
+      llvm::Expected<uint64_t> b_address_or_error = b.getAddress();
+      CHECK(b_address_or_error);
+      return a_address_or_error.get() < b_address_or_error.get();
+    });
 
     // Construct ArrayRef pointing to section contents.
     llvm::StringRef section_content_string;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 37cefcb2e827ffd15aa489b1b3199ba9f27d9dd6..48510181bd01c87c9db764396b556fdf34e6c8c4 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -26,7 +26,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -41,931 +44,165 @@ namespace xla {
 using llvm_ir::SetToFirstInsertPoint;
 
 namespace cpu {
-
 namespace {
-// Provides tiled access to an in-memory rank 2 array.
-class MemoryTile {
- public:
-  // Constructs a MemoryTile that can operate on tiles consisting of
-  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
-  // `major_dim_offset` in the major dimension.  The tile size along the minor
-  // dimension is the vector size, and that is implicitly determined by `vsl`.
-  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
-             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
-             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
-      : vsl_(vsl), b_(b) {
-    pointers_.reserve(tile_size_along_major_dim);
-    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
-      llvm::Value* total_offset =
-          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
-                       b->CreateAdd(b->getInt64(i), major_dim_offset));
-      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
-    }
-  }
-
-  // Load a tile consisting of `tile_size_along_major_dim` vectors from position
-  // {major: `major_dim_offset`, minor: `minor_dim_offset`}.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
-    std::vector<llvm::Value*> result;
-    result.reserve(pointers_.size());
-    for (const auto& pointer : pointers_) {
-      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
-    }
-    return result;
-  }
-
-  // Stores `tile` to position {major: `major_dim_offset`, minor:
-  // `minor_dim_offset`}.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  void StoreTile(absl::Span<llvm::Value* const> tile,
-                 llvm::Value* minor_dim_offset) const {
-    CHECK_EQ(tile.size(), pointers_.size());
-    for (int64 i = 0; i < pointers_.size(); i++) {
-      vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset);
-    }
-  }
-
-  // Loads a tile of size [`tile_size_along_major_dim`,
-  // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`,
-  // minor: `minor_dim_offset`} and then broadcasts each element into a vector
-  // of size vsl_.vector_size().  The (i,j)'th element of the return value is
-  // the (i,j)'th element in the tile broadcasted into an LLVM vector.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  std::vector<std::vector<llvm::Value*>> LoadBroadcastTile(
-      llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const {
-    std::vector<std::vector<llvm::Value*>> result;
-    result.resize(pointers_.size());
-    for (int64 i = 0; i < pointers_.size(); i++) {
-      for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
-        result[i].push_back(vsl_->LoadBroadcast(
-            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
-      }
-    }
-    return result;
-  }
-
- private:
-  VectorSupportLibrary* vsl_;
-  llvm::IRBuilder<>* b_;
-  std::vector<llvm::Value*> pointers_;
-};
-
-// The base class for the classes representing the GEMV emitter configurations.
-//
-// The IR emitted (modulo the LLVM values representing the input and output
-// buffers) by the row major and column major GEMV emitters should be a function
-// of their configuration.  This is important because their configuration is
-// used as a key to cache the generated IR.
-class GemvConfig {
- public:
-  // Mixin for convenience.
-  template <typename T>
-  struct User {
-   public:
-    PrimitiveType scalar_type() const {
-      return derived().config().scalar_type();
-    }
-    int64 tile_rows() const { return derived().config().tile_rows(); }
-    int64 tile_cols() const { return derived().config().tile_cols(); }
-    int64 m() const { return derived().config().m(); }
-    int64 k() const { return derived().config().k(); }
-    int64 has_addend() const { return derived().config().has_addend(); }
-
-   private:
-    const T& derived() const { return *static_cast<const T*>(this); }
-  };
+// Returns true if we should call into multi-threaded Eigen routines.
+bool ShouldUseMultiThreadedEigen(const HloModuleConfig& config) {
+  return config.debug_options().xla_cpu_multi_thread_eigen();
+}
 
-  PrimitiveType scalar_type() const { return scalar_type_; }
-  int64 tile_rows() const { return tile_rows_; }
-  int64 tile_cols() const { return tile_cols_; }
-  int64 m() const { return m_; }
-  int64 k() const { return k_; }
-  bool has_addend() const { return has_addend_; }
-
-  string GetCacheKey() const {
-    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
-                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
-                        has_addend() ? "_with_addend" : "");
+// Represents a dot operation.  We use this in lieu of an `HloInstruction`
+// because we want to be able to create this for the "inner" dot operation in a
+// batch dot, for which there is no separate HLO instruction.
+struct DotInfo {
+  Shape lhs_shape;
+  Shape rhs_shape;
+  Shape result_shape;
+  DotDimensionNumbers dim_nums;
+
+  DotInfo() = default;
+
+  explicit DotInfo(const HloInstruction& instr) {
+    CHECK_EQ(instr.opcode(), HloOpcode::kDot);
+    lhs_shape = instr.operand(0)->shape();
+    rhs_shape = instr.operand(1)->shape();
+    result_shape = instr.shape();
+    dim_nums = instr.dot_dimension_numbers();
   }
-
- protected:
-  explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows,
-                      int64 tile_cols, int64 m, int64 k, bool has_addend)
-      : name_(std::move(name)),
-        scalar_type_(scalar_type),
-        tile_rows_(tile_rows),
-        tile_cols_(tile_cols),
-        m_(m),
-        k_(k),
-        has_addend_(has_addend) {}
-
- private:
-  string name_;
-  PrimitiveType scalar_type_;
-  int64 tile_rows_;
-  int64 tile_cols_;
-  int64 m_;
-  int64 k_;
-  bool has_addend_;
 };
 
-// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
-// layout of the vector does not matter).  This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-//   +----------------------+---+
-//   |                      |   |
-//   |                      |   |
-//   |         A            | B |
-//   |                      |   |
-//   |                      |   |
-//   |                      |   |
-//   +----------------------+---+
-//   |         C            | D |
-//   +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly dividied into
-// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-//   +---+---+---+---+       +--+--+--+--+
-//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//
-// (Legend: rows are horizontal and columns are vertical; and each column is one
-// llvm::Value of a vector type)
-//
-// where:
-//
-//   a. The left tile is from the column major left matrix.
-//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
-//      vector loaded from the RHS vector.
-//
-// As we iterate through the column dimension, we compute the change to the
-// result vector by an elementwise multiplication between the two tiles above
-// followed by a reduction along the major dimension:
-//
-//                     +-----------------------------------+
-//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
-//                     +-----------------------------------+
-//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
-// Result[R:R+4] +=    +-----------------------------------+
-//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
-//                     +-----------------------------------+
-//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
-//                     +-----------------------------------+
-//
-// Where R is the starting row for the tile.
-//
-// We have an inner epilogue loop to deal with the "C" submatrix and an outer
-// epilogue loop to deal with the B,D submarix.
-//
-// TODO(sanjoy): We should investigate if using gather loads and scatter stores
-// can be used here have the same inner loop for both column-major and row-major
-// matrix-vector products.
-class ColumnMajorMatrixVectorProductEmitter
-    : public GemvConfig::User<ColumnMajorMatrixVectorProductEmitter> {
- public:
-  class Config : public GemvConfig {
-   public:
-    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
-                    int64 m, int64 k, bool has_addend)
-        : GemvConfig(/*name=*/"col_major_gemv", scalar_type,
-                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
-                     /*k=*/k, /*has_addend=*/has_addend) {}
-  };
-
-  ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
-                                        llvm::Value* rhs, llvm::Value* addend,
-                                        llvm::Value* result,
-                                        llvm::IRBuilder<>* b)
-      : config_(config),
-        lhs_(lhs),
-        rhs_(rhs),
-        addend_(addend),
-        result_(result),
-        b_(b),
-        ksl_(b_),
-        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
-    CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
-    CHECK(!has_addend() || addend != nullptr);
-  }
-
-  void Emit();
-
-  const Config& config() const { return config_; }
-
- private:
-  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
-                         bool is_first_column);
-
-  MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
-    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
-                      /*matrix_size_along_minor_dim=*/m(),
-                      /*major_dim_offset=*/column_start,
-                      /*tile_size_along_major_dim=*/column_count);
-  }
-
-  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
-  // sequence of `count` values, each one broadcasted to the vector width.
-  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
-    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
-    std::vector<llvm::Value*> result;
-    result.reserve(count);
-    for (int64 i = 0; i < count; i++) {
-      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
-    }
-    return result;
-  }
-
-  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile,
-                          const std::vector<llvm::Value*>& rhs_tile,
-                          int64 columns, bool is_first_column);
-
-  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
-                             bool is_first_tiled_column);
-
-  Config config_;
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* addend_;
-  llvm::Value* result_;
-  llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
-  VectorSupportLibrary vsl_;
+// Dictates how a dot operation is implemented.
+enum class DotImplementationStrategy {
+  // The dot operation is lowered into LLVM IR that implements a naive nested
+  // loop that computes the result one element at a time.  This is our
+  // "fallback"; we don't really want this to kick in for any non-trival dot
+  // operation.
+  kNaiveLlvmIr,
+
+  // The dot operation is lowered into LLVM IR that implements a tiled
+  // Matrix*Vector operation.  This strategy also allows fusing in a bias add
+  // into the dot.  The matrix can be row major or column major, both are
+  // supported.
+  kTiledLlvmIrGemv,
+
+  // The dot operation is lowered into LLVM IR that implemetns a tiled
+  // Matrix*Matrix operation.  No fusions are supported.  The two inputs
+  // and the output have to be row major.
+  kTiledLlvmIrGemm,
+
+  // The dot operation is lowered into a call into an Eigen routine.  No fusions
+  // are supported today.  The two inputs and the output have to be row major.
+  // However, we do allow transposing either the LHS or the RHS as part of the
+  // GEMM -- we expose this flexibility as flexibility in the contraction
+  // dimensions, but we can also see this as flexibility in the input layouts.
+  kEigen,
 };
 
-void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
-    llvm::Value* column, int64 column_count, bool is_first_column) {
-  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column,
-                                                /*column_count=*/column_count);
-
-  std::vector<llvm::Value*> rhs_tile =
-      LoadRhsTile(column, /*count=*/column_count);
-  EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile,
-                     /*columns=*/column_count, is_first_column);
-  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
-}
-
-void ColumnMajorMatrixVectorProductEmitter::Emit() {
-  // See the comment on the class declaration for the algorithm used here.
-  int64 column_remainder = k() % tile_cols();
-  int64 column_limit = k() - column_remainder;
-
-  ksl_.For("dot.outer.tiled",
-           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
-           [&](llvm::Value* column, bool is_first_column) {
-             EmitOuterLoopBody(column, tile_cols(), is_first_column);
-           });
-
-  if (column_remainder != 0) {
-    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
-                      column_limit == 0);
-  }
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
-    MemoryTile* lhs_memory_tile, const std::vector<llvm::Value*>& rhs_tile,
-    int64 columns, bool is_first_column) {
-  int64 row_limit = m() - (m() % tile_rows());
-
-  ksl_.For(
-      "dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
-      /*step=*/tile_rows(), [&](llvm::Value* row) {
-        std::vector<llvm::Value*> lhs_tile =
-            lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
-        llvm::Value* accumulator =
-            is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
-                                       : vsl_.GetZeroVector())
-                            : vsl_.LoadVector(result_, row);
-        for (int i = 0; i < columns; i++) {
-          accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
-        }
-        vsl_.StoreVector(accumulator, result_, row);
-      });
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
-    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
-  int64 row_start = m() - (m() % tile_rows());
-  if (row_start == m()) {
-    return;
-  }
-
-  llvm::Value* columns_llvm = b_->getInt64(columns);
-
-  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
-  //   for (row = row_start, row < m_; row++) {
-  //     result[row] += lhs[row, col] * rhs[col]
-  //     // Also take into account that if col is 0 then result[row] is not
-  //     // initialized.
-  //   }
-
-  ksl_.For(
-      "dot.inner.epilg.outer", /*start=*/current_tile_col,
-      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
-      /*step=*/1, /*peel_first_iteration=*/false,
-      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
-        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
-        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
-        llvm::Value* lhs_base_pointer =
-            vsl_.ComputeOffsetPointer(lhs_, total_offset);
-        ksl_.For(
-            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
-            /*step=*/1, [&](llvm::Value* scalar_row) {
-              llvm::Value* product = vsl_.Mul(
-                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
-              llvm::Value* setting_result_first_time = b_->CreateAnd(
-                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
-              ksl_.If(
-                  setting_result_first_time,
-                  /*true_block_generator=*/
-                  [&]() {
-                    if (addend_) {
-                      vsl_.StoreScalar(
-                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
-                                   product),
-                          result_, scalar_row);
-                    } else {
-                      vsl_.StoreScalar(product, result_, scalar_row);
-                    }
-                  },
-                  /*false_block_generator=*/
-                  [&]() {
-                    vsl_.StoreScalar(
-                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
-                        result_, scalar_row);
-                  });
-            });
-      });
-}
+// Returns the implementation strategy for a dot with the configuration
+// `dot_info`.
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features);
 
-// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
-// layout of the vector does not matter).  This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-//   +----------------------+---+
-//   |                      |   |
-//   |                      |   |
-//   |         A            | B |
-//   |                      |   |
-//   |                      |   |
-//   |                      |   |
-//   +----------------------+---+
-//   |         C            | D |
-//   +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly dividied into
-// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-//   +---+---+---+---+
-//   |M00|M10|M20|M30|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M02|M12|M22|M32|
-//   +---+---+---+---+
-//   |M03|M13|M23|M33|
-//   +---+---+---+---+
-//
-// (Legend: rows are horizontal and columns are vertical; and each row is one
-// llvm::Value of a vector type)
-//
-// where:
-//
-//   a. The left tile is loaded from the row major left matrix.
-//   b. The right vector is loaded from the RHS vector.
-//
-// We keep 4 vector accumulators accumulating the following four vector
-// expressions as we iterate over the row dimension:
-//
-//   +------+------+------+------+
-//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
-//   +------+------+------+------+
-//
-// In the end we do a horizontal reduction over these 4 vector accumulators to
-// get 4 values in the result vector.
-//
-// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
-// epilogue loop to deal with the C,D submatrix.
-class RowMajorMatrixVectorProductEmitter
-    : public GemvConfig::User<RowMajorMatrixVectorProductEmitter> {
+// Helper class for emitting LLVM IR to perform the dot operation.
+class DotOpEmitter {
  public:
-  class Config : public GemvConfig {
-   public:
-    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
-                    int64 m, int64 k, bool has_addend)
-        : GemvConfig(/*name=*/"row_major_gemv", scalar_type,
-                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
-                     /*k=*/k, /*has_addend=*/has_addend) {}
-  };
-
-  RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
-                                     llvm::Value* rhs, llvm::Value* addend,
-                                     llvm::Value* result, llvm::IRBuilder<>* b)
-      : config_(config),
-        lhs_(lhs),
-        rhs_(rhs),
-        addend_(addend),
-        result_(result),
-        b_(b),
-        ksl_(b_),
-        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
-    CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
-    CHECK(!has_addend() || addend != nullptr);
-  }
-
-  void Emit();
-
-  const Config& config() const { return config_; }
+  explicit DotOpEmitter(DotInfo dot_info, string dot_hlo_name,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features);
+
+  // Emits the IR to perform the dot operation.
+  Status Emit();
 
  private:
-  MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
-    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
-                      /*matrix_size_along_minor_dim=*/k(),
-                      /*major_dim_offset=*/row_start,
-                      /*tile_size_along_major_dim=*/row_count);
-  }
-
-  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
-
-  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows,
-                          std::vector<VectorVariable>* vector_accumulators);
-
-  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
-                             std::vector<ScalarVariable>* scalar_accumulators);
-
-  Config config_;
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* addend_;
-  llvm::Value* result_;
-  llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
-  VectorSupportLibrary vsl_;
-};
-
-void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
-                                                           int64 row_count) {
-  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row,
-                                                /*row_count=*/row_count);
-  std::vector<VectorVariable> vector_accumulators;
-  std::vector<ScalarVariable> scalar_accumulators;
-  for (int i = 0; i < row_count; i++) {
-    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
-    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
-  }
-  EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count,
-                     &vector_accumulators);
-  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
-                        &scalar_accumulators);
-
-  std::vector<llvm::Value*> accumulator_values;
-  std::transform(
-      vector_accumulators.begin(), vector_accumulators.end(),
-      std::back_inserter(accumulator_values),
-      [](const VectorVariable& vector_var) { return vector_var.Get(); });
-
-  std::vector<llvm::Value*> horizontal_sums;
-  if (row_count == vsl_.vector_size()) {
-    if (addend_) {
-      horizontal_sums = vsl_.ComputeHorizontalSums(
-          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
-    } else {
-      horizontal_sums =
-          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
-    }
-  } else {
-    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
-  }
-
-  for (int i = 0; i < row_count; i++) {
-    llvm::Value* result_value =
-        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
-    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
-    if (addend_ && row_count != vsl_.vector_size()) {
-      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
-    }
-    vsl_.StoreScalar(result_value, result_, offset);
-  }
-}
+  // Emits instructions to perform a scalar dot product (a multiply of the
+  // LHS and RHS) and store the results in the target.
+  Status EmitScalarDot();
 
-void RowMajorMatrixVectorProductEmitter::Emit() {
-  // See the comment on the class declaration for the algorithm used here.
-  int64 row_remainder = m() % tile_rows();
-  int64 row_limit = m() - row_remainder;
+  // Emits a call to the CPU runtime to perform the matrix multiply.
+  Status EmitCallToRuntime();
 
-  ksl_.For("dot.outer.tiled",
-           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
-           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
-
-  if (row_remainder != 0) {
-    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
-  }
-}
+  // Represents the dimensions of a matrix-matrix multiply operation.
+  struct MatMultDims {
+    // The number of rows in the LHS.
+    int64 m;
 
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
-    MemoryTile* lhs_memory_tile, int64 rows,
-    std::vector<VectorVariable>* vector_accumulators) {
-  int64 column_limit = k() - (k() % tile_cols());
-
-  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
-           /*step=*/tile_cols(), [&](llvm::Value* col) {
-             std::vector<llvm::Value*> lhs_tile =
-                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
-             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
-             for (int i = 0; i < rows; i++) {
-               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
-               (*vector_accumulators)[i].Set(
-                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
-             }
-           });
-}
+    // The number of columns in the LHS, which is also must be equal to the
+    // number of rows in the RHS.
+    int64 k;
 
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
-    llvm::Value* current_tile_row, int64 rows,
-    std::vector<ScalarVariable>* scalar_accumulators) {
-  int64 column_start = k() - (k() % tile_cols());
-  if (column_start == k()) {
-    return;
-  }
+    // The number of columns on the RHS.
+    int64 n;
 
-  for (int r = 0; r < rows; r++) {
-    llvm::Value* total_offset = b_->CreateMul(
-        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
-    llvm::Value* lhs_base_pointer =
-        vsl_.ComputeOffsetPointer(lhs_, total_offset);
-    ksl_.For(
-        "dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
-        /*step=*/1, [&](llvm::Value* scalar_col) {
-          llvm::Value* product =
-              vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
-                       vsl_.LoadScalar(rhs_, scalar_col));
-          llvm::Value* old_value = (*scalar_accumulators)[r].Get();
-          (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
-        });
-  }
-}
+    // True if the LHS matrix is column major.
+    bool lhs_column_major;
 
-// This class implements a tiled matrix multiplication algorithm, intended for
-// multiplying small matrices that don't need cache tiling.
-//
-// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
-// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
-// high-performance matrix multiplication." ACM Transactions on Mathematical
-// Software (TOMS) 34.3 (2008): 12.".
-//
-// This only supports canonical dot operations (i.e. where the lhs contraction
-// dimension is 1 and the rhs contraction dimension is 0) over row major
-// matrices.
-class TiledSmallGemmEmitter {
- public:
-  // Describe the dimensions of the kernel.
-  class Dimensions {
-   public:
-    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+    // True if the LHS contraction dimension is not 1.
+    bool lhs_non_canonical;
 
-    int64 m() const { return m_; }
-    int64 k() const { return k_; }
-    int64 n() const { return n_; }
+    // True if the RHS matrix is column major.
+    bool rhs_column_major;
 
-    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
+    // True if the RHS contraction dimension is not 0.
+    bool rhs_non_canonical;
 
-   private:
-    const int64 m_;
-    const int64 k_;
-    const int64 n_;
+    // True if the result matrix is column major.
+    bool target_column_major;
   };
 
-  // Represents the configuration of the emitter.  The LLVM IR emitted by the
-  // emitter, modulo the LLVM values holding the input and output buffers, must
-  // be a function of the instance of `Config` passed to it.
-  //
-  // `dims` holds the matrix multiplication dimensions.
-  //
-  // `max_vectorization_width` is the maximum vector width (i.e. the width of
-  // the largest vector register we will use).  This can be larger than the
-  // largest vector register supported by the machine -- LLVM will legalize
-  // these large vector widths into legally sized vectors.
-  //
-  // `max_vector_count` is the maximum number of vectors of size
-  // `max_vectorization_width` that we will attempt to process at once.
-  //
-  // `min_vectorization_width` is the smallest vector width the emitter will use
-  // -- below that it will devolve to using a scalar loop.
-  //
-  // The innermost reduction loop executes the matrix multiply in tiles of size
-  // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`,
-  // <vectorization width>] in the RHS.
-  class Config {
-   public:
-    explicit Config(PrimitiveType scalar_type, Dimensions dims,
-                    int64 max_vectorization_width, int64 max_vector_count,
-                    int64 min_vectorization_width, int64 tile_size_m,
-                    int64 tile_size_k)
-        : scalar_type_(scalar_type),
-          dims_(dims),
-          max_vectorization_width_(max_vectorization_width),
-          max_vector_count_(max_vector_count),
-          min_vectorization_width_(min_vectorization_width),
-          tile_size_m_(tile_size_m),
-          tile_size_k_(tile_size_k) {}
-
-    string GetCacheKey() const {
-      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
-                          dims().ToString(), "_", max_vectorization_width(),
-                          "_", min_vectorization_width(), "_", tile_size_m(),
-                          "_", tile_size_k());
-    }
+  // Get the MatMultDims instance for the dot product this DotOpEmitter
+  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
+  // of rank 2 as well).
+  MatMultDims GetMatMultDims() const;
 
-    PrimitiveType scalar_type() const { return scalar_type_; }
-    Dimensions dims() const { return dims_; }
-    int64 max_vectorization_width() const { return max_vectorization_width_; }
-    int64 max_vector_count() const { return max_vector_count_; }
-    int64 min_vectorization_width() const { return min_vectorization_width_; }
-
-    int64 tile_size_m() const { return tile_size_m_; }
-    int64 tile_size_k() const { return tile_size_k_; }
-
-   private:
-    PrimitiveType scalar_type_;
-    Dimensions dims_;
-    int64 max_vectorization_width_;
-    int64 max_vector_count_;
-    int64 min_vectorization_width_;
-    int64 tile_size_m_;
-    int64 tile_size_k_;
-  };
+  // Lowers the dot operation as a tiled Matrix*Vector loop.
+  void EmitTiledLlvmIrGemv();
 
-  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
-  // `lhs` with `rhs` and stores the result in `result`.
-  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
-                                 llvm::Value* rhs, llvm::Value* result,
-                                 llvm::IRBuilder<>* b)
-      : lhs_(lhs),
-        rhs_(rhs),
-        result_(result),
-        config_(config),
-        b_(b),
-        ksl_(b_) {
-    CHECK(max_vectorization_width() > 0 &&
-          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
-    CHECK_GT(max_vector_count(), 0);
-    CHECK(min_vectorization_width() > 0 &&
-          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
-    CHECK_GE(max_vectorization_width(), min_vectorization_width());
-    CHECK_GT(tile_size_k(), 0);
-  }
+  // Lowers the dot operation as a tiled Matrix*Matrix loop.
+  void EmitTiledLlvmIrGemm();
 
-  void Emit();
+  // Lowers the dot operation as a naive nested loop that computes the result
+  // one element at a time.
+  void EmitNaiveLlvmIrGemm();
 
- private:
-  // The HandleResiduesOnX helpers split the iteration space for dimension X
-  // into a multiple of the tile size on dimension X and an epilogue.  These
-  // helpers ultimately call into `EmitTiledGemm` for emitting the
-  // tiled GEMM kernel.
-
-  void HandleResiduesOnN();
-  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
-                         llvm::Value* n_end);
-  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
-                         llvm::Value* k_start, llvm::Value* k_end,
-                         llvm::Value* n_start, llvm::Value* n_end);
-
-  // This emits a tiled GEMM kernel.  For a detailed description see the comment
-  // on the implementation.
-  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
-                     llvm::Value* k_start, llvm::Value* k_end,
-                     llvm::Value* n_start, llvm::Value* n_end,
-                     int64 tile_size_m, llvm::Value* m_start,
-                     llvm::Value* m_end);
-
-  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
-
-  Config config() const { return config_; }
-  Dimensions dims() const { return config().dims(); }
-
-  int64 max_vectorization_width() const {
-    return config().max_vectorization_width();
+  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
+  // registers.
+  int64 GetGemvTilingFactor() const {
+    const int64 kDefaultTilingFactor = 8;
+    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
+        .value_or(kDefaultTilingFactor);
   }
-  int64 max_vector_count() const { return config().max_vector_count(); }
-  int64 min_vectorization_width() const {
-    return config().min_vectorization_width();
-  }
-  int64 tile_size_m() const { return config().tile_size_m(); }
-  int64 tile_size_k() const { return config().tile_size_k(); }
-  PrimitiveType scalar_type() const { return config().scalar_type(); }
 
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* result_;
-  Config config_;
+  std::tuple<int64, int64, int64> GetGemmTileSize() const {
+    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
+    //
+    // TODO(b/80093688): Tune for other architectures and centralize this
+    // information in one place.
+    const std::tuple<int64, int64, int64> kDefaultTileSize =
+        std::tuple<int64, int64, int64>(11, 9, 1);
+    return options::LlvmIrGemmTileSize(hlo_module_config_)
+        .value_or(kDefaultTileSize);
+  }
 
+  DotInfo dot_info_;
+  string dot_hlo_name_;
+  const llvm_ir::IrArray& target_array_;
+  const llvm_ir::IrArray& lhs_array_;
+  const llvm_ir::IrArray& rhs_array_;
+  const llvm_ir::IrArray* addend_array_;
+  llvm::Value* executable_run_options_value_;
   llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
+  const HloModuleConfig& hlo_module_config_;
+  const TargetMachineFeatures& target_machine_features_;
 };
-
-void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
-
-void TiledSmallGemmEmitter::HandleResiduesOnN() {
-  // We can only iterate the `n` dimension for an extent that is divisible by
-  // the vectorization width.  So we emit an outer loop that first processes the
-  // largest extent in `n` that is divisible by max_vectorization_width, then
-  // the largest remaining extent that is divisible by max_vectorization_width /
-  // 2 etc.
-
-  int64 current_vectorization_width =
-      max_vector_count() * max_vectorization_width();
-  int64 current_vector_count = max_vector_count();
-
-  int64 n_start = 0;
-  while (n_start != dims().n() &&
-         current_vectorization_width >= min_vectorization_width()) {
-    int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
-    if (n_start != n_end) {
-      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
-                               "gemm");
-      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
-      n_start = n_end;
-    }
-    if (current_vector_count == 1) {
-      current_vectorization_width /= 2;
-    } else {
-      current_vector_count--;
-      current_vectorization_width =
-          current_vector_count * max_vectorization_width();
-    }
-  }
-
-  if (n_start != dims().n()) {
-    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
-    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
-      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
-      HandleResiduesOnK(&vsl, n_i, n_i_next);
-    });
-  }
-}
-
-void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
-                                              llvm::Value* n_start,
-                                              llvm::Value* n_end) {
-  int64 k_start = 0;
-  int64 k_end = dims().k() - (dims().k() % tile_size_k());
-  if (k_end != k_start) {
-    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
-                      n_start, n_end);
-    k_start = k_end;
-  }
-
-  if (k_start != dims().k()) {
-    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
-                      GetInt64(dims().k()), n_start, n_end);
-  }
-}
-
-void TiledSmallGemmEmitter::HandleResiduesOnM(
-    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
-    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
-  const int64 m_end = dims().m() - dims().m() % tile_size_m();
-  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
-                GetInt64(0), GetInt64(m_end));
-
-  if (m_end != dims().m()) {
-    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
-                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
-  }
-}
-
-// The loop structure is:
-//
-// Iterate over dimension M as m:
-//   Iterate over dimension N as n:
-//     Iterate over dimension K as k:
-//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
-//
-// I.e. a just a tiled version of a "naive" GEMM.
-//
-// The tiling scheme is as follows:
-//
-// Let the LHS be:
-//
-//   +----+----+----+
-//   | a0 | b0 | c0 | .
-//   +----+----+----+ .
-//   | a1 | b1 | c1 | .
-//   +----+----+----+
-//     ..     ..
-//
-// and the RHS be:
-//
-//   +----+----+----+----+
-//   | p0 | p1 | p2 | p3 | .
-//   +----+----+----+----+ .
-//   | q0 | q1 | q2 | q3 | .
-//   +----+----+----+----+
-//   | r0 | r1 | r2 | r3 | .
-//   +----+----+----+----+ .
-//     ......    ......
-//
-// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted
-// by `vsl`) be 4.  Then we want to matrix multiply this tile to get a [2,4]
-// matrix that we can increment the result matrix by.
-//
-// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank
-// 3 array, L, of dimension [2,3,4]:
-//
-//       L[0,_,_]           *      L[1,_,_]
-//                          *
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | a0 | a0 | a0 | a0 |  *  | a1 | a1 | a1 | a1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | b0 | b0 | b0 | b0 |  *  | b1 | b1 | b1 | b1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | c0 | c0 | c0 | c0 |  *  | c1 | c1 | c1 | c1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//
-//
-// Then we FMA L[0,_,_] with the RHS to get the first row of the result and
-// L[1,_,_] with the RHS to get the second row of the result.  For example,
-// L[0,_,_] is computed as:
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 |   +
-//   +----+----+----+----+   +----+----+----+----+
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 |   +
-//   +----+----+----+----+   +----+----+----+----+
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 |
-//   +----+----+----+----+   +----+----+----+----+
-//
-// to get:
-//
-//   +-------------------+-------------------+-------------------+---------
-//   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
-//   +-------------------+-------------------+-------------------+---------
-void TiledSmallGemmEmitter::EmitTiledGemm(
-    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
-    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
-    int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
-  ksl_.For(
-      "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
-        MemoryTile result_memory_tile(
-            vsl, b_, /*matrix=*/result_,
-            /*matrix_size_along_minor_dim=*/dims().n(),
-            /*major_dim_offset=*/m_i,
-            /*tile_size_along_major_dim=*/tile_size_m);
-        MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
-                                   /*matrix_size_along_minor_dim=*/dims().k(),
-                                   /*major_dim_offset=*/m_i,
-                                   /*tile_size_along_major_dim=*/tile_size_m);
-        ksl_.For(
-            "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
-              TileVariable result_tile_var(vsl,
-                                           result_memory_tile.LoadTile(n_i));
-              ksl_.For(
-                  "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
-                    MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
-                                               tile_size_k);
-                    std::vector<std::vector<llvm::Value*>> lhs_tile =
-                        lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
-                    std::vector<llvm::Value*> rhs_tile =
-                        rhs_memory_tile.LoadTile(n_i);
-                    std::vector<llvm::Value*> result_tile =
-                        result_tile_var.Get();
-                    for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
-                      for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
-                        result_tile[r_m_i] =
-                            vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
-                                        result_tile[r_m_i]);
-                      }
-                    }
-                    result_tile_var.Set(result_tile);
-                  });
-
-              result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
-            });
-      });
-}
-
 }  // namespace
 
-DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
+DotOpEmitter::DotOpEmitter(DotInfo dot_info, string dot_hlo_name,
                            const llvm_ir::IrArray& target_array,
                            const llvm_ir::IrArray& lhs_array,
                            const llvm_ir::IrArray& rhs_array,
@@ -974,7 +211,8 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
                            llvm::IRBuilder<>* b,
                            const HloModuleConfig& hlo_module_config,
                            const TargetMachineFeatures& target_machine_features)
-    : dot_(dot),
+    : dot_info_(std::move(dot_info)),
+      dot_hlo_name_(std::move(dot_hlo_name)),
       target_array_(target_array),
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
@@ -984,58 +222,9 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
       hlo_module_config_(hlo_module_config),
       target_machine_features_(target_machine_features) {}
 
-/* static */ Status DotOpEmitter::EmitDotOperation(
-    const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
-    const llvm_ir::IrArray* addend_array,
-    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-    const HloModuleConfig& hlo_module_config,
-    const TargetMachineFeatures& target_machine_features) {
-  PrimitiveType type = target_array.GetShape().element_type();
-  TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
-  DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array,
-                           addend_array, executable_run_options_value, b,
-                           hlo_module_config, target_machine_features);
-  return dot_emitter.Emit();
-}
-
-bool DotOpEmitter::EmitSmallGemmIfProfitable(
-    const DotOpEmitter::MatMultDims& mat_mult_dims) {
-  if (ShouldUseMultiThreadedEigen()) {
-    return false;
-  }
-
-  if (!EnableExperimentalLlvmIrGemm()) {
-    // TODO(sanjoy):  We should make these numbers micro-arch specific.
-    bool small_gemm = mat_mult_dims.k <= 128 &&
-                      ((mat_mult_dims.m <= 32 && mat_mult_dims.n <= 128) ||
-                       (mat_mult_dims.m <= 128 && mat_mult_dims.n <= 32));
-    if (!small_gemm) {
-      return false;
-    }
-  }
-
-  if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) {
-    return false;
-  }
-
-  PrimitiveType primitive_type = dot_.shape().element_type();
-
-  switch (primitive_type) {
-    default:
-      return false;
-
-    case F32:
-    case F64:
-    case S32:
-    case S64:
-      break;
-  }
-
-  if (!(mat_mult_dims.lhs_column_major == mat_mult_dims.rhs_column_major &&
-        mat_mult_dims.rhs_column_major == mat_mult_dims.target_column_major)) {
-    return false;
-  }
+void DotOpEmitter::EmitTiledLlvmIrGemm() {
+  PrimitiveType primitive_type = dot_info_.result_shape.element_type();
+  MatMultDims mat_mult_dims = GetMatMultDims();
 
   llvm::Value* lhs = lhs_array_.GetBasePointer();
   llvm::Value* rhs = rhs_array_.GetBasePointer();
@@ -1050,9 +239,8 @@ bool DotOpEmitter::EmitSmallGemmIfProfitable(
   }
 
   int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  b_->CreateMemSet(
-      target, b_->getInt8(0), size_bytes,
-      target_machine_features_.minimum_alignment_for_allocation(size_bytes));
+  b_->CreateMemSet(target, b_->getInt8(0), /*Size=*/size_bytes,
+                   /*Align=*/1);
 
   int64 max_target_vector_width =
       target_machine_features_.vector_register_num_elements(
@@ -1062,47 +250,28 @@ bool DotOpEmitter::EmitSmallGemmIfProfitable(
   std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
       GetGemmTileSize();
 
-  TiledSmallGemmEmitter::Config config(
-      /*scalar_type=*/primitive_type,
-      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
-      /*max_vectorization_width=*/max_target_vector_width,
-      /*max_vector_count=*/tile_size_n_in_vector_width,
-      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
-      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
-
-  VLOG(2) << "Emitting GEMM kernel in LLVM IR with config "
-          << config.GetCacheKey();
-
   const bool enable_fast_math =
       hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
   const bool optimize_for_size =
       options::OptimizeForSizeRequested(hlo_module_config_);
 
-  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+  EmitSmallGemm(
+      /*scalar_type=*/primitive_type,
+      /*m=*/m, /*k=*/k, /*n=*/n,
+      /*max_vectorization_width=*/max_target_vector_width,
+      /*max_vector_count=*/tile_size_n_in_vector_width,
+      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k, /*lhs=*/lhs,
+      /*rhs=*/rhs, /*result=*/target, b_,
       /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs,
-      rhs, target,
-      [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) {
-        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
-                                                 /*rhs=*/rhs,
-                                                 /*result=*/target, b_);
-        small_gemm_emitter.Emit();
-      });
-
-  return true;
+      /*optimize_for_size=*/optimize_for_size);
 }
 
-bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
-  if (dot_.shape().dimensions_size() != 2) {
-    return false;
-  }
-
-  PrimitiveType primitive_type = dot_.shape().element_type();
+void DotOpEmitter::EmitTiledLlvmIrGemv() {
+  PrimitiveType primitive_type = dot_info_.result_shape.element_type();
 
-  if (!primitive_util::IsFloatingPointType(primitive_type) &&
-      !primitive_util::IsIntegralType(primitive_type)) {
-    return false;
-  }
+  CHECK(primitive_util::IsFloatingPointType(primitive_type) ||
+        primitive_util::IsIntegralType(primitive_type));
 
   MatMultDims mat_mult_dims = GetMatMultDims();
   bool is_column_major_matrix_vector = false;
@@ -1143,9 +312,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
     }
   }
 
-  if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
-    return EmitSmallGemmIfProfitable(mat_mult_dims);
-  }
+  CHECK(is_column_major_matrix_vector || is_row_major_matrix_vector);
 
   int64 tiling_factor = GetGemvTilingFactor();
   CHECK_GT(tiling_factor, 0);
@@ -1177,44 +344,27 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   if (is_column_major_matrix_vector) {
     VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    ColumnMajorMatrixVectorProductEmitter::Config config(
+    EmitColumnMajorGemv(
         /*scalar_type=*/primitive_type,
         /*tile_rows=*/vector_register_element_size, /*tile_cols=*/tiling_factor,
-        /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr);
-
-    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
+        /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
+        /*result=*/result_op, b_,
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
-        lhs_op, rhs_op,
-        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
-        [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
-                       llvm::Value* addend_op, llvm::Value* result_op) {
-          ColumnMajorMatrixVectorProductEmitter emitter(
-              config, lhs_op, rhs_op, addend_op, result_op, b_);
-          emitter.Emit();
-        });
+        /*optimize_for_size=*/optimize_for_size);
   } else {
     VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    RowMajorMatrixVectorProductEmitter::Config config(
+    EmitRowMajorGemv(
         /*scalar_type=*/primitive_type,
-        /*tile_rows=*/tiling_factor, /*tile_cols=*/vector_register_element_size,
-        /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr);
-
-    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*tile_rows=*/tiling_factor,
+        /*tile_cols=*/vector_register_element_size,
+        /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
+        /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
+        /*result=*/result_op, b_,
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
-        lhs_op, rhs_op,
-        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
-        [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
-                       llvm::Value* addend_op, llvm::Value* result_op) {
-          RowMajorMatrixVectorProductEmitter emitter(config, lhs_op, rhs_op,
-                                                     addend_op, result_op, b_);
-          emitter.Emit();
-        });
+        /*optimize_for_size=*/optimize_for_size);
   }
-
-  return true;
 }
 
 Status DotOpEmitter::Emit() {
@@ -1240,11 +390,6 @@ Status DotOpEmitter::Emit() {
   // which performs the sum-of-products (the reduction loop) before storing
   // the result in the output buffer.
 
-  // This routine assumes that the dot operation is not in a parallelized
-  // enclosing computation.
-  CHECK(
-      dot_.parent()->root_instruction()->outer_dimension_partitions().empty());
-
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
 
@@ -1255,27 +400,41 @@ Status DotOpEmitter::Emit() {
     return EmitScalarDot();
   }
 
-  if (EmitLlvmIrDotIfProfitable()) {
-    return Status::OK();
+  switch (GetDotImplementationStrategy(hlo_module_config_, dot_info_,
+                                       target_machine_features_)) {
+    case DotImplementationStrategy::kNaiveLlvmIr:
+      EmitNaiveLlvmIrGemm();
+      return Status::OK();
+
+    case DotImplementationStrategy::kTiledLlvmIrGemv:
+      EmitTiledLlvmIrGemv();
+      return Status::OK();
+
+    case DotImplementationStrategy::kTiledLlvmIrGemm:
+      EmitTiledLlvmIrGemm();
+      return Status::OK();
+
+    case DotImplementationStrategy::kEigen:
+      return EmitCallToRuntime();
   }
+}
 
+void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   CHECK_EQ(addend_array_, nullptr);
 
-  if (PotentiallyImplementedAsEigenDot(dot_, target_machine_features_)) {
-    return EmitCallToRuntime();
-  }
+  const Shape& lhs_shape = lhs_array_.GetShape();
+  const Shape& rhs_shape = rhs_array_.GetShape();
+  const DotDimensionNumbers& dim_nums = dot_info_.dim_nums;
 
   // Reduce along dimension 0 of the LHS and 1 of the RHS. Vectors are a special
   // case where the reduction dimension is 0 for both LHS and RHS. This results
   // in a vector dot product producing a scalar.
-  int64 lhs_reduction_dimension =
-      dot_.dot_dimension_numbers().lhs_contracting_dimensions(0);
-  int64 rhs_reduction_dimension =
-      dot_.dot_dimension_numbers().rhs_contracting_dimensions(0);
+  int64 lhs_reduction_dimension = dim_nums.lhs_contracting_dimensions(0);
+  int64 rhs_reduction_dimension = dim_nums.rhs_contracting_dimensions(0);
 
   // Verify the reduction dimension in the two operands are the same size.
-  TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) ==
-               rhs_shape.dimensions(rhs_reduction_dimension));
+  CHECK_EQ(lhs_shape.dimensions(lhs_reduction_dimension),
+           rhs_shape.dimensions(rhs_reduction_dimension));
 
   bool lhs_reduction_along_minor_dimension =
       lhs_reduction_dimension == LayoutUtil::Minor(lhs_shape.layout(), 0);
@@ -1285,7 +444,7 @@ Status DotOpEmitter::Emit() {
   // Create loop nests which loop through the LHS operand dimensions and the RHS
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
-  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), b_);
+  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(dot_hlo_name_), b_);
   llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
       lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
   llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
@@ -1390,8 +549,6 @@ Status DotOpEmitter::Emit() {
   // Set the IR builder insert point to the exit basic block of the outer most
   // loop.
   b_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
-
-  return Status::OK();
 }
 
 Status DotOpEmitter::EmitScalarDot() {
@@ -1438,7 +595,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded = ShouldUseMultiThreadedEigen();
+  bool multi_threaded = ShouldUseMultiThreadedEigen(hlo_module_config_);
   bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
@@ -1531,11 +688,11 @@ Status DotOpEmitter::EmitCallToRuntime() {
 }
 
 DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
-  CHECK_EQ(dot_.shape().dimensions_size(), 2);
+  CHECK_EQ(dot_info_.result_shape.dimensions_size(), 2);
 
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
-  const DotDimensionNumbers& dim_nums = dot_.dot_dimension_numbers();
+  const DotDimensionNumbers& dim_nums = dot_info_.dim_nums;
 
   return {
       /*m=*/lhs_shape.dimensions(1 - dim_nums.lhs_contracting_dimensions(0)),
@@ -1549,74 +706,6 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
       LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
-// Return whether the given shape is rank 2.
-static bool IsRank2(const Shape& shape) { return ShapeUtil::Rank(shape) == 2; }
-
-// In a gemm operation where output = lhs * rhs, check whether the given shapes
-// are valid for the operation.
-static bool AreValidGemmShapes(
-    const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape,
-    const TargetMachineFeatures& target_machine_features) {
-  // The inputs and the output must
-  // 1) be matrices with no padding, and
-  // 2) have an allowed element type.
-  PrimitiveType output_primitive_type = output_shape.element_type();
-  if (!(output_primitive_type == F64 || output_primitive_type == F32 ||
-        output_primitive_type == F16)) {
-    return false;
-  }
-
-  if (!(IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape))) {
-    return false;
-  }
-
-  auto is_aligned = [&](const Shape& shape) {
-    return GetMinimumAlignmentForArray(shape, target_machine_features) >=
-           TargetMachineFeatures::kEigenExpectedTensorAlignment;
-  };
-
-  if (!is_aligned(lhs_shape) || !is_aligned(rhs_shape) ||
-      !is_aligned(output_shape)) {
-    return false;
-  }
-
-  return true;
-}
-
-bool PotentiallyImplementedAsEigenDot(
-    const HloInstruction& hlo,
-    const TargetMachineFeatures& target_machine_features) {
-  // For certain types of Dot, we can call Eigen
-  if (hlo.opcode() == HloOpcode::kDot) {
-    const Shape& lhs_shape = hlo.operand(0)->shape();
-    const Shape& rhs_shape = hlo.operand(1)->shape();
-
-    if (ShapeUtil::IsZeroElementArray(lhs_shape) ||
-        ShapeUtil::IsZeroElementArray(rhs_shape)) {
-      return false;
-    }
-
-    if (ProfitableToImplementDotInTiledLlvmIr(hlo)) {
-      return false;
-    }
-
-    // If gemm can accept the operand shapes, use it rather than a custom
-    // kernel.
-    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape(),
-                           target_machine_features)) {
-      const DotDimensionNumbers& dim_numbers = hlo.dot_dimension_numbers();
-      // The size of the reduction dimension should match. The shape inference
-      // guarantees this invariant, so the check here is for programming
-      // errors.
-      CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)),
-               rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0)));
-      return true;
-    }
-  }
-
-  return false;
-}
-
 // For vector-matrix dot products, it is always profitable to make the Rhs
 // column major.
 absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
@@ -1655,16 +744,319 @@ absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
   return {};
 }
 
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
+namespace {
+// Return whether the given shape is rank 2.
+bool IsRank2(const Shape& shape) { return shape.rank() == 2; }
+
+bool IsSimpleLayout(const Layout& layout) {
+  return layout.tiles().empty() && layout.format() == DENSE;
+}
+
+// In a gemm operation where output = lhs * rhs, check whether the given shapes
+// are valid for the operation.
+bool AreGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
+                   const Shape& output_shape,
+                   const TargetMachineFeatures& target_machine_features) {
+  CHECK(!lhs_shape.has_layout() || IsSimpleLayout(lhs_shape.layout()))
+      << lhs_shape.DebugString();
+  CHECK(!rhs_shape.has_layout() || IsSimpleLayout(rhs_shape.layout()))
+      << rhs_shape.DebugString();
+  CHECK(!output_shape.has_layout() || IsSimpleLayout(output_shape.layout()))
+      << output_shape.DebugString();
+
+  switch (output_shape.element_type()) {
+    case F64:
+    case F32:
+    case F16:
+      return IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape);
+    default:
+      return false;
+  }
+}
+
+bool IsAlignedGemm(const DotInfo& dot_info,
+                   const TargetMachineFeatures& target_machine_features) {
+  if (ShapeUtil::IsZeroElementArray(dot_info.lhs_shape) ||
+      ShapeUtil::IsZeroElementArray(dot_info.rhs_shape)) {
+    return false;
+  }
+
+  return AreGemmShapes(dot_info.lhs_shape, dot_info.rhs_shape,
+                       dot_info.result_shape, target_machine_features);
+}
+
+bool CanEmitTiledLlvmIrGemm(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features) {
+  CHECK(IsAlignedGemm(dot_info, target_machine_features));
+
+  if (ShouldUseMultiThreadedEigen(config)) {
+    return false;
+  }
+
+  int m = dot_info.result_shape.dimensions(0);
+  int k = dot_info.lhs_shape.dimensions(
+      dot_info.dim_nums.lhs_contracting_dimensions(0));
+  int n = dot_info.result_shape.dimensions(1);
+
+  if (!options::ForceEnableExperimentalLlvmIrGemm(config)) {
+    // TODO(sanjoy):  We should make these numbers micro-arch specific.
+    bool small_gemm =
+        k <= 128 && ((m <= 32 && n <= 128) || (m <= 128 && n <= 32));
+    if (!small_gemm) {
+      return false;
+    }
+  }
+
+  bool lhs_non_canonical = dot_info.dim_nums.lhs_contracting_dimensions(0) == 0;
+  bool rhs_non_canonical = dot_info.dim_nums.rhs_contracting_dimensions(0) == 1;
+
+  if (lhs_non_canonical || rhs_non_canonical) {
+    return false;
+  }
+
+  if (dot_info.result_shape.element_type() == F16) {
+    // TODO(sanjoy): This is probably easy to fix, but I want to keep the CL
+    // adding this comment NFC.
+    return false;
+  }
+
+  return true;
+}
+
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features) {
+  PrimitiveType element_type = dot_info.result_shape.element_type();
   // Any Matrix-Vector product of floating point or integral type, or
   // a transpose-dot fusion of the same can be lowered to a tiled LLVM
   // IR implementation.
-  const Shape& shape = dot.shape();
-  return shape.dimensions_size() == 2 &&
-         (shape.dimensions(0) == 1 || shape.dimensions(1) == 1) &&
-         (primitive_util::IsFloatingPointType(shape.element_type()) ||
-          primitive_util::IsIntegralType(shape.element_type()));
+  if (dot_info.result_shape.dimensions_size() == 2 &&
+      (dot_info.result_shape.dimensions(0) == 1 ||
+       dot_info.result_shape.dimensions(1) == 1) &&
+      (primitive_util::IsFloatingPointType(element_type) ||
+       primitive_util::IsIntegralType(element_type))) {
+    return DotImplementationStrategy::kTiledLlvmIrGemv;
+  }
+
+  if (IsAlignedGemm(dot_info, target_machine_features)) {
+    return CanEmitTiledLlvmIrGemm(config, dot_info, target_machine_features)
+               ? DotImplementationStrategy::kTiledLlvmIrGemm
+               : DotImplementationStrategy::kEigen;
+  }
+
+  return DotImplementationStrategy::kNaiveLlvmIr;
 }
 
+Status EmitNonBatchDotOperation(
+    DotInfo dot_info, string hlo_name, const llvm_ir::IrArray& target_array,
+    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
+    const llvm_ir::IrArray* addend_array,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
+    const HloModuleConfig& hlo_module_config,
+    const TargetMachineFeatures& target_machine_features) {
+  PrimitiveType type = target_array.GetShape().element_type();
+  TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type ||
+               C128 == type);
+  DotOpEmitter dot_emitter(std::move(dot_info), std::move(hlo_name),
+                           target_array, lhs_array, rhs_array, addend_array,
+                           executable_run_options_value, b, hlo_module_config,
+                           target_machine_features);
+  return dot_emitter.Emit();
+}
+
+Shape DropFirstDim(const Shape& shape) {
+  absl::Span<int64 const> array_shape_dims(shape.dimensions());
+  array_shape_dims.remove_prefix(1);
+  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+                                                  array_shape_dims);
+}
+
+Shape CollapseFirstNDims(const Shape& shape, int64 n) {
+  absl::Span<int64 const> input_shape_dims(shape.dimensions());
+  int64 prefix_dim =
+      std::accumulate(input_shape_dims.begin(), input_shape_dims.begin() + n,
+                      1ll, std::multiplies<int64>());
+  DimensionVector result_dims;
+  result_dims.push_back(prefix_dim);
+  std::copy(input_shape_dims.begin() + n, input_shape_dims.end(),
+            std::back_inserter(result_dims));
+  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+                                                  result_dims);
+}
+
+llvm_ir::IrArray CollapseFirstNDims(llvm::IRBuilder<>* b,
+                                    const llvm_ir::IrArray& array, int64 n) {
+  llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+  const Shape& shape = array.GetShape();
+  CHECK(shape.has_layout() &&
+        LayoutUtil::IsMonotonicWithDim0Major(shape.layout()));
+  CHECK_GE(shape.dimensions_size(), n);
+  Shape new_shape = CollapseFirstNDims(shape, n);
+  llvm::Value* new_value = b->CreateBitCast(
+      array.GetBasePointer(),
+      llvm_ir::ShapeToIrType(new_shape, module)->getPointerTo());
+  return llvm_ir::IrArray(new_value, std::move(new_shape));
+}
+
+Status ValidateDotDimensionNumbers(const DotDimensionNumbers& dim_numbers) {
+  // Checks some invariants that do not hold in general, but DotDecomposer
+  // should have established for us.  This is just a debugging aid.
+  TF_RET_CHECK(dim_numbers.lhs_contracting_dimensions_size() == 1);
+  std::vector<int64> batch_dim_numbers(dim_numbers.lhs_batch_dimensions_size());
+  absl::c_iota(batch_dim_numbers, 0);
+  TF_RET_CHECK(
+      absl::c_equal(batch_dim_numbers, dim_numbers.lhs_batch_dimensions()));
+  TF_RET_CHECK(
+      absl::c_equal(batch_dim_numbers, dim_numbers.rhs_batch_dimensions()));
+  return Status::OK();
+}
+
+// Slice out the inner array at batch index `batch_index` from `outer_array`.
+llvm_ir::IrArray SliceOutInnerArray(llvm_ir::IrArray outer_array,
+                                    llvm::Value* batch_index,
+                                    llvm::IRBuilder<>* b) {
+  llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+
+  Shape inner_shape = DropFirstDim(outer_array.GetShape());
+  llvm_ir::IrArray::Index slice_index(b->getInt64Ty());
+  slice_index.push_back(batch_index);
+  slice_index.InsertAt(
+      /*index=*/1, outer_array.GetShape().dimensions_size() - 1,
+      b->getInt64(0));
+  llvm::Value* slice_ptr = outer_array.EmitArrayElementAddress(slice_index, b);
+  llvm::Type* slice_ptr_type =
+      llvm_ir::ShapeToIrType(inner_shape, module)->getPointerTo();
+  return llvm_ir::IrArray(b->CreateBitCast(slice_ptr, slice_ptr_type),
+                          std::move(inner_shape));
+}
+
+Status EmitBatchDotOperation(
+    const HloInstruction& dot, const llvm_ir::IrArray& target_array,
+    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
+    const HloModuleConfig& hlo_module_config,
+    const TargetMachineFeatures& target_machine_features) {
+  TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(dot.dot_dimension_numbers()));
+
+  // Lower a batch dot into a sequence of non-batch dot operations.
+
+  int64 num_batch_dims =
+      dot.dot_dimension_numbers().lhs_batch_dimensions_size();
+
+  // First reshape the inputs to make sure we only have one batch dimension.
+  // This is a no-op bitcast because the operands have to be in row-major layout
+  // (enforced in CpuLayoutAssignment), and the batch dimensions are the leading
+  // dimensions (established by DotDecomposer and checked by
+  // ValidateDotDimensionNumbers above).
+  llvm_ir::IrArray lhs_array_reshaped =
+      CollapseFirstNDims(b, lhs_array, num_batch_dims);
+  llvm_ir::IrArray rhs_array_reshaped =
+      CollapseFirstNDims(b, rhs_array, num_batch_dims);
+  llvm_ir::IrArray target_array_reshaped =
+      CollapseFirstNDims(b, target_array, num_batch_dims);
+
+  int64 batch_count = lhs_array_reshaped.GetShape().dimensions(0);
+
+  KernelSupportLibrary ksl(b);
+
+  return ksl.ForWithStatus(
+      "bdot", /*start=*/0, /*end=*/batch_count, /*step=*/1,
+      [&](llvm::Value* indvar) {
+        DotDimensionNumbers adjusted_dim_numbers = dot.dot_dimension_numbers();
+        adjusted_dim_numbers.clear_lhs_batch_dimensions();
+        adjusted_dim_numbers.clear_rhs_batch_dimensions();
+
+        // Create a DotInfo representing the "inner" non-batch dot operation.
+        DotInfo dot_info;
+        dot_info.lhs_shape = DropFirstDim(lhs_array_reshaped.GetShape());
+        dot_info.rhs_shape = DropFirstDim(rhs_array_reshaped.GetShape());
+        dot_info.result_shape = DropFirstDim(target_array_reshaped.GetShape());
+        dot_info.dim_nums = dot.dot_dimension_numbers();
+        dot_info.dim_nums.clear_lhs_batch_dimensions();
+        dot_info.dim_nums.clear_rhs_batch_dimensions();
+
+        dot_info.dim_nums.set_lhs_contracting_dimensions(
+            0,
+            dot_info.dim_nums.lhs_contracting_dimensions(0) - num_batch_dims);
+        dot_info.dim_nums.set_rhs_contracting_dimensions(
+            0,
+            dot_info.dim_nums.rhs_contracting_dimensions(0) - num_batch_dims);
+
+        llvm_ir::IrArray lhs_slice =
+            SliceOutInnerArray(lhs_array_reshaped, /*batch_index=*/indvar, b);
+        llvm_ir::IrArray rhs_slice =
+            SliceOutInnerArray(rhs_array_reshaped, /*batch_index=*/indvar, b);
+        llvm_ir::IrArray target_slice = SliceOutInnerArray(
+            target_array_reshaped, /*batch_index=*/indvar, b);
+
+        // Emit the inner non-batch dot operation.
+        return EmitNonBatchDotOperation(
+            dot_info, dot.name(), target_slice, lhs_slice, rhs_slice, nullptr,
+            executable_run_options_value, b, hlo_module_config,
+            target_machine_features);
+      });
+}
+
+bool IsBatchDot(const HloInstruction& instr) {
+  if (auto* dot_instr = DynCast<HloDotInstruction>(&instr)) {
+    return dot_instr->dot_dimension_numbers().lhs_batch_dimensions_size() > 0;
+  }
+
+  return false;
+}
+}  // namespace
+
+bool DotImplementationCanHandleTranspose(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features) {
+  DotImplementationStrategy impl_strategy =
+      GetDotImplementationStrategy(dot_instr.parent()->parent()->config(),
+                                   DotInfo(dot_instr), target_machine_features);
+
+  // TODO(sanjoy): This is not quite right, it should be `impl_strategy ==
+  // kEigen || impl_strategy == kTiledLlvmIrGemv || impl_strategy ==
+  // kNaiveLlvmIr` but I'll fix this in a later CL in the interest of keeping
+  // the CL adding this comment NFC.
+  return impl_strategy == DotImplementationStrategy::kTiledLlvmIrGemm ||
+         impl_strategy == DotImplementationStrategy::kEigen;
+}
+
+bool DotOperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features) {
+  DotImplementationStrategy impl_strategy =
+      GetDotImplementationStrategy(dot_instr.parent()->parent()->config(),
+                                   DotInfo(dot_instr), target_machine_features);
+
+  return impl_strategy == DotImplementationStrategy::kTiledLlvmIrGemm ||
+         impl_strategy == DotImplementationStrategy::kEigen;
+}
+
+Status EmitDotOperation(const HloInstruction& dot,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features) {
+  // This routine assumes that the dot operation is not in a parallelized
+  // enclosing computation.
+  CHECK(dot.parent()->root_instruction()->outer_dimension_partitions().empty());
+
+  if (IsBatchDot(dot)) {
+    TF_RET_CHECK(addend_array == nullptr);
+    return EmitBatchDotOperation(dot, target_array, lhs_array, rhs_array,
+                                 executable_run_options_value, b,
+                                 hlo_module_config, target_machine_features);
+  }
+
+  return EmitNonBatchDotOperation(DotInfo(dot), dot.name(), target_array,
+                                  lhs_array, rhs_array, addend_array,
+                                  executable_run_options_value, b,
+                                  hlo_module_config, target_machine_features);
+}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 4c2041b556aa8bf8fe8fb8e0674c0f4f04f0acae..105bd3005c86d87443b2528eba7b0106ad70590e 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -30,9 +30,16 @@ limitations under the License.
 
 namespace xla {
 namespace cpu {
+// Returns true if the two operands and the output of `dot_instr` must have row
+// major layout.
+bool DotOperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features);
 
-bool PotentiallyImplementedAsEigenDot(
-    const HloInstruction& hlo,
+// Returns true our lowering strategy for `dot_instr` can fold in transposes to
+// the either of the inputs.
+bool DotImplementationCanHandleTranspose(
+    const HloInstruction& dot_instr,
     const TargetMachineFeatures& target_machine_features);
 
 // Returns the index for an operand to `hlo` that should ideally be column
@@ -41,129 +48,24 @@ bool PotentiallyImplementedAsEigenDot(
 absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
     const HloInstruction& hlo);
 
-// Returns true to indicate that we can generate a tiled LLVM IR implementation
-// for |dot|.
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot);
-
-// Helper class for emitting LLVM IR to perform the dot operation.
-class DotOpEmitter {
- public:
-  // Emit LLVM IR to perform the dot operation on lhs_array and rhs_array and
-  // place the result in target_array. IR is emitted at current insert point of
-  // the builder. Upon completion of the method, the insert point is set to the
-  // end of all instructions emitted for this operation.
-  //
-  // If `addend_array` is not nullptr then it must be an array of the same
-  // dimensions as the result, and the result is computed as `addend_array` +
-  // dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
-  // for Matrix-vector products.
-  static Status EmitDotOperation(
-      const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-      const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
-      const llvm_ir::IrArray* addend_array,
-      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-      const HloModuleConfig& hlo_module_config,
-      const TargetMachineFeatures& target_machine_features);
-
- private:
-  DotOpEmitter(const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-               const llvm_ir::IrArray& lhs_array,
-               const llvm_ir::IrArray& rhs_array,
-               const llvm_ir::IrArray* addend_array,
-               llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-               const HloModuleConfig& hlo_module_config,
-               const TargetMachineFeatures& target_machine_features);
-
-  // Emits the IR to perform the dot operation.
-  Status Emit();
-
-  // Emits instructions to perform a scalar dot product (a multiply of the
-  // LHS and RHS) and store the results in the target.
-  Status EmitScalarDot();
-
-  // Emit an LLVM IR implementation of the dot operation if we can.  Returns
-  // true if an LLVM IR implementation was emitted.
-  bool EmitLlvmIrDotIfProfitable();
-
-  // Emits a call to the CPU runtime to perform the matrix multiply.
-  Status EmitCallToRuntime();
-
-  // Represents the dimensions of a matrix-matrix multiply operation.
-  struct MatMultDims {
-    // The number of rows in the LHS.
-    int64 m;
-
-    // The number of columns in the LHS, which is also must be equal to the
-    // number of rows in the RHS.
-    int64 k;
-
-    // The number of columns on the RHS.
-    int64 n;
-
-    // True if the LHS matrix is column major.
-    bool lhs_column_major;
-
-    // True if the LHS contraction dimension is not 1.
-    bool lhs_non_canonical;
-
-    // True if the RHS matrix is column major.
-    bool rhs_column_major;
-
-    // True if the RHS contraction dimension is not 0.
-    bool rhs_non_canonical;
-
-    // True if the result matrix is column major.
-    bool target_column_major;
-  };
-
-  // Get the MatMultDims instance for the dot product this DotOpEmitter
-  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
-  // of rank 2 as well).
-  MatMultDims GetMatMultDims() const;
-
-  bool EmitSmallGemmIfProfitable(const MatMultDims& mat_mult_dims);
-
-  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
-  // registers.
-  int64 GetGemvTilingFactor() const {
-    const int64 kDefaultTilingFactor = 8;
-    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
-        .value_or(kDefaultTilingFactor);
-  }
-
-  std::tuple<int64, int64, int64> GetGemmTileSize() const {
-    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
-    //
-    // TODO(b/80093688): Tune for other architectures and centralize this
-    // information in one place.
-    const std::tuple<int64, int64, int64> kDefaultTileSize =
-        std::tuple<int64, int64, int64>(11, 9, 1);
-    return options::LlvmIrGemmTileSize(hlo_module_config_)
-        .value_or(kDefaultTileSize);
-  }
-
-  // Returns true if we should use an experimental implementation of GEMM
-  // (general matrix matrix multiplication) if possible.
-  bool EnableExperimentalLlvmIrGemm() const {
-    return options::EnableExperimentalLlvmIrGemm(hlo_module_config_);
-  }
-
-  // Returns true if we should call into multi-threaded Eigen routines.
-  bool ShouldUseMultiThreadedEigen() {
-    return hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
-  }
-
-  const HloInstruction& dot_;
-  const llvm_ir::IrArray& target_array_;
-  const llvm_ir::IrArray& lhs_array_;
-  const llvm_ir::IrArray& rhs_array_;
-  const llvm_ir::IrArray* addend_array_;
-  llvm::Value* executable_run_options_value_;
-  llvm::IRBuilder<>* b_;
-  const HloModuleConfig& hlo_module_config_;
-  const TargetMachineFeatures& target_machine_features_;
-};
-
+// Emit LLVM IR to perform the dot operation on lhs_array and rhs_array and
+// place the result in target_array. IR is emitted at current insert point of
+// the builder. Upon completion of the method, the insert point is set to the
+// end of all instructions emitted for this operation.
+//
+// If `addend_array` is not nullptr then it must be an array of the same
+// dimensions as the result, and the result is computed as `addend_array` +
+// dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
+// for Matrix-vector products.
+Status EmitDotOperation(const HloInstruction& dot,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features);
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc28918ed60a8086135846e2b9b1b9d75ec31ef6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
+
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+// -----------------------------------------------------------------------------
+// INTERNAL HEADER.
+//
+// This file exposes internal implementation details from dot_op_emitter.cc for
+// unit tests.  Please do not depend on this!
+//
+// -----------------------------------------------------------------------------
+
+namespace xla {
+namespace cpu {
+namespace internal {
+
+// Represents a dot operation.  We use this in lieu of an `HloInstruction`
+// because we want to be able to create this for the "inner" dot operation in a
+// batch dot, for which there is no separate HLO instruction.
+struct DotInfo {
+  Shape lhs_shape;
+  Shape rhs_shape;
+  Shape result_shape;
+  DotDimensionNumbers dim_nums;
+
+  explicit DotInfo(const HloInstruction& instr) {
+    CHECK_EQ(instr.opcode(), HloOpcode::kDot);
+    lhs_shape = instr.operand(0)->shape();
+    rhs_shape = instr.operand(1)->shape();
+    result_shape = instr.shape();
+    dim_nums = instr.dot_dimension_numbers();
+  }
+};
+
+// Dictates how a dot operation is implemented.
+enum class DotImplementationStrategy {
+  // The dot operation is lowered into LLVM IR that implements a naive nested
+  // loop that computes the result one element at a time.  This is our
+  // "fallback"; we don't really want this to kick in for any non-trival dot
+  // operation.
+  kNaiveLlvmIr,
+
+  // The dot operation is lowered into LLVM IR that implements a tiled
+  // Matrix*Vector operation.  This strategy also allows fusing in a bias add
+  // into the dot.  The matrix can be row major or column major, both are
+  // supported.
+  kTiledLlvmIrGemv,
+
+  // The dot operation is lowered into LLVM IR that implemetns a tiled
+  // Matrix*Matrix operation.  No fusions are supported.  The two inputs
+  // and the output have to be row major.
+  kTiledLlvmIrGemm,
+
+  // The dot operation is lowered into a call into an Eigen routine.  No fusions
+  // are supported today.  The two inputs and the output have to be row major.
+  // However, we do allow transposing either the LHS or the RHS as part of the
+  // GEMM -- we expose this flexibility as flexibility in the contraction
+  // dimensions, but we can also see this as flexibility in the input layouts.
+  kEigen,
+};
+
+// Returns the implementation strategy for a dot with the configuration
+// `dot_info`.
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features);
+}  // namespace internal
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 1a8bedfe6afb4f096ddd4703c312b84d521a7ba5..a8b139aec9e96b6bb580baf74789df7c998cebf8 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -26,7 +26,7 @@ namespace cpu {
 
 int64 GetMinimumAlignmentForArray(
     const Shape& shape, const TargetMachineFeatures& target_machine_features) {
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   CHECK(!LayoutUtil::HasLayout(shape) || LayoutUtil::IsDense(shape.layout()));
 
   // We don't require a layout to be set on `shape`.  This only works on CPU
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index ed7fe59c80ed68420cea8b51e1732489ac2a874e..0226e8275cb0e1de39c4c2e9a06d4cfa1a4854d3 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,11 +24,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/core/lib/math/math_util.h"
-#include "tensorflow/core/platform/logging.h"
-// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/types/span.h"
@@ -70,6 +68,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
@@ -223,11 +223,11 @@ Status IrEmitter::HandleConstant(HloInstruction* constant) {
 }
 
 Status IrEmitter::HandleCopy(HloInstruction* copy) {
-  if (ShapeUtil::IsTuple(copy->shape())) {
+  if (copy->shape().IsTuple()) {
     // kCopy shallow copies a tuple so just memcpy the top-level buffer.
     TF_RETURN_IF_ERROR(EmitTargetAddressForOp(copy));
     return EmitMemcpy(*(copy->operand(0)), *copy);
-  } else if (ShapeUtil::IsArray(copy->shape())) {
+  } else if (copy->shape().IsArray()) {
     // Use the elemental emitter for array shapes.
     return DefaultAction(copy);
   }
@@ -239,10 +239,12 @@ Status IrEmitter::HandleCopy(HloInstruction* copy) {
 int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) {
   int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
   DCHECK_GE(byte_size, 0);
-  // Largest scalar is a complex64 so we don't need to worry about the
+  // Largest scalar is a complex128 so we don't need to worry about the
   // int64->int truncation here.
-  DCHECK_LE(byte_size, 8);
-  return byte_size;
+  DCHECK_LE(byte_size, 16);
+
+  // Allocations may be 8-byte aligned if part of a small block.
+  return std::min(8LL, byte_size);
 }
 
 int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
@@ -316,7 +318,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   auto on_false = tuple_select->operand(2);
   TF_RET_CHECK(pred->shape().element_type() == PRED);
   TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()));
-  TF_RET_CHECK(ShapeUtil::IsTuple(tuple_select->shape()));
+  TF_RET_CHECK(tuple_select->shape().IsTuple());
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(tuple_select));
   llvm_ir::EmitTupleSelect(GetIrArrayFor(tuple_select), GetIrArrayFor(pred),
                            GetEmittedValueFor(on_true),
@@ -346,7 +348,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
   llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_,
                      module_);
 
-  if (ShapeUtil::IsTuple(data_shape)) {
+  if (data_shape.IsTuple()) {
     TF_RET_CHECK(!ShapeUtil::IsNestedTuple(data_shape));
 
     // For a tuple, we first copy each of the internal elements to
@@ -470,7 +472,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
   const Shape& operand_shape = operand->shape();
 
   llvm::Value* value = GetEmittedValueFor(operand);
-  if (!ShapeUtil::IsTuple(operand_shape)) {
+  if (!operand_shape.IsTuple()) {
     return EmitXfeedTransfer(XfeedKind::kOutfeed, operand_shape, value);
   }
 
@@ -493,6 +495,26 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
   const HloSortInstruction* sort = Cast<HloSortInstruction>(hlo);
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort));
   Shape keys_shape = sort->keys()->shape();
+  PrimitiveType keys_type = keys_shape.element_type();
+  switch (keys_type) {
+    case PRED:
+    case S8:
+    case U8:
+    case S16:
+    case U16:
+    case F16:
+    case S32:
+    case U32:
+    case F32:
+    case S64:
+    case U64:
+    case F64:
+      break;
+    default:
+      return Unimplemented(
+          "Element type %s not supported in the Sort op on CPU.",
+          PrimitiveType_Name(keys_type));
+  }
   std::vector<llvm::Value*> destination_addresses(sort->operand_count());
   for (int64 i = 0; i < sort->operand_count(); ++i) {
     ShapeIndex shape_index =
@@ -535,110 +557,106 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
     higher_dimensions *= normalized_keys_shape.dimensions(i);
   }
   int64 lower_dimensions = 1;
-  for (int64 i = ShapeUtil::Rank(normalized_keys_shape) - 1;
+  for (int64 i = normalized_keys_shape.rank() - 1;
        i > physical_dimension_to_sort; --i) {
     lower_dimensions *= normalized_keys_shape.dimensions(i);
   }
 
-  PrimitiveType keys_type = keys_shape.element_type();
-  const char* fn_name = nullptr;
-  llvm::Type* keys_native_type = nullptr;
-  switch (keys_type) {
-    case PRED:
-      fn_name = runtime::kKeyValueSortPREDSymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case S8:
-      fn_name = runtime::kKeyValueSortS8SymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case U8:
-      fn_name = runtime::kKeyValueSortU8SymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case S16:
-      fn_name = runtime::kKeyValueSortS16SymbolName;
-      keys_native_type = b_.getInt16Ty()->getPointerTo();
-      break;
-    case U16:
-      fn_name = runtime::kKeyValueSortU16SymbolName;
-      keys_native_type = b_.getInt16Ty()->getPointerTo();
-      break;
-    case F16:
-      fn_name = runtime::kKeyValueSortF16SymbolName;
-      keys_native_type = b_.getHalfTy()->getPointerTo();
-      break;
-    case S32:
-      fn_name = runtime::kKeyValueSortS32SymbolName;
-      keys_native_type = b_.getInt32Ty()->getPointerTo();
-      break;
-    case U32:
-      fn_name = runtime::kKeyValueSortU32SymbolName;
-      keys_native_type = b_.getInt32Ty()->getPointerTo();
-      break;
-    case F32:
-      fn_name = runtime::kKeyValueSortF32SymbolName;
-      keys_native_type = b_.getFloatTy()->getPointerTo();
-      break;
-    case S64:
-      fn_name = runtime::kKeyValueSortS64SymbolName;
-      keys_native_type = b_.getInt64Ty()->getPointerTo();
-      break;
-    case U64:
-      fn_name = runtime::kKeyValueSortU64SymbolName;
-      keys_native_type = b_.getInt64Ty()->getPointerTo();
-      break;
-    case F64:
-      fn_name = runtime::kKeyValueSortF64SymbolName;
-      keys_native_type = b_.getDoubleTy()->getPointerTo();
-      break;
-    default:
-      return Unimplemented(
-          "Element type %s not supported in the Sort op on CPU.",
-          PrimitiveType_Name(keys_type));
+  llvm::FunctionType* less_than_type = llvm::FunctionType::get(
+      b_.getInt1Ty(), {b_.getInt8PtrTy(), b_.getInt8PtrTy()},
+      /*isVarArg=*/false);
+  auto less_than_function = llvm_ir::CreateFunction(
+      less_than_type, llvm::GlobalValue::InternalLinkage,
+      /*enable_fast_math=*/false,
+      /*optimize_for_size=*/true, absl::StrCat(IrName(sort), "_comparator"),
+      module_);
+  // Emit the code for the less_than function.
+  {
+    llvm::IRBuilder<>::InsertPointGuard guard(b_);
+
+    auto* entry_bb =
+        llvm::BasicBlock::Create(b_.getContext(), "entry", less_than_function);
+
+    b_.SetInsertPoint(entry_bb);
+    auto keys_ir_type = llvm_ir::PrimitiveTypeToIrType(keys_type, module_);
+    CHECK_EQ(less_than_function->arg_size(), 2);
+    llvm::Value* keys_lhs_ptr = less_than_function->arg_begin();
+    keys_lhs_ptr = PointerCast(keys_lhs_ptr, keys_ir_type->getPointerTo());
+    llvm::Value* keys_rhs_ptr = less_than_function->arg_begin() + 1;
+    keys_rhs_ptr = PointerCast(keys_rhs_ptr, keys_ir_type->getPointerTo());
+
+    // TODO(b/122298745): Replace the custom compare logic with a call to the
+    // computation specified for the Sort op.
+    llvm::Value* keys_lhs = Load(keys_ir_type, keys_lhs_ptr);
+    llvm::Value* keys_rhs = Load(keys_ir_type, keys_rhs_ptr);
+    bool is_signed_comparison = true;
+    if (primitive_util::IsFloatingPointType(keys_type)) {
+      // We would like a total order of floating point numbers so that the
+      // sort has a predictable behavior in the presence of NaNs. Rather
+      // than using floating point comparison, we use the following trick:
+      // If f is a float, and
+      // x = bit_cast<int32>(f);
+      // y = x < 0 ? 0x7FFFFFFF - x : x;
+      // then y is ordered as an int32 such that finite values have the
+      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
+      // the beginning and end of the ordering.
+      auto k = b_.getInt(llvm::APInt::getSignedMaxValue(
+          keys_lhs->getType()->getPrimitiveSizeInBits()));
+      auto comparison_type = k->getType();
+      auto zero = llvm::ConstantInt::get(comparison_type, 0);
+      auto maybe_flip = [&](llvm::Value* v) {
+        return b_.CreateSelect(b_.CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
+                               b_.CreateSub(k, v), v);
+      };
+      keys_lhs = b_.CreateBitCast(keys_lhs, comparison_type);
+      keys_rhs = b_.CreateBitCast(keys_rhs, comparison_type);
+      keys_lhs = maybe_flip(keys_lhs);
+      keys_rhs = maybe_flip(keys_rhs);
+    } else if (!primitive_util::IsSignedIntegralType(keys_type)) {
+      is_signed_comparison = false;
+    }
+    llvm::Value* result =
+        b_.CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
+                                           : llvm::ICmpInst::ICMP_ULT,
+                      keys_lhs, keys_rhs);
+    llvm::ReturnInst::Create(b_.getContext(),
+                             /*retVal=*/result, entry_bb);
   }
 
   llvm::FunctionType* key_value_sort_type = llvm::FunctionType::get(
       b_.getVoidTy(),
-      {keys_native_type, b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
+      {b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
        b_.getInt8PtrTy()->getPointerTo(), b_.getInt32Ty(),
-       b_.getInt32Ty()->getPointerTo()},
+       b_.getInt32Ty()->getPointerTo(), less_than_function->getType()},
       /*isVarArg=*/false);
-  auto* key_value_sort_func = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(fn_name, key_value_sort_type));
+  auto* key_value_sort_func =
+      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+          runtime::kKeyValueSortSymbolName, key_value_sort_type));
   key_value_sort_func->setCallingConv(llvm::CallingConv::C);
   key_value_sort_func->setDoesNotThrow();
-  llvm::Value* values;
-  llvm::Value* sizes;
-  if (sort->values_count() == 0) {
-    values = llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo());
-    sizes = llvm::Constant::getNullValue(b_.getInt32Ty()->getPointerTo());
-  } else {
-    values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-        b_.getInt8PtrTy(), b_.getInt32(sort->values_count()),
-        "cc_values_alloca", &b_);
-    sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-        b_.getInt32Ty(), b_.getInt32(sort->values_count()), "cc_sizes_alloca",
-        &b_);
-    for (int64 i = 0; i < sort->values_count(); ++i) {
-      llvm::Value* value_as_i8ptr =
-          PointerCast(destination_addresses[i + 1], b_.getInt8PtrTy());
-      llvm::Value* slot_in_values_alloca =
-          ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
-      Store(value_as_i8ptr, slot_in_values_alloca);
-      llvm::Value* slot_in_sizes_alloca =
-          ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
-      llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
-          sort->operand(i + 1)->shape().element_type()));
-      Store(size, slot_in_sizes_alloca);
-    }
+  llvm::Value* values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      b_.getInt8PtrTy(), b_.getInt32(sort->operand_count()), "cc_values_alloca",
+      &b_);
+  llvm::Value* sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      b_.getInt32Ty(), b_.getInt32(sort->operand_count()), "cc_sizes_alloca",
+      &b_);
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    llvm::Value* value_as_i8ptr =
+        PointerCast(destination_addresses[i], b_.getInt8PtrTy());
+    llvm::Value* slot_in_values_alloca =
+        ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
+    Store(value_as_i8ptr, slot_in_values_alloca);
+    llvm::Value* slot_in_sizes_alloca =
+        ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
+    llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
+        sort->operand(i)->shape().element_type()));
+    Store(size, slot_in_sizes_alloca);
   }
 
   Call(key_value_sort_func,
-       {PointerCast(destination_addresses[0], keys_native_type),
-        b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
+       {b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
         b_.getInt64(lower_dimensions), values,
-        b_.getInt32(sort->values_count()), sizes});
+        b_.getInt32(sort->operand_count()), sizes, less_than_function});
 
   if (sort->values_count() > 0) {
     llvm_ir::EmitTuple(GetIrArrayFor(sort), destination_addresses, &b_,
@@ -779,8 +797,8 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   const auto init_value = select_and_scatter->operand(2);
   const Window& window = select_and_scatter->window();
   PrimitiveType operand_element_type = operand->shape().element_type();
-  const int64 rank = ShapeUtil::Rank(operand->shape());
-  CHECK_EQ(rank, ShapeUtil::Rank(source->shape()));
+  const int64 rank = operand->shape().rank();
+  CHECK_EQ(rank, source->shape().rank());
   CHECK_EQ(rank, window.dimensions_size());
 
   // TODO(b/31410564): Implement dilation for select-and-scatter.
@@ -942,12 +960,8 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   auto rhs = dot->operand(1);
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*dot, /*operands=*/{lhs, rhs},
-      /*supported_types=*/{F16, F32, F64, C64}));
+      /*supported_types=*/{F16, F32, F64, C64, C128}));
   const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
-  if (dnums.lhs_batch_dimensions_size() > 0 ||
-      dnums.rhs_batch_dimensions_size() > 0) {
-    return Unimplemented("Dot with batch dimensions not implemented.");
-  }
 
   if (dnums.lhs_contracting_dimensions_size() != 1) {
     // This is disallowed by ShapeInference today.
@@ -970,10 +984,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
           << llvm_ir::DumpToString(*target_array.GetBasePointer());
 
   // Dot operation is complicated so we delegate to a helper class.
-  return DotOpEmitter::EmitDotOperation(
-      *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr,
-      GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
-      target_machine_features_);
+  return EmitDotOperation(*dot, target_array, lhs_array, rhs_array,
+                          /*addend_array=*/nullptr,
+                          GetExecutableRunOptionsArgument(), &b_,
+                          hlo_module_config_, target_machine_features_);
 }
 
 StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
@@ -1118,7 +1132,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   auto rhs = convolution->operand(1);
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*convolution, /*operands=*/{lhs, rhs},
-      /*supported_types=*/{F16, F32, C64}));
+      /*supported_types=*/{F16, F32, C64, C128}));
 
   // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support
   // different data layouts.
@@ -1362,7 +1376,7 @@ Status IrEmitter::HandleAllReduce(HloInstruction* crs) {
                         assignment_.GetUniqueSlice(crs, {i}));
 
     const Shape& operand_shape = crs->operand(i)->shape();
-    CHECK(ShapeUtil::IsArray(operand_shape))
+    CHECK(operand_shape.IsArray())
         << "Operands to all-reduce must be arrays: " << crs->ToString();
     operand_ptrs.push_back(EmitBufferPointer(out_slice, operand_shape));
 
@@ -1399,7 +1413,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
 
   int64 delta = 0;
   for (int64 i = 0; i < operand_shape.dimensions_size(); i++) {
-    if (reduced_dims.count(i)) {
+    if (reduced_dims.contains(i)) {
       delta++;
     } else {
       InsertOrDie(&unreduced_dim_map, i, i - delta);
@@ -1412,7 +1426,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
   for (int64 operand_dim_idx = 0;
        operand_dim_idx < operand_shape.dimensions_size(); operand_dim_idx++) {
     int64 operand_dim = operand_shape.layout().minor_to_major(operand_dim_idx);
-    if (!reduced_dims.count(operand_dim)) {
+    if (!reduced_dims.contains(operand_dim)) {
       if (FindOrDie(unreduced_dim_map, operand_dim) !=
           result_shape.layout().minor_to_major(result_dim_idx++)) {
         return false;
@@ -1709,10 +1723,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
       vectorization_factor_in_bytes /
       ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
 
-  bool is_reduction_over_minor_dimension =
-      std::find(dimensions.begin(), dimensions.end(),
-                LayoutUtil::Minor(arg->shape().layout(), 0)) !=
-      dimensions.end();
+  bool is_reduction_over_minor_dimension = absl::c_linear_search(
+      dimensions, LayoutUtil::Minor(arg->shape().layout(), 0));
 
   unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
       ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
@@ -1724,7 +1736,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     return false;
   }
 
-  CHECK(!ShapeUtil::IsTuple(reduce->shape()));
+  CHECK(!reduce->shape().IsTuple());
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(reduce));
 
   // We know we're not reducing over the most minor dimension, which means we
@@ -1891,7 +1903,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
   // TODO(b/112040122): Support variadic reduce.
-  if (!ShapeUtil::IsArray(reduce->shape())) {
+  if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on CPU");
   }
   auto arg = reduce->mutable_operand(0);
@@ -1990,7 +2002,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   // The memcpy will copy elements that are logically this shape (allowed to be
   // scalar).
   const Shape logical_element_shape = ShapeUtil::FilterDimensions(
-      [&inner_dims](int64 dim) -> bool { return inner_dims.count(dim); },
+      [&inner_dims](int64 dim) { return inner_dims.contains(dim); },
       operand->shape());
 
   const int64 primitive_elements_per_logical_element =
@@ -2205,10 +2217,10 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     llvm_ir::IrArray addend_array(
         GetIrArrayFor(fusion->operand(addend_param_number)));
 
-    TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
-        *dot, target_array, lhs_array, rhs_array, &addend_array,
-        GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
-        target_machine_features_));
+    TF_RETURN_IF_ERROR(
+        EmitDotOperation(*dot, target_array, lhs_array, rhs_array,
+                         &addend_array, GetExecutableRunOptionsArgument(), &b_,
+                         hlo_module_config_, target_machine_features_));
     return Status::OK();
   } else {
     return Unimplemented("Fusion kind not implemented on CPU");
@@ -2267,14 +2279,13 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
   // Write the tuple table if the output is a tuple.
-  if (ShapeUtil::IsTuple(custom_call->shape())) {
+  if (custom_call->shape().IsTuple()) {
     std::vector<llvm::Value*> base_ptrs;
     for (int i = 0; i < ShapeUtil::TupleElementCount(custom_call->shape());
          ++i) {
       const Shape& elem_shape =
           ShapeUtil::GetTupleElementShape(custom_call->shape(), i);
-      TF_RET_CHECK(!ShapeUtil::IsTuple(elem_shape))
-          << "Nested tuples not implemented";
+      TF_RET_CHECK(!elem_shape.IsTuple()) << "Nested tuples not implemented";
       TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
                           assignment_.GetUniqueSlice(custom_call, {i}));
       llvm::Value* addr = EmitBufferPointer(slice, elem_shape);
@@ -2402,8 +2413,7 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   int64 concat_dim = concatenate->dimensions(0);
   const Layout& output_layout = output_shape.layout();
   auto output_min2maj = LayoutUtil::MinorToMajor(output_layout);
-  auto concat_dim_layout_itr =
-      std::find(output_min2maj.begin(), output_min2maj.end(), concat_dim);
+  auto concat_dim_layout_itr = absl::c_find(output_min2maj, concat_dim);
 
   std::vector<int64> inner_dims(output_min2maj.begin(), concat_dim_layout_itr);
   std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
@@ -2803,7 +2813,7 @@ llvm::Value* IrEmitter::EmitThreadLocalBufferPointer(
           llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_);
       llvm::LoadInst* param_address_untyped = Load(param_address_offset);
 
-      if (!ShapeUtil::IsOpaque(target_shape)) {
+      if (!target_shape.IsOpaque()) {
         AttachAlignmentMetadataForLoad(param_address_untyped, target_shape);
         AttachDereferenceableMetadataForLoad(param_address_untyped,
                                              target_shape);
@@ -2957,8 +2967,7 @@ Status IrEmitter::ElementTypesSameAndSupported(
 
   TF_RET_CHECK(!operands.empty());
   PrimitiveType primitive_type = operands[0]->shape().element_type();
-  if (std::find(supported_types.begin(), supported_types.end(),
-                primitive_type) == supported_types.end()) {
+  if (!absl::c_linear_search(supported_types, primitive_type)) {
     return Unimplemented("unsupported operand type %s in op %s",
                          PrimitiveType_Name(primitive_type),
                          HloOpcodeString(instruction.opcode()));
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index db76de4bb2b8ed568bf2557a30fa216d0cbe518d..974dd7cd3f2254bfbc86fffae02c06c481af8902 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -250,14 +250,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   llvm::Value* EmitBufferPointer(const BufferAllocation::Slice& slice,
                                  const Shape& target_shape);
 
-  // Emits a function into the current module. This can be used for
-  // computations embedded inside other computations, such as the
-  // function that a map operation applies.
-  StatusOr<llvm::Function*> EmitFunction(
-      HloComputation* function,  // The function to emit.
-      absl::string_view
-          function_name_suffix);  // Used for LLVM IR register names.
-
   // Emits a call to a thread local function (e.g. to the computation nested
   // within a reduce or a map).  Thread local callees (by definition) only write
   // to and read from thread local allocations.
@@ -448,7 +440,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
       computation_to_profile_idx_;
 
   // Maps HLOs to Values emitted for them.
-  std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;
+  absl::flat_hash_map<const HloInstruction*, llvm::Value*> emitted_value_;
 
   llvm_ir::AliasAnalysis alias_analysis_;
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index f8441c3e345504616485c6b34b4302acd5cc23a3..a6f4273a5a70aab0bc88383283d2a55b1ecb1681 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -34,7 +34,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
                                                    llvm::Type* index_type) {
   CHECK_NE(index_type, nullptr);
 
-  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!shape_.IsTuple());
   CHECK(!ShapeUtil::IsScalar(shape_));
 
   llvm_ir::ForLoopNest loop_nest(loop_name, b_);
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index ede7f433ca6b2cc5629115f800348be9dfb2b93b..6121d1ca9a5c785cedd947200d3e7e320aa06bc2 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -146,11 +146,9 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
       (opcode == HloOpcode::kConvolution &&
        PotentiallyImplementedAsEigenConvolution(*instruction,
                                                 target_machine_features_)) ||
-      PotentiallyImplementedAsEigenDot(*instruction,
-                                       target_machine_features_) ||
       (opcode == HloOpcode::kFusion &&
        instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
-      ShapeUtil::IsTuple(instruction->shape())) {
+      instruction->shape().IsTuple()) {
     return 1;
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index 722aa3120ef4d8c957873ac58c361f19632dde1f..a0667d0d9d1cde246f4b74626859955beeec08b0 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -15,12 +15,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstring>
-#include <limits>
 #include <memory>
+#include <numeric>
 #include <string>
-#include <utility>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/platform/dynamic_annotations.h"
@@ -28,80 +26,14 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace {
-using tensorflow::int16;
 using tensorflow::int32;
 using tensorflow::int64;
-using tensorflow::int8;
-using tensorflow::uint16;
-using tensorflow::uint32;
-using tensorflow::uint64;
-using tensorflow::uint8;
-
-template <typename KeyType>
-void KeyValueSort(std::pair<KeyType, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements);
-}
-
-// We would like a total order of floating point numbers so that the
-// sort has a predictable behavior in the presence of NaNs. Rather
-// than using floating point comparison, we use the following trick:
-// If f is a float, and
-// x = bit_cast<int32>(f);
-// y = x < 0 ? 0x7FFFFFFF - x : x;
-// then y is ordered as an int32 such that finite values have the
-// obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-// the beginning and end of the ordering.
-template <typename CastType, typename UnsignedCastType, typename KeyType>
-CastType Convert(KeyType value) {
-  CastType casted_value;
-  memcpy(&casted_value, &value, sizeof(CastType));
-  if (casted_value < 0) {
-    return static_cast<UnsignedCastType>(std::numeric_limits<CastType>::max()) -
-           casted_value;
-  }
-  return casted_value;
-}
-
-template <typename CastType, typename UnsignedCastType, typename KeyType>
-bool LessThan(KeyType lhs, KeyType rhs) {
-  return Convert<CastType, UnsignedCastType>(lhs) <
-         Convert<CastType, UnsignedCastType>(rhs);
-}
-
-template <>
-void KeyValueSort(std::pair<double, int64>* row_to_sort, int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<double, int64>& lhs,
-                      const std::pair<double, int64>& rhs) -> bool {
-                     return LessThan<int64, uint64>(lhs.first, rhs.first);
-                   });
-}
-
-template <>
-void KeyValueSort(std::pair<float, int64>* row_to_sort, int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<float, int64>& lhs,
-                      const std::pair<float, int64>& rhs) -> bool {
-                     return LessThan<int32, uint32>(lhs.first, rhs.first);
-                   });
-}
-
-template <>
-void KeyValueSort(std::pair<Eigen::half, int64>* row_to_sort,
-                  int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<Eigen::half, int64>& lhs,
-                      const std::pair<Eigen::half, int64>& rhs) -> bool {
-                     return LessThan<int32, uint32>(
-                         Eigen::half_impl::half_to_float(lhs.first),
-                         Eigen::half_impl::half_to_float(rhs.first));
-                   });
-}
+}  // namespace
 
-template <typename KeyType>
-void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
-                      int32 values_count,
-                      int32* values_primitive_type_size_in_bytes) {
+TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort(
+    int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes,
+    bool (*less_than)(char*, char*)) {
   // 'values' and 'values_primitive_type_size_in_bytes' are managed by the JIT
   // code, so msan can't tell they are initialized.
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, values_count * sizeof(char*));
@@ -121,8 +53,8 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
   int64 num_iteration_elements = a * c;
   int64 sort_dimension_offset = c;
 
-  std::unique_ptr<std::pair<KeyType, int64>[]> row_to_sort(
-      new std::pair<KeyType, int64>[sort_dimension_elements]);
+  std::unique_ptr<int64[]> indices(new int64[sort_dimension_elements]);
+  std::iota(indices.get(), indices.get() + sort_dimension_elements, 0);
   std::unique_ptr<std::string[]> reordered_values(
       new std::string[sort_dimension_elements]);
   for (int64 index = 0; index < num_iteration_elements; ++index) {
@@ -135,24 +67,22 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
     int64 base_offset =
         index % sort_dimension_offset +
         (index - index % sort_dimension_offset) * sort_dimension_elements;
-    // TODO(b/26783907): We could define a custom iterator class that references
-    // all arrays. Then we could avoid the intermediate copy. However this
-    // would become more complicated, and it is not clear if the benefit is high
-    // enough.
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      row_to_sort[i] =
-          std::make_pair(keys[base_offset + i * sort_dimension_offset], i);
-    }
-    KeyValueSort(row_to_sort.get(), sort_dimension_elements);
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      keys[base_offset + i * sort_dimension_offset] = row_to_sort[i].first;
-    }
-
-    // Reorder the values according to the order defined by the keys.
+    std::stable_sort(
+        indices.get(), indices.get() + sort_dimension_elements,
+        [&](int64 a, int64 b) {
+          int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) *
+                                   values_primitive_type_size_in_bytes[0];
+          int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) *
+                                   values_primitive_type_size_in_bytes[0];
+          return less_than(values[0] + memory_index_lhs,
+                           values[0] + memory_index_rhs);
+        });
+
+    // Reorder the values according to the order defined by 'indices'.
     for (int32 idx = 0; idx < values_count; ++idx) {
       for (int64 i = 0; i < sort_dimension_elements; ++i) {
         int64 memory_index =
-            (base_offset + row_to_sort[i].second * sort_dimension_offset) *
+            (base_offset + indices[i] * sort_dimension_offset) *
             values_primitive_type_size_in_bytes[idx];
 
         reordered_values[i] =
@@ -168,88 +98,3 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
     }
   }
 }
-}  // namespace
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS8(
-    int8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU8(
-    uint8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS16(
-    int16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU16(
-    uint16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, int64 a, int64 b, int64 c, char** values,
-    int32 values_count, int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS32(
-    int32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU32(
-    uint32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS64(
-    int64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU64(
-    uint64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
index 7821099386969e855ea1737cf53ef49c15c6e93b..5460af3485b94aaef1a5822a79e4fa325bcb67ea 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
@@ -21,76 +21,19 @@ limitations under the License.
 
 extern "C" {
 
-// 'keys' represents a 3-dimensional shape with dimensions [a, b, c]. The 'b'
-// dimension of 'keys' is sorted into ascending order. If 'values_count' is <=
-// 0, 'values' and 'values_primitive_type_size_in_bytes' can be nullptr.
-// If 'values_count' > 0, they contain exactly 'values_count' many elements.
-// Each element of 'values' also represents a 3-dimensional shape with
-// dimensions [a, b, c], and the size of the primitive type of the i-th shape
-// has exactly 'values_primitive_type_size_in_bytes[i]' bytes. The elements in
-// each 'values' shape are reordered in such a way that if the element at index
-// 'i' in 'keys' was moved to index 'j', the element at index 'i' in a 'values'
-// shape is also moved to index 'j' (which means that the same elements
-// correspond to each other as before).
-extern void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
+// Each entry in 'values' represents a 3-dimensional shape with dimensions
+// [a, b, c]. The 'b' dimension of the first shape is sorted into ascending
+// order according to the results of comparisons using the provided 'less_than'
+// function. 'values_count' must be > 0 and specifies the number of entries in
+// 'values' and 'values_primitive_type_size_in_bytes'. The size of the primitive
+// type of the i-th shape has exactly 'values_primitive_type_size_in_bytes[i]'
+// bytes. The elements in each 'values' shape are reordered in the same way
+// according to the comparisons using the first shape.
+extern void __xla_cpu_runtime_KeyValueSort(
+    tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
     char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS8(
-    tensorflow::int8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU8(
-    tensorflow::uint8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS16(
-    tensorflow::int16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU16(
-    tensorflow::uint16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS32(
-    tensorflow::int32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU32(
-    tensorflow::uint32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS64(
-    tensorflow::int64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU64(
-    tensorflow::uint64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
+    tensorflow::int32* values_primitive_type_size_in_bytes,
+    bool (*less_than)(char*, char*));
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_KEY_VALUE_SORT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
index 1ed743afc30af7c7ff38c7d2a738f2e376270952..1f7204e67a413efabd34cd7d88ced4c82ee7a5df 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
@@ -20,6 +20,10 @@ limitations under the License.
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 using tensorflow::int32;
 using tensorflow::int64;
 
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 296f39a4853f2d3f7030209a921001e92c39d609..9c2685674fbc133de1220caef81ac3b60a1c0f7c 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -116,13 +116,26 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                 orc_jit_memory_mapper::GetInstance());
             result.Resolver = symbol_resolver_;
             return result;
+          },
+          /*NotifyLoaded=*/
+          llvm::orc::LegacyRTDyldObjectLinkingLayer::NotifyLoadedFtor(),
+          /*NotifyFinalized=*/
+          [this](VModuleKeyT, const llvm::object::ObjectFile& object,
+                 const llvm::RuntimeDyld::LoadedObjectInfo& object_info) {
+            this->NotifyObjectFinalized(object, object_info);
+          },
+          /*NotifyFreed=*/
+          [this](VModuleKeyT, const llvm::object::ObjectFile& object) {
+            this->NotifyObjectFreed(object);
           }),
       compile_layer_(object_layer_,
                      CompilerFunctor(target_machine_.get(), &disassembler_,
                                      opt_level, optimize_for_size,
                                      enable_fast_math, disable_expensive_passes,
                                      std::move(pre_optimization_hook),
-                                     std::move(post_optimization_hook))) {
+                                     std::move(post_optimization_hook))),
+      gdb_jit_event_listener_(
+          llvm::JITEventListener::createGDBRegistrationListener()) {
   VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
           << " features: " << target_machine_->getTargetFeatureString().str();
 }
@@ -147,6 +160,20 @@ llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) {
   return symbol_info;
 }
 
+void SimpleOrcJIT::NotifyObjectFinalized(
+    const llvm::object::ObjectFile& object,
+    const llvm::RuntimeDyld::LoadedObjectInfo& object_info) {
+  uint64_t key = static_cast<uint64_t>(
+      reinterpret_cast<uintptr_t>(object.getData().data()));
+  gdb_jit_event_listener_->notifyObjectLoaded(key, object, object_info);
+}
+
+void SimpleOrcJIT::NotifyObjectFreed(const llvm::object::ObjectFile& object) {
+  uint64_t key = static_cast<uint64_t>(
+      reinterpret_cast<uintptr_t>(object.getData().data()));
+  gdb_jit_event_listener_->notifyFreeingObject(key);
+}
+
 SimpleOrcJIT::VModuleKeyT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
   auto key = execution_session_.allocateVModule();
@@ -213,18 +240,7 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortPRED);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS8);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU8);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS64);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU64);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSort);
 
   registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee));
   registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee));
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 78406ba143570183aea09d79db3f9b708c21bf70..3307c2f93d796bbdcd49af7f68e9f6c388e402ca 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
 #include "llvm/ExecutionEngine/Orc/Core.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
@@ -99,6 +100,11 @@ class SimpleOrcJIT {
  private:
   llvm::JITSymbol ResolveRuntimeSymbol(const std::string& name);
 
+  void NotifyObjectFinalized(
+      const llvm::object::ObjectFile& object,
+      const llvm::RuntimeDyld::LoadedObjectInfo& object_info);
+  void NotifyObjectFreed(const llvm::object::ObjectFile& object);
+
   std::vector<VModuleKeyT> module_keys_;
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
@@ -107,6 +113,15 @@ class SimpleOrcJIT {
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
   CompileLayerT compile_layer_;
+
+  // Non owning pointer to a JIT event listener that registers the JIT events
+  // with an attached GDB.
+  //
+  // Note: we get a pointer to this event listener using
+  // `createGDBRegistrationListener` which makes it look like we're supposed to
+  // free this, but the function is poorly named and really just returns a
+  // pointer to a static object.
+  llvm::JITEventListener* gdb_jit_event_listener_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index f8f5f392da8ab3348e63185aecf7b639daacaa42..8b7f843582b697058fe328fe69990122d868ada4 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 // Tests that we call into Eigen for dot operations as needed.
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
 #include "absl/strings/str_cat.h"
@@ -102,10 +101,10 @@ std::vector<DotTestSpec> GetDotTestCases() {
   return result;
 }
 
-INSTANTIATE_TEST_CASE_P(CpuEigenDotOperationTestInstantiation,
-                        CpuEigenDotOperationTest,
-                        ::testing::ValuesIn(GetDotTestCases()),
-                        DotTestSpecToString);
+INSTANTIATE_TEST_SUITE_P(CpuEigenDotOperationTestInstantiation,
+                         CpuEigenDotOperationTest,
+                         ::testing::ValuesIn(GetDotTestCases()),
+                         DotTestSpecToString);
 
 }  // namespace
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
index 5cc6d01c0f15d4209cbc1fb259a0078fb9957f6e..f0f897e9635600b22e0c389ba056899e4d6ab3d4 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
@@ -48,7 +48,7 @@ class InfeedTest : public ClientLibraryTestBase {
     ASSERT_IS_OK(client_->TransferToInfeed(literal));
     XlaBuilder builder(TestName());
     Infeed(&builder, literal.shape());
-    if (ShapeUtil::IsTuple(literal.shape())) {
+    if (literal.shape().IsTuple()) {
       // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
       ComputeAndCompareTuple(&builder, literal, {});
     } else {
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index 9b10c49f4f547edfb2164f98c49cceb031148bdc..9078b8fd1ff6cb0ddac89d5fcd13a9ccfae07763 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
@@ -59,8 +59,9 @@ class CpuUnaryIntrinsicTest
 
     string features{spec.features.data(), spec.features.size()};
     if (!features.empty()) {
-      std::replace_if(features.begin(), features.end(),
-                      [](char c) { return c != '_' && !isalnum(c); }, '_');
+      std::replace_if(
+          features.begin(), features.end(),
+          [](char c) { return c != '_' && !absl::ascii_isalnum(c); }, '_');
     } else {
       features = "";
     }
@@ -140,10 +141,10 @@ IntrinsicTestSpec CpuUnaryIntrinsicTestCases[] = {
         HloOpcode::kLog, kTriple_android_arm, "",
         R"(CHECK: fadd fast <4 x float> <float 0x3FBDE4A340000000, float 0x3FBDE4A340000000, float 0x3FBDE4A340000000, float 0x3FBDE4A340000000>)"}};
 
-INSTANTIATE_TEST_CASE_P(CpuUnaryIntrinsicTestInstantiation,
-                        CpuUnaryIntrinsicTest,
-                        ::testing::ValuesIn(CpuUnaryIntrinsicTestCases),
-                        CpuUnaryIntrinsicTest::Name);
+INSTANTIATE_TEST_SUITE_P(CpuUnaryIntrinsicTestInstantiation,
+                         CpuUnaryIntrinsicTest,
+                         ::testing::ValuesIn(CpuUnaryIntrinsicTestCases),
+                         CpuUnaryIntrinsicTest::Name);
 
 }  // namespace
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb6c44b70ab34d0a294880b5de4fe0b3ba5e19e5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
@@ -0,0 +1,1014 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h"
+
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+using tensorflow::int64;
+
+// Provides tiled access to an in-memory rank 2 array.
+class MemoryTile {
+ public:
+  // Constructs a MemoryTile that can operate on tiles consisting of
+  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
+  // `major_dim_offset` in the major dimension.  The tile size along the minor
+  // dimension is the vector size, and that is implicitly determined by `vsl`.
+  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
+             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
+             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
+      : vsl_(vsl), b_(b) {
+    pointers_.reserve(tile_size_along_major_dim);
+    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
+      llvm::Value* total_offset =
+          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
+                       b->CreateAdd(b->getInt64(i), major_dim_offset));
+      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
+    }
+  }
+
+  // Load a tile consisting of `tile_size_along_major_dim` vectors from position
+  // {major: `major_dim_offset`, minor: `minor_dim_offset`}.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
+    std::vector<llvm::Value*> result;
+    result.reserve(pointers_.size());
+    for (const auto& pointer : pointers_) {
+      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
+    }
+    return result;
+  }
+
+  // Stores `tile` to position {major: `major_dim_offset`, minor:
+  // `minor_dim_offset`}.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  void StoreTile(absl::Span<llvm::Value* const> tile,
+                 llvm::Value* minor_dim_offset) const {
+    CHECK_EQ(tile.size(), pointers_.size());
+    for (int64 i = 0; i < pointers_.size(); i++) {
+      vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset);
+    }
+  }
+
+  // Loads a tile of size [`tile_size_along_major_dim`,
+  // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`,
+  // minor: `minor_dim_offset`} and then broadcasts each element into a vector
+  // of size vsl_.vector_size().  The (i,j)'th element of the return value is
+  // the (i,j)'th element in the tile broadcasted into an LLVM vector.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  std::vector<std::vector<llvm::Value*>> LoadBroadcastTile(
+      llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const {
+    std::vector<std::vector<llvm::Value*>> result;
+    result.resize(pointers_.size());
+    for (int64 i = 0; i < pointers_.size(); i++) {
+      for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
+        result[i].push_back(vsl_->LoadBroadcast(
+            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
+      }
+    }
+    return result;
+  }
+
+ private:
+  VectorSupportLibrary* vsl_;
+  llvm::IRBuilder<>* b_;
+  std::vector<llvm::Value*> pointers_;
+};
+
+// The base class for the classes representing the GEMV emitter configurations.
+//
+// The IR emitted (modulo the LLVM values representing the input and output
+// buffers) by the row major and column major GEMV emitters should be a function
+// of their configuration.  This is important because their configuration is
+// used as a key to cache the generated IR.
+class GemvConfig {
+ public:
+  // Mixin for convenience.
+  template <typename T>
+  struct User {
+   public:
+    PrimitiveType scalar_type() const {
+      return derived().config().scalar_type();
+    }
+    int64 tile_rows() const { return derived().config().tile_rows(); }
+    int64 tile_cols() const { return derived().config().tile_cols(); }
+    int64 m() const { return derived().config().m(); }
+    int64 k() const { return derived().config().k(); }
+    int64 has_addend() const { return derived().config().has_addend(); }
+
+   private:
+    const T& derived() const { return *static_cast<const T*>(this); }
+  };
+
+  PrimitiveType scalar_type() const { return scalar_type_; }
+  int64 tile_rows() const { return tile_rows_; }
+  int64 tile_cols() const { return tile_cols_; }
+  int64 m() const { return m_; }
+  int64 k() const { return k_; }
+  bool has_addend() const { return has_addend_; }
+
+  string GetCacheKey() const {
+    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
+                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
+                        has_addend() ? "_with_addend" : "");
+  }
+
+ protected:
+  explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows,
+                      int64 tile_cols, int64 m, int64 k, bool has_addend)
+      : name_(std::move(name)),
+        scalar_type_(scalar_type),
+        tile_rows_(tile_rows),
+        tile_cols_(tile_cols),
+        m_(m),
+        k_(k),
+        has_addend_(has_addend) {}
+
+ private:
+  string name_;
+  PrimitiveType scalar_type_;
+  int64 tile_rows_;
+  int64 tile_cols_;
+  int64 m_;
+  int64 k_;
+  bool has_addend_;
+};
+
+// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+       +--+--+--+--+
+//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//
+// (Legend: rows are horizontal and columns are vertical; and each column is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is from the column major left matrix.
+//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
+//      vector loaded from the RHS vector.
+//
+// As we iterate through the column dimension, we compute the change to the
+// result vector by an elementwise multiplication between the two tiles above
+// followed by a reduction along the major dimension:
+//
+//                     +-----------------------------------+
+//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
+//                     +-----------------------------------+
+//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
+// Result[R:R+4] +=    +-----------------------------------+
+//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
+//                     +-----------------------------------+
+//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
+//                     +-----------------------------------+
+//
+// Where R is the starting row for the tile.
+//
+// We have an inner epilogue loop to deal with the "C" submatrix and an outer
+// epilogue loop to deal with the B,D submarix.
+//
+// TODO(sanjoy): We should investigate if using gather loads and scatter stores
+// can be used here have the same inner loop for both column-major and row-major
+// matrix-vector products.
+class ColumnMajorMatrixVectorProductEmitter
+    : public GemvConfig::User<ColumnMajorMatrixVectorProductEmitter> {
+ public:
+  class Config : public GemvConfig {
+   public:
+    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
+                    int64 m, int64 k, bool has_addend)
+        : GemvConfig(/*name=*/"col_major_gemv", scalar_type,
+                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
+                     /*k=*/k, /*has_addend=*/has_addend) {}
+  };
+
+  ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
+                                        llvm::Value* rhs, llvm::Value* addend,
+                                        llvm::Value* result,
+                                        llvm::IRBuilder<>* b)
+      : config_(config),
+        lhs_(lhs),
+        rhs_(rhs),
+        addend_(addend),
+        result_(result),
+        b_(b),
+        ksl_(b_),
+        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
+    CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
+    CHECK(!has_addend() || addend != nullptr);
+  }
+
+  void Emit();
+
+  const Config& config() const { return config_; }
+
+ private:
+  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
+                         bool is_first_column);
+
+  MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/m(),
+                      /*major_dim_offset=*/column_start,
+                      /*tile_size_along_major_dim=*/column_count);
+  }
+
+  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
+  // sequence of `count` values, each one broadcasted to the vector width.
+  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
+    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
+    std::vector<llvm::Value*> result;
+    result.reserve(count);
+    for (int64 i = 0; i < count; i++) {
+      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
+    }
+    return result;
+  }
+
+  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile,
+                          const std::vector<llvm::Value*>& rhs_tile,
+                          int64 columns, bool is_first_column);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
+                             bool is_first_tiled_column);
+
+  Config config_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* addend_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
+    llvm::Value* column, int64 column_count, bool is_first_column) {
+  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column,
+                                                /*column_count=*/column_count);
+
+  std::vector<llvm::Value*> rhs_tile =
+      LoadRhsTile(column, /*count=*/column_count);
+  EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile,
+                     /*columns=*/column_count, is_first_column);
+  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
+}
+
+void ColumnMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 column_remainder = k() % tile_cols();
+  int64 column_limit = k() - column_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
+           [&](llvm::Value* column, bool is_first_column) {
+             EmitOuterLoopBody(column, tile_cols(), is_first_column);
+           });
+
+  if (column_remainder != 0) {
+    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
+                      column_limit == 0);
+  }
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    MemoryTile* lhs_memory_tile, const std::vector<llvm::Value*>& rhs_tile,
+    int64 columns, bool is_first_column) {
+  int64 row_limit = m() - (m() % tile_rows());
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
+           /*step=*/tile_rows(), [&](llvm::Value* row) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
+             llvm::Value* accumulator =
+                 is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
+                                            : vsl_.GetZeroVector())
+                                 : vsl_.LoadVector(result_, row);
+             for (int i = 0; i < columns; i++) {
+               accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
+             }
+             vsl_.StoreVector(accumulator, result_, row);
+           });
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
+  int64 row_start = m() - (m() % tile_rows());
+  if (row_start == m()) {
+    return;
+  }
+
+  llvm::Value* columns_llvm = b_->getInt64(columns);
+
+  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
+  //   for (row = row_start, row < m_; row++) {
+  //     result[row] += lhs[row, col] * rhs[col]
+  //     // Also take into account that if col is 0 then result[row] is not
+  //     // initialized.
+  //   }
+
+  ksl_.For(
+      "dot.inner.epilg.outer", /*start=*/current_tile_col,
+      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
+      /*step=*/1, /*peel_first_iteration=*/false,
+      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
+        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
+        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
+        llvm::Value* lhs_base_pointer =
+            vsl_.ComputeOffsetPointer(lhs_, total_offset);
+        ksl_.For(
+            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
+            /*step=*/1, [&](llvm::Value* scalar_row) {
+              llvm::Value* product = vsl_.Mul(
+                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
+              llvm::Value* setting_result_first_time = b_->CreateAnd(
+                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
+              ksl_.If(
+                  setting_result_first_time,
+                  /*true_block_generator=*/
+                  [&]() {
+                    if (addend_) {
+                      vsl_.StoreScalar(
+                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
+                                   product),
+                          result_, scalar_row);
+                    } else {
+                      vsl_.StoreScalar(product, result_, scalar_row);
+                    }
+                  },
+                  /*false_block_generator=*/
+                  [&]() {
+                    vsl_.StoreScalar(
+                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
+                        result_, scalar_row);
+                  });
+            });
+      });
+}
+
+// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+
+//   |M00|M10|M20|M30|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|
+//   +---+---+---+---+
+//   |M03|M13|M23|M33|
+//   +---+---+---+---+
+//
+// (Legend: rows are horizontal and columns are vertical; and each row is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is loaded from the row major left matrix.
+//   b. The right vector is loaded from the RHS vector.
+//
+// We keep 4 vector accumulators accumulating the following four vector
+// expressions as we iterate over the row dimension:
+//
+//   +------+------+------+------+
+//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
+//   +------+------+------+------+
+//
+// In the end we do a horizontal reduction over these 4 vector accumulators to
+// get 4 values in the result vector.
+//
+// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
+// epilogue loop to deal with the C,D submatrix.
+class RowMajorMatrixVectorProductEmitter
+    : public GemvConfig::User<RowMajorMatrixVectorProductEmitter> {
+ public:
+  class Config : public GemvConfig {
+   public:
+    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
+                    int64 m, int64 k, bool has_addend)
+        : GemvConfig(/*name=*/"row_major_gemv", scalar_type,
+                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
+                     /*k=*/k, /*has_addend=*/has_addend) {}
+  };
+
+  RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
+                                     llvm::Value* rhs, llvm::Value* addend,
+                                     llvm::Value* result, llvm::IRBuilder<>* b)
+      : config_(config),
+        lhs_(lhs),
+        rhs_(rhs),
+        addend_(addend),
+        result_(result),
+        b_(b),
+        ksl_(b_),
+        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
+    CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
+    CHECK(!has_addend() || addend != nullptr);
+  }
+
+  void Emit();
+
+  const Config& config() const { return config_; }
+
+ private:
+  MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/k(),
+                      /*major_dim_offset=*/row_start,
+                      /*tile_size_along_major_dim=*/row_count);
+  }
+
+  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
+
+  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows,
+                          std::vector<VectorVariable>* vector_accumulators);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
+                             std::vector<ScalarVariable>* scalar_accumulators);
+
+  Config config_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* addend_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
+                                                           int64 row_count) {
+  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row,
+                                                /*row_count=*/row_count);
+  std::vector<VectorVariable> vector_accumulators;
+  std::vector<ScalarVariable> scalar_accumulators;
+  for (int i = 0; i < row_count; i++) {
+    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
+    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
+  }
+  EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count,
+                     &vector_accumulators);
+  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
+                        &scalar_accumulators);
+
+  std::vector<llvm::Value*> accumulator_values;
+  std::transform(
+      vector_accumulators.begin(), vector_accumulators.end(),
+      std::back_inserter(accumulator_values),
+      [](const VectorVariable& vector_var) { return vector_var.Get(); });
+
+  std::vector<llvm::Value*> horizontal_sums;
+  if (row_count == vsl_.vector_size()) {
+    if (addend_) {
+      horizontal_sums = vsl_.ComputeHorizontalSums(
+          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
+    } else {
+      horizontal_sums =
+          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+    }
+  } else {
+    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+  }
+
+  for (int i = 0; i < row_count; i++) {
+    llvm::Value* result_value =
+        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
+    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
+    if (addend_ && row_count != vsl_.vector_size()) {
+      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
+    }
+    vsl_.StoreScalar(result_value, result_, offset);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 row_remainder = m() % tile_rows();
+  int64 row_limit = m() - row_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
+           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
+
+  if (row_remainder != 0) {
+    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    MemoryTile* lhs_memory_tile, int64 rows,
+    std::vector<VectorVariable>* vector_accumulators) {
+  int64 column_limit = k() - (k() % tile_cols());
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
+           /*step=*/tile_cols(), [&](llvm::Value* col) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
+             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
+             for (int i = 0; i < rows; i++) {
+               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
+               (*vector_accumulators)[i].Set(
+                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
+             }
+           });
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_row, int64 rows,
+    std::vector<ScalarVariable>* scalar_accumulators) {
+  int64 column_start = k() - (k() % tile_cols());
+  if (column_start == k()) {
+    return;
+  }
+
+  for (int r = 0; r < rows; r++) {
+    llvm::Value* total_offset = b_->CreateMul(
+        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
+    llvm::Value* lhs_base_pointer =
+        vsl_.ComputeOffsetPointer(lhs_, total_offset);
+    ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
+             /*step=*/1, [&](llvm::Value* scalar_col) {
+               llvm::Value* product =
+                   vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
+                            vsl_.LoadScalar(rhs_, scalar_col));
+               llvm::Value* old_value = (*scalar_accumulators)[r].Get();
+               (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
+             });
+  }
+}
+
+// This class implements a tiled matrix multiplication algorithm, intended for
+// multiplying small matrices that don't need cache tiling.
+//
+// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
+// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
+// high-performance matrix multiplication." ACM Transactions on Mathematical
+// Software (TOMS) 34.3 (2008): 12.".
+//
+// This only supports canonical dot operations (i.e. where the lhs contraction
+// dimension is 1 and the rhs contraction dimension is 0) over row major
+// matrices.
+class TiledSmallGemmEmitter {
+ public:
+  // Describe the dimensions of the kernel.
+  class Dimensions {
+   public:
+    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+
+    int64 m() const { return m_; }
+    int64 k() const { return k_; }
+    int64 n() const { return n_; }
+
+    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
+
+   private:
+    const int64 m_;
+    const int64 k_;
+    const int64 n_;
+  };
+
+  // Represents the configuration of the emitter.  The LLVM IR emitted by the
+  // emitter, modulo the LLVM values holding the input and output buffers, must
+  // be a function of the instance of `Config` passed to it.
+  //
+  // `dims` holds the matrix multiplication dimensions.
+  //
+  // `max_vectorization_width` is the maximum vector width (i.e. the width of
+  // the largest vector register we will use).  This can be larger than the
+  // largest vector register supported by the machine -- LLVM will legalize
+  // these large vector widths into legally sized vectors.
+  //
+  // `max_vector_count` is the maximum number of vectors of size
+  // `max_vectorization_width` that we will attempt to process at once.
+  //
+  // `min_vectorization_width` is the smallest vector width the emitter will use
+  // -- below that it will devolve to using a scalar loop.
+  //
+  // The innermost reduction loop executes the matrix multiply in tiles of size
+  // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`,
+  // <vectorization width>] in the RHS.
+  class Config {
+   public:
+    explicit Config(PrimitiveType scalar_type, Dimensions dims,
+                    int64 max_vectorization_width, int64 max_vector_count,
+                    int64 min_vectorization_width, int64 tile_size_m,
+                    int64 tile_size_k)
+        : scalar_type_(scalar_type),
+          dims_(dims),
+          max_vectorization_width_(max_vectorization_width),
+          max_vector_count_(max_vector_count),
+          min_vectorization_width_(min_vectorization_width),
+          tile_size_m_(tile_size_m),
+          tile_size_k_(tile_size_k) {}
+
+    string GetCacheKey() const {
+      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
+                          dims().ToString(), "_", max_vectorization_width(),
+                          "_", min_vectorization_width(), "_", tile_size_m(),
+                          "_", tile_size_k());
+    }
+
+    PrimitiveType scalar_type() const { return scalar_type_; }
+    Dimensions dims() const { return dims_; }
+    int64 max_vectorization_width() const { return max_vectorization_width_; }
+    int64 max_vector_count() const { return max_vector_count_; }
+    int64 min_vectorization_width() const { return min_vectorization_width_; }
+
+    int64 tile_size_m() const { return tile_size_m_; }
+    int64 tile_size_k() const { return tile_size_k_; }
+
+   private:
+    PrimitiveType scalar_type_;
+    Dimensions dims_;
+    int64 max_vectorization_width_;
+    int64 max_vector_count_;
+    int64 min_vectorization_width_;
+    int64 tile_size_m_;
+    int64 tile_size_k_;
+  };
+
+  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
+  // `lhs` with `rhs` and stores the result in `result`.
+  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
+                                 llvm::Value* rhs, llvm::Value* result,
+                                 llvm::IRBuilder<>* b)
+      : lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        config_(config),
+        b_(b),
+        ksl_(b_) {
+    CHECK(max_vectorization_width() > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
+    CHECK_GT(max_vector_count(), 0);
+    CHECK(min_vectorization_width() > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
+    CHECK_GE(max_vectorization_width(), min_vectorization_width());
+    CHECK_GT(tile_size_k(), 0);
+  }
+
+  void Emit();
+
+ private:
+  // The HandleResiduesOnX helpers split the iteration space for dimension X
+  // into a multiple of the tile size on dimension X and an epilogue.  These
+  // helpers ultimately call into `EmitTiledGemm` for emitting the
+  // tiled GEMM kernel.
+
+  void HandleResiduesOnN();
+  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
+                         llvm::Value* n_end);
+  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
+                         llvm::Value* k_start, llvm::Value* k_end,
+                         llvm::Value* n_start, llvm::Value* n_end);
+
+  // This emits a tiled GEMM kernel.  For a detailed description see the comment
+  // on the implementation.
+  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
+                     llvm::Value* k_start, llvm::Value* k_end,
+                     llvm::Value* n_start, llvm::Value* n_end,
+                     int64 tile_size_m, llvm::Value* m_start,
+                     llvm::Value* m_end);
+
+  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
+
+  Config config() const { return config_; }
+  Dimensions dims() const { return config().dims(); }
+
+  int64 max_vectorization_width() const {
+    return config().max_vectorization_width();
+  }
+  int64 max_vector_count() const { return config().max_vector_count(); }
+  int64 min_vectorization_width() const {
+    return config().min_vectorization_width();
+  }
+  int64 tile_size_m() const { return config().tile_size_m(); }
+  int64 tile_size_k() const { return config().tile_size_k(); }
+  PrimitiveType scalar_type() const { return config().scalar_type(); }
+
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  Config config_;
+
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+};
+
+void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
+
+void TiledSmallGemmEmitter::HandleResiduesOnN() {
+  // We can only iterate the `n` dimension for an extent that is divisible by
+  // the vectorization width.  So we emit an outer loop that first processes the
+  // largest extent in `n` that is divisible by max_vectorization_width, then
+  // the largest remaining extent that is divisible by max_vectorization_width /
+  // 2 etc.
+
+  int64 current_vectorization_width =
+      max_vector_count() * max_vectorization_width();
+  int64 current_vector_count = max_vector_count();
+
+  int64 n_start = 0;
+  while (n_start != dims().n() &&
+         current_vectorization_width >= min_vectorization_width()) {
+    int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
+    if (n_start != n_end) {
+      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
+                               "gemm");
+      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
+      n_start = n_end;
+    }
+    if (current_vector_count == 1) {
+      current_vectorization_width /= 2;
+    } else {
+      current_vector_count--;
+      current_vectorization_width =
+          current_vector_count * max_vectorization_width();
+    }
+  }
+
+  if (n_start != dims().n()) {
+    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
+    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
+      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
+      HandleResiduesOnK(&vsl, n_i, n_i_next);
+    });
+  }
+}
+
+void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
+                                              llvm::Value* n_start,
+                                              llvm::Value* n_end) {
+  int64 k_start = 0;
+  int64 k_end = dims().k() - (dims().k() % tile_size_k());
+  if (k_end != k_start) {
+    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
+                      n_start, n_end);
+    k_start = k_end;
+  }
+
+  if (k_start != dims().k()) {
+    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
+                      GetInt64(dims().k()), n_start, n_end);
+  }
+}
+
+void TiledSmallGemmEmitter::HandleResiduesOnM(
+    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
+    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
+  const int64 m_end = dims().m() - dims().m() % tile_size_m();
+  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
+                GetInt64(0), GetInt64(m_end));
+
+  if (m_end != dims().m()) {
+    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
+                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
+  }
+}
+
+// The loop structure is:
+//
+// Iterate over dimension M as m:
+//   Iterate over dimension N as n:
+//     Iterate over dimension K as k:
+//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
+//
+// I.e. a just a tiled version of a "naive" GEMM.
+//
+// The tiling scheme is as follows:
+//
+// Let the LHS be:
+//
+//   +----+----+----+
+//   | a0 | b0 | c0 | .
+//   +----+----+----+ .
+//   | a1 | b1 | c1 | .
+//   +----+----+----+
+//     ..     ..
+//
+// and the RHS be:
+//
+//   +----+----+----+----+
+//   | p0 | p1 | p2 | p3 | .
+//   +----+----+----+----+ .
+//   | q0 | q1 | q2 | q3 | .
+//   +----+----+----+----+
+//   | r0 | r1 | r2 | r3 | .
+//   +----+----+----+----+ .
+//     ......    ......
+//
+// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted
+// by `vsl`) be 4.  Then we want to matrix multiply this tile to get a [2,4]
+// matrix that we can increment the result matrix by.
+//
+// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank
+// 3 array, L, of dimension [2,3,4]:
+//
+//       L[0,_,_]           *      L[1,_,_]
+//                          *
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | a0 | a0 | a0 | a0 |  *  | a1 | a1 | a1 | a1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | b0 | b0 | b0 | b0 |  *  | b1 | b1 | b1 | b1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | c0 | c0 | c0 | c0 |  *  | c1 | c1 | c1 | c1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//
+//
+// Then we FMA L[0,_,_] with the RHS to get the first row of the result and
+// L[1,_,_] with the RHS to get the second row of the result.  For example,
+// L[0,_,_] is computed as:
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 |   +
+//   +----+----+----+----+   +----+----+----+----+
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 |   +
+//   +----+----+----+----+   +----+----+----+----+
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 |
+//   +----+----+----+----+   +----+----+----+----+
+//
+// to get:
+//
+//   +-------------------+-------------------+-------------------+---------
+//   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
+//   +-------------------+-------------------+-------------------+---------
+void TiledSmallGemmEmitter::EmitTiledGemm(
+    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
+    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
+    int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
+  ksl_.For("dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
+    MemoryTile result_memory_tile(vsl, b_, /*matrix=*/result_,
+                                  /*matrix_size_along_minor_dim=*/dims().n(),
+                                  /*major_dim_offset=*/m_i,
+                                  /*tile_size_along_major_dim=*/tile_size_m);
+    MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
+                               /*matrix_size_along_minor_dim=*/dims().k(),
+                               /*major_dim_offset=*/m_i,
+                               /*tile_size_along_major_dim=*/tile_size_m);
+    ksl_.For(
+        "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+          TileVariable result_tile_var(vsl, result_memory_tile.LoadTile(n_i));
+          ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
+            MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
+                                       tile_size_k);
+            std::vector<std::vector<llvm::Value*>> lhs_tile =
+                lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
+            std::vector<llvm::Value*> rhs_tile = rhs_memory_tile.LoadTile(n_i);
+            std::vector<llvm::Value*> result_tile = result_tile_var.Get();
+            for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
+              for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
+                result_tile[r_m_i] =
+                    vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
+                                result_tile[r_m_i]);
+              }
+            }
+            result_tile_var.Set(result_tile);
+          });
+
+          result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
+        });
+  });
+}
+
+}  // namespace
+
+void EmitRowMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
+                      int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
+                      llvm::Value* rhs, llvm::Value* addend,
+                      llvm::Value* result, llvm::IRBuilder<>* b,
+                      bool enable_fast_math, bool optimize_for_size) {
+  RowMajorMatrixVectorProductEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
+      /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, addend, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+          llvm::Value* result) {
+        RowMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
+                                                   result, b);
+        emitter.Emit();
+      });
+}
+
+void EmitColumnMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
+                         int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
+                         llvm::Value* rhs, llvm::Value* addend,
+                         llvm::Value* result, llvm::IRBuilder<>* b,
+                         bool enable_fast_math, bool optimize_for_size) {
+  ColumnMajorMatrixVectorProductEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
+      /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, addend, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+          llvm::Value* result) {
+        ColumnMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
+                                                      result, b);
+        emitter.Emit();
+      });
+}
+
+void EmitSmallGemm(PrimitiveType scalar_type, int64 m, int64 k, int64 n,
+                   int64 max_vectorization_width, int64 max_vector_count,
+                   int64 min_vectorization_width, int64 tile_size_m,
+                   int64 tile_size_k, llvm::Value* lhs, llvm::Value* rhs,
+                   llvm::Value* result, llvm::IRBuilder<>* b,
+                   bool enable_fast_math, bool optimize_for_size) {
+  TiledSmallGemmEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
+      /*max_vectorization_width=*/max_vectorization_width,
+      /*max_vector_count=*/max_vector_count,
+      /*min_vectorization_width=*/min_vectorization_width,
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result) {
+        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
+                                                 /*rhs=*/rhs,
+                                                 /*result=*/result, b);
+        small_gemm_emitter.Emit();
+      });
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a82326cc3704bce8c122261383249c60eda1f3a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
+
+#include "llvm/IR/IRBuilder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace cpu {
+
+// These routines emit LLVM IR implementing tiled GEMM and GEMV routines.
+
+void EmitRowMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
+                      tensorflow::int64 tile_cols, tensorflow::int64 m,
+                      tensorflow::int64 k, llvm::Value* lhs, llvm::Value* rhs,
+                      llvm::Value* addend, llvm::Value* result,
+                      llvm::IRBuilder<>* b, bool enable_fast_math,
+                      bool optimize_for_size);
+
+void EmitColumnMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
+                         tensorflow::int64 tile_cols, tensorflow::int64 m,
+                         tensorflow::int64 k, llvm::Value* lhs,
+                         llvm::Value* rhs, llvm::Value* addend,
+                         llvm::Value* result, llvm::IRBuilder<>* b,
+                         bool enable_fast_math, bool optimize_for_size);
+
+void EmitSmallGemm(PrimitiveType scalar_type, tensorflow::int64 m,
+                   tensorflow::int64 k, tensorflow::int64 n,
+                   tensorflow::int64 max_vectorization_width,
+                   tensorflow::int64 max_vector_count,
+                   tensorflow::int64 min_vectorization_width,
+                   tensorflow::int64 tile_size_m, tensorflow::int64 tile_size_k,
+                   llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result,
+                   llvm::IRBuilder<>* b, bool enable_fast_math,
+                   bool optimize_for_size);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
index 825e1436f0ec6d49b555e5e3e9c2c7a19fb7b062..70173d43d79e931b75f131ad380ad98359cc78b8 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
@@ -73,15 +73,14 @@ ENTRY TestComputation {
   abs = f32[] abs(arg)
   add = f32[] add(arg, gte)
   broadcast = f32[42] broadcast(add), dimensions={}
-  slice = f32[0] slice(broadcast), slice={[1:2]}
+  slice = f32[1] slice(broadcast), slice={[1:2]}
   copy = f32[] copy(arg)
   eq = pred[] equal-to(arg, gte)
   neg = f32[] negate(arg)
   ROOT convert = f64[] convert(f32[] arg)
 })";
   std::unique_ptr<HloModule> module =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())
-          .ConsumeValueOrDie();
+      ParseAndReturnVerifiedModule(hlo_string).ConsumeValueOrDie();
   ElementwiseTestVisitor visitor;
   TF_EXPECT_OK(module->entry_computation()->Accept(&visitor));
 }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
index b2ba2617902104bfea06713332fa1c2aedea536d..e8bc6d05716a2ef02e0280e86c7df4ac22fe78c4 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.cc
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -156,29 +158,192 @@ Status DecomposeBatchDot(HloInstruction* dot) {
   return computation->ReplaceInstruction(dot, new_dot);
 }
 
+// Convert a dot into a canonical form where non-contracting and contracting
+// dimensions are reshaped together and batch dimensions are the most major
+// dimensions. The requires transposing and reshapes the lhs and rhs and
+// reshaping the output batch to the original shape.
+Status CanonicalizeDot(HloInstruction* original_dot) {
+  auto computation = original_dot->parent();
+  const auto& original_dnums = original_dot->dot_dimension_numbers();
+  const int64 num_batch_dims = original_dnums.lhs_batch_dimensions_size();
+  const int64 num_contracting_dims =
+      original_dnums.lhs_contracting_dimensions_size();
+
+  const auto& lhs_shape = original_dot->operand(0)->shape();
+  const int64 lhs_rank = lhs_shape.rank();
+  const int64 num_lhs_non_contracting_dims =
+      lhs_rank - num_batch_dims - num_contracting_dims;
+
+  std::vector<int64> lhs_non_contracting_dims;
+  lhs_non_contracting_dims.reserve(num_lhs_non_contracting_dims);
+  int64 lhs_contracting_size = 1;
+  int64 lhs_non_contracting_size = 1;
+  std::vector<int64> batch_dim_sizes;
+  batch_dim_sizes.reserve(num_batch_dims);
+  for (int64 i = 0; i < lhs_rank; ++i) {
+    if (absl::c_linear_search(original_dnums.lhs_contracting_dimensions(), i)) {
+      lhs_contracting_size *= lhs_shape.dimensions(i);
+    } else if (absl::c_linear_search(original_dnums.lhs_batch_dimensions(),
+                                     i)) {
+      batch_dim_sizes.push_back(lhs_shape.dimensions(i));
+    } else {
+      lhs_non_contracting_dims.push_back(i);
+      lhs_non_contracting_size *= lhs_shape.dimensions(i);
+    }
+  }
+  // The canonical form of the lhs is
+  // [BatchDims, NonContractingDims, ContractingsDims]
+  std::vector<int64> lhs_transpose;
+  lhs_transpose.reserve(lhs_rank);
+  lhs_transpose.insert(lhs_transpose.end(),
+                       original_dnums.lhs_batch_dimensions().begin(),
+                       original_dnums.lhs_batch_dimensions().end());
+  lhs_transpose.insert(lhs_transpose.end(), lhs_non_contracting_dims.begin(),
+                       lhs_non_contracting_dims.end());
+  lhs_transpose.insert(lhs_transpose.end(),
+                       original_dnums.lhs_contracting_dimensions().begin(),
+                       original_dnums.lhs_contracting_dimensions().end());
+  HloInstruction* transposed_lhs =
+      computation->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(InversePermutation(lhs_transpose),
+                                       lhs_shape),
+          original_dot->mutable_operand(0), lhs_transpose));
+  std::vector<int64> lhs_reshape_dims = batch_dim_sizes;
+  lhs_reshape_dims.push_back(lhs_non_contracting_size);
+  lhs_reshape_dims.push_back(lhs_contracting_size);
+  // Reshape the contracting and non-contracting dimensions together.
+  HloInstruction* reshaped_lhs =
+      computation->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(lhs_shape.element_type(), lhs_reshape_dims),
+          transposed_lhs));
+
+  const auto& rhs_shape = original_dot->operand(1)->shape();
+  const int64 rhs_rank = rhs_shape.rank();
+  const int64 num_rhs_non_contracting_dims =
+      rhs_rank - num_batch_dims - num_contracting_dims;
+  std::vector<int64> rhs_non_contracting_dims;
+  rhs_non_contracting_dims.reserve(num_rhs_non_contracting_dims);
+  int64 rhs_non_contracting_size = 1;
+  int64 rhs_contracting_size = 1;
+  for (int64 i = 0; i < rhs_rank; ++i) {
+    if (absl::c_linear_search(original_dnums.rhs_contracting_dimensions(), i)) {
+      rhs_contracting_size *= rhs_shape.dimensions(i);
+    } else if (!absl::c_linear_search(original_dnums.rhs_batch_dimensions(),
+                                      i)) {
+      rhs_non_contracting_dims.push_back(i);
+      rhs_non_contracting_size *= rhs_shape.dimensions(i);
+    }
+  }
+
+  // The canonical form of the rhs is
+  // [BatchDims, ContractingsDims, NonContractingDims]
+  std::vector<int64> rhs_transpose;
+  rhs_transpose.reserve(rhs_rank);
+  rhs_transpose.insert(rhs_transpose.end(),
+                       original_dnums.rhs_batch_dimensions().begin(),
+                       original_dnums.rhs_batch_dimensions().end());
+  rhs_transpose.insert(rhs_transpose.end(),
+                       original_dnums.rhs_contracting_dimensions().begin(),
+                       original_dnums.rhs_contracting_dimensions().end());
+  rhs_transpose.insert(rhs_transpose.end(), rhs_non_contracting_dims.begin(),
+                       rhs_non_contracting_dims.end());
+  HloInstruction* transposed_rhs =
+      computation->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(InversePermutation(rhs_transpose),
+                                       rhs_shape),
+          original_dot->mutable_operand(1), rhs_transpose));
+
+  std::vector<int64> rhs_reshape_dims = batch_dim_sizes;
+  rhs_reshape_dims.push_back(rhs_contracting_size);
+  rhs_reshape_dims.push_back(rhs_non_contracting_size);
+  // Reshape the contracting and non-contracting dimensions together.
+  HloInstruction* reshaped_rhs =
+      computation->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(rhs_shape.element_type(), rhs_reshape_dims),
+          transposed_rhs));
+
+  std::vector<int64> dot_dims = batch_dim_sizes;
+  dot_dims.push_back(lhs_non_contracting_size);
+  dot_dims.push_back(rhs_non_contracting_size);
+
+  DotDimensionNumbers dot_dnums;
+  for (int64 i = 0; i < num_batch_dims; ++i) {
+    dot_dnums.add_lhs_batch_dimensions(i);
+    dot_dnums.add_rhs_batch_dimensions(i);
+  }
+  dot_dnums.add_lhs_contracting_dimensions(num_batch_dims + 1);
+  dot_dnums.add_rhs_contracting_dimensions(num_batch_dims);
+
+  HloInstruction* dot = computation->AddInstruction(HloInstruction::CreateDot(
+      ShapeUtil::MakeShape(original_dot->shape().element_type(), dot_dims),
+      reshaped_lhs, reshaped_rhs, dot_dnums, original_dot->precision_config()));
+
+  return computation->ReplaceInstruction(
+      original_dot, computation->AddInstruction(HloInstruction::CreateReshape(
+                        original_dot->shape(), dot)));
+}
+
 }  // namespace
 
 StatusOr<bool> DotDecomposer::Run(HloModule* module) {
   XLA_VLOG_LINES(2, "DotDecomposer ENTRY\n" + module->ToString());
-  // Gather all batch Dot operations.
-  std::vector<HloInstruction*> batch_dots;
+  // Gather all Non-canonical Dot operations.
+  std::vector<HloInstruction*> non_canonical_dots;
   for (auto* computation : module->MakeNonfusionComputations()) {
     for (auto* instruction : computation->instructions()) {
       if (instruction->opcode() != HloOpcode::kDot) {
         continue;
       }
       const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
-      if (dnums.lhs_batch_dimensions_size() > 0 && decompose_batch_dot_) {
-        batch_dots.push_back(instruction);
+      // A dot it not canonical if there are more than one contracting
+      // dimension.
+      if (dnums.lhs_contracting_dimensions_size() > 1) {
+        non_canonical_dots.push_back(instruction);
+        continue;
+      }
+      if (dnums.lhs_batch_dimensions().empty() &&
+          dnums.lhs_contracting_dimensions().empty()) {
+        non_canonical_dots.push_back(instruction);
+        continue;
+      }
+      if (dnums.lhs_batch_dimensions().empty()) {
+        continue;
+      }
+      std::vector<int64> canonical_batch_dims(
+          dnums.lhs_batch_dimensions_size());
+      absl::c_iota(canonical_batch_dims, 0);
+      if (!absl::c_equal(dnums.lhs_batch_dimensions(), canonical_batch_dims) ||
+          !absl::c_equal(dnums.rhs_batch_dimensions(), canonical_batch_dims)) {
+        non_canonical_dots.push_back(instruction);
       }
     }
   }
-  // Decompose each batch Dot in 'batch_dots'.
   bool changed = false;
-  for (auto* dot : batch_dots) {
-    TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+  for (auto* dot : non_canonical_dots) {
+    TF_RETURN_IF_ERROR(CanonicalizeDot(dot));
     changed = true;
   }
+
+  if (decompose_batch_dot_) {
+    std::vector<HloInstruction*> batch_dots;
+    for (auto* computation : module->MakeNonfusionComputations()) {
+      for (auto* instruction : computation->instructions()) {
+        if (instruction->opcode() != HloOpcode::kDot) {
+          continue;
+        }
+        const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
+        if (!dnums.lhs_batch_dimensions().empty()) {
+          batch_dots.push_back(instruction);
+        }
+      }
+    }
+    // Decompose each batch Dot in 'batch_dots'.
+
+    for (auto* dot : batch_dots) {
+      TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+      changed = true;
+    }
+  }
   XLA_VLOG_LINES(2, "DotDecompose EXIT\n" + module->ToString());
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
index 6d0472689bf48092ceef2e9792c1358687d707ec..2b158d7a6ec510ce4cbc56bddc5cca71ac4f14f4 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -173,7 +173,7 @@ Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) {
 
         // Find out the new dynamic dimension after reduce.
         int64 dimensions_not_reduced_count = 0;
-        for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+        for (int i = 0; i < operand->shape().rank(); ++i) {
           if (dimension == i) {
             parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
                                     dynamic_size);
@@ -207,7 +207,7 @@ Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
           result_dim_mapping[i] = current_result_dims++;
         }
 
-        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(0)->shape()); i++) {
+        for (int64 i = 0; i < dot->operand(0)->shape().rank(); i++) {
           if (!absl::c_linear_search(
                   dimension_numbers.lhs_contracting_dimensions(), i)) {
             if (operand_index == 0) {
@@ -217,7 +217,7 @@ Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
           }
         }
 
-        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(1)->shape()); i++) {
+        for (int64 i = 0; i < dot->operand(1)->shape().rank(); i++) {
           if (!absl::c_linear_search(
                   dimension_numbers.rhs_contracting_dimensions(), i) &&
               !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(),
@@ -433,7 +433,7 @@ Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
 /* static */
 StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
     HloModule* module) {
-  VLOG(0) << "Param Config " << module->dynamic_parameter_binding().ToString();
+  VLOG(2) << "Param Config " << module->dynamic_parameter_binding().ToString();
   DynamicDimensionInference inference(module);
   TF_RETURN_IF_ERROR(inference.AnalyzeDynamicDimensions());
   return inference;
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
index 1dd196821c05cc820e2a3bf53a04d96b15484cd4..b42e67b4bbcf731d89dd8af9e46b405235a92d8a 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -62,6 +62,17 @@ class DynamicDimensionInferenceTest : public HloTestBase {
     return module_->AddEmbeddedComputation(embedded_builder.Build());
   }
 
+  HloComputation* GetGe() {
+    auto embedded_builder = HloComputation::Builder("ge");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGe, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
   std::unique_ptr<HloModule> module_;
   std::unique_ptr<DynamicDimensionInference> inference_;
   const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {});
@@ -487,7 +498,7 @@ TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
   // Test the ability to trace select and scatter batch dimensions.
   auto builder = HloComputation::Builder(TestName());
   auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
-  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  auto source_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
 
   Window window;
   // First dimension is unchanged.
@@ -514,22 +525,26 @@ TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
       /*parameter_number=*/0, input_shape, "A"));
   auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/1, scalar_shape_, "size_param"));
+  auto* source = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, source_shape, "B"));
 
   auto init = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
 
-  auto* reduce_window =
-      builder.AddInstruction(HloInstruction::CreateReduceWindow(
-          output_shape, a_param, init, window, GetAdd()));
+  auto* sns = builder.AddInstruction(HloInstruction::CreateSelectAndScatter(
+      input_shape, a_param, GetGe(), window, source, init, GetAdd()));
 
   module_->AddEntryComputation(builder.Build());
 
   TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{2, {}, 0}));
 
   TF_ASSERT_OK(RunInference());
-  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(sns, {}, 0), size_param);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter.cc b/tensorflow/compiler/xla/service/dynamic_index_splitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e34adfd2d2bbb7214cfa2da28291b133538845e5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter.cc
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
+
+#include <map>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+StatusOr<bool> DynamicIndexSplitter::Run(HloModule* module) {
+  bool changed = false;
+
+  std::vector<HloComputation*> computations =
+      module->MakeNonfusionComputations();
+  for (HloComputation* computation : computations) {
+    for (HloInstruction* dynamic_op : computation->MakeInstructionPostOrder()) {
+      switch (dynamic_op->opcode()) {
+        case HloOpcode::kDynamicSlice:
+        case HloOpcode::kDynamicUpdateSlice:
+          break;
+        default:
+          continue;
+      }
+      auto parent = dynamic_op->parent();
+      bool is_update = dynamic_op->opcode() == HloOpcode::kDynamicUpdateSlice;
+      int64 num_indices = dynamic_op->operand(0)->shape().rank();
+
+      if (num_indices == 0) {
+        // If the operand rank is 0, directly replace R0 DS/DUS with the
+        // operand (for DS) or update (for DUS).
+        if (is_update) {
+          TF_CHECK_OK(parent->ReplaceInstruction(
+              dynamic_op, dynamic_op->mutable_operand(1)));
+        } else {
+          TF_CHECK_OK(parent->ReplaceInstruction(
+              dynamic_op, dynamic_op->mutable_operand(0)));
+        }
+        changed = true;
+        continue;
+      }
+
+      int64 index_operand_number = Cast<HloDynamicIndexInstruction>(dynamic_op)
+                                       ->first_index_operand_number();
+      auto index_operand = dynamic_op->mutable_operand(index_operand_number);
+      if (ShapeUtil::IsScalar(index_operand->shape())) {
+        // This DS/DUS already uses scalar indices.
+        continue;
+      }
+      TF_RET_CHECK(index_operand->shape().rank() == 1);
+      auto index_element_type = index_operand->shape().element_type();
+      std::vector<HloInstruction*> index_array;
+      for (int64 dim = 0; dim < num_indices; ++dim) {
+        auto slice = parent->AddInstruction(HloInstruction::CreateSlice(
+            ShapeUtil::MakeShape(index_element_type, {1}), index_operand, {dim},
+            {dim + 1}, {1}));
+        auto bitcast = parent->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(index_element_type, {}), slice));
+        index_array.push_back(bitcast);
+      }
+      auto new_dynamic_op =
+          is_update
+              ? HloInstruction::CreateDynamicUpdateSlice(
+                    dynamic_op->shape(), dynamic_op->mutable_operand(0),
+                    dynamic_op->mutable_operand(1), absl::MakeSpan(index_array))
+              : HloInstruction::CreateDynamicSlice(
+                    dynamic_op->shape(), dynamic_op->mutable_operand(0),
+                    absl::MakeSpan(index_array),
+                    dynamic_op->dynamic_slice_sizes());
+      TF_CHECK_OK(parent->ReplaceWithNewInstruction(dynamic_op,
+                                                    std::move(new_dynamic_op)));
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter.h b/tensorflow/compiler/xla/service/dynamic_index_splitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c12e3a4af287ad2272a08ba54cd99c2cad9d451
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Convert R1 index operands to DynamicSlice and DynamicUpdateSlice ops into
+// separate scalars.
+class DynamicIndexSplitter : public HloModulePass {
+ public:
+  DynamicIndexSplitter() = default;
+  absl::string_view name() const override { return "dynamic-index-splitter"; }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc b/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98029d1faff7d669730f6b66e38fcefece70f0eb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+class DynamicIndexSplitterTest : public HloTestBase {};
+
+TEST_F(DynamicIndexSplitterTest, DynamicSlice) {
+  const char* const kDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], indices: s32[3]) -> s32[1,1,1] {
+      operand = s32[4,5,6] parameter(0)
+      indices = s32[3] parameter(1)
+      ROOT dynamic-slice = s32[1,1,1] dynamic-slice(operand, indices), dynamic_slice_sizes={1,1,1}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kDynamicSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_TRUE(changed);
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicSlice(op::Parameter(0),
+                               op::Reshape(op::Slice(op::Parameter(1))),
+                               op::Reshape(op::Slice(op::Parameter(1))),
+                               op::Reshape(op::Slice(op::Parameter(1)))));
+
+  for (int i = 0; i < 3; ++i) {
+    const HloInstruction* slice = module->entry_computation()
+                                      ->root_instruction()
+                                      ->operand(i + 1)
+                                      ->operand(0);
+    EXPECT_EQ(slice->slice_starts(0), i);
+    EXPECT_EQ(slice->slice_limits(0), i + 1);
+  }
+}
+
+TEST_F(DynamicIndexSplitterTest, DynamicUpdateSlice) {
+  const char* const kDynamicUpdateSlice = R"(
+    HloModule DynamicUpdatedSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], indices: s32[3], update: s32[1,1,1]) -> s32[4,5,6] {
+      operand = s32[4,5,6] parameter(0)
+      indices = s32[3] parameter(1)
+      update = s32[1,1,1] parameter(2)
+      ROOT dynamic-update-slice = s32[4,5,6] dynamic-update-slice(operand, update, indices)
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kDynamicUpdateSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_TRUE(changed);
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicUpdateSlice(op::Parameter(0), op::Parameter(2),
+                                     op::Reshape(op::Slice(op::Parameter(1))),
+                                     op::Reshape(op::Slice(op::Parameter(1))),
+                                     op::Reshape(op::Slice(op::Parameter(1)))));
+
+  for (int i = 0; i < 3; ++i) {
+    const HloInstruction* slice = module->entry_computation()
+                                      ->root_instruction()
+                                      ->operand(i + 2)
+                                      ->operand(0);
+    EXPECT_EQ(slice->slice_starts(0), i);
+    EXPECT_EQ(slice->slice_limits(0), i + 1);
+  }
+}
+
+TEST_F(DynamicIndexSplitterTest, AlreadyScalar) {
+  const char* const kDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], index.0: s32[], index.1: s32[], index.2: s32[]) -> s32[1,1,1] {
+      operand = s32[4,5,6] parameter(0)
+      index.0 = s32[] parameter(1)
+      index.1 = s32[] parameter(2)
+      index.2 = s32[] parameter(3)
+      ROOT dynamic-slice = s32[1,1,1] dynamic-slice(operand, index.0, index.1, index.2), dynamic_slice_sizes={1,1,1}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kDynamicSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_FALSE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicSlice(op::Parameter(0), op::Parameter(1),
+                               op::Parameter(2), op::Parameter(3)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4db280f817141bd52e3a5b9564600a618f81aeac
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/dynamic_padder.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// ChooseIdentityValue looks at the instruction and returns a identity value
+// which, when padded, doesn't change the result of the instruction.
+//
+// nullopt is returned if padding doesn't need to be reset.
+StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst) {
+  HloComputation* comp = inst->parent();
+  // Padding on elementwise operation doesn't affect the result of the effective
+  // data.
+  if (inst->IsElementwise()) {
+    return nullptr;
+  }
+
+  switch (inst->opcode()) {
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow: {
+      // Because of the way we do reduce, we already require the `init` operand
+      // of hlo reduce instruction to be identity value. Here we reuse the
+      // operand.
+      return inst->mutable_operand(1);
+    }
+
+    case HloOpcode::kConvolution:
+    case HloOpcode::kDot: {
+      // Use 0 as padding value for convolution and dot.
+      PrimitiveType ptype = inst->shape().element_type();
+      return comp->AddInstruction(
+          HloInstruction::CreateConstant(LiteralUtil::Zero(ptype)));
+    }
+
+    case HloOpcode::kPad: {
+      return inst->mutable_operand(1);
+    }
+    case HloOpcode::kParameter:
+    case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kReshape:
+    case HloOpcode::kTuple:
+    case HloOpcode::kAllReduce:
+    case HloOpcode::kBroadcast:
+      return nullptr;
+    default:
+      return UnimplementedStrCat("Unimplimented padding for instruction: ",
+                                 inst->ToString());
+  }
+}
+
+}  // namespace
+
+StatusOr<bool> DynamicPadder::Run(HloModule* module) {
+  bool changed = false;
+  VLOG(2) << "Pre DynamicPadder HLO:";
+  XLA_VLOG_LINES(2, module->ToString());
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference,
+                      DynamicDimensionInference::Run(module));
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* inst : computation->instructions()) {
+      for (int64 operand_num = 0; operand_num < inst->operand_count();
+           ++operand_num) {
+        HloInstruction* operand = inst->mutable_operand(operand_num);
+        if (!operand->shape().IsArray()) {
+          continue;
+        }
+        for (int64 dim = 0; dim < operand->shape().rank(); ++dim) {
+          HloInstruction* dynamic_size =
+              dynamic_dimension_inference.GetDynamicSize(operand, {}, dim);
+          if (dynamic_size == nullptr) {
+            continue;
+          }
+          VLOG(1) << "Has dynamic dimension of operand" << operand_num << " @"
+                  << dim;
+          TF_ASSIGN_OR_RETURN(HloInstruction * identity_value,
+                              ChooseIdentityValue(inst));
+          if (identity_value == nullptr) {
+            continue;
+          }
+
+          // For each dimension, first generates a mask representing the
+          // effective area of data and padded area of data using iota and
+          // dynamic_size. For example, given a dimension of 7 elements and 5
+          // effective elements:
+          //
+          // iota = [0, 1, 2, 3, 4, 5, 6]
+          // broadcast_dynamic_size = [5, 5, 5, 5, 5, 5, 5]
+          // mask = lt(iota, broadcast_dynamic_size) = [t, t, t, t, t, f, f]
+          //
+          // Once the mask is generated, the input data is then padded using the
+          // mask and pad value.
+          //
+          const Shape mask_shape =
+              ShapeUtil::ChangeElementType(operand->shape(), xla::U32);
+          const Shape pred_shape =
+              ShapeUtil::ChangeElementType(operand->shape(), xla::PRED);
+          HloInstruction* iota = computation->AddInstruction(
+              HloInstruction::CreateIota(mask_shape, dim));
+
+          HloInstruction* broadcasted_effective_size =
+              computation->AddInstruction(HloInstruction::CreateBroadcast(
+                  mask_shape, dynamic_size, {}));
+          HloInstruction* pred = computation->AddInstruction(
+              HloInstruction::CreateBinary(pred_shape, HloOpcode::kLt, iota,
+                                           broadcasted_effective_size));
+
+          HloInstruction* broadcasted_identity_value =
+              computation->AddInstruction(HloInstruction::CreateBroadcast(
+                  operand->shape(), identity_value, {}));
+          HloInstruction* padded =
+              computation->AddInstruction(HloInstruction::CreateTernary(
+                  operand->shape(), HloOpcode::kSelect, pred, operand,
+                  broadcasted_identity_value));
+          TF_RETURN_IF_ERROR(inst->ReplaceOperandWith(operand_num, padded));
+          operand = inst->mutable_operand(operand_num);
+          changed = true;
+        }
+      }
+    }
+  }
+  HloDCE dce;
+  TF_ASSIGN_OR_RETURN(changed, dce.Run(module));
+  VLOG(2) << "Post DynamicPadder HLO:";
+  XLA_VLOG_LINES(2, module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.h b/tensorflow/compiler/xla/service/dynamic_padder.h
new file mode 100644
index 0000000000000000000000000000000000000000..509269f7f56746fa5516ad917a04221587c6dcca
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder.h
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// With bounded shapes, only part of the shape contains effective data and the
+// rest contains padded data, whose value can be anything depending on the
+// source of the data. When a bounded shape is directly consumed by an
+// instruction that collapses dimensions (reduce for example), the padding data
+// would affect result of the instruction.
+//
+// DynamicPadder uses DynamicDimensionInference to detect bounded shapes in a
+// hlo module, it then inserts certain instructions to reset the padding into an
+// identity value so that in doesn't affect the result of subsequent
+// instruction. For example, it'd reset the padding to 0 before a bounded shape
+// is consumed by a reduce-sum.
+class DynamicPadder : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "dynamic_padder"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55a11286e4596d87c330315322cae704fc5cd707
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
@@ -0,0 +1,152 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_padder.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class DynamicPadderTest : public HloTestBase {
+ protected:
+  DynamicPadderTest() : HloTestBase() { module_ = CreateNewVerifiedModule(); }
+
+  StatusOr<bool> RunPadder() {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before padder");
+
+    DynamicPadder padder;
+
+    return padder.Run(module_.get());
+  }
+
+  void ExpectPadded(const HloInstruction* inst) {
+    EXPECT_THAT(inst,
+                op::Select(op::Lt(op::Iota(), op::Broadcast(op::Parameter())),
+                           ::testing::_, op::Broadcast()));
+  }
+
+  HloComputation* GetScalarAddComputation() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  std::unique_ptr<HloModule> module_;
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(U32, {});
+};
+
+TEST_F(DynamicPadderTest, ReduceTest) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, negate, init, {0, 2}, GetScalarAddComputation()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(reduce->operand(0));
+}
+
+TEST_F(DynamicPadderTest, ConvolutionTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(conv->operand(0));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
index c8bfc8905064bcd7b68fe259fbcc1546ff083dbd..e9c8aa03e2aa8f4866daf2a2f8d846e50fa68793 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
@@ -29,7 +29,8 @@ Status DynamicParameterBinding::Bind(
 }
 
 absl::optional<DynamicParameterBinding::DynamicParameter>
-DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) {
+DynamicParameterBinding::GetBinding(
+    const DynamicDimension& dynamic_dimension) const {
   auto param_iter = bindings_.find(dynamic_dimension);
   if (param_iter == bindings_.end()) {
     return absl::nullopt;
@@ -70,7 +71,7 @@ StatusOr<DynamicParameterBinding> DynamicParameterBinding::CreateFromProto(
     int64 target_param_num = binding.target_param_num();
     ShapeIndex target_param_index(binding.target_param_index().begin(),
                                   binding.target_param_index().end());
-    int64 target_dim_num = binding.target_param_num();
+    int64 target_dim_num = binding.target_param_dim_num();
 
     TF_RETURN_IF_ERROR(
         result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index},
@@ -121,10 +122,11 @@ Status DynamicParameterBinding::Verify(const HloModule& module) const {
         dynamic_dimension.parameter_index));
     TF_RET_CHECK(
         dynamic_dimension.dimension <
-        ShapeUtil::Rank(ShapeUtil::GetSubshape(
+        ShapeUtil::GetSubshape(
             entry->parameter_instruction(dynamic_dimension.parameter_num)
                 ->shape(),
-            dynamic_dimension.parameter_index)));
+            dynamic_dimension.parameter_index)
+            .rank());
     return Status::OK();
   });
 }
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
index dd474d8eed1b2c30ddb8f624a864198c74eacaba..57af2c43d3c65f7340e6a9f04e5abbf052ebceea 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
@@ -89,7 +89,7 @@ class DynamicParameterBinding {
   //
   // Returns nullopt if the binding is not set.
   absl::optional<DynamicParameter> GetBinding(
-      const DynamicDimension& dynamic_dimension);
+      const DynamicDimension& dynamic_dimension) const;
 
   using BindingFn =
       std::function<Status(const DynamicParameter& dynamic_parameter,
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
index 83a6d83dffde7995bd8e43917d13c5fd2705ba6f..b5d57cda4f469a384dc0affdae9e5f93a70ac418 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
@@ -33,7 +33,15 @@ limitations under the License.
 
 namespace xla {
 namespace {
-class DynamicParameterBindingTest : public HloTestBase {};
+class DynamicParameterBindingTest : public HloTestBase {
+ protected:
+  // Serialize and then deserialize a binding.
+  void SerializeAndDeserialize(DynamicParameterBinding* binding) {
+    DynamicParameterBindingProto proto = binding->ToProto();
+    TF_ASSERT_OK_AND_ASSIGN(*binding,
+                            DynamicParameterBinding::CreateFromProto(proto));
+  }
+};
 
 TEST_F(DynamicParameterBindingTest, SimpleBinding) {
   // 'b' is a dynamic shape; 'a' represents the real size of b's first
@@ -56,15 +64,20 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}},
                    DynamicParameterBinding::DynamicDimension{1, {}, 0}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
-                                                    /*parameter_index=*/{},
-                                                    /*dimension=*/0});
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({}));
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
+                                                      /*parameter_index=*/{},
+                                                      /*dimension=*/0});
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+  test(binding);
+  SerializeAndDeserialize(&binding);
+  test(binding);
 }
 
 TEST_F(DynamicParameterBindingTest, TupleBinding) {
@@ -89,16 +102,21 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
                    DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+  test(binding);
+  SerializeAndDeserialize(&binding);
+  test(binding);
 }
 
 TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) {
@@ -127,26 +145,35 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
                    DynamicParameterBinding::DynamicDimension{0, {1}, 1}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
-
-  absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-  EXPECT_TRUE(param2);
-  EXPECT_EQ(param2->parameter_num, 0);
-  EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
-
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+
+    absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
+
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+    EXPECT_TRUE(param2);
+    EXPECT_EQ(param2->parameter_num, 0);
+    EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+
+  test(binding);
+
+  SerializeAndDeserialize(&binding);
+
+  // Test the binding again after deserialization.
+  test(binding);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 6f1f95f2e9082649b6ca9cc0da5c238e15b77c10..727e0bfa52d45b6f8c67d7d04613e4865f18a53c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -812,11 +812,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
       auto c = EmitExtractReal(rhs_value);
       auto d = EmitExtractImag(rhs_value);
       auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b));
+      auto zero = llvm::ConstantFP::get(a->getType(), 0);
       auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
+      auto one = llvm::ConstantFP::get(a->getType(), 1);
       auto half_c = FMul(one_half, c);
 
       TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
                           EmitPow(component_type, aa_p_bb, half_c));
+
       auto neg_d = FNeg(d);
       TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
       auto neg_d_arg_lhs = FMul(neg_d, arg_lhs);
@@ -828,7 +831,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
       auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb));
       TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
       TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
-      return EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q));
+      // 0^c is 0 if d is 0 and c > 0. 0^0 is defined to be 1.0, see
+      // Branch Cuts for Complex Elementary Functions or Much Ado About
+      // Nothing's Sign Bit, W. Kahan, Section 10.
+      return Select(
+          And(And(FCmpOEQ(aa_p_bb, zero), FCmpOEQ(d, zero)), FCmpOLE(zero, c)),
+          EmitComposeComplex(op, Select(FCmpOEQ(zero, c), one, zero), zero),
+          EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)));
     }
     default:
       return Unimplemented("binary complex op '%s'",
@@ -1327,9 +1336,9 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
 
   // If implicit broadcast is needed, the source dimensions that are broadcast
   // have index 0.
-  CHECK_EQ(ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(hlo.shape()));
+  CHECK_EQ(operand_shape.rank(), hlo.shape().rank());
   llvm_ir::IrArray::Index source_index(target_index.GetType());
-  for (int64 i = 0; i < ShapeUtil::Rank(hlo.shape()); ++i) {
+  for (int64 i = 0; i < hlo.shape().rank(); ++i) {
     if (hlo.shape().dimensions(i) == operand_shape.dimensions(i)) {
       source_index.push_back(target_index[i]);
     } else {
@@ -1750,7 +1759,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     const llvm_ir::IrArray::Index& index) {
   // Emit IR to read dynamic start indices from hlo->operand(1).
   const HloInstruction* input_hlo = hlo->operand(0);
-  const int64 rank = ShapeUtil::Rank(input_hlo->shape());
+  const int64 rank = input_hlo->shape().rank();
   // Use the same index type for all tensor accesses in the same kernel.
   llvm::Type* index_type = index.GetType();
   llvm_ir::IrArray::Index slice_start_index(index_type, rank);
@@ -1758,9 +1767,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                        operand_to_generator.at(hlo->operand(1))(dim_index));
+    // TODO(b/118437727): Remove the R1 path.
+    llvm::Value* start_index_value;
+    if (hlo->operand(1)->shape().rank() == 1) {
+      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
+      TF_ASSIGN_OR_RETURN(start_index_value,
+                          operand_to_generator.at(hlo->operand(1))(dim_index));
+    } else {
+      llvm_ir::IrArray::Index zero_index(index_type);
+      TF_ASSIGN_OR_RETURN(
+          start_index_value,
+          operand_to_generator.at(hlo->operand(1 + i))(zero_index));
+    }
 
     // Clamp the start index so that the sliced portion fits in the operand:
     // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
@@ -1893,7 +1911,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   const HloInstruction* update_hlo = hlo->operand(1);
   const HloInstruction* start_hlo = hlo->operand(2);
   // Calculate slice start/end indices.
-  const int64 rank = ShapeUtil::Rank(input_hlo->shape());
+  const int64 rank = input_hlo->shape().rank();
   llvm_ir::IrArray::Index slice_start_index(index.GetType(), rank);
   llvm_ir::IrArray::Index slice_limit_index(index.GetType(), rank);
   // Slice intersection gathers (ANDs) conditions on all ranks for which
@@ -1905,9 +1923,19 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                        operand_to_generator.at(start_hlo)(dim_index));
+
+    llvm::Value* start_index_value;
+    // TODO(b/118437727): Remove the R1 path.
+    if (hlo->operand(2)->shape().rank() == 1) {
+      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
+      TF_ASSIGN_OR_RETURN(start_index_value,
+                          operand_to_generator.at(hlo->operand(2))(dim_index));
+    } else {
+      llvm_ir::IrArray::Index zero_index(index_type);
+      TF_ASSIGN_OR_RETURN(
+          start_index_value,
+          operand_to_generator.at(hlo->operand(2 + i))(zero_index));
+    }
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
@@ -2225,7 +2253,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         auto* iota = Cast<HloIotaInstruction>(hlo);
         PrimitiveType element_type = iota->shape().element_type();
         IrArray::Index elem_index =
-            ShapeUtil::Rank(iota->shape()) > 1
+            iota->shape().rank() > 1
                 ? target_index.SourceIndexOfBroadcast(
                       iota->shape(),
                       ShapeUtil::MakeShapeWithDescendingLayout(
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 01cef499665c050d4453382289168276028e1d26..590942cddcdd138981ee829f090ae17b0d038e1a 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -153,10 +153,9 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
            dim_numbers.index_vector_dim() ==
                gather.operand(1)->shape().dimensions_size());
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * induction_var_as_vector,
+  HloInstruction* induction_var_as_vector =
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                       /*result_shape_bounds=*/{1}));
+                       /*result_shape_bounds=*/{1});
 
   HloInstruction* index_vector;
 
@@ -222,7 +221,7 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
       {operand, start_indices, updated_accumulator}};
 }
 
-static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
+static HloInstruction* CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
     absl::Span<const int64> slice_sizes, int64 gather_loop_trip_count,
     const GatherDimensionNumbers& dim_numbers) {
@@ -332,12 +331,10 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
   CHECK_EQ(gather_loop_trip_count,
            canonical_start_indices->shape().dimensions(0));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_init,
-      CreateGatherLoopAccumulatorInitValue(
-          computation, output_shape.element_type(),
-          gather_instr->gather_slice_sizes(), gather_loop_trip_count,
-          gather_instr->gather_dimension_numbers()));
+  HloInstruction* accumulator_init = CreateGatherLoopAccumulatorInitValue(
+      computation, output_shape.element_type(),
+      gather_instr->gather_slice_sizes(), gather_loop_trip_count,
+      gather_instr->gather_dimension_numbers());
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index a3102368cb1dba15da7422337666d278cef775ab..e1ea5c39d58b6d23b076740626ca0ad63dc341ee 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -89,7 +89,7 @@ ENTRY main {
   // an implementation detail from WhileUtil::MakeCountedLoop).
 
   const Shape& while_shape = while_instr->shape();
-  ASSERT_TRUE(ShapeUtil::IsTuple(while_shape));
+  ASSERT_TRUE(while_shape.IsTuple());
   ASSERT_EQ(ShapeUtil::TupleElementCount(while_shape), 4);
 
   EXPECT_TRUE(ShapeUtil::SameDimensions(
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index bec02e14f951c6d905b7329be5c02896984279d0..7d450f4b53cdea209f2ef10ba785be6ec3b8bf8d 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -83,7 +83,7 @@ Status GenericTransferManager::TransferLiteralFromDeviceInternal(
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       device_buffer.on_host_shape(),
       [&](const Shape& subshape, const ShapeIndex& index) -> Status {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           TF_RETURN_IF_ERROR(executor->SynchronousMemcpyD2H(
               /*source=*/device_buffer.buffer(index),
               /*size=*/GetByteSizeRequirement(subshape),
@@ -120,7 +120,7 @@ Status GenericTransferManager::TransferLiteralToDeviceAsync(
       device_buffer.on_host_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
         se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
-        if (ShapeUtil::IsArray(device_subshape)) {
+        if (device_subshape.IsArray()) {
           TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
                        device_memory.size());
           // Element is array-shaped: transfer array data to device buffer.
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 6c23f921f40cac0dc5df08494dc1b63e6d1d5e93..dc17aa4426236f54e5f03c28634278d45f462158 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -3,6 +3,11 @@
 
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -24,12 +29,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load(
-    "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
-
 xla_proto_library(
     name = "backend_configs",
     srcs = ["backend_configs.proto"],
@@ -94,8 +93,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_reachability",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -135,6 +134,8 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
@@ -263,7 +264,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -362,6 +365,7 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -695,6 +699,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -712,6 +718,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:sort_simplifier",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -725,6 +732,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/cuda:cuda_diagnostics",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1005,14 +1013,10 @@ cc_library(
     srcs = ["variadic_op_splitter.cc"],
     hdrs = ["variadic_op_splitter.h"],
     deps = [
-        ":ir_emission_utils",
-        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 528209abc75777440163c2e1512658b8ad36315b..eb59ee5a1d47b6b706ef3f53a76069b3538eb6b7 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -57,16 +58,16 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
 
     // If buffer #i's address is already registered (e.g. external arguments or
     // result buffers), use that registered buffer.
-    if (registered_buffers_.count(i)) {
-      se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
-      if (reinterpret_cast<uintptr_t>(address.opaque()) % expected_alignment !=
+    if (se::DeviceMemoryBase* address =
+            tensorflow::gtl::FindOrNull(registered_buffers_, i)) {
+      if (reinterpret_cast<uintptr_t>(address->opaque()) % expected_alignment !=
           0) {
         return InternalError(
             "Address of registered buffer %d must be a multiple of %x, but "
             "was %p",
-            i, kEntryParameterAlignBytes, address.opaque());
+            i, kEntryParameterAlignBytes, address->opaque());
       }
-      buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
+      buffer_allocations->SetBuffer(i, *address);
       continue;
     }
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 14186b8faa68ad8492ea4863fcd7bd746e2eae48..9413ac2cff7c8d3ec4be6662569c580060bf1173 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -52,7 +53,8 @@ class BufferAllocations {
         DeviceMemoryAllocator* memory_allocator);
 
    private:
-    std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
+    absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>
+        registered_buffers_;
   };
 
   ~BufferAllocations();
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 6d6780fa1c7b0c636eb771c40e74f074cd8c4c4b..309b0aca64954e64509d731dce28ce9d8da4ee43 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -146,7 +146,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
 StatusOr<CudnnConvAlgorithmPicker::AutotuneResult>
-CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
+CudnnConvAlgorithmPicker::PickBestAlgorithm(
+    const HloCustomCallInstruction* instr) {
   // TODO(timshen): for now only check fp16. It can be expanded to other types,
   // with some work on the HLO routines.
   const bool cross_check_enabled =
@@ -249,12 +250,13 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
             << instr->ToString();
 
-    backend_config.set_algorithm(alg.algo_id());
-    backend_config.set_tensor_ops_enabled(alg.tensor_ops_enabled());
-    TF_RETURN_IF_ERROR(instr->set_backend_config(backend_config));
+    // Use assignment instead of brace-list to make GCC 4.9 happy.
+    RunConvOptions options;
+    options.profile_result = &profile_result;
+    options.algo_override = alg;
     bool launch_ok =
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
-                     &scratch_allocator, &stream, &profile_result)
+                     &scratch_allocator, &stream, options)
             .ok();
 
     if (launch_ok && profile_result.is_valid()) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 642af787afc71586d722ecc7e529ed8b3fa64d33..4991db0948589e479a202f4082d96df275f6e088 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -56,7 +56,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
 
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<AutotuneResult> PickBestAlgorithm(HloCustomCallInstruction* instr);
+  StatusOr<AutotuneResult> PickBestAlgorithm(
+      const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
index 5aa4f839f4be5f1060480fea98775f8ffada0bdd..958e0b9c6e7b7885f87b90d61ee5b3bbf6ab2702 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
@@ -50,10 +50,10 @@ static HloInstruction* PadInstruction(HloInstruction* instr,
   auto* zero = comp->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type())));
 
-  PaddingConfig pad_config = MakeNoPaddingConfig(ShapeUtil::Rank(shape));
+  PaddingConfig pad_config = MakeNoPaddingConfig(shape.rank());
 
   bool added_padding = false;
-  for (int64 dim = 0; dim < ShapeUtil::Rank(shape); ++dim) {
+  for (int64 dim = 0; dim < shape.rank(); ++dim) {
     if (shape.dimensions(dim) == new_shape.dimensions(dim)) {
       continue;
     }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
index 3a09d4d4716950a09d65dd093272482d55ac5c27..17d0f7aa7bf6031148aae79f74f7878d6fca9574 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
@@ -219,7 +219,7 @@ bool CudnnConvPaddingLegalization::CanonicalizeBackwardFilterConvolution(
   Window new_backward_conv_window = backward_conv->window();
   // input_padding_config is the config of the kPad to be inserted.
   PaddingConfig input_padding_config =
-      MakeNoPaddingConfig(ShapeUtil::Rank(input->shape()));
+      MakeNoPaddingConfig(input->shape().rank());
   ConvolutionDimensionNumbers backward_conv_dnums =
       backward_conv->convolution_dimension_numbers();
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
index 3425e1b4942aaf1011ba1bf1c50dd7e79c1f9807..b628f27f4b2ba8ccf17fd531d8a0c25cb99d9396 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
@@ -395,32 +395,36 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result) {
+                    RunConvOptions options) {
   ScratchBufAllocator scratch_allocator(scratch_buf);
   return RunCudnnConv(conv, operand_buffers, result_buffer, &scratch_allocator,
-                      stream, profile_result);
+                      stream, options);
 }
 
 Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::ScratchAllocator* scratch_allocator, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result) {
+                    RunConvOptions options) {
   TF_ASSIGN_OR_RETURN(CudnnConvParams params,
                       GetCudnnConvParams(conv, operand_buffers, result_buffer));
 
+  if (options.algo_override) {
+    params.algorithm = AlgorithmConfig(*options.algo_override);
+  }
+
   PrimitiveType output_primitive_type =
       conv->shape().tuple_shapes(0).element_type();
   switch (output_primitive_type) {
     case F16:
       return RunCudnnConvImpl<Eigen::half>(params, scratch_allocator, stream,
-                                           profile_result);
+                                           options.profile_result);
     case F32:
       return RunCudnnConvImpl<float>(params, scratch_allocator, stream,
-                                     profile_result);
+                                     options.profile_result);
     case F64:
       return RunCudnnConvImpl<double>(params, scratch_allocator, stream,
-                                      profile_result);
+                                      options.profile_result);
     default:
       LOG(FATAL) << ShapeUtil::HumanString(*params.output_shape);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
index edbc75a94a1238540390b93f0fa5217852c7781f..25b2461ca61251c6cb7b89b1f91da0f1636a3647 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
@@ -28,6 +28,14 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+struct RunConvOptions {
+  // Nullable output-parameter pointer for profiling results.
+  se::dnn::ProfileResult* profile_result = nullptr;
+
+  // Use this algorithm, instead of the one from the instruction.
+  absl::optional<se::dnn::AlgorithmDesc> algo_override;
+};
+
 // This file contains low-level routines for running cudnn convolutions.
 
 // Calls into cudnn to run the specified convolution.
@@ -46,13 +54,13 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result = nullptr);
+                    RunConvOptions = {});
 
 Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::ScratchAllocator* scratch_allocator, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result = nullptr);
+                    RunConvOptions = {});
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 470457935acacb8940af241dadb393d770786939..91930eccdff94bb2fc85636f3a4b2d661c618d87 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -35,7 +35,7 @@ namespace {
 // Traverses users of tuple shape, adding leaf instructions to 'instructions'.
 void MaybeResolveTupleElements(HloInstruction* instruction,
                                std::vector<HloInstruction*>* instructions) {
-  if (ShapeUtil::IsTuple(instruction->shape())) {
+  if (instruction->shape().IsTuple()) {
     for (auto tuple_user : instruction->users()) {
       MaybeResolveTupleElements(tuple_user, instructions);
     }
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 27f07b1d58125092c1ed6734b238e4ae0f11c4aa..86c9bc6a345047fb5329af0be45c8981cc427f50 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -206,6 +206,8 @@ auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm<float>) {
       return &DoGemm<double>;
     case C64:
       return &DoGemm<std::complex<float>>;
+    case C128:
+      return &DoGemm<std::complex<double>>;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -221,6 +223,8 @@ auto GetGemmWithAlgorithmFn(PrimitiveType type)
       return &DoGemmWithAlgorithm<double>;
     case C64:
       return &DoGemmWithAlgorithm<std::complex<float>>;
+    case C128:
+      return &DoGemmWithAlgorithm<std::complex<double>>;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -235,6 +239,8 @@ auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune<float>) {
       return &DoGemmAutotune<double>;
     case C64:
       return &DoGemmAutotune<std::complex<float>>;
+    case C128:
+      return &DoGemmAutotune<std::complex<double>>;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -255,6 +261,8 @@ se::blas::ComputationType GetBlasComputationType(PrimitiveType type) {
       return se::blas::ComputationType::kF64;
     case C64:
       return se::blas::ComputationType::kComplexF32;
+    case C128:
+      return se::blas::ComputationType::kComplexF64;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -315,8 +323,7 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
   DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction());
   CHECK_EQ(dim_nums.lhs_batch_dimensions_size(),
            dim_nums.rhs_batch_dimensions_size());
-  CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2,
-           ShapeUtil::Rank(output_shape_));
+  CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2, output_shape_.rank());
 
   int64 row_dim = dim_nums.lhs_batch_dimensions_size();
   int64 col_dim = dim_nums.lhs_batch_dimensions_size() + 1;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index ae2e718db29803a085401969a7d9b09abf690a6c..434060ad89dac7ad65c790c8c0a7f3d6ad62a25a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -218,7 +218,7 @@ GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
 
       const Literal& literal =
           llvm_ir::LiteralForConstantAllocation(allocation);
-      CHECK(ShapeUtil::IsArray(literal.shape()));
+      CHECK(literal.shape().IsArray());
       if (!ShouldEmitLiteralInLlvmIr(literal)) {
         VLOG(3) << "H2D memcpy for constant with shape "
                 << ShapeUtil::HumanString(literal.shape());
@@ -310,12 +310,34 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
         TF_ASSIGN_OR_RETURN(
             const BufferAllocation::Slice slice,
             this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
-        CHECK(!slice.allocation()->is_entry_computation_parameter());
 
         se::DeviceMemoryBase src_base =
             buffer_allocations->GetDeviceAddress(slice.index());
         CHECK(!src_base.is_null() || src_base.size() == 0);
-        *device_memory = src_base;
+        if (!slice.allocation()->is_entry_computation_parameter()) {
+          // If the buffer coming out of the result is from a parameter, it
+          // means the caller aliased some parameter buffer to an output one
+          // (via the HloInputOutputAliasConfig API). If that is the case, the
+          // caller will receive a partially complete scoped shaped buffer,
+          // which they will have to fill up on return.
+          // Unfortunately the interface to the execute APIs are ShapedBuffer
+          // pointer based, which assumes caller ownership, and hence a buffer
+          // coming from there cannot be part of the new ScopedShapedBuffer we
+          // create for the result (which assumes ownership).
+          *device_memory = src_base;
+        } else {
+          const HloInputOutputAliasConfig& input_output_alias =
+              module().input_output_alias_config();
+          auto output_alias = input_output_alias.GetAliasedOutput(
+              slice.allocation()->parameter_number(),
+              slice.allocation()->param_shape_index());
+          CHECK(output_alias)
+              << "Ouput buffer is coming from parameter "
+              << slice.allocation()->parameter_number() << " at index "
+              << slice.allocation()->param_shape_index()
+              << ", but no alias exists";
+          CHECK_EQ(*output_alias, index);
+        }
         buffers_in_result.insert(src_base);
         return Status::OK();
       }));
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 452e763a8eaadc805cd3a3859a68e2a31598fd36..842ba2fdcd31a451cec1be543e102e0a46077f38 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -42,15 +42,13 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
   int64 max_rank = -1;
   const Layout* max_rank_layout;
   for (HloInstruction* param : params) {
-    if (ShapeUtil::IsArray(param->shape()) &&
-        ShapeUtil::Rank(param->shape()) > max_rank) {
-      max_rank = ShapeUtil::Rank(param->shape());
+    if (param->shape().IsArray() && param->shape().rank() > max_rank) {
+      max_rank = param->shape().rank();
       max_rank_layout = &param->shape().layout();
     }
   }
   return absl::c_all_of(params, [&](HloInstruction* param) {
-    return (!ShapeUtil::IsArray(param->shape())) ||
-           (ShapeUtil::Rank(param->shape()) < max_rank) ||
+    return (!param->shape().IsArray()) || (param->shape().rank() < max_rank) ||
            (LayoutUtil::Equal(param->shape().layout(), *max_rank_layout));
   });
 }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index f59da2caa18646676297e66dd329c66fb5fddf1b..58bdd4209a2315cdb7d29e920faded4d1a6a5876 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -196,9 +196,9 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       CHECK_EQ(dim_nums.lhs_batch_dimensions_size(),
                dim_nums.rhs_batch_dimensions_size());
       CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2,
-               ShapeUtil::Rank(instruction->shape()));
+               instruction->shape().rank());
       for (int64 batch_dim : dim_nums.lhs_batch_dimensions()) {
-        CHECK_LT(batch_dim, ShapeUtil::Rank(instruction->shape()) - 2);
+        CHECK_LT(batch_dim, instruction->shape().rank() - 2);
       }
 
       // Set both inputs and the output to default layout.
@@ -215,18 +215,18 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(output_shape, instruction));
     } else if (instruction->opcode() == HloOpcode::kSort &&
-               ShapeUtil::Rank(instruction->operand(0)->shape()) > 1) {
+               instruction->operand(0)->shape().rank() > 1) {
       // Make sure that all the operands and the output(s) have the same layout.
       Shape keys_shape = instruction->operand(0)->shape();
       Layout keys_layout =
-          LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(keys_shape));
+          LayoutUtil::GetDefaultLayoutForRank(keys_shape.rank());
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
         Shape shape = instruction->operand(i)->shape();
         *shape.mutable_layout() = keys_layout;
         TF_RETURN_IF_ERROR(
             constraints->SetOperandLayout(shape, instruction, i));
         const LogicalBuffer* output_buffer;
-        if (ShapeUtil::IsArray(instruction->shape())) {
+        if (instruction->shape().IsArray()) {
           TF_ASSIGN_OR_RETURN(
               output_buffer,
               constraints->points_to_analysis().GetBufferDefinedAt(instruction,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index f3c274429242d5c989146d14ea523b5910408cff..8c6a6914792a96ab517fa5f20ff2215e4785490e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -59,7 +59,7 @@ Status GpuTransferManager::TransferLiteralToInfeed(
 
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       shape, [&](const Shape& literal_subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsArray(literal_subshape)) {
+        if (literal_subshape.IsArray()) {
           int64 tuple_element_size = GetByteSizeRequirement(literal_subshape);
           TF_ASSIGN_OR_RETURN(
               *buffer_tree.mutable_element(index),
@@ -126,13 +126,12 @@ static void ShapeTreeToLiteral(
         ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree,
         ShapeIndex* index) {
       const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index);
-      if (ShapeUtil::IsArray(shape)) {
+      if (shape.IsArray()) {
         (*shape_tree->mutable_element(*index))->WaitUntilAvailable();
         return;
       }
 
-      CHECK(ShapeUtil::IsTuple(shape))
-          << ShapeUtil::HumanStringWithLayout(shape);
+      CHECK(shape.IsTuple()) << ShapeUtil::HumanStringWithLayout(shape);
       const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape);
       index->push_back(0);
       for (int64 i = 0; i < tuple_element_count; ++i) {
@@ -158,7 +157,7 @@ Status GpuTransferManager::TransferLiteralFromOutfeed(
           std::unique_ptr<gpu::OutfeedBuffer>* buffer) {
         const Shape& shape = ShapeUtil::GetSubshape(literal_shape, index);
         // Do not transfer tuple index buffers.
-        if (ShapeUtil::IsTuple(shape)) {
+        if (shape.IsTuple()) {
           return;
         }
         *buffer = absl::make_unique<gpu::OutfeedBuffer>(
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 51627402b45f594dab3480129ba182d54d01b811..69aaaceca112364a4fd562f6a5eff1629fd3fc54 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -45,10 +46,10 @@ void HloToIrBindings::EmitBasePointersForHlos(
 
   // An HLO can have duplicated operands. This data structure remembers which
   // operand HLOs are already bound to avoid rebinding the same HLO.
-  std::set<const HloInstruction*> already_bound_for_this_function;
+  absl::flat_hash_set<const HloInstruction*> already_bound_for_this_function;
   auto arg_iter = function->arg_begin();
   for (const HloInstruction* io_hlo : io_hlos) {
-    if (!already_bound_for_this_function.count(io_hlo)) {
+    if (!already_bound_for_this_function.contains(io_hlo)) {
       if (!is_nested_ && io_hlo->opcode() == HloOpcode::kGetTupleElement) {
         BindHloToIrValue(*io_hlo, EmitGetTupleElement(io_hlo, &*arg_iter));
       } else {
@@ -63,7 +64,7 @@ void HloToIrBindings::EmitBasePointersForHlos(
   temp_buffer_base_->setName("temp_buffer");
 
   for (const HloInstruction* non_io_hlo : non_io_hlos) {
-    if (already_bound_for_this_function.count(non_io_hlo)) {
+    if (already_bound_for_this_function.contains(non_io_hlo)) {
       continue;
     }
     already_bound_for_this_function.insert(non_io_hlo);
@@ -280,7 +281,7 @@ string HloToIrBindings::ToString() const {
       StrAppend(&s, "    ", instr->ToString());
 
       const ShapeTree<llvm::Value*>& shape_tree = it->second;
-      if (!ShapeUtil::IsTuple(instr->shape())) {
+      if (!instr->shape().IsTuple()) {
         const llvm::Value* val = shape_tree.begin()->second;
         StrAppend(&s, " -> ", llvm_ir::DumpToString(*val), "\n");
         continue;
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index c0edae530cedba45c897b07b7b9cc72eaaab397c..f57b594e9c18078a3bbbf4d2b4db7e989c4edfdd 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
@@ -61,7 +62,7 @@ class HloToIrBindings {
 
   // Returns whether `hlo` is bound to an LLVM IR value.
   bool BoundToIrValue(const HloInstruction& hlo) const {
-    return base_ptrs_.count(&hlo);
+    return base_ptrs_.contains(&hlo);
   }
 
   llvm::Value* GetTempBufferBase() const { return temp_buffer_base_; }
@@ -110,7 +111,8 @@ class HloToIrBindings {
   // For an instruction that generates multiple outputs, the root will be a
   // tuple shape. The IrArray for each element output is stored in the subnode
   // in the ShapeTree.
-  std::unordered_map<const HloInstruction*, ShapeTree<llvm::Value*>> base_ptrs_;
+  absl::flat_hash_map<const HloInstruction*, ShapeTree<llvm::Value*>>
+      base_ptrs_;
 
   // The address of the memory block that contains all temporary buffers.
   llvm::Value* temp_buffer_base_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index 8c3a026740851767855beae59d6a3c92f7a0d6bd..676380c3b10f9a20c641eea0d9a948a26becaddc 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -36,6 +36,21 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
   ShapeTree<InfeedBuffer> infeed_buffers =
       GetOrCreateInfeedManager()->BlockingGetNextDestination();
 
+  // infeed_slices_'s shape should be a tuple of shape (buffers, token).
+  const auto& infeed_shape = infeed_slices_.shape();
+  TF_RET_CHECK(infeed_shape.IsTuple())
+      << ShapeUtil::HumanStringWithLayout(infeed_shape);
+  TF_RET_CHECK(infeed_shape.tuple_shapes().size() == 2)
+      << ShapeUtil::HumanStringWithLayout(infeed_shape);
+  TF_RET_CHECK(infeed_shape.tuple_shapes(1).IsToken())
+      << ShapeUtil::HumanStringWithLayout(infeed_shape);
+  TF_RET_CHECK(
+      ShapeUtil::Equal(infeed_buffers.shape(), infeed_shape.tuple_shapes(0)))
+      << "Expected infeed of shape "
+      << ShapeUtil::HumanStringWithLayout(infeed_shape.tuple_shapes(0))
+      << " but was "
+      << ShapeUtil::HumanStringWithLayout(infeed_buffers.shape());
+
   {
     // The infeed buffer has an extra outer tuple with a token. Adjust the index
     // accordingly.
@@ -45,7 +60,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
           const Shape& shape = ShapeUtil::GetSubshape(infeed_buffers.shape(),
                                                       ShapeIndexView(index, 1));
           // For the leaf buffers of the tuple copy the elements directly.
-          if (ShapeUtil::IsArray(shape)) {
+          if (shape.IsArray()) {
             const BufferAllocation::Slice& tuple_element_buffer =
                 infeed_slices_.element(index);
             se::DeviceMemoryBase tuple_element_address =
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 6151dd8ff4c92bb81bd756c68cc9377633c8c9d5..f07141029cbf8b034b74548f6fca8f1628589f0c 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -282,22 +282,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
 
 bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
                                                      int64 operand_index) {
-  const HloInstruction* producer = consumer->operand(operand_index);
-  // The IR emitter has limited support for non-loop fusions with multi output
-  // at present.
-  // TODO(tjoerg): Relax this constraint to allow for arbitraty kinds of fusion.
-  if (consumer->opcode() == HloOpcode::kFusion &&
-      consumer->fusion_kind() != HloInstruction::FusionKind::kLoop) {
-    return false;
-  }
-  // Multi-output fusion requires instructions with compatible shapes.
-  if (!ShapeUtil::Compatible(producer->shape(), consumer->shape())) {
-    return false;
-  }
-  // TODO(tjoerg): Stop calling `ShouldFuse` to relax the criteria for
-  // multi-output fusion. In particular, do not check whether an instruction is
-  // expensive to duplicate, since this doesn't matter here.
-  return GpuInstructionFusion::ShouldFuse(consumer, operand_index);
+  return false;
 }
 
 HloInstruction::FusionKind GpuInstructionFusion::ChooseKind(
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 688604cd36e5a45debf855aacd29d05ecda92341..a05ab86cf77a134a1fc387d93cb482aa1ff5345b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -506,202 +506,11 @@ TEST_F(InstructionFusionTest, MultiOutputFusion) {
     })")
                     .ValueOrDie();
 
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(
-      fusion->fused_expression_root(),
-      op::Tuple(op::Add(op::Subtract(), op::Parameter()), op::Subtract()));
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) {
-  // tanh --> add --> tuple
-  //  \---------------/
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     tanh = f32[4,3]{1,0} tanh(p0)
-     add = f32[4,3]{1,0} add(tanh, p1)
-     ROOT tuple = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(tanh, add)
-    })")
-                    .ValueOrDie();
-
-  // TODO(tjoerg): Allow multi-output fusion for expensive operations like tanh.
+  // Multi-output fusion is disabled here and performed in the
+  // GpuMultiOutputFusion pass instead.
   ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
                    .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusion2) {
-  // sub --> add1 --\--------\
-  //  \----------> add2 --> tuple
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[4,3]{1,0} parameter(2)
-     sub = f32[4,3]{1,0} subtract(p0, p2)
-     add1 = f32[4,3]{1,0} add(sub, p1)
-     add2 = f32[4,3]{1,0} add(sub, add1)
-     ROOT tuple = (f32[4,3]{1,0}) tuple(add1, add2)
-    })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(fusion->fused_expression_root(),
-              op::Tuple(op::Add(op::Subtract(), op::Add()),
-                        op::Add(op::Subtract(), op::Parameter())));
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusion3) {
-  // sub --> add1 ----\--------\
-  //  \ --> add2 --> add3 --> tuple
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[4,3]{1,0} parameter(2)
-     p3 = f32[4,3]{1,0} parameter(3)
-     sub = f32[4,3]{1,0} subtract(p0, p2)
-     add1 = f32[4,3]{1,0} add(sub, p1)
-     add2 = f32[4,3]{1,0} add(p2, sub)
-     add3 = f32[4,3]{1,0} add(add1, add2)
-     ROOT tuple = (f32[4,3]{1,0}) tuple(add3, add2)
-    })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(fusion->fused_expression_root(),
-              op::Tuple(op::Add(op::Add(), op::Add()),
-                        op::Add(op::Parameter(), op::Subtract())));
-}
-
-TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) {
-  // sub --> mul ---\
-  //  \--> call --> add --> tuple
-  auto module = ParseHloString(R"(
-  HloModule test_module
-  ENTRY OutputFusion {
-    c = f32[] constant(42)
-    p0 = f32[4,3]{1,0} parameter(0)
-    p1 = f32[4,3]{1,0} parameter(1)
-    sub = f32[4,3]{1,0} subtract(p0, p1)
-    mul = f32[4,3]{1,0} multiply(sub, c)
-    call = f32[4,3]{1,0} custom-call(sub), custom_call_target="foo"
-    add = f32[4,3]{1,0} add(mul, call)
-    ROOT tuple = (f32[4,3]{1,0}) tuple(add)
-  })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  // Visit instructions in post order to detect cycles.
-  // TODO(tjoerg): Add cycle detection to the HloVerifier.
-  class DummyVisitor : public DfsHloVisitorWithDefault {
-   public:
-    DummyVisitor() {}
-    Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
-      return Status::OK();
-    }
-  } visitor;
-  for (const HloComputation* computation : module->MakeComputationPostOrder()) {
-    // Accept will return a FailedPrecondition when a cycle is detected.
-    EXPECT_TRUE(computation->root_instruction()->Accept(&visitor).ok());
-  }
-}
-
-TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) {
-  // sub[2,3] --> add[4,3] --> tuple([2,3], [4,3])
-  //  \-------------------------/
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[2,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[2,3]{1,0} parameter(2)
-     sub = f32[2,3]{1,0} subtract(p0, p2)
-     add = f32[4,3]{1,0} add(sub, p1)
-     ROOT tuple = (f32[2,3]{1,0}, f32[4,3]{1,0}) tuple(sub, add)
-    })")
-                    .ValueOrDie();
-
-  // Multi-output fusion requires shapes to be compatible. Since `sub` and `add`
-  // have incompatible shapes, expect that no multi-output fusion happens.
-  ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
-}
-
-TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) {
-  auto module = ParseHloString(R"(
-  HloModule test_module
-
-  add_computation {
-    add_lhs = f32[] parameter(0)
-    add_rhs = f32[] parameter(1)
-    ROOT add_root = f32[] add(add_lhs, add_rhs)
-  }
-
-  fused_computation {
-    p1 = f32[10] parameter(0)
-    zero = f32[] constant(0)
-    ROOT f2_root = f32[] reduce(p1, zero), dimensions={0},
-           to_apply=add_computation
-  }
-
-  ENTRY entry {
-    p0 = f32[10] parameter(0)
-    mul = f32[10] multiply(p0, p0)
-    fusion = f32[] fusion(mul), kind=kInput, calls=fused_computation
-    ROOT tuple = (f32[10], f32[]) tuple(fusion, mul)
-  })")
-                    .ValueOrDie();
-
-  // Multi-output fusion is not supported for non-loop fusions at present. Since
-  // `fused_computation` is a input fusion, expect no multi-output fusion to
-  // happen.
-  ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
+                   .ValueOrDie());
 }
 
 TEST_F(InstructionFusionTest, FuseScalarConstant) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 33e41a2782b5932430eea621d3cea2c6634f292f..82bdd677d96d3d0826bb4127b32d074eb632b1a3 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -40,7 +40,7 @@ namespace {
 
 // Return whether the given shape is rank 2 excluding the batch dimensions.
 bool IsRank2(const Shape& shape, int64 batch_dimensions_size) {
-  return ShapeUtil::Rank(shape) == batch_dimensions_size + 2;
+  return shape.rank() == batch_dimensions_size + 2;
 }
 
 // In a gemm operation where output = lhs * rhs, check whether the given shapes
@@ -54,7 +54,8 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   PrimitiveType output_primitive_type = output_shape.element_type();
   bool type_is_allowed =
       (output_primitive_type == F16 || output_primitive_type == F32 ||
-       output_primitive_type == F64 || output_primitive_type == C64);
+       output_primitive_type == F64 || output_primitive_type == C64 ||
+       output_primitive_type == C128);
   return type_is_allowed && IsRank2(lhs_shape, batch_dimensions_size) &&
          IsRank2(rhs_shape, batch_dimensions_size) &&
          IsRank2(output_shape, batch_dimensions_size) &&
@@ -154,20 +155,17 @@ bool IsReductionToVector(const HloInstruction& reduce) {
   const HloInstruction* input = reduce.operand(0);
   std::vector<int64> dims_to_keep;
   for (int64 dim = 0; dim < input->shape().dimensions().size(); ++dim) {
-    if (!std::count(reduce.dimensions().begin(), reduce.dimensions().end(),
-                    dim)) {
+    if (!absl::c_linear_search(reduce.dimensions(), dim)) {
       dims_to_keep.push_back(dim);
     }
   }
   return LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),
                                               dims_to_keep) &&
-         ShapeUtil::Equal(reduce.shape(), ShapeUtil::FilterDimensions(
-                                              [&dims_to_keep](int64 dim) {
-                                                return std::count(
-                                                    dims_to_keep.begin(),
-                                                    dims_to_keep.end(), dim);
-                                              },
-                                              input->shape()));
+         ShapeUtil::Equal(
+             reduce.shape(),
+             ShapeUtil::FilterDimensions(
+                 [&](int64 dim) { return absl::c_count(dims_to_keep, dim); },
+                 input->shape()));
 }
 
 // This emits a device-side call to
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 22db38ee03b9990cc2f21a01b6c0f2249d0991ea..0007a9a8a3369d8ac010640127e1561615a6d813 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -430,7 +430,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   auto on_false = tuple_select->operand(2);
   TF_RET_CHECK(pred->shape().element_type() == PRED);
   TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()));
-  TF_RET_CHECK(ShapeUtil::IsTuple(tuple_select->shape()));
+  TF_RET_CHECK(tuple_select->shape().IsTuple());
   llvm_ir::EmitTupleSelect(GetIrArray(*tuple_select, *tuple_select),
                            GetIrArray(*pred, *tuple_select),
                            GetBasePointer(*on_true), GetBasePointer(*on_false),
@@ -648,7 +648,7 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
   // TODO(b/112040122): Support variadic reduce.
-  if (!ShapeUtil::IsArray(reduce->shape())) {
+  if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on GPU");
   }
   auto arg = reduce->operand(0);
@@ -783,7 +783,7 @@ StatusOr<llvm::Value*> IrEmitter::ComputeNestedElement(
 std::vector<llvm_ir::IrArray> IrEmitter::ConstructIrArrayForOutputs(
     const HloInstruction& hlo) {
   std::vector<llvm_ir::IrArray> output_arrays;
-  if (ShapeUtil::IsTuple(hlo.shape())) {
+  if (hlo.shape().IsTuple()) {
     int64 num_outputs = ShapeUtil::TupleElementCount(hlo.shape());
     output_arrays.reserve(num_outputs);
     for (int64 i = 0; i < num_outputs; ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 1472853dc443f0190c3bbed7f96c91ec65ae6dda..294a454931b5cfa368bf094c428a1e942f4556b8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstring>
+#include <iterator>
 #include <memory>
 #include <string>
 #include <vector>
@@ -88,6 +89,9 @@ namespace xla {
 namespace gpu {
 
 using llvm_ir::KernelMappingScheme;
+using EmitElementFunction =
+    std::function<void(const llvm_ir::IrArray::Index& index, llvm::Value* y_loc,
+                       llvm::Value* x_loc, int64 x_iter_num)>;
 
 namespace {
 
@@ -292,13 +296,12 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
 
   auto shape_in_range = [&](const Shape& s) {
     bool in_range = true;
-    ShapeUtil::ForEachSubshape(
-        s, [&](const Shape& sub_shape, const ShapeIndex& /*index*/) {
-          if (ShapeUtil::IsArray(sub_shape) &&
-              !IsInt32(ShapeUtil::ElementsIn(sub_shape))) {
-            in_range = false;
-          }
-        });
+    ShapeUtil::ForEachSubshape(s, [&](const Shape& sub_shape,
+                                      const ShapeIndex& /*index*/) {
+      if (sub_shape.IsArray() && !IsInt32(ShapeUtil::ElementsIn(sub_shape))) {
+        in_range = false;
+      }
+    });
 
     return in_range;
   };
@@ -542,8 +545,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         // HandleFusion specializes reduction from a multi-dimensional array to
         // a 1D array. The specialized version requires a initializer thunk that
         // initializes the output array to the initial value of the reduce.
-        if (root->opcode() == HloOpcode::kReduce &&
-            ShapeUtil::IsTuple(root->shape())) {
+        if (root->opcode() == HloOpcode::kReduce && root->shape().IsTuple()) {
           // TODO(b/112040122): Support variadic reduce.
           return Unimplemented("Variadic reduce is not supported on GPU");
         }
@@ -634,7 +636,7 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
 
 Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
   // TODO(b/112040122): Support multi-output reduce.
-  if (!ShapeUtil::IsArray(reduce->shape())) {
+  if (!reduce->shape().IsArray()) {
     return Unimplemented("Multi-output reduce is not supported on GPU");
   }
   if (IsReductionToVector(*reduce)) {
@@ -698,8 +700,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   const auto* source = select_and_scatter->operand(1);
   const Window& window = select_and_scatter->window();
   PrimitiveType operand_element_type = operand->shape().element_type();
-  const int64 rank = ShapeUtil::Rank(operand->shape());
-  CHECK_EQ(rank, ShapeUtil::Rank(source->shape()));
+  const int64 rank = operand->shape().rank();
+  CHECK_EQ(rank, source->shape().rank());
   CHECK_EQ(rank, window.dimensions_size());
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
@@ -1015,7 +1017,7 @@ Status IrEmitterUnnested::EmitScatter(
     int64 raw_window_multidim_idx = 0;
     std::vector<llvm::Value*> input_window_multidim;
     std::vector<int64> input_window_bounds;
-    for (int64 i = 0, e = ShapeUtil::Rank(operand->shape()); i != e; ++i) {
+    for (int64 i = 0, e = operand->shape().rank(); i != e; ++i) {
       if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) {
         input_window_bounds.push_back(1);  // Trivial dimension.
         input_window_multidim.push_back(index.GetConstantWithIndexType(0));
@@ -1027,12 +1029,11 @@ Status IrEmitterUnnested::EmitScatter(
         ++raw_window_multidim_idx;
       }
     }
-    DCHECK_EQ(input_window_multidim.size(), ShapeUtil::Rank(operand->shape()));
+    DCHECK_EQ(input_window_multidim.size(), operand->shape().rank());
 
     // Insert a 1 dimension at the end if index_vector_dim requests one.
     Shape scatter_indices_shape = scatter_indices->shape();
-    if (dim_numbers.index_vector_dim() ==
-        ShapeUtil::Rank(scatter_indices_shape)) {
+    if (dim_numbers.index_vector_dim() == scatter_indices_shape.rank()) {
       scatter_indices_shape.add_dimensions(1);
       scatter_indices_shape.mutable_layout()->add_minor_to_major(
           dim_numbers.index_vector_dim());
@@ -1310,7 +1311,7 @@ Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) {
   // HloModuleConfig::num_replicas changes between when the module is compiled
   // and when it's run.
   if (crs->operand_count() == 1) {
-    CHECK(ShapeUtil::IsArray(crs->operand(0)->shape()))
+    CHECK(crs->operand(0)->shape().IsArray())
         << "Operands to all-reduce must be arrays: " << crs->ToString();
     AddThunkToThunkSequence(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*crs->operand(0)),
@@ -1509,10 +1510,10 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
                     return !allocation->is_constant();
                   });
 
-  std::sort(non_constant_buffers.begin(), non_constant_buffers.end(),
-            [](const BufferAllocation* a, const BufferAllocation* b) {
-              return a->index() < b->index();
-            });
+  absl::c_sort(non_constant_buffers,
+               [](const BufferAllocation* a, const BufferAllocation* b) {
+                 return a->index() < b->index();
+               });
 
   llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers);
 
@@ -2080,12 +2081,36 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   return Status::OK();
 }
 
+namespace {
+
+// Returns true if the fusion contains any instruction that is likely
+// translated to complex LLVM IR, such as loops, and prevent vectorization.
+bool MayPreventVectorization(const HloInstruction& fusion_hlo) {
+  CHECK_EQ(fusion_hlo.opcode(), HloOpcode::kFusion);
+  return absl::c_any_of(
+      fusion_hlo.fused_instructions_computation()->instructions(),
+      [&](const HloInstruction* instr) {
+        switch (instr->opcode()) {
+          case HloOpcode::kReduce:
+          case HloOpcode::kReduceWindow:
+          case HloOpcode::kSort:
+          case HloOpcode::kDot:
+            return true;
+          default:
+            return false;
+        }
+      });
+}
+
+}  // namespace
+
 Status IrEmitterUnnested::EmitTargetElementLoop(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator) {
   int unroll_factor = 1;
   // Unfused elementwise operations are usually memory bound, unroll them.
-  if (hlo.IsElementwise() || hlo.opcode() == HloOpcode::kFusion) {
+  if (hlo.IsElementwise() ||
+      (hlo.opcode() == HloOpcode::kFusion && !MayPreventVectorization(hlo))) {
     unroll_factor = ComputeMaxUnrollFactor(&hlo);
   }
 
@@ -2136,53 +2161,86 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
 
 namespace {
 
-void EmitFullElementalTile(
-    const KernelMappingScheme* mapping_scheme,
-    const IrArray::Index& tile_origin_index, const string& loop_name,
-    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
-    llvm::Value* x, llvm::Type* index_ty,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+std::tuple<llvm::Value*, int64> GetStartOffsetAndStepForX(
+    int64 tile_size_x, int64 num_threads_x,
+    const KernelMappingScheme* mapping_scheme, llvm::IRBuilder<>* builder,
+    llvm::Value* x, llvm::Type* index_ty) {
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  if (mapping_scheme->DilatedX()) {
+    start_offset_x = x;
+    step_x = num_threads_x;
+  } else {
+    start_offset_x = builder->CreateMul(
+        x, llvm::ConstantInt::get(index_ty, tile_size_x / num_threads_x));
+    step_x = 1;
+  }
+  return std::make_tuple(start_offset_x, step_x);
+}
+
+void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme,
+                           const IrArray::Index& tile_origin_index,
+                           const string& loop_name, KernelSupportLibrary* ksl,
+                           llvm::IRBuilder<>* builder, llvm::Value* y,
+                           llvm::Value* x, llvm::Type* index_ty,
+                           const EmitElementFunction& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
+
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX(
+      tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty);
+  IrArray::Index source_idx =
+      tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder)
+          .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder);
   ksl->For(loop_name + "_y", /*start=*/llvm::ConstantInt::get(index_ty, 0),
            /*end=*/llvm::ConstantInt::get(index_ty, tile_size_y),
            /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
            [&](llvm::Value* y_indvar) {
-             IrArray::Index source_idx_y = tile_origin_index.AddOffsetToDim(
+             IrArray::Index source_idx_y = source_idx.AddOffsetToDim(
                  y_indvar, KernelMappingScheme::DimY, builder);
              llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
-             for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-               IrArray::Index source_idx = source_idx_y.AddOffsetToDim(
-                   llvm::ConstantInt::get(index_ty, j),
+
+             for (int64 j = 0; j < tile_size_x / num_threads_x; j++) {
+               IrArray::Index source_idx_y_x = source_idx_y.AddOffsetToDim(
+                   llvm::ConstantInt::get(index_ty, j * step_x),
                    KernelMappingScheme::DimX, builder);
-               llvm::Value* x_loc =
-                   builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
-               emit_elem_function(source_idx, y_loc, x_loc);
+               llvm::Value* x_loc = builder->CreateAdd(
+                   llvm::ConstantInt::get(index_ty, j * step_x),
+                   start_offset_x);
+               emit_elem_function(source_idx_y_x, y_loc, x_loc, j);
              }
            });
 }
 
-void EmitPartialElementalTile(
-    const KernelMappingScheme* mapping_scheme,
-    const IrArray::Index& tile_origin_index, const string& loop_name,
-    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
-    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
-    llvm::Type* index_ty,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+void EmitPartialElementalTile(const KernelMappingScheme* mapping_scheme,
+                              const IrArray::Index& tile_origin_index,
+                              const string& loop_name,
+                              KernelSupportLibrary* ksl,
+                              llvm::IRBuilder<>* builder, llvm::Value* y,
+                              llvm::Value* x, llvm::Value* tile_height,
+                              llvm::Value* tile_width, llvm::Type* index_ty,
+                              const EmitElementFunction& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
 
-  for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-    IrArray::Index source_idx =
-        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
-                                         KernelMappingScheme::DimX, builder);
-    llvm::Value* x_loc =
-        builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX(
+      tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty);
+  IrArray::Index source_idx =
+      tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder)
+          .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder);
+  for (int64 j = 0; j < tile_size_x / num_threads_x; j++) {
+    IrArray::Index source_idx_x =
+        source_idx.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j * step_x),
+                                  KernelMappingScheme::DimX, builder);
+    llvm::Value* x_loc = builder->CreateAdd(
+        llvm::ConstantInt::get(index_ty, j * step_x), start_offset_x);
 
     ksl->If(
         loop_name + "_x_in_tile", builder->CreateICmpULT(x_loc, tile_width),
@@ -2202,14 +2260,13 @@ void EmitPartialElementalTile(
               /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
               [&](llvm::Value* y_indvar) {
                 llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
-                ksl->If(
-                    loop_name + "_y_in_tile",
-                    builder->CreateICmpULT(y_loc, tile_height), [&] {
-                      emit_elem_function(
-                          source_idx.AddOffsetToDim(
-                              y_indvar, KernelMappingScheme::DimY, builder),
-                          y_loc, x_loc);
-                    });
+                ksl->If(loop_name + "_y_in_tile",
+                        builder->CreateICmpULT(y_loc, tile_height), [&] {
+                          emit_elem_function(
+                              source_idx_x.AddOffsetToDim(
+                                  y_indvar, KernelMappingScheme::DimY, builder),
+                              y_loc, x_loc, j);
+                        });
               });
         });
   }
@@ -2228,8 +2285,7 @@ void EmitTiledElementalCodeWithBoundsCheck(
     const IrArray::Index& tile_origin_index, const string& loop_name,
     KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
     llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+    const EmitElementFunction& emit_elem_function) {
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
@@ -2265,7 +2321,7 @@ void EmitTiledElementalCodeWithBoundsCheck(
 void IrEmitterUnnested::EmitTileElementForCopy(
     HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 /*x_iter_num*/) {
   llvm_ir::TiledParameterInfo* tiled_param_info =
       kernel_info->GetTiledParameterInfo();
   // TODO(jlebar): Add AA metadata to this load.
@@ -2295,7 +2351,7 @@ void IrEmitterUnnested::EmitTileElementForCopy(
 void IrEmitterUnnested::EmitTileElementForFusion(
     HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 /*x_iter_num*/) {
   llvm_ir::TiledParameterInfo* tiled_param_info =
       kernel_info->GetTiledParameterInfo();
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(*hlo);
@@ -2396,6 +2452,23 @@ class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo {
                             : llvm_ir::KernelMappingScheme::DimX;
   }
 
+  int GetNumberOfPartialResults() const {
+    if (IsRowReduction()) {
+      return 1;
+    }
+    int64 num_thread = mapping_scheme_->GetNumberOfThreadsForDimensionX();
+    int64 tile_size = mapping_scheme_->GetTileSizeForDimensionX();
+    CHECK_EQ(tile_size % num_thread, 0);
+    return tile_size / num_thread;
+  }
+
+  int GetPartialResultIndex(int64 x_iter_num) const {
+    if (IsRowReduction()) {
+      return 0;
+    }
+    return x_iter_num;
+  }
+
  private:
   AddressVector partial_result_addresses_;
   AddressVector reduction_input_addresses_;
@@ -2455,10 +2528,11 @@ void IrEmitterUnnested::EmitPrologueForOneReduction(
   llvm::AllocaInst* reduction_input_address = Alloca(element_type);
   reduction_input_addresses->push_back(reduction_input_address);
 
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
   AddressVector* partial_result_addresses =
       reduction_info->GetMutablePartialResultAddresses();
   llvm::AllocaInst* partial_result_address =
-      Alloca(element_type, /*ArraySize=*/nullptr,
+      Alloca(element_type, /*ArraySize=*/b_.getInt32(num_partial_results),
              "partial_reduction_result." + llvm::Twine(reduce_idx));
   partial_result_addresses->push_back(partial_result_address);
 
@@ -2481,7 +2555,9 @@ void IrEmitterUnnested::EmitPrologueForOneReduction(
             .EmitReadArrayElement(IrArray::Index(b_.getInt32Ty()), &b_);
   }
 
-  Store(init_ir_value, partial_result_address);
+  for (int i = 0; i < num_partial_results; ++i) {
+    Store(init_ir_value, InBoundsGEP(partial_result_address, {b_.getInt32(i)}));
+  }
 }
 
 void IrEmitterUnnested::EmitPrologueForReduction(
@@ -2519,10 +2595,14 @@ void IrEmitterUnnested::EmitPrologueForReduction(
                                 std::move(output_shape_index));
   }
 
-  // Allocate stack storage to store the current output linear index and record
-  // the address of the storage.
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+
+  // Allocate stack storage to store the linear indices for the current output,
+  // and record the address of the storage.
   reduction_info->SetCurrentOutputLinearIndexAddress(
-      Alloca(reduction_info->GetIndexType()));
+      Alloca(reduction_info->GetIndexType(),
+             /*ArraySize=*/b_.getInt32(num_partial_results),
+             "current_output_linear_index_address"));
 
   if (!reduction_info->IsRowReduction()) {
     llvm::Type* bool_ty = b_.getInt1Ty();
@@ -2592,36 +2672,45 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
     llvm_ir::SetToFirstInsertPoint(if_output_inbound_data.true_block, &b_);
   }
 
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+
   // Emit an atomic operation that accumulates the partial reduction to the
   // output element. For row reduction, this is only for lane 0 due to the
   // if-statement emitted above.
   for (int i = 0; i != num_reduces; ++i) {
-    IrArray::Index element_index(
-        /*linear=*/Load(reduction_info->GetCurrentOutputLinearIndexAddress(),
-                        "output_linear_addr"),
-        ShapeUtil::GetSubshape(unnested_hlo->shape(),
-                               reduction_output_shape_indices[i]),
-        &b_);
-    llvm::Value* output_address =
-        GetIrArray(*unnested_hlo, *unnested_hlo,
-                   reduction_output_shape_indices[i])
-            .EmitArrayElementAddress(element_index, &b_,
-                                     "output_element_address");
-    // Do not emit atomic operations if each element in the reduction result is
-    // computed by one block, that is the dimension being reduced has only one
-    // block.
-    const llvm_ir::KernelMappingScheme* mapping_scheme =
-        reduction_info->GetKernelMappingScheme();
-    if (mapping_scheme->GetTileBlockSizeForDimension(
-            llvm_ir::KernelMappingScheme::DimZ) == 1 &&
-        mapping_scheme->GetTileBlockSizeForDimension(
-            reduction_info->GetReducedDimensionEnum()) == 1) {
-      TF_CHECK_OK(EmitCallToNestedComputation(
-          *reducers[i], {output_address, partial_result_addresses[i]},
-          output_address));
-    } else {
-      TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
-          *reducers[i], output_address, partial_result_addresses[i]));
+    for (int j = 0; j < num_partial_results; ++j) {
+      IrArray::Index element_index(
+          /*linear=*/Load(
+              InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(),
+                          {b_.getInt32(j)}),
+              "output_linear_addr"),
+          ShapeUtil::GetSubshape(unnested_hlo->shape(),
+                                 reduction_output_shape_indices[i]),
+          &b_);
+      llvm::Value* output_address =
+          GetIrArray(*unnested_hlo, *unnested_hlo,
+                     reduction_output_shape_indices[i])
+              .EmitArrayElementAddress(element_index, &b_,
+                                       "output_element_address");
+      // Do not emit atomic operations if each element in the reduction result
+      // is computed by one block, that is the dimension being reduced has only
+      // one block.
+      const llvm_ir::KernelMappingScheme* mapping_scheme =
+          reduction_info->GetKernelMappingScheme();
+      if (mapping_scheme->GetTileBlockSizeForDimension(
+              llvm_ir::KernelMappingScheme::DimZ) == 1 &&
+          mapping_scheme->GetTileBlockSizeForDimension(
+              reduction_info->GetReducedDimensionEnum()) == 1) {
+        TF_CHECK_OK(EmitCallToNestedComputation(
+            *reducers[i],
+            {output_address,
+             InBoundsGEP(partial_result_addresses[i], {b_.getInt32(j)})},
+            output_address));
+      } else {
+        TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
+            *reducers[i], output_address,
+            InBoundsGEP(partial_result_addresses[i], {b_.getInt32(j)})));
+      }
     }
   }
 }
@@ -2629,7 +2718,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
 void IrEmitterUnnested::EmitTileElementForReduction(
     HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 x_iter_num) {
   VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString();
   HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion
                                         ? unnested_hlo->fused_expression_root()
@@ -2642,8 +2731,11 @@ void IrEmitterUnnested::EmitTileElementForReduction(
   // Record the linear address for the current reduction.
   const ReductionCodegenInfo* reduction_info =
       dynamic_cast<const ReductionCodegenInfo*>(kernel_info);
+  int partial_result_index = reduction_info->IsRowReduction() ? 0 : x_iter_num;
+
   Store(index[reduction_info->GetKeptDimensionEnum()],
-        reduction_info->GetCurrentOutputLinearIndexAddress());
+        InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(),
+                    {b_.getInt32(partial_result_index)}));
   if (!reduction_info->IsRowReduction()) {
     llvm::Type* bool_ty = b_.getInt1Ty();
     llvm::AllocaInst* output_inbound_addr =
@@ -2690,6 +2782,13 @@ void IrEmitterUnnested::EmitTileElementForReduction(
       reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
           index,
           GetFirstReduceInstruction(output_instructions)->operand(0)->shape());
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+  if (num_partial_results > 1) {
+    // Clear the linear index field of the IrArray::Index to enable the use of
+    // GetElementPointer with array types. This enables the vectorization of
+    // the computation for different partial results.
+    input_index.ClearLinearIndex();
+  }
   absl::Span<llvm::AllocaInst* const> partial_reduction_result_addresses =
       reduction_info->GetPartialResultAddresses();
   absl::Span<llvm::AllocaInst* const> reduction_input_addresses =
@@ -2702,10 +2801,12 @@ void IrEmitterUnnested::EmitTileElementForReduction(
   for (int i = 0; i != reducers.size(); ++i) {
     llvm::Value* const input_ir_value = input_gens[i](input_index).ValueOrDie();
     Store(input_ir_value, reduction_input_addresses[i]);
+    llvm::Value* partial_result_address =
+        InBoundsGEP(partial_reduction_result_addresses[i],
+                    {b_.getInt32(partial_result_index)});
     TF_CHECK_OK(EmitCallToNestedComputation(
-        *reducers[i],
-        {partial_reduction_result_addresses[i], reduction_input_addresses[i]},
-        partial_reduction_result_addresses[i]));
+        *reducers[i], {partial_result_address, reduction_input_addresses[i]},
+        partial_result_address));
   }
 
   // Emit code to generate the output for the non-reduction instructions in the
@@ -2716,8 +2817,8 @@ void IrEmitterUnnested::EmitTileElementForReduction(
 
 // Emits a kernel for the hlo instruction using the given tiling scheme.
 void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
-                                  const KernelCodegenInfo* kernel_info,
-                                  KernelSupportLibrary& ksl,
+                                  KernelCodegenInfo* kernel_info,
+                                  KernelSupportLibrary* ksl,
                                   llvm::Type* index_ty) {
   KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
   absl::Span<const int64> dims_in_tile = mapping_scheme->GetDimensionsInTiles();
@@ -2750,15 +2851,14 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
           llvm::Value* num_tiles_in_block =
               Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
                      last_block_size_for_dim, block_size_for_dim);
-
-          ksl.For(loop_name,
-                  /*start=*/index_typed_constant(0),
-                  /*end=*/num_tiles_in_block,
-                  /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
-                    IrArray::Index tile_index = starting_tile.AddOffsetToDim(
-                        block_dim_induction_var, dim_id, &b_);
-                    emit_next_block_dim(tile_index);
-                  });
+          ksl->For(loop_name,
+                   /*start=*/index_typed_constant(0),
+                   /*end=*/num_tiles_in_block,
+                   /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                     IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                         block_dim_induction_var, dim_id, &b_);
+                     emit_next_block_dim(tile_index);
+                   });
         }
       };
 
@@ -2813,7 +2913,8 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
 // unnested_hlo: The unnested hlo instruction for which the kernel is generated.
 //   Currently, these hlo instructions are supported: kLoop fusion, kCopy.
 // tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of
-//   other tensors with the same dimensions and need to be tiled and tranposed.
+//   other tensors with the same dimensions and are safe to be tranposed via
+//   the shared memory tranpose implementation.
 // mapping_scheme: The tiling scheme to use.
 // kernel_generator: Contains function objects for code generation, such as
 //   element generator, block prologue and epilogue generators.
@@ -2901,8 +3002,7 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
   auto emit_tiled_elemental_code_with_bounds_check =
       [&](const IrArray::Index& index, const string& loop_name,
           llvm::Value* tile_height, llvm::Value* tile_width,
-          const std::function<void(const IrArray::Index&, llvm::Value*,
-                                   llvm::Value*)>& emit_elem_function) {
+          const EmitElementFunction& emit_elem_function) {
         EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name,
                                               &ksl, &b_, y, x, tile_height,
                                               tile_width, emit_elem_function);
@@ -2915,10 +3015,6 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     const IrArray::Index input_tile_origin(
         Permute({0, 2, 1}, output_tile_origin.multidim()));
 
-    const IrArray::Index input_index =
-        input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
-            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
-
     // If shared memory transpose is needed, wait for all threads to reach this
     // point, lest we copy a value from tile to output before the other thread
     // copies it from input to tile. This is `__syncthreads` in CUDA.
@@ -2928,9 +3024,10 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
       // Note that tile_width and tile_height are flipped here because we are
       // reading a transposed tile.
       emit_tiled_elemental_code_with_bounds_check(
-          input_index, "input", output_tile_bounds[2], output_tile_bounds[1],
+          input_tile_origin, "input", output_tile_bounds[2],
+          output_tile_bounds[1],
           [&](const IrArray::Index& index, llvm::Value* y_loc,
-              llvm::Value* x_loc) {
+              llvm::Value* x_loc, int64 /*x_iter_num*/) {
             for (int64 id : tiled_param_ids) {
               IrArray& input_in_logical_shape =
                   param_in_reduced_shape_arrays[id];
@@ -2950,18 +3047,15 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
     kernel_info->SetTiledParamInfo(&tiled_param_info);
 
-    const IrArray::Index output_index =
-        output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
-            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
-
     // Write to output[index] by emitting code like normal, except that values
     // for the tiled parameters are read from the shmem buffers.
     emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[1], output_tile_bounds[2],
-        [&](const IrArray::Index& index, llvm::Value* y_loc,
-            llvm::Value* x_loc) {
-          kernel_generator.GetTileElementGenerator()(unnested_hlo, index,
-                                                     kernel_info, y_loc, x_loc);
+        output_tile_origin, "output", output_tile_bounds[1],
+        output_tile_bounds[2],
+        [&](const IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc,
+            int64 x_iter_num) {
+          kernel_generator.GetTileElementGenerator()(
+              unnested_hlo, index, kernel_info, y_loc, x_loc, x_iter_num);
         });
 
     // If a tile block contains multiple tiles and shared memory buffers are
@@ -2979,7 +3073,7 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     block_prologue_generator(unnested_hlo, kernel_info);
   }
 
-  EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty);
+  EmitBlock(std::move(emit_one_tile), kernel_info, &ksl, index_ty);
 
   const BlockEpilogueGenerator& block_epilogue_generator =
       kernel_generator.GetBlockEpilogueGenerator();
@@ -2992,7 +3086,10 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
 
 // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
 // algorithm to improve the memory access patterns for the input parameters
-// with a shape that is a 0-2-1 transpose of the output tensor shape.
+// with a shape that is a 0-2-1 transpose of the output tensor shape. The caller
+// is responsible for making sure that it is safe to apply the shared memory
+// tranpose on the input parameters.
+//
 //
 // For the purpose of tiling, the output tensors have a logical shape of three
 // components 0-2-1 while the relevant input parameters have a logical shape
@@ -3025,17 +3122,19 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
     element_generator = [&](HloInstruction* hlo,
                             const llvm_ir::IrArray::Index& index,
                             const KernelCodegenInfo* kernel_info,
-                            llvm::Value* y_loc, llvm::Value* x_loc) {
-      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc);
+                            llvm::Value* y_loc, llvm::Value* x_loc,
+                            int64 x_iter_num) {
+      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc, x_iter_num);
     };
   } else {
     DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
-    element_generator = [&](HloInstruction* hlo,
-                            const llvm_ir::IrArray::Index& index,
-                            const KernelCodegenInfo* kernel_info,
-                            llvm::Value* y_loc, llvm::Value* x_loc) {
-      EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc);
-    };
+    element_generator =
+        [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+            const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+            llvm::Value* x_loc, int64 x_iter_num) {
+          EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc,
+                                   x_iter_num);
+        };
   }
   KernelCodegenInfo kernel_info(&mapping_scheme);
   KernelCodeGenerator kernel_generator(std::move(element_generator));
@@ -3043,26 +3142,99 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
 }
 
 namespace {
-// Returns true to indicate it is safe to use the tile based shared memory
-// transpose implementation to implement the kernel for the instruction.
+// A recursive function to inspect the users of a parameter to determine
+// whether it's safe for a parameter to participate in a shared-memory
+// transpose.
 //
-// An instruction is not safe for such an implementation if it can change the
-// element order of a tensor without changing the dimension of the tensor, and
-// the instruction has a corresponding elemental_ir_emitter.
-bool IsInstructionSafeForTileBasedTranspose(const HloInstruction* hlo) {
-  auto is_safe_for_tile_based_transpose = [&](const HloInstruction* instr) {
-    HloOpcode opcode = instr->opcode();
-    CHECK_NE(opcode, HloOpcode::kFusion);
-    return (opcode != HloOpcode::kReverse && opcode != HloOpcode::kGather);
-  };
+// Consider a fusion parameter P for which we might want to use a shmem
+// transpose.  If we do, we use a GPU thread block to preload a tile of P with
+// indices [z, y..y+31, x..x+31] to compute an output tile with the same indices
+// cooperatively, where z, y, x are the indices for the normalized input/output
+// tensor (see the document for FindTranspose021 for the definition of
+// normalized tensor for 0-2-1 transpose). This shmem transpose implementation
+// requires that the computation of the output tile only read elements within
+// the preload tile. If this is not true, we can't use a shmem transpose for P.
+//
+// If the computation of output element [z, y, x] only requires the element of
+// P with the same indices, the shmem tranpose implementation can be applied
+// to P safely. This is a sufficient but not necessary condition. We check all
+// the transitive users of P to see if we can find a user that may cause an
+// exception to the situation. If such a user is not found, we conclude that P
+// is safe for shmem transpose.
+//
+// This is trivially true for elementwise operations and some "data-movement"
+// ops like kTuple. However, it's not true for operations that can change the
+// dimensions of the inputs (e.g. pad, slice) and bitcast operation.
+// For example:
+//
+// fused_computation {
+//   param_0 = f32[64,64]{1,0} parameter(0)
+//   ROOT bitcast = f32[64,64]{0,1} bitcast(param_0)
+// }
+// The output element at logical address [0, 63] depends on the input element
+// at logical address [63, 0], which would not be within the shared-memory
+// block.
+//
+// TODO(bixia): In order to extend this for kInput fusion, that is reduction
+// with tranpose, we only need to end the use-chain checking with the input of
+// a reduce operations. In this case, the above description on "output" apply
+// to the result of such a use-chain, which provides the input to the reduce
+// operation.
+bool IsInstructionSafeForShmemTranspose(const HloInstruction* hlo) {
+  if (hlo->IsElementwise()) {
+    return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+      return IsInstructionSafeForShmemTranspose(user);
+    });
+  }
+
+  switch (hlo->opcode()) {
+    // Non-elementwise instructions that don't cause the shmem transpose
+    // to be unsafe, including the instructions that don't currently fuse.
+    case HloOpcode::kGetDimensionSize:
+      // The result of the operation doesn't rely on the content of the
+      // tensor. As such, there is no need to further inspect its users.
+      return true;
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kMap:
+    case HloOpcode::kParameter:
+    case HloOpcode::kTuple:
+    case HloOpcode::kTupleSelect:
+      return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+        return IsInstructionSafeForShmemTranspose(user);
+      });
 
-  if (hlo->opcode() == HloOpcode::kFusion) {
-    return absl::c_all_of(hlo->fused_instructions_computation()->instructions(),
-                          is_safe_for_tile_based_transpose);
+    default:
+      return false;
   }
+}
 
-  return is_safe_for_tile_based_transpose(hlo);
+// Given a group of input parameters that are 0-2-1 tranpose of the outputs of
+// a fusion kernel, returns the input parameters that are safe for the shared
+// memory tranpose implementation.
+//
+// When a tile based shared memory transpose is used to implement an input with
+// 0-2-1 transpose, we preload a tile of the input elements
+// [z, y..y+31, x..x+31] to compute the output tile elements of the same
+// indices. Preloading the input tile this way is only safe when the computation
+// of the output tile elements do not need any input element outside the
+// preloaded tile. We inspect all the transitive users of the input parameter
+// up to the fusion root instruction to see if we can find any instruction
+// that can make preloading the input tile unsafe.
+std::vector<int64> FilterInputsForShmemTranspose(const HloInstruction* fusion,
+                                                 std::vector<int64> input_ids) {
+  std::vector<int64> filtered_input_ids;
+  for (int64 i = 0; i < input_ids.size(); ++i) {
+    const HloInstruction* input = fusion->fused_parameter(input_ids[i]);
+    if (IsInstructionSafeForShmemTranspose(input)) {
+      filtered_input_ids.push_back(input_ids[i]);
+    } else {
+      VLOG(10) << "Input not safe for shmem transpose " << input->ToString()
+               << "\n";
+    }
+  }
+  return filtered_input_ids;
 }
+
 }  // namespace
 
 bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
@@ -3109,8 +3281,11 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
     return false;
   }
 
-  if (!IsInstructionSafeForTileBasedTranspose(hlo)) {
-    return false;
+  if (opcode == HloOpcode::kFusion) {
+    params_012 = FilterInputsForShmemTranspose(hlo, params_012);
+    if (params_012.empty()) {
+      return false;
+    }
   }
 
   // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
@@ -3191,7 +3366,7 @@ Status AreFusedReductionOutputsConsistent(
 // dimensions from minor to major.
 DimensionVector GetDimensionsToKeepMinorToMajor(
     const Shape& input_shape, absl::Span<const int64> dims_to_reduce) {
-  DimensionVector input_dims(ShapeUtil::Rank(input_shape), 0);
+  DimensionVector input_dims(input_shape.rank(), 0);
   absl::c_iota(input_dims, 0);
   DimensionVector input_dims_to_keep;
   for (int input_dim : input_dims) {
@@ -3231,7 +3406,7 @@ std::tuple<int64, int64, int64> GetReductionToVectorDimensions(
   if (input_dims_to_keep_minor_to_major.empty()) {
     return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor);
   }
-  DimensionVector input_dims(ShapeUtil::Rank(input_shape), 0);
+  DimensionVector input_dims(input_shape.rank(), 0);
   absl::c_iota(input_dims, 0);
   absl::Span<const int64> minor_to_major =
       LayoutUtil::MinorToMajor(input_shape);
@@ -3253,11 +3428,101 @@ std::tuple<int64, int64, int64> GetReductionToVectorDimensions(
   return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor);
 }
 
+// Returns true if all the transitive users of hlo before hitting users in
+// use_chain_endings are elementwise operations.
+bool AreUsersElementwise(const HloInstruction* hlo,
+                         const ConstHloInstructionSet& use_chain_endings) {
+  return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+    return use_chain_endings.count(user) ||
+           (user->IsElementwise() &&
+            AreUsersElementwise(user, use_chain_endings));
+  });
+}
+
+// Returns the number of fusion inputs that have the same dimension as the
+// given shape, and involve in only elementwise operations.
+int64 NumInputsInvolveInOnlyElementwiseOps(
+    const HloInstruction* unnested_hlo, const Shape& op_shape,
+    const ConstHloInstructionSet& use_chain_endings) {
+  return absl::c_count_if(
+      unnested_hlo->fused_parameters(), [&](const HloInstruction* parameter) {
+        const Shape& parameter_shape = parameter->shape();
+        return ShapeUtil::SameDimensions(op_shape, parameter_shape) &&
+               AreUsersElementwise(parameter, use_chain_endings);
+      });
+}
+
+// Returns the number of fusion inputs that have more elements than the given
+// shape.
+int64 NumInputsWithMoreElementsThan(const HloInstruction* unnested_hlo,
+                                    const Shape& shape) {
+  int64 num_elements = ShapeUtil::ElementsIn(shape);
+  return absl::c_count_if(
+      unnested_hlo->fused_parameters(), [&](const HloInstruction* parameter) {
+        return ShapeUtil::ElementsIn(parameter->shape()) > num_elements;
+      });
+}
+
+// The benefit of unrolling a kInput fusion that is a column reduction comes
+// from the vectorization of non-reduction fusion outputs and fusion inputs.
+// On the other hand, unrolling can also introduce factors that can cause
+// the kernel to run slower. This routine uses a simple heuristic to estimate
+// the benefit as well as the overhead of unrolling in order to decide whether
+// unrolling is beneficial for the given kInput fusion.
+bool IsUnrollingColumnReductionBeneficial(const HloInstruction* unnested_hlo,
+                                          const Shape& input_shape,
+                                          int64 num_kept) {
+  // TODO(b/122468062): Need further investigate to see whether we can
+  // remove the constraint on IsPowerOfTwo.
+  if (!IsPowerOfTwo(static_cast<uint64>(num_kept))) {
+    return false;
+  }
+
+  if (unnested_hlo->opcode() == HloOpcode::kReduce) {
+    return true;
+  }
+
+  CHECK_EQ(unnested_hlo->opcode(), HloOpcode::kFusion);
+  int64 can_be_vectorized = 0;
+  int64 cannot_be_vectorized = 0;
+  const HloInstruction* fused_root = unnested_hlo->fused_expression_root();
+  ConstHloInstructionSet use_chain_endings;
+  if (fused_root->opcode() == HloOpcode::kReduce) {
+    use_chain_endings.insert(fused_root);
+    // Atomic.add of the reduction result can't be vectorized.
+    cannot_be_vectorized++;
+  } else {
+    CHECK_EQ(fused_root->opcode(), HloOpcode::kTuple);
+    for (const HloInstruction* instr : fused_root->operands()) {
+      if (instr->opcode() == HloOpcode::kReduce) {
+        // Atomic.add of the reduction result can't be vectorized.
+        cannot_be_vectorized++;
+      } else {
+        // Write of the non-reduction result can be vectorized.
+        can_be_vectorized++;
+      }
+      use_chain_endings.insert(instr);
+    }
+  }
+  // Fusion inputs that have the same dimension as the reduce input and
+  // only involve in elementwise operations can be vectorized.
+  can_be_vectorized += NumInputsInvolveInOnlyElementwiseOps(
+      unnested_hlo, input_shape, use_chain_endings);
+  // Fusion inputs with more elements than the reduce op input must participate
+  // in non-elementwise operations and we assume that they are not vectorizable
+  // for the purpose of estimating the benefit of unrolling. If the kernel is
+  // unrolled even with such an assumption,  and the accesses to those inputs
+  // turn out to be vectorizable, the compiler will still vectorize them.
+  cannot_be_vectorized +=
+      NumInputsWithMoreElementsThan(unnested_hlo, input_shape);
+  return can_be_vectorized >= cannot_be_vectorized;
+}
+
 }  // namespace
 
 std::tuple<KernelMappingScheme, bool>
 IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
-    const HloInstruction* first_reduce) {
+    const HloInstruction* unnested_hlo, const HloInstruction* first_reduce) {
   int64 depth = 1;
   int64 height = 1;
   int64 width = 1;
@@ -3274,6 +3539,7 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
   std::tie(num_reduced_major, num_kept, num_reduced_minor) =
       GetReductionToVectorDimensions(input_shape, first_reduce->dimensions());
   CHECK_EQ(num_output_elems, num_kept);
+  bool dilated_x = true;
 
   if (num_kept == 1) {
     // Scalar reduction is a special row reduction with depth = height = 1.
@@ -3288,13 +3554,21 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
     is_row_reduction = false;
     // Column reduction without transpose doesn't require communication among
     // threads processing elements in the same tile. The current implementation
-    // only support the use of on hardware thread block to process one block of
-    // tiles in the KernelMappingScheme. We try to maximize the values of
+    // only support the use of one hardware thread block to process one block of
+    // tiles in the KernelMappingScheme. We try to use one thread to compute
+    // the partial results for two tensor elements and to maximize the values of
     // num_threads_x and tile_size_x to allow a bigger hardware thread block.
     int64 hw_threads_per_block_limit =
         ThreadsPerBlockLimit(ir_emitter_context_->device_description());
-    tile_size_x = std::min(hw_threads_per_block_limit, num_kept);
-    num_threads_x = tile_size_x;
+    if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape,
+                                             num_kept)) {
+      tile_size_x = std::min(2 * hw_threads_per_block_limit, num_kept);
+      num_threads_x = tile_size_x / 2;
+      dilated_x = false;
+    } else {
+      tile_size_x = std::min(hw_threads_per_block_limit, num_kept);
+      num_threads_x = tile_size_x;
+    }
     int64 kNumElementsPerPartialSum = 128;
     tile_size_y = kNumElementsPerPartialSum;
   } else {
@@ -3323,6 +3597,7 @@ IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
   llvm_ir::KernelMappingScheme mapping_scheme(
       dims_in_elem, tile_size_y, tile_size_x, req_block_sizes, num_threads_y,
       num_threads_x, &b_);
+  mapping_scheme.SetDilatedX(dilated_x);
   return std::make_tuple(mapping_scheme, is_row_reduction);
 }
 
@@ -3371,14 +3646,15 @@ Status IrEmitterUnnested::EmitReductionToVector(HloInstruction* unnested_hlo) {
   bool is_row_reduction;
   llvm_ir::KernelMappingScheme mapping_scheme;
   std::tie(mapping_scheme, is_row_reduction) =
-      ComputeMappingSchemeAndReductionKind(first_reduce);
+      ComputeMappingSchemeAndReductionKind(unnested_hlo, first_reduce);
   ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction);
   KernelCodeGenerator kernel_generator(
       /*tile_element_generator=*/
       [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
           const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-          llvm::Value* x_loc) {
-        EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc);
+          llvm::Value* x_loc, int64 x_iter_num) {
+        EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc,
+                                    x_iter_num);
       },
       /*block_prologue_generator=*/
       [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index d217ee36cf6e9b5278024a2f78513232328e7538..21b842bb2cd63ac454f85556df20ae5877cecbe1 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -76,7 +76,6 @@ class IrEmitterUnnested : public IrEmitter {
     void SetLaneId(llvm::Value* v) { lane_id_ = v; }
     void SetIndexType(llvm::Type* t) { index_ty_ = t; }
     void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) {
-      CHECK_EQ(tiled_param_info_, nullptr);
       tiled_param_info_ = tiled_param_info;
     }
 
@@ -89,7 +88,7 @@ class IrEmitterUnnested : public IrEmitter {
     }
     llvm::Type* GetIndexType() const { return index_ty_; }
 
-   private:
+   protected:
     llvm_ir::KernelMappingScheme* mapping_scheme_;
     llvm_ir::TiledParameterInfo* tiled_param_info_;
     llvm::Value* lane_id_;
@@ -109,10 +108,12 @@ class IrEmitterUnnested : public IrEmitter {
   // y_loc: The y coordinate within a tile.
   // x_loc: The x coordinate within a tile.
   // kernel_info: Other information to support the kernel code generation.
+  // x_iter_num: When a thread process N elements in the X dimension, x_iter_num
+  //             has a value of 0..N-1 to identify the element being process.
   using TileElementGenerator = std::function<void(
       HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
       const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-      llvm::Value* x_loc)>;
+      llvm::Value* x_loc, int64 x_iter_num)>;
 
   // KernelCodeGenerator records the code generator objects that generate code
   // for tile elements or tile block prologue/epilogue.
@@ -216,9 +217,13 @@ class IrEmitterUnnested : public IrEmitter {
   Status EmitReductionToVector(HloInstruction* unnested_hlo);
 
   // Computes the KernelMappingScheme for the reduce HLO and indicates whether
-  // the reduction is a row reduction.
+  // the reduction is a row reduction. For an un-fused reduce op, unnested_hlo
+  // and first_reduce are the same instruction. For a kInput fusion,
+  // unnested_hlo is the fusion instruction while first_reduce is the first
+  // reduce op.
   std::tuple<llvm_ir::KernelMappingScheme, bool>
-  ComputeMappingSchemeAndReductionKind(const HloInstruction* first_reduce);
+  ComputeMappingSchemeAndReductionKind(const HloInstruction* unnested_hlo,
+                                       const HloInstruction* first_reduce);
 
   // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
   // the process. `scatter` may be fused, scatter indices are taken from
@@ -243,26 +248,29 @@ class IrEmitterUnnested : public IrEmitter {
                               const KernelCodeGenerator& kernel_generator,
                               KernelCodegenInfo* kernel_info);
   void EmitBlock(const TileGenerator& emit_one_tile,
-                 const KernelCodegenInfo* kernel_info,
-                 KernelSupportLibrary& ksl, llvm::Type* index_ty);
+                 KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl,
+                 llvm::Type* index_ty);
   // Emits code to process a tensor element in a tile for the given kCopy HLO
   // that performs a 0-2-1 transpose.
   void EmitTileElementForCopy(HloInstruction* hlo,
                               const llvm_ir::IrArray::Index& index,
                               const KernelCodegenInfo* kernel_info,
-                              llvm::Value* y_loc, llvm::Value* x_loc);
+                              llvm::Value* y_loc, llvm::Value* x_loc,
+                              int64 x_iter_num);
   // Emits code to process a tensor element in a tile for the given kLoop fusion
   // HLO containing parameters that are 0-2-1 transpose of its outputs.
   void EmitTileElementForFusion(HloInstruction* hlo,
                                 const llvm_ir::IrArray::Index& index,
                                 const KernelCodegenInfo* kernel_info,
-                                llvm::Value* y_loc, llvm::Value* x_loc);
+                                llvm::Value* y_loc, llvm::Value* x_loc,
+                                int64 x_iter_num);
   // Emits code to process a tensor element in a tile for the given input hlo
   // that is either a unnested kReduce or a kInput fusion.
   void EmitTileElementForReduction(HloInstruction* unnested_hlo,
                                    const llvm_ir::IrArray::Index& index,
                                    const KernelCodegenInfo* kernel_info,
-                                   llvm::Value* y_loc, llvm::Value* x_loc);
+                                   llvm::Value* y_loc, llvm::Value* x_loc,
+                                   int64 x_iter_num);
   // Prepares for the code generation for a tile block of a reduction kernel.
   void EmitPrologueForReduction(HloInstruction* unnested_hlo,
                                 KernelCodegenInfo* kernel_info);
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index bd53b90b42d8e657a3ee58e7ca03fb60522aae28..153aab97d9eb971734c5ea95564895631bc2a9fa 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -110,11 +110,9 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path,
 }
 
 // Gets the GPU name as it's known to LLVM for a given compute capability.  If
-// we see an unrecognized compute capability, we return "sm_30".
+// we see an unrecognized compute capability, we return "sm_35".
 static string GetSmName(std::pair<int, int> compute_capability) {
   static auto* m = new std::map<std::pair<int, int>, int>({
-      {{3, 0}, 30},
-      {{3, 2}, 32},
       {{3, 5}, 35},
       {{3, 7}, 37},
       {{5, 0}, 50},
@@ -125,8 +123,9 @@ static string GetSmName(std::pair<int, int> compute_capability) {
       {{6, 2}, 62},
       {{7, 0}, 70},
       {{7, 2}, 72},
+      {{7, 5}, 75},
   });
-  int sm_version = 30;
+  int sm_version = 35;
   auto it = m->find(compute_capability);
   if (it != m->end()) {
     sm_version = it->second;
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index 01fddcede64d1bb02ab89db5fc9524893c2d47a4..02e1207f377b8c28bf2566bee8cf3bcbc66794fb 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -67,7 +67,7 @@ int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
   }
   int64 profit = 0;
   for (auto instr : instr2->operands()) {
-    if (!IsProfitableOperand(instr) || in_list.count(instr) == 0) {
+    if (!IsProfitableOperand(instr) || !in_list.contains(instr)) {
       continue;
     }
     profit += ShapeUtil::ByteSizeOf(instr->shape());
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index d16c87ba5c63aa582753fe949e9e39ee2d8b81e5..40b87b16a195564c9b98497f79a70f1db0539d87 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -628,8 +628,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
       p.1 = s32[1]{0} parameter(1)
       p.2 = f16[1,96,1024]{2,1,0} parameter(2)
       c.0 = s32[] constant(0)
-      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
-      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, p.1, c.0, c.0)
     }
 
     fusion.2 {
@@ -638,7 +637,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
       p.2 = f16[1,96,1024]{2,1,0} parameter(2)
       c.0 = s32[] constant(0)
       pad = s32[3]{0} pad(p.1, c.0), padding=0_2
-      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, p.1, c.0, c.0)
     }
 
     ENTRY entry {
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index cd369d55987b96eed2efb64ae0df6b3a76acb672..48f718b514cc9809d4100627f85af7aa05445d36 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -37,6 +37,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/convolution_group_converter.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
@@ -78,6 +80,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -152,6 +155,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
+    pipeline.AddPass<DynamicIndexSplitter>();
     pipeline.AddPass<GpuHloSupportChecker>();
     ReducePrecisionInsertion::AddPasses(
         &pipeline, hlo_module->config().debug_options(),
@@ -163,6 +167,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       // We need a cost model for GPUs. Currently, do nothing.
       return false;
     };
+    pipeline.AddPass<DotDecomposer>(false);
     pipeline.AddPass<ConvolutionGroupConverter>(
         cost_model,
         /*convert_batch_groups_only=*/true);
@@ -194,10 +199,10 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
-      AlgebraicSimplifierOptions options(
-          [](const Shape&, const Shape&) { return false; });
+      AlgebraicSimplifierOptions options;
       options.set_enable_permutation_sort_replacement(true);
       pass.AddPass<AlgebraicSimplifier>(options);
+      pass.AddPass<SortSimplifier>();
       pass.AddPass<TupleSimplifier>();
       pass.AddPass<WhileLoopConstantSinking>();
       pass.AddPass<WhileLoopSimplifier>();
@@ -266,10 +271,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    AlgebraicSimplifierOptions options(
-        /*valid_bitcast_callback=*/[](const Shape&, const Shape&) {
-          return true;
-        });
+    AlgebraicSimplifierOptions options;
     options.set_is_layout_sensitive(true);
     options.set_enable_permutation_sort_replacement(true);
     pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 4775baf44aecfe6adaf2bf0d2791595436635b16..1dedbd3befce6e2ceb06126d83a061207a90dd8f 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -25,7 +26,7 @@ namespace xla {
 namespace gpu {
 
 bool StreamAssignment::HasStreamAssigned(const HloInstruction& hlo) const {
-  return hlo_to_stream_number_.count(&hlo);
+  return hlo_to_stream_number_.contains(&hlo);
 }
 
 int StreamAssignment::StreamNumberForHlo(const HloInstruction& hlo) const {
@@ -98,10 +99,10 @@ int ComputeStreamToAssign(
   // greedy approach. First, we compute as forbidden_stream_numbers the
   // streams assigned to GEMMs that are concurrent with `hlo`. Then, we assign
   // `hlo` a different stream.
-  std::set<int> forbidden_stream_numbers;
+  absl::flat_hash_set<int> forbidden_stream_numbers;
   for (const auto* seen_gemm : seen_gemms) {
     int stream_num = stream_assignment.StreamNumberForHlo(*seen_gemm);
-    if (!forbidden_stream_numbers.count(stream_num) &&
+    if (!forbidden_stream_numbers.contains(stream_num) &&
         CanRunConcurrently(*seen_gemm, hlo, reachability)) {
       forbidden_stream_numbers.insert(stream_num);
     }
@@ -109,7 +110,7 @@ int ComputeStreamToAssign(
 
   for (int stream_num = 0; stream_num < stream_assignment.StreamCount();
        ++stream_num) {
-    if (!forbidden_stream_numbers.count(stream_num)) {
+    if (!forbidden_stream_numbers.contains(stream_num)) {
       return stream_num;
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
index d798b31643782eb25bba08227e29903ec0e7a597..d8bd9f7f6df48fe2faf510b369b99b6cd2173608 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -47,6 +47,21 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "gpu_buffer_assignment_test",
+    srcs = ["gpu_buffer_assignment_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "gpu_copy_test",
     srcs = ["gpu_copy_test.cc"],
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_buffer_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_buffer_assignment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1335d73494100788f3ffe1bd0f5eb200de79cb21
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_buffer_assignment_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuBufferAssignmentTest : public GpuCodegenTest {
+ public:
+  HloModuleConfig ConfigWithoutHloPasses() {
+    HloModuleConfig config;
+    auto debug_options = HloTestBase::GetDebugOptionsForTest();
+    // Disable layout_assignment to use the preassigned layouts.
+    debug_options.xla_disable_all_hlo_passes();
+    config.set_debug_options(debug_options);
+    return config;
+  }
+};
+
+TEST_F(GpuBufferAssignmentTest, InstructionNameWithHyphenSanitized) {
+  const char *const kHloString = R"(
+    HloModule HyphenInInstructionName
+      ENTRY kernelEntry {
+        ROOT equal-to = s32[2]{0} constant({42, 73})
+    })";
+
+  // Check that '-' in the instruction name is changed to '_'.
+  auto hlo_module = ParseHloString(kHloString).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK: buffer_for_equal_to =
+)",
+                     /*match_optimized_ir=*/true);
+
+  // TODO(bixia): The run fails randomly.
+  // Check that the kernel runs correctly.
+  // EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuBufferAssignmentTest, BufferSanitizedNameCollisionResolved) {
+  const char *const kHloString = R"(
+    HloModule BufferSanitizedName
+      ENTRY kernelEntry {
+      equal.to = s32[2]{0} constant({42, 73})
+      equal-to = s32[2]{0} constant({67, 3})
+      ROOT add = s32[2]{0} add(equal.to, equal-to)
+    })";
+
+  // Turn of Hlo passes to prevent constant folding.
+  //
+  // Check that '-' and '.' in the instruction name are changed to '_', and
+  // name collision is resolved by LLVM.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutHloPasses()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK: buffer_for_equal_to =
+; CHECK: buffer_for_equal_to1 =
+)",
+                     /*match_optimized_ir=*/false);
+
+  // TODO(bixia): There is another bug that prevents this from running
+  //              correctly.
+  // Check that the kernel runs correctly.
+  // EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index a302b582ede3723acd118d2e4a4bb3efdf7a4d0b..869724db601b2d5e4ed6d3c7bf3e10a748433146 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -65,7 +65,7 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -91,7 +91,7 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -118,7 +118,7 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -152,7 +152,7 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -187,13 +187,13 @@ TEST_F(GpuKernelTilingTest,
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
 }
 
-TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
+TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) {
   const char *const kHloString = R"(
     HloModule FusionTransposeWithReverseNotTiled
     fused_computation.1 {
@@ -214,12 +214,203 @@ TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
 }
 
+TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) {
+  const char *const kHloString = R"(
+    HloModule TransposedInputWithUserBitcast
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      ROOT bitcast = f32[20,20]{0,1} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = f32[20,20]{0,1} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) {
+  const char *const kHloString = R"(
+    HloModule TwoTransposedInputs
+
+    fused_computation {
+      param_0 = f32[64,64]{1,0} parameter(0)
+      param_1 = f32[64,64]{1,0} parameter(1)
+      bitcast = f32[64,64]{0,1} bitcast(param_0)
+      copy = f32[64,64]{0,1} copy(param_1)
+      ROOT tuple = (f32[64,64]{0,1}, f32[64,64]{0,1}) tuple(bitcast, copy)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[64,64]{1,0} parameter(0)
+      parameter.1 = f32[64,64]{1,0} parameter(1)
+      ROOT fusion = (f32[64,64]{0,1}, f32[64,64]{0,1})
+        fusion(parameter.0, parameter.1),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) {
+  const char *const kHloString = R"(
+  HloModule column_reduce_powerof2
+
+  reduction {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY kernel_entry {
+    constant0 = f32[] constant(0)
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg1_conv = f32[1024,512]{1,0} convert(arg1)
+    ROOT reduce = f32[512]{0} reduce(arg1_conv, constant0), dimensions={0}, to_apply=reduction
+  })";
+
+  // Check that two calls to llvm.nvvm.atomic are generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
+TEST_F(GpuKernelTilingTest,
+       ColumnReductionWithInputLargerThenReduceInputNotUnrolled) {
+  const char *const kHloString = R"(
+  HloModule larger_than_reduce_input_parameter
+
+  reduction22 {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  fused_computation {
+    constant0 = f32[] constant(0)
+    arg.1 = f16[1024,512]{1,0} parameter(0)
+    arg.2 = f16[1027,513]{1,0} parameter(1)
+    arg1.conv = f32[1024,512]{1,0} convert(arg.1)
+    arg2.conv = f32[1027,513]{1,0} convert(arg.2)
+    slice2 = f32[1024,512]{1,0} slice(arg2.conv), slice={[2:1026], [1:513]}
+    add2 = f32[1024,512]{1,0} add(arg1.conv, slice2)
+    ROOT reduce = f32[512]{0} reduce(add2, constant0), dimensions={0},
+      to_apply=reduction22
+  }
+
+  ENTRY kernel_entry {
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg2 = f16[1027,513]{1,0} parameter(1)
+    ROOT fusion = f32[512]{0} fusion(arg1, arg2), kind=kInput,
+      calls=fused_computation
+  })";
+
+  // Check that one call to llvm.nvvm.atomic is generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
+TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
+  const char *const kHloString = R"(
+  HloModule column_reduce_powerof2_mof
+
+  reduction22 {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  fused_computation {
+    constant0 = f32[] constant(0)
+    arg.1 = f16[1024,512]{1,0} parameter(0)
+    arg.2 = f16[1024,512]{1,0} parameter(1)
+    arg1.conv = f32[1024,512]{1,0} convert(arg.1)
+    arg2.conv = f32[1024,512]{1,0} convert(arg.2)
+    reduce1 = f32[512]{0} reduce(arg1.conv, constant0), dimensions={0},
+      to_apply=reduction22
+    reduce2 = f32[512]{0} reduce(arg2.conv, constant0), dimensions={0},
+      to_apply=reduction22
+    add = f32[1024,512]{1,0} add(arg1.conv, arg2.conv)
+    ROOT tuple = (f32[512]{0}, f32[512]{0}, f32[1024,512]{1,0})
+      tuple(reduce1, reduce2, add)
+  }
+
+  ENTRY kernel_entry {
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg2 = f16[1024,512]{1,0} parameter(1)
+    ROOT fusion = (f32[512]{0}, f32[512]{0}, f32[1024,512]{1,0})
+      fusion(arg1, arg2), kind=kInput, calls=fused_computation
+  })";
+
+  // Check that four calls to llvm.nvvm.atomic are generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
index f8120a5fa00ce38644cd85c54d5ef65701be1eda..f91a22d482bc8bc046977870a7a4d18ca1acde68 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -43,7 +43,7 @@ class InfeedTest : public ClientLibraryTestBase {
     ASSERT_IS_OK(client_->TransferToInfeed(literal));
     XlaBuilder builder(TestName());
     Infeed(&builder, literal.shape());
-    if (ShapeUtil::IsTuple(literal.shape())) {
+    if (literal.shape().IsTuple()) {
       // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
       ComputeAndCompareTuple(&builder, literal, {});
     } else {
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 6b2d76764a077dc6cfa3f9ddc6e525ab330323be..25bad67bab9375559c431466571c62acd0452b01 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -14,17 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace xla {
 namespace gpu {
 
 void ThunkSchedule::AddDependenciesOnTransitiveOperands(
     const Thunk& thunk, const HloInstruction& operand,
-    const std::unordered_map<const HloInstruction*, Thunk*>& hlo_to_thunk) {
-  if (hlo_to_thunk.count(&operand)) {
+    const absl::flat_hash_map<const HloInstruction*, Thunk*>& hlo_to_thunk) {
+  if (hlo_to_thunk.contains(&operand)) {
     // If `operand` is mapped to a thunk, adds `operand` to `thunk`'s dependency
     // list if `operand` is assigned to a different stream. As an optimization,
     // we skip `operand`'s operands because `operand` depends on them already.
@@ -48,14 +50,14 @@ ThunkSchedule::ThunkSchedule(
     const std::vector<HloInstruction*>& hlo_total_order)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
-  std::unordered_map<const HloInstruction*, Thunk*> hlo_to_thunk;
+  absl::flat_hash_map<const HloInstruction*, Thunk*> hlo_to_thunk;
   for (const auto& thunk : *thunks_) {
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
   for (HloInstruction* hlo : hlo_total_order) {
-    if (hlo_to_thunk.count(hlo)) {
-      thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo));
+    if (Thunk** thunk = tensorflow::gtl::FindOrNull(hlo_to_thunk, hlo)) {
+      thunk_total_order_.push_back(*thunk);
     }
   }
 
@@ -106,7 +108,7 @@ void ThunkSchedule::RemoveRedundantDependencyEdges() {
   // redundant dependency edge.
   Array2D<int> last_dependency(stream_count, stream_count, -1);
   for (const Thunk* dst : thunk_total_order_) {
-    if (!depends_on_.count(dst)) {
+    if (!depends_on_.contains(dst)) {
       continue;
     }
 
@@ -134,7 +136,7 @@ void ThunkSchedule::RemoveRedundantDependencyEdges() {
 
 const std::list<const Thunk*>& ThunkSchedule::DependsOn(
     const Thunk* thunk) const {
-  if (depends_on_.count(thunk)) {
+  if (depends_on_.contains(thunk)) {
     return FindOrDie(depends_on_, thunk);
   } else {
     return empty_thunk_list_;
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index 43b628a1baf0e79a3197f3cfad3547991642eaed..549378debd52417252724a5d8a6f4d24f2ad0369 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -54,7 +56,9 @@ class ThunkSchedule {
   // Thunks that `thunk` depends on.
   const std::list<const Thunk*>& DependsOn(const Thunk* thunk) const;
   // Whether `thunk` is depended by another thunk.
-  bool Depended(const Thunk* thunk) const { return depended_by_.count(thunk); }
+  bool Depended(const Thunk* thunk) const {
+    return depended_by_.contains(thunk);
+  }
 
   // Delegates to StreamAssignment.
   int StreamCount() const { return stream_assignment_->StreamCount(); }
@@ -75,13 +79,13 @@ class ThunkSchedule {
   // thunk.hlo_instruction().
   void AddDependenciesOnTransitiveOperands(
       const Thunk& thunk, const HloInstruction& operand,
-      const std::unordered_map<const HloInstruction*, Thunk*>& hlo_to_thunk);
+      const absl::flat_hash_map<const HloInstruction*, Thunk*>& hlo_to_thunk);
 
   std::unique_ptr<ThunkSequence> thunks_;
   std::vector<Thunk*> thunk_total_order_;
 
-  std::unordered_map<const Thunk*, std::list<const Thunk*>> depends_on_;
-  std::set<const Thunk*> depended_by_;
+  absl::flat_hash_map<const Thunk*, std::list<const Thunk*>> depends_on_;
+  absl::flat_hash_set<const Thunk*> depended_by_;
   std::list<const Thunk*> empty_thunk_list_;
 
   std::unique_ptr<StreamAssignment> stream_assignment_;
diff --git a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
index dd46ff433ba0ad6bfa3999b96845fdaebe148aca..167c038420a64d9fa29746ed3fe349620e08e6ff 100644
--- a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
+++ b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
@@ -47,6 +47,10 @@ class XfeedQueue {
   // Blocks until the queue is non-empty, then returns the buffer at the head of
   // the queue.
   BufferType BlockingGetNextDestination() {
+    for (const auto& callback : before_get_next_dest_callbacks_) {
+      callback();
+    }
+
     bool became_empty;
     BufferType current_buffer;
     {
@@ -69,6 +73,10 @@ class XfeedQueue {
   void RegisterOnEmptyCallback(std::function<void()> callback) {
     on_empty_callbacks_.push_back(std::move(callback));
   }
+  void RegisterBeforeGetNextDestinationCallback(
+      std::function<void()> callback) {
+    before_get_next_dest_callbacks_.push_back(std::move(callback));
+  }
 
  private:
   tensorflow::mutex mu_;
@@ -82,6 +90,11 @@ class XfeedQueue {
   // List of callbacks which will be called when 'enqueued_buffers_' becomes
   // empty.
   std::vector<std::function<void()>> on_empty_callbacks_;
+
+  // List of callbacks which will be called before BlockingGetNextDestination()
+  // is called. This lets you e.g. call EnqueueDestination() for each call to
+  // BlockingGetNextDestination().
+  std::vector<std::function<void()>> before_get_next_dest_callbacks_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 9220865867b770eebfb1ada8f31a5d24693a4b8d..4fca981c6a59cdb91a997e6a887fd26472c1a10a 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -199,7 +199,7 @@ Status HeapSimulator::RunComputation(
 
       // If the buffer has no users and isn't an entry parameter or output, it
       // must be a dead value.
-      if (live_buffers.count(buffer) == 0) {
+      if (!live_buffers.contains(buffer)) {
         dead_buffers_to_free.push_back(buffer);
       }
     }
@@ -225,10 +225,10 @@ Status HeapSimulator::RunComputation(
       }
     }
     // Sort to get a deterministic iteration order.
-    std::sort(operand_buffers_to_free.begin(), operand_buffers_to_free.end(),
-              [](const BufferValue* x, const BufferValue* y) {
-                return x->id() < y->id();
-              });
+    absl::c_sort(operand_buffers_to_free,
+                 [](const BufferValue* x, const BufferValue* y) {
+                   return x->id() < y->id();
+                 });
 
     // Allocate buffers defined by this instruction.  This is the latest point
     // that we can allocate; right before the buffer is first used.  This must
@@ -253,7 +253,7 @@ Status HeapSimulator::RunComputation(
       bool shared = false;
       if (options_.may_reuse_operand_buffers) {
         for (const BufferValue* operand_buffer : operand_buffers_to_free) {
-          if (reused_buffers.count(operand_buffer) != 0) {
+          if (reused_buffers.contains(operand_buffer)) {
             continue;
           }
           if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
@@ -335,10 +335,9 @@ Status HeapSimulator::RunComputation(
     to_free.push_back(buffer);
   }
 
-  std::sort(to_free.begin(), to_free.end(),
-            [](const BufferValue* x, const BufferValue* y) {
-              return x->id() < y->id();
-            });
+  absl::c_sort(to_free, [](const BufferValue* x, const BufferValue* y) {
+    return x->id() < y->id();
+  });
   for (const BufferValue* buffer : to_free) {
     VLOG(3) << "Freeing pending: " << buffer->ToString();
     Free(buffer, root);
@@ -374,15 +373,15 @@ bool HeapSimulator::IgnoreBuffer(const BufferValue* buffer) const {
     return true;
   }
   return options_.buffers_to_assign != nullptr &&
-         options_.buffers_to_assign->count(buffer) == 0;
+         !options_.buffers_to_assign->contains(buffer);
 }
 
 // Alloc always calls the underlying heap algorithm.
 void HeapSimulator::Alloc(const BufferValue* buffer,
                           const HloInstruction* instruction) {
-  CHECK(allocated_buffers_.count(buffer) == 0)
+  CHECK(!allocated_buffers_.contains(buffer))
       << "Alloc called on allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "Alloc called on freed buffer: " << *buffer;
 
   allocated_buffers_.insert(buffer);
@@ -411,9 +410,9 @@ void HeapSimulator::Free(const BufferValue* buffer,
     buffer = group->canonical;
   }
 
-  CHECK(allocated_buffers_.count(buffer) > 0)
+  CHECK(allocated_buffers_.contains(buffer))
       << "Free called on non-allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "Free called on freed buffer: " << *buffer;
 
   freed_buffers_.insert(buffer);
@@ -433,11 +432,11 @@ void HeapSimulator::ShareBuffer(const BufferValue* buffer,
                                 const HloInstruction* instruction) {
   CHECK_LE(size_fn_(*buffer), size_fn_(*shared))
       << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared;
-  CHECK(allocated_buffers_.count(buffer) == 0)
+  CHECK(!allocated_buffers_.contains(buffer))
       << "ShareBuffer called on allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "ShareBuffer called on freed buffer: " << *buffer;
-  CHECK(freed_buffers_.count(shared) == 0)
+  CHECK(!freed_buffers_.contains(shared))
       << "ShareBuffer called on freed shared buffer: " << *shared;
 
   const BufferValue* canonical = nullptr;
@@ -452,7 +451,7 @@ void HeapSimulator::ShareBuffer(const BufferValue* buffer,
   } else {
     // The 'shared' buffer doesn't have a group; it must be the canonical.  Add
     // both 'buffer' and 'shared' to a new group.
-    CHECK(allocated_buffers_.count(shared) > 0)
+    CHECK(allocated_buffers_.contains(shared))
         << "ShareBuffer called on non-allocated shared buffer: " << *shared;
     auto group = std::make_shared<SharedGroup>();
     canonical = shared;
@@ -596,7 +595,7 @@ void DecreasingSizeRunsHeap::CallAndDrainRun() {
   }
 
   // Call ops in the run sorted by decreasing size, breaking ties by buffer id.
-  std::sort(run_.begin(), run_.end(), [](const Op& a, const Op& b) {
+  absl::c_sort(run_, [](const Op& a, const Op& b) {
     if (a.size != b.size) {
       return a.size > b.size;
     }
@@ -866,23 +865,23 @@ HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() {
   for (auto& entry : buffer_intervals_) {
     sorted_buffer_intervals.push_back(entry.second);
   }
-  std::sort(sorted_buffer_intervals.begin(), sorted_buffer_intervals.end(),
-            [](const BufferInterval& x, const BufferInterval& y) {
-              if (x.size != y.size) {
-                return x.size > y.size;
-              }
-              if (x.end - x.start != y.end - y.start) {
-                return x.end - x.start > y.end - y.start;
-              }
-              return x.buffer->id() < y.buffer->id();
-            });
+  absl::c_sort(sorted_buffer_intervals,
+               [](const BufferInterval& x, const BufferInterval& y) {
+                 if (x.size != y.size) {
+                   return x.size > y.size;
+                 }
+                 if (x.end - x.start != y.end - y.start) {
+                   return x.end - x.start > y.end - y.start;
+                 }
+                 return x.buffer->id() < y.buffer->id();
+               });
 
   BufferIntervalTree interval_tree(sorted_buffer_intervals.size());
   for (auto& buffer_interval : sorted_buffer_intervals) {
     auto chunks_overlapping_in_time = interval_tree.ChunksOverlappingInTime(
         buffer_interval.start, buffer_interval.end);
-    std::sort(
-        chunks_overlapping_in_time.begin(), chunks_overlapping_in_time.end(),
+    absl::c_sort(
+        chunks_overlapping_in_time,
         [](const Chunk& x, const Chunk& y) { return x.offset < y.offset; });
 
     // Find the minimum free chunk that can hold this buffer.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index dbbf43082f2c1d21f5ef42f53804bf0969903a58..3e0631aeb4aa374cb5748650e1c7529e26e10b34 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -158,7 +158,7 @@ class HeapSimulator {
   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
                       const BufferValue* buffer,
                       const HloInstruction* instruction,
-                      const BufferValue* shared_with_canonical);
+                      const BufferValue* share_with_canonical);
 
   // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
   // in which case we are calculating the same allocs/frees twice in the
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 9b50f1ca5b5365463f32106fc005ef2c63f2e37a..263b42a29dbb0dbc0fb6eca7968674ff242f45ed 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -229,6 +229,18 @@ message HloScheduleProto {
 }
 
 message HloInputOutputAliasProto {
+  enum Kind {
+    // Define a UNDEFINED_ALIAS equal to zero to get around the default-0 proto3
+    // behavior and missing has_*() APIs.
+    UNDEFINED_ALIAS = 0;
+    // An alias setup by the user as must alias. A use setting USER_ALIAS is
+    // expecting the designed output to be dropped over the given input
+    // parameter number+index.
+    USER_ALIAS = 1;
+    // An alias setup by the compiler as part of its optimizations.
+    SYSTEM_ALIAS = 2;
+  }
+
   // The following proto describes a pair of aliased an input
   // (described by parameter number and a ShapeIndex of the parameter)
   // and an output (described by a ShapeIndex of the root
@@ -249,6 +261,8 @@ message HloInputOutputAliasProto {
     int64 parameter_number = 2;
     // ShapeIndex of the parameter instruction.
     repeated int64 parameter_shape_index = 3;
+    // The kind of alias to be setup.
+    Kind kind = 4;
   }
 
   repeated AliasEntryProto entries = 1;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index cf8e6594cbe5ffd28ca75dd5006e8817f1e8581c..e511f1951c5dd07ebb64fa38fd5b7f6a0e87b429 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -117,7 +117,7 @@ class BufferValueMap {
     for (const auto& pair : buffers_) {
       buffer_numbers.push_back(pair.first);
     }
-    std::sort(buffer_numbers.begin(), buffer_numbers.end());
+    absl::c_sort(buffer_numbers);
     return buffer_numbers;
   }
 
@@ -176,13 +176,12 @@ class BufferValueMap {
       const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
     // Get parameter value from an aliased_input object.
     const auto get_parameter_value =
-        [this](const std::pair<int64, ShapeIndex>& aliased_input)
+        [this](const HloInputOutputAliasConfig::Alias& aliased_input)
         -> const HloValue& {
-      int64 param_number = aliased_input.first;
-      const ShapeIndex& param_index = aliased_input.second;
       return dataflow_.GetUniqueValueAt(
-          module_->entry_computation()->parameter_instruction(param_number),
-          param_index);
+          module_->entry_computation()->parameter_instruction(
+              aliased_input.parameter_number),
+          aliased_input.parameter_index);
     };
 
     // If the value shows up in a root instruction, alias it with parameter
@@ -319,7 +318,7 @@ class BufferValueMap {
     ComputeWhileAliasedBuffers(value, &aliased_buffers);
     ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
-    std::sort(aliased_buffers.begin(), aliased_buffers.end());
+    absl::c_sort(aliased_buffers);
     aliased_buffers.erase(
         std::unique(aliased_buffers.begin(), aliased_buffers.end()),
         aliased_buffers.end());
@@ -367,7 +366,7 @@ std::vector<const HloBuffer*> HloAliasAnalysis::ComputeBuffersAt(
   }
 
   // Sort and uniquify vector before returning.
-  std::sort(buffers.begin(), buffers.end(), HloBuffer::IdLessThan);
+  absl::c_sort(buffers, HloBuffer::IdLessThan);
   buffers.erase(std::unique(buffers.begin(), buffers.end()), buffers.end());
 
   return buffers;
@@ -430,8 +429,7 @@ Status HloAliasAnalysis::Verify() const {
   for (const auto& pair : value_to_buffer_) {
     const HloValue* value = pair.first;
     const HloBuffer& buffer = *pair.second;
-    TF_RET_CHECK(std::find(buffer.values().begin(), buffer.values().end(),
-                           value) != buffer.values().end());
+    TF_RET_CHECK(absl::c_linear_search(buffer.values(), value));
   }
 
   for (HloBuffer::Id id = 0; id < buffers_.size(); ++id) {
@@ -457,7 +455,7 @@ string HloAliasAnalysis::ToString() const {
   for (const HloComputation* computation : module_->computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
       StrAppend(&out, "    ", instruction->name(), ":\n");
-      if (ShapeUtil::IsTuple(instruction->shape())) {
+      if (instruction->shape().IsTuple()) {
         ShapeUtil::ForEachSubshape(
             instruction->shape(),
             [&out, &instruction, this](const Shape&, const ShapeIndex& index) {
@@ -515,7 +513,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
     auto& value_set = buffer_map.GetValuesInBuffer(buffer_number);
     std::vector<const HloValue*> sorted_values(value_set.begin(),
                                                value_set.end());
-    std::sort(sorted_values.begin(), sorted_values.end(), HloValue::IdLessThan);
+    absl::c_sort(sorted_values, HloValue::IdLessThan);
     alias_analysis->buffers_.emplace_back(next_id++, sorted_values);
     for (const HloValue* value : sorted_values) {
       alias_analysis->value_to_buffer_[value] =
@@ -533,11 +531,11 @@ bool HloAliasAnalysis::HasLiveRangeInterference(
     const HloOrdering& ordering) const {
   for (const HloBuffer& buffer : buffers()) {
     CHECK(!buffer.values().empty());
-    if (ShapeUtil::IsToken(buffer.values().front()->shape())) {
+    if (buffer.values().front()->shape().IsToken()) {
       // Tokens have no on-device representation and cannot interfere.
       for (const HloValue* value : buffer.values()) {
         // If one of the values is a token, all values must be a token.
-        DCHECK(ShapeUtil::IsToken(value->shape()));
+        DCHECK(value->shape().IsToken());
       }
       continue;
     }
@@ -547,16 +545,15 @@ bool HloAliasAnalysis::HasLiveRangeInterference(
     // tie-break using value ID. The tie-break is necessary because we need a
     // strict weak order for std::sort.
     std::vector<const HloValue*> values = buffer.values();
-    std::sort(values.begin(), values.end(),
-              [&ordering](const HloValue* a, const HloValue* b) {
-                if (ordering.IsDefinedBefore(*a, *b)) {
-                  return true;
-                } else if (ordering.IsDefinedBefore(*b, *a)) {
-                  return false;
-                } else {
-                  return a->id() < b->id();
-                }
-              });
+    absl::c_sort(values, [&ordering](const HloValue* a, const HloValue* b) {
+      if (ordering.IsDefinedBefore(*a, *b)) {
+        return true;
+      } else if (ordering.IsDefinedBefore(*b, *a)) {
+        return false;
+      } else {
+        return a->id() < b->id();
+      }
+    });
 
     // Walk through the ordered vector of values. First verify that the values
     // are totally ordered with respect to 'ordering', then check that no
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 7e6150e94153cd15463725e862ce1b8593f2c991..b6dbf07959c541bceaa8eda5a0101503970ee832 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -238,13 +238,16 @@ TEST_F(HloAliasAnalysisTest, ParametersWithAliasing) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   // Cannot alias an output twice.
   ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -279,13 +282,16 @@ TEST_F(HloAliasAnalysisTest, ParametersWithCrossAliasing) {
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   // Cannot alias an output twice.
   ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -365,9 +371,11 @@ TEST_F(HloAliasAnalysisTest, InputOutputAliasingWithWhile) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc
index 9c3aa0e64d119c2560f4955d0bcb492519fa52a2..32e48651b30bace4723169935d1f10dd7d7bfec3 100644
--- a/tensorflow/compiler/xla/service/hlo_buffer.cc
+++ b/tensorflow/compiler/xla/service/hlo_buffer.cc
@@ -49,7 +49,7 @@ std::vector<HloPosition> HloBuffer::ComputePositions() const {
                      value->positions().end());
   }
   // Remove duplicates and sort positions.
-  std::sort(positions.begin(), positions.end());
+  absl::c_sort(positions);
   positions.erase(std::unique(positions.begin(), positions.end()),
                   positions.end());
   return positions;
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 75630307186ba42f711a85454d73722533e59358..40fe91398be33f5681e1389e1b6fadcbd87487bb 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -207,14 +207,14 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
   TF_RET_CHECK(instruction->user_count() == 0);
   TF_RET_CHECK(IsRemovable(instruction))
       << "Cannot remove instruction: " << instruction->ToString();
-  std::unordered_set<HloInstruction*> removed;
+  absl::flat_hash_set<HloInstruction*> removed;
   std::queue<HloInstruction*> worklist;
   worklist.push(instruction);
   while (!worklist.empty()) {
     HloInstruction* item = worklist.front();
     worklist.pop();
 
-    if (removed.count(item) != 0 || item->user_count() != 0 ||
+    if (removed.contains(item) || item->user_count() != 0 ||
         item == root_instruction() || !IsRemovable(item) ||
         (item->HasSideEffect() && item != instruction)) {
       continue;
@@ -531,11 +531,10 @@ HloComputation::CreateFromProto(
   HloInstruction* root = instruction_map.at(proto.root_id());
 
   // Sort the instructions in the proto id's order.
-  std::sort(instructions.begin(), instructions.end(),
-            [&](const std::unique_ptr<HloInstruction>& a,
-                const std::unique_ptr<HloInstruction>& b) {
-              return to_proto_id[a.get()] < to_proto_id[b.get()];
-            });
+  absl::c_sort(instructions, [&](const std::unique_ptr<HloInstruction>& a,
+                                 const std::unique_ptr<HloInstruction>& b) {
+    return to_proto_id[a.get()] < to_proto_id[b.get()];
+  });
 
   TF_RETURN_IF_ERROR([&]() -> Status {
     std::vector<bool> parameters_seen(parameter_count);
@@ -600,7 +599,7 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
     const std::function<
         HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
                         HloComputation* computation)>& copy_leaf) {
-  if (ShapeUtil::IsTuple(instruction->shape())) {
+  if (instruction->shape().IsTuple()) {
     std::vector<HloInstruction*> elements;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
          i++) {
@@ -617,14 +616,14 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
     }
     return AddInstruction(HloInstruction::CreateTuple(elements));
   }
-  if (ShapeUtil::IsToken(instruction->shape())) {
+  if (instruction->shape().IsToken()) {
     // Tokens have no on-device representation and cannot be copied. Pass
     // through transparently.
     return instruction;
   }
 
   // Array shape.
-  TF_RET_CHECK(ShapeUtil::IsArray(instruction->shape()));
+  TF_RET_CHECK(instruction->shape().IsArray());
   return copy_leaf(instruction, *index, this);
 }
 
@@ -694,22 +693,36 @@ bool HloComputation::operator==(const HloComputation& other) const {
   if (this == &other) {
     return true;
   }
-  std::set<std::pair<const HloInstruction*, const HloInstruction*>> visited;
-  std::function<bool(const HloInstruction*, const HloInstruction*)> eq =
-      [&visited, &eq](const HloInstruction* a, const HloInstruction* b) {
-        // If <a,b> are visited but not identical, the recursion should have
-        // been aborted. So, if <a,b> are visited at this point, they must be
-        // identical.
-        if (visited.count(std::make_pair(a, b)) > 0) {
-          return true;
-        }
-        visited.emplace(a, b);
-        return a->Identical(
-            *b, eq, [](const HloComputation* a, const HloComputation* b) {
-              return *a == *b;
-            });
-      };
-  return eq(root_instruction(), other.root_instruction());
+  absl::flat_hash_set<std::pair<const HloInstruction*, const HloInstruction*>>
+      visited;
+  std::vector<std::pair<const HloInstruction*, const HloInstruction*>> worklist;
+
+  worklist.push_back({root_instruction(), other.root_instruction()});
+
+  while (!worklist.empty()) {
+    auto pair = worklist.back();
+    worklist.pop_back();
+
+    if (visited.contains(pair)) {
+      continue;
+    }
+    visited.emplace(pair);
+    // TODO(b/123082518): Avoid recursively invoking == becasue it may
+    // cause a stack overflow with deeply nested subcomputations.
+    bool identical_ignoring_operands = pair.first->Identical(
+        *pair.second,
+        [](const HloInstruction*, const HloInstruction*) { return true; },
+        [](const HloComputation* a, const HloComputation* b) {
+          return *a == *b;
+        });
+    if (!identical_ignoring_operands) {
+      return false;
+    }
+    for (size_t i = 0; i < pair.first->operands().size(); ++i) {
+      worklist.push_back({pair.first->operand(i), pair.second->operand(i)});
+    }
+  }
+  return true;
 }
 
 Status HloComputation::ReplaceWithNewInstruction(
@@ -799,17 +812,16 @@ Status HloComputation::AcceptOrdered(
     absl::Span<HloInstruction* const> order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
-    TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
-        << root->ToString();
+    TF_RET_CHECK(absl::c_linear_search(order, root)) << root->ToString();
   }
   TF_RET_CHECK(order.size() == instruction_count());
-  std::unordered_set<const HloInstruction*> visited;
+  absl::flat_hash_set<const HloInstruction*> visited;
   for (const HloInstruction* instruction : order) {
     VLOG(3) << "Visiting ordered: " << instruction->ToString();
-    TF_RET_CHECK(instruction_iterators_.count(instruction) == 1)
+    TF_RET_CHECK(instruction_iterators_.contains(instruction))
         << "Instruction " << instruction->name() << " is not in computation "
         << name();
-    TF_RET_CHECK(visited.count(instruction) == 0)
+    TF_RET_CHECK(!visited.contains(instruction))
         << "Instruction " << instruction->name()
         << " appears more than once in order";
     HloInstruction* mutable_instruction =
@@ -845,29 +857,31 @@ Status HloComputation::Accept(
 std::unique_ptr<HloComputation> HloComputation::Clone(
     const string& suffix, HloCloneContext* context) {
   return CloneWithReplacements(
-      /*replacements=*/std::unordered_map<const HloInstruction*,
-                                          std::unique_ptr<HloInstruction>>(),
-      context, suffix);
+      /*replacements=*/absl::flat_hash_map<const HloInstruction*,
+                                           std::unique_ptr<HloInstruction>>(),
+      /*extra_parameters=*/{}, context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
@@ -875,17 +889,19 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
   replacements.emplace(std::move(r3));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
-    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
+    absl::Span<const HloInstruction* const> extra_parameters,
     HloCloneContext* context, const string& suffix) {
   std::unique_ptr<HloCloneContext> context_ptr;
   if (context == nullptr) {
@@ -951,6 +967,12 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
   }
 
   std::vector<std::unique_ptr<HloInstruction>> instructions;
+  // First add the extra parameters to 'instructions'.
+  for (const auto& instr : extra_parameters) {
+    CHECK_EQ(instr->opcode(), HloOpcode::kParameter)
+        << "Only parameter instructions are allowed in 'extra_parameters'";
+    instructions.emplace_back(instr->Clone());
+  }
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index a0ccbc583f8c409f29d31756fcc1fa1b4af7dc35..0cb9caddd089011f3e9a4473995847dc966dd402 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <list>
 #include <memory>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -323,11 +322,16 @@ class HloComputation {
   // that's not already in the computation, it's cloned and added to the new
   // computation.
   //
+  // 'extra_parameters' allows to specify additional parameters that should be
+  // added to the computation.
+  //
   // All relevant instructions are cloned, *including* unique_ptr in the
   // `replacements` map.
   std::unique_ptr<HloComputation> CloneWithReplacements(
-      std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      absl::flat_hash_map<const HloInstruction*,
+                          std::unique_ptr<HloInstruction>>
           replacements,
+      absl::Span<const HloInstruction* const> extra_parameters = {},
       HloCloneContext* context = nullptr, const string& suffix = "clone");
 
   // Convenience overloads for CloneWithReplacements.  You want to do
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 0361c87428f6e4c031d95492a5bc782ad388e5b5..3b88e9745c27d6e1f2a46e5c83ac2e8bd8d05150 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -15,8 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 
+#include <memory>
 #include <set>
+#include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -226,7 +230,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
         : computation_(computation) {}
 
     Status DefaultAction(HloInstruction* hlo_instruction) override {
-      EXPECT_EQ(0, visited_set_.count(hlo_instruction));
+      EXPECT_FALSE(visited_set_.contains(hlo_instruction));
       visited_set_.insert(hlo_instruction);
       last_visited_ = hlo_instruction;
       return Status::OK();
@@ -239,7 +243,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
     }
 
     HloComputation* computation_;
-    std::set<HloInstruction*> visited_set_;
+    absl::flat_hash_set<HloInstruction*> visited_set_;
     int64 finish_visit_calls_ = 0;
     HloInstruction* last_visited_ = nullptr;
   };
@@ -491,6 +495,41 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
   EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
 }
 
+TEST_F(HloComputationTest, CloneWithReplacements) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape r0s64 = ShapeUtil::MakeShape(S64, {});
+  Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  Shape r0u32 = ShapeUtil::MakeShape(U32, {});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "p.0.lhs"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "p.0.rhs"));
+  auto param2 =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, r0s64, "p.1"));
+  auto lt = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param0, param1));
+  auto module = CreateNewVerifiedModule();
+  auto computation =
+      module->AddEntryComputation(builder.Build(/*root_instruction=*/lt));
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(param2,
+                       HloInstruction::CreateParameter(2, r0s32, "p.1"));
+  auto param3 = HloInstruction::CreateParameter(3, r0u32, "p.2");
+  std::vector<const HloInstruction*> extra_parameters{param3.get()};
+  auto clone = computation->CloneWithReplacements(std::move(replacements),
+                                                  extra_parameters);
+  ASSERT_EQ(clone->num_parameters(), 4);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(0)->shape(), r0f32_));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(1)->shape(), r0f32_));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(2)->shape(), r0s32));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(3)->shape(), r0u32));
+}
+
 TEST_F(HloComputationTest, Stringification) {
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
@@ -606,5 +645,28 @@ TEST_F(HloComputationTest, StringificationCanonical) {
   EXPECT_EQ(computation->ToString(options), expected_computation2);
 }
 
+std::unique_ptr<HloComputation> MakeAddNComputation(int n) {
+  auto builder = HloComputation::Builder("add_n");
+  auto result = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "x_value"));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  for (int i = 0; i < n; ++i) {
+    result = builder.AddInstruction(HloInstruction::CreateBinary(
+        one->shape(), HloOpcode::kAdd, result, one));
+  }
+  return builder.Build();
+}
+
+TEST_F(HloComputationTest, DeepEquality) {
+  auto computation_a = MakeAddNComputation(200000);
+  auto computation_b = MakeAddNComputation(200000);
+  EXPECT_TRUE(*computation_a == *computation_b);
+
+  auto computation_c = MakeAddNComputation(199999);
+  EXPECT_FALSE(*computation_a == *computation_c);
+  EXPECT_FALSE(*computation_c == *computation_b);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 5e37883d3d8d5067bab873ac6b5f732e7360c5fa..e7ed858e8c5af83d08863d64a0aba162c75ed5cb 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -35,6 +35,34 @@ limitations under the License.
 
 namespace xla {
 
+// Checks whether instr is or transitively contains an instruction that we
+// shouldn't fold.
+//
+// Specifically, we don't fold kRng or kAfterAll instructions:
+//
+//  - kRng is already marked as side-effecting and so is skipped elsewhere, but
+//    we check for it here.  Even kRng weren't side-effecting and took an
+//    explicit seed, we *still* wouldn't want to constant-fold it, because the
+//    evaluator's handling of rng is not guaranteed to be identical to any
+//    particular backend's rng.
+//
+//  - kAfterAll needs to be skipped because a kAfterAll op with no args can
+//    currently materialize a token "out of thin air".  TODO(b/110532604):
+//    Remove this check once AfterAll requires at least one operand, in which
+//    case constant folding will be impossible.
+static bool IsOrContainsIllegalInstr(const HloInstruction* instr) {
+  if (instr->opcode() == HloOpcode::kAfterAll ||
+      instr->opcode() == HloOpcode::kRng) {
+    return true;
+  }
+  for (const HloComputation* c : instr->called_computations()) {
+    if (absl::c_any_of(c->instructions(), IsOrContainsIllegalInstr)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
   // Limit the constant folding to 0 iterations to skip folding loops. This
   // retains the behavior from before while loop support in HloEvaluator and may
@@ -52,25 +80,24 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           computation->root_instruction() != instruction) {
         continue;
       }
-      // Skip Constant, Parameter, Tuple, AfterAll operation.
-      // Tuple constants are not directly supported by any backends, hence
-      // folding Tuple is not useful and would in fact be expanded back into
-      // kTuple by Algebraic Simplifier.
-      // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one
-      // operand in which case constant folding will be impossible and this
-      // special case is not necessary.
-      if (instruction->opcode() == HloOpcode::kParameter ||
-          instruction->opcode() == HloOpcode::kConstant ||
-          instruction->opcode() == HloOpcode::kTuple ||
-          instruction->opcode() == HloOpcode::kAfterAll) {
-        continue;
-      }
 
       // Skip instructions with non-constant operands.
       if (!hlo_query::AllOperandsAreConstants(*instruction)) {
         continue;
       }
 
+      // Don't fold Constant, Parameter, and Tuple instructions.  Tuple
+      // constants are not directly supported by any backends, hence folding
+      // Tuple is not useful and would in fact be expanded back into kTuple by
+      // Algebraic Simplifier.
+      //
+      // (We do allow folding subcomputations that contain these instructions.)
+      if (instruction->opcode() == HloOpcode::kParameter ||
+          instruction->opcode() == HloOpcode::kConstant ||
+          instruction->opcode() == HloOpcode::kTuple) {
+        continue;
+      }
+
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
@@ -79,12 +106,23 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
         continue;
       }
 
+      // Check for instructions that we can't fold even if they appear inside of
+      // a subcomputation (e.g. a kCall).
+      if (IsOrContainsIllegalInstr(instruction)) {
+        continue;
+      }
+
+      // Don't constant-fold side-effecting instructions or instructions which
+      // contain side-effecting instructions.
+      if (instruction->HasSideEffect()) {
+        continue;
+      }
+
       // Don't constant fold unless it's a net positive or the output is small.
-      if (ShapeUtil::IsArray(instruction->shape())) {
+      if (instruction->shape().IsArray()) {
         int64 elements_in_removed_operands = 0;
         for (HloInstruction* operand : instruction->operands()) {
-          if (operand->user_count() == 1 &&
-              ShapeUtil::IsArray(operand->shape())) {
+          if (operand->user_count() == 1 && operand->shape().IsArray()) {
             elements_in_removed_operands +=
                 ShapeUtil::ElementsIn(operand->shape());
           }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 92b748d813c3efef83ef0155f1d5d3c637ce2c57..4bdc980c9ac4fb79cde0242f407aea7057474b27 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -268,5 +268,51 @@ TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) {
               GmockMatch(m::Pad(m::Constant(), m::Constant())));
 }
 
+TEST_F(HloConstantFoldingTest, DontFoldSubcomputationContainingAfterAll) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  Fn {
+    tok = token[] after-all()
+    ROOT root = f32[10] iota(), iota_dimension=0
+  }
+
+  ENTRY entry {
+    ROOT call = f32[10] call(), to_apply=Fn
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(HloConstantFoldingTest,
+       DontFoldSubcomputationTransitivelyContainingRng) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  InnerFn {
+    c0 = f32[] constant(0)
+    c1 = f32[] constant(1)
+    ROOT rng = f32[10] rng(c0, c1), distribution=rng_uniform
+  }
+
+  Fn {
+    ROOT fusion = f32[10] fusion(), kind=kLoop, calls=InnerFn
+  }
+
+  ENTRY entry {
+    ROOT call = f32[10] call(), to_apply=Fn
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_FALSE(result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index cb431aed47f0a751a697305986a8a0c194ac966c..76fd402b2c25c8dbed7902a458cd3af44f89cbd1 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -237,24 +237,17 @@ Status HloCostAnalysis::HandleDomain(const HloInstruction* domain) {
 
 Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   const Shape& lhs_shape = dot->operand(0)->shape();
-  const Shape& rhs_shape = dot->operand(1)->shape();
+  const Shape& dot_shape = dot->shape();
   const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
   // Count of elements along the reduction dimension (last dimension for the
   // rhs).
-  int64 reduction_width =
-      lhs_shape.dimensions(dnums.lhs_contracting_dimensions(0));
-  // First divide by reduction width before multiplying by rhs elements to avoid
-  // overflow.
-  int64 fma_count;
-  if (reduction_width == 0) {
-    fma_count = 0;
-  } else {
-    fma_count = (ShapeUtil::ElementsIn(lhs_shape) / reduction_width) *
-                ShapeUtil::ElementsIn(rhs_shape);
+  int64 reduction_width = 1;
+  for (auto dim : dnums.lhs_contracting_dimensions()) {
+    reduction_width *= lhs_shape.dimensions(dim);
   }
-
-  // We count an FMA operation as 2 floating point operations.
-  current_properties_[kFlopsKey] = kFmaFlops * fma_count;
+  // Each output elment requires reduction_widht FMA operations.
+  current_properties_[kFlopsKey] =
+      kFmaFlops * ShapeUtil::ElementsIn(dot_shape) * reduction_width;
   return Status::OK();
 }
 
@@ -292,7 +285,7 @@ Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) {
   // does not need to be multiplied by the number of input tensors - that's
   // already "priced in" by the sub-computation doing more work.
   auto arg = reduce->operand(0);
-  auto output_shape = ShapeUtil::IsArray(reduce->shape())
+  auto output_shape = reduce->shape().IsArray()
                           ? reduce->shape()
                           : reduce->shape().tuple_shapes(0);
   int64 reduction_count =
@@ -539,7 +532,7 @@ Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
 
 Status HloCostAnalysis::HandleFft(const HloInstruction* fft) {
   auto real_shape =
-      ShapeUtil::IsTuple(fft->operand(0)->shape())
+      fft->operand(0)->shape().IsTuple()
           ? ShapeUtil::GetTupleElementShape(fft->operand(0)->shape(), 0)
           : fft->operand(0)->shape();
   constexpr int kFmaPerComplexMul = 4;
@@ -561,7 +554,7 @@ Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) {
   double flops = 0.0;
   ShapeUtil::ForEachSubshape(crs->shape(),
                              [&](const Shape& subshape, const ShapeIndex&) {
-                               if (ShapeUtil::IsArray(subshape)) {
+                               if (subshape.IsArray()) {
                                  flops += ShapeUtil::ElementsIn(subshape);
                                }
                              });
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index ff32faf298dd1f04c5b769f2a88f76a7a1e18ae7..4d42770ba784ba15fae9518b40a75d8a2f038e66 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 #include "tensorflow/compiler/xla/statusor.h"
@@ -157,6 +158,87 @@ TEST_F(HloCostAnalysisTest, MatrixMultiply) {
             sizeof(float) * (10 * 5 + 5 * 30 + 10 * 30));
 }
 
+TEST_F(HloCostAnalysisTest, DotGeneral) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5, 5}), "lhs");
+  auto rhs =
+      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(0);
+  dnums.add_rhs_contracting_dimensions(1);
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 * 5 + 5 * 5 * 30 + 10 * 30));
+}
+
+TEST_F(HloCostAnalysisTest, DotGeneral2) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5, 5}), "lhs");
+  auto rhs =
+      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(0);
+  dnums.add_rhs_batch_dimensions(1);
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 * 5 + 5 * 5 * 30 + 5 * 10 * 30));
+}
+
+TEST_F(HloCostAnalysisTest, DotGeneral3) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5}), "lhs");
+  auto rhs = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 + 5 * 30 + 5 * 5 * 10 * 30));
+}
+
 TEST_F(HloCostAnalysisTest, Map) {
   XlaBuilder builder("map");
   auto input = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10}), "in");
@@ -529,7 +611,8 @@ TEST_F(HloCostAnalysisTest, DynamicSlice) {
   // Test the analysis on a slice.
   XlaBuilder builder("dynamic-slice");
   auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x");
-  DynamicSlice(x, ConstantR1<int32>(&builder, {1}), {1});
+  DynamicSlice(x, absl::Span<const XlaOp>({ConstantR0<int32>(&builder, 1)}),
+               {1});
   auto hlo_module = BuildHloGraph(&builder);
 
   // Run HLO cost analysis.
@@ -545,7 +628,7 @@ TEST_F(HloCostAnalysisTest, DynamicUpdateSlice) {
   XlaBuilder builder("dynamic-update-slice");
   auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x");
   DynamicUpdateSlice(x, ConstantR1<float>(&builder, {1.0}),
-                     ConstantR1<int32>(&builder, {1}));
+                     absl::Span<const XlaOp>({ConstantR0<int32>(&builder, 1)}));
   auto hlo_module = BuildHloGraph(&builder);
 
   // Run HLO cost analysis.
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index e41aeab19e49ddd4f2363746f0ff8ba1740139b3..d56f673455f9129b72e9d85eaf8cbf03cfee4302 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -105,12 +106,26 @@ StatusOr<HloInstruction*> MakeDynamicSliceHlo(
     absl::Span<const int64> slice_sizes) {
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, start_indices->parent());
+  int64 rank = start_indices->shape().dimensions(0);
+  std::vector<HloInstruction*> scalar_start_indices;
+  for (int i = 0; i < rank; ++i) {
+    // TODO(b/118437727): Update callers to provide scalars directly.
+    auto slice = computation->AddInstruction(HloInstruction::CreateSlice(
+        ShapeUtil::MakeShape(start_indices->shape().element_type(), {1}),
+        start_indices, {i}, {i + 1}, {1}));
+    scalar_start_indices.push_back(
+        computation->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(start_indices->shape().element_type(), {}),
+            slice)));
+  }
+  std::vector<Shape> scalar_start_indices_shapes(
+      rank, ShapeUtil::MakeShape(start_indices->shape().element_type(), {}));
   TF_ASSIGN_OR_RETURN(
       Shape dynamic_slice_shape,
       ShapeInference::InferDynamicSliceShape(
-          operand->shape(), start_indices->shape(), slice_sizes));
+          operand->shape(), scalar_start_indices_shapes, slice_sizes));
   return computation->AddInstruction(HloInstruction::CreateDynamicSlice(
-      dynamic_slice_shape, operand, start_indices, slice_sizes));
+      dynamic_slice_shape, operand, scalar_start_indices, slice_sizes));
 }
 
 StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
@@ -119,17 +134,31 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, update->parent());
   CHECK_EQ(computation, start_indices->parent());
+  int64 rank = start_indices->shape().dimensions(0);
+  std::vector<HloInstruction*> scalar_start_indices;
+  for (int i = 0; i < rank; ++i) {
+    // TODO(b/118437727): Update callers to provide scalars directly.
+    auto slice = computation->AddInstruction(HloInstruction::CreateSlice(
+        ShapeUtil::MakeShape(start_indices->shape().element_type(), {1}),
+        start_indices, {i}, {i + 1}, {1}));
+    scalar_start_indices.push_back(
+        computation->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(start_indices->shape().element_type(), {}),
+            slice)));
+  }
+  std::vector<Shape> scalar_start_indices_shapes(
+      rank, ShapeUtil::MakeShape(start_indices->shape().element_type(), {}));
   TF_ASSIGN_OR_RETURN(
       Shape dynamic_update_slice_shape,
       ShapeInference::InferDynamicUpdateSliceShape(
-          operand->shape(), update->shape(), start_indices->shape()));
+          operand->shape(), update->shape(), scalar_start_indices_shapes));
   return computation->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      dynamic_update_slice_shape, operand, update, start_indices));
+      dynamic_update_slice_shape, operand, update, scalar_start_indices));
 }
 
-StatusOr<HloInstruction*> MakeBroadcastHlo(
-    HloInstruction* operand, absl::Span<const int64> broadcast_dimensions,
-    absl::Span<const int64> result_shape_bounds) {
+HloInstruction* MakeBroadcastHlo(HloInstruction* operand,
+                                 absl::Span<const int64> broadcast_dimensions,
+                                 absl::Span<const int64> result_shape_bounds) {
   HloComputation* computation = operand->parent();
   Shape broadcast_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
                                                result_shape_bounds);
@@ -189,8 +218,7 @@ StatusOr<HloInstruction*> MakeMapHlo(absl::Span<HloInstruction* const> operands,
   for (const HloInstruction* operand : operands) {
     CHECK_EQ(computation, operand->parent());
     operand_shapes.push_back(&operand->shape());
-    max_operand_rank =
-        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+    max_operand_rank = std::max(max_operand_rank, operand->shape().rank());
   }
   std::vector<int64> map_dims(max_operand_rank);
   std::iota(map_dims.begin(), map_dims.end(), 0);
@@ -207,7 +235,7 @@ StatusOr<HloInstruction*> MakeReduceHlo(HloInstruction* operand,
                                         HloOpcode binary_opcode,
                                         HloModule* module) {
   DCHECK_NE(nullptr, module);
-  std::vector<int64> all_dims(ShapeUtil::Rank(operand->shape()));
+  std::vector<int64> all_dims(operand->shape().rank());
   std::iota(all_dims.begin(), all_dims.end(), 0);
 
   auto scalar_shape = ShapeUtil::MakeShape(operand->shape().element_type(), {});
@@ -366,9 +394,9 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
   return MakePadHlo(operand, zero, padding_config);
 }
 
-StatusOr<HloInstruction*> BroadcastZeros(
-    HloComputation* computation, PrimitiveType element_type,
-    absl::Span<const int64> broadcast_dimensions) {
+HloInstruction* BroadcastZeros(HloComputation* computation,
+                               PrimitiveType element_type,
+                               absl::Span<const int64> broadcast_dimensions) {
   HloInstruction* zero = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
   return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{},
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 8e5ddbbd503a501bd493aec43a2ccd4db883ef0c..1c3174e9c89c16cb11589e7c0235bdf13eae6b85 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -82,9 +82,9 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
 
 // Creates a broadcast HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> MakeBroadcastHlo(
-    HloInstruction* operand, absl::Span<const int64> broadcast_dimensions,
-    absl::Span<const int64> result_shape_bounds);
+HloInstruction* MakeBroadcastHlo(HloInstruction* operand,
+                                 absl::Span<const int64> broadcast_dimensions,
+                                 absl::Span<const int64> result_shape_bounds);
 
 // Creates a GetTupleElement HLO instruction and adds it to the computation
 // containing `operand`.
@@ -198,9 +198,9 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
 // Broadcasts a zero value of type `element_type` into a tensor with element
 // type `element_type` and dimension bounds `broadcast_dimensions`.  The
 // broadcast instruction is emitted into `computation`.
-StatusOr<HloInstruction*> BroadcastZeros(
-    HloComputation* computation, PrimitiveType element_type,
-    absl::Span<const int64> broadcast_dimensions);
+HloInstruction* BroadcastZeros(HloComputation* computation,
+                               PrimitiveType element_type,
+                               absl::Span<const int64> broadcast_dimensions);
 
 // Creates a HLO computation that takes arguments of type `domain` and produces
 // a value of type `range`.
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index aaa9ec60eb3c4e0159ed40b37d772e0973d306ec..6025e6a77941369f75ebaa98bdf0979669b3a03c 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -56,9 +56,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) {
   entry_computation->set_root_instruction(first_1_dims_collapsed);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR1<int32>({3, 4})}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({3, 4})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR1<int32>({3, 4}));
 }
 
@@ -77,10 +77,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(
-          *module,
-          {LiteralUtil::CreateR3<int32>(
-              {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR3<int32>(
+                                      {{{1, 2}, {3, 4}, {5, 6}},
+                                       {{-1, -2}, {-3, -4}, {-5, -6}}})}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR2<int32>(
                {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}}));
@@ -101,8 +100,7 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module,
-                                  {LiteralUtil::CreateR1<int32>({9, 10})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({9, 10})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{9, 10}}));
 }
 
@@ -121,8 +119,7 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module,
-                                  {LiteralUtil::CreateR1<int32>({9, 10})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({9, 10})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR3<int32>({{{9, 10}}}));
 }
 
@@ -141,7 +138,7 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module, {LiteralUtil::CreateR0<int32>(9)}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<int32>(9)}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{9}}));
 }
 
@@ -160,8 +157,8 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(
-          *module, {LiteralUtil::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
+      evaluator.Evaluate(*module,
+                         {LiteralUtil::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR3<int32>({{{1, 2}}, {{3, 4}}, {{5, 6}}}));
 }
@@ -180,9 +177,9 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) {
   entry_computation->set_root_instruction(zero_padded_param);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR1<int32>({3, 4})}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({3, 4})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR1<int32>({0, 0, 0, 3, 4, 0}));
 }
 
@@ -194,15 +191,14 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
                                              /*output_shape_dims=*/{2, 2},
                                              &param, &entry_computation);
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloInstruction * zeros,
-      BroadcastZeros(module->entry_computation(), S32, {2, 2}));
+  HloInstruction* zeros =
+      BroadcastZeros(module->entry_computation(), S32, {2, 2});
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module, {LiteralUtil::CreateR0<int32>(0)}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<int32>(0)}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{0, 0}, {0, 0}}));
 }
 
@@ -214,15 +210,14 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
                                              /*output_shape_dims=*/{2, 2},
                                              &param, &entry_computation);
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloInstruction * zeros,
-      BroadcastZeros(module->entry_computation(), F32, {2, 2}));
+  HloInstruction* zeros =
+      BroadcastZeros(module->entry_computation(), F32, {2, 2});
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR0<float>(0.0f)}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<float>(0.0f)}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR2<float>({{0.0f, 0.0f}, {0.0f, 0.0f}}));
 }
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 3ed3d3c11c71dc534f193ba3ffb556b0eb0c80e4..3144a84805454488f417391f40ed6b9e9facc752 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -107,7 +107,7 @@ bool HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple(
           return false;
         }
       }
-      if (!visited.count(user)) {
+      if (!visited.contains(user)) {
         stack.push_back(user);
       }
     }
@@ -190,7 +190,7 @@ string HloDataflowAnalysis::ToString() const {
   for (const HloComputation* computation : module_.computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
       StrAppend(&out, "    ", instruction->name(), ":\n");
-      if (ShapeUtil::IsTuple(instruction->shape())) {
+      if (instruction->shape().IsTuple()) {
         GetInstructionValueSet(instruction)
             .ForEachElement([this, &instruction, &out](
                                 const ShapeIndex& index,
@@ -256,7 +256,7 @@ bool HloDataflowAnalysis::Phi(
         input_value_ids.push_back(value->id());
       }
     }
-    std::sort(input_value_ids.begin(), input_value_ids.end());
+    absl::c_sort(input_value_ids);
     input_value_ids.erase(
         std::unique(input_value_ids.begin(), input_value_ids.end()),
         input_value_ids.end());
@@ -271,8 +271,7 @@ bool HloDataflowAnalysis::Phi(
     if (current_value_defined_here) {
       VLOG(5) << "current_value_defined_here: " << current_value->ToString();
       CHECK(current_value->is_phi());
-      auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
-                          current_value->id());
+      auto it = absl::c_find(input_value_ids, current_value->id());
       if (it != input_value_ids.end()) {
         input_value_ids.erase(it);
       }
@@ -921,8 +920,7 @@ StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
   for (auto& pair : dataflow_analysis->values_) {
     dataflow_analysis->values_vector_.push_back(&pair.second);
   }
-  std::sort(dataflow_analysis->values_vector_.begin(),
-            dataflow_analysis->values_vector_.end(), HloValue::IdLessThan);
+  absl::c_sort(dataflow_analysis->values_vector_, HloValue::IdLessThan);
 
   TF_DCHECK_OK(dataflow_analysis->Verify());
 
@@ -937,9 +935,7 @@ Status HloDataflowAnalysis::Verify() const {
   for (const HloValue* value : values()) {
     for (const HloPosition& position : value->positions()) {
       const HloValueSet& value_set = GetValueSet(position);
-      TF_RET_CHECK(std::find(value_set.values().begin(),
-                             value_set.values().end(),
-                             value) != value_set.values().end())
+      TF_RET_CHECK(absl::c_linear_search(value_set.values(), value))
           << "Value set at position " << position << " does not contain value "
           << value->ToShortString();
     }
@@ -954,9 +950,7 @@ Status HloDataflowAnalysis::Verify() const {
         const HloValueSet& value_set = pair.second;
         const HloPosition position{instruction, index};
         for (const HloValue* value : value_set.values()) {
-          TF_RET_CHECK(std::find(value->positions().begin(),
-                                 value->positions().end(),
-                                 position) != value->positions().end())
+          TF_RET_CHECK(absl::c_linear_search(value->positions(), position))
               << "Value set at position " << position
               << " unexpectedly contains value " << value->ToShortString();
         }
@@ -1041,11 +1035,10 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
       // Check if one operand of kAdd fused root is kDot or kConvolution.
       auto* add = user->fused_expression_root();
       auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot;
-                       });
+          absl::c_find_if(add->operands(), [&](HloInstruction* operand) {
+            return operand->opcode() == HloOpcode::kConvolution ||
+                   operand->opcode() == HloOpcode::kDot;
+          });
       if (add_operand_it == add->operands().end()) {
         return false;
       }
@@ -1100,16 +1093,15 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     // *) The root instruction of the called computation is element-wise on
     //    'operand'.
     const bool found_caller_use =
-        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
+        absl::c_find_if(uses, [user](const HloUse& use) {
           return use.instruction == user;
         }) != uses.end();
     auto* callee_root = user->to_apply()->root_instruction();
     const bool found_elementwise_callee_use =
-        std::find_if(
-            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
-              return use.instruction == callee_root &&
-                     callee_root->IsElementwiseOnOperand(use.operand_number);
-            }) != uses.end();
+        absl::c_find_if(uses, [callee_root](const HloUse& use) {
+          return use.instruction == callee_root &&
+                 callee_root->IsElementwiseOnOperand(use.operand_number);
+        }) != uses.end();
     return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 94de7c55dd2402e55ec344b79c24af2d8283fe73..4a7c4963b7b399e625da907b3810c42df7ee2bd3 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -73,8 +73,8 @@ class HloDataflowAnalysisTest : public HloTestBase,
   bool InstructionsMayInterfere(const HloOrdering& ordering,
                                 const HloInstruction* a,
                                 const HloInstruction* b) {
-    EXPECT_FALSE(ShapeUtil::IsTuple(a->shape()));
-    EXPECT_FALSE(ShapeUtil::IsTuple(b->shape()));
+    EXPECT_FALSE(a->shape().IsTuple());
+    EXPECT_FALSE(b->shape().IsTuple());
     return ordering.MayInterfere(analysis_->GetValueDefinedAt(a),
                                  analysis_->GetValueDefinedAt(b), *analysis_);
   }
@@ -1901,9 +1901,9 @@ ENTRY %AddDependency (p: f32[3]) -> f32[3] {
   EXPECT_FALSE(analysis->ValueIsDefinedAt(root));
 }
 
-INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
-                        HloDataflowAnalysisTest,
-                        ::testing::Values(false, true));
+INSTANTIATE_TEST_SUITE_P(HloDataflowAnalysisInstantiation,
+                         HloDataflowAnalysisTest,
+                         ::testing::Values(false, true));
 
 class HloDataflowAnalysisTestBase : public HloTestBase {
  protected:
@@ -1970,12 +1970,13 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2012,12 +2013,13 @@ TEST_F(DoesNotUseOperandBufferTest, IndirectUses) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2150,17 +2152,17 @@ TEST_F(CanShareOperandBufferWithUserTest,
 
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, data_shape, "param0"));
-  auto index = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int64>({0, 0})));
-  auto ds = builder.AddInstruction(
-      HloInstruction::CreateDynamicSlice(slice_shape, param, index, {1, 2, 2}));
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(0)));
+  auto ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+      slice_shape, param, {zero, zero}, {1, 2, 2}));
 
-  auto dus = builder.AddInstruction(
-      HloInstruction::CreateDynamicUpdateSlice(data_shape, param, ds, index));
+  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      data_shape, param, ds, {zero, zero}));
 
   BuildModule(builder.Build());
   auto fusion = computation_->CreateFusionInstruction(
-      {dus, ds, index}, HloInstruction::FusionKind::kLoop);
+      {dus, ds, zero}, HloInstruction::FusionKind::kLoop);
   RunAnalysis();
 
   EXPECT_TRUE(
@@ -2219,12 +2221,13 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2259,12 +2262,13 @@ TEST_F(CanShareOperandBufferWithUserTest,
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape_bf16, convert1, update, starts));
+          data_shape_bf16, convert1, update,
+          std::initializer_list<HloInstruction*>({starts})));
 
   auto convert2 = builder.AddInstruction(
       HloInstruction::CreateConvert(data_shape, dynamic_update_slice));
@@ -2290,10 +2294,13 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
       HloInstruction::CreateParameter(0, data_shape, "data"));
   auto update = builder.AddInstruction(
       HloInstruction::CreateParameter(1, update_shape, "update"));
-  auto starts = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, starts_shape, "starts"));
+  auto start0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "start0"));
+  auto start1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, starts_shape, "start1"));
+
   auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      data_shape, data, update, starts));
+      data_shape, data, update, {start0, start1}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -2304,7 +2311,9 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
   EXPECT_FALSE(
       dataflow_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {}));
   EXPECT_FALSE(
-      dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
+      dataflow_analysis_->CanShareOperandBufferWithUser(start0, {}, dus, {}));
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(start1, {}, dus, {}));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index 7d35e251ca21951036336ff1a1eb4aabc87bc5ca..a5a11f09cf4f857b992e5ede3a9dbc5a937ce722 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -65,7 +66,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
 
   // Now DCE HloComputations.  First, collect the computations that are
   // referenced by some remaining instruction.
-  std::unordered_set<HloComputation*> live_computations;
+  absl::flat_hash_set<HloComputation*> live_computations;
   if (HloComputation* entry_computation = module->entry_computation()) {
     live_computations.insert(entry_computation);
   }
@@ -79,7 +80,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
 
   // Remove dead computations.
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (live_computations.count(computation) == 0) {
+    if (!live_computations.contains(computation)) {
       TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(computation));
       changed = true;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 1fa4259a3e42286cbc911907eea563e6ca6f8611..b5d72b386f89568cc3066b2e497be98428d1ed0c 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -43,9 +43,7 @@ class HloDceTest : public HloTestBase {
   // Returns whether the given instruction exists in the given computation.
   bool HasInstruction(const HloComputation& computation,
                       const HloInstruction* instruction) {
-    return std::find(computation.instructions().begin(),
-                     computation.instructions().end(),
-                     instruction) != computation.instructions().end();
+    return absl::c_linear_search(computation.instructions(), instruction);
   }
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index c6d02f9f67bb599e496d20fc2acf2e627ed54438..7cdb7f6bdf26241cda4fabbb5ccaf6e6f7de39ce 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -230,10 +230,10 @@ HloDomainMap::MakeNonDomainInstructions(
     }
   }
   // sort instructions according to instructions_order
-  std::sort(instructions.begin(), instructions.end(),
-            [&instructions_order](HloInstruction* a, HloInstruction* b) {
-              return instructions_order.at(a) < instructions_order.at(b);
-            });
+  absl::c_sort(instructions,
+               [&instructions_order](HloInstruction* a, HloInstruction* b) {
+                 return instructions_order.at(a) < instructions_order.at(b);
+               });
   return instructions;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index a40b6d888c548bf0909f413c092fc32cfc0a4892..9b0f2b2a0f4dd5d1d1191e9ab0637cc3034b50da 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -68,7 +68,7 @@ Shape GetConvertedTupleShape(const Shape& shape, PrimitiveType from_type,
   std::vector<Shape> new_tuple_subshapes;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
     Shape subshape = ShapeUtil::GetTupleElementShape(shape, i);
-    CHECK(!ShapeUtil::IsTuple(subshape));
+    CHECK(!subshape.IsTuple());
     if (subshape.element_type() == from_type) {
       subshape = ShapeUtil::ChangeElementType(subshape, to_type);
     }
@@ -92,7 +92,7 @@ HloInstruction* ConvertTupleElements(HloInstruction* hlo,
     HloInstruction* element = computation->AddInstruction(
         HloInstruction::CreateGetTupleElement(ele_shape, hlo, i));
     const Shape& to_ele_shape = ShapeUtil::GetTupleElementShape(to_shape, i);
-    CHECK(!ShapeUtil::IsTuple(ele_shape));
+    CHECK(!ele_shape.IsTuple());
     if (ele_shape.element_type() != to_ele_shape.element_type()) {
       element = computation->AddInstruction(
           HloInstruction::CreateConvert(to_ele_shape, element));
@@ -190,7 +190,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
         TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
 
         new_hlo = ToElementType(new_hlo, eliminate_type_);
-      } else if (ShapeUtil::IsTuple(hlo->shape())) {
+      } else if (hlo->shape().IsTuple()) {
         Shape old_shape = hlo->shape();
         Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                  replace_with_type_);
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index a3b56a44a0b02923585c1dcb69571479236188a3..5b633784e2f306290ca6c096f67c657be1f188c8 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -28,15 +28,7 @@ using ::testing::Eq;
 using ::testing::Not;
 using ::testing::ResultOf;
 
-class HloElementTypeConverterTest : public HloTestBase {
- public:
-  std::unique_ptr<HloModule> CreateModuleFromHloString(
-      const string& hlo_string) {
-    return HloRunner::CreateModuleFromString(hlo_string,
-                                             GetDebugOptionsForTest())
-        .ValueOrDie();
-  }
-};
+using HloElementTypeConverterTest = HloTestBase;
 
 TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
   const string& hlo_string = R"(
@@ -47,7 +39,7 @@ TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
            custom_call_target="foo"
     }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_FALSE(converted);
@@ -63,7 +55,7 @@ TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
       outfeed = token[] outfeed(infeed.data, token0)
     }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_FALSE(converted);
@@ -73,17 +65,16 @@ TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
   const string& hlo_string = R"(
     HloModule NestedTuples
     ENTRY NestedTuples.v5 {
-      constant.4 = bf16[] constant(42)
       constant.2 = f32[2]{0} constant({1, 2})
-      constant.3 = bf16[] constant(42)
-      add = bf16[] add(constant.2, constant.3)
-      tuple = (f32[2]{0}, bf16[]) tuple(constant.2, add)
+      constant.3 = bf16[2]{0} constant({42, 42})
+      add = bf16[2]{0} add(constant.2, constant.3)
+      tuple = (f32[2]{0}, bf16[2]{0}) tuple(constant.2, add)
       constant.5 = bf16[2]{0} constant({22, 44})
-      ROOT tuple.1 = ((f32[2]{0}, bf16[]), bf16[2]{0}) tuple(tuple, constant.5)
+      ROOT tuple.1 = ((f32[2]{0}, bf16[2]{0}), bf16[2]{0}) tuple(tuple, constant.5)
     }
   )";
 
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -111,7 +102,7 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
     }
   )";
 
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -135,7 +126,7 @@ ENTRY main {
   ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
 }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -161,7 +152,7 @@ ENTRY main {
   ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform
 }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 934c082bb9f003b1d2d80835f09a8f4109c7e7fd..ecde37be58a381be7968b04de7bbe1d85d7ddb25 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <cmath>
 #include <cstdlib>
 #include <functional>
+#include <iterator>
 #include <string>
 #include <type_traits>
-#include <utility>
 #include <vector>
 
 #include "absl/algorithm/container.h"
@@ -29,7 +29,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -136,8 +135,44 @@ StatusOr<Literal> Compare<complex64>(const Shape& shape, HloOpcode opcode,
   return std::move(result);
 }
 
+template <>
+StatusOr<Literal> Compare<complex128>(const Shape& shape, HloOpcode opcode,
+                                      LiteralSlice lhs_literal,
+                                      LiteralSlice rhs_literal) {
+  std::function<bool(complex128, complex128)> compare_op;
+  switch (opcode) {
+    case HloOpcode::kEq:
+      compare_op = [](complex128 lhs_el, complex128 rhs_el) {
+        return lhs_el == rhs_el;
+      };
+      break;
+    case HloOpcode::kNe:
+      compare_op = [](complex128 lhs_el, complex128 rhs_el) {
+        return lhs_el != rhs_el;
+      };
+      break;
+    default:
+      LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
+                 << HloOpcodeString(opcode);
+  }
+
+  Literal result(shape);
+  TF_RETURN_IF_ERROR(
+      result.Populate<bool>([&](absl::Span<const int64> multi_index) {
+        return compare_op(lhs_literal.Get<complex128>(multi_index),
+                          rhs_literal.Get<complex128>(multi_index));
+      }));
+
+  return std::move(result);
+}
+
 }  // namespace
 
+// Note that unsupported types by the typed visitor does not necessarily imply
+// the non-typed HloEvaluator (parent evaluator) would not support them either
+// in the type-agnostic handler. For e.g., HandleGetTupleElement in the parent
+// type-agnostic evaluator will be able to accept Tuple primitive type, whereas
+// HloEvaluatorTypedVisitor cannot.
 HloEvaluator::HloEvaluator(int64 max_loop_iterations)
     : max_loop_iterations_(max_loop_iterations) {
   typed_visitors_[PRED] =
@@ -145,22 +180,14 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
   typed_visitors_[U8] =
       absl::make_unique<HloEvaluatorTypedVisitor<uint8>>(this);
   typed_visitors_[U16] =
-      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
-        return Unimplemented(
-            "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
-            "U16.");
-      });
+      absl::make_unique<HloEvaluatorTypedVisitor<uint16>>(this);
   typed_visitors_[U32] =
       absl::make_unique<HloEvaluatorTypedVisitor<uint32>>(this);
   typed_visitors_[U64] =
       absl::make_unique<HloEvaluatorTypedVisitor<uint64>>(this);
   typed_visitors_[S8] = absl::make_unique<HloEvaluatorTypedVisitor<int8>>(this);
   typed_visitors_[S16] =
-      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
-        return Unimplemented(
-            "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
-            "S16.");
-      });
+      absl::make_unique<HloEvaluatorTypedVisitor<int16>>(this);
   typed_visitors_[S32] =
       absl::make_unique<HloEvaluatorTypedVisitor<int32>>(this);
   typed_visitors_[S64] =
@@ -173,6 +200,8 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
       absl::make_unique<HloEvaluatorTypedVisitor<double>>(this);
   typed_visitors_[C64] =
       absl::make_unique<HloEvaluatorTypedVisitor<complex64>>(this);
+  typed_visitors_[C128] =
+      absl::make_unique<HloEvaluatorTypedVisitor<complex128>>(this);
 
   // Most of the evaluator computations we use don't support BF16 (e.g.,
   // std::ceil, std::tanh). To make evaluator work with BF16, we set all
@@ -198,65 +227,30 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
       });
 }
 
-template <typename LiteralPtr>
-StatusOr<Literal> HloEvaluator::Evaluate(
-    const HloModule& module, absl::Span<const LiteralPtr> arg_literals) {
-  XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString());
-
-  evaluated_.clear();
-  arg_literals_.clear();
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literals_.push_back(&*literal_ptr);
-  }
-
-  TF_RETURN_IF_ERROR(module.entry_computation()->Accept(this));
-
-  return GetEvaluatedLiteralFor(module.entry_computation()->root_instruction())
-      .Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    const HloModule& module, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literal_ptrs.push_back(&literal_ptr);
-  }
-  return Evaluate<const Literal*>(module, arg_literal_ptrs);
-}
-
-template <typename LiteralPtr>
 StatusOr<Literal> HloEvaluator::Evaluate(
     const HloComputation& computation,
-    absl::Span<const LiteralPtr> arg_literals) {
+    absl::Span<const Literal* const> arg_literals) {
   CHECK(computation.parent() != nullptr);
   XLA_VLOG_LINES(
       2, "HloEvaluator::Evaluate computation:\n" + computation.ToString());
 
-  evaluated_.clear();
-  arg_literals_.clear();
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literals_.push_back(&*literal_ptr);
+  if (arg_literals.size() != computation.num_parameters()) {
+    return InvalidArgument(
+        "Expected %d argument%s, but got %d.", computation.num_parameters(),
+        computation.num_parameters() == 1 ? "" : "s", arg_literals.size());
   }
-
-  TF_RETURN_IF_ERROR(computation.Accept(this));
-  return GetEvaluatedLiteralFor(computation.root_instruction()).Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    const HloComputation& computation, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literal_ptrs.push_back(&literal_ptr);
+  for (int64 i = 0; i < arg_literals.size(); ++i) {
+    const auto& computation_shape =
+        computation.parameter_instruction(i)->shape();
+    const auto& arg_shape = arg_literals[i]->shape();
+    if (!ShapeUtil::Equal(computation_shape, arg_shape)) {
+      return InvalidArgument(
+          "Shape mismatch at parameter %d. Computation expected %s, but arg "
+          "was %s.",
+          i, ShapeUtil::HumanStringWithLayout(computation_shape),
+          ShapeUtil::HumanString(arg_shape));
+    }
   }
-  return Evaluate<const Literal*>(computation, arg_literal_ptrs);
-}
-
-template <typename LiteralPtr>
-StatusOr<Literal> HloEvaluator::Evaluate(
-    HloInstruction* instruction, absl::Span<const LiteralPtr> arg_literals) {
-  TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
 
   evaluated_.clear();
   arg_literals_.clear();
@@ -264,33 +258,20 @@ StatusOr<Literal> HloEvaluator::Evaluate(
     arg_literals_.push_back(&*literal_ptr);
   }
 
-  // Evaluate operands of Parameter type against the input literals which
-  // caches the evaluated literal results.
-  for (const auto operand : instruction->operands()) {
-    if (operand->opcode() == HloOpcode::kParameter) {
-      const Literal* input_literal = arg_literals_[operand->parameter_number()];
-      VLOG(2) << "Parameter operand evaluated to: "
-              << input_literal->ToString();
-      TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
-
-      evaluated_[operand] = input_literal->Clone();
-    }
+  // Re-seed RNG, either from the configuration's seed or a monotonic
+  // per-evaluator seed (which prevents two evaluators from returning the same
+  // random sequence).
+  if (computation.parent()->config().seed()) {
+    seed_ = computation.parent()->config().seed();
+  } else {
+    // Start global_seed at a (true) random value.
+    static std::atomic<uint64> global_seed{std::random_device()()};
+    seed_ = global_seed.fetch_add(1);
   }
+  engine_.seed(seed_);
 
-  TF_RETURN_IF_ERROR(Preprocess(instruction));
-  TF_RETURN_IF_ERROR(instruction->Visit(this));
-  TF_RETURN_IF_ERROR(Postprocess(instruction));
-  return GetEvaluatedLiteralFor(instruction).Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    HloInstruction* instruction, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal : arg_literals) {
-    arg_literal_ptrs.push_back(&literal);
-  }
-  return Evaluate<const Literal*>(instruction, arg_literal_ptrs);
+  TF_RETURN_IF_ERROR(computation.Accept(this));
+  return GetEvaluatedLiteralFor(computation.root_instruction()).Clone();
 }
 
 StatusOr<Literal> HloEvaluator::Evaluate(HloInstruction* instruction) {
@@ -408,16 +389,45 @@ Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status HloEvaluator::HandleGetDimensionSize(
+    HloInstruction* get_dimension_size) {
+  HloInstruction* operand = get_dimension_size->mutable_operand(0);
+  int64 dim = get_dimension_size->dimension();
+  if (dynamic_dimension_inference_ == nullptr) {
+    return InvalidArgument(
+        "Evaluator cannot evaluate get_dimension_size without "
+        "set_dynamic_dimension_inference.");
+  }
+  HloInstruction* dynamic_size =
+      dynamic_dimension_inference_->GetDynamicSize(operand, {}, dim);
+  if (dynamic_size != nullptr) {
+    evaluated_[get_dimension_size] =
+        GetEvaluatedLiteralFor(dynamic_size).Clone();
+    return Status::OK();
+  }
+
+  const Shape& shape = get_dimension_size->operand(0)->shape();
+  Literal output(ShapeUtil::MakeShape(U32, {}));
+  output.PopulateWithValue(
+      static_cast<uint32>(shape.dimensions(get_dimension_size->dimension())));
+  evaluated_[get_dimension_size] = std::move(output);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
+  // Nothing to do other than sanity checks. Parameters' values are stored in
+  // arg_literals_.
   CHECK_LT(parameter->parameter_number(), arg_literals_.size());
+
+#ifndef NDEBUG
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
   VLOG(2) << "Parameter evaluated to: " << input_literal->ToString();
   DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()))
       << "parameter shape is: " << ShapeUtil::HumanString(parameter->shape())
       << ", but input literal shape is: "
       << ShapeUtil::HumanString(input_literal->shape());
+#endif
 
-  evaluated_[parameter] = input_literal->Clone();
   return Status::OK();
 }
 
@@ -442,8 +452,8 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
   // The result concatenate dimension is going to be the sum of all
   // concatenate dimensions of the operands taking part of the operation.
   const Shape& reference_shape = operands[0]->shape();
-  CHECK(ShapeUtil::IsArray(reference_shape));
-  const int64 rank = ShapeUtil::Rank(reference_shape);
+  CHECK(reference_shape.IsArray());
+  const int64 rank = reference_shape.rank();
   const int64 concat_dim = concatenate->dimensions()[0];
   CHECK_GE(concat_dim, 0);
   CHECK_LT(concat_dim, rank);
@@ -453,7 +463,7 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
 
   for (int64 i = 1; i < operands.size(); ++i) {
     const Shape& operand_shape = operands[i]->shape();
-    CHECK(ShapeUtil::IsArray(operand_shape));
+    CHECK(operand_shape.IsArray());
     // Accumulate the concat dimension from all tensors taking part to the
     // operation.
     concat_dimensions[concat_dim] +=
@@ -530,6 +540,13 @@ Status HloEvaluator::HandleReal(HloInstruction* real) {
       TF_ASSIGN_OR_RETURN(evaluated_[real], std::move(result_or));
       break;
     }
+    case C128: {
+      auto result_or = ElementWiseUnaryOpImpl<double, complex128>(
+          real, [](complex128 elem_operand) { return std::real(elem_operand); },
+          GetEvaluatedLiteralFor(operand));
+      TF_ASSIGN_OR_RETURN(evaluated_[real], std::move(result_or));
+      break;
+    }
     case F16: {
       auto result_or = ElementWiseUnaryOpImpl<Eigen::half, Eigen::half>(
           real, [](Eigen::half elem_operand) { return elem_operand; },
@@ -560,11 +577,61 @@ Status HloEvaluator::HandleReal(HloInstruction* real) {
 }
 
 Status HloEvaluator::HandleImag(HloInstruction* imag) {
-  auto result_or = ElementWiseUnaryOpImpl<float, complex64>(
-      imag, [](complex64 elem_operand) { return std::imag(elem_operand); },
-      GetEvaluatedLiteralFor(imag->operand(0)));
+  auto operand = imag->operand(0);
+  switch (operand->shape().element_type()) {
+    case C64: {
+      auto result_or = ElementWiseUnaryOpImpl<float, complex64>(
+          imag, [](complex64 elem_operand) { return std::imag(elem_operand); },
+          GetEvaluatedLiteralFor(imag->operand(0)));
+
+      TF_ASSIGN_OR_RETURN(evaluated_[imag], std::move(result_or));
+      break;
+    }
+    case C128: {
+      auto result_or = ElementWiseUnaryOpImpl<double, complex128>(
+          imag, [](complex128 elem_operand) { return std::imag(elem_operand); },
+          GetEvaluatedLiteralFor(imag->operand(0)));
+
+      TF_ASSIGN_OR_RETURN(evaluated_[imag], std::move(result_or));
+      break;
+    }
+    default:
+      LOG(FATAL) << "HandleImag: unknown/unhandled primitive type: "
+                 << PrimitiveType_Name(operand->shape().element_type());
+  }
 
-  TF_ASSIGN_OR_RETURN(evaluated_[imag], std::move(result_or));
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleComplex(HloInstruction* complex) {
+  const Literal& real = GetEvaluatedLiteralFor(complex->operand(0));
+  const Literal& imag = GetEvaluatedLiteralFor(complex->operand(1));
+  TF_RET_CHECK(ShapeUtil::Compatible(real.shape(), imag.shape()));
+
+  Literal result(complex->shape());
+  switch (complex->shape().element_type()) {
+    case C64: {
+      TF_RETURN_IF_ERROR(
+          result.Populate<complex64>([&](absl::Span<const int64> multi_index) {
+            return std::complex<float>(real.Get<float>(multi_index),
+                                       imag.Get<float>(multi_index));
+          }));
+      break;
+    }
+    case C128: {
+      TF_RETURN_IF_ERROR(
+          result.Populate<complex128>([&](absl::Span<const int64> multi_index) {
+            return std::complex<float>(real.Get<double>(multi_index),
+                                       imag.Get<double>(multi_index));
+          }));
+      break;
+    }
+    default:
+      LOG(FATAL) << "HandleComplex: unknown/unhandled primitive type: "
+                 << PrimitiveType_Name(complex->shape().element_type());
+  }
+
+  evaluated_[complex] = std::move(result);
   return Status::OK();
 }
 
@@ -601,8 +668,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
           evaluated_[compare],
           Compare<uint8>(compare->shape(), opcode, lhs_literal, rhs_literal));
     } break;
-    case U16:
-      return Unimplemented("unhandled primitive type: U16.");
+    case U16: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<uint16>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
     case U32: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
@@ -618,8 +688,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
           evaluated_[compare],
           Compare<int8>(compare->shape(), opcode, lhs_literal, rhs_literal));
     } break;
-    case S16:
-      return Unimplemented("unhandled primitive type: S16.");
+    case S16: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<int16>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
     case S32: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
@@ -655,6 +728,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
                           Compare<complex64>(compare->shape(), opcode,
                                              lhs_literal, rhs_literal));
     } break;
+    case C128: {
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<complex128>(compare->shape(), opcode,
+                                              lhs_literal, rhs_literal));
+    } break;
     default:
       LOG(FATAL) << "HandleCompare: unknown primitive type: "
                  << PrimitiveType_Name(lhs->shape().element_type());
@@ -1036,11 +1114,9 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
 Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) {
   const Literal& operand = GetEvaluatedLiteralFor(broadcast->operand(0));
 
-  TF_RET_CHECK(broadcast->dimensions().size() ==
-               ShapeUtil::Rank(operand.shape()))
+  TF_RET_CHECK(broadcast->dimensions().size() == operand.shape().rank())
       << "broadcast dimensions is of size: " << broadcast->dimensions().size()
-      << " and rank of operand_to_broadcast is: "
-      << ShapeUtil::Rank(operand.shape());
+      << " and rank of operand_to_broadcast is: " << operand.shape().rank();
   // Checks that operand's dimensions are the same as the broadcast's
   // dimensions along the dimensions to be broadcasted.
   for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
@@ -1113,9 +1189,10 @@ Status HloEvaluator::HandleCall(HloInstruction* call) {
   }
 
   HloEvaluator embedded_evaluator;
-  Literal result =
-      embedded_evaluator.Evaluate<const Literal*>(*computation, arg_literals)
-          .ConsumeValueOrDie();
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
+  Literal result = embedded_evaluator.Evaluate(*computation, arg_literals)
+                       .ConsumeValueOrDie();
 
   evaluated_[call] = std::move(result);
   return Status::OK();
@@ -1131,7 +1208,9 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
       fusion->fused_instructions_computation()->Clone(
           /*suffix=*/"clone_with_layout", &context);
   for (auto* instruction : cloned_fused_computation->instructions()) {
-    LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+    if (!LayoutUtil::HasLayout(instruction->shape())) {
+      LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+    }
   }
   auto readded_computation =
       empty_hlo_module.AddEntryComputation(std::move(cloned_fused_computation));
@@ -1145,9 +1224,10 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
   }
 
   HloEvaluator embedded_evaluator;
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   Literal result =
-      embedded_evaluator
-          .Evaluate<const Literal*>(*readded_computation, arg_literals)
+      embedded_evaluator.Evaluate(*readded_computation, arg_literals)
           .ConsumeValueOrDie();
 
   evaluated_[fusion] = std::move(result);
@@ -1165,16 +1245,16 @@ Status HloEvaluator::HandleConditional(HloInstruction* conditional) {
   auto* false_computation = conditional->false_computation();
 
   HloEvaluator embedded_evaluator;
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   Literal result;
   if (pred.Get<bool>({})) {
-    result = embedded_evaluator
-                 .Evaluate<const Literal*>(*true_computation,
-                                           {&true_computation_arg})
-                 .ConsumeValueOrDie();
+    result =
+        embedded_evaluator.Evaluate(*true_computation, {&true_computation_arg})
+            .ConsumeValueOrDie();
   } else {
     result = embedded_evaluator
-                 .Evaluate<const Literal*>(*false_computation,
-                                           {&false_computation_arg})
+                 .Evaluate(*false_computation, {&false_computation_arg})
                  .ConsumeValueOrDie();
   }
 
@@ -1221,18 +1301,21 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
   bool keep_going = true;
   int64 iteration_count = 0;
   HloEvaluator cond_evaluator(max_loop_iterations_);
+  cond_evaluator.set_dynamic_dimension_inference(dynamic_dimension_inference_);
   HloEvaluator loop_body_evaluator(max_loop_iterations_);
+  loop_body_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   while (keep_going) {
     if (max_loop_iterations_ >= 0 && iteration_count++ > max_loop_iterations_) {
       return InvalidArgument("Loop %s exceeded loop iteration limit (%d).",
                              while_hlo->name(), max_loop_iterations_);
     }
     TF_ASSIGN_OR_RETURN(auto cond_val,
-                        cond_evaluator.Evaluate<Literal*>(*cond_comp, {&lcv}));
+                        cond_evaluator.Evaluate(*cond_comp, {&lcv}));
     keep_going = cond_val.GetFirstElement<bool>();
     if (keep_going) {
-      TF_ASSIGN_OR_RETURN(auto body_val, loop_body_evaluator.Evaluate<Literal*>(
-                                             *body_comp, {&lcv}));
+      TF_ASSIGN_OR_RETURN(auto body_val,
+                          loop_body_evaluator.Evaluate(*body_comp, {&lcv}));
       VLOG(3) << "Loop iteration result: " << body_val.ToString();
       lcv = std::move(body_val);
       cond_evaluator.ResetVisitStates();
@@ -1243,173 +1326,172 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
   return Status::OK();
 }
 
-// Key-value sort is a special snowflake: it's templated on two different
-// element types, one for the keys, and one for the values. Jump through some
-// hoops to make this work.
 namespace {
-template <typename KeyType, typename ValueType>
-StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
-                                       const Literal& keys_literal,
-                                       const Literal& values_literal) {
-  auto rank = ShapeUtil::Rank(keys_literal.shape());
-  TF_RET_CHECK(
-      ShapeUtil::SameDimensions(keys_literal.shape(), values_literal.shape()))
-      << "Sort keys and values must have the same dimensions";
-  TF_RET_CHECK(sort->operand_count() >= 2) << "Expected key-value sort";
-  // We need to sort an array of keys and an array of values, where the
-  // sorted order of the values is determined by the keys. The simplest(?)
-  // way to do this is to go to an array-of-pairs representation, sort the
-  // array using the keys, and then go back to pair-of-arrays.
-  VLOG(3) << "HandleSort keys_literal: " << keys_literal.ToString();
-  VLOG(3) << "HandleSort values_literal: " << values_literal.ToString();
-
-  if (rank == 0) {
-    // Nothing to sort.
-    return LiteralUtil::MakeTuple({&keys_literal, &values_literal});
-  }
-
-  Literal keys_result_literal(keys_literal.shape());
-  Literal values_result_literal(values_literal.shape());
-  std::vector<int64> zero_base(rank, 0);
-  std::vector<int64> increment(rank, 1);
-  int64 sort_dim = sort->dimensions(0);
-  int64 sort_dim_elements = keys_literal.shape().dimensions(sort_dim);
-  increment[sort_dim] = sort_dim_elements;
-  // Iterate through each dimension except 'sort_dim'.
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-      keys_literal.shape(), zero_base,
-      AsInt64Slice(keys_literal.shape().dimensions()), increment,
-      [&](absl::Span<const int64> indices) -> StatusOr<bool> {
-        // Extract a slice from the keys and values literals that correspond to
-        // exactly the row in dimension 'sort_dim'.
-        std::vector<int64> limit_indices(indices.begin(), indices.end());
-        std::for_each(limit_indices.begin(), limit_indices.end(),
-                      [](int64& index) { ++index; });
-        limit_indices[sort_dim] = sort_dim_elements;
-        TF_ASSIGN_OR_RETURN(auto keys_to_sort,
-                            keys_literal.Slice(indices, limit_indices)
-                                .Reshape({sort_dim_elements}));
-        const auto& keys_data = keys_to_sort.data<KeyType>();
-        TF_ASSIGN_OR_RETURN(auto values_to_sort,
-                            values_literal.Slice(indices, limit_indices)
-                                .Reshape({sort_dim_elements}));
-        const auto& values_data = values_to_sort.data<ValueType>();
-        using kv_pair = std::pair<KeyType, ValueType>;
-        std::vector<kv_pair> key_value_vector;
-        key_value_vector.reserve(keys_data.size());
-        for (int i = 0; i < keys_data.size(); ++i) {
-          key_value_vector.push_back(
-              std::make_pair(keys_data[i], values_data[i]));
-        }
-        std::stable_sort(key_value_vector.begin(), key_value_vector.end(),
-                         [](const kv_pair& a, const kv_pair& b) {
-                           return SafeLess<KeyType>(a.first, b.first);
-                         });
-        std::vector<KeyType> result_keys;
-        // We use a InlinedVector here because we need to convert it to an
-        // absl::Span later, and this would not work with std::vector<bool>.
-        absl::InlinedVector<ValueType, 10> result_values;
-        for (const auto& key_value : key_value_vector) {
-          result_keys.push_back(key_value.first);
-          result_values.push_back(key_value.second);
-        }
-        Literal sorted_keys(ShapeUtil::MakeShape(
-            keys_literal.shape().element_type(), {sort_dim_elements}));
-        sorted_keys.PopulateR1(absl::Span<const KeyType>(result_keys));
-        Literal sorted_values(ShapeUtil::MakeShape(
-            values_literal.shape().element_type(), {sort_dim_elements}));
-        sorted_values.PopulateR1(absl::Span<const ValueType>(result_values));
-        std::vector<int64> slice_dimensions(rank, 1);
-        slice_dimensions[sort_dim] = sort_dim_elements;
-        std::vector<int64> start_indices(rank, 0);
-        TF_ASSIGN_OR_RETURN(auto sorted_keys_reshaped,
-                            sorted_keys.Reshape(slice_dimensions));
-        TF_RETURN_IF_ERROR(keys_result_literal.CopySliceFrom(
-            sorted_keys_reshaped, start_indices, indices, slice_dimensions));
-        TF_ASSIGN_OR_RETURN(auto sorted_values_reshaped,
-                            sorted_values.Reshape(slice_dimensions));
-        TF_RETURN_IF_ERROR(values_result_literal.CopySliceFrom(
-            sorted_values_reshaped, start_indices, indices, slice_dimensions));
-        return true;
-      }));
-
-  Literal result_tuple;
-  result_tuple =
-      LiteralUtil::MakeTuple({&keys_result_literal, &values_result_literal});
-  VLOG(3) << "HandleSort result_tuple: " << result_tuple.ToString();
-  return std::move(result_tuple);
-}
-
-template <typename KeyType>
-StatusOr<Literal> EvaluateSortCurried(HloInstruction* sort,
-                                      const Literal& keys_literal,
-                                      const Literal& values_literal) {
-  switch (values_literal.shape().element_type()) {
-    case PRED:
-      return EvaluateSortInternal<KeyType, bool>(sort, keys_literal,
-                                                 values_literal);
-    case F32:
-      return EvaluateSortInternal<KeyType, float>(sort, keys_literal,
-                                                  values_literal);
-    case U32:
-      return EvaluateSortInternal<KeyType, uint32>(sort, keys_literal,
-                                                   values_literal);
-    case S32:
-      return EvaluateSortInternal<KeyType, int32>(sort, keys_literal,
-                                                  values_literal);
-    case BF16:
-      return EvaluateSortInternal<KeyType, bfloat16>(sort, keys_literal,
-                                                     values_literal);
-    default:
-      return InvalidArgument("Unsupported type for Sort");
-  }
-}
-
-StatusOr<Literal> EvaluateSort(HloInstruction* sort,
-                               const Literal& keys_literal,
-                               const Literal& values_literal) {
-  switch (sort->operand(0)->shape().element_type()) {
-    case F32:
-      return EvaluateSortCurried<float>(sort, keys_literal, values_literal);
-    case U32:
-      return EvaluateSortCurried<uint32>(sort, keys_literal, values_literal);
-    case S32:
-      return EvaluateSortCurried<int32>(sort, keys_literal, values_literal);
-    case BF16:
-      return EvaluateSortCurried<bfloat16>(sort, keys_literal, values_literal);
+StatusOr<Literal> ExtractFromIndexPositions(const Literal& from,
+                                            absl::Span<int64 const> indices) {
+  PrimitiveType type = from.shape().element_type();
+  switch (type) {
+    case PRED: {
+      // We use a InlinedVector here because we need to convert it to an
+      // absl::Span later, and this would not work with std::vector<bool>.
+      absl::InlinedVector<bool, 10> values;
+      for (int64 index : indices) {
+        values.push_back(from.Get<bool>({index}));
+      }
+      return LiteralUtil::CreateR1<bool>(values);
+    }
+    case F32: {
+      std::vector<float> values;
+      for (int64 index : indices) {
+        values.push_back(from.Get<float>({index}));
+      }
+      return LiteralUtil::CreateR1<float>(values);
+    }
+    case U32: {
+      std::vector<uint32> values;
+      for (int64 index : indices) {
+        values.push_back(from.Get<uint32>({index}));
+      }
+      return LiteralUtil::CreateR1<uint32>(values);
+    }
+    case S32: {
+      std::vector<int32> values;
+      for (int64 index : indices) {
+        values.push_back(from.Get<int32>({index}));
+      }
+      return LiteralUtil::CreateR1<int32>(values);
+    }
+    case BF16: {
+      std::vector<bfloat16> values;
+      for (int64 index : indices) {
+        values.push_back(from.Get<bfloat16>({index}));
+      }
+      return LiteralUtil::CreateR1<bfloat16>(values);
+    }
     default:
-      return InvalidArgument("Unsupported type for Sort");
+      return InvalidArgument("Unsupported type for Sort: %s",
+                             PrimitiveType_Name(type));
   }
 }
 }  // namespace
 
 Status HloEvaluator::HandleSort(HloInstruction* sort) {
-  if (!ShapeUtil::IsTuple(sort->shape())) {
+  if (!sort->shape().IsTuple()) {
     return DefaultAction(sort);
   } else {
-    // This is a really stupid work-around for the fact it's hard to support a
-    // multi-value sort directly, due to the fact we need to template the
-    // evaluation function on all of the value types.
-    std::vector<Literal> sort_results_backing;
-    for (int64 i = 0; i < sort->operand_count(); ++i) {
-      auto result = EvaluateSort(sort, GetEvaluatedLiteralFor(sort->operand(0)),
-                                 GetEvaluatedLiteralFor(sort->operand(i)));
-      if (!result.ok()) {
-        return result.status();
+    TF_RET_CHECK(sort->operand_count() >= 2) << "Expected key-value sort";
+    for (int64 i = 1; i < sort->operand_count(); ++i) {
+      TF_RET_CHECK(ShapeUtil::SameDimensions(sort->operand(0)->shape(),
+                                             sort->operand(i)->shape()))
+          << "All Sort operands must have the same dimensions";
+    }
+
+    if (VLOG_IS_ON(3)) {
+      for (int64 i = 0; i < sort->operand_count(); ++i) {
+        VLOG(3) << "HandleSort operand " << i << " literal: "
+                << GetEvaluatedLiteralFor(sort->operand(i)).ToString();
       }
-      sort_results_backing.push_back(
-          std::move(result.ValueOrDie().DecomposeTuple()[1]));
     }
-    std::vector<const Literal*> sort_results;
-    absl::c_transform(sort_results_backing, std::back_inserter(sort_results),
+    Shape key_shape = sort->operand(0)->shape();
+    auto rank = key_shape.rank();
+    PrimitiveType keys_type = key_shape.element_type();
+    if (keys_type != F32 && keys_type != U32 && keys_type != S32 &&
+        keys_type != BF16) {
+      return InvalidArgument("Unsupported type for Sort: %s",
+                             PrimitiveType_Name(keys_type));
+    }
+    std::vector<Literal> result_literals;
+    result_literals.reserve(sort->operand_count());
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      result_literals.emplace_back(sort->operand(i)->shape());
+    }
+    std::vector<int64> zero_base(rank, 0);
+    std::vector<int64> increment(rank, 1);
+    int64 sort_dim = sort->dimensions(0);
+    int64 sort_dim_elements = key_shape.dimensions(sort_dim);
+    increment[sort_dim] = sort_dim_elements;
+    // Iterate through each dimension except 'sort_dim'.
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
+        key_shape, zero_base, AsInt64Slice(key_shape.dimensions()), increment,
+        [&](absl::Span<const int64> indices) -> StatusOr<bool> {
+          // Extract a slice from each operand literal that corresponds to
+          // exactly the row in dimension 'sort_dim'.
+          std::vector<int64> limit_indices(indices.begin(), indices.end());
+          absl::c_for_each(limit_indices, [](int64& index) { ++index; });
+          limit_indices[sort_dim] = sort_dim_elements;
+          std::vector<Literal> literals_to_sort;
+          literals_to_sort.reserve(sort->operand_count());
+          for (int64 i = 0; i < sort->operand_count(); ++i) {
+            TF_ASSIGN_OR_RETURN(auto literal_to_sort,
+                                GetEvaluatedLiteralFor(sort->operand(i))
+                                    .Slice(indices, limit_indices)
+                                    .Reshape({sort_dim_elements}));
+            literals_to_sort.push_back(std::move(literal_to_sort));
+          }
+          std::vector<int64> indices_to_sort(sort_dim_elements);
+          std::iota(indices_to_sort.begin(), indices_to_sort.end(), 0);
+          std::stable_sort(
+              indices_to_sort.begin(), indices_to_sort.end(),
+              [keys_type, &literals_to_sort](int64 a, int64 b) {
+                switch (keys_type) {
+                  case F32: {
+                    auto key_lhs = literals_to_sort[0].Get<float>({a});
+                    auto key_rhs = literals_to_sort[0].Get<float>({b});
+                    return SafeLess(key_lhs, key_rhs);
+                  }
+                  case U32: {
+                    auto key_lhs = literals_to_sort[0].Get<uint32>({a});
+                    auto key_rhs = literals_to_sort[0].Get<uint32>({b});
+                    return SafeLess(key_lhs, key_rhs);
+                  }
+                  case S32: {
+                    auto key_lhs = literals_to_sort[0].Get<int32>({a});
+                    auto key_rhs = literals_to_sort[0].Get<int32>({b});
+                    return SafeLess(key_lhs, key_rhs);
+                  }
+                  case BF16: {
+                    auto key_lhs = literals_to_sort[0].Get<bfloat16>({a});
+                    auto key_rhs = literals_to_sort[0].Get<bfloat16>({b});
+                    return SafeLess(key_lhs, key_rhs);
+                  }
+                  default:
+                    // We should never reach here, because we checked earlier
+                    // that 'key_type' is one of the cases above.
+                    LOG(FATAL) << "Invalid key type in Sort: %s",
+                        PrimitiveType_Name(keys_type);
+                    return false;
+                }
+              });
+          std::vector<int64> slice_dimensions(rank, 1);
+          slice_dimensions[sort_dim] = sort_dim_elements;
+          std::vector<int64> start_indices(rank, 0);
+          for (int64 i = 0; i < sort->operand_count(); ++i) {
+            TF_ASSIGN_OR_RETURN(Literal sorted_literal,
+                                ExtractFromIndexPositions(literals_to_sort[i],
+                                                          indices_to_sort));
+            TF_ASSIGN_OR_RETURN(auto sorted_literal_reshaped,
+                                sorted_literal.Reshape(slice_dimensions));
+            TF_RETURN_IF_ERROR(result_literals[i].CopySliceFrom(
+                sorted_literal_reshaped, start_indices, indices,
+                slice_dimensions));
+          }
+          return true;
+        }));
+
+    std::vector<const Literal*> literal_ptrs;
+    absl::c_transform(result_literals, std::back_inserter(literal_ptrs),
                       [](const Literal& literal) { return &literal; });
-    evaluated_[sort] = LiteralUtil::MakeTuple(sort_results);
+
+    Literal result_tuple = LiteralUtil::MakeTuple(literal_ptrs);
+    VLOG(3) << "HandleSort result_tuple: " << result_tuple.ToString();
+
+    evaluated_[sort] = std::move(result_tuple);
     return Status::OK();
   }
 }
 
 Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
-  if (!ShapeUtil::IsTuple(reduce->shape())) {
+  if (!reduce->shape().IsTuple()) {
     return DefaultAction(reduce);
   } else {
     auto first_element_type = reduce->shape().tuple_shapes(0).element_type();
@@ -1424,6 +1506,27 @@ Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
   }
 }
 
+Status HloEvaluator::HandleCustomCall(HloInstruction* custom_call) {
+  if (!custom_call_handler_) {
+    // No handler is registered; this means custom-calls are not allowed.
+    return DefaultAction(custom_call);
+  }
+
+  // Evaluate input operands so the handler has access to the operand data.
+  std::vector<const Literal*> operands;
+  operands.reserve(custom_call->operand_count());
+  for (const HloInstruction* operand : custom_call->operands()) {
+    operands.push_back(&GetEvaluatedLiteralFor(operand));
+  }
+
+  // Synchronously issue the handler to populate the instruction output literal.
+  TF_ASSIGN_OR_RETURN(
+      auto output, custom_call_handler_(custom_call, absl::MakeSpan(operands)));
+
+  evaluated_[custom_call] = std::move(output);
+  return Status::OK();
+}
+
 Status HloEvaluator::Preprocess(HloInstruction* hlo) {
   VLOG(2) << "About to visit HLO: " << hlo->ToString();
   return ShapeUtil::ValidateShape(hlo->shape());
@@ -1441,18 +1544,6 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
-// Explicit instantiation of templatized Evaluate* methods.
-//
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    const HloModule& module, absl::Span<const Literal* const> arg_literals);
-
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    const HloComputation& computation,
-    absl::Span<const Literal* const> arg_literals);
-
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    HloInstruction* instruction, absl::Span<const Literal* const> arg_literals);
-
 namespace {
 template <typename T>
 std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index d363a51c63de6fd4246c4970f580b68f4a627df8..72ea40bcd797def3bc0765986881792b8752d9e1 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -16,13 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 
+#include <functional>
 #include <memory>
 
 #include "absl/container/node_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -43,16 +46,24 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // specified.
   explicit HloEvaluator(int64 max_loop_iterations = -1);
 
-  // Evaluates an HLO module and an array of pointers to literals.
-  // Returns the evaluated result as a literal if successful.
+  // Evaluates an HLO module and an array of pointers to literals.  Returns the
+  // evaluated result as a literal if successful.
+  //
   // Precondition: The indices of arg_literals correspond to the parameter
   // numbers of the HLO parameters in the computation. See comment below for an
   // example.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
+  //
+  // (Dummy template arg is to reduce the overloading priority of one overload
+  // so that Evaluate(module, {}) resolves unambiguously.)
+  StatusOr<Literal> Evaluate(const HloModule& module,
+                             absl::Span<const Literal* const> arg_literals) {
+    return Evaluate(*module.entry_computation(), arg_literals);
+  }
+  template <typename Dummy = void>
   StatusOr<Literal> Evaluate(const HloModule& module,
-                             absl::Span<const LiteralPtr> arg_literals);
+                             absl::Span<const Literal> arg_literals) {
+    return Evaluate(*module.entry_computation(), arg_literals);
+  }
 
   // Evaluates an HLO computation and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
@@ -70,29 +81,24 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
   // 1 in this computation. The input literals array will then have its first
   // literal map to Parameter0 and the second map to Parameter1.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
+  //
+  // (Dummy template arg is to reduce the overloading priority of one overload
+  // so that Evaluate(module, {}) resolves unambiguously.)
+  StatusOr<Literal> Evaluate(const HloComputation& computation,
+                             absl::Span<const Literal* const> arg_literals);
+  template <typename Dummy = void>
   StatusOr<Literal> Evaluate(const HloComputation& computation,
-                             absl::Span<const LiteralPtr> arg_literals);
-
-  // Evaluates a single HLO instruction and an array of pointers to literals.
-  // Return the evaluated result as literal if successful.
-  // Precondition:
-  // 1. argument literals correspond to the input instruction's parameters in
-  // their post-ordering.
-  // 2. the instruction's operands must be of either Parameter or Constant type.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
-  StatusOr<Literal> Evaluate(HloInstruction* instruction,
-                             absl::Span<const LiteralPtr> arg_literals);
-
-  // Evaluates a single HLO instruction with constant operands.
-  // Returns the evaluated result as literal if successful.
-  // Precondition:
-  // 1. all operands of the input instruction are constants.
-  // 2. the instruction is not a Parameter operation.
+                             absl::Span<const Literal> arg_literals) {
+    std::vector<const Literal*> arg_literal_ptrs;
+    for (const auto& l : arg_literals) {
+      arg_literal_ptrs.push_back(&l);
+    }
+    return Evaluate(computation, arg_literal_ptrs);
+  }
+
+  // Gets the value of running a single HLO instruction.
+  //
+  // All of the operands to this instruction must be constants.
   StatusOr<Literal> Evaluate(HloInstruction* instruction);
 
   // Same as Evaluate, except returning false on error and accepts an output
@@ -120,9 +126,31 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
                                   const PrecisionConfig& precision_config,
                                   const Literal& lhs, const Literal& rhs);
 
+  void set_dynamic_dimension_inference(
+      DynamicDimensionInference* dynamic_dimension_inference) {
+    dynamic_dimension_inference_ = dynamic_dimension_inference;
+  }
+
   // Enable the fast path for certain operations like dot or convolution.
   void set_use_fast_path(bool value) { use_fast_path_ = value; }
 
+  // Handles evaluation of a custom-call op.
+  // Operand literals are provided in |operands| and implementations must
+  // populate |output| before returning.
+  using CustomCallHandler = std::function<StatusOr<Literal>(
+      HloInstruction* custom_call, absl::Span<const Literal*> operands)>;
+
+  // Sets a handler that is called during evaluation for custom-call ops.
+  // If no handler is defined the default error behavior will occur. The handler
+  // will be provided evaluated literals for all operands and is expected to
+  // return an output literal of the appropriate shape.
+  void set_custom_call_handler(
+      std::function<StatusOr<Literal>(HloInstruction* custom_call,
+                                      absl::Span<const Literal*> operands)>
+          handler) {
+    custom_call_handler_ = std::move(handler);
+  }
+
   // Returns the result of a matrix multiply `lhs x rhs`.
   static std::unique_ptr<Array2D<Eigen::half>> MatmulArray2D(
       const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs);
@@ -158,6 +186,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   //
   Status HandleBitcast(HloInstruction* bitcast) override;
 
+  Status HandleGetDimensionSize(HloInstruction* get_dimension_size) override;
+
   Status HandleParameter(HloInstruction* parameter) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -204,16 +234,51 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleImag(HloInstruction* imag) override;
 
+  Status HandleComplex(HloInstruction* complex) override;
+
   Status HandleReduce(HloInstruction* reduce) override;
 
+  Status HandleCustomCall(HloInstruction* custom_call) override;
+
+  // Unsupported HLOs, note some of them (such as BatchNorm*) are typically
+  // expanded in a semantic-preserving way into other HLOs by adding exanpsion
+  // HLO pass to the HLO optimization pass during compilation, which can then be
+  // handled by the evaluator.
+  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override {
+    return Unimplemented("BatchNormGrad HLO is unsupported by the evaluator.");
+  };
+  Status HandleBatchNormInference(
+      HloInstruction* batch_norm_inference) override {
+    return Unimplemented(
+        "BatchNormInference HLO is unsupported by the evaluator.");
+  };
+  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override {
+    return Unimplemented(
+        "BatchNormTraining HLO is unsupported by the evaluator.");
+  };
+  Status HandleInfeed(HloInstruction* infeed) override {
+    return Unimplemented("Infeed HLO is unsupported by the evaluator.");
+  };
+  Status HandleOutfeed(HloInstruction* outfeed) override {
+    return Unimplemented("Outfeed HLO is unsupported by the evaluator.");
+  };
+
   // Returns the already-evaluated literal result for the instruction.
+  //
   // A Constant instruction is considered evaluated and its literal will be
   // returned directly without looking up the cache.
+  //
+  // Similarly, a Parameter instruction is considered evaluated and its literal
+  // is looked up in arg_literals.
+  //
   // Crash with log if the given instruction has not been evaluated previously.
   const Literal& GetEvaluatedLiteralFor(const HloInstruction* hlo) {
     if (hlo->IsConstant()) {
       return hlo->literal();
     }
+    if (hlo->opcode() == HloOpcode::kParameter) {
+      return *arg_literals_.at(hlo->parameter_number());
+    }
     auto it = evaluated_.find(hlo);
     CHECK(it != evaluated_.end())
         << "could not find evaluated value for: " << hlo->ToString();
@@ -221,12 +286,18 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   }
 
   // Tracks the HLO instruction and its evaluated literal result.
+  //
+  // Parameters and constants aren't stored here, see implementation of
+  // GetEvaluatedLiteralFor.
+  //
   // TODO(b/35950897): have better memory management here to free instructions
   // that are no longer a parent for any other subsequent instruction in
   // post-orderring.
+  //
   // Must be cleared for each evaluation.
-  // Storing Literal in place require the container to have pointer stability so
-  // we cannot use flat_hash_map any more.
+  //
+  // Storing Literal in place requires the container to have pointer stability
+  // so we cannot use flat_hash_map any more.
   absl::node_hash_map<const HloInstruction*, Literal> evaluated_;
 
   // Use fast path that uses eigen in the evaluator.
@@ -262,6 +333,20 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Max loop iterations to execute with no maximum if negative.
   int64 max_loop_iterations_;
 
+  // Module-level seed handle.
+  uint64 seed_;
+  // RNG engine.
+  std::minstd_rand0 engine_;
+
+  // DynamicDimensionInference is used to evaluate GetDimensionSize, which
+  // returns the dynamic dimension size of its operand.
+  DynamicDimensionInference* dynamic_dimension_inference_;
+
+  // Optional handler for custom_call ops.
+  std::function<StatusOr<Literal>(HloInstruction* custom_call,
+                                  absl::Span<const Literal*> operands)>
+      custom_call_handler_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 8fa493a8732662d5357a68937bfad7ac2b3b8c5d..9bc71c9d6c5e3ed5a3de2d6320762bde6005d3c0 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -51,20 +51,18 @@ namespace {
 
 static std::array<bool, 2> use_bf16_params{true, false};
 
-class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
-                         public HloTestBase {
- protected:
-  HloEvaluatorTest() : HloTestBase(), use_bfloat16_(GetParam()) {
-    evaluator_ = absl::make_unique<HloEvaluator>();
-  }
+// Test fixture for the HloEvaluator.
+//
+// In bf16 mode, all f32 shapes are converted to bf16 before running.
+class HloEvaluatorTest : public HloTestBase {
+ public:
+  HloEvaluatorTest() : use_bfloat16_(false) {}
 
   Literal Evaluate(absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
-      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
-      auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(m_.get()).ValueOrDie();
+      HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*m_->entry_computation(), arg_literals)
+    return evaluator_.Evaluate(*m_->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
@@ -74,16 +72,12 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
   Literal EvaluateWithModule(
       HloModule* module, absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
-      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
-      auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(module).ValueOrDie();
+      HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*module->entry_computation(), arg_literals)
+    return evaluator_.Evaluate(*module->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<HloEvaluator> evaluator_;
-
   void TestUnaryOp(HloOpcode opcode, Literal expected, Literal input,
                    float aabs = 0) {
     HloComputation::Builder b(TestName());
@@ -117,16 +111,27 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
   }
 
-  bool use_bfloat16_;
+ protected:
+  explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {}
+  HloEvaluator evaluator_;
+
+  const bool use_bfloat16_;
   std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
 };
 
-#define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
-  TEST_P(test_case_name, test_name)
+// Lets you write TEST_Ps that run twice, once with and once without bf16.
+class HloEvaluatorBf16Test : public ::testing::WithParamInterface<bool>,
+                             public HloEvaluatorTest {
+ protected:
+  HloEvaluatorBf16Test() : HloEvaluatorTest(/*use_bfloat16=*/GetParam()) {}
+};
+
+INSTANTIATE_TEST_SUITE_P(HloEvaluatorTest_Instantiation, HloEvaluatorBf16Test,
+                         ::testing::ValuesIn(use_bf16_params));
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST_P(HloEvaluatorTest, DoesClamp) {
+TEST_P(HloEvaluatorBf16Test, DoesClamp) {
   auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
   auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
   auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
@@ -147,7 +152,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
+TEST_P(HloEvaluatorBf16Test, DISABLED_DoesClampSpecialBroadcast) {
   auto low = LiteralUtil::CreateR0<float>(0.f);
   auto value = LiteralUtil::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
   auto high = LiteralUtil::CreateR0<float>(1.f);
@@ -170,7 +175,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST_P(HloEvaluatorTest, DoesSelect) {
+TEST_P(HloEvaluatorBf16Test, DoesSelect) {
   auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -195,7 +200,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST_P(HloEvaluatorTest, DoesAdd) {
+TEST_F(HloEvaluatorTest, DoesAdd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
@@ -204,7 +209,7 @@ TEST_P(HloEvaluatorTest, DoesAdd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise and with 2 operands.
-TEST_P(HloEvaluatorTest, DoesAnd) {
+TEST_P(HloEvaluatorBf16Test, DoesAnd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {4, 4}});
@@ -213,7 +218,7 @@ TEST_P(HloEvaluatorTest, DoesAnd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_P(HloEvaluatorTest, DoesOr) {
+TEST_F(HloEvaluatorTest, DoesOr) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-100, 4}});
@@ -222,7 +227,7 @@ TEST_P(HloEvaluatorTest, DoesOr) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_P(HloEvaluatorTest, DoesXor) {
+TEST_F(HloEvaluatorTest, DoesXor) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-104, 0}});
@@ -231,7 +236,7 @@ TEST_P(HloEvaluatorTest, DoesXor) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise multiply with 2 operands.
-TEST_P(HloEvaluatorTest, DoesMultiply) {
+TEST_F(HloEvaluatorTest, DoesMultiply) {
   auto lhs = LiteralUtil::CreateR2<int32>({{-1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int32>(
       {{std::numeric_limits<int32>::min(), 4}, {4, 4}});
@@ -242,14 +247,14 @@ TEST_P(HloEvaluatorTest, DoesMultiply) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST_P(HloEvaluatorTest, DoesDivideInt64) {
+TEST_F(HloEvaluatorTest, DoesDivideInt64) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
   TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
                std::move(rhs));
 }
-TEST_P(HloEvaluatorTest, DoesDivideDouble) {
+TEST_P(HloEvaluatorBf16Test, DoesDivideDouble) {
   auto lhs = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
   auto rhs = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
   auto expected =
@@ -260,41 +265,41 @@ TEST_P(HloEvaluatorTest, DoesDivideDouble) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST_P(HloEvaluatorTest, DoesAbsR2) {
+TEST_F(HloEvaluatorTest, DoesAbsR2) {
   auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesAbsR0) {
+TEST_P(HloEvaluatorBf16Test, DoesAbsR0) {
   auto operand = LiteralUtil::CreateR0<float>(-1.0f);
   auto expected = LiteralUtil::CreateR0<float>(1.0f);
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
+TEST_P(HloEvaluatorBf16Test, DoesAbsR1WithZeroSize) {
   auto operand = LiteralUtil::CreateR1<float>({});
   auto expected = LiteralUtil::CreateR1<float>({});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesNegateR2) {
+TEST_F(HloEvaluatorTest, DoesNegateR2) {
   auto operand = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
   auto expected = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int>::min()}, {1, -4}});
   TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesCosR2) {
+TEST_P(HloEvaluatorBf16Test, DoesCosR2) {
   auto operand = LiteralUtil::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = LiteralUtil::CreateR2<float>({{1, -1}, {-1, 1}});
   TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand),
               use_bfloat16_ ? 0.031250 : 9.5367431640625E-7);
 }
-TEST_P(HloEvaluatorTest, DoesSinR2) {
+TEST_P(HloEvaluatorBf16Test, DoesSinR2) {
   auto operand = LiteralUtil::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = LiteralUtil::CreateR2<float>({{0, 0}, {0, 0}});
   TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand),
               use_bfloat16_ ? 0.031250 : 9.5367431640625E-7);
 }
-TEST_P(HloEvaluatorTest, DoesNotR2) {
+TEST_F(HloEvaluatorTest, DoesNotR2) {
   auto operand =
       LiteralUtil::CreateR2<int32>({{0, std::numeric_limits<int>::min()},
                                     {-1, std::numeric_limits<int>::max()}});
@@ -303,9 +308,22 @@ TEST_P(HloEvaluatorTest, DoesNotR2) {
                                     {0, std::numeric_limits<int>::min()}});
   TestUnaryOp(HloOpcode::kNot, std::move(expected), std::move(operand));
 }
+
+TEST_F(HloEvaluatorTest, DoesRealC128) {
+  auto x = LiteralUtil::CreateR1<complex128>({{1, 0}, {-100, 4}});
+  auto expected_real = LiteralUtil::CreateR1<double>({1, -100});
+  TestUnaryOp(HloOpcode::kReal, std::move(expected_real), std::move(x));
+}
+
+TEST_F(HloEvaluatorTest, DoesImagC128) {
+  auto x = LiteralUtil::CreateR1<complex128>({{1, 0}, {-100, 4}});
+  auto expected_imag = LiteralUtil::CreateR1<double>({0, 4});
+  TestUnaryOp(HloOpcode::kImag, std::move(expected_imag), std::move(x));
+}
+
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
-TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
+TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
@@ -335,7 +353,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
 }
 
 // Verifies Reshape operation is correctly evaluated.
-TEST_P(HloEvaluatorTest, DoesReshape) {
+TEST_F(HloEvaluatorTest, DoesReshape) {
   HloComputation::Builder b(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
@@ -361,7 +379,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) {
 }
 
 // Verifies Broadcast operation is correctly evaluated.
-TEST_P(HloEvaluatorTest, DoesBroadcast) {
+TEST_F(HloEvaluatorTest, DoesBroadcast) {
   HloComputation::Builder b(TestName());
   auto input_literal = LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
   auto output_literal = LiteralUtil::CreateR3<int32>(
@@ -377,7 +395,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
 
-TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
+TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
   HloComputation::Builder b(TestName());
   auto input_literal = LiteralUtil::CreateR0<int32>(111);
   auto output_literal = LiteralUtil::CreateR2<int32>(
@@ -396,7 +414,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
 
-TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
+TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(HloInstruction::CreateConstant(
@@ -418,7 +436,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
+TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(
@@ -439,7 +457,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
+TEST_P(HloEvaluatorBf16Test, ConvertWithSameLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
@@ -458,7 +476,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
 
-TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
+TEST_P(HloEvaluatorBf16Test, ConvertWithDifferentLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = LiteralUtil::CreateR2WithLayout<int32>(
@@ -491,7 +509,7 @@ PaddingConfig CreatePaddingConfig(
   return padding_config;
 }
 
-TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
+TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   auto operand = LiteralUtil::CreateR2<int32>({{}, {}});
   HloComputation::Builder b(TestName());
   auto operand_instruction =
@@ -516,7 +534,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(HloEvaluatorBf16Test, Pad4DFloatArrayWithInteriorPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -551,7 +569,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, NegativePadding2D) {
+TEST_P(HloEvaluatorBf16Test, NegativePadding2D) {
   HloComputation::Builder b(TestName());
 
   // input_array:
@@ -593,7 +611,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
   EXPECT_TRUE(LiteralTestUtil::Near(expected, result, ErrorSpec(0.031250)));
 }
 
-TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
+TEST_P(HloEvaluatorBf16Test, NegativeAndInteriorPadding2D) {
   HloComputation::Builder b(TestName());
 
   // f32[4,3] {
@@ -632,7 +650,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
+TEST_P(HloEvaluatorBf16Test, DotRank2AndRank1) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -678,7 +696,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
+TEST_P(HloEvaluatorBf16Test, DotRank1AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -716,7 +734,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
+TEST_P(HloEvaluatorBf16Test, DotRank2AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -766,7 +784,51 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SimpleConv1D) {
+TEST_P(HloEvaluatorBf16Test, DotRank4AndRank4) {
+  HloComputation::Builder b(TestName());
+
+  auto lhs_array = absl::make_unique<Array4D<float>>(2, 2, 3, 1);
+  lhs_array->FillIota(1.0f);
+  auto lhs_literal = LiteralUtil::CreateR4FromArray4D<float>(*lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  auto rhs_array = absl::make_unique<Array4D<float>>(2, 2, 3, 1);
+  rhs_array->FillIota(2.0f);
+  auto rhs_literal = LiteralUtil::CreateR4FromArray4D<float>(*rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 1, 1});
+  DotDimensionNumbers dot_dnums;
+
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums,
+                                             DefaultPrecisionConfig(2)));
+  m_->AddEntryComputation(b.Build());
+
+  Literal result = Evaluate();
+  float expected_1 = 0;
+  for (float i = 1.0f; i < 7.0f; ++i) {
+    expected_1 += i * i + i;
+  }
+  float expected_2 = 0;
+  for (float i = 7.0f; i < 13.0f; ++i) {
+    expected_2 += i * i + i;
+  }
+  auto expected_array = Array3D<float>({{{expected_1}}, {{expected_2}}});
+  auto expected = LiteralUtil::CreateR3FromArray3D<float>(expected_array);
+
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
+TEST_P(HloEvaluatorBf16Test, SimpleConv1D) {
   HloComputation::Builder b(TestName());
 
   Array3D<float> lhs_array = {{{1, 2, 3}}};
@@ -815,7 +877,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
+TEST_P(HloEvaluatorBf16Test, Simple4x4Conv2DWith2x2Kernel) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -878,7 +940,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensionsReversed) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -959,7 +1021,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensions) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -1037,7 +1099,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
+TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1101,7 +1163,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
+TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithLowAndHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1166,7 +1228,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_P(HloEvaluatorBf16Test,
        DilatedWindowAndBaseConv2DWithDifferentLowAndHighPaddingAndStrides) {
   HloComputation::Builder b(TestName());
 
@@ -1239,7 +1301,7 @@ TEST_P(HloEvaluatorTest,
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGroupedConvolution) {
   HloComputation::Builder b(TestName());
   std::vector<int64> input_dims = {1, 2, 2, 4};
   std::vector<int64> filter_dims = {2, 2, 2, 8};
@@ -1375,7 +1437,7 @@ void BM_ReducePrecisely(int num_iters) {
 
 BENCHMARK(BM_ReducePrecisely);
 
-TEST_P(HloEvaluatorTest, ReduceAdd) {
+TEST_P(HloEvaluatorBf16Test, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1417,7 +1479,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowMax) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowMax) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1468,7 +1530,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxWindowDilation) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1520,7 +1582,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1577,7 +1639,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd6D) {
   HloComputation::Builder b(TestName());
 
   // arg: f32[4,4,4,4,4,4] full of ones. Using small dims to limit run-time.
@@ -1640,7 +1702,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result_literal, result));
 }
 
-TEST_P(HloEvaluatorTest, StridedSlice) {
+TEST_P(HloEvaluatorBf16Test, StridedSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1674,7 +1736,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DynamicSlice) {
+TEST_P(HloEvaluatorBf16Test, DynamicSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1690,12 +1752,14 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({0, 1})));
+  auto zero = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
-  b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
-                                                      start_indices, {2, 3}));
+  b.AddInstruction(
+      HloInstruction::CreateDynamicSlice(shape, operand, {zero, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1710,7 +1774,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
 
 // Verifies that the HloEvaluator's implementation goes along with existing
 // backends' behavior, although this is not required by the spec.
-TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
+TEST_P(HloEvaluatorBf16Test, DynamicSliceModSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1726,12 +1790,14 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2, 1})));
+  auto two = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
-  b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
-                                                      start_indices, {2, 3}));
+  b.AddInstruction(
+      HloInstruction::CreateDynamicSlice(shape, operand, {two, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1744,7 +1810,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
+TEST_P(HloEvaluatorBf16Test, DynamicSliceUpdate) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1760,15 +1826,17 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int64>({0, 1})));
+  auto zero = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   auto update = b.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<double>({{-2.0, -3.0}, {-6.0, -7.0}})));
 
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      shape, operand, update, start_indices));
+      shape, operand, update, {zero, one}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1781,7 +1849,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SetAndGetTuples) {
+TEST_P(HloEvaluatorBf16Test, SetAndGetTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1817,7 +1885,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
+TEST_P(HloEvaluatorBf16Test, SetAndGetNestedTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1856,7 +1924,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Reverse) {
+TEST_P(HloEvaluatorBf16Test, Reverse) {
   HloComputation::Builder b(TestName());
 
   // Input shape is float[4x3x2x1].
@@ -1909,7 +1977,7 @@ TEST_P(HloEvaluatorTest, Reverse) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
+TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutions) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1933,7 +2001,7 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
 // we're evaluating is a constant.
-TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
+TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutionsWithConstantOperand) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1956,7 +2024,7 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
       LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result.ValueOrDie()));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV1
 
@@ -1980,7 +2048,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV2
 
@@ -2004,7 +2072,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherMultipleBatchDims
 
@@ -2029,7 +2097,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherNd
 
@@ -2055,7 +2123,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_F(HloEvaluatorTest,
        EvaluateGather_TensorFlowGatherNdNonDefaultIndexVectorDim) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherNd
@@ -2082,7 +2150,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
+TEST_F(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
   const char* hlo_text = R"(
 HloModule DynamicSlice
 
@@ -2105,7 +2173,7 @@ ENTRY main {
                                      Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
+TEST_F(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
   const char* hlo_text = R"(
 HloModule BatchDynamicSlice
 
@@ -2129,7 +2197,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
+TEST_F(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV1
 
@@ -2151,7 +2219,7 @@ ENTRY main {
                                      Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
+TEST_F(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
   const string hlo_text = R"(
 HloModule GatherXd
 
@@ -2176,7 +2244,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV1
 
@@ -2207,7 +2275,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV2
 
@@ -2239,7 +2307,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2271,7 +2339,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2303,7 +2371,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_F32) {
+TEST_P(HloEvaluatorBf16Test, EvaluateScatter_TensorFlowScatter_F32) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2337,7 +2405,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates}), ErrorSpec{0.1, 0.01}));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2369,7 +2437,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterMultipleBatchDims
 
@@ -2402,7 +2470,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNd
 
@@ -2438,7 +2506,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_F(HloEvaluatorTest,
        EvaluateScatter_TensorFlowScatterNd_NonDefaultIndexVectorDim) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNdNonDefaultIndexVectorDim
@@ -2475,7 +2543,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
   const char* hlo_text = R"(
 HloModule DynamicUpdateSlice
 
@@ -2507,7 +2575,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
   const char* hlo_text = R"(
 HloModule BatchDynamicUpdateSlice
 
@@ -2539,7 +2607,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter_ZeroDimBounds
 
@@ -2568,7 +2636,7 @@ ENTRY main {
       operand, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
   const string hlo_text = R"(
 HloModule Scatter_NoUpdateWindowDims
 
@@ -2601,7 +2669,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter_NegativeIndices
 
@@ -2636,7 +2704,7 @@ ENTRY main {
                          {&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_OobIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_OobIndices) {
   const string hlo_text = R"(
 HloModule BatchDynamicUpdateSlice
 
@@ -2672,7 +2740,7 @@ ENTRY main {
                          {&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_OobUpdateWindow) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_OobUpdateWindow) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNd_OobUpdateWindow
 
@@ -2711,7 +2779,7 @@ ENTRY main {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise comparison with 2 bfloat16 operands.
-TEST_P(HloEvaluatorTest, DoesCompareBF16) {
+TEST_F(HloEvaluatorTest, DoesCompareBF16) {
   // lhs >= rhs
   auto lhs = LiteralUtil::CreateR2<bfloat16>(
       {{bfloat16(0.25), bfloat16(0.35), bfloat16(0.125)},
@@ -2725,7 +2793,7 @@ TEST_P(HloEvaluatorTest, DoesCompareBF16) {
                std::move(rhs));
 }
 
-TEST_P(HloEvaluatorTest, Bf16Reduction) {
+TEST_P(HloEvaluatorBf16Test, Bf16Reduction) {
   const string hlo_text = R"(
 HloModule Bf16Reduction
 
@@ -2749,7 +2817,7 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, Evaluate({&arg})));
 }
 
-TEST_P(HloEvaluatorTest, SliceWithDifferentLayout) {
+TEST_P(HloEvaluatorBf16Test, SliceWithDifferentLayout) {
   // Regression test for b/114735354.
   const string hlo_text = R"(
 HloModule SliceWithDifferentLayout
@@ -2768,7 +2836,7 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
-TEST_P(HloEvaluatorTest, Bitcast) {
+TEST_P(HloEvaluatorBf16Test, Bitcast) {
   // Regression test for b/114735354.
   constexpr absl::string_view hlo_text_base = R"(
 HloModule Bitcast
@@ -2795,8 +2863,261 @@ ENTRY main {
   }
 }
 
-INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
-                        ::testing::ValuesIn(use_bf16_params));
+// Check that s32 under/overflow doesn't trigger a ubsan failure.
+TEST_F(HloEvaluatorTest, Int32Overflow) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  c1 = s32[] constant(1073741824)  // 2^30
+  sum = s32[] add(c1, c1)  // 2^31, i.e. INT_MIN
+
+  c2 = s32[] constant(-2147483648)  // -2^31
+  sub = s32[] subtract(c2, c1)  // -2^31 - 2^30, underflows
+
+  mul = s32[] multiply(c1, c1)
+  ROOT tuple = (s32[], s32[], s32[]) tuple(sum, sub, mul)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  std::vector<Literal> actual = Evaluate({}).DecomposeTuple();
+  ASSERT_EQ(actual.size(), 3);
+
+  uint32 pow30 = uint32{1} << 30;
+  uint32 pow31 = uint32{1} << 31;
+  EXPECT_EQ(actual[0].GetFirstElement<int32>(), static_cast<int32>(pow31));
+  EXPECT_EQ(actual[1].GetFirstElement<int32>(),
+            static_cast<int32>(-(pow31 + pow30)));
+  EXPECT_EQ(actual[2].GetFirstElement<int32>(),
+            static_cast<int32>(pow31 * pow31));
+}
+
+TEST_F(HloEvaluatorTest, GetDimensionSize) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  size = u32[] parameter(0)
+
+  data = s32[4] parameter(1)
+
+  sum = s32[4] add(data, data)
+
+  ROOT dynamic_size = u32[] get-dimension-size(sum), dimensions={0}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(m_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {}},
+      DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  TF_ASSERT_OK_AND_ASSIGN(DynamicDimensionInference dynamic_dimension_inference,
+                          DynamicDimensionInference::Run(m_.get()));
+
+  evaluator_.set_dynamic_dimension_inference(&dynamic_dimension_inference);
+  Literal size_arg = LiteralUtil::CreateR0<uint32>(3);
+  Literal data_arg = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
+
+  Literal actual = Evaluate({&size_arg, &data_arg});
+
+  EXPECT_EQ(actual.GetFirstElement<uint32>(), static_cast<uint32>(3));
+}
+
+// Check that we get a useful error if we pass inputs of the wrong shape.
+TEST_F(HloEvaluatorTest, EvaluateWithWrongInputShapes) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  p0 = s32[1] parameter(0)
+  ROOT sum = s32[1] add(p0, p0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  Literal input_wrong_shape = LiteralUtil::CreateR1<int32>({0, 1});
+
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_, {&input_wrong_shape})
+                .status()
+                .error_message(),
+            "Shape mismatch at parameter 0. Computation expected s32[1]{0}, "
+            "but arg was s32[2].");
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_->entry_computation(), {&input_wrong_shape})
+                .status()
+                .error_message(),
+            "Shape mismatch at parameter 0. Computation expected s32[1]{0}, "
+            "but arg was s32[2].");
+}
+
+// Check that we get a useful error if we pass too many or too few inputs.
+TEST_F(HloEvaluatorTest, EvaluateWithWrongNumberOfInputs) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  p0 = s32[1] parameter(0)
+  ROOT sum = s32[1] add(p0, p0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  Literal input = LiteralUtil::CreateR1<int32>({0});
+
+  EXPECT_EQ(
+      HloEvaluator().Evaluate(*m_, {&input, &input}).status().error_message(),
+      "Expected 1 argument, but got 2.");
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_->entry_computation(), {&input, &input})
+                .status()
+                .error_message(),
+            "Expected 1 argument, but got 2.");
+}
+
+TEST_F(HloEvaluatorTest, PreserveFusionInputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule FusionInputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{0,1} parameter(0)
+      ROOT bitcast = f32[20,20]{1,0} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{0,1} parameter(0)
+      ROOT fusion = f32[20,20]{1,0} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+}
+
+TEST_F(HloEvaluatorTest, PreserveFusionOutputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule FusionOutputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      ROOT bitcast = f32[20,20]{0,1} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = f32[20,20]{0,1} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+}
+
+TEST_F(HloEvaluatorTest, PreserveMOFusionOutputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule MOFusionOutputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      bitcast = f32[20,20]{0,1} bitcast(param_0)
+      ROOT tuple = (f32[20,20]{0,1}) tuple(bitcast)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = (f32[20,20]{0,1}) fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual_tuple = Evaluate({&args[0]});
+  std::vector<Literal> actual_literals = actual_tuple.DecomposeTuple();
+  EXPECT_TRUE(
+      absl::c_equal(args[0].data<float>(), actual_literals[0].data<float>()));
+}
+
+// Tests that custom_calls fail to evaluate when no handler is specified.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_NoHandler) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_NoHandler
+    ENTRY kernel_entry {
+      parameter.0 = u32[2,2]{1,0} parameter(0)
+      ROOT test_root = (u32[2,2]{1,0}) custom-call(parameter.0),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  EXPECT_EQ(HloEvaluator().Evaluate(*m_, {&args[0]}).status().code(),
+            ::tensorflow::error::UNIMPLEMENTED);
+}
+
+// Tests when a custom_call handler returns an error.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_HandlerError) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_HandlerError
+    ENTRY kernel_entry {
+      parameter.0 = u32[2,2]{1,0} parameter(0)
+      ROOT test_root = (u32[2,2]{1,0}) custom-call(parameter.0),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  HloEvaluator evaluator;
+  evaluator.set_custom_call_handler(
+      [](HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+        return InternalError("Test error");
+      });
+  EXPECT_EQ(evaluator.Evaluate(*m_, {&args[0]}).status().code(),
+            ::tensorflow::error::INTERNAL);
+}
+
+// Tests the custom_call handler on calls with many inputs.
+// We sum the operands so that we can verify the operand and output literals
+// are properly mapped for access.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_ManyInputs) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_ManyInputs
+    ENTRY kernel_entry {
+      parameter.0 = u32[1]{0} parameter(0)
+      parameter.1 = u32[1]{0} parameter(1)
+      ROOT test_root = u32[1]{0} custom-call(parameter.0, parameter.1),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  HloEvaluator evaluator;
+  evaluator.set_custom_call_handler(
+      [](HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+        EXPECT_EQ(HloOpcode::kCustomCall, custom_call->opcode());
+        EXPECT_EQ("_my_custom_call", custom_call->custom_call_target());
+        EXPECT_EQ(2, custom_call->operand_count());
+        EXPECT_EQ(2, operands.size());
+        auto output = Literal::CreateFromShape(custom_call->shape());
+        auto operand0_data = operands[0]->data<uint32>();
+        auto operand1_data = operands[1]->data<uint32>();
+        auto output_data = output.data<uint32>();
+        output_data[0] = operand0_data[0] + operand1_data[0];
+        return output;
+      });
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      evaluator.Evaluate(*m_->entry_computation(), {&args[0], &args[1]}));
+  auto arg0_data = args[0].data<uint32>();
+  auto arg1_data = args[1].data<uint32>();
+  std::vector<uint32> expected_data = {arg0_data[0] + arg1_data[0]};
+  EXPECT_TRUE(absl::c_equal(expected_data, actual_literal.data<uint32>()));
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index 3ace2f544329253d217e1891ce387a8a55fe2339..648c7d0e676cd85ea255557bd969d92659aeeca7 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
+#include "absl/meta/type_traits.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -39,9 +40,8 @@ namespace xla {
 // Anyway this is relatively safe as-is because hlo_evaluator_typed_visitor.h is
 // a "private" header that's not exposed outside of hlo_evaluator.cc.
 template <typename T>
-using is_complex_t = std::is_same<T, complex64>;
-template <typename T>
-using is_complex64_t = std::is_same<T, complex64>;
+using is_complex_t =
+    absl::disjunction<std::is_same<T, complex64>, std::is_same<T, complex128>>;
 
 // It's UB to use std::sort with std::less<float>, because of NaNs. Define
 // "safe" less functions which are actually strict weak orders. -NaN and NaN
@@ -83,6 +83,26 @@ bool SafeLess(const NativeT& a, const NativeT& b) {
   return SafeLess(static_cast<float>(a), static_cast<float>(b));
 }
 
+// ToArithmeticSafeType(T t):
+//  - converts `t` to the bitwise-equivalent `unsigned T` if T is a signed
+//    integer, and
+//  - otherwise returns `t` unchanged.
+//
+// It's UB in C++ to under/overflow a signed integer, so we wrap all arithmetic
+// in this type to force 2's complement behavior.
+template <typename T,
+          typename std::enable_if<std::is_integral<T>::value &&
+                                  std::is_signed<T>::value>::type* = nullptr>
+typename std::make_unsigned<T>::type ToArithmeticSafeType(T t) {
+  return static_cast<typename std::make_unsigned<T>::type>(t);
+}
+template <typename T,
+          typename std::enable_if<!std::is_integral<T>::value ||
+                                  !std::is_signed<T>::value>::type* = nullptr>
+T ToArithmeticSafeType(T t) {
+  return std::move(t);
+}
+
 // Templated DfsHloVisitor for use by HloEvaluator.
 //
 // Typically ReturnT here indicates the resulting literal type of each evaluated
@@ -192,7 +212,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   template <
       typename NativeT,
-      typename std::enable_if<is_complex64_t<NativeT>::value>::type* = nullptr>
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleAbs(HloInstruction* abs) {
     const Literal& operand_literal =
         parent_->GetEvaluatedLiteralFor(abs->operand(0));
@@ -211,6 +231,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     // specifying the ElementwiseT explicitly as C64 is needed below.
     if (abs->operand(0)->shape().element_type() == C64) {
       return HandleAbs<complex64>(abs);
+    } else if (abs->operand(0)->shape().element_type() == C128) {
+      return HandleAbs<complex128>(abs);
     }
     return HandleAbs<ElementwiseT>(abs);
   }
@@ -498,47 +520,25 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_signed<NativeT>::value &&
-                !std::is_floating_point<NativeT>::value>::type* = nullptr>
-  Status HandleMultiply(HloInstruction* multiply) {
-    using type = typename std::make_unsigned<NativeT>::type;
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return NativeT(type(lhs_elem) * type(rhs_elem));
-                            }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_unsigned<NativeT>::value ||
-                              std::is_floating_point<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleMultiply(HloInstruction* multiply) {
+  Status HandleMultiply(HloInstruction* multiply) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return lhs_elem * rhs_elem;
-                            }));
+        ElementWiseBinaryOp(
+            multiply, [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+              return ElementwiseT(ToArithmeticSafeType(lhs_elem) *
+                                  ToArithmeticSafeType(rhs_elem));
+            }));
     return Status::OK();
   }
 
-  Status HandleMultiply(HloInstruction* multiply) override {
-    return HandleMultiply<ElementwiseT>(multiply);
-  }
-
   Status HandleSubtract(HloInstruction* subtract) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[subtract],
-        ElementWiseBinaryOp(subtract,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return lhs_elem - rhs_elem;
-                            }));
+        ElementWiseBinaryOp(
+            subtract, [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+              return ElementwiseT(ToArithmeticSafeType(lhs_elem) -
+                                  ToArithmeticSafeType(rhs_elem));
+            }));
     return Status::OK();
   }
 
@@ -546,7 +546,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[add],
                         ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem,
                                                     ElementwiseT rhs_elem) {
-                          return lhs_elem + rhs_elem;
+                          return ElementwiseT(ToArithmeticSafeType(lhs_elem) +
+                                              ToArithmeticSafeType(rhs_elem));
                         }));
     return Status::OK();
   }
@@ -674,11 +675,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandlePower(HloInstruction* power) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[power],
-                        ElementWiseBinaryOp(power, [](ElementwiseT lhs_el,
-                                                      ElementwiseT rhs_el) {
-                          return std::pow(lhs_el, rhs_el);
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[power],
+        ElementWiseBinaryOp(
+            power, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+              return lhs_el == ElementwiseT(0) && rhs_el == ElementwiseT(0)
+                         ? static_cast<ElementwiseT>(1)
+                         : std::pow(lhs_el, rhs_el);
+            }));
     return Status::OK();
   }
 
@@ -918,7 +922,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
         clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
-          return std::fmin(high, std::fmax(value, low));
+          if (std::isnan(low) || std::isnan(high)) {
+            return static_cast<ElementwiseT>(NAN);
+          }
+          return static_cast<ElementwiseT>(
+              std::fmin(high, std::fmax(value, low)));
         };
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[clamp],
@@ -940,7 +948,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleSelect(HloInstruction* select) override {
     CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape()));
-    CHECK(ShapeUtil::IsArray(select->shape()));
+    CHECK(select->shape().IsArray());
     std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
         [](bool pred, ReturnT on_true, ReturnT on_false) {
           if (pred) {
@@ -993,8 +1001,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape));
     TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape));
-    CHECK(ShapeUtil::IsArray(lhs_shape));
-    CHECK(ShapeUtil::IsArray(rhs_shape));
+    CHECK(lhs_shape.IsArray());
+    CHECK(rhs_shape.IsArray());
     CHECK(ShapeUtil::SameElementType(lhs_shape, rhs_shape));
     CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape));
 
@@ -1005,8 +1013,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     CHECK_GE(num_spatial_dims, 0);
     CHECK_EQ(window.dimensions_size(), num_spatial_dims);
 
-    const auto lhs_rank = ShapeUtil::Rank(lhs_shape);
-    const auto rhs_rank = ShapeUtil::Rank(rhs_shape);
+    const auto lhs_rank = lhs_shape.rank();
+    const auto rhs_rank = rhs_shape.rank();
 
     CHECK_EQ(num_spatial_dims + 2, lhs_rank);
     CHECK_EQ(num_spatial_dims + 2, rhs_rank);
@@ -1037,15 +1045,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto lhs_literal_data = lhs_literal.data<ReturnT>();
     auto rhs_literal_data = rhs_literal.data<ReturnT>();
 
-    int64 feature_group_count = conv->feature_group_count();
-    int64 batch_group_count = conv->batch_group_count();
+    const int64 feature_group_count = conv->feature_group_count();
+    const int64 batch_group_count = conv->batch_group_count();
 
-    // The batch count > 1 case is unimplemented in the HLO evaluator so far.
-    TF_RET_CHECK(batch_group_count == 1);
     auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
                  &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
-                 rhs_literal_data,
-                 feature_group_count](const absl::Span<const int64> out_index) {
+                 rhs_literal_data, feature_group_count,
+                 batch_group_count](const absl::Span<const int64> out_index) {
       // Dimension number applicable for input (lhs).
       const int64 input_batch_dim = dnums.input_batch_dimension();
       const int64 input_z_dim = dnums.input_feature_dimension();
@@ -1058,6 +1064,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
       const int64 input_z_size =
           ShapeUtil::GetDimension(lhs_shape, input_z_dim);
+
+      const int64 input_batch_size =
+          ShapeUtil::GetDimension(lhs_shape, input_batch_dim);
+
+      const int64 batch_group_size = input_batch_size / batch_group_count;
+
       // The size of an input feature group.
       const int64 input_feature_group_size = input_z_size / feature_group_count;
 
@@ -1073,11 +1085,15 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       const int64 feature_group_index =
           out_index[output_z_dim] / output_feature_group_size;
 
+      const int64 batch_group_index = out_index[output_z_dim];
+
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
       DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
                                         0);
 
       // Convolve input feature with kernel.
+      // The mechanism indexes into the correct LHS (input) and RHS (kernel)
+      // locations and accumulates multiplications for a given output index.
       do {
         // Find corresponding spatial dimension index for input (lhs).
         int64 lhs_linear_spatial_index = 0;
@@ -1130,11 +1146,24 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
               feature_group_index * input_feature_group_size + rhs_iz;
 
           int64 lhs_linear_index = lhs_linear_spatial_index;
+
           lhs_linear_index += out_index[output_batch_dim] *
                               lhs_dim_multipliers[input_batch_dim];
+
+          // We are scraping only the diagonal elements in the resultant
+          // convolution output when batch_group_count is greater than 1,
+          // where 1 is the default. No scraping is done in that case.
+          // This approach works out automatically for 'groups' in batches
+          // with group_size > 1, because we already descend down the batch
+          // dimension for the 'output_batch_dim' above.
+          lhs_linear_index +=
+              ((batch_group_index * batch_group_size) % input_batch_size) *
+              lhs_dim_multipliers[input_batch_dim];
+
           lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
 
           int64 rhs_linear_index = rhs_linear_spatial_index;
+
           rhs_linear_index += out_index[output_z_dim] *
                               rhs_dim_multipliers[kernel_output_z_dim];
           rhs_linear_index += rhs_iz * rhs_dim_multipliers[kernel_input_z_dim];
@@ -1158,7 +1187,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleDot(HloInstruction* dot) override {
-    if (parent_->use_fast_path_) {
+    if (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() == 1 &&
+        parent_->use_fast_path_) {
       return HandleDot<ReturnT>(dot);
     }
     return HandleDotSlowPath(dot);
@@ -1169,21 +1199,19 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleDot(HloInstruction* dot) {
     const HloInstruction* lhs = dot->operand(0);
     const HloInstruction* rhs = dot->operand(1);
-    CHECK(ShapeUtil::IsArray(dot->shape()));
-    CHECK(ShapeUtil::IsArray(lhs->shape()));
-    CHECK(ShapeUtil::IsArray(rhs->shape()));
+    CHECK(dot->shape().IsArray());
+    CHECK(lhs->shape().IsArray());
+    CHECK(rhs->shape().IsArray());
 
     const auto& dnums = dot->dot_dimension_numbers();
 
-    const int64 lhs_rank = ShapeUtil::Rank(lhs->shape());
-    const int64 rhs_rank = ShapeUtil::Rank(rhs->shape());
+    const int64 lhs_rank = lhs->shape().rank();
+    const int64 rhs_rank = rhs->shape().rank();
 
     CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
     CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
 
     // There must be 1 and only 1 Contracting dimension for lhs and rhs.
-    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
-    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
     const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
     const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
     // Contracted dimension sizes must be the same.
@@ -1232,33 +1260,18 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleDotSlowPath(HloInstruction* dot) {
     auto lhs = dot->operand(0);
     auto rhs = dot->operand(1);
-    CHECK(ShapeUtil::IsArray(dot->shape()));
-    CHECK(ShapeUtil::IsArray(lhs->shape()));
-    CHECK(ShapeUtil::IsArray(rhs->shape()));
+    CHECK(dot->shape().IsArray());
+    CHECK(lhs->shape().IsArray());
+    CHECK(rhs->shape().IsArray());
 
     const auto& dnums = dot->dot_dimension_numbers();
 
-    const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
-    const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
+    const auto lhs_rank = lhs->shape().rank();
+    const auto rhs_rank = rhs->shape().rank();
 
     CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
     CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
 
-    // There must be 1 and only 1 Contracting dimension for lhs and rhs.
-    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
-    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
-    const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
-    const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
-    // Contracted dimension sizes must be the same.
-    CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension),
-             rhs->shape().dimensions(rhs_contracting_dimension))
-        << "lhs contracted dimension: "
-        << lhs->shape().dimensions(lhs_contracting_dimension)
-        << " rhs contracted dimension: "
-        << rhs->shape().dimensions(rhs_contracting_dimension);
-    const int64 contracted_dimension_size =
-        lhs->shape().dimensions(lhs_contracting_dimension);
-
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
@@ -1272,7 +1285,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     // in lhs_index or rhs_index where the i'th result index should go.
     absl::InlinedVector<std::pair<int64*, int64*>, kInlineRank>
         result_index_locations;
-    result_index_locations.reserve(lhs_rank + rhs_rank - 2);
+    result_index_locations.reserve(
+        (lhs_rank - dnums.lhs_contracting_dimensions_size()) +
+        (rhs_rank - dnums.rhs_contracting_dimensions_size()));
 
     // The first components in the output shape are the LHS and RHS batch
     // dimensions:
@@ -1284,18 +1299,32 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     // Then we have the LHS and RHS non-contracting dimensions, if any:
     for (int64 i = 0; i < lhs_rank; i++) {
-      if (i != lhs_contracting_dimension &&
+      if (!absl::c_linear_search(dnums.lhs_contracting_dimensions(), i) &&
           !absl::c_linear_search(dnums.lhs_batch_dimensions(), i)) {
         result_index_locations.push_back({&lhs_index[i], nullptr});
       }
     }
     for (int64 i = 0; i < rhs_rank; i++) {
-      if (i != rhs_contracting_dimension &&
+      if (!absl::c_linear_search(dnums.rhs_contracting_dimensions(), i) &&
           !absl::c_linear_search(dnums.rhs_batch_dimensions(), i)) {
         result_index_locations.push_back({&rhs_index[i], nullptr});
       }
     }
 
+    absl::InlinedVector<int64, kInlineRank> accumulate_index_sizes;
+    accumulate_index_sizes.reserve(dnums.lhs_contracting_dimensions_size());
+    absl::InlinedVector<std::pair<int64*, int64*>, kInlineRank>
+        accumulate_index_locations;
+    accumulate_index_locations.reserve(dnums.lhs_contracting_dimensions_size());
+    for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) {
+      const int64 lhs_dnum = dnums.lhs_contracting_dimensions(i);
+      const int64 rhs_dnum = dnums.rhs_contracting_dimensions(i);
+      accumulate_index_locations.push_back(
+          {&lhs_index[lhs_dnum], &rhs_index[rhs_dnum]});
+      const int64 dim_size = lhs->shape().dimensions(lhs_dnum);
+      accumulate_index_sizes.push_back(dim_size);
+    }
+    const int64 total_contraction_size = Product(accumulate_index_sizes);
     Literal result(dot->shape());
     TF_RETURN_IF_ERROR(
         result.Populate<ReturnT>([&](absl::Span<const int64> result_index) {
@@ -1309,13 +1338,30 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           }
 
           // Accumulates resulting product along the contracted dimension.
-          for (int64 i = 0; i < contracted_dimension_size; ++i) {
-            lhs_index[lhs_contracting_dimension] = i;
-            rhs_index[rhs_contracting_dimension] = i;
+          absl::InlinedVector<int64, kInlineRank> accumulate_index(
+              accumulate_index_sizes.size(), 0);
+          for (int64 k = 0; k < total_contraction_size; k++) {
+            for (int64 i = 0; i < accumulate_index_sizes.size(); ++i) {
+              *(accumulate_index_locations[i].first) = accumulate_index[i];
+              *(accumulate_index_locations[i].second) = accumulate_index[i];
+            }
 
             result_val +=
                 static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
                 static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+
+            // If there are no contracting dimension accumulate_index_sizes is
+            // empty, do not try to count down from -1 to 0 since it is and
+            // infinite loop.
+            if (!accumulate_index_sizes.empty()) {
+              for (int64 i = accumulate_index_sizes.size() - 1; i >= 0; --i) {
+                int64 value = ++accumulate_index[i];
+                if (value != accumulate_index_sizes[i]) {
+                  break;
+                }
+                accumulate_index[i] = 0;
+              }
+            }
           }
 
           return static_cast<ReturnT>(result_val);
@@ -1326,10 +1372,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandlePad(HloInstruction* pad) override {
-    CHECK(ShapeUtil::IsArray(pad->operand(0)->shape()));
+    CHECK(pad->operand(0)->shape().IsArray());
     // Padding value must be scalar.
     CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape()));
-    CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()),
+    CHECK_EQ(pad->operand(0)->shape().rank(),
              pad->padding_config().dimensions_size());
 
     TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
@@ -1352,9 +1398,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& evaluated_operand =
         parent_->GetEvaluatedLiteralFor(pad->operand(0));
 
-    std::vector<int64> input_index(ShapeUtil::Rank(evaluated_operand.shape()),
-                                   0);
-    std::vector<int64> target_index(ShapeUtil::Rank(result.shape()), 0);
+    std::vector<int64> input_index(evaluated_operand.shape().rank(), 0);
+    std::vector<int64> target_index(result.shape().rank(), 0);
 
     // Loop through each element of the operand, assign them to the
     // corresponding index of the resulting padded literal.
@@ -1397,10 +1442,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto operand = dynamic_slice->operand(0);
     auto start_indices = dynamic_slice->operand(1);
     auto result_shape = dynamic_slice->shape();
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferDynamicSliceShape(
-                            operand->shape(), start_indices->shape(),
-                            dynamic_slice->dynamic_slice_sizes()));
+    TF_ASSIGN_OR_RETURN(
+        auto inferred_return_shape,
+        ShapeInference::InferDynamicSliceShape(
+            operand->shape(),
+            Cast<HloDynamicSliceInstruction>(dynamic_slice)->index_shapes(),
+            dynamic_slice->dynamic_slice_sizes()));
     TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1409,33 +1456,39 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         primitive_util::IsIntegralType(start_indices->shape().element_type()));
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
 
     switch (start_indices->shape().element_type()) {
       case S32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int32>(operand_literal, start_indices_literal,
-                                result_shape));
+            DynamicSlice<int32>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case S64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int64>(operand_literal, start_indices_literal,
-                                result_shape));
+            DynamicSlice<int64>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case U32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint32>(operand_literal, start_indices_literal,
-                                 result_shape));
+            DynamicSlice<uint32>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case U64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint64>(operand_literal, start_indices_literal,
-                                 result_shape));
+            DynamicSlice<uint64>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       default:
         LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for "
@@ -1455,7 +1508,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(
         auto inferred_return_shape,
         ShapeInference::InferDynamicUpdateSliceShape(
-            operand->shape(), update->shape(), start_indices->shape()));
+            operand->shape(), update->shape(),
+            Cast<HloDynamicUpdateSliceInstruction>(dynamic_update_slice)
+                ->index_shapes()));
     TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1466,33 +1521,39 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
 
     switch (start_indices->shape().element_type()) {
       case S32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int32>(operand_literal, update_literal,
-                                      start_indices_literal));
+            DynamicUpdateSlice<int32>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case S64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int64>(operand_literal, update_literal,
-                                      start_indices_literal));
+            DynamicUpdateSlice<int64>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case U32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint32>(operand_literal, update_literal,
-                                       start_indices_literal));
+            DynamicUpdateSlice<uint32>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case U64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint64>(operand_literal, update_literal,
-                                       start_indices_literal));
+            DynamicUpdateSlice<uint64>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       default:
         LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for "
@@ -1529,7 +1590,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           }
 
           Literal computed_result =
-              embedded_evaluator.Evaluate<Literal>(*computation, arg_literals)
+              embedded_evaluator.Evaluate(*computation, arg_literals)
                   .ConsumeValueOrDie();
           // Clear visit states so that the we can use the evaluate again on
           // the same computation.
@@ -1587,6 +1648,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex64>(map));
         break;
       }
+      case C128: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex128>(map));
+        break;
+      }
       default:
         LOG(FATAL) << "HandleMap: unhandled primitive type for "
                       "input operand: "
@@ -1609,7 +1674,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& keys_literal = parent_->GetEvaluatedLiteralFor(keys);
     int64 sort_dim = sort->dimensions(0);
     int64 sort_dim_elements = keys->shape().dimensions(sort_dim);
-    int64 rank = ShapeUtil::Rank(keys->shape());
+    int64 rank = keys->shape().rank();
     if (rank == 0) {
       // Nothing to sort.
       parent_->evaluated_[sort] = keys_literal.Clone();
@@ -1626,8 +1691,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           // Extract a slice from the literal that corresponds to exactly the
           // row in dimension 'sort_dim'.
           std::vector<int64> limit_indices(indices.begin(), indices.end());
-          std::for_each(limit_indices.begin(), limit_indices.end(),
-                        [](int64& index) { ++index; });
+          absl::c_for_each(limit_indices, [](int64& index) { ++index; });
           limit_indices[sort_dim] = sort_dim_elements;
           TF_ASSIGN_OR_RETURN(auto row_to_sort,
                               keys_literal.Slice(indices, limit_indices)
@@ -1670,7 +1734,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleReduce(HloInstruction* hlo) override {
     HloReduceInstruction* reduce = Cast<HloReduceInstruction>(hlo);
     int64 num_args = reduce->inputs().size();
-    bool has_tuple_output = ShapeUtil::IsTuple(reduce->shape());
+    bool has_tuple_output = reduce->shape().IsTuple();
     absl::Span<const int64> dimensions(reduce->dimensions());
     HloComputation* function = reduce->to_apply();
 
@@ -1701,7 +1765,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     // All args and results have the same dimensions, so pick an arbitrary one.
     const Shape& arg_shape = arg_literals[0]->shape();
-    const Shape& result_shape = ShapeUtil::IsTuple(reduce->shape())
+    const Shape& result_shape = reduce->shape().IsTuple()
                                     ? reduce->shape().tuple_shapes(0)
                                     : reduce->shape();
     const auto arg_dimensions = AsInt64Slice(arg_shape.dimensions());
@@ -1790,7 +1854,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                              [](Literal& literal) { return &literal; });
 
               TF_ASSIGN_OR_RETURN(Literal computed_result,
-                                  embedded_evaluator.Evaluate<const Literal*>(
+                                  embedded_evaluator.Evaluate(
                                       *function, embedded_operands_ptrs));
               // Clear visit states so that we can use the evaluator again on
               // the same computation.
@@ -1868,7 +1932,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
 
-    int64 rank = ShapeUtil::Rank(operand_literal.shape());
+    int64 rank = operand_literal.shape().rank();
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     DimensionVector source_index(rank, 0);
@@ -1906,8 +1970,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             selected_val_literal.Set({}, *selected_val);
             Literal computed_result =
                 embedded_evaluator
-                    .Evaluate<const Literal*>(
-                        *select, {&selected_val_literal, &curr_val_literal})
+                    .Evaluate(*select,
+                              {&selected_val_literal, &curr_val_literal})
                     .ConsumeValueOrDie();
             bool selected = !computed_result.Get<bool>({});
             if (selected) {
@@ -1928,9 +1992,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
               scattered_literal.Set({}, scattered);
               Literal computed_result =
                   embedded_evaluator
-                      .Evaluate<const Literal*>(
-                          *scatter,
-                          {&source_literal_scatter, &scattered_literal})
+                      .Evaluate(*scatter,
+                                {&source_literal_scatter, &scattered_literal})
                       .ConsumeValueOrDie();
               result.Set(operand_index, computed_result.Get<ReturnT>({}));
               // Clear visit states so that the we can use the evaluator again
@@ -1980,7 +2043,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         operand->shape().element_type(), window_dimension_sizes);
 
     DimensionVector window_index(window.dimensions_size());
-    DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
+    DimensionVector operand_index(operand_literal.shape().rank());
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     Literal result(reduce_window->shape());
@@ -2004,8 +2067,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                     LiteralUtil::CreateR0<ReturnT>(result_val);
                 Literal computed_result =
                     embedded_evaluator
-                        .Evaluate<const Literal*>(
-                            *function, {&result_val_literal, &curr_val_literal})
+                        .Evaluate(*function,
+                                  {&result_val_literal, &curr_val_literal})
                         .ConsumeValueOrDie();
 
                 // Clear visit states so that the we can use the evaluate again
@@ -2367,9 +2430,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           LiteralUtil::CreateR0<ReturnT>(updates.Get<ReturnT>(update_index));
       Literal updated_result =
           embedded_evaluator
-              .Evaluate<const Literal*>(
-                  *scatter->to_apply(),
-                  {&result_value_literal, &update_value_literal})
+              .Evaluate(*scatter->to_apply(),
+                        {&result_value_literal, &update_value_literal})
               .ConsumeValueOrDie();
       // Clear visit states so that the we can use the evaluate again on the
       // same computation.
@@ -2411,7 +2473,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         << " but is inferred to be: "
         << ShapeUtil::HumanString(inferred_return_shape);
 
-    const int64 rank = ShapeUtil::Rank(operand->shape());
+    const int64 rank = operand->shape().rank();
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     auto func = [&](absl::Span<const int64> out_index) {
       DimensionVector operand_index(rank);
@@ -2608,7 +2670,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_same<
                                   double, NativeT>::value>::type* = nullptr>
   Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Double not supported for reduce precision");
+    return InvalidArgument("Double is not supported for reduce precision");
   }
 
   template <
@@ -2623,12 +2685,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleReducePrecision<ElementwiseT>(reduce_precision);
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_same<NativeT, bfloat16>::value ||
-                std::is_same<NativeT, Eigen::half>::value ||
-                std::is_integral<NativeT>::value ||
-                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          std::is_same<NativeT, bfloat16>::value ||
+          std::is_same<NativeT, Eigen::half>::value ||
+          std::is_integral<NativeT>::value || is_complex_t<NativeT>::value ||
+          std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
     const int64 iota_size = iota->shape().dimensions(iota->iota_dimension());
@@ -2648,23 +2711,24 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     }
     auto result = LiteralUtil::CreateR1<NativeT>(data);
 
-    if (ShapeUtil::Rank(iota->shape()) > 1) {
+    if (iota->shape().rank() > 1) {
       TF_ASSIGN_OR_RETURN(
           parent_->evaluated_[iota],
           result.Broadcast(iota->shape(), {iota->iota_dimension()}));
     } else {
-      TF_RET_CHECK(ShapeUtil::Rank(iota->shape()) == 1);
+      TF_RET_CHECK(iota->shape().rank() == 1);
       parent_->evaluated_[iota] = std::move(result);
     }
 
     return Status::OK();
   }
-  template <typename NativeT,
-            typename std::enable_if<
-                !(std::is_same<NativeT, bfloat16>::value ||
-                  std::is_same<NativeT, Eigen::half>::value ||
-                  std::is_integral<NativeT>::value ||
-                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          !(std::is_same<NativeT, bfloat16>::value ||
+            std::is_same<NativeT, Eigen::half>::value ||
+            std::is_integral<NativeT>::value || is_complex_t<NativeT>::value ||
+            std::is_floating_point<NativeT>::value)>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
     return UnsupportedTypeError(iota);
   }
@@ -2672,6 +2736,103 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleIota<ReturnT>(iota);
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                !(std::is_integral<NativeT>::value ||
+                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  Status HandleRng(HloInstruction* random) {
+    return UnsupportedTypeError(random);
+  }
+  template <typename NativeT,
+            typename std::enable_if<
+                (std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  Status HandleRng(HloInstruction* random) {
+    RandomDistribution distribution = random->random_distribution();
+    const auto result_shape = random->shape();
+    Literal result(result_shape);
+
+    switch (distribution) {
+      case RNG_UNIFORM: {
+        const Literal& low =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& high =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        std::uniform_real_distribution<NativeT> generator(
+            low.Get<NativeT>({}), high.Get<NativeT>({}));
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return generator(parent_->engine_);
+            }));
+        break;
+      }
+      case RNG_NORMAL: {
+        const Literal& mean =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& stddev =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        std::normal_distribution<NativeT> generator(mean.Get<NativeT>({}),
+                                                    stddev.Get<NativeT>({}));
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return generator(parent_->engine_);
+            }));
+        break;
+      }
+      default:
+        return UnimplementedStrCat("The distribution ",
+                                   RandomDistribution_Name(distribution),
+                                   " is not implemented.");
+    }
+    parent_->evaluated_[random] = std::move(result);
+    return Status::OK();
+  }
+  template <typename NativeT,
+            typename std::enable_if<(std::is_integral<NativeT>::value)>::type* =
+                nullptr>
+  Status HandleRng(HloInstruction* random) {
+    RandomDistribution distribution = random->random_distribution();
+    const auto result_shape = random->shape();
+    Literal result(result_shape);
+
+    switch (distribution) {
+      case RNG_UNIFORM: {
+        const Literal& low =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& high =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        // Note std::uniform_int_distribution assumes interval is closed, i.e.,
+        // [low, high], but we want [low, high) instead. Hence high-1 is used as
+        // the upper range.
+        std::uniform_int_distribution<int64> generator(
+            low.Get<NativeT>({}), high.Get<NativeT>({}) - 1);
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return static_cast<NativeT>(generator(parent_->engine_));
+            }));
+        break;
+      }
+      case RNG_NORMAL: {
+        return Unimplemented(
+            "Normal distribution is not supported for integral types.");
+      }
+      default:
+        return UnimplementedStrCat("The distribution ",
+                                   RandomDistribution_Name(distribution),
+                                   " is not implemented.");
+    }
+    parent_->evaluated_[random] = std::move(result);
+    return Status::OK();
+  }
+  Status HandleRng(HloInstruction* random) override {
+    return HandleRng<ReturnT>(random);
+  }
+
  private:
   // Creates a vector of multipliers which can be used to create a linear index
   // into shape.
@@ -2683,7 +2844,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   //
   // This lets you calculate LI given the multidimensional indices in any order.
   static DimensionVector MakeDimMultipliers(const Shape& shape) {
-    DimensionVector v(ShapeUtil::Rank(shape));
+    DimensionVector v(shape.rank());
     int64 scale = 1;
     for (auto dim : LayoutUtil::MinorToMajor(shape)) {
       v[dim] = scale;
@@ -2700,7 +2861,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       const Shape& window_shape, const Window& window, const Shape& base_shape,
       const absl::Span<const int64>& window_count_index,
       const std::function<void(const std::vector<int64>&)>& f) {
-    const int64 rank = ShapeUtil::Rank(base_shape);
+    const int64 rank = base_shape.rank();
     DimensionVector window_index(rank);
     std::fill(window_index.begin(), window_index.end(), 0);
     do {
@@ -2731,12 +2892,27 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename IndexT>
-  StatusOr<Literal> DynamicSlice(const Literal& operand_literal,
-                                 const Literal& start_indices_literal,
-                                 const Shape& result_shape) {
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
+  StatusOr<Literal> DynamicSlice(
+      const Literal& operand_literal,
+      absl::Span<HloInstruction* const> start_indices,
+      const Shape& result_shape) {
+    std::vector<int64> start;
+    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
+    // between the cases, this currently assumes there is at least 1 index. That
+    // is wrong in the general case, because for scalar indices, if the operand
+    // is scalar, then there are no indices. This problem with resolve itself.
+    const HloInstruction* first_index = start_indices[0];
+    if (first_index->shape().rank() == 1) {
+      auto start_indices_typed =
+          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
+      start = std::vector<int64>(start_indices_typed.begin(),
+                                 start_indices_typed.end());
+    } else {
+      for (HloInstruction* index : start_indices) {
+        start.push_back(
+            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
+      }
+    }
 
     // Clamp the start indices so the slice is in-bounds w.r.t the operand.
     for (int64 i = 0; i < start.size(); ++i) {
@@ -2762,14 +2938,28 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename IndexT>
-  StatusOr<Literal> DynamicUpdateSlice(const Literal& operand_literal,
-                                       const Literal& update_literal,
-                                       const Literal& start_indices_literal) {
+  StatusOr<Literal> DynamicUpdateSlice(
+      const Literal& operand_literal, const Literal& update_literal,
+      absl::Span<HloInstruction* const> start_indices) {
     auto result = operand_literal.Clone();
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    const auto rank = ShapeUtil::Rank(result.shape());
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
+    const auto rank = result.shape().rank();
+    std::vector<int64> start;
+    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
+    // between the cases, this currently assumes there is at least 1 index. That
+    // is wrong in the general case, because for scalar indices, if the operand
+    // is scalar, then there are no indices. This problem with resolve itself.
+    const HloInstruction* first_index = start_indices[0];
+    if (first_index->shape().rank() == 1) {
+      auto start_indices_typed =
+          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
+      start = std::vector<int64>(start_indices_typed.begin(),
+                                 start_indices_typed.end());
+    } else {
+      for (HloInstruction* index : start_indices) {
+        start.push_back(
+            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
+      }
+    }
     // Clamp the update start indices so the slice is in-bounds w.r.t the
     // operand.
     for (int64 i = 0; i < rank; ++i) {
@@ -2886,6 +3076,7 @@ extern template class HloEvaluatorTypedVisitor<Eigen::half, float>;
 extern template class HloEvaluatorTypedVisitor<float>;
 extern template class HloEvaluatorTypedVisitor<double>;
 extern template class HloEvaluatorTypedVisitor<complex64>;
+extern template class HloEvaluatorTypedVisitor<complex128>;
 extern template class HloEvaluatorTypedVisitor<bfloat16, float>;
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f48140ee4f6ca9415bef80c83664213109dbf9f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<complex128>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e54285a1577a3f3c97fba5ba6c2f969299ab599e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc
@@ -0,0 +1,22 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<int16>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc708952d20a00429944c8388a84a0e610c2f38f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc
@@ -0,0 +1,22 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<uint16>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 5be9dba3aa49d63c580cd6f5800f608667826b6a..df06cf8c53ec8407f8b44c9126ed4fb5409f8ef3 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -45,7 +45,7 @@ TEST_F(HloExecutionProfileTest, Basic) {
 
   auto shape_size_function = [&](const Shape& shape) {
     const int64 pointer_size = 8;
-    if (ShapeUtil::IsOpaque(shape)) {
+    if (shape.IsOpaque()) {
       return pointer_size;
     }
     return ShapeUtil::ByteSizeOf(shape, pointer_size);
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
index c919dbd82d3668c477bf37074f1d56f8cb7d9506..862b2029718bbd802b69d789b66683a4edfa2367 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -25,7 +26,9 @@ namespace xla {
 
 namespace {
 
-StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
+StatusOr<bool> ReplaceGetSize(
+    HloInstruction* instr,
+    const DynamicDimensionInference* dynamic_dimension_inference) {
   if (instr->opcode() != HloOpcode::kGetDimensionSize) {
     return false;
   }
@@ -36,10 +39,18 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
                           instr->operand(0)->shape(), instr->dimension()));
   TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape));
   TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32));
-  uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
-  HloInstruction* new_instr = computation->AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
-  TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  HloInstruction* operand = instr->mutable_operand(0);
+  int64 dim = instr->dimension();
+  HloInstruction* dynamic_size =
+      dynamic_dimension_inference->GetDynamicSize(operand, {}, dim);
+  if (dynamic_size != nullptr) {
+    TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size));
+  } else {
+    uint32 size = instr->operand(0)->shape().dimensions(dim);
+    HloInstruction* new_instr = computation->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
+    TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  }
   return true;
 }
 
@@ -48,10 +59,13 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
 StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
   bool changed = false;
   HloProto proto;
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
+                      DynamicDimensionInference::Run(module));
   *proto.mutable_hlo_module() = module->ToProto();
   for (auto* computation : module->computations()) {
     for (auto instruction : computation->instructions()) {
-      TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
+      TF_ASSIGN_OR_RETURN(bool replaced,
+                          ReplaceGetSize(instruction, &inference));
       changed = changed || replaced;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
index 30f44c23a835b3bcc935caaa917e040e07c4e703..9aa79fe66b665c48ec871c4188e44ba2056de3ad 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
@@ -21,7 +21,9 @@ limitations under the License.
 
 namespace xla {
 
-// Pass to replace a kGetDimensionSize instruction with a constant instruction.
+// Pass to replace a kGetDimensionSize instruction with a hlo instruction
+// representing the dynamic size if the dimension is dynamic, otherwise a
+// constant instruction representing the static size.
 class HloGetDimensionSizeRewriter : public HloModulePass {
  public:
   absl::string_view name() const override {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index dbf0d2c113bf670da3617967d913da819ccf2663..4c7f5e9e7dfb12a8cb699bdf397eab21983342a1 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include <queue>
 #include <string>
 #include <tuple>
-#include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -380,7 +380,7 @@ class HloDotDumper {
   // Each HloInstruction dumped gets a monotically-increasing node ID.  This
   // must start at 1, because that's where graphviz's accounting starts.
   int64 next_node_id_ = 1;
-  std::unordered_map<const HloInstruction*, int64> node_ids_;
+  absl::flat_hash_map<const HloInstruction*, int64> node_ids_;
 
   // The "root" tag doesn't have an associated HloInstruction pointer, so we
   // need to store it outside the map.
@@ -397,7 +397,7 @@ class HloDotDumper {
 
   // Each HloComputation that's emitted gets a monotonically-increasing ID.
   int64 next_cluster_id_ = 1;
-  std::unordered_map<const HloComputation*, int64> cluster_ids_;
+  absl::flat_hash_map<const HloComputation*, int64> cluster_ids_;
 
   // Edges to print from Footer().  Edges come at the end because graphviz is
   // unhappy if an edge from a subcomputation to a node in the outer computation
@@ -407,7 +407,7 @@ class HloDotDumper {
 
   // When coloring by sharding information, we track the sharding string
   // representation to color association, by round-robin the color schemes.
-  std::unordered_map<HloSharding, ColorScheme, HloSharding::Hasher>
+  absl::flat_hash_map<HloSharding, ColorScheme, HloSharding::Hasher>
       sharding_colors_;
   int64 next_shard_color_ = 0;
 };
@@ -561,8 +561,8 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
   }
 
   // Show the subcomputation if we're showing any of its members.
-  return std::any_of(
-      subcomp->instructions().begin(), subcomp->instructions().end(),
+  return absl::c_any_of(
+      subcomp->instructions(),
       [&](const HloInstruction* instr) { return filter_.Show(instr); });
 }
 
@@ -733,17 +733,16 @@ bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
     return true;
   }
   const int kMinUsersToOmit = 3;
-  return instr->opcode() == HloOpcode::kParameter &&
-         ShapeUtil::IsTuple(instr->shape()) && !instr->IsFused() &&
-         std::count_if(instr->users().begin(), instr->users().end(),
-                       [&](const HloInstruction* user) {
-                         return filter_.Show(user);
-                       }) > kMinUsersToOmit &&
-         std::all_of(instr->users().begin(), instr->users().end(),
-                     [&](const HloInstruction* user) {
-                       return !filter_.Show(user) ||
-                              user->opcode() == HloOpcode::kGetTupleElement;
-                     });
+  return instr->opcode() == HloOpcode::kParameter && instr->shape().IsTuple() &&
+         !instr->IsFused() &&
+         absl::c_count_if(instr->users(),
+                          [&](const HloInstruction* user) {
+                            return filter_.Show(user);
+                          }) > kMinUsersToOmit &&
+         absl::c_all_of(instr->users(), [&](const HloInstruction* user) {
+           return !filter_.Show(user) ||
+                  user->opcode() == HloOpcode::kGetTupleElement;
+         });
 }
 
 string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
@@ -816,7 +815,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Print the literal value of constants with <= K elements.
     optional<int64> elem_count;
-    if (ShapeUtil::IsArray(shape)) {
+    if (shape.IsArray()) {
       elem_count = 1;
       for (int64 dim : shape.dimensions()) {
         *elem_count *= dim;
@@ -900,12 +899,11 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
   // the same color as a parameter.  Unless the merged-in parameter is a
   // parameter to a fusion node that is bound to a constant -- these aren't
   // "real" parameters from the user's perspective.
-  if (std::any_of(instr->operands().begin(), instr->operands().end(),
-                  [&](const HloInstruction* operand) {
-                    return operand->opcode() == HloOpcode::kParameter &&
-                           ShouldMergeIntoUsers(operand) &&
-                           TryGetFusionParameterConstant(operand) == nullptr;
-                  })) {
+  if (absl::c_any_of(instr->operands(), [&](const HloInstruction* operand) {
+        return operand->opcode() == HloOpcode::kParameter &&
+               ShouldMergeIntoUsers(operand) &&
+               TryGetFusionParameterConstant(operand) == nullptr;
+      })) {
     return parameter_color;
   }
 
@@ -1286,7 +1284,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
                                       int64 radius) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
-  std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
+  absl::flat_hash_map<const HloInstruction*, NodeFilterResult> nodes;
   std::deque<std::pair<const HloInstruction*, /*depth*/ int64>> worklist;
   worklist.push_back({root, 0});
   while (!worklist.empty()) {
@@ -1307,7 +1305,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     // are not interesting to the graph at hand.
     if (instr == root || instr->opcode() != HloOpcode::kTuple) {
       for (const HloInstruction* operand : instr->operands()) {
-        if (!nodes.count(operand)) {
+        if (!nodes.contains(operand)) {
           worklist.push_back({operand, depth + 1});
         }
       }
@@ -1335,7 +1333,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
       continue;
     }
     for (const HloInstruction* user : instr->users()) {
-      if (!nodes.count(user)) {
+      if (!nodes.contains(user)) {
         worklist.push_back({user, depth + 1});
       }
     }
@@ -1344,7 +1342,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
   auto is_displayed = [&](const HloInstruction* instr) {
     // Constants are displayed inline with their users; they're never omitted.
     // Nodes in subcomputations are always shown.
-    return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant ||
+    return nodes.contains(instr) || instr->opcode() == HloOpcode::kConstant ||
            instr->parent() != root->parent();
   };
 
@@ -1355,12 +1353,11 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     NodeFilterResult& filter_result = kv.second;
     const auto& operands = instr->operands();
 
-    if (std::any_of(operands.begin(), operands.end(), is_displayed) &&
-        !std::all_of(operands.begin(), operands.end(), is_displayed)) {
+    if (absl::c_any_of(operands, is_displayed) &&
+        !absl::c_all_of(operands, is_displayed)) {
       // Mark nodes with some operands omitted appropriately.
       filter_result = kSomeOperandsOmitted;
-    } else if (!operands.empty() &&
-               std::none_of(operands.begin(), operands.end(), is_displayed)) {
+    } else if (!operands.empty() && absl::c_none_of(operands, is_displayed)) {
       // Mark nodes with *all* operands omitted appropriately.
       filter_result = kOmitNodeOperands;
     }
@@ -1368,8 +1365,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     // Promote nodes with type kSomeUsersOmitted to kNormalNode if all of their
     // users made it into the graph.
     if (filter_result == kSomeUsersOmitted &&
-        std::all_of(instr->users().begin(), instr->users().end(),
-                    is_displayed)) {
+        absl::c_all_of(instr->users(), is_displayed)) {
       filter_result = kNormalNode;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
index 6e1597fd03db0a78aa560340b7b9b64fe500df0c..b01c00121b3363630b83a1e49d0027a66f3a9e1a 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
@@ -17,22 +17,34 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 
 namespace xla {
+
+bool HloInputOutputAliasConfig::OutputHasAlias(
+    const ShapeIndex& output_index) const {
+  return alias_.element(output_index).has_value();
+}
+
 Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
                                              int64 param_number,
-                                             const ShapeIndex& param_index) {
+                                             const ShapeIndex& param_index,
+                                             AliasKind kind) {
+  TF_RET_CHECK(kind == AliasKind::kUserAlias || kind == AliasKind::kSystemAlias)
+      << kind;
   TF_RET_CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index))
       << absl::StrCat("Tring to set up alias at ", output_index.ToString(),
                       " which is an invalid index for shape ",
                       ShapeUtil::HumanString(alias_.shape()));
+  TF_RET_CHECK(param_number >= 0) << param_number;
+  TF_RET_CHECK(!OutputHasAlias(output_index))
+      << "Output index " << output_index << " already has an alias setup";
   // Output can't be aliased with multiple parameters.
   TF_RET_CHECK(!alias_.element(output_index)) << absl::StrFormat(
       "Trying to set up output alias for param %lld at %s but failed: output "
       "index %s is already aliased with param %lld at %s",
       param_number, param_index.ToString(), output_index.ToString(),
-      alias_.element(output_index)->first,
-      alias_.element(output_index)->second.ToString());
+      alias_.element(output_index)->parameter_number,
+      alias_.element(output_index)->parameter_index.ToString());
   (*alias_.mutable_element(output_index)) =
-      std::make_pair(param_number, param_index);
+      Alias(kind, param_number, param_index);
   VLOG(4) << "Set up alias between output index " << output_index.ToString()
           << " and parameter " << param_index << " at index "
           << param_index.ToString();
@@ -42,15 +54,24 @@ Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
 HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const {
   HloInputOutputAliasProto result;
   alias_.ForEachElement(
-      [&](const ShapeIndex& index,
-          const absl::optional<std::pair<int64, ShapeIndex>>& data) {
+      [&](const ShapeIndex& index, const absl::optional<Alias>& data) {
         if (data) {
           HloInputOutputAliasProto::AliasEntryProto entry;
+          switch (data->kind) {
+            case AliasKind::kUserAlias:
+              entry.set_kind(HloInputOutputAliasProto::USER_ALIAS);
+              break;
+            case AliasKind::kSystemAlias:
+              entry.set_kind(HloInputOutputAliasProto::SYSTEM_ALIAS);
+              break;
+            default:
+              LOG(FATAL) << "Unknown alias kind " << data->kind;
+          }
           for (int64 i : index) {
             entry.add_output_shape_index(i);
           }
-          entry.set_parameter_number(data->first);
-          for (int64 i : data->second) {
+          entry.set_parameter_number(data->parameter_number);
+          for (int64 i : data->parameter_index) {
             entry.add_parameter_shape_index(i);
           }
           result.add_entries()->Swap(&entry);
@@ -66,14 +87,18 @@ StatusOr<HloInputOutputAliasConfig> HloInputOutputAliasConfig::CreateFromProto(
        proto.entries()) {
     ShapeIndex output_index(entry.output_shape_index().begin(),
                             entry.output_shape_index().end());
-
     int64 param_number = entry.parameter_number();
     ShapeIndex param_index(entry.parameter_shape_index().begin(),
                            entry.parameter_shape_index().end());
+    // Handle backward compatibility with existing protos, which only knew of
+    // system aliases.
+    AliasKind kind = AliasKind::kSystemAlias;
+    if (entry.kind() == HloInputOutputAliasProto::USER_ALIAS) {
+      kind = AliasKind::kUserAlias;
+    }
     TF_RETURN_IF_ERROR(
-        result.SetUpAlias(output_index, param_number, param_index));
+        result.SetUpAlias(output_index, param_number, param_index, kind));
   }
-
   return result;
 }
 
@@ -81,45 +106,44 @@ string HloInputOutputAliasConfig::ToString() const {
   std::vector<string> pieces;
   pieces.push_back("HloInputOutputAliasConfig");
 
-  ForEachAlias([&](const ShapeIndex& output_index, int64 param_number,
-                   const ShapeIndex& param_index) {
+  ForEachAlias([&](const ShapeIndex& output_index, const Alias& alias) {
+    const char* kind = alias.kind == AliasKind::kUserAlias ? "USER" : "SYSTEM";
     pieces.push_back(absl::StrFormat(
-        "  OutputIndex %s is aliased with parameter %lld at %s:",
-        output_index.ToString(), param_number, param_index.ToString()));
+        "  OutputIndex %s is aliased (kind=%s) with parameter %lld at %s:",
+        output_index.ToString(), kind, alias.parameter_number,
+        alias.parameter_index.ToString()));
   });
-
   return absl::StrJoin(pieces, "\n");
 }
 
-bool HloInputOutputAliasConfig::ParameterHasAlias(
+HloInputOutputAliasConfig::AliasKind
+HloInputOutputAliasConfig::ParameterAliasKind(
     int64 param_number, const ShapeIndex& param_index) const {
-  bool output = false;
+  AliasKind kind = AliasKind::kNoAlias;
   alias_.ForEachElement(
-      [&](const xla::ShapeIndex&,
-          absl::optional<std::pair<int64, ShapeIndex>> alias) {
-        if (alias && alias->first == param_number &&
-            alias->second == param_index) {
-          output = true;
+      [&](const xla::ShapeIndex&, absl::optional<Alias> alias) {
+        if (alias && alias->parameter_number == param_number &&
+            alias->parameter_index == param_index) {
+          kind = alias->kind;
         }
       });
-  return output;
+  return kind;
 }
 
 absl::optional<ShapeIndex> HloInputOutputAliasConfig::GetAliasedOutput(
     int64 param_number, const ShapeIndex& param_index) const {
   absl::optional<ShapeIndex> output;
   alias_.ForEachElement(
-      [&](const xla::ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> alias) {
-        if (alias && alias->first == param_number &&
-            alias->second == param_index) {
+      [&](const xla::ShapeIndex& output_index, absl::optional<Alias> alias) {
+        if (alias && alias->parameter_number == param_number &&
+            alias->parameter_index == param_index) {
           output = output_index;
         }
       });
   return output;
 }
 
-absl::optional<std::pair<int64, ShapeIndex>>
+absl::optional<HloInputOutputAliasConfig::Alias>
 HloInputOutputAliasConfig::GetAliasedParameter(
     const ShapeIndex& output_index) const {
   CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index));
@@ -128,10 +152,9 @@ HloInputOutputAliasConfig::GetAliasedParameter(
 
 void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
   alias_.ForEachElement(
-      [&](const ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+      [&](const ShapeIndex& output_index, absl::optional<Alias> aliased) {
         if (aliased) {
-          fn(output_index, aliased->first, aliased->second);
+          fn(output_index, *aliased);
         }
       });
 }
@@ -139,10 +162,9 @@ void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
 Status HloInputOutputAliasConfig::ForEachAliasWithStatus(
     AliasFnWithStatus fn) const {
   return alias_.ForEachElementWithStatus(
-      [&](const ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+      [&](const ShapeIndex& output_index, absl::optional<Alias> aliased) {
         if (aliased) {
-          TF_RETURN_IF_ERROR(fn(output_index, aliased->first, aliased->second));
+          TF_RETURN_IF_ERROR(fn(output_index, *aliased));
         }
         return Status::OK();
       });
@@ -158,20 +180,19 @@ Status HloInputOutputAliasConfig::Verify(
     param_has_seen.emplace_back(param->shape());
   }
   return ForEachAliasWithStatus([&](const ShapeIndex& output_index,
-                                    int64 param_number,
-                                    const ShapeIndex& param_index) -> Status {
+                                    const Alias& alias) -> Status {
     const HloInstruction* root = entry->root_instruction();
 
-    TF_RET_CHECK(0 <= param_number);
-    TF_RET_CHECK(entry->num_parameters() > param_number);
+    TF_RET_CHECK(0 <= alias.parameter_number);
+    TF_RET_CHECK(entry->num_parameters() > alias.parameter_number);
     const Shape& param_shape =
-        entry->parameter_instruction(param_number)->shape();
+        entry->parameter_instruction(alias.parameter_number)->shape();
     const Shape& output_shape = root->shape();
-    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, param_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, alias.parameter_index));
     TF_RET_CHECK(ShapeUtil::IndexIsValid(output_shape, output_index));
 
     const Shape& param_subshape =
-        ShapeUtil::GetSubshape(param_shape, param_index);
+        ShapeUtil::GetSubshape(param_shape, alias.parameter_index);
     const Shape& output_subshape =
         ShapeUtil::GetSubshape(output_shape, output_index);
     TF_RET_CHECK(LayoutUtil::IsDenseArray(param_subshape));
@@ -182,19 +203,20 @@ Status HloInputOutputAliasConfig::Verify(
           "Expected aliased input %lld at index %s and output at index %s to "
           "have the same size. Input sub-shape is %s with size %lld, output "
           "sub-shape is %s with size %lld",
-          param_number, param_index.ToString(), output_index.ToString(),
+          alias.parameter_number, alias.parameter_index.ToString(),
+          output_index.ToString(),
           ShapeUtil::HumanStringWithLayout(param_subshape),
           size_func(param_subshape),
           ShapeUtil::HumanStringWithLayout(output_subshape),
           size_func(output_subshape));
     }
 
-    // Check each param_number and param_index pair only show up once. No
-    // input can be aliased with output buffers.
-    TF_RET_CHECK(param_has_seen[param_number].element(param_index) == false);
-
-    *(param_has_seen[param_number].mutable_element(param_index)) = true;
-
+    // Check each alias.parameter_number and alias.parameter_index pair only
+    // show up once. No input can be aliased with output buffers.
+    TF_RET_CHECK(param_has_seen[alias.parameter_number].element(
+                     alias.parameter_index) == false);
+    *(param_has_seen[alias.parameter_number].mutable_element(
+        alias.parameter_index)) = true;
     return Status::OK();
   });
 }
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
index 439676b1546c4af7f781fb80bccffd5248309b0f..b0b71dece81b561f492767db8c1ccbe3fde442d4 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -31,6 +32,28 @@ class HloModule;
 // parameter index in the entry computation.
 class HloInputOutputAliasConfig {
  public:
+  // The kind of aliases which can be set. A kUserAlias is one setup at
+  // compilation time by the user, and has to be respected. A kSystemAlias one
+  // might be setup by the compiler, if it decides it is convenient to do so.
+  enum AliasKind {
+    kNoAlias,
+    kUserAlias,
+    kSystemAlias,
+  };
+
+  // Defines the alias information for a given output buffer. A given output
+  // buffer shape index can refer only to one parameter+index.
+  struct Alias {
+    Alias(AliasKind kind, int64 parameter_number, ShapeIndex parameter_index)
+        : kind(kind),
+          parameter_number(parameter_number),
+          parameter_index(std::move(parameter_index)) {}
+
+    AliasKind kind;
+    int64 parameter_number;
+    ShapeIndex parameter_index;
+  };
+
   HloInputOutputAliasConfig() = default;
 
   explicit HloInputOutputAliasConfig(Shape shape) : alias_(shape) {}
@@ -40,12 +63,22 @@ class HloInputOutputAliasConfig {
   // Sets up alias config from `output_index` to `param_index` at
   // `param_number`.
   Status SetUpAlias(const ShapeIndex& output_index, int64 param_number,
-                    const ShapeIndex& param_index);
+                    const ShapeIndex& param_index, AliasKind kind);
+
+  // Returns the kind of alias for the given parameter number and parameter
+  // index. If no alias exists, AliasKind::kNoAlias is returned.
+  AliasKind ParameterAliasKind(int64 param_number,
+                               const ShapeIndex& param_index) const;
 
   // Returns true if the given parameter is aliased with one of the output
   // buffers.
   bool ParameterHasAlias(int64 param_number,
-                         const ShapeIndex& param_index) const;
+                         const ShapeIndex& param_index) const {
+    return ParameterAliasKind(param_number, param_index) != AliasKind::kNoAlias;
+  }
+
+  // Checks whether the provided output index has already been aliased.
+  bool OutputHasAlias(const ShapeIndex& output_index) const;
 
   // (De)Serializes an HloInputOutoutAliasConfig to/from an
   // HloInputOutoutAliasProto.
@@ -63,19 +96,17 @@ class HloInputOutputAliasConfig {
   // Returns the number of parameter and index of the parameter buffer that the
   // given output buffer index is aliased with. A nullopt is returned if there
   // is no parameter is aliased with the specific output.
-  absl::optional<std::pair<int64, ShapeIndex>> GetAliasedParameter(
+  absl::optional<Alias> GetAliasedParameter(
       const ShapeIndex& output_index) const;
 
   using AliasFn =
-      std::function<void(const ShapeIndex& output_index, int64 param_number,
-                         const ShapeIndex& param_index)>;
+      std::function<void(const ShapeIndex& output_index, const Alias&)>;
 
   // Iterates through each aliased output and input.
   void ForEachAlias(AliasFn fn) const;
 
   using AliasFnWithStatus =
-      std::function<Status(const ShapeIndex& output_index, int64 param_number,
-                           const ShapeIndex& param_index)>;
+      std::function<Status(const ShapeIndex& output_index, const Alias&)>;
 
   // Verifies that the given config is valid for the given module.
   // Specifically, the config's input and output should be in-bound and size of
@@ -90,9 +121,10 @@ class HloInputOutputAliasConfig {
  private:
   // A ShapeTree which indicates the list of buffers that's expected to be
   // aliased. The key on this shape tree represents the output index. The value
-  // is a pair of parameter number and index into the buffer. If the value is
-  // nullopt, it means there is no parameter aliasing for this output.
-  ShapeTree<absl::optional<std::pair<int64, ShapeIndex>>> alias_;
+  // is an Alias data structure which defines the input parameter coordinates.
+  // If the value is nullopt, it means there is no parameter aliasing for this
+  // output.
+  ShapeTree<absl::optional<Alias>> alias_;
 };
 
 std::ostream& operator<<(std::ostream& out,
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
index aeb9b0fdc8b6cca87731a2d4aae25120af6c3215..a46a107723de30176241aae01b268a8c10d991d3 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -45,11 +45,12 @@ class HloInputOutputAliasConfigTest : public HloTestBase {
     EXPECT_TRUE(aliased_output);
     EXPECT_EQ(aliased_output.value(), output_index);
 
-    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+    absl::optional<HloInputOutputAliasConfig::Alias> aliased_param =
         config.GetAliasedParameter(output_index);
 
     EXPECT_TRUE(aliased_param);
-    EXPECT_EQ(aliased_param.value(), std::make_pair(param_number, param_index));
+    EXPECT_EQ(aliased_param->parameter_number, param_number);
+    EXPECT_EQ(aliased_param->parameter_index, param_index);
   }
 
   void expect_not_aliased(const ShapeIndex& output_index, int64 param_number,
@@ -60,11 +61,12 @@ class HloInputOutputAliasConfigTest : public HloTestBase {
 
     EXPECT_FALSE(aliased_output && aliased_output == output_index);
 
-    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+    absl::optional<HloInputOutputAliasConfig::Alias> aliased_param =
         config.GetAliasedParameter(output_index);
 
-    EXPECT_FALSE(aliased_param && aliased_param->first == param_number &&
-                 aliased_param->second == param_index);
+    EXPECT_FALSE(aliased_param &&
+                 aliased_param->parameter_number == param_number &&
+                 aliased_param->parameter_index == param_index);
   }
 };
 
@@ -84,8 +86,10 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/1,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   expect_aliased(/*output_index=*/{0}, /*param_number=*/1,
                  /*param_index=*/{}, config);
@@ -114,11 +118,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{0}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{1}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   expect_aliased(/*output_index=*/{0}, /*param_number=*/0,
                  /*param_index=*/{0}, config);
@@ -149,11 +157,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
     return ShapeUtil::ByteSizeOf(shape);
@@ -176,8 +188,10 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
     return ShapeUtil::ByteSizeOf(shape);
@@ -200,11 +214,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  ASSERT_IS_NOT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
-                                     /*param_index=*/{}));
+  ASSERT_IS_NOT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/1,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 3e8903c95376ae1238b68280bbbb00b0db5a23a2..3c92554ad4ec48686d64c74a00f732a3bfee87bc 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
@@ -82,15 +83,14 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     return computation_map.at(proto.called_computation_ids(index));
   };
 
-  TF_RET_CHECK(std::all_of(
-      proto.operand_ids().begin(), proto.operand_ids().end(),
-      [&instruction_map](int64 id) { return instruction_map.contains(id); }))
+  TF_RET_CHECK(
+      absl::c_all_of(proto.operand_ids(),
+                     [&](int64 id) { return instruction_map.contains(id); }))
       << proto.name() << " instruction contains invalid operand id(s)";
 
-  TF_RET_CHECK(std::all_of(
-      proto.called_computation_ids().begin(),
-      proto.called_computation_ids().end(),
-      [&computation_map](int64 id) { return computation_map.contains(id); }))
+  TF_RET_CHECK(
+      absl::c_all_of(proto.called_computation_ids(),
+                     [&](int64 id) { return computation_map.contains(id); }))
       << proto.name() << " instruction references invalid computation id(s)";
 
   Shape shape(proto.shape());
@@ -311,7 +311,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           shape, operands(0), proto.exponent_bits(), proto.mantissa_bits());
       break;
     case HloOpcode::kInfeed: {
-      TF_RET_CHECK(ShapeUtil::IsTuple(shape) &&
+      TF_RET_CHECK(shape.IsTuple() &&
                    (ShapeUtil::TupleElementCount(shape) == 2))
           << "Infeed should have a tuple shape with 2 operands, but has: "
           << shape;
@@ -452,13 +452,43 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           CreatePad(shape, operands(0), operands(1), proto.padding_config());
       break;
     case HloOpcode::kDynamicSlice: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "DynamicSlice instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       std::vector<int64> slice_sizes(proto.dynamic_slice_sizes_size());
       absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
+      TF_RET_CHECK(proto.operand_ids_size() >= 1)
+          << "DynamicSlice instruction should have at least 1 operands but "
+             "sees "
+          << proto.operand_ids_size();
+      // TODO(b/118437727): Old form, make the check unconditional.
+      if (proto.operand_ids_size() != 2 || operands(1)->shape().rank() != 1) {
+        auto expected_operands = 1 + operands(0)->shape().rank();
+        TF_RET_CHECK(proto.operand_ids_size() == expected_operands)
+            << "DynamicSlice instruction should have " << expected_operands
+            << " operands, but has " << proto.operand_ids_size();
+      }
+      const auto& operand_vector = all_operands();
+      instruction = CreateDynamicSlice(
+          shape, operands(0), absl::MakeSpan(operand_vector).subspan(1),
+          slice_sizes);
+      break;
+    }
+    case HloOpcode::kDynamicUpdateSlice: {
+      TF_RET_CHECK(proto.operand_ids_size() >= 2)
+          << "DynamicUpdateSlice instruction should have at least 2 operands "
+             "but sees "
+          << proto.operand_ids_size();
+      // TODO(b/118437727): Old form, make the check unconditional.
+      if (proto.operand_ids_size() != 3 || operands(2)->shape().rank() != 1) {
+        auto expected_operands = 2 + operands(0)->shape().rank();
+        TF_RET_CHECK(proto.operand_ids_size() == expected_operands)
+            << "DynamicUpdateSlice instruction should have "
+            << expected_operands << " operands, but has "
+            << proto.operand_ids_size();
+      }
+      const auto& operand_vector = all_operands();
       instruction =
-          CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes);
+          CreateDynamicUpdateSlice(shape, operands(0), operands(1),
+                                   absl::MakeSpan(operand_vector).subspan(2));
+
       break;
     }
     case HloOpcode::kGather: {
@@ -628,7 +658,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     absl::Span<HloInstruction* const> operands) {
   if (opcode == HloOpcode::kCopy) {
     // It is impossible to copy an opaque shape, we don't know how big it is.
-    CHECK(!ShapeUtil::IsOpaque(shape));
+    CHECK(!shape.IsOpaque());
   }
   auto instruction = absl::WrapUnique(new HloInstruction(opcode, shape));
   for (auto operand : operands) {
@@ -911,17 +941,17 @@ HloInstruction::CreateAddDependency(HloInstruction* data_operand,
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDynamicSlice(
-    const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
+    const Shape& shape, HloInstruction* operand,
+    absl::Span<HloInstruction* const> start_indices,
     absl::Span<const int64> slice_sizes) {
   return absl::make_unique<HloDynamicSliceInstruction>(
       shape, operand, start_indices, slice_sizes);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
-                                         HloInstruction* operand,
-                                         HloInstruction* update,
-                                         HloInstruction* start_indices) {
+HloInstruction::CreateDynamicUpdateSlice(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    absl::Span<HloInstruction* const> start_indices) {
   return absl::make_unique<HloDynamicUpdateSliceInstruction>(
       shape, operand, update, start_indices);
 }
@@ -1039,7 +1069,7 @@ HloInstruction::CreateBroadcastSequence(
     const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
         adder) {
   CHECK(ShapeUtil::IsScalar(operand->shape()) ||
-        ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape));
+        operand->shape().rank() == output_shape.rank());
   Shape broadcast_shape = ShapeUtil::ChangeElementType(
       output_shape, operand->shape().element_type());
   // Do explicit broadcast for scalar.
@@ -1055,7 +1085,7 @@ HloInstruction::CreateBroadcastSequence(
   // Do explicit broadcast for degenerate broadcast.
   std::vector<int64> broadcast_dimensions;
   std::vector<int64> reshaped_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(operand->shape()); i++) {
+  for (int i = 0; i < operand->shape().rank(); i++) {
     if (operand->shape().dimensions(i) == output_shape.dimensions(i)) {
       broadcast_dimensions.push_back(i);
       reshaped_dimensions.push_back(operand->shape().dimensions(i));
@@ -1132,7 +1162,7 @@ HloInstruction::CreateBroadcastSequence(
 
 void HloInstruction::set_single_sharding(const HloSharding& sharding) {
   CHECK(!sharding.IsTuple()) << sharding;
-  if (ShapeUtil::IsTuple(shape())) {
+  if (shape().IsTuple()) {
     set_sharding(HloSharding::Tuple(sharding.GetAsShapeTree(shape())));
   } else {
     set_sharding(sharding);
@@ -1382,9 +1412,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateReshape(shape, new_operands[0]);
       break;
     case HloOpcode::kDynamicUpdateSlice:
-      CHECK_EQ(new_operands.size(), 3);
       clone = CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1],
-                                       new_operands[2]);
+                                       new_operands.subspan(2));
       break;
     case HloOpcode::kTuple:
       clone = CreateTuple(new_operands);
@@ -1546,12 +1575,10 @@ HloInstruction::InstructionVector HloInstruction::unique_operands() const {
 
 Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
   TF_RET_CHECK(instruction->parent() == parent());
-  if (std::find(control_successors_.begin(), control_successors_.end(),
-                instruction) == control_successors_.end()) {
+  if (!absl::c_linear_search(control_successors_, instruction)) {
     control_successors_.push_back(instruction);
-    TF_RET_CHECK(std::find(instruction->control_predecessors_.begin(),
-                           instruction->control_predecessors_.end(),
-                           this) == instruction->control_predecessors_.end());
+    TF_RET_CHECK(
+        !absl::c_linear_search(instruction->control_predecessors_, this));
     instruction->control_predecessors_.push_back(this);
   }
   return Status::OK();
@@ -1800,7 +1827,7 @@ void HloInstruction::RemoveUser(HloInstruction* user) {
   user_set_.erase(set_it);
   // This is linear in the number of the users, but a vector provides a stable
   // iteration order and much faster traversal.
-  auto vec_it = std::find(users_.begin(), users_.end(), user);
+  auto vec_it = absl::c_find(users_, user);
   CHECK(vec_it != users_.end());
   users_.erase(vec_it);
 }
@@ -1818,8 +1845,7 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
 
   RemoveUser(user);
 
-  TF_RET_CHECK(
-      std::count(user->operands_.begin(), user->operands_.end(), this) >= 0);
+  TF_RET_CHECK(absl::c_count(user->operands_, this) >= 0);
   std::replace(user->operands_.begin(), user->operands_.end(), this,
                new_producer);
   new_producer->AddUser(user);
@@ -1832,6 +1858,16 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
 
 Status HloInstruction::ReplaceOperandWith(int64 operand_num,
                                           HloInstruction* new_operand) {
+  auto old_operand = operand(operand_num);
+  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
+                                                        new_operand->shape()))
+      << old_operand->shape() << " is not compatible with "
+      << new_operand->shape();
+  return ReplaceOperandWithDifferentShape(operand_num, new_operand);
+}
+
+Status HloInstruction::ReplaceOperandWithDifferentShape(
+    int64 operand_num, HloInstruction* new_operand) {
   TF_RET_CHECK(operand_num >= 0);
   TF_RET_CHECK(operand_num < operand_count());
   HloInstruction* old_operand = mutable_operand(operand_num);
@@ -1839,17 +1875,12 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
     return Status::OK();
   }
 
-  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
-                                                        new_operand->shape()))
-      << old_operand->shape() << " is not compatible with "
-      << new_operand->shape();
   operands_[operand_num] = new_operand;
 
   VLOG(3) << "Replacing operand " << operand_num << " of " << name() << " with "
           << new_operand->name() << ", was " << old_operand->name();
 
-  if (std::find(operands_.begin(), operands_.end(), old_operand) ==
-      operands_.end()) {
+  if (!absl::c_linear_search(operands_, old_operand)) {
     old_operand->RemoveUser(this);
   }
   new_operand->AddUser(this);
@@ -1857,6 +1888,14 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
 }
 
 Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) {
+  TF_RET_CHECK(
+      ShapeUtil::CompatibleIgnoringFpPrecision(shape(), new_producer->shape()))
+      << shape() << " is not compatible with " << new_producer->shape();
+  return ReplaceAllUsesWithDifferentShape(new_producer);
+}
+
+Status HloInstruction::ReplaceAllUsesWithDifferentShape(
+    HloInstruction* new_producer) {
   bool new_producer_is_user = false;
   for (HloInstruction* user : users()) {
     if (user == new_producer) {
@@ -1881,7 +1920,8 @@ Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) {
     AddUser(new_producer);
   }
   if (parent_ && parent_->root_instruction() == this) {
-    parent_->set_root_instruction(new_producer);
+    parent_->set_root_instruction(new_producer,
+                                  /*accept_different_shape=*/true);
   }
 
   return Status::OK();
@@ -2824,7 +2864,7 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const {
       }
       return UseKind::kReuse;
     case HloOpcode::kDynamicUpdateSlice:
-      // Dynamic-update-slice reuses only operand 2 (start_indices).
+      // Dynamic-update-slice reuses only start_indices.
       if (i == 0 || i == 1) {
         return UseKind::kUse;
       }
@@ -2877,10 +2917,10 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
 
 string PaddingConfigToString(const PaddingConfig& padding) {
   bool has_interior_padding =
-      std::any_of(padding.dimensions().begin(), padding.dimensions().end(),
-                  [](const PaddingConfig::PaddingConfigDimension& dim) {
-                    return dim.interior_padding() != 0;
-                  });
+      absl::c_any_of(padding.dimensions(),
+                     [](const PaddingConfig::PaddingConfigDimension& dim) {
+                       return dim.interior_padding() != 0;
+                     });
   return StrJoin(
       padding.dimensions(), "x",
       [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 36e1ab49319a3e28143ef4d08888c68c86fbcf62..2c29b6c243bffccc346af12277dd4fc061250cbe 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -558,13 +558,14 @@ class HloInstruction {
   // 'slice_sizes'.
   static std::unique_ptr<HloInstruction> CreateDynamicSlice(
       const Shape& shape, HloInstruction* operand,
-      HloInstruction* start_indices, absl::Span<const int64> slice_sizes);
+      absl::Span<HloInstruction* const> start_indices,
+      absl::Span<const int64> slice_sizes);
 
   // Creates a dynamic update slice instruction, which updates a slice
   // of 'operand' with 'update' and 'start_indices'.
   static std::unique_ptr<HloInstruction> CreateDynamicUpdateSlice(
       const Shape& shape, HloInstruction* operand, HloInstruction* update,
-      HloInstruction* start_indices);
+      absl::Span<HloInstruction* const> start_indices);
 
   // Creates a concatenate instruction, where the operands are concatenated on
   // the provided dimension.
@@ -928,11 +929,16 @@ class HloInstruction {
   // operands of it which could be created due to this replacement.
   Status ReplaceUseWith(HloInstruction* user, HloInstruction* new_producer);
 
-  // Replaces the specified operand with new_operand.
+  // Replaces the specified operand with new_operand. The old and new operands
+  // must have compatible shapes ignoring floating-point precision.
   //
   // This function does NOT remove duplicated operands even if this instruction
   // is a fusion, so that the existing operand numbers do not change.
-  Status ReplaceOperandWith(int64 operand_no, HloInstruction* new_operand);
+  Status ReplaceOperandWith(int64 operand_num, HloInstruction* new_operand);
+
+  // Same as ReplaceOperandWith(), but new_operand can have a different shape.
+  Status ReplaceOperandWithDifferentShape(int64 operand_num,
+                                          HloInstruction* new_operand);
 
   // Replaces all uses of this instruction with the new producer. If
   // new_producer is a user of this instruction then new_producer remains a use
@@ -941,10 +947,16 @@ class HloInstruction {
   // If this instruction is the root of its computation, sets the computation's
   // root to new_producer.
   //
+  // The new producer must have a compatible shape ignoring floating-point
+  // precision.
+  //
   // If a user is a fusion instruction, this function will remove any duplicated
   // operands of it which could be created due to this replacement.
   Status ReplaceAllUsesWith(HloInstruction* new_producer);
 
+  // Same as ReplaceAllUsesWith, but new_producer can have a different shape.
+  Status ReplaceAllUsesWithDifferentShape(HloInstruction* new_producer);
+
   // Performs a postorder DFS visit using this node as the root. If
   // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when
   // complete. If ignore_control_predecessors is true, instructions only
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 8048e332cb57747286758b75773b29ba154aa888..35f031f29a7aca8db7ebe2fbcfdcebb7a778d703 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -55,13 +56,13 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleParameter(HloInstruction* parameter) override {
-    EXPECT_EQ(0, count_.count(parameter));
+    EXPECT_FALSE(count_.contains(parameter));
     count_[parameter] = GetCountsForNode(parameter);
     return Status::OK();
   }
 
   Status HandleConstant(HloInstruction* constant) override {
-    EXPECT_EQ(0, count_.count(constant));
+    EXPECT_FALSE(count_.contains(constant));
     count_[constant] = GetCountsForNode(constant);
     return Status::OK();
   }
@@ -69,25 +70,25 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   Status HandleAdd(HloInstruction* add) override {
     auto lhs = add->operand(0);
     auto rhs = add->operand(1);
-    EXPECT_EQ(0, count_.count(add));
-    EXPECT_GT(count_.count(lhs), 0);
-    EXPECT_GT(count_.count(rhs), 0);
+    EXPECT_FALSE(count_.contains(add));
+    EXPECT_TRUE(count_.contains(lhs));
+    EXPECT_TRUE(count_.contains(rhs));
     count_[add] = GetCountsForNode(add);
     return Status::OK();
   }
 
   Status HandleNegate(HloInstruction* negate) override {
     auto operand = negate->operand(0);
-    EXPECT_EQ(0, count_.count(negate));
-    EXPECT_GT(count_.count(operand), 0);
+    EXPECT_FALSE(count_.contains(negate));
+    EXPECT_TRUE(count_.contains(operand));
     count_[negate] = GetCountsForNode(negate);
     return Status::OK();
   }
 
   Status HandleMap(HloInstruction* map) override {
-    EXPECT_EQ(0, count_.count(map));
+    EXPECT_FALSE(count_.contains(map));
     for (HloInstruction* arg : map->operands()) {
-      EXPECT_GT(count_.count(arg), 0);
+      EXPECT_TRUE(count_.contains(arg));
     }
     count_[map] = GetCountsForNode(map);
     return Status::OK();
@@ -96,9 +97,9 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   Status HandleReduce(HloInstruction* reduce) override {
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
-    EXPECT_EQ(0, count_.count(reduce));
-    EXPECT_GT(count_.count(arg), 0);
-    EXPECT_GT(count_.count(init_value), 0);
+    EXPECT_FALSE(count_.contains(reduce));
+    EXPECT_TRUE(count_.contains(arg));
+    EXPECT_TRUE(count_.contains(init_value));
     count_[reduce] = GetCountsForNode(reduce);
     return Status::OK();
   }
@@ -128,7 +129,7 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   }
 
   // Counters for HLOs. Maps HLO to a NumOpsAndUsers.
-  std::unordered_map<const HloInstruction*, NumOpsAndUsers> count_;
+  absl::flat_hash_map<const HloInstruction*, NumOpsAndUsers> count_;
 };
 
 TEST_F(HloInstructionTest, BasicProperties) {
@@ -137,7 +138,7 @@ TEST_F(HloInstructionTest, BasicProperties) {
   EXPECT_EQ(HloOpcode::kParameter, parameter->opcode());
   EXPECT_TRUE(ShapeUtil::IsScalarWithElementType(parameter->shape(), F32));
   EXPECT_FALSE(ShapeUtil::IsScalarWithElementType(parameter->shape(), S32));
-  EXPECT_EQ(0, parameter->operand_count());
+  EXPECT_FALSE(parameter->operand_count());
 }
 
 TEST_F(HloInstructionTest, UserWithTwoOperands) {
@@ -981,9 +982,9 @@ TEST_F(HloInstructionTest, FunctionVisitor) {
   module->AddEntryComputation(builder.Build());
 
   int visit_num = 0;
-  std::unordered_map<HloInstruction*, int> visit_order;
+  absl::flat_hash_map<HloInstruction*, int> visit_order;
   EXPECT_IS_OK(add->Accept([&visit_num, &visit_order](HloInstruction* inst) {
-    EXPECT_EQ(0, visit_order.count(inst));
+    EXPECT_FALSE(visit_order.contains(inst));
     visit_order[inst] = visit_num;
     visit_num++;
     return Status::OK();
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 756e260b60dcda660e89c211862c8c5800439f2c..b01f01ef012b4c366035dc16b44508d71ad07d79 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -42,11 +42,9 @@ using absl::StrJoin;
 bool IsInstructionElementwiseOnOperand(const HloInstruction* instruction,
                                        const HloInstruction* operand) {
   std::vector<int64> operand_indices = instruction->OperandIndices(operand);
-  return std::all_of(
-      operand_indices.begin(), operand_indices.end(),
-      [instruction](int64 operand_index) {
-        return instruction->IsElementwiseOnOperand(operand_index);
-      });
+  return absl::c_all_of(operand_indices, [instruction](int64 operand_index) {
+    return instruction->IsElementwiseOnOperand(operand_index);
+  });
 }
 
 string PrecisionConfigToString(const PrecisionConfig& precision_config) {
@@ -385,6 +383,15 @@ HloInstructionProto HloAllReduceInstruction::ToProto() const {
   return proto;
 }
 
+bool HloAllReduceInstruction::IsNoop() const {
+  for (auto replica_group : replica_groups()) {
+    if (replica_group.replica_ids().size() != 1) {
+      return false;
+    }
+  }
+  return !all_reduce_id();
+}
+
 std::vector<string> HloAllReduceInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
   std::vector<string> result =
@@ -734,7 +741,7 @@ HloMapInstruction::HloMapInstruction(const Shape& shape,
   AppendComputation(map_computation);
   // TODO(b/65689298) Remove code below once Map is generalized to accept
   // arbitrary map dimensions.
-  dimensions_.resize(ShapeUtil::Rank(shape));
+  dimensions_.resize(shape.rank());
   std::iota(dimensions_.begin(), dimensions_.end(), 0);
 }
 
@@ -814,8 +821,7 @@ std::vector<string> HloSliceInstruction::ExtraAttributesToStringImpl(
   std::vector<string> bounds;
   bounds.reserve(slice_starts_.size());
   const bool omit_stride =
-      std::all_of(slice_strides_.begin(), slice_strides_.end(),
-                  [](int64 stride) { return stride == 1; });
+      absl::c_all_of(slice_strides_, [](int64 stride) { return stride == 1; });
   for (int i = 0; i < slice_starts_.size(); ++i) {
     string stride_str = omit_stride ? "" : StrCat(":", slice_strides_[i]);
     bounds.push_back(
@@ -866,7 +872,7 @@ void HloConstantInstruction::RelayoutConstant(const Layout& new_layout,
                                               const ShapeIndex& shape_index) {
   Shape* mutable_array_subshape =
       ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index);
-  CHECK(ShapeUtil::IsArray(*mutable_array_subshape));
+  CHECK(mutable_array_subshape->IsArray());
 
   // Normally array_subshape will always have a layout, but this invariant is
   // temporarily broken in LayoutAssignment::AssignLayouts.
@@ -900,7 +906,7 @@ string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
   string operands;
   // For constants, show the actual value in place of an empty operand list.
   if (literal_.has_value() &&
-      ((ShapeUtil::IsArray(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) ||
+      ((shape().IsArray() && ShapeUtil::ElementsIn(shape()) <= 10) ||
        options.print_large_constants())) {
     // Literal::ToString emits multidimensional arrays over multiple
     // lines. Compact this into one line by stripping out white space.
@@ -1051,8 +1057,7 @@ HloInstruction* HloFusionInstruction::AddFusionOperand(
 
 void HloFusionInstruction::MergeFusionInstruction(
     HloFusionInstruction* instruction_to_merge) {
-  CHECK(std::find(operands().begin(), operands().end(), instruction_to_merge) !=
-        operands().end());
+  CHECK(absl::c_linear_search(operands(), instruction_to_merge));
   // Clone the instruction from which to merge fused instructions.
   std::unique_ptr<HloInstruction> cloned = instruction_to_merge->Clone();
   HloFusionInstruction* cloned_fusion =
@@ -1219,8 +1224,8 @@ HloInstruction* HloFusionInstruction::CloneAndFuseInternal(
     // corresponding fused parameter instruction. Renumber parameters as
     // necessary to make parameter numbers consistent with their index in the
     // fused_parameter_ vector.
-    bool in_operand_list = std::find(operands().begin(), operands().end(),
-                                     instruction_to_fuse) != operands().end();
+    bool in_operand_list =
+        absl::c_linear_search(operands(), instruction_to_fuse);
     CHECK(add_output || in_operand_list);
     if (instruction_to_fuse->opcode() == HloOpcode::kTuple) {
       // We assume all uses of a kTuple operation are GTE ops, not another
@@ -1324,7 +1329,7 @@ HloInstruction* HloFusionInstruction::CloneAndFuseInternal(
     if (newly_created_tuple_instr) {
       HloInstruction* new_instr = parent()->AddInstruction(
           HloInstruction::CreateGetTupleElement(fused_root->shape(), this, 0));
-      TF_CHECK_OK(ReplaceAllUsesWith(new_instr));
+      TF_CHECK_OK(ReplaceAllUsesWithDifferentShape(new_instr));
     }
     int64 index = tuple_elements.size();
     if (instruction_to_fuse->opcode() == HloOpcode::kTuple) {
@@ -1706,6 +1711,10 @@ std::vector<string> HloConvolutionInstruction::ExtraAttributesToStringImpl(
     extra.push_back(StrCat("feature_group_count=", feature_group_count_));
   }
 
+  if (batch_group_count_ != 1) {
+    extra.push_back(StrCat("batch_group_count=", batch_group_count_));
+  }
+
   string precision_config_string = PrecisionConfigToString(precision_config_);
   if (!precision_config_string.empty()) {
     extra.push_back(precision_config_string);
@@ -2007,6 +2016,18 @@ HloDynamicSliceInstruction::HloDynamicSliceInstruction(
   AppendOperand(start_indices);
 }
 
+HloDynamicSliceInstruction::HloDynamicSliceInstruction(
+    const Shape& shape, HloInstruction* operand,
+    absl::Span<HloInstruction* const> start_indices,
+    absl::Span<const int64> slice_sizes)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicSlice, shape),
+      dynamic_slice_sizes_(slice_sizes.begin(), slice_sizes.end()) {
+  AppendOperand(operand);
+  for (HloInstruction* index : start_indices) {
+    AppendOperand(index);
+  }
+}
+
 HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
     const Shape& shape, HloInstruction* operand, HloInstruction* update,
     HloInstruction* start_indices)
@@ -2016,6 +2037,17 @@ HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
   AppendOperand(start_indices);
 }
 
+HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    absl::Span<HloInstruction* const> start_indices)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicUpdateSlice, shape) {
+  AppendOperand(operand);
+  AppendOperand(update);
+  for (HloInstruction* index : start_indices) {
+    AppendOperand(index);
+  }
+}
+
 HloInstructionProto HloDynamicSliceInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   for (int64 slice_size : dynamic_slice_sizes_) {
@@ -2041,9 +2073,14 @@ std::unique_ptr<HloInstruction>
 HloDynamicSliceInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
-  CHECK_EQ(new_operands.size(), 2);
-  return absl::make_unique<HloDynamicSliceInstruction>(
-      shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
+  if (new_operands.size() == 2 && new_operands[1]->shape().rank() == 1) {
+    // TODO(b/118437727): Old form, remove this path.
+    return absl::make_unique<HloDynamicSliceInstruction>(
+        shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
+  } else {
+    return absl::make_unique<HloDynamicSliceInstruction>(
+        shape, new_operands[0], new_operands.subspan(1), dynamic_slice_sizes_);
+  }
 }
 
 HloGatherInstruction::HloGatherInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index ca212c7f2c98f75ceefc14b7fbc2a1f530c06cf7..1b4a94753cda8aba8d50836b9d51b7c3fd5807f6 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -253,6 +253,10 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  // Returns true if the AllReduce does no communication, so it's equivalent
+  // to a mem copy.
+  bool IsNoop() const;
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -1183,7 +1187,22 @@ class HloDynamicIndexInstruction : public HloInstruction {
  public:
   explicit HloDynamicIndexInstruction(HloOpcode opcode, const Shape& shape)
       : HloInstruction(opcode, shape) {}
-  virtual int64 index_operand_number() const = 0;
+  virtual int64 first_index_operand_number() const = 0;
+
+  // Returns a subspan of operands which represent the start indices.
+  absl::Span<HloInstruction* const> index_operands() const {
+    return absl::MakeSpan(operands()).subspan(first_index_operand_number());
+  }
+
+  // Returns the shapes of the index operands.
+  std::vector<Shape> index_shapes() const {
+    std::vector<Shape> shapes;
+    auto indices = index_operands();
+    for (const HloInstruction* index : indices) {
+      shapes.push_back(index->shape());
+    }
+    return shapes;
+  }
 };
 
 class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
@@ -1192,6 +1211,10 @@ class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
                                       HloInstruction* operand,
                                       HloInstruction* start_indices,
                                       absl::Span<const int64> slice_sizes);
+  explicit HloDynamicSliceInstruction(
+      const Shape& shape, HloInstruction* operand,
+      absl::Span<HloInstruction* const> start_indices,
+      absl::Span<const int64> slice_sizes);
   // Old methods kept for smooth subclassing transition END.
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
@@ -1204,7 +1227,7 @@ class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
-  int64 index_operand_number() const override { return 1; }
+  int64 first_index_operand_number() const override { return 1; }
 
  private:
   std::vector<string> ExtraAttributesToStringImpl(
@@ -1229,8 +1252,11 @@ class HloDynamicUpdateSliceInstruction : public HloDynamicIndexInstruction {
                                             HloInstruction* operand,
                                             HloInstruction* update,
                                             HloInstruction* start_indices);
+  explicit HloDynamicUpdateSliceInstruction(
+      const Shape& shape, HloInstruction* operand, HloInstruction* update,
+      absl::Span<HloInstruction* const> start_indices);
 
-  int64 index_operand_number() const override { return 2; }
+  int64 first_index_operand_number() const override { return 2; }
 };
 
 class HloGatherInstruction : public HloInstruction {
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index dc712e5e42c449737bf4415f3a5e3eb9d81d9be4..798760885dcd55e0a1cbdf403fa160347d67fc3a 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/escaping.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
@@ -37,8 +38,8 @@ constexpr int kError = -2;
 
 // [a-zA-Z0-9_.-]
 bool IsIdentifierChar(char c) {
-  return isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '.' ||
-         c == '_';
+  return absl::ascii_isalnum(static_cast<unsigned char>(c)) || c == '-' ||
+         c == '.' || c == '_';
 }
 
 }  // namespace
@@ -105,7 +106,7 @@ TokKind HloLexer::LexToken() {
     switch (current_char) {
       default:
         // [a-zA-Z_]
-        if (isalpha(static_cast<unsigned char>(current_char)) ||
+        if (absl::ascii_isalpha(static_cast<unsigned char>(current_char)) ||
             current_char == '_') {
           return LexIdentifier();
         }
@@ -140,6 +141,12 @@ TokKind HloLexer::LexToken() {
         return LexNumberOrPattern();
       case '=':
         return TokKind::kEqual;
+      case '<':
+        if (current_char == '<' && PeekCurrentChar() == '=') {
+          current_ptr_++;
+          return TokKind::kLeq;
+        }
+        return TokKind::kError;
       case ',':
         return TokKind::kComma;
       case '%':
@@ -294,7 +301,7 @@ TokKind HloLexer::LexIdentifier() {
 // name ::= [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexPercent() {
   const char* name_start = current_ptr_;
-  if (isalpha(static_cast<unsigned char>(PeekCurrentChar())) ||
+  if (absl::ascii_isalpha(static_cast<unsigned char>(PeekCurrentChar())) ||
       PeekCurrentChar() == '_') {
     current_ptr_++;
     while (IsIdentifierChar(PeekCurrentChar())) {
@@ -462,6 +469,8 @@ string TokKindToString(TokKind kind) {
       return "kRparen";
     case TokKind::kArrow:
       return "kArrow";
+    case TokKind::kLeq:
+      return "kLeq";
     case TokKind::kw_HloModule:
       return "kw_HloModule";
     case TokKind::kw_ENTRY:
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index 41f5043904a2622814154693679a0e27cb92f642..94fac3cd8e9da7f273e7e521e21510f5188702e6 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -49,6 +49,7 @@ enum class TokKind {
   kRparen,  // (  )
 
   kArrow,  // ->
+  kLeq,    // <=
 
   // Keywords
   kw_HloModule,
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
index 5bf055f3c012fef687cdc275d62efdf2d4cd5e5c..e14bcfa7f67e736a4d04f5b236fb2df02cf150e0 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <deque>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -36,11 +37,11 @@ namespace xla {
 namespace {
 
 using Worklist = std::deque<const HloInstruction*>;
-using Workset = std::unordered_set<const HloInstruction*>;
+using Workset = absl::flat_hash_set<const HloInstruction*>;
 
 void AddToWorklist(const HloInstruction* instruction, Worklist* worklist,
                    Workset* workset) {
-  if (workset->count(instruction) == 0) {
+  if (!workset->contains(instruction)) {
     worklist->push_back(instruction);
     workset->insert(instruction);
     VLOG(3) << "ADD instruction: " << instruction->name();
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index 7227bfb27c74758d2b79e404afc9eb97a1ca894d..76cc29cbb7848eb424d07abf11a95ffd59e9eed6 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -118,7 +118,7 @@ class HloTrivialScheduler : public HloModulePass {
 };
 
 // A trivial pass which clears the schedule currently set on the
-// HloModule. After this pass runs HloModudle::has_schedule will return false.
+// HloModule. After this pass runs HloModule::has_schedule will return false.
 class HloDescheduler : public HloModulePass {
  public:
   HloDescheduler() = default;
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index fe8371384c0fa3900a9022f101ff0b296439cf16..258f918f47a313b4b89fb260457b1b119dc16177 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -107,11 +107,10 @@ HloComputation* HloModule::AddEntryComputation(
 }
 
 Status HloModule::RemoveEmbeddedComputation(HloComputation* to_remove) {
-  auto it =
-      std::find_if(computations_.begin(), computations_.end(),
-                   [&to_remove](const std::unique_ptr<HloComputation>& comp) {
-                     return comp.get() == to_remove;
-                   });
+  auto it = absl::c_find_if(
+      computations_, [&to_remove](const std::unique_ptr<HloComputation>& comp) {
+        return comp.get() == to_remove;
+      });
   TF_RET_CHECK(it->get() == to_remove);
   computations_.erase(it);
   return Status::OK();
@@ -304,11 +303,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   auto module = absl::make_unique<HloModule>(proto.name(), module_config);
 
   // Sort the computations in the proto id's order.
-  std::sort(computations.begin(), computations.end(),
-            [&](const std::unique_ptr<HloComputation>& a,
-                const std::unique_ptr<HloComputation>& b) {
-              return to_proto_id[a.get()] < to_proto_id[b.get()];
-            });
+  absl::c_sort(computations, [&](const std::unique_ptr<HloComputation>& a,
+                                 const std::unique_ptr<HloComputation>& b) {
+    return to_proto_id[a.get()] < to_proto_id[b.get()];
+  });
 
   // Add sorted computations to the module.
   for (auto& computation : computations) {
@@ -392,15 +390,12 @@ namespace {
 // Returns whether `hlo` is used outside the given subcomputation.
 // `instructions_in_subcomputation` is the instruction set of the given
 // subcomputation.
-bool IsUsedOutsideSubcomputation(
-    const HloInstruction& hlo,
-    const std::unordered_set<HloInstruction*>& instructions_in_subcomputation) {
-  for (HloInstruction* user : hlo.users()) {
-    if (!instructions_in_subcomputation.count(user)) {
-      return true;
-    }
-  }
-  return false;
+bool IsUsedOutsideSubcomputation(const HloInstruction& hlo,
+                                 const absl::flat_hash_set<HloInstruction*>&
+                                     instructions_in_subcomputation) {
+  return absl::c_any_of(hlo.users(), [&](HloInstruction* user) {
+    return !instructions_in_subcomputation.contains(user);
+  });
 }
 }  // anonymous namespace
 
@@ -411,9 +406,9 @@ HloInstruction* HloModule::OutlineExpressionFromComputation(
 
   // A map from original instructions to their counterparts in the new outlined
   // function.
-  std::unordered_map<HloInstruction*, HloInstruction*> outlined_instructions;
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> outlined_instructions;
   // A set that contains all instructions to be outlined.
-  std::unordered_set<HloInstruction*> instruction_set_to_outline(
+  absl::flat_hash_set<HloInstruction*> instruction_set_to_outline(
       instructions_to_outline.begin(), instructions_to_outline.end());
   std::vector<HloInstruction*> arguments;
   std::vector<HloInstruction*> outputs;
@@ -502,7 +497,7 @@ std::vector<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // First determine all root computations by building a set of nonroot
   // computations (computations which are called by an instruction in the
   // module).
-  std::set<HloComputation*> nonroot_computations;
+  absl::flat_hash_set<HloComputation*> nonroot_computations;
   for (auto& computation : computations_) {
     for (auto* instruction : computation->instructions()) {
       for (HloComputation* called_computation :
@@ -515,19 +510,19 @@ std::vector<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // Keep track of computations which have already been added to the post
   // order. This prevents duplication as an embedded computation may be called
   // from two different root computations.
-  std::set<HloComputation*> added_computations;
+  absl::flat_hash_set<HloComputation*> added_computations;
   std::vector<HloComputation*> post_order;
   for (auto& computation : computations_) {
-    if (nonroot_computations.count(computation.get()) == 0) {
+    if (!nonroot_computations.contains(computation.get())) {
       for (HloComputation* embedded_computation :
            computation->MakeEmbeddedComputationsList()) {
-        if (added_computations.count(embedded_computation) == 0) {
+        if (!added_computations.contains(embedded_computation)) {
           post_order.push_back(embedded_computation);
           added_computations.insert(embedded_computation);
         }
       }
       // Root computations should only be encountered once.
-      CHECK_EQ(0, added_computations.count(computation.get()));
+      CHECK(!added_computations.contains(computation.get()));
       post_order.push_back(computation.get());
       added_computations.insert(computation.get());
     }
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.cc b/tensorflow/compiler/xla/service/hlo_module_dce.cc
index 31d26cc51e8217234526bbfeb83510aadf2c27b5..6b72ba128664d27c51aa8dcfa61fe959a0160c73 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce.cc
@@ -49,7 +49,7 @@ StatusOr<bool> RunWhileDCE(HloModule* module, HloLivenessAnalysis* liveness) {
       auto* while_body_param = while_body_comp->parameter_instruction(0);
       auto* while_body_root = while_body_comp->root_instruction();
 
-      if (!ShapeUtil::IsTuple(xla_while->shape()) ||
+      if (!xla_while->shape().IsTuple() ||
           while_body_root->opcode() != HloOpcode::kTuple) {
         // Only run DCE on tuple-shaped while loops where body root is Tuple,
         // with no I/O instructions.
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
index e535b7d74943943069b4d795cf999a3b1e963360..f6e2866204955ac024c2b6f972de449cc3df4c15 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -38,9 +38,7 @@ class HloModuleDceTest : public HloTestBase {
   // Returns whether the given instruction exists in the given computation.
   bool HasInstruction(const HloComputation& computation,
                       const HloInstruction* instruction) {
-    return std::find(computation.instructions().begin(),
-                     computation.instructions().end(),
-                     instruction) != computation.instructions().end();
+    return absl::c_linear_search(computation.instructions(), instruction);
   }
 
   // Returns whether the while instruction with name 'while_name' in
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index b4aac4c8076cb69647d42c6243bc969d06d0709e..47734bc55cc00d605f4e318400be88639450343c 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -79,36 +79,36 @@ Status HloModuleGroupMetadata::Build() {
       return Status::OK();
     }
 
-    std::vector<HloComputation*> peers;
-    if (IsChannelInstruction(hlo)) {
-      peers.push_back(PeerComputation(hlo));
-    } else if (hlo->IsCrossModuleAllReduce()) {
-      for (HloInstruction* instr : GetAllReduceGroup(*hlo->all_reduce_id())) {
-        if (instr == hlo) {
-          continue;
+    if (IsChannelInstruction(hlo) || hlo->IsCrossModuleAllReduce()) {
+      std::vector<HloComputation*> peers;
+      if (IsChannelInstruction(hlo)) {
+        peers.push_back(PeerComputation(hlo));
+      } else if (hlo->IsCrossModuleAllReduce()) {
+        for (HloInstruction* instr : GetAllReduceGroup(*hlo->all_reduce_id())) {
+          if (instr == hlo) {
+            continue;
+          }
+          peers.push_back(instr->parent());
         }
-        peers.push_back(instr->parent());
       }
-    }
-
-    // Add the parent computation of this channel (or all-reduce) instruction
-    // and its peer computation(s) (both must be while computations) as
-    // companions.
-    for (HloComputation* peer_computation : peers) {
-      const TrackedInstruction* peer_tracked =
-          GetTrackedInstruction(peer_computation);
-      TF_RET_CHECK(peer_tracked != nullptr)
-          << "Peer instruction is not a possible companion";
-      TF_RET_CHECK(*tracked == *peer_tracked)
-          << "Peer instruction does not match the computation kind";
-      TF_RETURN_IF_ERROR(
-          AddCompanion(tracked->instruction(), peer_tracked->instruction()));
-      tracked_instructions_comms_[tracked->instruction()].push_back(hlo);
-    }
 
-    // Add the parents of companion instructions (they must be all of the same
-    // kind of instructions, opcode wise) as companions.
-    if (IsCompanionInstruction(hlo)) {
+      // Add the parent computation of this channel (or all-reduce) instruction
+      // and its peer computation(s) (both must be while computations) as
+      // companions.
+      for (HloComputation* peer_computation : peers) {
+        const TrackedInstruction* peer_tracked =
+            GetTrackedInstruction(peer_computation);
+        TF_RET_CHECK(peer_tracked != nullptr)
+            << "Peer instruction is not a possible companion";
+        TF_RET_CHECK(*tracked == *peer_tracked)
+            << "Peer instruction does not match the computation kind";
+        TF_RETURN_IF_ERROR(
+            AddCompanion(tracked->instruction(), peer_tracked->instruction()));
+        tracked_instructions_comms_[tracked->instruction()].push_back(hlo);
+      }
+    } else if (IsCompanionInstruction(hlo)) {
+      // Add the parents of companion instructions (they must be all of the same
+      // kind of instructions, opcode wise) as companions.
       for (HloInstruction* companion : Companions(hlo)) {
         const TrackedInstruction* companion_tracked =
             GetTrackedInstruction(companion->parent());
@@ -118,6 +118,7 @@ Status HloModuleGroupMetadata::Build() {
                                         companion_tracked->instruction()));
       }
     }
+
     return Status::OK();
   };
 
@@ -198,7 +199,7 @@ bool HloModuleGroupMetadata::IsChannelInstruction(
 }
 
 bool HloModuleGroupMetadata::IsCompanionInstruction(HloInstruction* hlo) const {
-  return companion_set_index_.count(hlo) > 0;
+  return companion_set_index_.contains(hlo);
 }
 
 bool HloModuleGroupMetadata::InstructionCommunicates(
@@ -509,7 +510,7 @@ Status HloModuleGroupMetadata::CheckCommunicatingInstruction(
   HloComputation* computation = instruction->parent();
   const HloModule* module = computation->parent();
   if (module->entry_computation() == computation ||
-      tracked_instructions_.count(computation) > 0) {
+      tracked_instructions_.contains(computation)) {
     return Status::OK();
   }
   return FailedPrecondition("channel is used in disallowed computation");
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 928df0f5a7444ad877961a5de970c752e1d024da..3ed95c10504141139d83eb8679a0b8144b15ad0d 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -38,7 +38,7 @@ namespace xla {
 // Class for bookkeeping the information on the given modules, in particular on
 // the interaction between computations.
 //
-// Companion instructions are one of the information collected as we build the
+// Companion instructions are one piece of information collected as we build the
 // metadata. For example, for each While instruction, companion instructions
 // refer to a set of While instructions in other computations that communicate
 // with each other.
@@ -51,6 +51,13 @@ namespace xla {
 // }                          While_4() { Recv(0) }
 //                          }
 //
+// Each instruction can belong to at most one companion set: While_0 and While_5
+// are in the same set even though they don't communicate with each other,
+// because they both communicate with While_2.
+//
+// A send and the matching recv must both have the same level of nesting of
+// companion instructions.
+//
 // Companion instructions are used to detect cycles in the graph and also for
 // global scheduling.
 class HloModuleGroupMetadata {
@@ -171,7 +178,7 @@ class HloModuleGroupMetadata {
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::vector<HloInstruction*>& Companions(
       const HloInstruction* instruction) const {
-    CHECK_EQ(companion_set_index_.count(instruction), 1);
+    CHECK(companion_set_index_.contains(instruction));
     return companion_set(companion_set_index_.at(instruction));
   }
 
@@ -215,11 +222,8 @@ class HloModuleGroupMetadata {
   // * Each channel has all 4 instructions (Send, Recv, SendDone, RecvDone).
   // * The shape of channel instructions match.
   // * The nest level of channel instructions match.
-  // * Channel instructions are used in allowed computations; i.e., in the
+  // * Channel instructions are used in allowed computations, i.e., in the
   //   entry computation of the module or condition/body of While computations.
-  //
-  // TODO(b/62064342): Currently, HloModuleGroupScheduler checks if there is a
-  // cycle in the graph, but it would be good to verify here.
   Status VerifyChannelInstructions();
 
   // Adds metadata that the given two instructions are companions.
@@ -231,8 +235,8 @@ class HloModuleGroupMetadata {
   Status CheckCommunicatingInstruction(HloInstruction* instruction) const;
 
   // Performs a consistency check on the companion sets built for the input
-  // modules. Check that a companion set does not include instructions from the
-  // same module/device.
+  // modules. Checks that each instruction in a companion set is in a different
+  // module/device.
   Status VerifyCompanionSets() const;
 
   // Retrieves a pointer to the stored TrackedInstruction associated with a
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index fddeb5f0a27a43ff9ca8b2b5d314bcfe91aaf0e6..91417bd2d9a6ca8a5192a37302e6a91e49a94d77 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -198,6 +198,8 @@ std::vector<HloInstruction*> HloModuleGroupUtil::RootInstructions(
   for (HloComputation* computation : computations) {
     for (HloInstruction* instruction : computation->instructions()) {
       if (GlobalSuccessors(instruction).empty()) {
+        // An instruction that has no successors, e.g., an unused instruction,
+        // is in roots, even though it's not the ROOT of its computation.
         roots.push_back(instruction);
       }
     }
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.h b/tensorflow/compiler/xla/service/hlo_module_group_util.h
index f21b44bcd98d77b831de5d8a6afa4f9ddd91d15d..862666b48c9aa423ba4eeea3052c17fcc1064fd2 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.h
@@ -49,7 +49,7 @@ class HloModuleGroupUtil {
   // Returns all unique successors of the instruction. This includes:
   // * successors in the same computation: users and control successors
   // * Send is a successor of Recv
-  // * RecvDone is a predecessor of Send
+  // * RecvDone is a successor of Send
   // * successors of companions (if the instruction is a companion while)
   // * successors' companions (for any successor that is a companion while)
   std::vector<HloInstruction*> GlobalSuccessors(HloInstruction* instruction);
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index ca6a154809be46d6a0305c29e2b89219de408019..0cec61c257bb84e467290fb52ec9063a32ed558d 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -367,7 +367,7 @@ bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
     const HloInstruction* a, const HloInstruction* b) const {
   CHECK_EQ(a->parent(), b->parent());
   // If either instruction is not in the order, then 'a' and 'b' are unordered.
-  if (order_position_.count(a) == 0 || order_position_.count(b) == 0) {
+  if (!order_position_.contains(a) || !order_position_.contains(b)) {
     return false;
   }
   return order_position_.at(a) < order_position_.at(b);
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 44643951c14fb3a210b27064ffac4b99734bca0a..638396308c2a9c1f20e47f78b594d54f07c0c4e5 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -257,7 +257,8 @@ class HloParser {
   bool ParseName(string* result);
   bool ParseAttributeName(string* result);
   bool ParseString(string* result);
-  bool ParseDimensionSizes(std::vector<int64>* dimension_sizes);
+  bool ParseDimensionSizes(std::vector<int64>* dimension_sizes,
+                           std::vector<bool>* dynamic_dimensions);
   bool ParseShape(Shape* result);
   bool ParseLayout(Layout* layout);
   bool ParseOpcode(HloOpcode* result);
@@ -1170,24 +1171,39 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<std::vector<tensorflow::int64>> dynamic_slice_sizes;
       attrs["dynamic_slice_sizes"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
-      if (!ParseOperands(&operands, /*expected_size=*/2) ||
-          !ParseAttributes(attrs)) {
+      LocTy loc = lexer_.GetLoc();
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
+      if (operands.empty()) {
+        return Error(loc, "Expected at least one operand.");
+      }
+      if (!(operands.size() == 2 && operands[1]->shape().rank() == 1) &&
+          operands.size() != 1 + operands[0]->shape().rank()) {
+        return Error(loc, "Wrong number of operands.");
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice(
-          shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
+          shape, /*operand=*/operands[0],
+          /*start_indices=*/absl::MakeSpan(operands).subspan(1),
           *dynamic_slice_sizes));
       break;
     }
     case HloOpcode::kDynamicUpdateSlice: {
-      if (!ParseOperands(&operands, /*expected_size=*/3) ||
-          !ParseAttributes(attrs)) {
+      LocTy loc = lexer_.GetLoc();
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
+      if (operands.size() < 2) {
+        return Error(loc, "Expected at least two operands.");
+      }
+      if (!(operands.size() == 3 && operands[2]->shape().rank() == 1) &&
+          operands.size() != 2 + operands[0]->shape().rank()) {
+        return Error(loc, "Wrong number of operands.");
+      }
       instruction =
           builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
               shape, /*operand=*/operands[0], /*update=*/operands[1],
-              /*start_indices=*/operands[2]));
+              /*start_indices=*/absl::MakeSpan(operands).subspan(2)));
       break;
     }
     case HloOpcode::kTranspose: {
@@ -1287,7 +1303,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       // the infeed instruction. ShapeUtil::GetTupleElementShape will check fail
       // if the shape is not a non-empty tuple, so add guard so an error message
       // can be emitted instead of a check fail
-      if (!ShapeUtil::IsTuple(shape) && !ShapeUtil::IsEmptyTuple(shape)) {
+      if (!shape.IsTuple() && !ShapeUtil::IsEmptyTuple(shape)) {
         return Error(lexer_.GetLoc(),
                      "infeed must have a non-empty tuple shape");
       }
@@ -1931,8 +1947,8 @@ bool HloParser::SetValueInLiteralHelper(ParsedElemT value,
 //  ::= tuple
 //  ::= non_tuple
 bool HloParser::ParseLiteral(Literal* literal, const Shape& shape) {
-  return ShapeUtil::IsTuple(shape) ? ParseTupleLiteral(literal, shape)
-                                   : ParseNonTupleLiteral(literal, shape);
+  return shape.IsTuple() ? ParseTupleLiteral(literal, shape)
+                         : ParseNonTupleLiteral(literal, shape);
 }
 
 // tuple
@@ -1980,7 +1996,7 @@ bool HloParser::ParseNonTupleLiteral(Literal* literal, const Shape& shape) {
 }
 
 bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
-  const tensorflow::int64 rank = ShapeUtil::Rank(shape);
+  const tensorflow::int64 rank = shape.rank();
   // Create a literal with the given shape in default layout.
   *literal = LiteralUtil::CreateFromDimensions(
       shape.element_type(), AsInt64Slice(shape.dimensions()));
@@ -2145,7 +2161,7 @@ template <typename LiteralNativeT>
 bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
   std::vector<tensorflow::int64> index;
 
-  tensorflow::int64 rank = ShapeUtil::Rank(shape);
+  tensorflow::int64 rank = shape.rank();
 
   *literal = Literal(shape);
 
@@ -2730,7 +2746,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   }
 
   auto is_unique = [](string str) -> bool {
-    std::sort(str.begin(), str.end());
+    absl::c_sort(str);
     return std::unique(str.begin(), str.end()) == str.end();
   };
 
@@ -2971,14 +2987,25 @@ bool HloParser::ParseParamList() {
   return ParseToken(TokKind::kRparen, "expects ')' at the end of param list");
 }
 
-// dimension_sizes ::= '[' int64_list ']'
-bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes) {
+// dimension_sizes ::= '[' dimension_list ']'
+// dimension_list
+//   ::= /*empty*/
+//   ::= <=? int64 (',' param)*
+// param ::= name shape
+bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes,
+                                    std::vector<bool>* dynamic_dimensions) {
   auto parse_and_add_item = [&]() {
     tensorflow::int64 i;
+    bool is_dynamic = false;
+    if (lexer_.GetKind() == TokKind::kLeq) {
+      is_dynamic = true;
+      lexer_.Lex();
+    }
     if (!ParseInt64(&i)) {
       return false;
     }
     dimension_sizes->push_back(i);
+    dynamic_dimensions->push_back(is_dynamic);
     return true;
   };
   return ParseList(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma,
@@ -3034,12 +3061,18 @@ bool HloParser::ParseShape(Shape* result) {
   PrimitiveType primitive_type = lexer_.GetPrimitiveTypeVal();
   lexer_.Lex();
 
+  // Each element contains a dimension size and a bool indicating whether this
+  // is a dynamic dimension.
   std::vector<int64> dimension_sizes;
-  if (!ParseDimensionSizes(&dimension_sizes)) {
+  std::vector<bool> dynamic_dimensions;
+  if (!ParseDimensionSizes(&dimension_sizes, &dynamic_dimensions)) {
     return false;
   }
   result->set_element_type(primitive_type);
-  *result->mutable_dimensions() = dimension_sizes;
+  for (int i = 0; i < dimension_sizes.size(); ++i) {
+    result->add_dimensions(dimension_sizes[i]);
+    result->set_dynamic_dimension(i, dynamic_dimensions[i]);
+  }
   LayoutUtil::SetToDefaultLayout(result);
 
   if (lexer_.GetKind() == TokKind::kw_sparse) {
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index ef31cec32770690505b437d8678c45150766e559..6ba16cc82ac1da2a30610d9dfb56cacc100ae05f 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -551,6 +551,17 @@ ENTRY %Transpose.v2 () -> s32[1,2,3] {
   ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2}
 }
 
+)"
+},
+{
+"TransposeC128",
+R"(HloModule TransposeC128_module
+
+ENTRY %Transpose.v3 (input: c128[1,2,3]) -> c128[1,2,3] {
+  %input = c128[1,2,3]{2,1,0} parameter(0)
+  ROOT %transpose = c128[1,2,3]{2,1,0} transpose(c128[1,2,3]{2,1,0} %input), dimensions={0,1,2}
+}
+
 )"
 },
 // Dynamic slice
@@ -566,12 +577,26 @@ ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -
   ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258}
 }
 
+)"
+},
+// Dynamic slice with scalar indices
+{
+"DynamicSliceScalarIndices",
+R"(HloModule DynamicSlice_module
+
+ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[]) -> s32[2,2,258] {
+  %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
+  %constant = s32[] constant(0)
+  %start_index = s32[] parameter(1)
+  ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[] %constant, s32[] %constant, s32[] %start_index), dynamic_slice_sizes={2,2,258}
+}
+
 )"
 },
 // Dynamic update slice
 {
 "DynamicUpdateSlice",
-R"(HloModule DynamicUpdateSlice_module
+R"(HloModule DynamicSlice_module
 
 ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
   %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
@@ -580,6 +605,23 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
   ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices)
 }
 
+)"
+},
+// Dynamic update slice with scalar indices
+{
+"DynamicUpdateSliceScalarIndex",
+R"(HloModule DynamicUpdateSlice_module
+
+ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_index.0: s32[], start_index.1: s32[], start_index.2: s32[], start_index.3: s32[]) -> s32[1,1,25,1] {
+  %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+  %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+  %start_index.0 = s32[] parameter(2)
+  %start_index.1 = s32[] parameter(3)
+  %start_index.2 = s32[] parameter(4)
+  %start_index.3 = s32[] parameter(5)
+  ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[] %start_index.0, s32[] %start_index.1, s32[] %start_index.2, s32[] %start_index.3)
+}
+
 )"
 },
 // batch norm training
@@ -1329,20 +1371,20 @@ TEST_P(HloParserTestLongProto, Run) { ExpectEqual(); }
 TEST_P(HloParserTestShort, Run) { ExpectEqual(); }
 TEST_P(HloParserTestShortProto, Run) { ExpectEqual(); }
 
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTestLong,
-                        ::testing::ValuesIn(CreateTestCases()),
-                        TestDataToString);
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation,
-                        HloParserTestLongProto,
-                        ::testing::ValuesIn(CreateTestCases()),
-                        TestDataToString);
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTestShort,
-                        ::testing::ValuesIn(CreateShortTestCases()),
-                        TestDataToString);
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation,
-                        HloParserTestShortProto,
-                        ::testing::ValuesIn(CreateShortTestCases()),
-                        TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation, HloParserTestLong,
+                         ::testing::ValuesIn(CreateTestCases()),
+                         TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation,
+                         HloParserTestLongProto,
+                         ::testing::ValuesIn(CreateTestCases()),
+                         TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation, HloParserTestShort,
+                         ::testing::ValuesIn(CreateShortTestCases()),
+                         TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation,
+                         HloParserTestShortProto,
+                         ::testing::ValuesIn(CreateShortTestCases()),
+                         TestDataToString);
 
 class HloParserTest : public ::testing::Test {
  protected:
@@ -2329,5 +2371,25 @@ TEST_F(HloParserTest, ParseInvalidShapeString) {
   }
 }
 
+TEST_F(HloParserTest, ParseDynamicArray) {
+  string shape_string = "f32[123,<=456]";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShape(F32, {123, 456}, {false, true});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseDynamicTuple) {
+  string shape_string = "(f32[42], u32[<=123,<=456])";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {42}),
+       ShapeUtil::MakeShape(U32, {123, 456}, {true, true})});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h
index 791b1a97b0b82edf19ff1588fd8d5d996ac0fef4..35dc9c0029f9871334cb500c6b71f0c86ab136d7 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_fix.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <algorithm>
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_group.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,9 +40,36 @@ class HloPassFix : public Pass {
     int64 iteration_count = 0;
     int64 limit =
         std::max(static_cast<int64>(1000), module->instruction_count());
+    VLOG(3) << "Running HloPassFix.";
     while (changed_this_iteration) {
       TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module));
       changed |= changed_this_iteration;
+      VLOG(3) << "changed_this_iteration: " << changed_this_iteration;
+      ++iteration_count;
+      if (iteration_count == limit) {
+        LOG(ERROR)
+            << "Unexpectedly high number of iterations in HLO passes ("
+            << iteration_count
+            << ")\nIf compilation hangs here, please file a bug with XLA.";
+      }
+    }
+    return changed;
+  }
+
+  StatusOr<bool> RunOnModuleGroup(HloModuleGroup* module_group) override {
+    bool changed = false;
+    bool changed_this_iteration = true;
+    int64 iteration_count = 0;
+    int64 limit = 1000;
+    for (const HloModule* module : module_group->modules()) {
+      limit = std::max<int64>(limit, module->instruction_count());
+    }
+    VLOG(3) << "Running HloPassFix.";
+    while (changed_this_iteration) {
+      TF_ASSIGN_OR_RETURN(changed_this_iteration,
+                          Pass::RunOnModuleGroup(module_group));
+      changed |= changed_this_iteration;
+      VLOG(3) << "changed_this_iteration: " << changed_this_iteration;
       ++iteration_count;
       if (iteration_count == limit) {
         LOG(ERROR)
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 33ce7e23a82d840676bba5f1ca9c0ffc4433465d..ae8c08cf1d16ad6738962f3be7c1b5512110b1d1 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -89,7 +89,7 @@ std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
 
   std::vector<HloPassInterface*> enabled_passes;
   for (auto& pass : passes_) {
-    if (disabled_pass_names.count(string(pass->name())) == 0) {
+    if (!disabled_pass_names.contains(pass->name())) {
       enabled_passes.push_back(pass.get());
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index 5eb707a957e49d86cdb2f72b72ce750bf29b8fd2..9cc202aa9f5fe5a20a9da05251ea811137ccaadb 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
@@ -34,11 +35,10 @@ string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
   for (const HloComputationInfo& computation_info :
        hlo_profile_printer_data.computation_infos()) {
     const auto& instruction_infos = computation_info.instruction_infos();
-    bool any_instruction_profiled =
-        std::any_of(instruction_infos.begin(), instruction_infos.end(),
-                    [&](const HloInstructionInfo& instruction_info) {
-                      return counters[instruction_info.profile_index()] != 0;
-                    });
+    bool any_instruction_profiled = absl::c_any_of(
+        instruction_infos, [&](const HloInstructionInfo& instruction_info) {
+          return counters[instruction_info.profile_index()] != 0;
+        });
 
     if (!any_instruction_profiled) {
       continue;
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index edaa4c59e2674e5f165c468059747d3dd2d54218..0fced7f15bdaf1dbe349e3b0fc6ada68393c6512 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -49,7 +49,7 @@ void HloReachabilityMap::SetReachabilityToUnionHelper(
     absl::Span<const HloInstruction* const> inputs,
     const HloInstruction* instruction, BitVector* bit_vector) {
   // If instruction is part of inputs, don't reset the bit_vector.
-  if (std::find(inputs.begin(), inputs.end(), instruction) == inputs.end()) {
+  if (!absl::c_linear_search(inputs, instruction)) {
     bit_vector->SetToZero();
   }
   bit_vector->Set(GetIndex(instruction));
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index ac74e2432f2176e13eaf7d4a1934a50ee89d1042..a175e4643de2ac6ce07ac00da914d7ab7acca541 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -57,6 +57,15 @@ using ::tensorflow::strings::HumanReadableNumBytes;
 
 // Returns true if the given instruction is rematerializable.
 bool IsRematerializable(const HloInstruction* instruction) {
+  if (instruction->opcode() == HloOpcode::kCopy) {
+    if (LayoutUtil::Equal(instruction->shape().layout(),
+                          instruction->operand(0)->shape().layout())) {
+      // Don't rematerialize copies added by copy insertion (layout doesn't
+      // change).
+      return false;
+    }
+  }
+
   // Don't rematerialize instructions with side effects or instructions which
   // cannot be cloned safely.
   switch (instruction->opcode()) {
@@ -179,7 +188,8 @@ class InstructionList {
   Item* CreateItem(HloInstruction* inst) {
     Item* item = new Item;
     item->instruction = inst;
-    CHECK(item_map_.insert({inst, item}).second) << "inserting inst twice";
+    CHECK(item_map_.insert({inst, item}).second)
+        << "inserting inst twice " << inst->name();
     return item;
   }
 
@@ -235,8 +245,7 @@ class InstructionList {
     }
 
     // Now scan forwards until we find one of the before_instructions.
-    while (std::find(before_instructions.begin(), before_instructions.end(),
-                     min_position_item) == before_instructions.end()) {
+    while (!absl::c_linear_search(before_instructions, min_position_item)) {
       min_position_item = min_position_item->next;
     }
     return InsertBefore(to_insert, min_position_item);
@@ -302,7 +311,7 @@ ItemList GetUsers(const InstructionList& instruction_list,
       // A buffer may be used by the instruction via more than one alias. For
       // example, a buffer which appears in more than one element of a tuple.
       Item* user_item = instruction_list.GetItem(user);
-      if (std::find(users.begin(), users.end(), user_item) == users.end()) {
+      if (!absl::c_linear_search(users, user_item)) {
         users.push_back(user_item);
       }
     }
@@ -418,11 +427,12 @@ class MemoryUsageTracker {
   // the given uses.
   Buffer& RematerializeBuffer(const Buffer& original_buffer, Item* remat_item,
                               ItemList&& rematerialized_uses) {
-    CHECK(original_buffer.defining_instruction->placed);
-    CHECK(!original_buffer.has_indirect_uses);
-    CHECK(!original_buffer.live_out);
+    CHECK(original_buffer.defining_instruction->placed)
+        << original_buffer.defining_instruction->instruction->name();
+    CHECK(!original_buffer.has_indirect_uses) << original_buffer.ToString();
+    CHECK(!original_buffer.live_out) << original_buffer.ToString();
     for (Item* use : rematerialized_uses) {
-      CHECK(!use->placed);
+      CHECK(!use->placed) << use->instruction->name();
     }
     return NewBuffer(remat_item, original_buffer.size,
                      std::move(rematerialized_uses), /*live_out=*/false,
@@ -456,8 +466,7 @@ class MemoryUsageTracker {
       return false;
     }
     const BufferIdList& in_progress_uses = in_progress_item_->buffers_used;
-    return std::find(in_progress_uses.begin(), in_progress_uses.end(),
-                     buffer_id) != in_progress_uses.end();
+    return absl::c_linear_search(in_progress_uses, buffer_id);
   }
 
   // Returns whether the given instruction is live at the current program
@@ -535,8 +544,7 @@ MemoryUsageTracker::MemoryUsageTracker(
         bool unused;
         for (Item* user_item : GetUsers(instruction_list_, logical_buffer,
                                         points_to_analysis, &unused)) {
-          if (std::find(buffer->users.begin(), buffer->users.end(),
-                        user_item) == buffer->users.end()) {
+          if (!absl::c_linear_search(buffer->users, user_item)) {
             buffer->users.push_back(user_item);
             buffer->unfinished_user_count++;
             user_item->buffers_used.push_back(buffer->id);
@@ -677,8 +685,8 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
           << ", remat_instruction = " << remat_item->instruction->name();
 
   TF_RET_CHECK(in_progress_item_ != nullptr);
-  TF_RET_CHECK(original_item->placed);
-  TF_RET_CHECK(!remat_item->placed);
+  TF_RET_CHECK(original_item->placed) << original_item->instruction->name();
+  TF_RET_CHECK(!remat_item->placed) << remat_item->instruction->name();
 
   // Construct the list of buffers used and defined by the rematerialization.
   remat_item->buffers_used = original_item->buffers_used;
@@ -707,7 +715,7 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
     ItemList unplaced_users;
     for (Item* user : old_buffer.users) {
       if (user->placed) {
-        CHECK(IsFinished(user));
+        CHECK(IsFinished(user)) << user->instruction->name();
         placed_users.push_back(user);
       } else {
         unplaced_users.push_back(user);
@@ -784,8 +792,7 @@ bool MemoryUsageTracker::Check() const {
 
     for (const Buffer& buffer : buffers_) {
       if (buffer.defining_instruction->instruction == instruction) {
-        CHECK(std::find(defined_buffers.begin(), defined_buffers.end(),
-                        buffer.id) != defined_buffers.end())
+        CHECK(absl::c_linear_search(defined_buffers, buffer.id))
             << "Instruction " << instruction->name()
             << " defined buffers is missing: " << buffer.ToString();
       }
@@ -808,8 +815,7 @@ bool MemoryUsageTracker::Check() const {
     int64 unfinished_uses = 0;
     for (Item* user : buffer.users) {
       const BufferIdList& used_buffers = user->buffers_used;
-      CHECK(std::find(used_buffers.begin(), used_buffers.end(), buffer.id) !=
-            used_buffers.end())
+      CHECK(absl::c_linear_search(used_buffers, buffer.id))
           << "Instruction " << user->instruction->name()
           << " used buffers is missing " << buffer.ToString();
       if (!IsFinished(user)) {
@@ -836,10 +842,10 @@ int64 RematerializationCost(const HloInstruction* instruction,
   // If none of the users of 'instruction' have been placed in the sequence (as
   // tracked by memory_tracker), then rematerialization of 'instruction' is a
   // zero-cost move of 'instruction' in the sequence.
-  if (!std::any_of(instruction->users().begin(), instruction->users().end(),
-                   [&memory_tracker](const HloInstruction* inst) {
-                     return memory_tracker.IsPlaced(inst);
-                   })) {
+  if (!absl::c_any_of(instruction->users(),
+                      [&memory_tracker](const HloInstruction* inst) {
+                        return memory_tracker.IsPlaced(inst);
+                      })) {
     return 0;
   }
 
@@ -1094,7 +1100,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         Item* successor_item = instruction_list.GetItem(successor);
         // Assert to make sure we never remat an operation with control
         // successor already placed.
-        CHECK(!successor_item->placed);
+        CHECK(!successor_item->placed) << successor_item->instruction->name();
         place_before.push_back(successor_item);
       }
       instruction_list.InsertBeforeInstructions(remat_item, place_before);
@@ -1164,7 +1170,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // Verify some invariants on the memory tracker.
   CHECK_EQ(memory_tracker.memory_usage(), 0);
   for (auto* instruction : computation->instructions()) {
-    CHECK(memory_tracker.IsPlaced(instruction));
+    CHECK(memory_tracker.IsPlaced(instruction)) << instruction->name();
   }
 
   VLOG(1) << "In computation " << computation->name() << " rematerialized "
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 22c3c40a93a1ddcd36659483fcc79fede32dd2c3..102a360ad8116d8781baf9cb7627a920f4a687c4 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -499,6 +499,52 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   EXPECT_THAT(add_4->operand(0), op::Broadcast(param));
 }
 
+TEST_F(HloRematerializationTest, CopyNotRematerialized) {
+  // Test that copies are not rematerialized.
+  auto module = CreateNewVerifiedModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kCopy, param));
+
+  auto negate_a_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, copy));
+
+  auto negate_a_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      vec1024_shape_, HloOpcode::kNegate, negate_a_1));
+
+  auto negate_b_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, copy));
+
+  auto negate_b_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      vec1024_shape_, HloOpcode::kNegate, negate_b_1));
+
+  builder.AddInstruction(HloInstruction::CreateTuple({negate_a_2, negate_b_2}));
+
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          RunHloRematerialization(
+                              /*memory_limit_bytes=*/1 * 1024, module.get()));
+
+  auto count_copies = [](const HloComputation* computation) {
+    int64 copy_count = 0;
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kCopy) {
+        copy_count++;
+      }
+    }
+    return copy_count;
+  };
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(count_copies(entry_computation), 1);
+}
+
 class IndirectUseTest : public HloRematerializationTest,
                         public ::testing::WithParamInterface<bool> {};
 
@@ -588,8 +634,8 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   }
 }
 
-INSTANTIATE_TEST_CASE_P(IndirectUseTestInstantiation, IndirectUseTest,
-                        ::testing::Values(true, false));
+INSTANTIATE_TEST_SUITE_P(IndirectUseTestInstantiation, IndirectUseTest,
+                         ::testing::Values(true, false));
 
 }  // namespace
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 5a9b820a9d7f58695383b21c9e2126cf98970c83..d7d66ae1c4592723ca991d5ee971fa72cc1af90a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -383,9 +383,7 @@ ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice(
   if (device_assignment != nullptr) {
     run_options.set_device_assignment(device_assignment);
   }
-  return ServiceExecutableRunOptions(
-      run_options, backend().StreamBorrower(),
-      /*xla_intra_op_thread_pool=*/backend().eigen_intra_op_thread_pool());
+  return ServiceExecutableRunOptions(run_options, backend().StreamBorrower());
 }
 
 Backend& HloRunner::backend() {
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 8f6eb974c5179b420c8f961393ca923e0a3b3530..e75373501cffac6a736be89e9f6139b6ff2cdbc1 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -140,7 +140,7 @@ Status HloSchedule::UpdateComputationSchedule(
   std::queue<HloInstruction*> worklist;
 
   for (HloInstruction* instruction : computation->instructions()) {
-    if (ids_in_schedule.count(instruction->unique_id()) == 0) {
+    if (!ids_in_schedule.contains(instruction->unique_id())) {
       // This is a newly added instruction which is not in the schedule.
       if (instruction->operands().empty()) {
         worklist.push(instruction);
@@ -204,7 +204,7 @@ Status HloSchedule::Update() {
   std::vector<HloComputation*> nonfusion_computations =
       module_->MakeNonfusionComputations();
   for (const HloComputation* computation : nonfusion_computations) {
-    TF_RET_CHECK(sequences_.count(computation->unique_id()) == 1)
+    TF_RET_CHECK(sequences_.contains(computation->unique_id()))
         << "Computation " << computation->name() << " not in HloSchedule.";
   }
   if (sequences_.size() > nonfusion_computations.size()) {
@@ -215,7 +215,7 @@ Status HloSchedule::Update() {
       nonfusion_computations_ids.insert(computation->unique_id());
     }
     for (auto it = sequences_.begin(); it != sequences_.end();) {
-      if (nonfusion_computations_ids.count(it->first) == 0) {
+      if (!nonfusion_computations_ids.contains(it->first)) {
         sequences_.erase(it++);
       } else {
         ++it;
@@ -244,7 +244,7 @@ Status HloSchedule::Verify() const {
       << "Schedule has " << sequences_.size() << " sequences, but module has "
       << nonfusion_computations.size() << " non-fusion computations";
   for (const HloComputation* computation : nonfusion_computations) {
-    TF_RET_CHECK(sequences_.count(computation->unique_id()) == 1)
+    TF_RET_CHECK(sequences_.contains(computation->unique_id()))
         << "Computation " << computation->name()
         << " missing from HLO schedule.";
   }
@@ -268,7 +268,7 @@ Status HloSchedule::Verify() const {
         << instruction_position.size() << " instructions, expected "
         << computation->instruction_count();
     for (const HloInstruction* instruction : computation->instructions()) {
-      TF_RET_CHECK(instruction_position.count(instruction) == 1)
+      TF_RET_CHECK(instruction_position.contains(instruction))
           << "Instruction " << instruction->name() << " is not in schedule";
     }
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 486ddbf499de80c634bc497158cd79ca066cc866..a5f54ae2c33259d080631061dff9ae40b41495dc 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -110,7 +110,7 @@ class HloSchedule {
 
   // Returns true if the schedule has a sequence for the given computation.
   bool is_computation_scheduled(const HloComputation* computation) const {
-    return sequences_.count(computation->unique_id()) == 1;
+    return sequences_.contains(computation->unique_id());
   }
 
   // Updates the schedule such that it is (again) a valid schedule for the
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 70a860c356ca2fb1c4c973ea3d96c50fabc2c7c2..37cc146bd7a6f2aef9373bd4afd8572ffac6473c 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/overflow_util.h"
@@ -30,7 +31,7 @@ HloSharding HloSharding::AssignDevice(int64 device_id) {
 }
 
 HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) {
-  CHECK_EQ(1, ShapeUtil::Rank(input_shape));
+  CHECK_EQ(1, input_shape.rank());
   CHECK_GT(num_tiles, 1);
   std::vector<int64> dimensions(1, num_tiles);
   Array<int64> assignment(dimensions);
@@ -57,7 +58,7 @@ HloSharding HloSharding::Tuple(const ShapeTree<HloSharding>& sub_shardings) {
 
 HloSharding HloSharding::Tuple(const Shape& tuple_shape,
                                absl::Span<const HloSharding> shardings) {
-  CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape);
+  CHECK(tuple_shape.IsTuple()) << ShapeUtil::HumanString(tuple_shape);
   for (auto& sharding : shardings) {
     CHECK(!sharding.IsTuple()) << sharding.ToString();
   }
@@ -70,7 +71,7 @@ HloSharding HloSharding::Tuple(const Shape& tuple_shape,
 
 HloSharding HloSharding::SingleTuple(const Shape& tuple_shape,
                                      const HloSharding& sharding) {
-  CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape);
+  CHECK(tuple_shape.IsTuple()) << ShapeUtil::HumanString(tuple_shape);
   CHECK(!sharding.IsTuple()) << sharding.ToString();
   int64 leaf_count = RequiredLeaves(tuple_shape);
   std::vector<HloSharding> flattened_list;
@@ -80,7 +81,7 @@ HloSharding HloSharding::SingleTuple(const Shape& tuple_shape,
 
 HloSharding HloSharding::Single(const Shape& shape,
                                 const HloSharding& sharding) {
-  return ShapeUtil::IsTuple(shape) ? SingleTuple(shape, sharding) : sharding;
+  return shape.IsTuple() ? SingleTuple(shape, sharding) : sharding;
 }
 
 string HloSharding::ToString() const {
@@ -106,13 +107,12 @@ string HloSharding::ToString() const {
 
 bool HloSharding::UsesDevice(int64 device) const {
   if (IsTuple()) {
-    return std::any_of(
-        tuple_elements_.begin(), tuple_elements_.end(),
-        [&](const HloSharding& s) { return s.UsesDevice(device); });
+    return absl::c_any_of(tuple_elements_, [&](const HloSharding& s) {
+      return s.UsesDevice(device);
+    });
   }
   const auto& devices = tile_assignment_;
-  return replicated_ ||
-         std::find(devices.begin(), devices.end(), device) != devices.end();
+  return replicated_ || absl::c_linear_search(devices, device);
 }
 
 std::map<int64, int64> HloSharding::UsedDevices(int64* count) const {
@@ -269,7 +269,7 @@ int64 HloSharding::GetUniqueDevice() const {
 }
 
 Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const {
-  if (!ShapeUtil::IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return tensorflow::errors::InvalidArgument(
         StrCat("Sharding is tuple-shaped but validation shape is not."));
   }
@@ -305,7 +305,7 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
 
 Status HloSharding::ValidateNonTuple(const Shape& shape,
                                      int64 num_devices) const {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     return tensorflow::errors::InvalidArgument(
         StrCat("Validation shape is a tuple but sharding is not."));
   }
@@ -316,7 +316,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   // All tile assignments must be less than the number of available cores and
   // unique.
   Status status = Status::OK();
-  std::set<int64> seen_cores;
+  absl::flat_hash_set<int64> seen_cores;
   tile_assignment_.Each(
       [&](absl::Span<const int64> indices, int32 core) {
         // Don't overwrite a bad status, so we report the first error.
@@ -324,7 +324,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
           if (core >= num_devices) {
             status = tensorflow::errors::InvalidArgument(StrCat(
                 "core ", core, " > ", num_devices, " in tile assignment"));
-          } else if (seen_cores.count(core) != 0) {
+          } else if (seen_cores.contains(core)) {
             status = tensorflow::errors::InvalidArgument(
                 StrCat("core ", core, " is not unique in tile assignment"));
           }
@@ -340,7 +340,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   }
 
   // The tile assignment tensor must have the same rank as the input.
-  if (ShapeUtil::Rank(shape) != tile_assignment_.num_dimensions()) {
+  if (shape.rank() != tile_assignment_.num_dimensions()) {
     return tensorflow::errors::InvalidArgument(
         "Number of tile assignment dimensions is different to the input rank. "
         "sharding=",
@@ -437,8 +437,8 @@ Shape HloSharding::TileShape(const Shape& shape) const {
   }
   Shape result_shape = shape;
   for (int64 i = 0; i < shape.dimensions_size(); ++i) {
-    (*result_shape.mutable_dimensions())[i] =
-        CeilOfRatio<int64>(shape.dimensions(i), tile_assignment_.dim(i));
+    result_shape.set_dimensions(
+        i, CeilOfRatio<int64>(shape.dimensions(i), tile_assignment_.dim(i)));
   }
   return result_shape;
 }
@@ -455,7 +455,7 @@ HloSharding HloSharding::GetSubSharding(const Shape& shape,
     }
     sub_shape = &ShapeUtil::GetSubshape(*sub_shape, {idx});
   }
-  if (ShapeUtil::IsTuple(*sub_shape)) {
+  if (sub_shape->IsTuple()) {
     auto begin_it = tuple_elements_.begin() + sharding_index;
     std::vector<HloSharding> sub_shardings(
         begin_it, begin_it + ShapeUtil::GetLeafCount(*sub_shape));
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 9775505f8608ced3e33abe376f4922cc6a972726..5789ae09988d2a85247c5b8c037a172b3699f3b7 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -101,8 +101,8 @@ class HloSharding {
     if (!IsTuple()) {
       return replicated_;
     }
-    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
-                       [](const HloSharding& s) { return s.IsReplicated(); });
+    return absl::c_all_of(
+        tuple_elements_, [](const HloSharding& s) { return s.IsReplicated(); });
   }
 
   // Returns true if the tile size is the same as the input size.
@@ -110,8 +110,9 @@ class HloSharding {
     if (!IsTuple()) {
       return maximal_;
     }
-    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
-                       [](const HloSharding& s) { return s.IsTileMaximal(); });
+    return absl::c_all_of(tuple_elements_, [](const HloSharding& s) {
+      return s.IsTileMaximal();
+    });
   }
 
   // Returns true if the sharding defines an operation on the given device.
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index f5061304456e04ab40448861343ef201c9450dcf..094d98bc6e54028557f6d38cd165bf34e1fb8c46 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -99,7 +99,7 @@ std::vector<PassThrough> LocatePassThroughDomainLinks(
         << "Instruction is not a kDomain: " << instruction->ToString();
     for (HloInstruction* user : instruction->users()) {
       if (user->opcode() == HloOpcode::kDomain &&
-          domain.exit_domains.count(user) != 0) {
+          domain.exit_domains.contains(user)) {
         pass_through.emplace_back(user, instruction);
         VLOG(2) << "Found passthrough domain link:";
         VLOG(2) << "  " << user->ToString();
@@ -234,7 +234,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
   if (instruction->users().empty()) {
     // No sharding from users, use domain_sharding, after checking
     // compatibility.
-    TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape()) &&
+    TF_RET_CHECK(instruction->shape().IsTuple() &&
                  ShapeUtil::GetLeafCount(instruction->shape()) ==
                      domain_sharding.tuple_elements().size());
     instruction->set_sharding(domain_sharding);
@@ -253,7 +253,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
       instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice));
   for (HloInstruction* user : instruction->users()) {
     if (user->opcode() == HloOpcode::kDomain &&
-        domain.exit_domains.count(user) > 0) {
+        domain.exit_domains.contains(user)) {
       // If a user is a domain and it is registered in the domain exits, then
       // the instruction sharding is taken directly from the domain, and no
       // further users need to be visited.
@@ -266,7 +266,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
     AssignmentKind sub_assigned = AssignmentKind::kUnassigned;
     TF_ASSIGN_OR_RETURN(ShapeTree<HloSharding> user_sharding_tree,
                         GetShardingTreeFromUser(*instruction, *user));
-    if (ShapeUtil::IsTuple(instruction->shape())) {
+    if (instruction->shape().IsTuple()) {
       // For tuple-shaped instructions collect individual tuple subshardings
       // from the uses, and then combine them into the tuple sharding.
       // If the user is a GTE its sharding concerns only the subtree of
@@ -298,7 +298,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
   }
 
   if (assigned == AssignmentKind::kAssigned) {
-    if (ShapeUtil::IsTuple(instruction->shape())) {
+    if (instruction->shape().IsTuple()) {
       instruction->set_sharding(HloSharding::Tuple(sharding_tree));
     } else {
       TF_RET_CHECK(sharding_tree.leaf_count() == 1);
@@ -361,7 +361,7 @@ Status ApplyDomainSharding(const DomainMetadata::Domain& domain,
       // kUnassignedDevice. Indeed in case of doubt it is better to leave the
       // entire tuple unassigned, and let the device placer decide for it.
       if (instruction->sharding().UsesDevice(kUnassignedDevice)) {
-        TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape()))
+        TF_RET_CHECK(instruction->shape().IsTuple())
             << "Only tuples can have kUnassignedDevice sub shardings";
         instruction->clear_sharding();
       }
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index 487653344976a10e18ba667085525ba1ecbb8612..c1f69db74eafb7743e85f499f2f4828ed0375501 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -61,8 +61,7 @@ void CleanNodeName(string* name) {
   name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
   const string chars_to_replace = "<>[]";
   auto pred = [&](char c) {
-    return std::find(chars_to_replace.begin(), chars_to_replace.end(), c) !=
-           chars_to_replace.end();
+    return absl::c_linear_search(chars_to_replace, c);
   };
   std::replace_if(name->begin(), name->end(), pred, '_');
 }
@@ -159,7 +158,7 @@ void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
   // Set the layout.
   if (LayoutUtil::HasLayout(instruction->shape())) {
     string layout_string;
-    if (ShapeUtil::IsTuple(instruction->shape())) {
+    if (instruction->shape().IsTuple()) {
       // For tuples, emit the full shape because the layout of a tuple is not
       // represented in a single Layout field.
       layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index 59594ab2f0f70a206c73e998dbfa69c2c5c7ba43..218b33b2ac2b86edc30b2f014ba206c71da37682 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -46,7 +46,7 @@ const Shape& HloPosition::shape() const {
 
 string HloPosition::ToString() const {
   string index_str =
-      ShapeUtil::IsTuple(instruction->shape()) ? (" " + index.ToString()) : "";
+      instruction->shape().IsTuple() ? (" " + index.ToString()) : "";
   return StrCat(instruction->name(), index_str);
 }
 
@@ -56,10 +56,9 @@ std::ostream& operator<<(std::ostream& out, const HloPosition& position) {
 }
 
 string HloUse::ToString() const {
-  string index_str =
-      ShapeUtil::IsTuple(instruction->operand(operand_number)->shape())
-          ? (" " + operand_index.ToString())
-          : "";
+  string index_str = instruction->operand(operand_number)->shape().IsTuple()
+                         ? (" " + operand_index.ToString())
+                         : "";
   return StrCat(instruction->name(), ", operand ", operand_number, index_str);
 }
 
@@ -88,7 +87,7 @@ bool HloValue::operator!=(const HloValue& other) const {
 }
 
 string HloValue::ToShortString() const {
-  string index_str = ShapeUtil::IsTuple(defining_instruction()->shape())
+  string index_str = defining_instruction()->shape().IsTuple()
                          ? defining_index().ToString()
                          : "";
   return StrCat(id(), " ", is_phi_ ? "PHI " : "",
@@ -210,7 +209,7 @@ std::ostream& operator<<(std::ostream& out, const HloValue& value) {
 }
 
 void HloValueSet::SortAndUniquifyValues() {
-  std::sort(values_.begin(), values_.end(), HloValue::IdLessThan);
+  absl::c_sort(values_, HloValue::IdLessThan);
   values_.erase(std::unique(values_.begin(), values_.end(), HloValue::IdEqual),
                 values_.end());
 }
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index e1c737132f72948e0e46d37dd08ddf8e7b29bfca..144c01eac1c06bb067c9f29f29b536c459ea273e 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -349,7 +349,10 @@ Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
 Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
   TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 0));
   auto* iota = Cast<HloIotaInstruction>(instruction);
-  const int64 rank = ShapeUtil::Rank(iota->shape());
+  if (!iota->shape().IsArray()) {
+    return InternalError("Iota does not support non-array result.");
+  }
+  const int64 rank = iota->shape().rank();
   if (rank == 0) {
     return InternalError("Iota does not support scalars.");
   }
@@ -387,6 +390,14 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
   TF_RETURN_IF_ERROR(CheckOperandCount(bitcast, 1));
+  // Bitcasts are not allowed to change the element type.
+  if (bitcast->operand(0)->shape().element_type() !=
+      bitcast->shape().element_type()) {
+    return InternalError(
+        "Bitcast can not change the element type from %s to %s",
+        PrimitiveType_Name(bitcast->operand(0)->shape().element_type()),
+        PrimitiveType_Name(bitcast->shape().element_type()));
+  }
   return Status::OK();
 }
 
@@ -397,13 +408,11 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
   const Shape& operand_shape = broadcast->operand(0)->shape();
   // Check for mixed precision.
   TF_RET_CHECK(SameElementType(broadcast->shape(), operand_shape));
-  TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
-               broadcast->dimensions().size());
-  for (int64 operand_dimension = 0;
-       operand_dimension < ShapeUtil::Rank(operand_shape);
+  TF_RET_CHECK(operand_shape.rank() == broadcast->dimensions().size());
+  for (int64 operand_dimension = 0; operand_dimension < operand_shape.rank();
        ++operand_dimension) {
     int64 output_dimension = broadcast->dimensions()[operand_dimension];
-    TF_RET_CHECK((output_dimension < ShapeUtil::Rank(broadcast->shape())) &&
+    TF_RET_CHECK((output_dimension < broadcast->shape().rank()) &&
                  output_dimension >= 0 &&
                  (broadcast->shape().dimensions(output_dimension) ==
                   operand_shape.dimensions(operand_dimension)))
@@ -498,21 +507,23 @@ Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
 }
 
 Status ShapeVerifier::HandleDynamicSlice(HloInstruction* dynamic_slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_slice, 2));
-  return CheckShape(dynamic_slice, ShapeInference::InferDynamicSliceShape(
-                                       dynamic_slice->operand(0)->shape(),
-                                       dynamic_slice->operand(1)->shape(),
-                                       dynamic_slice->dynamic_slice_sizes()));
+  return CheckShape(
+      dynamic_slice,
+      ShapeInference::InferDynamicSliceShape(
+          dynamic_slice->operand(0)->shape(),
+          Cast<HloDynamicSliceInstruction>(dynamic_slice)->index_shapes(),
+          dynamic_slice->dynamic_slice_sizes()));
 }
 
 Status ShapeVerifier::HandleDynamicUpdateSlice(
     HloInstruction* dynamic_update_slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_update_slice, 3));
-  return CheckShape(dynamic_update_slice,
-                    ShapeInference::InferDynamicUpdateSliceShape(
-                        dynamic_update_slice->operand(0)->shape(),
-                        dynamic_update_slice->operand(1)->shape(),
-                        dynamic_update_slice->operand(2)->shape()));
+  return CheckShape(
+      dynamic_update_slice,
+      ShapeInference::InferDynamicUpdateSliceShape(
+          dynamic_update_slice->operand(0)->shape(),
+          dynamic_update_slice->operand(1)->shape(),
+          Cast<HloDynamicUpdateSliceInstruction>(dynamic_update_slice)
+              ->index_shapes()));
 }
 
 Status ShapeVerifier::HandleTuple(HloInstruction* tuple) {
@@ -524,8 +535,7 @@ Status ShapeVerifier::HandleMap(HloInstruction* map) {
   int64 max_operand_rank = 0;
   for (const HloInstruction* operand : map->operands()) {
     operand_shapes.push_back(&operand->shape());
-    max_operand_rank =
-        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+    max_operand_rank = std::max(max_operand_rank, operand->shape().rank());
   }
   // TODO(b/65689298) Remove code below once Map is generalized to accept
   // arbitrary map dimensions.
@@ -695,7 +705,6 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kReducePrecision:
-    case HloOpcode::kSelect:
     case HloOpcode::kTupleSelect:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
@@ -983,7 +992,7 @@ bool ShapeContainsToken(const Shape& shape) {
   bool contains_token = false;
   ShapeUtil::ForEachSubshape(
       shape, [&contains_token](const Shape& subshape, const ShapeIndex&) {
-        if (ShapeUtil::IsToken(subshape)) {
+        if (subshape.IsToken()) {
           contains_token = true;
         }
       });
@@ -1271,11 +1280,11 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     // op. See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
     // or ComputationLowerer::Visit()
     TF_RET_CHECK(broadcast->dimensions().size() ==
-                 ShapeUtil::Rank(broadcast->operand(0)->shape()))
+                 broadcast->operand(0)->shape().rank())
         << "Broadcast HLO (" << broadcast->ToShortString()
         << ") has invalid number of dimensions: "
         << broadcast->dimensions().size()
-        << " != " << ShapeUtil::Rank(broadcast->operand(0)->shape());
+        << " != " << broadcast->operand(0)->shape().rank();
     return Status::OK();
   }
 
@@ -1325,7 +1334,7 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
   }
 
   Status HandleGetTupleElement(HloInstruction* gte) override {
-    TF_RET_CHECK(ShapeUtil::IsTuple(gte->operand(0)->shape()));
+    TF_RET_CHECK(gte->operand(0)->shape().IsTuple());
     return Status::OK();
   }
 
@@ -1376,7 +1385,7 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
       for (HloInstruction* operand : instruction->operands()) {
         const Shape& operand_shape = operand->shape();
         if (LayoutUtil::IsDenseArray(operand_shape) &&
-            ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(result_shape)) {
+            operand_shape.rank() == result_shape.rank()) {
           const Layout& operand_layout = operand_shape.layout();
           TF_RET_CHECK(LayoutUtil::Equal(result_layout, operand_layout))
               << "Instruction shouldn't change layouts "
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index a1a6aba9728c137d17487b5914f67cb3966fc12b..479905b317d5639ff2cebc4d1044e21b527693f6 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -168,8 +168,13 @@ class ShapeVerifier : public DfsHloVisitor {
 // An interface used to encapsulate target-specific verification quirks.
 class TargetVerifierMetadata {
  public:
+  TargetVerifierMetadata(std::function<int64(const Shape&)> shape_size_function)
+      : shape_size_function_(shape_size_function) {}
+
   // Returns a target-specific shape size.
-  virtual int64 ShapeSize(const Shape& shape) const = 0;
+  int64 ShapeSize(const Shape& shape) const {
+    return shape_size_function_(shape);
+  }
 
   virtual std::unique_ptr<ShapeVerifier> GetVerifier() const = 0;
 
@@ -178,20 +183,23 @@ class TargetVerifierMetadata {
 
   TargetVerifierMetadata(const TargetVerifierMetadata&) = delete;
   TargetVerifierMetadata& operator=(const TargetVerifierMetadata&) = delete;
+
+ private:
+  // Returns a target-specific shape size.
+  std::function<int64(const Shape&)> shape_size_function_;
 };
 
 // The default implementation of TargetVerifierMetadata, used unless the target
 // needs to override it.
 class DefaultVerifierMetadata : public TargetVerifierMetadata {
  public:
-  DefaultVerifierMetadata(bool layout_sensitive, bool allow_mixed_precision)
-      : layout_sensitive_(layout_sensitive),
+  DefaultVerifierMetadata(
+      bool layout_sensitive, bool allow_mixed_precision,
+      std::function<int64(const Shape&)> shape_size_function)
+      : TargetVerifierMetadata(shape_size_function),
+        layout_sensitive_(layout_sensitive),
         allow_mixed_precision_(allow_mixed_precision) {}
 
-  int64 ShapeSize(const Shape& shape) const override {
-    return ShapeUtil::ByteSizeOf(shape);
-  }
-
   // Creates a ShapeVerifier that checks that shapes match inferred
   // expectations. This creates a new verifier every time because ShapeVerifier,
   // being a DfsHloVisitor, is stateful. We want a clean object for each run of
@@ -210,11 +218,14 @@ class DefaultVerifierMetadata : public TargetVerifierMetadata {
 // the module.
 class HloVerifier : public HloModulePass {
  public:
-  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision,
-                       std::function<bool(const HloInstruction*)>
-                           instruction_can_change_layout_func = {})
+  explicit HloVerifier(
+      bool layout_sensitive, bool allow_mixed_precision,
+      std::function<bool(const HloInstruction*)>
+          instruction_can_change_layout_func = {},
+      std::function<int64(const Shape&)> shape_size_func =
+          [](const Shape& shape) { return ShapeUtil::ByteSizeOf(shape); })
       : target_metadata_(absl::make_unique<DefaultVerifierMetadata>(
-            layout_sensitive, allow_mixed_precision)),
+            layout_sensitive, allow_mixed_precision, shape_size_func)),
         instruction_can_change_layout_func_(
             std::move(instruction_can_change_layout_func)) {
     CHECK(instruction_can_change_layout_func_ == nullptr || layout_sensitive);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 4bc557e4e62e7df4e25fda86fe417e84129b464c..4f69bd155b8713041ba539098808125956e86259 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -386,6 +388,55 @@ TEST_F(HloVerifierTest, AddWithLayoutChange) {
   ASSERT_TRUE(status.ok());
 }
 
+TEST_F(HloVerifierTest, ScalarIndexDynamicSlice) {
+  const char* const kScalarIndexDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[]) -> s32[2,2,258] {
+      %original_parameter = s32[2,2,258] parameter(0)
+      %constant = s32[] constant(0)
+      %start_index = s32[] parameter(1)
+      ROOT %dynamic-slice = s32[2,2,258] dynamic-slice(s32[2,2,258] %original_parameter, s32[] %constant, s32[] %constant, s32[] %start_index), dynamic_slice_sizes={2,2,258}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kScalarIndexDynamicSlice, config));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTest, ScalarIndexDynamicUpdateSlice) {
+  const char* const kScalarIndexDynamicSlice = R"(
+    HloModule DynamicUpdateSlice_module
+
+    ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_index.0: s32[], start_index.1: s32[], start_index.2: s32[], start_index.3: s32[]) -> s32[1,1,25,1] {
+      %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+      %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+      %start_index.0 = s32[] parameter(2)
+      %start_index.1 = s32[] parameter(3)
+      %start_index.2 = s32[] parameter(4)
+      %start_index.3 = s32[] parameter(5)
+      ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[] %start_index.0, s32[] %start_index.1, s32[] %start_index.2, s32[] %start_index.3)
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kScalarIndexDynamicSlice, config));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
 TEST_F(HloVerifierTestLayoutSensitive, AddWithLayoutChangeNotAllowed) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
   auto status = verifier().Run(module.get()).status();
@@ -399,8 +450,9 @@ TEST_F(HloVerifierTestLayoutSensitive, SliceWithLayoutChangeNotAllowed) {
    HloModule SliceWithLayoutChange
     ENTRY SliceWithLayoutChange {
       par0 = f32[4,5]{0,1} parameter(0)
-      par1 = s32[2] parameter(1)
-      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1),
+      par1 = s32[] parameter(1)
+      par2 = s32[] parameter(2)
+      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1, par2),
         dynamic_slice_sizes={3,4}
     }
   )";
@@ -429,5 +481,76 @@ TEST_F(HloVerifierTestLayoutSensitive, ConcatWithLayoutChangeNotAllowed) {
   EXPECT_THAT(status.error_message(),
               HasSubstr("Instruction shouldn't change layouts"));
 }
+
+TEST_F(HloVerifierTest, BitcastCanNotChangeElementType) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY BitcastCanNotChangeElementType {
+   constant.0 = f32[2] constant({0.0, 0.0})
+   ROOT bitcast = s32[2] bitcast(constant.0)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Bitcast can not change the element type"));
+}
+
+TEST_F(HloVerifierTest, SelectMixedPrecisionNotAllowed) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY SelectMixedPrecisionNotAllowed {
+   p0 = pred[] parameter(0)
+   p1 = f32[32] parameter(1)
+   p2 = bf16[32] parameter(2)
+   ROOT select = f32[32] select(p0, p1, p2)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Seen floating point types of different precisions"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, SelectMixedPrecisionAllowed) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY SelectMixedPrecisionAllowed {
+   p0 = pred[] parameter(0)
+   p1 = f32[32] parameter(1)
+   p2 = bf16[32] parameter(2)
+   ROOT select = f32[32] select(p0, p1, p2)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTest, IotaNonArrayResult) {
+  const char* const hlo_string = R"(
+  HloModule IotaTupleResult
+
+  ENTRY  kernelEntry {
+    ROOT iota = () iota(), iota_dimension=24
+  }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("does not support non-array result"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index 90904ac00110457bcc3b8974816a7080c4ab89fc..88fc62bd1e2a7830b3f61738a8642308ef4225a7 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -128,9 +128,9 @@ string HumanReadableProfileBuilder::ToString() const {
 
   // Sort ops in decreasing order of cycles, and print them.
   std::vector<OpInfo> sorted_ops(op_infos_);
-  std::sort(
-      sorted_ops.begin(), sorted_ops.end(),
-      [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; });
+  absl::c_sort(sorted_ops, [](const OpInfo& a, const OpInfo& b) {
+    return a.cycles > b.cycles;
+  });
   for (const auto& op : sorted_ops) {
     print_op(op);
   }
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 1ebb3319779c00fd4afe90606bf336e16349429d..76bf48870d55e82497ba5f63e9e2e2a322cb330e 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -103,7 +103,7 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
 
   do {
     const HloInstruction* instr = stack.back();
-    if (cache_.count(instr)) {
+    if (cache_.contains(instr)) {
       stack.pop_back();
       continue;
     }
@@ -111,9 +111,9 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
     switch (FindOrDie(dfs_state_map, instr)) {
       case kDiscovered: {
         for (const HloInstruction* operand : instr->operands()) {
-          if (!cache_.count(operand)) {
+          if (!cache_.contains(operand)) {
             stack.push_back(operand);
-            CHECK(!dfs_state_map.count(operand) ||
+            CHECK(!dfs_state_map.contains(operand) ||
                   dfs_state_map[operand] == kDiscovered);
             dfs_state_map[operand] = kDiscovered;
           }
@@ -1002,7 +1002,7 @@ bool CanFoldDotIntoIndexedArray(
     absl::Span<const int64> contracting_dims,
     absl::Span<const int64> batch_dims) {
   absl::optional<int64> non_contracting_non_batch_dim =
-      GetOnlyNonContractingNonBatchDim(ShapeUtil::Rank(indexed_array->shape()),
+      GetOnlyNonContractingNonBatchDim(indexed_array->shape().rank(),
                                        contracting_dims, batch_dims);
   if (!non_contracting_non_batch_dim.has_value()) {
     VLOG(3) << tag << ": multiple or no non-contracting non-batch dimensions";
@@ -1015,7 +1015,7 @@ bool CanFoldDotIntoIndexedArray(
     return false;
   }
 
-  int64 indexed_array_rank = ShapeUtil::Rank(indexed_array->shape());
+  int64 indexed_array_rank = indexed_array->shape().rank();
   if (indexed_array->source_dim() < (indexed_array_rank - 2)) {
     // This restriction can be lifted by inserting reshape nodes.
     VLOG(3) << tag
@@ -1043,7 +1043,7 @@ IndexedArrayAnalysis::ComputeArrayForDotWithIndexedLhs(
     return nullptr;
   }
 
-  int64 lhs_rank = ShapeUtil::Rank(lhs->shape());
+  int64 lhs_rank = lhs->shape().rank();
   DotDimensionNumbers new_dim_numbers = dim_numbers;
   new_dim_numbers.set_lhs_contracting_dimensions(
       0, lhs->source_dim() == (lhs_rank - 1) ? (lhs_rank - 2) : (lhs_rank - 1));
@@ -1078,7 +1078,7 @@ IndexedArrayAnalysis::ComputeArrayForDotWithIndexedRhs(
     return nullptr;
   }
 
-  int64 rhs_rank = ShapeUtil::Rank(rhs->shape());
+  int64 rhs_rank = rhs->shape().rank();
 
   DotDimensionNumbers new_dim_numbers = dim_numbers;
   new_dim_numbers.set_rhs_contracting_dimensions(
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 295465c8481bcb7d1385192febe0d09614e393b3..62107b5a88d4e37552fa5a6384700a9291a9c655 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <ctype.h>
-
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+#include "absl/strings/ascii.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
@@ -43,7 +42,7 @@ class IndexedArrayAnalysisTest : public HloTestBase {
     string result;
 
     for (char c : text) {
-      if (!isspace(c)) {
+      if (!absl::ascii_isspace(c)) {
         result.push_back(c);
       } else if (!result.empty() && result.back() != ' ') {
         result.push_back(' ');
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 07448715293ca8dde5492a054b84c3408004bdaf..b97060535d998e174639dceca5cde517cef01e30 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -174,23 +174,22 @@ bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) {
   ShapeUtil::ForEachSubshape(
       hlo->shape(),
       [&output_rank](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           output_rank = std::max(output_rank, ShapeUtil::TrueRank(subshape));
         }
       });
-  return std::count_if(hlo->operands().begin(), hlo->operands().end(),
-                       [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast ||
-                             operand->opcode() == HloOpcode::kIota) {
-                           return false;
-                         }
-                         if (operand->opcode() == HloOpcode::kConstant &&
-                             ShapeUtil::IsEffectiveScalar(operand->shape())) {
-                           return false;
-                         }
-                         return ShapeUtil::TrueRank(operand->shape()) >=
-                                output_rank;
-                       }) <= 1;
+  return absl::c_count_if(
+             hlo->operands(), [output_rank](HloInstruction* operand) {
+               if (operand->opcode() == HloOpcode::kBroadcast ||
+                   operand->opcode() == HloOpcode::kIota) {
+                 return false;
+               }
+               if (operand->opcode() == HloOpcode::kConstant &&
+                   ShapeUtil::IsEffectiveScalar(operand->shape())) {
+                 return false;
+               }
+               return ShapeUtil::TrueRank(operand->shape()) >= output_rank;
+             }) <= 1;
 }
 
 bool InstructionFusion::CanFuseOnAllPaths(
@@ -274,7 +273,7 @@ InstructionFusion::ComputeGloballyUnfusible(
         ShapeUtil::ForEachSubshape(
             shape,
             [&size](const Shape& subshape, const ShapeIndex& shape_index) {
-              if (ShapeUtil::IsArray(subshape)) {
+              if (subshape.IsArray()) {
                 size += ShapeUtil::ElementsIn(subshape);
               }
             });
@@ -409,9 +408,8 @@ class ReversePostOrderFusionQueue : public FusionQueue {
       }
       sorted_operand_numbers.push_back(i);
     }
-    std::sort(
-        sorted_operand_numbers.begin(), sorted_operand_numbers.end(),
-        [&](int64 i, int64 j) {
+    absl::c_sort(
+        sorted_operand_numbers, [&](int64 i, int64 j) {
           // Instructions with higher priority in the queue come first.
           return (
               FindOrDie(post_order_index_, instruction->mutable_operand(i)) >
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index a981d94a999e3d322986bc2bfd56a5b0b5d175fc..a305c6e8005045f7dbca3b8099a3b8ddebb092af 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -1,12 +1,12 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//visibility:public"])
-
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "if_static",
 )
 
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
 cc_library(
     name = "interpreter_transfer_manager",
     srcs = ["interpreter_transfer_manager.cc"],
@@ -34,6 +34,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -47,8 +48,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:layout_assignment",
         "//tensorflow/compiler/xla/service:map_inliner",
+        "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
+        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor",
         "@com_google_absl//absl/memory",
@@ -115,6 +118,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_headers_lib",
+        "//tensorflow/stream_executor/host:host_stream",
+        "//tensorflow/stream_executor/host:host_timer",
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index d37ae94bf6c4c697bbf30390c02a5099271e00a4..0827b1daf89bebb68c045784ef2b9da677792880 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/executable.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/map_inliner.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -40,12 +43,50 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
+namespace {
+
+// Handles custom_call ops during evaluation by routing them through the global
+// CPU registry used by other CPU-based backends.
+StatusOr<Literal> HandleEvaluatorCustomCall(
+    HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+  // Find the target C function in the global registry.
+  auto* registry = xla::cpu::CustomCallTargetRegistry::Global();
+  void* target_fn = registry->Lookup(custom_call->custom_call_target());
+  if (!target_fn) {
+    return NotFound("Custom call target '%s' was not registered",
+                    custom_call->custom_call_target());
+  }
+
+  // Populate pointers to operand and output literal data.
+  std::vector<const void*> operand_data;
+  operand_data.reserve(operands.size());
+  for (const auto* literal : operands) {
+    operand_data.push_back(literal->untyped_data());
+  }
+  auto output = Literal::CreateFromShape(custom_call->shape());
+  void* output_data = output.untyped_data();
+
+  // Call the target function matching the C ABI used by the CPU backends.
+  auto* typed_fn = reinterpret_cast<void (*)(void*, const void**)>(target_fn);
+  (*typed_fn)(output_data, operand_data.data());
+
+  return std::move(output);
+}
+
+}  // namespace
+
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
+  pipeline.AddPass<DynamicIndexSplitter>();
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout(),
       LayoutAssignment::InstructionCanChangeLayout);
+
+  ReducePrecisionInsertion::AddPasses(
+      &pipeline, hlo_module->config().debug_options(),
+      ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
+
   return pipeline.Run(hlo_module).status();
 }
 
@@ -75,10 +116,12 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
   // In this case we are using an HloEvaluator at execution time, so we don't
   // need to compile anything
 
-  // Create executable from only the Hlo module.
   auto evaluator = absl::make_unique<HloEvaluator>();
   evaluator->set_use_fast_path(
       hlo_module->config().debug_options().xla_hlo_evaluator_use_fast_path());
+  evaluator->set_custom_call_handler(HandleEvaluatorCustomCall);
+
+  // Create executable from only the Hlo module.
   std::unique_ptr<Executable> executable =
       absl::make_unique<InterpreterExecutable>(std::move(hlo_module),
                                                std::move(evaluator));
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index de9204011ce5ba8a9fc2871c6bd7120b6ed371b5..7a6ebdef708bcc3a92fbd8618db0c42c35e6ce8b 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -68,6 +68,18 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
         "Mismatch between argument count and graph parameter count.");
   }
 
+  // Check that the args have the right shape.
+  for (int64 i = 0; i < computation->num_parameters(); ++i) {
+    const auto& expected_shape = computation->parameter_instruction(i)->shape();
+    const auto& actual_shape = arguments[i]->on_device_shape();
+    if (!ShapeUtil::Equal(expected_shape, actual_shape)) {
+      return InvalidArgument(
+          "Shape mismatch on parameter %d.  Expected %s, but was %s.", i,
+          ShapeUtil::HumanString(expected_shape),
+          ShapeUtil::HumanString(actual_shape));
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
                       TransferManager::GetForPlatform(platform));
 
@@ -86,8 +98,8 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   {
     tensorflow::mutex_lock lock(evaluator_lock_);
     evaluator_->ResetVisitStates();
-    TF_ASSIGN_OR_RETURN(result_literal, evaluator_->Evaluate<Literal>(
-                                            *computation, arg_literals));
+    TF_ASSIGN_OR_RETURN(result_literal,
+                        evaluator_->Evaluate(*computation, arg_literals));
   }
 
   // Transform the result literal back into a ShapedBuffer.
@@ -117,7 +129,7 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
 }
 
 /*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) {
-  if (ShapeUtil::IsOpaque(shape)) {
+  if (shape.IsOpaque()) {
     return sizeof(void*);
   }
   return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index b9ddd9636fe29e85092ed67fc644a54332b218d3..5d9e3392fd86c587a0bd998a282c52d145cc710e 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -147,12 +147,9 @@ bool LayoutConstraints::OperandBufferForwarded(
   PointsToSet::BufferSet* output_buffers = GetBufferSet(instruction);
   PointsToSet::BufferSet* operand_buffers =
       GetBufferSet(instruction->operand(operand_no));
-  for (const LogicalBuffer* output_buffer : *output_buffers) {
-    if (operand_buffers->count(output_buffer) > 0) {
-      return true;
-    }
-  }
-  return false;
+  return absl::c_any_of(*output_buffers, [&](const LogicalBuffer* b) {
+    return operand_buffers->count(b) > 0;
+  });
 }
 
 Status LayoutConstraints::SetBufferLayout(const Layout& layout,
@@ -256,7 +253,7 @@ Status LayoutConstraints::SetArrayOperandLayout(
     const Layout& layout, const HloInstruction* instruction, int64 operand_no,
     bool mandatory, bool dfs) {
   const HloInstruction* operand = instruction->operand(operand_no);
-  TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
+  TF_RET_CHECK(operand->shape().IsArray());
   Shape shape(operand->shape());
   *shape.mutable_layout() = layout;
   TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutInShape(shape));
@@ -314,7 +311,7 @@ Status LayoutConstraints::SetInstructionLayout(
         CHECK_EQ(1, buffers.size());
         CHECK_EQ(buffers[0]->instruction(), instruction);
 
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           return SetBufferLayout(subshape.layout(), *buffers[0], mandatory);
         } else {
           return Status::OK();
@@ -406,7 +403,7 @@ Status LayoutAssignment::BuildHostChannelConstraints(
         instruction->opcode() == HloOpcode::kRecv) {
       const Shape& data_shape =
           ShapeUtil::GetTupleElementShape(send_recv_instr->shape(), 0);
-      TF_RET_CHECK(ShapeUtil::IsArray(data_shape));
+      TF_RET_CHECK(data_shape.IsArray());
       TF_RET_CHECK(LayoutUtil::HasLayout(data_shape));
       const Layout* prev_layout = host_channel_constraints_.ConstrainChannel(
           send_recv_instr->channel_id(), data_shape.layout());
@@ -489,7 +486,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       if (instruction->opcode() == HloOpcode::kSend) {
         // TODO(b/68493863): Change to use SetOperandLayout().
         const Shape send_buffer_shape = instruction->operand(0)->shape();
-        TF_RET_CHECK(ShapeUtil::IsArray(send_buffer_shape));
+        TF_RET_CHECK(send_buffer_shape.IsArray());
         Shape new_buffer_shape =
             get_channel_constraints(instruction)
                 ->LayoutShapeForChannel(send_buffer_shape,
@@ -499,7 +496,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       } else {
         const Shape recv_buffer_shape =
             ShapeUtil::GetTupleElementShape(instruction->shape(), 0);
-        TF_RET_CHECK(ShapeUtil::IsArray(recv_buffer_shape));
+        TF_RET_CHECK(recv_buffer_shape.IsArray());
         TF_ASSIGN_OR_RETURN(
             const LogicalBuffer* buffer,
             constraints->points_to_analysis().GetBufferDefinedAt(instruction,
@@ -520,7 +517,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       }
       // TODO(b/68493863): Change to use SetOperandLayout().
       const Shape& buffer_shape = instruction->operand(0)->shape();
-      TF_RET_CHECK(ShapeUtil::IsArray(buffer_shape));
+      TF_RET_CHECK(buffer_shape.IsArray());
       Shape new_buffer_shape =
           get_channel_constraints(instruction)
               ->LayoutShapeForChannel(buffer_shape, all_reduce_id);
@@ -780,7 +777,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
       << ShapeUtil::HumanString(instruction->shape())
       << " instruction: " << instruction->ToString();
 
-  if (ShapeUtil::IsTuple(instruction->shape())) {
+  if (instruction->shape().IsTuple()) {
     // Copy tuple elements which have differing layouts.
     std::vector<HloInstruction*> element_copies;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
@@ -811,7 +808,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
         shape_with_layout, tuple_copy->mutable_shape()));
     return tuple_copy;
-  } else if (ShapeUtil::IsArray(instruction->shape())) {
+  } else if (instruction->shape().IsArray()) {
     HloInstruction* copy =
         instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
             instruction->shape(), HloOpcode::kCopy, instruction));
@@ -988,11 +985,10 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Layout& output_layout, const HloInstruction* instruction,
     int64 operand_no) {
   const HloInstruction* operand = instruction->operand(operand_no);
-  CHECK(ShapeUtil::IsArray(instruction->shape()));
-  CHECK(ShapeUtil::IsArray(operand->shape()));
+  CHECK(instruction->shape().IsArray());
+  CHECK(operand->shape().IsArray());
   if (!ShapeUtil::IsScalar(operand->shape()) &&
-      ShapeUtil::Rank(operand->shape()) ==
-          ShapeUtil::Rank(instruction->shape()) &&
+      operand->shape().rank() == instruction->shape().rank() &&
       !instruction_can_change_layout_func_(instruction)) {
     // Propagate the result layout to the operand layout if the instruction
     // requires the same layout out for the result and the operand.
@@ -1012,7 +1008,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     // operations. For similar reasons, if the operand and output have the same
     // rank, try to match the operand's layout to the output.
     if (ShapeUtil::TrueRank(operand->shape()) == 1 &&
-        ShapeUtil::Rank(instruction->shape()) == 1) {
+        instruction->shape().rank() == 1) {
       // Don't assign a layout in case of R1 -> effective R1 reshape.
       return nullptr;
     }
@@ -1026,7 +1022,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) {
       return absl::make_unique<Layout>(operand_shape.layout());
     }
-    if (ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)) {
+    if (operand_shape.rank() == output_shape.rank()) {
       *operand_shape.mutable_layout() = output_layout;
       if (ShapeUtil::ReshapeIsBitcast(operand_shape,
                                       output_shape_with_layout)) {
@@ -1045,7 +1041,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
 
   if (instruction->opcode() == HloOpcode::kTranspose) {
     // Pick the operand layout that makes the transpose a bitcast.
-    int64 rank = ShapeUtil::Rank(instruction->shape());
+    int64 rank = instruction->shape().rank();
     std::vector<int64> new_minor_to_major(rank);
     for (int64 i = 0; i < rank; ++i) {
       int64 output_dim = LayoutUtil::Minor(output_layout, i);
@@ -1066,11 +1062,10 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     int64 operand_no) {
   const HloInstruction* operand = user->operand(operand_no);
 
-  CHECK(ShapeUtil::IsArray(user->shape()) &&
-        ShapeUtil::IsArray(operand->shape()));
+  CHECK(user->shape().IsArray() && operand->shape().IsArray());
 
   if (!ShapeUtil::IsScalar(operand->shape()) &&
-      ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape()) &&
+      operand->shape().rank() == user->shape().rank() &&
       !instruction_can_change_layout_func_(user)) {
     // Assign users the same layout as the operand.
     return absl::make_unique<Layout>(operand_layout);
@@ -1083,7 +1078,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     // reshape is a bitcast when using the same layout. This may avoid copy
     // operations. For similar reasons, if the operand and output have the same
     // rank, try to match the outputs's layout to the operand.
-    if (ShapeUtil::Rank(operand->shape()) == 1 &&
+    if (operand->shape().rank() == 1 &&
         ShapeUtil::TrueRank(user->shape()) == 1) {
       // Don't assign a layout in case of R1 -> effective R1 reshape.
       return nullptr;
@@ -1098,7 +1093,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) {
       return absl::make_unique<Layout>(output_shape.layout());
     }
-    if (ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape)) {
+    if (operand->shape().rank() == output_shape.rank()) {
       *output_shape.mutable_layout() = operand_layout;
       if (ShapeUtil::ReshapeIsBitcast(output_shape,
                                       operand_shape_with_layout)) {
@@ -1117,7 +1112,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
 
   if (user->opcode() == HloOpcode::kTranspose) {
     // Pick the user layout that makes the transpose a bitcast.
-    int64 rank = ShapeUtil::Rank(user->shape());
+    int64 rank = user->shape().rank();
     std::vector<int64> new_minor_to_major(rank);
     auto inverse_dimensions = InversePermutation(user->dimensions());
     for (int64 i = 0; i < rank; ++i) {
@@ -1193,7 +1188,7 @@ std::vector<std::pair<const HloInstruction*, int64>> GetArrayUsesOfBuffer(
   CHECK(buffer.IsArray());
   std::vector<std::pair<const HloInstruction*, int64>> uses;
   for (const auto& buffer_alias : points_to_analysis.GetBufferAliases(buffer)) {
-    if (!ShapeUtil::IsArray(buffer_alias.instruction()->shape())) {
+    if (!buffer_alias.instruction()->shape().IsArray()) {
       continue;
     }
     // This alias must be the top-level (index == {}) of the instruction's
@@ -1227,7 +1222,7 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
         if (ShapeUtil::IsLeafIndex(shape_layout.shape(), index)) {
           for (const LogicalBuffer* buffer : buffers) {
             if (constraints->BufferLayout(*buffer) == nullptr &&
-                ShapeUtil::IsArray(buffer->shape())) {
+                buffer->shape().IsArray()) {
               TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
                   ShapeUtil::GetSubshape(shape_layout.shape(), index).layout(),
                   *buffer, /*mandatory=*/true));
@@ -1238,6 +1233,23 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
       });
 }
 
+namespace {
+// A transpose or a reshape that only changes trivial dimensions have meaningful
+// layouts that are valuable to propagate in a depthfirst manner to avoid
+// unassigned layouts in the graph.
+bool InstructionShouldPropagateDepthFirst(const HloInstruction& hlo) {
+  switch (hlo.opcode()) {
+    case HloOpcode::kReshape:
+      return std::get<0>(hlo.ReshapeMerelyInsertsOrDeletes1SizedDimensions());
+    case HloOpcode::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
 Status LayoutAssignment::PropagateOperandConstraint(
     const OperandLayoutConstraint& operand_constraint,
     LayoutConstraints* constraints) {
@@ -1258,11 +1270,10 @@ Status LayoutAssignment::PropagateOperandConstraint(
   // layout for the operands with the same ranks.
   const HloInstruction* operand = operand_constraint.operand();
   const HloInstruction* user = operand_constraint.instruction();
-  if (!ShapeUtil::IsArray(operand->shape())) {
+  if (!operand->shape().IsArray()) {
     return Status::OK();
   }
-  if (instruction_can_change_layout_func_(user) &&
-      !ShapeUtil::IsArray(user->shape())) {
+  if (instruction_can_change_layout_func_(user) && !user->shape().IsArray()) {
     return Status::OK();
   }
 
@@ -1273,7 +1284,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
     return Status::OK();
   }
 
-  int64 operand_rank = ShapeUtil::Rank(operand->shape());
+  int64 operand_rank = operand->shape().rank();
   if (operand_rank <= 1) {
     return Status::OK();
   }
@@ -1288,7 +1299,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
         continue;
       }
       const HloInstruction* sibling = user->operand(operand_no);
-      const int64 sibling_rank = ShapeUtil::Rank(sibling->shape());
+      const int64 sibling_rank = sibling->shape().rank();
       if (sibling_rank <= 1) {
         continue;
       }
@@ -1317,16 +1328,16 @@ Status LayoutAssignment::PropagateOperandConstraint(
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
         user->shape(),
         [&](const Shape& subshape, const ShapeIndex& shape_index) {
-          if (ShapeUtil::IsTuple(subshape)) {
+          if (subshape.IsTuple()) {
             return Status::OK();
           }
-          if (ShapeUtil::Rank(subshape) <= 1) {
+          if (subshape.rank() <= 1) {
             return Status::OK();
           }
 
           // Assign the right layout to input fusion of higher rank reduce
           // operations.
-          if (ShapeUtil::Rank(subshape) != ShapeUtil::Rank(operand->shape())) {
+          if (subshape.rank() != operand->shape().rank()) {
             return Status::OK();
           }
           // TODO(b/67641796): Are there cases except fusion that use this code
@@ -1354,10 +1365,10 @@ Status LayoutAssignment::PropagateOperandConstraint(
   }
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       user->shape(), [&](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsTuple(subshape)) {
+        if (subshape.IsTuple()) {
           return Status::OK();
         }
-        if (ShapeUtil::Rank(subshape) <= 1) {
+        if (subshape.rank() <= 1) {
           return Status::OK();
         }
         TF_ASSIGN_OR_RETURN(
@@ -1373,7 +1384,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
             TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
                 *layout, *buffer,
                 /*mandatory=*/user->opcode() == HloOpcode::kReduce,
-                /*dfs=*/false));
+                /*dfs=*/InstructionShouldPropagateDepthFirst(*user)));
           }
         }
         return Status::OK();
@@ -1401,8 +1412,8 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
     }
     if (!instruction_can_change_layout_func_(instruction)) {
       // Copy the layout to the operand.
-      if (buffer.IsArray() && ShapeUtil::IsArray(operand->shape()) &&
-          ShapeUtil::Rank(operand->shape()) ==
+      if (buffer.IsArray() && operand->shape().IsArray() &&
+          operand->shape().rank() ==
               LayoutUtil::MinorToMajor(buffer_constraint.layout()).size()) {
         TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
             buffer_constraint.layout(), instruction, operand_no,
@@ -1410,7 +1421,7 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
       }
     } else {
       if (!buffer.IsTopLevel() ||
-          !ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+          !instruction->operand(operand_no)->shape().IsArray()) {
         continue;  // Don't touch buffers that are internal to a tuple.
       }
       VLOG(6) << "Propagating constraint to operand " << operand_no << " of "
@@ -1423,11 +1434,9 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
             ChooseOperandLayoutFromOutputLayout(buffer_constraint.layout(),
                                                 instruction, operand_no);
         if (operand_layout != nullptr) {
-          // Do not propagate operand constraints of transposes and reshapes, it
-          // tends to create really bad layouts.
           TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
               *operand_layout, instruction, operand_no, /*mandatory=*/false,
-              /*dfs=*/false));
+              /*dfs=*/InstructionShouldPropagateDepthFirst(*instruction)));
         }
       } else {
         VLOG(6) << "Operand already has a constraint "
@@ -1497,7 +1506,7 @@ StatusOr<Layout> InferArrayLayout(
   // This function should only be called for array shapes which don't yet have
   // layouts.
   const Shape& subshape = ShapeUtil::GetSubshape(instruction->shape(), index);
-  TF_RET_CHECK(ShapeUtil::IsArray(subshape));
+  TF_RET_CHECK(subshape.IsArray());
   TF_RET_CHECK(!subshape.has_layout());
 
   // The instruction should not define the buffer at this index.
@@ -1576,8 +1585,9 @@ Status SetFusionLayouts(HloInstruction* fusion) {
           fused_instruction->mutable_shape()));
     } else if (fused_instruction->opcode() == HloOpcode::kInfeed) {
       // Nop; leave the infeed layout alone.
-    } else {
+    } else if (fusion->fusion_kind() != HloInstruction::FusionKind::kCustom) {
       // Other instructions don't have layouts inside of fusion nodes.
+      // But do not clear layouts for other instructions in custom fusion nodes.
       LayoutUtil::ClearLayout(fused_instruction->mutable_shape());
     }
   }
@@ -1615,7 +1625,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     for (const LogicalBuffer* buffer :
          constraints.points_to_analysis().GetBuffersDefinedByInstruction(
              instruction)) {
-      if (!ShapeUtil::IsArray(buffer->shape())) {
+      if (!buffer->shape().IsArray()) {
         continue;
       }
 
@@ -1639,7 +1649,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshapeWithStatus(
         instruction->mutable_shape(),
         [instruction, &constraints](Shape* subshape, const ShapeIndex& index) {
-          if (subshape->has_layout() || !ShapeUtil::IsArray(*subshape)) {
+          if (subshape->has_layout() || !subshape->IsArray()) {
             return Status::OK();
           }
           // Set Layout of subshape to match layout of LogicalBuffer which
@@ -2100,8 +2110,8 @@ bool LayoutAssignment::InstructionCanChangeLayout(
 
 /* static */
 bool LayoutAssignment::IsAtMostRank1(const Shape& shape) {
-  if (ShapeUtil::IsArray(shape)) {
-    return ShapeUtil::Rank(shape) <= 1;
+  if (shape.IsArray()) {
+    return shape.rank() <= 1;
   }
   return absl::c_all_of(shape.tuple_shapes(), [](const Shape& subshape) {
     return IsAtMostRank1(subshape);
@@ -2123,7 +2133,7 @@ Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
     for (HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       if (instruction->opcode() == HloOpcode::kCopy &&
-          added_copies_.count(instruction) > 0) {
+          added_copies_.contains(instruction)) {
         VLOG(5) << "Removing added copy: " << instruction->ToString();
         TF_RETURN_IF_ERROR(
             instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 3b081de3c7826c3c11a7d87d542835d0ecce1b7e..5701cb5b025e563247d46d0d24f81a5f886fc23b 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -243,7 +243,7 @@ class ChannelLayoutConstraints {
 
   // Returns true if channel_id has a layout constraint.
   bool IsChannelConstrained(int64 channel_id) const {
-    return constraints_.count(channel_id) > 0;
+    return constraints_.contains(channel_id);
   }
 
   // Given `shape`, apply the layout for `channel_id`. `channel_id` must already
@@ -276,7 +276,7 @@ class ChannelLayoutConstraints {
   }
 
  private:
-  std::unordered_map<int64, Layout> constraints_;
+  absl::flat_hash_map<int64, Layout> constraints_;
 };
 
 // HLO pass which assigns layouts to all instructions in the HLO module while
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 31d78752f07c57aef6023fabb8e3a7de20c4278c..c8cf3c47d380012fdb0206c0d20d67e6a13017ae 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -528,8 +528,7 @@ class OperandsMustBeTheSameLayoutAssignment : public LayoutAssignment {
     for (int64 operand_no = 0; operand_no < instruction->operand_count();
          ++operand_no) {
       const HloInstruction* operand = instruction->operand(operand_no);
-      if (ShapeUtil::Rank(instruction->shape()) !=
-          ShapeUtil::Rank(operand->shape())) {
+      if (instruction->shape().rank() != operand->shape().rank()) {
         continue;
       }
       TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
@@ -961,8 +960,9 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
     ENTRY CopyDSliceOperandToAvoidImplicitLayoutChange {
       par0 = f32[3,4]{1,0} parameter(0)
       par1 = f32[4,5]{0,1} parameter(1)
-      par2 = s32[2] parameter(2)
-      dslice0 = f32[3,4] dynamic-slice(par1, par2), dynamic_slice_sizes={3,4}
+      par2 = s32[] parameter(2)
+      par3 = s32[] parameter(3)
+      dslice0 = f32[3,4] dynamic-slice(par1, par2, par3), dynamic_slice_sizes={3,4}
       ROOT add0 = f32[3,4]{1,0} add(par0,dslice0)
     }
   )";
@@ -983,7 +983,7 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
                   m::Parameter(),
                   m::DynamicSlice(
                       m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
-                      m::Parameter(2)))));
+                      m::Parameter(2), m::Parameter(3)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 728a66b388f0f9af480ff88b5e96990a26e36af5..c5d59fb28e02ce229967fb3856012d608fb83c5d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -39,7 +39,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:logical_buffer",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
@@ -169,6 +168,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index 643ecd0fbaa546c551097b29e74ccd49418e1466..ce3d922ca7a9bdea3a520959a8b8d284bc3e0d64 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -81,9 +81,7 @@ void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
     if (hlo.opcode() == HloOpcode::kParameter) {
       const std::vector<HloInstruction*>& parameter_instructions =
           module_.entry_computation()->parameter_instructions();
-      if (std::find(parameter_instructions.begin(),
-                    parameter_instructions.end(),
-                    &hlo) != parameter_instructions.end()) {
+      if (absl::c_linear_search(parameter_instructions, &hlo)) {
         array->MarkInvariantOverWholeProgram(context_);
       }
     }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
index 2b46b3c3964b15548dbacc8b0ada0047a0fa85b6..12e2f449e23ac2511aac576fed893f5a9ef510c0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
@@ -76,15 +76,12 @@ class AliasAnalysis {
   // A map from a buffer slice to metadata corresponding to its alias.scope
   // metadata.  The index kParameterAliasSet is used to hold aliasing
   // information for parameters.
-  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
-                      BufferAllocation::Slice::Hasher>
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*>
       alias_scope_metadata_;
 
   // A map from a buffer slice to metadata corresponding to its noalias
   // metadata.
-  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
-                      BufferAllocation::Slice::Hasher>
-      noalias_metadata_;
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*> noalias_metadata_;
 };
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
index bdce4a171b8a58f617f1d56e6cf6db5354846703..c2c6405cdad28196a4793887c8c5cc5b87ee5301 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -45,7 +45,10 @@ string ConstantBufferAllocationToGlobalName(
     const BufferAllocation& allocation) {
   string instr_name = InstrForConstantBufferAllocation(allocation).name();
   for (char& c : instr_name) {
-    if (c == '.') {
+    // Having a hyphen in a global variable name can crash the LLVM PTX backend.
+    // LLVM is able to generate unique global variable names using the string
+    // returned from here as name prefix.
+    if (c == '.' || c == '-') {
       c = '_';
     }
   }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index 4d7f36d9f8b565a819edf0631efc5c7a58c4f87f..c66eaec8fb0e4c03f6967fec0cf0ae9661cdf470 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -36,19 +36,20 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
 // EmitFusedDynamicUpdateSliceInPlace.
 //
 // Emits a sequential loop if launch_dimensions is null.
+using IndexGenerator = std::function<StatusOr<llvm::Value*>(int64)>;
+
 static Status EmitDynamicUpdateSliceInPlaceImpl(
-    const Shape& update_shape, const ElementGenerator& start_indices_generator,
+    const Shape& update_shape, const IndexGenerator& start_indices_generator,
     bool is_signed, ElementGenerator update_array_generator,
     const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
     absl::string_view name, llvm::IRBuilder<>* b) {
   const Shape& output_shape = output_array.GetShape();
 
   // Read start indices from start_indices_generator.
-  const int64 rank = ShapeUtil::Rank(output_shape);
+  const int64 rank = output_shape.rank();
   IrArray::Index start_index(b->getInt64Ty(), rank);
   for (int64 i = 0; i < rank; ++i) {
-    IrArray::Index dim_index({b->getInt64(i)});
-    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
+    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(i));
     llvm::Value* output_dim_size = llvm::ConstantInt::get(
         start_index[i]->getType(), output_shape.dimensions(i));
     llvm::Value* update_dim_size = llvm::ConstantInt::get(
@@ -112,9 +113,20 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
   Shape output_shape = output_array.GetShape();
   Shape update_shape = update_array.GetShape();
 
-  ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
-    return start_indices_array.EmitReadArrayElement(index, b);
-  };
+  IndexGenerator start_indices_generator;
+  // TODO(b/118437727): Remove the R1 path, and rename the variables.
+  if (start_indices_array.GetShape().rank() == 1) {
+    start_indices_generator = [&](int64 index) {
+      return start_indices_array.EmitReadArrayElement(
+          IrArray::Index({b->getInt64(index)}), b);
+    };
+  } else {
+    start_indices_generator = [&](int64 index) {
+      return operand_arrays[2 + index].EmitReadArrayElement(
+          IrArray::Index(b->getInt64Ty()), b);
+    };
+  }
+
   ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
     return update_array.EmitReadArrayElement(index, b);
   };
@@ -165,8 +177,21 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
                                elemental_emitter);
   TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
   ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
-  ElementGenerator start_indices_generator =
-      fused_emitter.GetGenerator(start_indices);
+
+  // TODO(b/118437727): Remove the R1 path, and rename the variables.
+  IndexGenerator start_indices_generator;
+  if (start_indices->shape().rank() == 1) {
+    start_indices_generator = [&](int64 index) {
+      return fused_emitter.GetGenerator(start_indices)(
+          IrArray::Index({b->getInt64(index)}));
+    };
+  } else {
+    start_indices_generator = [&](int64 index) {
+      ElementGenerator element_generator =
+          fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
+      return element_generator(IrArray::Index(b->getInt64Ty()));
+    };
+  }
 
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
   return EmitDynamicUpdateSliceInPlaceImpl(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index 38f2b5da23a7b92e4547dceaba011ce654977da3..e440f05e2b2f0d4a2a4c7b326b4881183de4d235 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -35,7 +35,7 @@ using llvm_ir::IrArray;
 Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
   indexed_generators_[hlo] =
       [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-    if (generated_value_cache_[hlo].count(index.multidim()) > 0) {
+    if (generated_value_cache_[hlo].contains(index.multidim())) {
       llvm::Value* generated_value =
           generated_value_cache_[hlo][index.multidim()];
       llvm::BasicBlock* generated_value_bb = nullptr;
@@ -115,7 +115,7 @@ Status FusedIrEmitter::HandleGetTupleElement(
         /*alignment=*/1, tuple_ptr, b_, module_);
   };
 
-  if (!ShapeUtil::IsTuple(get_tuple_element->shape())) {
+  if (!get_tuple_element->shape().IsTuple()) {
     indexed_generators_[get_tuple_element] =
         [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
       // TODO(b/34080002) Add aliasing information to tuple element IrArray.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 1b9c61f6700e2a1309b21e499f4a9e2439ed3702..e6d52a580c04a920d3f0e8ed6f39c1cae587cf1b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <map>
 #include <unordered_map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
@@ -134,8 +135,9 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
 
   // Cache of generated values, lest we regenerate an element of a node with
   // multiple outgoing edges
-  std::unordered_map<const HloInstruction*,
-                     std::map<std::vector<llvm::Value*>, llvm::Value*>>
+  absl::flat_hash_map<
+      const HloInstruction*,
+      absl::flat_hash_map<std::vector<llvm::Value*>, llvm::Value*>>
       generated_value_cache_;
 };
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 67f7423121177e2ca1e3384341dad2644c8f5e34..8ee07ae8331e986f9d271be5e39065f0d87853b1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -61,7 +61,7 @@ void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
 
 IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
                       llvm::IRBuilder<>* b)
-    : multidim_(ShapeUtil::Rank(shape)),
+    : multidim_(shape.rank()),
       linear_(linear),
       layout_(shape.layout()),
       dims_(shape.dimensions().begin(), shape.dimensions().end()) {
@@ -104,8 +104,8 @@ IrArray::Index::Index(absl::Span<llvm::Value* const> multidim,
   CHECK(LayoutUtil::HasLayout(shape));
 }
 
-IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
-    : base_ptr_(base_ptr), shape_(&shape) {
+IrArray::IrArray(llvm::Value* base_ptr, Shape shape)
+    : base_ptr_(base_ptr), shape_(std::move(shape)) {
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
   CHECK(base_ptr_->getType()->isPointerTy());
   int depth = 0;
@@ -117,10 +117,10 @@ IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
     ++depth;
   }
 
-  if (!ShapeUtil::IsArray(*shape_) || ShapeUtil::IsScalar(*shape_)) {
+  if (!shape_->IsArray() || ShapeUtil::IsScalar(*shape_)) {
     DCHECK(depth == 1 || depth == 0) << depth;
   } else {
-    DCHECK_EQ(depth, ShapeUtil::Rank(*shape_)) << shape.ShortDebugString();
+    DCHECK_EQ(depth, shape_->rank()) << shape.ShortDebugString();
   }
 }
 
@@ -137,12 +137,12 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape(
     const Shape& output_shape, const Shape& input_shape,
     llvm::IRBuilder<>* builder) const {
   const auto& target_index = *this;
-  CHECK_EQ(target_index.size(), ShapeUtil::Rank(output_shape));
+  CHECK_EQ(target_index.size(), output_shape.rank());
   std::vector<std::pair<int64, int64>> common_factors =
       CommonFactors(AsInt64Slice(input_shape.dimensions()),
                     AsInt64Slice(output_shape.dimensions()));
   std::vector<llvm::Value*> source_multidim_index(
-      ShapeUtil::Rank(input_shape), llvm::UndefValue::get(index_type_));
+      input_shape.rank(), llvm::UndefValue::get(index_type_));
   // We compute the source indices in each common factor from only the target
   // indices in the same common factor.
   for (ssize_t k = common_factors.size() - 2; k >= 0; --k) {
@@ -257,7 +257,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
     const Shape& shape, const Shape& operand_shape,
     absl::Span<const int64> dimension_mapping,
     llvm::IRBuilder<>* builder) const {
-  int64 rank = ShapeUtil::Rank(operand_shape);
+  int64 rank = operand_shape.rank();
   std::vector<llvm::Value*> source_index(rank);
   for (int64 i = 0; i < rank; ++i) {
     source_index[i] = multidim_[dimension_mapping[i]];
@@ -271,7 +271,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
   // The other dimensions can be masked out with a div and a mod operation.
   std::vector<int64> logical_to_physical =
       LayoutUtil::MakeLogicalToPhysical(shape.layout());
-  int64 output_rank = ShapeUtil::Rank(shape);
+  int64 output_rank = shape.rank();
   // The minimum physical dimension that is broadcasted.
   int64 min_broadcasted_dimension = output_rank;
   // The maximum physical dimension that is broadcasted.
@@ -348,7 +348,7 @@ llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
     // over higher-rank arrays.
     return base_ptr_;
   }
-  CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_));
+  CHECK_EQ(index.size(), shape_->rank());
 
   if (index.LinearValidOnShape(*shape_)) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index d6d84994ee147f4b8c1a333b0eaccdf6e0a2219b..b706ebd311cbb706e7e4698b93319e37e664d10a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -130,6 +130,11 @@ class IrArray {
       CHECK_LE(index, size());
       mutable_multidim().insert(mutable_multidim().begin() + index, value);
     }
+    void InsertAt(int64 index, int64 count, llvm::Value* value) {
+      CHECK_LE(index, size());
+      mutable_multidim().insert(mutable_multidim().begin() + index, count,
+                                value);
+    }
 
     using iterator = std::vector<llvm::Value*>::iterator;
     using const_iterator = std::vector<llvm::Value*>::const_iterator;
@@ -189,6 +194,8 @@ class IrArray {
       return llvm::ConstantInt::get(index_type_, c);
     }
 
+    void ClearLinearIndex() { linear_ = nullptr; }
+
    private:
     // Changing the multi-dimensional index invalidates the linear index.
     std::vector<llvm::Value*>& mutable_multidim() {
@@ -220,11 +227,11 @@ class IrArray {
   };
 
   // Default constructor. Constructs an IrArray in a null status.
-  IrArray() : base_ptr_(nullptr), shape_(nullptr) {}
+  IrArray() : base_ptr_(nullptr) {}
 
   // Construct an IrArray with the given base pointer and shape. base_ptr is a
   // pointer type pointing to the first element(lowest address) of the array.
-  IrArray(llvm::Value* base_ptr, const Shape& shape);
+  IrArray(llvm::Value* base_ptr, Shape shape);
 
   // Default implementations of copying and moving.
   IrArray(IrArray&& other) = default;
@@ -236,7 +243,6 @@ class IrArray {
   llvm::Type* GetElementLlvmType() const { return element_type_; }
 
   const Shape& GetShape() const {
-    CHECK(shape_ != nullptr);
     return *shape_;
   }
 
@@ -331,7 +337,7 @@ class IrArray {
   llvm::Type* element_type_;
 
   // Shape of the XLA array.
-  const Shape* shape_;
+  absl::optional<Shape> shape_;
 
   // The list of key/value pairs used when attaching metadata to emitted
   // loads/stores for this array.  They keys are the metadata kinds and the
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
index abc06fb7b4245294df2dc20d25a22ac4fdaeb4cf..cf5083e8c13b9485035923895cec1ad05049c644 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
@@ -254,6 +254,11 @@ class IrBuilderMixin {
     return mixin_builder()->CreateFCmpOLT(std::forward<Args>(args)...);
   }
 
+  template <class... Args>
+  llvm::Value* FCmpOLE(Args&&... args) {
+    return mixin_builder()->CreateFCmpOLE(std::forward<Args>(args)...);
+  }
+
   template <class... Args>
   llvm::Value* FCmpONE(Args&&... args) {
     return mixin_builder()->CreateFCmpONE(std::forward<Args>(args)...);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index cebbc4290163d4e98003cd7b5df6ec906509a446..cd8dd72cd775d5e0b52f96a2326367da0775e7eb 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -123,7 +123,8 @@ KernelMappingScheme::KernelMappingScheme(
       dims_in_elems_(dims_in_elems.begin(), dims_in_elems.end()),
       tile_sizes_{1, tile_size_y, tile_size_x},
       num_threads_x_(num_threads_x),
-      num_threads_y_(num_threads_y) {
+      num_threads_y_(num_threads_y),
+      dilated_x_(true) {
   DCHECK_EQ(dims_in_elems_.size(), 3);
   DCHECK_EQ(req_block_sizes.size(), 3);
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index fb633b12e60d1a9f3103fb2919ad2c3f3f14de20..f802cc27d519e621262f328903697373aa8c284c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -117,7 +117,10 @@ class KernelMappingScheme {
   int64 GetNumberOfTilesInOneBlock() const {
     return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
   }
-
+  int64 GetNumberOfTilesInOneBlockForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return block_sizes_[d];
+  }
   int64 GetNumberOfBlocks() const {
     return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
   }
@@ -147,6 +150,16 @@ class KernelMappingScheme {
            GetNumberOfThreadsForDimensionY();
   }
 
+  bool DilatedX() const { return dilated_x_; }
+  void SetDilatedX(bool v) {
+    dilated_x_ = v;
+    if (!dilated_x_) {
+      // dilated_x_=false is for the purpose of vectorization, which requires
+      // GetTileSizeForDimension(DimX) to be a multiplier of num_threads_x_.
+      CHECK_EQ(GetTileSizeForDimension(DimX) % num_threads_x_, 0);
+    }
+  }
+
   IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
   // Returns the index for the first tile in the block with the given block
   // index.
@@ -186,6 +199,13 @@ class KernelMappingScheme {
   int64 num_threads_x_;
   // Number of threads used to process elements in the Y direction of a tile.
   int64 num_threads_y_;
+
+  // When num_threads_x threads process a total of tile_size_x elements in the
+  // X dimension of a tile, each threads process n=tile_size_x/num_threads_x
+  // elements. When dilated_x=false, the n elements processed by a thread are
+  // contiguous. On the other hand, when dilated_x=true the n elements are
+  // dilated by a factor of num_threads_x.
+  bool dilated_x_;
 };
 
 // A class to represent information for tiled parameters to support IR emission
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 219a9f221fbd116cdfbaf17985e21d82aefd079d..fe320bbe727111fbc986cc1fbc217feed74d30f1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -235,7 +235,7 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
 
 IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
                                              absl::string_view suffix) {
-  std::vector<int64> dimensions(ShapeUtil::Rank(shape));
+  std::vector<int64> dimensions(shape.rank());
   std::iota(dimensions.begin(), dimensions.end(), 0);
   return AddLoopsForShapeOnDimensions(shape, dimensions, suffix);
 }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index ceea24685af566e02340664f0a40c398c62b5ab0..807296329c07b8e4ac630486a1e1f59e4fdfa009 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -188,7 +188,16 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
       }
       return cplx_t;
     }
-    // A Tuple contains an array of pointers. Use i8*.
+    case C128: {
+      auto cplx_t = module->getTypeByName("complex128");
+      if (cplx_t == nullptr) {
+        return llvm::StructType::create(
+            {llvm::Type::getDoubleTy(module->getContext()),
+             llvm::Type::getDoubleTy(module->getContext())},
+            "complex128", /*isPacked=*/true);
+      }
+      return cplx_t;
+    }  // A Tuple contains an array of pointers. Use i8*.
     case TUPLE:
     // An Opaque is like a void*, use i8*.
     case OPAQUE:
@@ -219,10 +228,10 @@ int GetSizeInBits(llvm::Type* type) {
 
 llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
   llvm::Type* result_type = PrimitiveTypeToIrType(shape.element_type(), module);
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     // A tuple buffer is an array of pointers.
     result_type = llvm::ArrayType::get(result_type, shape.tuple_shapes_size());
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
       result_type =
           llvm::ArrayType::get(result_type, shape.dimensions(dimension));
@@ -621,6 +630,10 @@ llvm::Function* CreateFunction(llvm::FunctionType* function_type,
   function->setCallingConv(llvm::CallingConv::C);
   function->addFnAttr("no-frame-pointer-elim", "false");
 
+  // Generate unwind information so that GDB can crawl through the stack frames
+  // created by the JIT compiled code.
+  function->setHasUWTable();
+
   if (enable_fast_math) {
     function->addFnAttr("unsafe-fp-math", "true");
     function->addFnAttr("no-infs-fp-math", "true");
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 6a9406bfebafcc02dc2e144b62284a9e83c3edeb..89b6a36f96beedbcb7322e6164ac59221650d3d8 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -322,7 +322,7 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
   // comparisons).
 
   const Shape& keys_shape = keys_array.GetShape();
-  int64 rank = ShapeUtil::Rank(keys_shape);
+  int64 rank = keys_shape.rank();
   int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   std::vector<int64> dimensions_in_iteration_order(rank);
   std::vector<int64> iteration_order_to_logical_order(rank);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
index a60643bc754f896d096b3ca4e1216e77d7e384c6..d8d2700e1934fd202d44a1dc60e71a99913d4537 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
@@ -93,7 +93,7 @@ llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
   llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr);
 
   // Mark the loaded pointer as dereferenceable if we know its shape.
-  if (!ShapeUtil::IsOpaque(target_shape)) {
+  if (!target_shape.IsOpaque()) {
     SetDereferenceableMetadataForLoad(
         src_buffer,
         ByteSizeOf(target_shape, src_buffer->getModule()->getDataLayout()));
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 600b069ecdbabf6b05e6abb3a6b8d9b1a4b0ecf4..3470fe5b2c34bf832207ed546fad176319446f31 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -110,6 +110,7 @@ ExecutionOptions CreateExecutionOptions(
     *execution_options.mutable_shape_with_output_layout() =
         result_shape.ToProto();
   }
+  execution_options.set_num_replicas(build_options.num_replicas());
   return execution_options;
 }
 
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index 9ccdd7d8d818b9fa3aa77cdd10d37ca18928b448..53d52d9a3d918fa6dee093668923fcfff963d084 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -198,7 +198,7 @@ void MultiOutputFusion::Update(HloInstruction* instr1, HloInstruction* instr2) {
     if (instr == fusion || is_fused(instr) || is_connected(fusion, instr)) {
       continue;
     }
-    if (in_list.count(instr) > 0) {
+    if (in_list.contains(instr)) {
       continue;
     }
     int64 profit = GetProfit(instr, fusion);
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index daa718879ddd45afb02725b557380b2f49fe833e..e55b83d17e90bc2ca0053a0421cf80ef6edd5bca 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -28,13 +29,13 @@ namespace {
 
 bool IsAllowed(char character) {
   auto c = static_cast<unsigned char>(character);
-  return (isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
+  return (absl::ascii_isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
 }
 
 }  // namespace
 
 NameUniquer::NameUniquer(const string& separator) {
-  CHECK(std::all_of(separator.begin(), separator.end(), IsAllowed))
+  CHECK(absl::c_all_of(separator, IsAllowed))
       << "separator should comprises allowed characters only";
   separator_ = separator;
 }
@@ -46,7 +47,7 @@ NameUniquer::NameUniquer(const string& separator) {
 
   string result = name;
   char c = static_cast<unsigned char>(result[0]);
-  if (!isalpha(c) && c != '_') {
+  if (!absl::ascii_isalpha(c) && c != '_') {
     result[0] = '_';
   }
   for (int i = 1; i < result.length(); i++) {
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index fdb6a9b01be4b9198e40aa9bf7cdc07ff068a619..9e3d1060210790f60243195a1c1dff13f1fc7fc5 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -775,7 +775,7 @@ class ShapePatternIsArrayImpl {
   explicit constexpr ShapePatternIsArrayImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    if (!ShapeUtil::IsArray(*shape)) {
+    if (!shape->IsArray()) {
       EXPLAIN << "Shape is not an array";
       return false;
     }
@@ -793,7 +793,7 @@ class ShapePatternIsTupleImpl {
   explicit constexpr ShapePatternIsTupleImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    if (!ShapeUtil::IsTuple(*shape)) {
+    if (!shape->IsTuple()) {
       EXPLAIN << "Shape is not a tuple";
       return false;
     }
@@ -831,7 +831,7 @@ class ShapePatternRankImpl {
   explicit constexpr ShapePatternRankImpl(int64 rank) : rank_(rank) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    if (ShapeUtil::Rank(*shape) != rank_) {
+    if (shape->rank() != rank_) {
       if (rank_ == 0) {
         EXPLAIN << "Shape is not a scalar";
       } else {
@@ -1878,7 +1878,7 @@ class HloInstructionPattern {
   // Make this a templated function to work around gcc 4.9.4 template infinite
   // recursion bug.
   template <typename Dummy = void>
-  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape)
+  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const
       -> decltype(this->WithShape(Shape().EqualTo(shape))) {
     return WithShape(Shape().EqualTo(shape));
   }
@@ -1886,7 +1886,7 @@ class HloInstructionPattern {
   // Make this a templated function to work around gcc 4.9.4 template infinite
   // recursion bug.
   template <typename Dummy = void>
-  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape)
+  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const
       -> decltype(this->WithShape(Shape().CompatibleTo(shape))) {
     return WithShape(Shape().CompatibleTo(shape));
   }
@@ -2057,7 +2057,6 @@ XLA_UNOP_PATTERN(SendDone)
 XLA_UNOP_PATTERN(Sign)
 XLA_UNOP_PATTERN(Sin)
 XLA_UNOP_PATTERN(Slice)
-XLA_UNOP_PATTERN(Sort)
 XLA_UNOP_PATTERN(Tanh)
 XLA_UNOP_PATTERN(Transpose)
 #undef XLA_UNOP_PATTERN
@@ -2119,7 +2118,6 @@ XLA_BINOP_PATTERN(Divide)
 XLA_BINOP_PATTERN(Complex)
 XLA_BINOP_PATTERN(Convolution)
 XLA_BINOP_PATTERN(Dot)
-XLA_BINOP_PATTERN(DynamicSlice)
 XLA_COMMUTATIVE_BINOP_PATTERN(Eq)
 XLA_BINOP_PATTERN(Gather)
 XLA_BINOP_PATTERN(Ge)
@@ -2236,8 +2234,10 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg,
 XLA_VARIADIC_OP_PATTERN(AfterAll);
 XLA_VARIADIC_OP_PATTERN(Concatenate);
 XLA_VARIADIC_OP_PATTERN(CustomCall);
+XLA_VARIADIC_OP_PATTERN(DynamicSlice)
 XLA_VARIADIC_OP_PATTERN(Map)
 XLA_VARIADIC_OP_PATTERN(Reduce);
+XLA_VARIADIC_OP_PATTERN(Sort);
 XLA_VARIADIC_OP_PATTERN(Tuple);
 
 // Helpers for matching non-constant instructions.
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
index 9ca2fb05c1f7ef093c58237cf21fbc7c813a592a..f51a18b13894d75300c46835fabd82a4ce0699af 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
@@ -23,7 +23,6 @@ namespace xla {
 namespace {
 
 namespace m = ::xla::match;
-using ::testing::Eq;
 using ::testing::Not;
 
 template <typename MatchedTy>
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index 896b73cda41cb21b539b586aa4701c5bad43f8b9..886a0545624927fa77528141f61d8ecb6bec180a 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -70,6 +70,9 @@ PlatformUtil::GetSupportedPlatforms() {
   for (se::Platform* platform : all_platforms) {
     auto compiler_status = Compiler::GetForPlatform(platform);
     if (compiler_status.ok()) {
+      if (!platform->Initialized()) {
+        TF_RETURN_IF_ERROR(platform->Initialize({}));
+      }
       platforms.push_back(platform);
     } else {
       LOG(INFO) << "platform " << platform->Name() << " present but no "
@@ -260,8 +263,8 @@ PlatformUtil::GetStreamExecutors(
     // Block here in thread_pool destructor until all devices are initialized.
   }
   VLOG(1) << "Device initialization complete";
-  if (std::all_of(stream_executors.begin(), stream_executors.end(),
-                  [](se::StreamExecutor* s) { return s == nullptr; })) {
+  if (absl::c_all_of(stream_executors,
+                     [](se::StreamExecutor* s) { return s == nullptr; })) {
     return InternalError("no supported devices found for platform %s",
                          platform->Name());
   }
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 4df746fca9f8320eed72911726f33bb01f06fed5..a62118df157edf67114ff41befbdce3da129fe93 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -226,7 +226,10 @@ StatusOr<bool> PerformSinkReshapeOrTranspose(
     // changes, so all the fused instructions have the same dimensions.
     for (const auto& fused_instruction : instruction->fused_instructions()) {
       Shape* shape = fused_instruction->mutable_shape();
-      *shape->mutable_dimensions() = new_operand_shape.dimensions();
+      shape->clear_dimensions();
+      for (int64 i : new_operand_shape.dimensions()) {
+        shape->add_dimensions(i);
+      }
       *shape->mutable_layout() = new_operand_shape.layout();
     }
   }
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
index 11c2f8392d285095816dd5d61f7029c1bfd158d4..acad871c4d427b174ffce3a462a0a3918a1e0c33 100644
--- a/tensorflow/compiler/xla/service/scatter_expander.cc
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -26,7 +26,6 @@ limitations under the License.
 
 namespace xla {
 
-
 // Transposes the given scatter_indices such that the index_vector_dim becomes
 // the most-minor dimension.
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
@@ -60,6 +59,13 @@ static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
   TF_ASSIGN_OR_RETURN(
       HloInstruction * transposed_scatter_indices,
       TransposeIndexVectorDimToLast(scatter_indices, index_vector_dim));
+  if (scatter_indices->shape().rank() == index_vector_dim + 1 &&
+      scatter_indices->shape().dimensions(index_vector_dim) == 1) {
+    auto new_shape =
+        ShapeUtil::DeleteDimension(index_vector_dim, scatter_indices->shape());
+    TF_ASSIGN_OR_RETURN(scatter_indices,
+                        MakeReshapeHlo(new_shape, scatter_indices));
+  }
   bool indices_are_scalar =
       index_vector_dim == scatter_indices->shape().dimensions_size();
 
@@ -88,7 +94,7 @@ static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
 static StatusOr<HloInstruction*> PermuteScatterAndWindowDims(
     HloInstruction* updates, absl::Span<const int64> update_window_dims) {
   std::vector<int64> permutation;
-  const int64 updates_rank = ShapeUtil::Rank(updates->shape());
+  const int64 updates_rank = updates->shape().rank();
   permutation.reserve(updates_rank);
 
   for (int64 i = 0; i < updates_rank; ++i) {
@@ -165,10 +171,9 @@ static StatusOr<HloInstruction*> CheckIndexValidity(
   // Valid range for the index: [0, operand_dims - window_sizes]
 
   // Check if the index has any negative values.
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * zero_index,
+  HloInstruction* zero_index =
       BroadcastZeros(computation, index->shape().element_type(),
-                     AsInt64Slice(index->shape().dimensions())));
+                     AsInt64Slice(index->shape().dimensions()));
   TF_ASSIGN_OR_RETURN(HloInstruction * negative_index_check,
                       MakeBinaryHlo(HloOpcode::kLe, zero_index, index));
 
@@ -214,15 +219,11 @@ static StatusOr<std::vector<HloInstruction*>> ScatterLoopBody(
   HloInstruction* updates = loop_state[2];
 
   bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1;
-  CHECK_EQ(has_scalar_indices,
-           dim_numbers.index_vector_dim() ==
-               scatter->operand(1)->shape().dimensions_size());
 
   // Build a vector form of the induction variable of the while loop.
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * induction_var_as_vector,
+  HloInstruction* induction_var_as_vector =
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                       /*result_shape_bounds=*/{1}));
+                       /*result_shape_bounds=*/{1});
 
   // Pick the index to scatter from scatter_indices based on the induction_var
   // and transform that to an index into the `operand` space.
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index a0126f39b3dc4281abedc36a19dd20c3b128e249..83434528a21b16cad7c831e7d9cc42d436634540 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
@@ -295,11 +296,16 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     computation_layout->mutable_result_layout()->SetToDefaultLayout();
   }
 
-  config->set_replica_count(options_.number_of_replicas());
   if (execution_options != nullptr) {
+    if (execution_options->num_replicas() > 0) {
+      config->set_replica_count(execution_options->num_replicas());
+    } else {
+      config->set_replica_count(options_.number_of_replicas());
+    }
     config->set_seed(execution_options->seed());
     config->set_debug_options(execution_options->debug_options());
   } else {
+    config->set_replica_count(options_.number_of_replicas());
     config->set_debug_options(GetDebugOptionsFromFlags());
   }
 
@@ -523,13 +529,13 @@ Service::ExecuteParallelAndRegisterResult(
 
 StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     Executable* executable,
-    const absl::Span<const std::vector<const ShapedBuffer*>> arguments,
-    Backend* backend, const string& result_tag, ExecutionProfile* profile) {
+    absl::Span<const std::vector<const ShapedBuffer*>> arguments,
+    Backend* backend, const DeviceHandle& device_handle,
+    const string& result_tag, ExecutionProfile* profile) {
   // Set up streams.
   std::vector<StreamPool::Ptr> streams;
 
-  TF_ASSIGN_OR_RETURN(auto replicas,
-                      Replicas(*backend, SingleComputationDeviceHandle()));
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handle));
   TF_RET_CHECK(!replicas.empty());
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream,
@@ -537,10 +543,11 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     streams.push_back(std::move(stream));
   }
 
-  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
-                      backend->computation_placer()->AssignDevices(
-                          options_.number_of_replicas(),
-                          /*computation_count=*/1));
+  DeviceAssignment device_assignment(options_.number_of_replicas(),
+                                     /*computation_count=*/1);
+  for (int64 replica = 0; replica < replicas.size(); ++replica) {
+    device_assignment(replica, 0) = replicas[replica]->device_ordinal();
+  }
 
   // Set up run options.
   std::vector<ServiceExecutableRunOptions> run_options;
@@ -552,9 +559,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
     options.set_device_assignment(&device_assignment);
-    run_options.emplace_back(
-        options, backend->StreamBorrower(),
-        /*xla_intra_op_thread_pool=*/backend->eigen_intra_op_thread_pool());
+    run_options.emplace_back(options, backend->StreamBorrower());
   }
 
   if (options_.number_of_replicas() == 1) {
@@ -711,14 +716,33 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
     }
   }
 
-  // Execute the generated executables in parallel and return the device
-  // handles for each computation's output.
+  // If we have multiple executables to run, execute them all in parallel.  But
+  // if we only have one executable, execute it using the vanilla, non-parallel
+  // call.
+  //
+  // We do this because the Client API uses ExecuteGraphParallel when it wants
+  // to compile and run one computation without caching the executable, but not
+  // all backends support the async StreamExecutor API required by
+  // ExecuteParallelAndRegisterResult.
+  //
+  // TODO(b/122731460): Consolidate Execute{,Parallel}AndRegisterResult; they do
+  // basically the same thing.
   ExecutionProfile profile;
-  TF_ASSIGN_OR_RETURN(
-      std::vector<GlobalDataHandle> outputs,
-      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
-                                       execute_backend_.get(), device_handles,
-                                       computation_names, &profile));
+  std::vector<GlobalDataHandle> outputs;
+  if (executable_ptrs.size() == 1) {
+    TF_ASSIGN_OR_RETURN(
+        auto output,
+        ExecuteAndRegisterResult(executable_ptrs[0], all_arguments[0],
+                                 execute_backend_.get(), device_handles[0],
+                                 computation_names[0], &profile));
+    outputs.push_back(std::move(output));
+  } else {
+    TF_ASSIGN_OR_RETURN(
+        outputs, ExecuteParallelAndRegisterResult(
+                     executable_ptrs, all_arguments, execute_backend_.get(),
+                     device_handles, computation_names, &profile));
+  }
+
   for (const GlobalDataHandle& output : outputs) {
     ExecuteResponse response;
     *response.mutable_output() = output;
@@ -904,6 +928,7 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
       *result->mutable_output(),
       ExecuteAndRegisterResult(executable.get(), replicated_arguments,
                                execute_backend_.get(),
+                               SingleComputationDeviceHandle(),
                                "result of " + executable->module().name(),
                                result->mutable_profile()));
 
@@ -1097,9 +1122,12 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(arg->computation(), config));
 
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference,
+                      DynamicDimensionInference::Run(module.get()));
+
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate<Literal>(
-                                               *module, /*arg_literals=*/{}));
+  evaluator.set_dynamic_dimension_inference(&dynamic_dimension_inference);
+  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate(*module, {}));
 
   // Since the result layout is non-effective to the Evaluator results, explicit
   // relayout here.
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index abd3ee5a059ac0910d6acc8076899950498b4c43..fd907d07daef9e8337aeed198ef4fd23d069df21 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -53,7 +53,7 @@ class ServiceOptions {
   ServiceOptions& set_platform(se::Platform* platform);
   se::Platform* platform() const;
 
-  // Set the number of replicas to use when compiling replicated
+  // Set the default number of replicas to use when compiling replicated
   // programs.
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
@@ -250,8 +250,9 @@ class Service : public ServiceInterface {
   // ExecutionProfile object which will be filled in with profile data.
   StatusOr<GlobalDataHandle> ExecuteAndRegisterResult(
       Executable* executable,
-      const absl::Span<const std::vector<const ShapedBuffer*>> arguments,
-      Backend* backend, const string& result_tag, ExecutionProfile* profile);
+      absl::Span<const std::vector<const ShapedBuffer*>> arguments,
+      Backend* backend, const DeviceHandle& device_handle,
+      const string& result_tag, ExecutionProfile* profile);
 
   // Runs the given executables with the given arguments and register the result
   // from each executable in the allocation tracker. The handles of the result
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index dbfed628bfcabffe66bef41a82e0e2430897d80d..6bee671056552b83014367889320b748659bbfdf 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -32,12 +32,10 @@ class ServiceExecutableRunOptions {
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
 
-  explicit ServiceExecutableRunOptions(
-      ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
-      tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
+  explicit ServiceExecutableRunOptions(ExecutableRunOptions run_options,
+                                       StreamBorrower borrow_stream = nullptr)
       : run_options_(std::move(run_options)),
-        borrow_stream_(std::move(borrow_stream)),
-        xla_intra_op_thread_pool_(xla_intra_op_thread_pool) {}
+        borrow_stream_(std::move(borrow_stream)) {}
 
   // Returns reference or pointer to `ExecutableRunOptions` member.
   const ExecutableRunOptions& run_options() const { return run_options_; }
@@ -56,15 +54,9 @@ class ServiceExecutableRunOptions {
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
   }
 
-  // Returns reference to thread pool for execution of XLA ops on CPU backend.
-  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool() const {
-    return xla_intra_op_thread_pool_;
-  }
-
  private:
   ExecutableRunOptions run_options_;
   StreamBorrower borrow_stream_;
-  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 8e571675c79b08efd454ee5e0fe47bacdcf3dbb7..946577d55d43f04fe2dbabb3dd11c3468f2c7edf 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 
-#include <stddef.h>
 #include <algorithm>
+#include <cstddef>
 #include <numeric>
 #include <set>
 #include <string>
@@ -50,7 +50,7 @@ bool AllUnique(absl::Span<const int64> slice) {
 }
 
 Status ExpectArray(const Shape& shape, absl::string_view op_type) {
-  if (!ShapeUtil::IsArray(shape)) {
+  if (!shape.IsArray()) {
     return InvalidArgument("Expected array argument for %s, but got %s.",
                            string(op_type), ShapeUtil::HumanString(shape));
   }
@@ -70,7 +70,7 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
 
   const Shape& accumulator_shape = reducer_shape.result();
   std::vector<const Shape*> accumulator_subshapes;
-  if (ShapeUtil::IsArray(accumulator_shape)) {
+  if (accumulator_shape.IsArray()) {
     if (inputs != 1) {
       return InvalidArgument(
           "Reduction function must produce a tuple with %d elements, but "
@@ -78,7 +78,7 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
           inputs);
     }
     accumulator_subshapes.push_back(&accumulator_shape);
-  } else if (ShapeUtil::IsTuple(accumulator_shape)) {
+  } else if (accumulator_shape.IsTuple()) {
     if (ShapeUtil::TupleElementCount(accumulator_shape) != inputs) {
       return InvalidArgument(
           "Reduction function must produce a tuple with %d elements, but has "
@@ -96,7 +96,7 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
   }
 
   for (const Shape* element_shape : accumulator_subshapes) {
-    if (ShapeUtil::Rank(*element_shape) != 0) {
+    if (element_shape->rank() != 0) {
       return InvalidArgument(
           "Reduction function must return a scalar or tuple of scalars but "
           "returns shape: %s",
@@ -156,17 +156,26 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
   return Status::OK();
 }
 
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+
 StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                                        const Window& window,
                                        PrimitiveType element_type,
                                        bool allow_negative_padding) {
-  if (window.dimensions_size() != ShapeUtil::Rank(base_shape)) {
+  if (window.dimensions_size() != base_shape.rank()) {
     return InvalidArgument(
         "Window has dimension %d but base shape has dimension %d.",
-        window.dimensions_size(), ShapeUtil::Rank(base_shape));
+        window.dimensions_size(), base_shape.rank());
   }
 
   std::vector<int64> output_dimensions(window.dimensions_size());
+  std::vector<bool> output_is_dynamic(window.dimensions_size());
   for (int64 i = 0; i < window.dimensions_size(); ++i) {
     const auto& dim = window.dimensions(i);
     if (dim.size() <= 0) {
@@ -196,6 +205,12 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
           window.DebugString());
     }
 
+    if (base_shape.is_dynamic_dimension(i) && !IsTrivialWindowDimension(dim)) {
+      return Unimplemented(
+          "Dynamic shape is not supported for non trivial window: %s",
+          window_util::ToString(window));
+    }
+
     const int64 dilated_base = window_util::DilatedBound(
         ShapeUtil::GetDimension(base_shape, i), dim.base_dilation());
     const int64 padded_dilated_base =
@@ -205,9 +220,11 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
     output_dimensions[i] = window_util::StridedBound(
         padded_dilated_base, dilated_window, dim.stride());
+    output_is_dynamic[i] = base_shape.is_dynamic_dimension(i);
   }
 
-  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions);
+  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions,
+                                       output_is_dynamic);
 }
 
 }  // namespace
@@ -338,7 +355,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   if (arg_shapes.empty()) {
     return InvalidArgument("Concatenate expects at least one argument.");
   }
-  if (dimension < 0 || dimension >= ShapeUtil::Rank(*arg_shapes[0])) {
+  if (dimension < 0 || dimension >= arg_shapes[0]->rank()) {
     return InvalidArgument("Concatenate dimension out of bounds: %d.",
                            dimension);
   }
@@ -351,12 +368,12 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
       element_type = arg_shape->element_type();
       continue;
     }
-    if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
+    if (arg_shape->rank() != shape->rank()) {
       return InvalidArgument(
           "Cannot concatenate arrays with different ranks: %d (%s) vs %d "
           "(%s).",
-          ShapeUtil::Rank(*arg_shape), ShapeUtil::HumanString(*arg_shape),
-          ShapeUtil::Rank(*shape), ShapeUtil::HumanString(*shape));
+          arg_shape->rank(), ShapeUtil::HumanString(*arg_shape), shape->rank(),
+          ShapeUtil::HumanString(*shape));
     }
     if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shape, *shape)) {
       return InvalidArgument(
@@ -364,8 +381,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
           PrimitiveType_Name(arg_shape->element_type()),
           PrimitiveType_Name(shape->element_type()));
     }
-    for (int64 dimension_number = 0;
-         dimension_number < ShapeUtil::Rank(*arg_shape); ++dimension_number) {
+    for (int64 dimension_number = 0; dimension_number < arg_shape->rank();
+         ++dimension_number) {
       if (arg_shape->dimensions(dimension_number) !=
           shape->dimensions(dimension_number)) {
         if (dimension_number == dimension) {
@@ -401,7 +418,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
         ShapeUtil::HumanString(operand_shape),
         PrimitiveType_Name(new_element_type));
   }
-  if (!ShapeUtil::IsArray(operand_shape) ||
+  if (!operand_shape.IsArray() ||
       !primitive_util::IsArrayType(new_element_type)) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
@@ -424,7 +441,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                            ShapeUtil::HumanString(operand_shape),
                            PrimitiveType_Name(new_element_type));
   }
-  if (!ShapeUtil::IsArray(operand_shape) ||
+  if (!operand_shape.IsArray() ||
       !primitive_util::IsArrayType(new_element_type)) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
@@ -472,7 +489,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 /* static */ StatusOr<Shape> ShapeInference::InferPadShape(
     const Shape& operand_shape, const Shape& padding_value_shape,
     const PaddingConfig& padding_config) {
-  if (!ShapeUtil::IsArray(operand_shape)) {
+  if (!operand_shape.IsArray()) {
     return InvalidArgument(
         "Pad operation does not support tuple-shape operands.");
   }
@@ -480,7 +497,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return InvalidArgument(
         "Pad operation does not support non-scalar padding values.");
   }
-  if (ShapeUtil::Rank(operand_shape) != padding_config.dimensions_size()) {
+  if (operand_shape.rank() != padding_config.dimensions_size()) {
     return InvalidArgument(
         "The rank of the operand and the padding configuration do not match: "
         "%s vs %s.",
@@ -500,35 +517,40 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                            padding_config.ShortDebugString());
   }
 
-  std::vector<int64> dimensions(ShapeUtil::Rank(operand_shape));
+  if (!padding_value_shape.is_static()) {
+    return InvalidArgument("Dynamic padding value is not supported");
+  }
+
+  std::vector<int64> dimensions(operand_shape.rank());
+  std::vector<bool> is_dynamic(operand_shape.rank());
   for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) {
     const auto& p = padding_config.dimensions(i);
+    if (operand_shape.is_dynamic_dimension(i) && p.edge_padding_high() != 0 &&
+        p.edge_padding_low() != 0 && p.interior_padding() != 0) {
+      return InvalidArgument(
+          "Dynamic dimension on padding dimension is not supported.");
+    }
     dimensions[i] = operand_shape.dimensions(i) + p.edge_padding_low() +
                     p.edge_padding_high() +
                     std::max<int64>(operand_shape.dimensions(i) - 1, 0LL) *
                         p.interior_padding();
+    is_dynamic[i] = operand_shape.is_dynamic_dimension(i);
   }
+
   return ShapeUtil::MakeShape(
       ShapeUtil::HigherPrecisionElementType(operand_shape, padding_value_shape),
-      dimensions);
+      dimensions, is_dynamic);
 }
 
 // Current DotDimensionNumbers Requirements:
 //
 // Contracting Dimensions:
-// *) Exactly one contracting dimension on both lhs and rhs.
+// *) Same number of contracting dimensions on both lhs and rhs.
 // *) Contracting dimension size must be the same on both lhs and rhs.
-// *) Contracting dimension numbers do not need to be the same (i.e. transposes
-//    are passed on to emitter implementations).
 //
 // Batch Dimensions:
 // *) Same number of batch dimensions on both lhs and rhs.
-// *) Same batch dimension numbers (and sizes) on both lhs and rhs.
-// *) Batch dimension numbers must be ordered before contracting and
-//    non-contracting/non-batch dimension numbers.
-//
-// Non-Contracting-Non-Batch Dimensions:
-// *) Can be 0 (matrix-vector) or 1 (matrix-matrix).
+// *) Same batch dimension sizes on both lhs and rhs.
 //
 
 namespace {
@@ -541,9 +563,8 @@ Status ValidateDotDimensionNumbers(
                           absl::Span<const int64> contracting_dims,
                           absl::Span<const int64> batch_dims) -> bool {
     auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; };
-    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
-                       in_range) &&
-           std::all_of(batch_dims.begin(), batch_dims.end(), in_range);
+    return absl::c_all_of(contracting_dims, in_range) &&
+           absl::c_all_of(batch_dims, in_range);
   };
 
   absl::Span<const int64> lhs_contracting_dimensions =
@@ -555,9 +576,9 @@ Status ValidateDotDimensionNumbers(
   absl::Span<const int64> rhs_batch_dimensions =
       AsInt64Slice(dimension_numbers.rhs_batch_dimensions());
 
-  if (!dims_in_range(ShapeUtil::Rank(lhs), lhs_contracting_dimensions,
+  if (!dims_in_range(lhs.rank(), lhs_contracting_dimensions,
                      lhs_batch_dimensions) ||
-      !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions,
+      !dims_in_range(rhs.rank(), rhs_contracting_dimensions,
                      rhs_batch_dimensions)) {
     return InvalidArgument("A dimension number is out of range in Dot: %s.",
                            dimension_numbers.DebugString());
@@ -570,9 +591,8 @@ Status ValidateDotDimensionNumbers(
     auto is_unique = [&dim_set](int64 i) -> bool {
       return dim_set.insert(i).second;
     };
-    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
-                       is_unique) &&
-           std::all_of(batch_dims.begin(), batch_dims.end(), is_unique);
+    return absl::c_all_of(contracting_dims, is_unique) &&
+           absl::c_all_of(batch_dims, is_unique);
   };
 
   if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
@@ -581,36 +601,6 @@ Status ValidateDotDimensionNumbers(
                            dimension_numbers.DebugString());
   }
 
-  // Check that the count of non-contracting-non-batch dimensions is in {0, 1}.
-  const int64 lhs_non_contracting_non_batch_dims =
-      ShapeUtil::Rank(lhs) -
-      dimension_numbers.lhs_contracting_dimensions_size() -
-      dimension_numbers.lhs_batch_dimensions_size();
-  const int64 rhs_non_contracting_non_batch_dims =
-      ShapeUtil::Rank(rhs) -
-      dimension_numbers.rhs_contracting_dimensions_size() -
-      dimension_numbers.rhs_batch_dimensions_size();
-  if (lhs_non_contracting_non_batch_dims < 0 ||
-      lhs_non_contracting_non_batch_dims > 1 ||
-      rhs_non_contracting_non_batch_dims < 0 ||
-      rhs_non_contracting_non_batch_dims > 1) {
-    return InvalidArgument(
-        "Batch and contracting dimension number mismatch with rank.");
-  }
-
-  // Check that batch dimension numbers are ordered before all others, and
-  // that they are monotonically increasing.
-  std::vector<int64> batch_dim_numbers(lhs_batch_dimensions.size());
-  std::iota(batch_dim_numbers.begin(), batch_dim_numbers.end(), 0);
-  if (!std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
-                  lhs_batch_dimensions.begin()) ||
-      !std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
-                  rhs_batch_dimensions.begin())) {
-    return InvalidArgument(
-        "Batch dimension numbers must precede non-batch dimensions and be"
-        "monotonically increasing.");
-  }
-
   return Status::OK();
 }
 
@@ -637,28 +627,33 @@ Status ValidateDotDimensionNumbers(
     return fail("Element types do not match.");
   }
 
-  if ((ShapeUtil::Rank(lhs) < 1) || (ShapeUtil::Rank(rhs) < 1)) {
+  if ((lhs.rank() < 1) || (rhs.rank() < 1)) {
     return fail("Dot only supports rank 1 or above.");
   }
 
   // Validate basic properties of dot dimension numbers.
   TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers));
 
-  // Check that there is only one contracting dimension for both lhs and rhs.
+  // Check that number of contracting dimensions match.
   if (dimension_numbers.lhs_contracting_dimensions_size() !=
-          dimension_numbers.rhs_contracting_dimensions_size() ||
-      dimension_numbers.lhs_contracting_dimensions_size() != 1) {
-    return fail("Must specify one contracting dimension for both lhs and rhs.");
+      dimension_numbers.rhs_contracting_dimensions_size()) {
+    return fail(
+        "Must specify the same number of contracting dimensions for lhs and "
+        "rhs.");
   }
-
   // Check that contracting dimension sizes match.
-  const int64 lhs_contracting_dimension =
-      dimension_numbers.lhs_contracting_dimensions(0);
-  const int64 rhs_contracting_dimension =
-      dimension_numbers.rhs_contracting_dimensions(0);
-  if (lhs.dimensions(lhs_contracting_dimension) !=
-      rhs.dimensions(rhs_contracting_dimension)) {
-    return fail("Contracting dimension sizes do not match.");
+  for (int64 i = 0; i < dimension_numbers.lhs_contracting_dimensions_size();
+       ++i) {
+    const int64 lhs_contracting_dimension =
+        dimension_numbers.lhs_contracting_dimensions(i);
+    const int64 rhs_contracting_dimension =
+        dimension_numbers.rhs_contracting_dimensions(i);
+    if (lhs.dimensions(lhs_contracting_dimension) !=
+            rhs.dimensions(rhs_contracting_dimension) ||
+        lhs.is_dynamic_dimension(lhs_contracting_dimension) !=
+            rhs.is_dynamic_dimension(rhs_contracting_dimension)) {
+      return fail("Contracting dimension sizes do not match.");
+    }
   }
 
   // Check that number of batch dimensions match.
@@ -669,11 +664,12 @@ Status ValidateDotDimensionNumbers(
 
   // Check that batch dimension numbers and sizes match.
   for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) {
-    if (dimension_numbers.lhs_batch_dimensions(i) !=
-            dimension_numbers.rhs_batch_dimensions(i) ||
-        lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
-            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
-      return fail("Batch dimension numbers and sizes must match for lhs/rhs.");
+    if (lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i)) ||
+        lhs.is_dynamic_dimension(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.is_dynamic_dimension(
+                dimension_numbers.rhs_batch_dimensions(i))) {
+      return fail("Batch dimension sizes must match for lhs/rhs.");
     }
   }
 
@@ -683,21 +679,29 @@ Status ValidateDotDimensionNumbers(
   // Generate the result dimensions in order, rhs dimensions followed by lhs
   // dimensions except the contracted and batch dimensions.
   std::vector<int64> dimensions;
-  std::unordered_set<int64> rhs_batch_dims(
-      dimension_numbers.rhs_batch_dimensions().begin(),
-      dimension_numbers.rhs_batch_dimensions().end());
-  for (int64 i = 0; i < ShapeUtil::Rank(lhs); i++) {
-    if (i != lhs_contracting_dimension) {
+  std::vector<bool> is_dynamic;
+  for (int64 lhs_dim : dimension_numbers.lhs_batch_dimensions()) {
+    dimensions.push_back(lhs.dimensions(lhs_dim));
+    is_dynamic.push_back(lhs.is_dynamic_dimension(lhs_dim));
+  }
+  for (int64 i = 0; i < lhs.rank(); i++) {
+    if (!absl::c_linear_search(dimension_numbers.lhs_contracting_dimensions(),
+                               i) &&
+        !absl::c_linear_search(dimension_numbers.lhs_batch_dimensions(), i)) {
       dimensions.push_back(lhs.dimensions(i));
+      is_dynamic.push_back(lhs.is_dynamic_dimension(i));
     }
   }
-  for (int64 i = 0; i < ShapeUtil::Rank(rhs); i++) {
-    if (i != rhs_contracting_dimension && rhs_batch_dims.count(i) == 0) {
+  for (int64 i = 0; i < rhs.rank(); i++) {
+    if (!absl::c_linear_search(dimension_numbers.rhs_contracting_dimensions(),
+                               i) &&
+        !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(), i)) {
       dimensions.push_back(rhs.dimensions(i));
+      is_dynamic.push_back(rhs.is_dynamic_dimension(i));
     }
   }
   Shape result = ShapeUtil::MakeShape(
-      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions);
+      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions, is_dynamic);
 
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(result));
   VLOG(2) << "inferred dot shape: " << ShapeUtil::HumanString(result);
@@ -708,20 +712,24 @@ Status ValidateDotDimensionNumbers(
 ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                                                        const Shape& lhs,
                                                        const Shape& rhs) {
-  TF_RET_CHECK(ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs));
+  TF_RET_CHECK(lhs.rank() == rhs.rank());
 
   // The shapes have to be compatible. That is, if some dimension d has a
   // different size in the two shapes, one of them has to be 1 (a "degenerate"
   // dimension). In that case, the output shape has the non-1 dimension size
   // from the lhs/rhs pair in every index.
-  std::vector<int64> output_dimensions(ShapeUtil::Rank(lhs));
-  for (int64 i = 0; i < ShapeUtil::Rank(lhs); ++i) {
+  std::vector<int64> output_dimensions(lhs.rank());
+  std::vector<bool> output_dimensions_is_dynamic(lhs.rank());
+  for (int64 i = 0; i < lhs.rank(); ++i) {
     if (lhs.dimensions(i) == rhs.dimensions(i)) {
       output_dimensions[i] = lhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i);
     } else if (lhs.dimensions(i) == 1) {
       output_dimensions[i] = rhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = rhs.is_dynamic_dimension(i);
     } else if (rhs.dimensions(i) == 1) {
       output_dimensions[i] = lhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i);
     } else {
       return InvalidArgument(
           "Binary op %s with incompatible shapes: %s and %s.",
@@ -730,7 +738,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
   }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
-                              output_dimensions);
+                              output_dimensions, output_dimensions_is_dynamic);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferInDimBroadcastShape(
@@ -743,13 +751,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     return InvalidArgument("Automatic shape inference not supported: %s and %s",
                            ShapeUtil::HumanString(smaller_shape),
                            ShapeUtil::HumanString(larger_shape));
-  } else if (broadcast_dimensions.size() != ShapeUtil::Rank(smaller_shape)) {
+  } else if (broadcast_dimensions.size() != smaller_shape.rank()) {
     return InvalidArgument(
         "Size of broadcast_dimensions has to match lower-rank operand's "
         "rank; "
         " lower-rank operand's rank is %d, size of broadcast_dimensions is "
         "%u.",
-        ShapeUtil::Rank(smaller_shape), broadcast_dimensions.size());
+        smaller_shape.rank(), broadcast_dimensions.size());
   }
 
   // broadcast_dimensions is a sequence of dimensions; its length is equal to
@@ -809,6 +817,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
     int64 small_dimension_size = smaller_shape.dimensions(i);
     int64 large_dimension_size = larger_shape.dimensions(dimension_to_match);
+    bool small_is_dynamic = smaller_shape.is_dynamic_dimension(i);
+    bool large_is_dynamic =
+        larger_shape.is_dynamic_dimension(dimension_to_match);
     // Dimension sizes must be compatible: match or be degenerate (degenerate
     // case is handled by degenerate dimension broadcasting which occurs after
     // InDim broadcasting).
@@ -820,6 +831,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
           ShapeUtil::HumanString(smaller_shape),
           ShapeUtil::HumanString(larger_shape));
     }
+    if (small_is_dynamic != large_is_dynamic) {
+      return InvalidArgument(
+          "Broadcast dimension %d dynamism mismatch: %s and %s.", i,
+          ShapeUtil::HumanString(smaller_shape),
+          ShapeUtil::HumanString(larger_shape));
+    }
     // Make sure the broadcast dimensions are listed in a strictly increasing
     // order.
     if (i > 0 && broadcast_dimensions.at(i - 1) >= dimension_to_match) {
@@ -829,6 +846,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
 
     output_shape.set_dimensions(dimension_to_match, small_dimension_size);
+    output_shape.set_dynamic_dimension(dimension_to_match, small_is_dynamic);
   }
 
   return output_shape;
@@ -847,8 +865,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(rhs));
   }
 
-  if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) {
-    std::vector<int64> identity_dims(ShapeUtil::Rank(lhs));
+  if (lhs.rank() == rhs.rank()) {
+    std::vector<int64> identity_dims(lhs.rank());
     std::iota(identity_dims.begin(), identity_dims.end(), 0);
     if (!broadcast_dimensions.empty() &&
         broadcast_dimensions != identity_dims) {
@@ -865,15 +883,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         lhs, ShapeUtil::HigherPrecisionElementType(lhs, rhs));
   }
 
-  if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) {
+  if (lhs.rank() == rhs.rank()) {
     return InferDegenerateDimensionBroadcastShape(operation, lhs, rhs);
   } else {
     // Ranks do not match, so perform InDim broadcasting using
     // broadcast_dimensions. Scalar broadcasting is a special case of this.
-    const Shape& larger_shape =
-        ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? lhs : rhs;
-    const Shape& smaller_shape =
-        ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? rhs : lhs;
+    const Shape& larger_shape = lhs.rank() > rhs.rank() ? lhs : rhs;
+    const Shape& smaller_shape = lhs.rank() > rhs.rank() ? rhs : lhs;
 
     // After InDim broadcasting, perform degenerate dimensions broadcasting.
     TF_ASSIGN_OR_RETURN(Shape indim_broadcast_shape,
@@ -942,6 +958,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                                                         broadcast_dimensions));
       if (lhs.element_type() == F32 && rhs.element_type() == F32) {
         return ShapeUtil::ChangeElementType(shape, C64);
+      } else if (lhs.element_type() == F64 && rhs.element_type() == F64) {
+        return ShapeUtil::ChangeElementType(shape, C128);
       } else {
         return Unimplemented("Complex component type is not implemented.");
       }
@@ -1162,12 +1180,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) ==
                Status::OK());
 
-  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+  if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-training to be "
         "smaller than the rank of operand_shape; "
         "got feature_index %d, and rank %d.",
-        feature_index, ShapeUtil::Rank(operand_shape));
+        feature_index, operand_shape.rank());
   }
 
   if (feature_index < 0) {
@@ -1177,25 +1195,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         feature_index);
   }
 
-  if (ShapeUtil::Rank(operand_shape) < 1) {
+  if (operand_shape.rank() < 1) {
     return InvalidArgument(
         "Expected the rank of operand to "
         "batch-norm-training to be at least 1; got %d.",
-        ShapeUtil::Rank(operand_shape));
+        operand_shape.rank());
   }
 
-  if (ShapeUtil::Rank(offset_shape) != 1) {
+  if (offset_shape.rank() != 1) {
     return InvalidArgument(
         "Offset input of batch-norm-training must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(offset_shape));
+        offset_shape.rank());
   }
 
-  if (ShapeUtil::Rank(scale_shape) != 1) {
+  if (scale_shape.rank() != 1) {
     return InvalidArgument(
         "Scale input of batch-norm-training must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(scale_shape));
+        scale_shape.rank());
   }
 
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
@@ -1272,12 +1290,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(variance_shape) ==
                Status::OK());
 
-  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+  if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-inference to be "
         "smaller than the rank of operand_shape; "
         "got feature_index %d, and rank %d.",
-        feature_index, ShapeUtil::Rank(operand_shape));
+        feature_index, operand_shape.rank());
   }
 
   if (feature_index < 0) {
@@ -1287,25 +1305,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         feature_index);
   }
 
-  if (ShapeUtil::Rank(operand_shape) < 1) {
+  if (operand_shape.rank() < 1) {
     return InvalidArgument(
         "Expected the rank of operand to "
         "batch-norm-inference to be at least 1; got %d.",
-        ShapeUtil::Rank(operand_shape));
+        operand_shape.rank());
   }
 
-  if (ShapeUtil::Rank(offset_shape) != 1) {
+  if (offset_shape.rank() != 1) {
     return InvalidArgument(
         "Offset input of batch-norm-inference must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(offset_shape));
+        offset_shape.rank());
   }
 
-  if (ShapeUtil::Rank(scale_shape) != 1) {
+  if (scale_shape.rank() != 1) {
     return InvalidArgument(
         "Scale input of batch-norm-inference must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(scale_shape));
+        scale_shape.rank());
   }
 
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
@@ -1417,41 +1435,41 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RETURN_IF_ERROR(
       ShapeUtil::ValidateShapeWithOptionalLayout(output_grad_shape));
 
-  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+  if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-grad to be "
         "smaller than the rank of operand_shape; "
         "got feature_index %d, and rank %d.",
-        feature_index, ShapeUtil::Rank(operand_shape));
+        feature_index, operand_shape.rank());
   }
 
-  if (ShapeUtil::Rank(operand_shape) != ShapeUtil::Rank(output_grad_shape)) {
+  if (operand_shape.rank() != output_grad_shape.rank()) {
     return InvalidArgument(
         "Expected operand_shape of batch-norm-grad to have the same rank as"
         " output_grad_shape; got rank(oprand_shape) %d, and"
         " rank(output_grad_shape) %d.",
-        ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(output_grad_shape));
+        operand_shape.rank(), output_grad_shape.rank());
   }
 
-  if (ShapeUtil::Rank(mean_shape) != 1) {
+  if (mean_shape.rank() != 1) {
     return InvalidArgument(
         "Mean input of batch-norm-grad must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(mean_shape));
+        mean_shape.rank());
   }
 
-  if (ShapeUtil::Rank(scale_shape) != 1) {
+  if (scale_shape.rank() != 1) {
     return InvalidArgument(
         "Scale input of batch-norm-grad must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(scale_shape));
+        scale_shape.rank());
   }
 
-  if (ShapeUtil::Rank(var_shape) != 1) {
+  if (var_shape.rank() != 1) {
     return InvalidArgument(
         "Var input of batch-norm-grad must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(var_shape));
+        var_shape.rank());
   }
 
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
@@ -1538,7 +1556,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   }
 
   // Verify operand_shape and output_grad_shape have same bounds.
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+  for (int64 i = 0; i < operand_shape.rank(); ++i) {
     if (ShapeUtil::GetDimension(operand_shape, i) !=
         ShapeUtil::GetDimension(output_grad_shape, i)) {
       return InvalidArgument(
@@ -1573,6 +1591,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         batch_group_count);
   }
 
+  if (batch_group_count > 1 && feature_group_count > 1) {
+    return InvalidArgument(
+        "both batch_group_count %d and feature_group_count %d cannot be "
+        "greater than 1",
+        batch_group_count, feature_group_count);
+  }
+
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
         "Convolution with different element types: %s and %s.",
@@ -1603,12 +1628,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   }
 
   const int num_dims = num_spatial_dims + 2;
-  if (ShapeUtil::Rank(lhs) != num_dims) {
+  if (lhs.rank() != num_dims) {
     return InvalidArgument(
         "The LHS argument to a convolution should have rank %d; lhs: %s.",
         num_dims, ShapeUtil::HumanString(lhs));
   }
-  if (ShapeUtil::Rank(rhs) != num_dims) {
+  if (rhs.rank() != num_dims) {
     return InvalidArgument(
         "The RHS argument to a convolution should have rank %d; rhs: %s.",
         num_dims, ShapeUtil::HumanString(rhs));
@@ -1623,29 +1648,29 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   input_dnums[1] = dnums.input_feature_dimension();
   std::copy(dnums.input_spatial_dimensions().begin(),
             dnums.input_spatial_dimensions().end(), input_dnums.begin() + 2);
-  std::sort(input_dnums.begin(), input_dnums.end());
+  absl::c_sort(input_dnums);
 
   std::vector<int64> window_dnums(num_dims);
   window_dnums[0] = dnums.kernel_input_feature_dimension();
   window_dnums[1] = dnums.kernel_output_feature_dimension();
   std::copy(dnums.kernel_spatial_dimensions().begin(),
             dnums.kernel_spatial_dimensions().end(), window_dnums.begin() + 2);
-  std::sort(window_dnums.begin(), window_dnums.end());
+  absl::c_sort(window_dnums);
 
   std::vector<int64> output_dnums(num_dims);
   output_dnums[0] = dnums.output_batch_dimension();
   output_dnums[1] = dnums.output_feature_dimension();
   std::copy(dnums.output_spatial_dimensions().begin(),
             dnums.output_spatial_dimensions().end(), output_dnums.begin() + 2);
-  std::sort(output_dnums.begin(), output_dnums.end());
+  absl::c_sort(output_dnums);
 
   std::vector<int64> expected_dnums(num_dims);
   std::iota(expected_dnums.begin(), expected_dnums.end(), 0);
 
   const auto in_range = [num_dims](int64 i) { return 0 <= i && i < num_dims; };
-  if (!std::all_of(input_dnums.begin(), input_dnums.end(), in_range) ||
-      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range) ||
-      !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) {
+  if (!absl::c_all_of(input_dnums, in_range) ||
+      !absl::c_all_of(window_dnums, in_range) ||
+      !absl::c_all_of(output_dnums, in_range)) {
     return InvalidArgument(
         "A dimension number is out of range in convolution: %s.",
         dnums.DebugString());
@@ -1686,6 +1711,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   const int64 kernel_output_features =
       rhs.dimensions(dnums.kernel_output_feature_dimension());
 
+  if (batch_group_count > 1 && input_batch % kernel_output_features != 0) {
+    return InvalidArgument(
+        "Expected output feature dimension (value %d) to be divisible by "
+        "input_batch (value %d) for batch group count %d; "
+        "got <conv>(%s, %s)\n"
+        "Dimension numbers: {%s}.",
+        kernel_output_features, input_batch, batch_group_count,
+        ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs),
+        dnums.DebugString());
+  }
+
   if (input_features % feature_group_count != 0 ||
       input_features / feature_group_count != kernel_input_features) {
     return InvalidArgument(
@@ -1747,8 +1783,33 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     dimensions[dnums.output_spatial_dimensions(i)] =
         window_output_shape.dimensions(i);
   }
+  std::vector<bool> is_dynamic(num_dims);
+  for (int i = 0; i < num_dims; i++) {
+    if (lhs.is_dynamic_dimension(i)) {
+      if (i == dnums.input_batch_dimension()) {
+        is_dynamic[dnums.output_batch_dimension()] = true;
+      } else if (i == dnums.input_feature_dimension()) {
+        // Input feature dimension is a contracting dimension, which does not
+        // affect the output dimension size. So we need to do nothing.
+      } else {
+        return InvalidArgument(
+            "Dynamic Spatial Convolution is not supported: lhs shape is %s ",
+            lhs.ToString());
+      }
+    }
+    if (rhs.is_dynamic_dimension(i)) {
+      if (i == dnums.kernel_input_feature_dimension()) {
+        // Kernel feature dimension does not affect the output dimension size.
+        // So we need to do nothing.
+      } else {
+        return InvalidArgument(
+            "Dynamic Spatial Convolution is not supported: rhs shape is %s ",
+            rhs.ToString());
+      }
+    }
+  }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
-                              dimensions);
+                              dimensions, is_dynamic);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferFftShape(
@@ -1769,7 +1830,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     case FFT:
     case IFFT:
       if (in.element_type() != C64) {
-        return InvalidArgument("%s requires C64 input type, found %s.",
+        return InvalidArgument("%s requires complex input type, found %s.",
                                FftType_Name(fft_type),
                                PrimitiveType_Name(in.element_type()));
       }
@@ -1853,12 +1914,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& shape, int64 split_dimension, int64 concat_dimension,
     int64 split_count) {
   TF_RET_CHECK(split_count > 0);
-  if (split_dimension >= ShapeUtil::Rank(shape) || split_dimension < 0) {
+  if (split_dimension >= shape.rank() || split_dimension < 0) {
     return InvalidArgument(
         "AllToAll split_dimension %d is out-of-bounds in shape %s.",
         split_dimension, ShapeUtil::HumanString(shape));
   }
-  if (concat_dimension >= ShapeUtil::Rank(shape) || concat_dimension < 0) {
+  if (concat_dimension >= shape.rank() || concat_dimension < 0) {
     return InvalidArgument(
         "AllToAll concat_dimension %d is out-of-bounds in shape %s.",
         concat_dimension, ShapeUtil::HumanString(shape));
@@ -1896,7 +1957,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferCollectivePermuteShape(
     const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape));
+  TF_RET_CHECK(shape.IsArray());
   return shape;
 }
 
@@ -1920,7 +1981,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   for (int64 i = 1; i < num_reduced_args; ++i) {
     if (!ShapeUtil::SameDimensions(*reduced_args[0], *reduced_args[i])) {
       return InvalidArgument(
-          "All reduced tensors must have the sime dimension. Tensor 0 has "
+          "All reduced tensors must have the same dimension. Tensor 0 has "
           "shape %s, Tensor %d has shape %s",
           ShapeUtil::HumanString(*reduced_args[0]), i,
           ShapeUtil::HumanString(*reduced_args[i]));
@@ -1932,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   // doesn't matter which one we choose.
   const Shape& arg = *reduced_args[0];
   for (int64 dimension : dimensions_to_reduce) {
-    if (dimension >= ShapeUtil::Rank(arg) || dimension < 0) {
+    if (dimension >= arg.rank() || dimension < 0) {
       return InvalidArgument("Reducing out-of-bounds dimension %d in shape %s.",
                              dimension, ShapeUtil::HumanString(arg));
     }
@@ -1949,20 +2010,22 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   std::set<int64> dimensions_to_reduce_set(dimensions_to_reduce.begin(),
                                            dimensions_to_reduce.end());
   std::vector<int64> new_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(arg); ++i) {
+  std::vector<bool> new_is_dynamic;
+  for (int i = 0; i < arg.rank(); ++i) {
     if (dimensions_to_reduce_set.find(i) == dimensions_to_reduce_set.end()) {
       new_dimensions.push_back(arg.dimensions(i));
+      new_is_dynamic.push_back(arg.is_dynamic_dimension(i));
     }
   }
 
   if (ShapeUtil::IsScalar(to_apply.result())) {
     return ShapeUtil::MakeShape(to_apply.result().element_type(),
-                                new_dimensions);
+                                new_dimensions, new_is_dynamic);
   } else {
     std::vector<Shape> result_subshapes;
     for (const Shape& subshape : to_apply.result().tuple_shapes()) {
-      result_subshapes.push_back(
-          ShapeUtil::MakeShape(subshape.element_type(), new_dimensions));
+      result_subshapes.push_back(ShapeUtil::MakeShape(
+          subshape.element_type(), new_dimensions, new_is_dynamic));
     }
     return ShapeUtil::MakeTupleShape(result_subshapes);
   }
@@ -2036,12 +2099,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(source_shape),
         ShapeUtil::HumanString(window_result_shape));
   }
+
   return operand_shape;
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferGetDimensionSizeShape(
     const Shape& shape, int64 dimension) {
-  if (dimension < 0 || dimension >= ShapeUtil::Rank(shape)) {
+  if (dimension < 0 || dimension >= shape.rank()) {
     return InvalidArgument("GetDimensionSize dimension out of bounds: %d.",
                            dimension);
   }
@@ -2083,10 +2147,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                            starts.size(), strides.size()));
   }
 
-  if (starts.size() != ShapeUtil::Rank(arg)) {
+  if (starts.size() != arg.rank()) {
     return InvalidArgument(
         "Slice index count does not match argument rank: %u vs %d.",
-        starts.size(), ShapeUtil::Rank(arg));
+        starts.size(), arg.rank());
   }
 
   std::vector<int64> sizes;
@@ -2121,41 +2185,87 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferDynamicSliceShape(
-    const Shape& operand_shape, const Shape& start_indices_shape,
-    absl::Span<const int64> slice_sizes) {
+    const Shape& operand_shape, absl::Span<const Shape> start_index_shapes,
+    absl::Span<const int64> slice_sizes, bool allow_scalar_indices) {
   TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of dynamic slice"));
-  TF_RETURN_IF_ERROR(
-      ExpectArray(start_indices_shape, "start indices of dynamic slice"));
+  auto number_of_indices = start_index_shapes.size();
+  // TODO(b/118437727): Remove this path.
+  if (!allow_scalar_indices ||
+      (number_of_indices >= 1 && start_index_shapes[0].rank() == 1)) {
+    if (number_of_indices != 1) {
+      return InvalidArgument(
+          "Dynamic slice should have exactly 1 index operand, has %d.",
+          number_of_indices);
+    }
 
-  VLOG(2) << StrFormat(
-      "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
-      ShapeUtil::HumanString(operand_shape),
-      ShapeUtil::HumanString(start_indices_shape), StrJoin(slice_sizes, ", "));
+    const Shape& start_indices_shape = start_index_shapes[0];
+    VLOG(2) << StrFormat(
+        "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
+        ShapeUtil::HumanString(operand_shape),
+        ShapeUtil::HumanString(start_indices_shape),
+        StrJoin(slice_sizes, ", "));
 
-  if (ShapeUtil::Rank(start_indices_shape) != 1) {
-    return InvalidArgument(
-        "Dynamic slice start indices of rank %d must be rank1.",
-        ShapeUtil::Rank(start_indices_shape));
-  }
+    TF_RETURN_IF_ERROR(
+        ExpectArray(start_indices_shape, "start indices of dynamic slice"));
 
-  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
-    return InvalidArgument(
-        "Dynamic slice start indices must be of integral type.");
-  }
+    if (start_indices_shape.rank() != 1) {
+      return InvalidArgument(
+          "Dynamic slice start indices of rank %d must be rank1.",
+          start_indices_shape.rank());
+    }
 
-  const int64 start_num_dims = start_indices_shape.dimensions(0);
-  if (ShapeUtil::Rank(operand_shape) != start_num_dims) {
-    return InvalidArgument(
-        "Dynamic slice start number of dimensions %d (%s) must match rank "
-        "%d of slice input (%s).",
-        start_num_dims, ShapeUtil::HumanString(start_indices_shape),
-        ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape));
+    if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
+      return InvalidArgument(
+          "Dynamic slice start indices must be of integral type.");
+    }
+
+    const int64 start_num_dims = start_indices_shape.dimensions(0);
+    if (operand_shape.rank() != start_num_dims) {
+      return InvalidArgument(
+          "Dynamic slice start number of dimensions %d (%s) must match rank "
+          "%d of slice input (%s).",
+          start_num_dims, ShapeUtil::HumanString(start_indices_shape),
+          operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    }
+  } else {
+    VLOG(2) << StrFormat("slicing shape %s a with slice_sizes={%s}",
+                         ShapeUtil::HumanString(operand_shape),
+                         StrJoin(slice_sizes, ", "));
+
+    if (operand_shape.rank() != number_of_indices) {
+      return InvalidArgument(
+          "Dynamic slice start number of dimensions %d must match rank "
+          "%d of slice input (%s).",
+          number_of_indices, operand_shape.rank(),
+          ShapeUtil::HumanString(operand_shape));
+    }
+
+    if (number_of_indices > 0) {
+      const Shape& first_index_shape = start_index_shapes[0];
+      if (!ShapeUtil::IsScalar(first_index_shape)) {
+        return InvalidArgument("Dynamic slice indices must be scalar, not %s.",
+                               ShapeUtil::HumanString(first_index_shape));
+      }
+      if (!ShapeUtil::ElementIsIntegral(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic slice start indices must be of integral type.");
+      }
+      for (const Shape& index_shape : start_index_shapes) {
+        if (!ShapeUtil::Compatible(first_index_shape, index_shape)) {
+          return InvalidArgument(
+              "Dynamic slice start indices must all have the same shape, got "
+              "mismatching indices with shapes %s and %s.",
+              ShapeUtil::HumanString(first_index_shape),
+              ShapeUtil::HumanString(index_shape));
+        }
+      }
+    }
   }
 
-  if (slice_sizes.size() != ShapeUtil::Rank(operand_shape)) {
+  if (slice_sizes.size() != operand_shape.rank()) {
     return InvalidArgument(
         "Dynamic slice index count does not match argument rank: %u vs %d.",
-        slice_sizes.size(), ShapeUtil::Rank(operand_shape));
+        slice_sizes.size(), operand_shape.rank());
   }
 
   for (int64 dim = 0; dim < slice_sizes.size(); ++dim) {
@@ -2178,46 +2288,92 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferDynamicUpdateSliceShape(
     const Shape& operand_shape, const Shape& update_shape,
-    const Shape& start_indices_shape) {
+    absl::Span<const Shape> start_index_shapes, bool allow_scalar_indices) {
   TF_RETURN_IF_ERROR(
       ExpectArray(operand_shape, "operand of dynamic update slice"));
   TF_RETURN_IF_ERROR(
       ExpectArray(update_shape, "update of dynamic update slice"));
-  TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape,
-                                 "start indices of dynamic update slice"));
 
-  VLOG(2) << StrFormat(
-      "updating slice of shape %s at dynamic start_indices %s with update "
-      "shape %s",
-      ShapeUtil::HumanString(operand_shape),
-      ShapeUtil::HumanString(start_indices_shape),
-      ShapeUtil::HumanString(update_shape));
+  auto number_of_indices = start_index_shapes.size();
+  // TODO(b/118437727): Remove this path.
+  if (!allow_scalar_indices ||
+      (number_of_indices >= 1 && start_index_shapes[0].rank() == 1)) {
+    if (number_of_indices != 1) {
+      return InvalidArgument(
+          "Dynamic update slice should have exactly 1 index operand, has %d.",
+          number_of_indices);
+    }
+    const Shape& start_indices_shape = start_index_shapes[0];
+    TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape,
+                                   "start indices of dynamic update slice"));
 
-  if (ShapeUtil::Rank(start_indices_shape) != 1) {
-    return InvalidArgument(
-        "Dynamic update slice start indices of rank %d must be rank1.",
-        ShapeUtil::Rank(start_indices_shape));
-  }
+    VLOG(2) << StrFormat(
+        "updating slice of shape %s at dynamic start_indices %s with update "
+        "shape %s",
+        ShapeUtil::HumanString(operand_shape),
+        ShapeUtil::HumanString(start_indices_shape),
+        ShapeUtil::HumanString(update_shape));
 
-  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
-    return InvalidArgument(
-        "Dynamic update slice start indices must be of integral type.");
-  }
+    if (start_indices_shape.rank() != 1) {
+      return InvalidArgument(
+          "Dynamic update slice start indices of rank %d must be rank1.",
+          start_indices_shape.rank());
+    }
 
-  const int64 start_num_dims = start_indices_shape.dimensions(0);
-  if (ShapeUtil::Rank(operand_shape) != start_num_dims) {
-    return InvalidArgument(
-        "Dynamic update slice start number of dimensions %d (%s) must match "
-        "rank %d of slice input (%s).",
-        start_num_dims, ShapeUtil::HumanString(start_indices_shape),
-        ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape));
+    if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
+      return InvalidArgument(
+          "Dynamic update slice start indices must be of integral type.");
+    }
+
+    const int64 start_num_dims = start_indices_shape.dimensions(0);
+    if (operand_shape.rank() != start_num_dims) {
+      return InvalidArgument(
+          "Dynamic update slice start number of dimensions %d (%s) must match "
+          "rank %d of slice input (%s).",
+          start_num_dims, ShapeUtil::HumanString(start_indices_shape),
+          operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    }
+  } else {
+    VLOG(2) << StrFormat("updating slice of shape %s with update shape %s",
+                         ShapeUtil::HumanString(operand_shape),
+                         ShapeUtil::HumanString(update_shape));
+
+    if (operand_shape.rank() != number_of_indices) {
+      return InvalidArgument(
+          "Dynamic update slice start number of dimensions %d must match rank "
+          "%d of slice input (%s).",
+          number_of_indices, operand_shape.rank(),
+          ShapeUtil::HumanString(operand_shape));
+    }
+
+    if (number_of_indices > 0) {
+      const Shape& first_index_shape = start_index_shapes[0];
+      if (!ShapeUtil::IsScalar(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic update slice indices must be scalar, not %s.",
+            ShapeUtil::HumanString(first_index_shape));
+      }
+      if (!ShapeUtil::ElementIsIntegral(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic update slice start indices must be of integral type.");
+      }
+      for (const Shape& index_shape : start_index_shapes) {
+        if (!ShapeUtil::Compatible(first_index_shape, index_shape)) {
+          return InvalidArgument(
+              "Dynamic update slice start indices must all have the same "
+              "shape, got mismatching indices with shapes %s and %s.",
+              ShapeUtil::HumanString(first_index_shape),
+              ShapeUtil::HumanString(index_shape));
+        }
+      }
+    }
   }
 
-  if (ShapeUtil::Rank(update_shape) != ShapeUtil::Rank(operand_shape)) {
+  if (update_shape.rank() != operand_shape.rank()) {
     return InvalidArgument(
         "Dynamic update slice update rank does not match argument rank: "
         "%d vs %d.",
-        ShapeUtil::Rank(update_shape), ShapeUtil::Rank(operand_shape));
+        update_shape.rank(), operand_shape.rank());
   }
 
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape,
@@ -2229,7 +2385,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         PrimitiveType_Name(update_shape.element_type()));
   }
 
-  for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) {
+  for (int64 dim = 0; dim < operand_shape.rank(); ++dim) {
     const int64 input_dim_size = operand_shape.dimensions(dim);
     const int64 update_dim_size = update_shape.dimensions(dim);
     if (update_dim_size < 0) {
@@ -2255,7 +2411,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     return InvalidArgument("a dimension number is duplicated in reverse");
   }
   for (int64 dimension : dimensions) {
-    if (dimension >= ShapeUtil::Rank(operand_shape) || dimension < 0) {
+    if (dimension >= operand_shape.rank() || dimension < 0) {
       return InvalidArgument(
           "One of the reverse dimensions (%d) is out-of-bounds in shape %s.",
           dimension, ShapeUtil::HumanString(operand_shape));
@@ -2266,7 +2422,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferGetTupleElementShape(
     const Shape& arg, int64 index) {
-  if (!ShapeUtil::IsTuple(arg)) {
+  if (!arg.IsTuple()) {
     return InvalidArgument(
         "Cannot infer shape: attempting to index into non-tuple: %s.",
         ShapeUtil::HumanString(arg));
@@ -2302,7 +2458,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   };
 
   // Check the shapes of computation parameters and return types.
-  if (!ShapeUtil::ShapeIs(condition.result(), PRED, {})) {
+  if (!ShapeUtil::Equal(condition.result(), ShapeUtil::MakeShape(PRED, {}))) {
     return InvalidArgument("Condition must return a boolean; got %s.",
                            shape_string());
   }
@@ -2322,7 +2478,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& predicate, const Shape& true_operand,
     const Shape& false_operand, const ProgramShape& true_computation,
     const ProgramShape& false_computation) {
-  if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
+  if (!ShapeUtil::Equal(predicate, ShapeUtil::MakeShape(PRED, {}))) {
     return InvalidArgument("Predicate must be a boolean; got %s.",
                            ShapeUtil::HumanString(predicate));
   }
@@ -2397,8 +2553,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     absl::Span<const int64> broadcast_dimensions) {
   TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of broadcast"));
   TF_RETURN_IF_ERROR(ExpectArray(output_shape, "operand of broadcast"));
-  const int64 operand_rank = ShapeUtil::Rank(operand_shape);
-  const int64 output_rank = ShapeUtil::Rank(output_shape);
+  const int64 operand_rank = operand_shape.rank();
+  const int64 output_rank = output_shape.rank();
   if (operand_rank > output_rank) {
     return InvalidArgument(
         "InDim style broadcast must be to an equal or higher ranked shape; "
@@ -2426,6 +2582,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
           i, operand_shape.dimensions(i), broadcast_dimensions[i],
           output_shape.dimensions(broadcast_dimensions[i]));
     }
+    if (operand_shape.is_dynamic_dimension(i) !=
+        output_shape.is_dynamic_dimension(broadcast_dimensions[i])) {
+      return InvalidArgument(
+          "Broadcast input and output dynamism mismatch: %s and %s",
+          operand_shape.ToString(), output_shape.ToString());
+    }
     // Make sure the broadcast dimensions are listed in a strictly increasing
     // order.
     if (i > 0 && broadcast_dimensions[i - 1] >= broadcast_dimensions[i]) {
@@ -2457,9 +2619,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(inferred_shape));
   }
 
-  std::vector<int64> indices(ShapeUtil::Rank(operand));
+  std::vector<int64> indices(operand.rank());
   std::iota(indices.begin(), indices.end(), 0);
-  if (dimensions.size() != ShapeUtil::Rank(operand) ||
+  if (dimensions.size() != operand.rank() ||
       !std::is_permutation(dimensions.begin(), dimensions.end(),
                            indices.begin())) {
     return InvalidArgument(
@@ -2468,6 +2630,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         StrJoin(dimensions, ","), ShapeUtil::HumanString(operand));
   }
 
+  std::vector<std::pair<int64, int64>> unmodified_dims =
+      ShapeUtil::DimensionsUnmodifiedByReshape(operand, inferred_shape);
+  for (auto& unmodified : unmodified_dims) {
+    if (operand.is_dynamic_dimension(unmodified.first)) {
+      inferred_shape.set_dynamic_dimension(unmodified.second, true);
+    }
+  }
+
   return inferred_shape;
 }
 
@@ -2475,9 +2645,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& operand, absl::Span<const int64> dimensions) {
   TF_RETURN_IF_ERROR(ExpectArray(operand, "transpose"));
 
-  std::vector<int64> indices(ShapeUtil::Rank(operand));
+  std::vector<int64> indices(operand.rank());
   std::iota(indices.begin(), indices.end(), 0);
-  if (dimensions.size() != ShapeUtil::Rank(operand) ||
+  if (dimensions.size() != operand.rank() ||
       !std::is_permutation(dimensions.begin(), dimensions.end(),
                            indices.begin())) {
     return InvalidArgument(
@@ -2548,12 +2718,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     // dimensions as on_true and on_false.
     return ShapeUtil::ChangeElementType(
         on_true, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
-  } else {
-    return InvalidArgument(
-        "Select operation with non-scalar predicate with dimensionality "
-        " different from the other operands: %s.",
-        ShapeUtil::HumanString(pred));
   }
+  return InvalidArgument(
+      "Select operation with non-scalar predicate with dimensionality "
+      "different from the other operands: %s.",
+      ShapeUtil::HumanString(pred));
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferTupleSelectShape(
@@ -2829,7 +2998,7 @@ Status ValidateScatterDimensionNumbers(
         "update_window_dims in scatter op must not repeat; got: %s.",
         StrJoin(dim_numbers.update_window_dims(), ", "));
   }
-  const int64 updates_rank = ShapeUtil::Rank(updates_shape);
+  const int64 updates_rank = updates_shape.rank();
   for (int64 window_dim : dim_numbers.update_window_dims()) {
     if (window_dim < 0 || window_dim >= updates_rank) {
       return InvalidArgument(
@@ -2863,10 +3032,10 @@ Status ValidateScatterDimensionNumbers(
   // Validate window size.
   auto window_size = dim_numbers.update_window_dims_size() +
                      dim_numbers.inserted_window_dims_size();
-  if (window_size != ShapeUtil::Rank(operand_shape)) {
+  if (window_size != operand_shape.rank()) {
     return InvalidArgument(
         "Scatter op has window of size %d; doesn't match operand of rank %d.",
-        window_size, ShapeUtil::Rank(operand_shape));
+        window_size, operand_shape.rank());
   }
 
   // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers.
@@ -2951,10 +3120,9 @@ Status ValidateScatterDimensionNumbers(
 
   int64 expected_updates_rank = expanded_scatter_indices_shape.size() - 1 +
                                 scatter_dim_numbers.update_window_dims_size();
-  if (ShapeUtil::Rank(updates_shape) != expected_updates_rank) {
+  if (updates_shape.rank() != expected_updates_rank) {
     return InvalidArgument("Updates tensor must be of rank %d; got %d.",
-                           expected_updates_rank,
-                           ShapeUtil::Rank(updates_shape));
+                           expected_updates_rank, updates_shape.rank());
   }
 
   TF_RETURN_IF_ERROR(ValidateScatterDimensionNumbers(
@@ -2985,7 +3153,7 @@ Status ValidateScatterDimensionNumbers(
   }
 
   int64 scatter_dims_seen = 0;
-  for (int64 i = 0; i < ShapeUtil::Rank(updates_shape); ++i) {
+  for (int64 i = 0; i < updates_shape.rank(); ++i) {
     bool is_update_window_dim =
         absl::c_binary_search(scatter_dim_numbers.update_window_dims(), i);
     if (is_update_window_dim) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 1b8fd10d691498087b28ef68517868c5def1da5a..7d39ef38e05abf0a81683c1fb0f3999908b27d23 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -176,14 +176,15 @@ class ShapeInference {
   // Infers the shape produced by a dynamic slice operation of size specified
   // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'.
   static StatusOr<Shape> InferDynamicSliceShape(
-      const Shape& operand_shape, const Shape& start_indices_shape,
-      absl::Span<const int64> slice_sizes);
+      const Shape& operand_shape, absl::Span<const Shape> start_index_shapes,
+      absl::Span<const int64> slice_sizes, bool allow_scalar_indices = true);
 
   // Infers the shape produced by a dynamic update slice operation based
   // on the shape of operand and update.
   static StatusOr<Shape> InferDynamicUpdateSliceShape(
       const Shape& operand_shape, const Shape& update_shape,
-      const Shape& start_indices_shape);
+      absl::Span<const Shape> start_index_shapes,
+      bool allow_scalar_indices = true);
 
   // Infers the shape produced by doing a compile-time-constant indexing into
   // the given input shape. This is essential for operations on tuples, because
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 0a870808d4cd89fa18382522ea5a4bf2355e5ce7..26120a06b823c9fddf378991cec434a880fb888d 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -35,6 +35,7 @@ class ShapeInferenceTest : public ::testing::Test {
  protected:
   // Some handy scalar shapes.
   const Shape s32_ = ShapeUtil::MakeShape(S32, {});
+  const Shape f16_ = ShapeUtil::MakeShape(F16, {});
   const Shape f32_ = ShapeUtil::MakeShape(F32, {});
   const Shape f64_ = ShapeUtil::MakeShape(F64, {});
   const Shape pred_ = ShapeUtil::MakeShape(PRED, {});
@@ -260,8 +261,8 @@ TEST_F(ShapeInferenceTest, Complex) {
   ASSERT_FALSE(complex_shape(pred_, pred_, {}).ok());
   // Component types must match.
   ASSERT_FALSE(complex_shape(f32_, f64_, {}).ok());
-  // Only F32->C64 supported.
-  ASSERT_FALSE(complex_shape(f64_, f64_, {}).ok());
+  // Only F32->C64 and F64->C128 supported.
+  ASSERT_FALSE(complex_shape(f16_, f16_, {}).ok());
   // Validate correct uses.
   Shape c64_32 = ShapeUtil::MakeShape(C64, {32});
   TF_ASSERT_OK_AND_ASSIGN(Shape result, complex_shape(f32_, f32_, {}));
@@ -285,6 +286,9 @@ TEST_F(ShapeInferenceTest, Complex) {
   ASSERT_TRUE(ShapeUtil::Equal(result, c64_32_64));
   TF_ASSERT_OK_AND_ASSIGN(result, complex_shape(matrix_32_64_, f32_, {}));
   ASSERT_TRUE(ShapeUtil::Equal(result, c64_32_64));
+
+  TF_ASSERT_OK_AND_ASSIGN(result, complex_shape(f64_, f64_, {}));
+  ASSERT_TRUE(ShapeUtil::Equal(result, ShapeUtil::MakeShape(C128, {})));
 }
 
 TEST_F(ShapeInferenceTest, VariadicOpTuplify) {
@@ -1006,9 +1010,9 @@ TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status = ShapeInference::InferDotOpShape(
       ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Batch and contracting dimension number mismatch"));
+  EXPECT_TRUE(inferred_status.ok());
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(),
+                               ShapeUtil::MakeShape(F32, {32, 32, 64})));
 }
 
 // vector <dot> vector -> scalar
@@ -1100,7 +1104,6 @@ TEST_F(ShapeInferenceTest, DotGeneral) {
 TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
-  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(2);
@@ -1114,8 +1117,28 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Must specify one contracting dimension for both "
-                        "lhs and rhs"));
+              HasSubstr("Must specify the same number of contracting "
+                        "dimensions for lhs and rhs."));
+}
+
+TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsPasses) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 2, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  EXPECT_TRUE(inferred_status.ok());
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), output_shape));
 }
 
 // BatchMatMul with different batch dimension sizes fails.
@@ -1134,11 +1157,11 @@ TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimSizesFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Batch dimension numbers and sizes must match"));
+              HasSubstr("Batch dimension sizes must match"));
 }
 
-// BatchMatMul with different batch dimension numbers fails.
-TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
+// BatchMatMul with different batch dimension numbers passes
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersPasses) {
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
 
@@ -1151,9 +1174,9 @@ TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
 
   auto inferred_status =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Batch dimension numbers must precede non-batch"));
+  ASSERT_TRUE(inferred_status.ok());
+  ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(),
+                               ShapeUtil::MakeShape(F32, {2, 11, 14})));
 }
 
 // BatchMatMul with out-of-range dimension numbers fails.
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 28a30b5ee2dbcb5012804578d4d037c241045309..d90dde3b13d3aa9e1de10dd9e1d11a8e6da170de 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -85,7 +85,7 @@ string ShapedBuffer::ToString() const {
       on_device_shape(),
       [this, &s](const Shape& subshape, const ShapeIndex& index) {
         string shape_str;
-        if (ShapeUtil::IsTuple(subshape)) {
+        if (subshape.IsTuple()) {
           shape_str = "tuple";
         } else {
           shape_str = ShapeUtil::HumanStringWithLayout(subshape);
diff --git a/tensorflow/compiler/xla/service/sort_simplifier.cc b/tensorflow/compiler/xla/service/sort_simplifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a00e8d7b227f14d462ca53f695189f3f48754ee
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sort_simplifier.cc
@@ -0,0 +1,126 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+
+namespace xla {
+namespace {
+
+// If the sort instruction has a tuple shape then looks for unused output
+// values and removes them from the sort instruction. Returns true if the
+// graph has been modified.
+StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
+  if (!sort->shape().IsTuple()) {
+    return false;
+  }
+
+  HloComputation* computation = sort->parent();
+
+  if (computation->root_instruction() == sort) {
+    // Can't analyse users of the root instruction.
+    return false;
+  }
+
+  // Index 0 is the sorting key used by the sort HLO itself.
+  absl::flat_hash_set<int64> used_indices{0};
+  for (const HloInstruction* user : sort->users()) {
+    if (user->opcode() != HloOpcode::kGetTupleElement) {
+      // Can't analyse users other then get-tuple-element.
+      return false;
+    }
+    used_indices.insert(user->tuple_index());
+  }
+
+  if (used_indices.size() == sort->operand_count()) {
+    // All operands are used.
+    return false;
+  }
+
+  std::vector<HloInstruction*> operands{sort->mutable_operand(0)};
+  std::vector<Shape> new_shapes{sort->operand(0)->shape()};
+  for (int64 i = 1; i < sort->operand_count(); ++i) {
+    if (used_indices.count(i)) {
+      operands.push_back(sort->mutable_operand(i));
+      new_shapes.push_back(sort->operand(i)->shape());
+    }
+  }
+
+  Shape new_sort_shape = new_shapes.size() == 1
+                             ? new_shapes[0]
+                             : ShapeUtil::MakeTupleShape(new_shapes);
+  HloInstruction* new_sort = computation->AddInstruction(
+      sort->CloneWithNewOperands(new_sort_shape, operands));
+
+  // Map from original get-tuple-element tuple index to new HLO instruction
+  absl::flat_hash_map<int64, HloInstruction*> result_map;
+  if (new_sort->shape().IsTuple()) {
+    // Old sort key maps to new sort key.
+    int64 new_index = 0;
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      if (used_indices.count(i)) {
+        result_map[i] =
+            computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+                new_shapes[new_index], new_sort, new_index));
+        ++new_index;
+      }
+    }
+  } else {
+    result_map[0] = new_sort;
+  }
+  std::vector<HloInstruction*> users(sort->users().begin(),
+                                     sort->users().end());
+  for (HloInstruction* user : users) {
+    TF_RETURN_IF_ERROR(
+        user->ReplaceAllUsesWith(result_map.at(user->tuple_index())));
+    TF_RETURN_IF_ERROR(computation->RemoveInstructionAndUnusedOperands(user));
+  }
+  return true;
+}
+}  // namespace
+
+StatusOr<bool> SortSimplifier::Run(HloModule* module) {
+  VLOG(2) << "HLO module before SortSimplifier:";
+  XLA_VLOG_LINES(2, module->ToString());
+
+  bool changed = false;
+  std::vector<HloInstruction*> sort_instrs;
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    absl::c_copy_if(comp->instructions(), std::back_inserter(sort_instrs),
+                    [](const HloInstruction* instr) {
+                      return instr->opcode() == HloOpcode::kSort;
+                    });
+  }
+
+  for (HloInstruction* sort_instr : sort_instrs) {
+    TF_ASSIGN_OR_RETURN(bool result, RemoveUnusedOperandFromSort(sort_instr));
+    changed |= result;
+  }
+
+  if (changed) {
+    VLOG(2) << "HLO module after SortSimplifier:";
+    XLA_VLOG_LINES(2, module->ToString());
+  } else {
+    VLOG(2) << "HLO module unchanged after SortSimplifier";
+  }
+
+  return changed;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/sort_simplifier.h b/tensorflow/compiler/xla/service/sort_simplifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c6f313aa04f51e14a14450bc72fc622d74133a4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sort_simplifier.h
@@ -0,0 +1,35 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SORT_SIMPLIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SORT_SIMPLIFIER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass which removes unused operands from sort, where an unused operand is
+// defined as an operand at some index 'x' at which the output is not used.
+class SortSimplifier : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "simplify-sorts"; }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SORT_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/sort_simplifier_test.cc b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd05fcf830d32e8bac4f8b260d3dd143ab98ad7b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
@@ -0,0 +1,102 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+
+using SortSimplifierTest = HloTestBase;
+
+TEST_F(SortSimplifierTest, RemoveUnusedSortOperandArrayResult) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} parameter(1)
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+        dimensions={1}
+      ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  uint64 num_executions = 0;
+  do {
+    num_executions++;
+  } while (simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(num_executions, 2);
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Sort(m::Parameter(0))));
+}
+
+TEST_F(SortSimplifierTest, RemoveUnusedSortOperandTuple) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,87] parameter(0)
+      values.0 = s32[64,87] parameter(1)
+      values.1 = u32[64,87] parameter(2)
+      sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
+          keys, values.0, values.1),
+        dimensions={1}
+      gte.0 = f32[64,87] get-tuple-element(sort), index=0
+      gte.1 = u32[64,87] get-tuple-element(sort), index=2
+      ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Tuple(
+          m::GetTupleElement(m::Sort(m::Parameter(0), m::Parameter(2)), 0),
+          m::GetTupleElement(m::Sort(m::Parameter(0), m::Parameter(2)), 1))));
+}
+
+TEST_F(SortSimplifierTest, DontRemoveUnusedSortKey) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} parameter(1)
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index a21e586efadb85d18e88e44999283b28f7f65eac..15ef623cc7b2dbc31e9cba5c4783c39b8805a5aa 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -142,7 +142,7 @@ Status TransferManager::TransferArrayToDeviceAsync(
     se::Stream* stream, const LiteralSlice& literal,
     const se::DeviceMemoryBase& dest) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
-  TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
+  TF_RET_CHECK(on_device_shape.IsArray())
       << "On-device representation of "
       << ShapeUtil::HumanString(literal.shape())
       << " is not an array: " << ShapeUtil::HumanString(on_device_shape);
@@ -227,7 +227,7 @@ Status TransferManager::WriteTupleIndexTablesAsync(
   return ShapeUtil::ForEachSubshapeWithStatus(
       device_buffer.on_device_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
-        if (ShapeUtil::IsTuple(device_subshape)) {
+        if (device_subshape.IsTuple()) {
           se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
           TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
                        device_memory.size());
@@ -248,6 +248,22 @@ Status TransferManager::WriteTupleIndexTablesAsync(
       });
 }
 
+Status TransferManager::WriteRootTupleIndexTable(
+    se::Stream* stream, const ShapedBuffer& device_buffer) {
+  TF_RET_CHECK(device_buffer.on_device_shape().IsTuple());
+  se::DeviceMemoryBase device_memory = device_buffer.buffer({});
+  TF_RET_CHECK(GetByteSizeRequirement(device_buffer.on_device_shape()) ==
+               device_memory.size());
+
+  std::vector<se::DeviceMemoryBase> elements;
+  for (int64 i = 0;
+       i < ShapeUtil::TupleElementCount(device_buffer.on_device_shape()); ++i) {
+    elements.push_back(device_buffer.buffer({i}));
+  }
+  return WriteSingleTupleIndexTable(
+      stream, elements, device_buffer.on_device_shape(), &device_memory);
+}
+
 Status TransferManager::TransferBufferFromDevice(
     se::Stream* stream, const se::DeviceMemoryBase& source, int64 size,
     void* destination) {
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 49f0b8f8b72001f07200d3e94828f60fcb0fa8fb..43a50487c636da75224547286a31625db3f91330 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -146,6 +146,12 @@ class TransferManager {
   Status WriteTupleIndexTablesAsync(se::Stream* stream,
                                     const ShapedBuffer& device_buffer);
 
+  // Writes a tuple index buffer for the root of 'device_buffer', which must
+  // be a tuple. Unlike WriteTupleIndexTables, only writes the root buffer,
+  // rather than writing all subbuffers. This method is always asynchronous.
+  Status WriteRootTupleIndexTable(se::Stream* stream,
+                                  const ShapedBuffer& device_buffer);
+
   // Determines the byte size requirement for the given shape on the underlying
   // architecture. This will be used to allocate an appropriately sized memory
   // region for a host-to-device transfer.
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index eaf4f28b87ce7706832eebb0bc02d015e64ee89a..a95ca2bf2a8fcd700eb9234cafbfce9b62f2370c 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -45,7 +45,7 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
     auto& operand = *dot.operand(i);
     if (operand.IsRank2Transpose()) {
       operand_set.push_back(i);
-    } else if (ShapeUtil::Rank(operand.shape()) != 2) {
+    } else if (operand.shape().rank() != 2) {
       return {};
     }
   }
@@ -130,8 +130,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   HloInstruction* new_lhs;
   const int64 kLhsIdx = 0;
-  if (std::find(operand_indices.begin(), operand_indices.end(), kLhsIdx) !=
-      operand_indices.end()) {
+  if (absl::c_linear_search(operand_indices, kLhsIdx)) {
     HloInstruction& transpose = *convolution.mutable_operand(kLhsIdx);
     const auto& transpose_dimensions = transpose.dimensions();
     HloInstruction& transpose_operand = *transpose.mutable_operand(0);
@@ -154,8 +153,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   HloInstruction* new_rhs;
   const int64 kRhsIdx = 1;
-  if (std::find(operand_indices.begin(), operand_indices.end(), kRhsIdx) !=
-      operand_indices.end()) {
+  if (absl::c_linear_search(operand_indices, kRhsIdx)) {
     HloInstruction& transpose = *convolution.mutable_operand(kRhsIdx);
     const auto& transpose_dimensions = transpose.dimensions();
     HloInstruction& transpose_operand = *transpose.mutable_operand(0);
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 50d51eaeb762e208004c1dae3dcc27503f3f94e9..5e505aaf02f157d0cba9dff42b1a9b89a6691504 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -55,11 +56,10 @@ bool PointsToSet::IsAmbiguous() const {
 
 bool PointsToSet::IsDistinct() const {
   bool distinct = true;
-  std::set<const LogicalBuffer*> all_points_to;
-  ForEachElement([&distinct, &all_points_to](const ShapeIndex& /*index*/,
-                                             const BufferList& points_to) {
+  absl::flat_hash_set<const LogicalBuffer*> all_points_to;
+  ForEachElement([&](const ShapeIndex& /*index*/, const BufferList& points_to) {
     for (auto& buffer : points_to) {
-      if (all_points_to.count(buffer) != 0) {
+      if (all_points_to.contains(buffer)) {
         distinct = false;
       }
       all_points_to.insert(buffer);
@@ -87,9 +87,7 @@ bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const {
   bool found = false;
   ForEachElement([&found, &buffer](const ShapeIndex& /*index*/,
                                    const BufferList& pointed_to_buffers) {
-    if (!found &&
-        std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(),
-                  &buffer) != pointed_to_buffers.end()) {
+    if (!found && absl::c_linear_search(pointed_to_buffers, &buffer)) {
       found = true;
     }
   });
@@ -99,8 +97,7 @@ bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const {
 bool PointsToSet::ContainsBufferAtIndex(const LogicalBuffer& buffer,
                                         const ShapeIndex& index) const {
   const auto& pointed_to_buffers = element(index);
-  return std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(),
-                   &buffer) != pointed_to_buffers.end();
+  return absl::c_linear_search(pointed_to_buffers, &buffer);
 }
 
 void PointsToSet::AddPointedToBuffer(const LogicalBuffer& buffer,
@@ -210,7 +207,7 @@ Status TuplePointsToAnalysis::DefaultAction(HloInstruction* hlo_instruction) {
             &logical_buffer_analysis_->GetBuffer(hlo_instruction, index));
       });
 
-  if (ShapeUtil::IsTuple(hlo_instruction->shape())) {
+  if (hlo_instruction->shape().IsTuple()) {
     // If the hlo instruction is a tuple-shaped, then trivially the instruction
     // itself is the source of the tuple.
     points_to_set.add_tuple_source({}, hlo_instruction);
@@ -604,9 +601,8 @@ bool TuplePointsToAnalysis::DoesNotUseOperandBuffer(
   } else if (user->opcode() == HloOpcode::kFusion &&
              user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     // Find fusion parameter associated with 'operand'.
-    auto it = std::find_if(
-        user->fused_parameters().begin(), user->fused_parameters().end(),
-        [=](HloInstruction* fused_param) {
+    auto it = absl::c_find_if(
+        user->fused_parameters(), [&](HloInstruction* fused_param) {
           return user->operand(fused_param->parameter_number()) == operand;
         });
     CHECK(it != user->fused_parameters().end());
@@ -672,9 +668,8 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
   }
   // Find fusion parameter associated with 'operand'.
   const auto& fused_params = fusion->fused_parameters();
-  auto fused_param_it = std::find_if(
-      fused_params.begin(), fused_params.end(),
-      [&](HloInstruction* fused_param) {
+  auto fused_param_it =
+      absl::c_find_if(fused_params, [&](HloInstruction* fused_param) {
         return fusion->operand(fused_param->parameter_number()) == operand;
       });
   if (fused_param_it == fused_params.end()) {
@@ -743,11 +738,10 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
       // Check if one operand of kAdd fused root is kDot or kConvolution.
       auto* add = user->fused_expression_root();
       auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot;
-                       });
+          absl::c_find_if(add->operands(), [&](HloInstruction* operand) {
+            return operand->opcode() == HloOpcode::kConvolution ||
+                   operand->opcode() == HloOpcode::kDot;
+          });
       if (add_operand_it == add->operands().end()) {
         return false;
       }
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 561762b5d424ed5f537665be9d67a81dc8bdd56e..fd5759e44230db8223822d6ae0f511027f73d8f9 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -623,7 +623,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   void Run(const bool add_additional_gte0_user) {
     Shape input_shape = ShapeUtil::MakeShape(F32, {8});
     Shape update_shape = ShapeUtil::MakeShape(F32, {3});
-    Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+    Shape starts_shape = ShapeUtil::MakeShape(S32, {});
     Shape tuple_shape =
         ShapeUtil::MakeTupleShape({input_shape, update_shape, starts_shape});
 
@@ -657,7 +657,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
         HloInstruction::CreateGetTupleElement(starts_shape, tuple_param0, 2));
     // Update 'input' with 'update' at dynamic 'starts' indices.
     builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-        input_shape, input, update, starts));
+        input_shape, input, update, {starts}));
 
     // Build computation and add it to module as entry computation.
     BuildModule(builder.Build());
@@ -721,9 +721,8 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   // to fusion 'operand'.
   HloInstruction* GetFusionParameterForOperand(HloInstruction* fusion,
                                                HloInstruction* operand) {
-    auto it = std::find_if(
-        fusion->fused_instructions().begin(),
-        fusion->fused_instructions().end(), [=](const HloInstruction* fused) {
+    auto it = absl::c_find_if(
+        fusion->fused_instructions(), [&](const HloInstruction* fused) {
           return fused->opcode() == HloOpcode::kParameter &&
                  fusion->operand(fused->parameter_number()) == operand;
         });
@@ -734,7 +733,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   // Returns all users of 'fusion_paran' at 'tuple_index'.
   std::vector<HloInstruction*> GetFusionParameterUsersAt(
       HloInstruction* fusion_param, int64 tuple_index) {
-    CHECK(ShapeUtil::IsTuple(fusion_param->shape()));
+    CHECK(fusion_param->shape().IsTuple());
     std::vector<HloInstruction*> users_at_tuple_index;
     for (auto user : fusion_param->users()) {
       CHECK_EQ(HloOpcode::kGetTupleElement, user->opcode());
@@ -883,12 +882,12 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update, {starts}));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -977,12 +976,12 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update, {starts}));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -1004,7 +1003,7 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
 
   Shape data_shape = ShapeUtil::MakeShape(F32, {8});
   Shape update_shape = ShapeUtil::MakeShape(F32, {4});
-  Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+  Shape starts_shape = ShapeUtil::MakeShape(S32, {});
   auto data = builder.AddInstruction(
       HloInstruction::CreateParameter(0, data_shape, "data"));
   auto update = builder.AddInstruction(
@@ -1012,7 +1011,7 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
   auto starts = builder.AddInstruction(
       HloInstruction::CreateParameter(2, starts_shape, "starts"));
   auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      data_shape, data, update, starts));
+      data_shape, data, update, {starts}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/tuple_util.cc b/tensorflow/compiler/xla/service/tuple_util.cc
index cfb0c787d09557fd1aec3517eb9698cfec323369..90ea79ec263a038556ccbd2cd345b337c5a5dcf3 100644
--- a/tensorflow/compiler/xla/service/tuple_util.cc
+++ b/tensorflow/compiler/xla/service/tuple_util.cc
@@ -21,7 +21,7 @@ namespace xla {
 
 /*static*/ HloInstruction* TupleUtil::ExtractPrefix(HloInstruction* input_tuple,
                                                     int64 elements) {
-  CHECK(ShapeUtil::IsTuple(input_tuple->shape()));
+  CHECK(input_tuple->shape().IsTuple());
 
   HloComputation* computation = input_tuple->parent();
   const Shape& input_shape = input_tuple->shape();
@@ -41,7 +41,7 @@ namespace xla {
 /*static*/ HloInstruction* TupleUtil::AppendSuffix(
     HloInstruction* input_tuple,
     absl::Span<HloInstruction* const> trailing_values) {
-  CHECK(ShapeUtil::IsTuple(input_tuple->shape()));
+  CHECK(input_tuple->shape().IsTuple());
 
   HloComputation* computation = input_tuple->parent();
   const Shape& input_shape = input_tuple->shape();
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index 68e2569f66bea9ec1223e454d1ead0efc7b9498e..c93a9ba3176002a34fe84a29e62075de4d19168f 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -301,7 +301,7 @@ optional<int64> ComputeWhileLoopTripCountUpperBound(HloInstruction* while_op) {
                                   /*dest_shape_index=*/{indvar_index},
                                   /*src_shape_index=*/{}));
   StatusOr<Literal> eval_result =
-      evaluator.Evaluate<Literal>(*while_cond, {std::move(fake_input)});
+      evaluator.Evaluate(*while_cond, {std::move(fake_input)});
 
   if (!eval_result.ok()) {
     VLOG(2) << "Couldn't evaluate while loop condition.";
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index 41011176ffa91e885bc58364d1fb19617d3518ad..69cc8feb3f31ad782b9d3437d81d0ab8ce10aadb 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -89,7 +89,7 @@ static void CreateLoopInvariantCopy(
 
     HloInstruction* next_operand =
         frame->instruction->mutable_operand(frame->operand_index++);
-    if (hoisted_instructions->count(next_operand) ||
+    if (hoisted_instructions->contains(next_operand) ||
         next_operand == while_body_param) {
       continue;
     }
@@ -127,7 +127,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
     HloInstruction* while_instr) {
   auto print_no_metadata = HloPrintOptions{}.set_print_metadata(false);
 
-  if (!ShapeUtil::IsTuple(while_instr->shape())) {
+  if (!while_instr->shape().IsTuple()) {
     // This restriction leaves one interesting pattern on the table:
     //
     //  while_body(f32[1024, 1024] %param) {
@@ -168,7 +168,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
   // is no benefit to hoisting them unless something that uses it is also
   // hoisted.
   for (auto* instr : WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) {
-    if (ShapeUtil::IsArray(instr->shape())) {
+    if (instr->shape().IsArray()) {
       // TODO(b/79147885): We should try to generalize this to tuples for
       // uniformity's sake, if nothing else.
       InsertOrDie(&unhoisted_invariant_instructions, instr);
@@ -221,7 +221,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
         ShapeUtil::ForEachSubshape(
             operand->shape(),
             [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) {
-              if (ShapeUtil::IsArray(subshape)) {
+              if (subshape.IsArray()) {
                 input_size += ShapeUtil::ByteSizeOfElements(subshape);
               }
             });
@@ -229,7 +229,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
       ShapeUtil::ForEachSubshape(
           instruction->shape(),
           [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) {
-            if (ShapeUtil::IsArray(subshape)) {
+            if (subshape.IsArray()) {
               output_size += ShapeUtil::ByteSizeOfElements(subshape);
             }
           });
@@ -241,7 +241,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
 
     auto is_invariant = [&](HloInstruction* op) {
       return hoisted_instructions.find(op) != hoisted_instructions.end() ||
-             unhoisted_invariant_instructions.count(op) ||
+             unhoisted_invariant_instructions.contains(op) ||
              op->opcode() == HloOpcode::kConstant;
     };
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 8e7c4bc8828552e197b41f874c070d496b85a382..3587c016b4420163a607422b1acc838646fab83a 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -299,7 +299,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   // bitcast either.
   auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
+  auto effective_scalar_s32 = ShapeUtil::MakeShape(S32, {1});
   auto token_shape = ShapeUtil::MakeTokenShape();
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, token_shape});
@@ -314,10 +314,12 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
     HloInstruction* in_token = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(token_shape, param, 2));
-    HloInstruction* bitcast_inst = builder.AddInstruction(
-        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
-    HloInstruction* out_token = builder.AddInstruction(
-        HloInstruction::CreateOutfeed(scalar_f32, bitcast_inst, in_token, ""));
+    HloInstruction* bitcast_inst =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            effective_scalar_s32, HloOpcode::kBitcast, gte_0));
+    HloInstruction* out_token =
+        builder.AddInstruction(HloInstruction::CreateOutfeed(
+            effective_scalar_s32, bitcast_inst, in_token, ""));
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
@@ -352,9 +354,9 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
   // The bitcast's user can be hoisted, so hoist the bitcast too.
   auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
-  Shape while_shape =
-      ShapeUtil::MakeTupleShape({scalar_s32, scalar_f32, scalar_f32});
+  auto effective_scalar_s32 = ShapeUtil::MakeShape(S32, {1});
+  Shape while_shape = ShapeUtil::MakeTupleShape(
+      {scalar_s32, effective_scalar_s32, effective_scalar_s32});
 
   HloComputation* while_body = [&]() {
     HloComputation::Builder builder(TestName() + ".while_body");
@@ -363,12 +365,13 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
     HloInstruction* gte_0 = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
     HloInstruction* gte_1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_f32, param, 1));
-    HloInstruction* bitcast_inst = builder.AddInstruction(
-        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
+        HloInstruction::CreateGetTupleElement(effective_scalar_s32, param, 1));
+    HloInstruction* bitcast_inst =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            effective_scalar_s32, HloOpcode::kBitcast, gte_0));
     HloInstruction* add_inst =
         builder.AddInstruction(HloInstruction::CreateBinary(
-            scalar_f32, HloOpcode::kAdd, bitcast_inst, gte_1));
+            effective_scalar_s32, HloOpcode::kAdd, bitcast_inst, gte_1));
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_inst}));
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index d30f67dd8110b88166fe807762fb653190ec00bc..386ffb995477ff1b4aef73080b6a6fd988dd1980 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -58,7 +58,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   HloComputation* while_body = while_op->while_body();
   HloInstruction* while_body_root = while_body->root_instruction();
 
-  if (!ShapeUtil::IsTuple(while_init->shape())) {
+  if (!while_init->shape().IsTuple()) {
     VLOG(2) << "While op's carried value isn't tuple shaped.";
     return false;
   }
@@ -109,8 +109,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       // operand appears in, but it may appear more than once!
       if (user->user_count() == 1 && user->users().front() == while_body_root &&
           while_body_root->operand_index(user) == user->tuple_index() &&
-          std::count(while_body_root->operands().begin(),
-                     while_body_root->operands().end(), user) == 1) {
+          absl::c_count(while_body_root->operands(), user) == 1) {
         continue;
       }
 
@@ -127,7 +126,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // through to the while body's root, count that element as "used", since
   // removing that element would be observable.
   for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
-    if (used_tuple_indices.count(i)) {
+    if (used_tuple_indices.contains(i)) {
       continue;
     }
 
@@ -158,7 +157,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Build up maps from the old/new to the new/old tuple indices.
   std::vector<int64> new_to_old_tuple_idx(used_tuple_indices.begin(),
                                           used_tuple_indices.end());
-  std::sort(new_to_old_tuple_idx.begin(), new_to_old_tuple_idx.end());
+  absl::c_sort(new_to_old_tuple_idx);
 
   absl::flat_hash_map<int64, int64> old_to_new_tuple_idx;
   for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) {
@@ -181,7 +180,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // replace the old instructions after we remove unused elements from the while
   // tuple.
   auto make_while_computation_replacements = [&](const HloComputation* comp) {
-    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements;
 
     auto* param = comp->parameter_instruction(0);
@@ -233,7 +232,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       while_cond->CloneWithReplacements(
           make_while_computation_replacements(while_cond));
 
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       while_body_replacements = make_while_computation_replacements(while_body);
   std::vector<HloInstruction*> new_while_body_root_elems;
   new_while_body_root_elems.reserve(new_to_old_tuple_idx.size());
@@ -583,8 +582,7 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
 static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
     absl::Span<HloInstruction*> instrs, const Shape& desired_shape,
     std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
-  CHECK(ShapeUtil::IsTuple(desired_shape))
-      << ShapeUtil::HumanString(desired_shape);
+  CHECK(desired_shape.IsTuple()) << ShapeUtil::HumanString(desired_shape);
 
   // For each child shape in `desired_shape`, slice out the correct number of
   // `instrs` and call UnflattenTupleInstr recursively.  At each step we remove
@@ -593,7 +591,7 @@ static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
   std::vector<HloInstruction*> elems;
   for (int64 i = 0; i < desired_shape.tuple_shapes_size(); ++i) {
     const Shape& subshape = desired_shape.tuple_shapes(i);
-    if (!ShapeUtil::IsTuple(subshape)) {
+    if (!subshape.IsTuple()) {
       elems.push_back(instrs[0]);
       instrs.remove_prefix(1);
       continue;
@@ -603,7 +601,7 @@ static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
     int64 num_leaves = 0;
     ShapeUtil::ForEachSubshape(
         subshape, [&](const Shape& s, const ShapeIndex& /*index*/) {
-          if (!ShapeUtil::IsTuple(s)) {
+          if (!s.IsTuple()) {
             ++num_leaves;
           }
         });
@@ -625,7 +623,7 @@ static std::vector<HloInstruction*> GetFlatTupleElems(
     HloInstruction* instr,
     std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
   const auto& shape = instr->shape();
-  if (!ShapeUtil::IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return {instr};
   }
   std::vector<HloInstruction*> elems;
@@ -665,7 +663,7 @@ static StatusOr<bool> TryFlattenNestedTuples(HloInstruction* while_op) {
   std::vector<Shape> flattened_shape_elems;
   ShapeUtil::ForEachSubshape(while_shape,
                              [&](const Shape& s, const ShapeIndex& /*index*/) {
-                               if (!ShapeUtil::IsTuple(s)) {
+                               if (!s.IsTuple()) {
                                  flattened_shape_elems.push_back(s);
                                }
                              });
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 3713989ca2f64ee1d94c9f77255017909d957da2..ecca76b1e86d833c73fbb9bad6a341660a7d2669 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -407,13 +407,12 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
   // The original while instruction is still left in the module as a dead
   // instruction, find a while instruction with a different name as the new
   // while instruction.
+  const auto& instrs = m->entry_computation()->instructions();
   HloInstruction* new_while_op =
-      *std::find_if(m->entry_computation()->instructions().begin(),
-                    m->entry_computation()->instructions().end(),
-                    [&](const HloInstruction* instr) {
-                      return (instr->opcode() == HloOpcode::kWhile &&
-                              instr->name() != "while");
-                    });
+      *absl::c_find_if(instrs, [&](const HloInstruction* instr) {
+        return (instr->opcode() == HloOpcode::kWhile &&
+                instr->name() != "while");
+      });
 
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index 039ccda7322f5efda6a827efbeda1225c3596cc0..d77386497a14b3e52be2ea7f655fa330f60e4a97 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -97,7 +97,7 @@ WidenWhileBody(HloComputation* narrow_body, const Shape& wide_shape) {
 WhileUtil::MakeInstructionsLiveIn(
     HloInstruction* while_instr,
     absl::Span<HloInstruction* const> instructions) {
-  CHECK(ShapeUtil::IsTuple(while_instr->shape()));
+  CHECK(while_instr->shape().IsTuple());
 
   int64 elements_in_old_while_shape = while_instr->shape().tuple_shapes_size();
   Shape new_while_shape = while_instr->shape();
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
index 83d696fe0915086c3c98b6d7cbdaeaeb4d9d0bdb..661b7aa7d99ca549da6a509812760a1665d60919 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
@@ -31,16 +31,21 @@ StatusOr<bool> ZeroSizedHloElimination::Run(HloModule* module) {
   bool changed = false;
   for (HloComputation* comp : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      if (instruction->HasSideEffect() ||
-          !ShapeUtil::IsArray(instruction->shape()) ||
+      if (instruction->HasSideEffect() || !instruction->shape().IsArray() ||
           instruction->opcode() == HloOpcode::kConstant) {
         continue;
       }
       if (comp->IsRemovable(instruction) &&
           ShapeUtil::IsZeroElementArray(instruction->shape())) {
+        // If the instruction doesn't have a layout, use a default layout for
+        // the literal.
+        Shape shape = instruction->shape();
+        if (!LayoutUtil::HasLayout(shape)) {
+          LayoutUtil::SetToDefaultLayout(&shape);
+        }
         TF_RETURN_IF_ERROR(comp->ReplaceWithNewInstruction(
-            instruction, HloInstruction::CreateConstant(
-                             Literal::CreateFromShape(instruction->shape()))));
+            instruction,
+            HloInstruction::CreateConstant(Literal::CreateFromShape(shape))));
         changed = true;
       }
     }
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
index a546a6d39cc55d1f327b8449c7d26cd4c95dbf98..572a79609e7a912277af0fd2ba43f9a1e14a6f52 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -82,5 +82,18 @@ TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateConstant) {
   EXPECT_FALSE(changed);
 }
 
+TEST_F(ZeroSizedHloEliminationTest, ZeroSizedInstructionWithoutLayoutFolded) {
+  Shape op_shape = ShapeUtil::MakeShape(F32, {4, 0});
+  op_shape.clear_layout();
+  HloInstruction* param1 = builder_.AddInstruction(
+      HloInstruction::CreateParameter(1, op_shape, "zero sized param 1"));
+  HloInstruction* param2 = builder_.AddInstruction(
+      HloInstruction::CreateParameter(2, op_shape, "zero sized param 2"));
+  builder_.AddInstruction(
+      HloInstruction::CreateBinary(op_shape, HloOpcode::kAdd, param1, param2));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination());
+  EXPECT_TRUE(changed);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
index b206345db2ac2940b1f139c82fa03a93538b5ccd..a36d3547a0987422c2658b0f3046f7b1f83369c6 100644
--- a/tensorflow/compiler/xla/shape.cc
+++ b/tensorflow/compiler/xla/shape.cc
@@ -27,6 +27,21 @@ Shape::Shape(const ShapeProto& shape_proto) {
   for (const int64 dimension : shape_proto.dimensions()) {
     add_dimensions(dimension);
   }
+  // A malformed proto may have different is_dynamic_dimension_size and
+  // dimensions_size. Since C++ is evil, and we have no good way of bailing out
+  // in a constructor, conservatively trim the is_dynamic_dimension size.
+  // TODO(b/120111794): Make this a hard error when we have a factory method
+  // instead of a constructor.
+  if (shape_proto.dimensions_size() !=
+      shape_proto.is_dynamic_dimension_size()) {
+    LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
+                  "fields does not match number of dimension fields";
+  }
+  int64 num_dynamic_dimension_fields = std::min(
+      shape_proto.dimensions_size(), shape_proto.is_dynamic_dimension_size());
+  for (int i = 0; i < num_dynamic_dimension_fields; i++) {
+    dynamic_dimensions_[i] = shape_proto.is_dynamic_dimension(i);
+  }
   tuple_shapes_.reserve(shape_proto.tuple_shapes_size());
   for (const ShapeProto& element_shape : shape_proto.tuple_shapes()) {
     *add_tuple_shapes() = Shape(element_shape);
@@ -43,6 +58,9 @@ ShapeProto Shape::ToProto() const {
   for (const int64 dimension : dimensions()) {
     proto.add_dimensions(dimension);
   }
+  for (const bool dynamic : dynamic_dimensions_) {
+    proto.add_is_dynamic_dimension(dynamic);
+  }
   proto.mutable_tuple_shapes()->Reserve(tuple_shapes_size());
   for (const Shape& shape : tuple_shapes()) {
     *proto.add_tuple_shapes() = shape.ToProto();
@@ -61,6 +79,112 @@ string Shape::ToString(bool print_layout) const {
   }
 }
 
+bool Shape::is_static() const {
+  if (IsTuple()) {
+    for (const Shape& subshape : tuple_shapes_) {
+      if (!subshape.is_static()) {
+        return false;
+      }
+    }
+  }
+  return !absl::c_any_of(dynamic_dimensions_, [](bool b) { return b; });
+}
+
+void Shape::DeleteDimension(int64 dim_to_delete) {
+  CHECK(IsArray());
+  CHECK_GE(dim_to_delete, 0);
+  CHECK_LT(dim_to_delete, dimensions_.size());
+  dimensions_.erase(dimensions_.begin() + dim_to_delete);
+  dynamic_dimensions_.erase(dynamic_dimensions_.begin() + dim_to_delete);
+  if (LayoutUtil::HasLayout(*this)) {
+    layout_.set_format(DENSE);
+    for (int64 i = 0; i < layout_.minor_to_major().size();) {
+      if (layout_.minor_to_major(i) == dim_to_delete) {
+        layout_.mutable_minor_to_major()->erase(
+            layout_.mutable_minor_to_major()->begin() + i);
+        continue;
+      }
+      if (layout_.minor_to_major(i) > dim_to_delete) {
+        (*layout_.mutable_minor_to_major())[i] -= 1;
+      }
+      ++i;
+    }
+  }
+}
+
+bool Shape::Equal::operator()(const Shape& lhs, const Shape& rhs) {
+  if (lhs.IsTuple()) {
+    return rhs.IsTuple() &&
+           absl::c_equal(
+               lhs.tuple_shapes(), rhs.tuple_shapes(),
+               [=](const Shape& l, const Shape& r) { return (*this)(l, r); });
+  } else if (!lhs.IsArray()) {
+    // Non-tuple, non-array tupes such as opaque and token types are trivially
+    // the same.
+    return lhs.element_type() == rhs.element_type();
+  }
+
+  if (!rhs.IsArray()) {
+    return false;
+  }
+
+  if (!ignore_element_type_) {
+    if ((ignore_fp_precision_ &&
+         !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) ||
+        (!ignore_fp_precision_ && !ShapeUtil::SameElementType(lhs, rhs))) {
+      VLOG(3) << "CompareShapes: lhs element type != rhs element type";
+      return false;
+    }
+  }
+
+  if (!ignore_layout_) {
+    if (lhs.layout().format() != rhs.layout().format()) {
+      VLOG(3) << "CompareShapes: lhs layout format != rhs layout format";
+      return false;
+    }
+    if (LayoutUtil::IsDenseArray(lhs)) {
+      if (!absl::c_equal(LayoutUtil::MinorToMajor(lhs),
+                         LayoutUtil::MinorToMajor(rhs))) {
+        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+        return false;
+      }
+
+      const auto& lhs_tiles = lhs.layout().tiles();
+      const auto& rhs_tiles = rhs.layout().tiles();
+      if (lhs_tiles.size() != rhs_tiles.size()) {
+        return false;
+      }
+      for (int64 i = 0; i < lhs_tiles.size(); i++) {
+        if (!absl::c_equal(lhs_tiles[i].dimensions(),
+                           rhs_tiles[i].dimensions())) {
+          return false;
+        }
+      }
+
+      if (lhs.layout().element_size_in_bits() !=
+          rhs.layout().element_size_in_bits()) {
+        return false;
+      }
+    }
+  }
+
+  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
+    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
+    return false;
+  }
+
+  if (!ignore_dynamic_dimension_) {
+    for (int i = 0; i < lhs.rank(); ++i) {
+      if (lhs.is_dynamic_dimension(i) != rhs.is_dynamic_dimension(i)) {
+        VLOG(3)
+            << "CompareShapes: lhs and rhs have different dynamic dimensions.";
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 std::ostream& operator<<(std::ostream& out, const Shape& shape) {
   out << shape.ToString(/*print_layout=*/true);
   return out;
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index 7643f64d8a5f0450be1cddad35cf7422afb89048..e6b4e872f69e16ea407dc18cadfc83618080084f 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/layout.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -44,6 +45,43 @@ class Shape {
   // without layout. e.g. "F32[42,12] {0, 1}" or "F32[64]".
   string ToString(bool print_layout = false) const;
 
+  // Returns the rank (number of dimensions) of the given shape. Shape must be
+  // an array.
+  int64 rank() const {
+    CHECK(IsArray()) << "Non-arrays do not have a rank, shape: " << ToString();
+    return dimensions_.size();
+  }
+
+  // Returns whether the shape is of the specified type (array, tuple, etc).
+  bool IsArray() const { return primitive_util::IsArrayType(element_type()); }
+  bool IsTuple() const { return element_type() == TUPLE; }
+  bool IsToken() const { return element_type() == TOKEN; }
+  bool IsOpaque() const { return element_type() == OPAQUE; }
+
+  // Returns true if no array dimension in the shape is dynamically sized. Tuple
+  // shapes are traversed recursively.
+  bool is_static() const;
+
+  // Returns true if the given dimension is dynamically-sized.
+  bool is_dynamic_dimension(int dimension) const {
+    return dynamic_dimensions_.at(dimension);
+  }
+
+  // Sets whether or not the given dimension is dynamically-sized.
+  void set_dynamic_dimension(int dimension, bool is_dynamic) {
+    dynamic_dimensions_[dimension] = is_dynamic;
+  }
+
+  const std::vector<bool>& dynamic_dimensions() const {
+    return dynamic_dimensions_;
+  }
+
+  // Add dimension_upper_bound().
+
+  // Removes the given dimension form the shape. Layout, if it exists, is
+  // adjusted to match the modified shape.
+  void DeleteDimension(int64 dim_to_delete);
+
   // The following methods mirror the protobuf generated code interface for the
   // message ShapeProto. This enabled easy migration of this data structure
   // from a proto to a proper C++ class.
@@ -58,10 +96,16 @@ class Shape {
   int dimensions_size() const { return dimensions_.size(); }
   int64 dimensions(int index) const { return dimensions_.at(index); }
   void set_dimensions(int index, int64 value) { dimensions_.at(index) = value; }
-  void add_dimensions(int64 value) { dimensions_.push_back(value); }
-  void clear_dimensions() { dimensions_.clear(); }
+  void add_dimensions(int64 value) {
+    dimensions_.push_back(value);
+    dynamic_dimensions_.push_back(false);
+  }
+  void clear_dimensions() {
+    dimensions_.clear();
+    dynamic_dimensions_.clear();
+  }
   const std::vector<int64>& dimensions() const { return dimensions_; }
-  std::vector<int64>* mutable_dimensions() { return &dimensions_; }
+  absl::Span<int64> mutable_dimensions() { return absl::MakeSpan(dimensions_); }
 
   // Methods for accessing the tuple subshapes. This field only non-empty for
   // tuple shapes.
@@ -98,13 +142,58 @@ class Shape {
   string ShortDebugString() const { return ToProto().ShortDebugString(); }
   string DebugString() const { return ToProto().DebugString(); }
 
- public:
+  // Equal is a configurable functor to check the equality of two shapes.
+  //
+  // Examples:
+  //
+  // - Comparing two shapes ignoring they layout difference:
+  //   Equal().IgnoreLayout()(shape1, shape2);
+  //
+  // - Comparing two shapes ignoring they layout and element type difference:
+  //   Equal().IgnoreLayout().IgnoreElementType()(shape1, shape2);
+  class Equal {
+   public:
+    Equal() = default;
+
+    bool operator()(const Shape& lhs, const Shape& rhs);
+
+    Equal& IgnoreLayout() {
+      ignore_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreElementType() {
+      ignore_element_type_ = true;
+      return *this;
+    }
+    Equal& IgnoreFpPrecision() {
+      ignore_fp_precision_ = true;
+      return *this;
+    }
+    Equal& IgnoreDynamicDimension() {
+      ignore_dynamic_dimension_ = true;
+      return *this;
+    }
+
+   public:
+    bool ignore_layout_ = false;
+    bool ignore_element_type_ = false;
+    bool ignore_fp_precision_ = false;
+    bool ignore_dynamic_dimension_ = false;
+  };
+
+ private:
   // The element type of this shape (tuple, array, etc).
   PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
 
-  // The array bounds of the dimensions. This is nonempty only for array shapes.
+  // The array bounds of the dimensions. This is nonempty only for array
+  // shapes. For a dynamically-sized dimension, the respective value in this
+  // vector is an inclusive upper limit of the array bound.
   std::vector<int64> dimensions_;
 
+  // This vector is the same size as 'dimensions_' and indicates whether the
+  // respective dimension is dynamically sized.
+  std::vector<bool> dynamic_dimensions_;
+
   // The tuple element subshapes. This is nonempty only for tuple shapes.
   std::vector<Shape> tuple_shapes_;
 
diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc
index d44db89d571891ecef554cd45c050017833982bb..a000886d60d06a4a598910c901accb6dfd0a8f1a 100644
--- a/tensorflow/compiler/xla/shape_layout.cc
+++ b/tensorflow/compiler/xla/shape_layout.cc
@@ -52,7 +52,7 @@ bool ShapeLayout::MatchesLayoutInShape(const Shape& shape) const {
 
 const Layout& ShapeLayout::layout() const {
   CHECK(LayoutIsSet());
-  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!shape_.IsTuple());
   return shape_.layout();
 }
 
@@ -61,15 +61,15 @@ void ShapeLayout::Clear() { LayoutUtil::ClearLayout(&shape_); }
 bool ShapeLayout::LayoutIsSet() const { return LayoutUtil::HasLayout(shape_); }
 
 void ShapeLayout::ResetLayout(const Layout& layout) {
-  CHECK(!ShapeUtil::IsTuple(shape_));
-  CHECK(!ShapeUtil::IsOpaque(shape_));
+  CHECK(!shape_.IsTuple());
+  CHECK(!shape_.IsOpaque());
   *shape_.mutable_layout() = layout;
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape_));
 }
 
 void ShapeLayout::ResetLayout(const Layout& layout,
                               ShapeIndexView shape_index) {
-  CHECK(ShapeUtil::IsTuple(shape_));
+  CHECK(shape_.IsTuple());
   *ShapeUtil::GetMutableSubshape(&shape_, shape_index)->mutable_layout() =
       layout;
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape_));
diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc
index e396897eeebc2e7bdc2dc49300c8906710608b05..55ce5fe884e98e474253be9ef694f1b8137b4b01 100644
--- a/tensorflow/compiler/xla/shape_test.cc
+++ b/tensorflow/compiler/xla/shape_test.cc
@@ -41,11 +41,13 @@ class ShapeTest : public ::testing::Test {
       ShapeUtil::MakeTupleShape({opaque_, scalar_, matrix_, matrix2_});
   const Shape nested_tuple_ =
       ShapeUtil::MakeTupleShape({tuple_, matrix_, token_});
+  const Shape dyanmic_matrix_ =
+      ShapeUtil::MakeShape(S32, {5, 2}, {true, false});
 };
 
 TEST_F(ShapeTest, ShapeToFromProto) {
-  for (const Shape& shape :
-       {opaque_, token_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_}) {
+  for (const Shape& shape : {opaque_, token_, scalar_, matrix_, matrix2_,
+                             tuple_, nested_tuple_, dyanmic_matrix_}) {
     Shape shape_copy(shape.ToProto());
     EXPECT_TRUE(ShapeUtil::Equal(shape, shape_copy))
         << shape << " != " << shape_copy;
@@ -74,6 +76,47 @@ TEST_F(ShapeTest, ShapeToString) {
       nested_tuple_.ToString(/*print_layout=*/true));
 }
 
+TEST_F(ShapeTest, DynamicShapeToString) {
+  Shape array_shape =
+      ShapeUtil::MakeShape(F32, {23, 44, 55}, {true, false, true});
+  EXPECT_EQ("f32[<=23,44,<=55]", array_shape.ToString());
+
+  array_shape.set_dynamic_dimension(2, false);
+  EXPECT_EQ("f32[<=23,44,55]", array_shape.ToString());
+}
+
+TEST_F(ShapeTest, IsStatic) {
+  EXPECT_TRUE(opaque_.is_static());
+  EXPECT_TRUE(token_.is_static());
+  EXPECT_TRUE(matrix_.is_static());
+  EXPECT_TRUE(tuple_.is_static());
+  EXPECT_TRUE(nested_tuple_.is_static());
+
+  Shape dynamic_matrix = matrix_;
+  EXPECT_TRUE(dynamic_matrix.is_static());
+  dynamic_matrix.set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_matrix.is_static());
+
+  Shape dynamic_tuple = tuple_;
+  EXPECT_TRUE(dynamic_tuple.is_static());
+  ShapeUtil::GetMutableSubshape(&dynamic_tuple, {2})
+      ->set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_tuple.is_static());
+}
+
+TEST_F(ShapeTest, IsDynamicDimension) {
+  Shape dynamic_matrix = matrix_;
+  dynamic_matrix.set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_matrix.is_dynamic_dimension(0));
+  EXPECT_TRUE(dynamic_matrix.is_dynamic_dimension(1));
+
+  Shape dynamic_tuple = tuple_;
+  EXPECT_TRUE(dynamic_tuple.is_static());
+  ShapeUtil::GetMutableSubshape(&dynamic_tuple, {2})
+      ->set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_tuple.is_static());
+}
+
 TEST_F(ShapeTest, ProgramShapeToFromProto) {
   ProgramShape program_shape;
   *program_shape.add_parameters() = ShapeUtil::MakeShape(F32, {1, 2, 3});
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 7bf97729165bef98fabc29040e02203eee68a53c..089120179e2a77518eb5b18c11a35670b03e9b77 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -395,7 +395,7 @@ class ShapeTreeIterator
 template <typename T>
 int64 ShapeTree<T>::CountSubshapes(const Shape& shape) {
   int64 current_count = 1;
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     int64 count = ShapeUtil::TupleElementCount(shape);
     for (int i = 0; i < count; ++i) {
       current_count += CountSubshapes(shape.tuple_shapes(i));
@@ -407,7 +407,7 @@ int64 ShapeTree<T>::CountSubshapes(const Shape& shape) {
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
                                 Node* node, Index* index) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     const int64 size = ShapeUtil::TupleElementCount(shape);
 #ifndef NDEBUG
     index->children_count = size;
@@ -443,7 +443,7 @@ void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
 
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, Node* node, Index* index) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     const int64 size = ShapeUtil::TupleElementCount(shape);
 #ifndef NDEBUG
     index->children_count = size;
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index be7d71ada009535a5c08aa3d3d062fa451cfeef3..1ada4bc0362f86bc770d4adfcd4d4b0ff7379c77 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -81,73 +81,10 @@ bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const {
 
 /* static */ bool ShapeUtil::IsArrayPrimitiveType(
     PrimitiveType primitive_type) {
-  return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE &&
-         primitive_type != OPAQUE && primitive_type != TOKEN;
+  return primitive_util::IsArrayType(primitive_type);
 }
 
 namespace {
-
-// Recursive helper for comparing the equality of two shapes. Returns true if
-// the shapes are the same. If compare_layouts is true, then layouts must also
-// match.
-bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
-                   bool ignore_fp_precision) {
-  if ((ignore_fp_precision &&
-       !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) ||
-      (!ignore_fp_precision && !ShapeUtil::SameElementType(lhs, rhs))) {
-    VLOG(3) << "CompareShapes: lhs element type != rhs element type";
-    return false;
-  }
-
-  if (ShapeUtil::IsTuple(lhs)) {
-    return absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         [=](const Shape& l, const Shape& r) {
-                           return CompareShapes(l, r, compare_layouts,
-                                                ignore_fp_precision);
-                         });
-  } else if (!ShapeUtil::IsArray(lhs)) {
-    // Non-tuple, non-array tupes such as opaque and token types are trivially
-    // the same.
-    return true;
-  }
-
-  if (compare_layouts) {
-    if (lhs.layout().format() != rhs.layout().format()) {
-      return false;
-    }
-    if (LayoutUtil::IsDenseArray(lhs)) {
-      if (!absl::c_equal(LayoutUtil::MinorToMajor(lhs),
-                         LayoutUtil::MinorToMajor(rhs))) {
-        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
-        return false;
-      }
-
-      const auto& lhs_tiles = lhs.layout().tiles();
-      const auto& rhs_tiles = rhs.layout().tiles();
-      if (lhs_tiles.size() != rhs_tiles.size()) {
-        return false;
-      }
-      for (int64 i = 0; i < lhs_tiles.size(); i++) {
-        if (!absl::c_equal(lhs_tiles[i].dimensions(),
-                           rhs_tiles[i].dimensions())) {
-          return false;
-        }
-      }
-
-      if (lhs.layout().element_size_in_bits() !=
-          rhs.layout().element_size_in_bits()) {
-        return false;
-      }
-    }
-  }
-
-  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
-    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
-    return false;
-  }
-  return true;
-}
-
 // Constructs and returns the new shape with the given minor_to_major order in
 // its Layout.
 StatusOr<Shape> MakeShapeWithLayoutInternal(
@@ -174,12 +111,11 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
   return shape;
 }
-
 }  // namespace
 
 /* static */ bool ShapeUtil::Equal(const Shape& lhs, const Shape& rhs) {
-  bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true,
-                             /*ignore_fp_precision=*/false);
+  bool equal = Shape::Equal()(lhs, rhs);
+
   if (!equal && VLOG_IS_ON(3)) {
     VLOG(3) << "ShapeUtil::Equal differ: lhs = " << lhs.ShortDebugString()
             << ", rhs = " << rhs.ShortDebugString();
@@ -190,8 +126,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ bool ShapeUtil::EqualIgnoringFpPrecision(const Shape& lhs,
                                                       const Shape& rhs) {
-  bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true,
-                             /*ignore_fp_precision=*/true);
+  bool equal = Shape::Equal().IgnoreFpPrecision()(lhs, rhs);
   if (!equal && VLOG_IS_ON(3)) {
     VLOG(3) << "ShapeUtil::EqualIgnoringFpPrecision differ: lhs = "
             << lhs.ShortDebugString() << ", rhs = " << rhs.ShortDebugString();
@@ -200,12 +135,6 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return equal;
 }
 
-/* static */ int64 ShapeUtil::Rank(const Shape& shape) {
-  CHECK(ShapeUtil::IsArray(shape))
-      << "Non-arrays do not have a rank, shape: " << shape;
-  return shape.dimensions_size();
-}
-
 /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) {
   int64 accum = 0;
   for (int64 dimension : shape.dimensions()) {
@@ -232,6 +161,13 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return MakeValidatedShape(element_type, dimensions).ValueOrDie();
 }
 
+/* static */ Shape ShapeUtil::MakeShape(
+    PrimitiveType element_type, absl::Span<const int64> dimensions,
+    const std::vector<bool>& dynamic_dimensions) {
+  return MakeValidatedShape(element_type, dimensions, dynamic_dimensions)
+      .ValueOrDie();
+}
+
 /* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
     PrimitiveType element_type, absl::Span<const int64> dimensions) {
   CHECK(IsArrayPrimitiveType(element_type)) << element_type;
@@ -240,6 +176,17 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return result;
 }
 
+/* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
+    PrimitiveType element_type, absl::Span<const int64> dimensions,
+    const std::vector<bool>& dynamic_dimensions) {
+  TF_ASSIGN_OR_RETURN(Shape shape,
+                      MakeValidatedShape(element_type, dimensions));
+  for (int i = 0; i < dynamic_dimensions.size(); ++i) {
+    shape.set_dynamic_dimension(i, dynamic_dimensions[i]);
+  }
+  return shape;
+}
+
 /* static */ Shape ShapeUtil::MakeShapeWithLayout(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
     absl::Span<const int64> minor_to_major) {
@@ -319,7 +266,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) {
   CHECK(LayoutUtil::IsDenseArray(*shape));
-  shape->mutable_layout()->add_minor_to_major(Rank(*shape));
+  shape->mutable_layout()->add_minor_to_major(shape->rank());
   shape->add_dimensions(bound);
   TF_DCHECK_OK(ValidateShape(*shape));
 }
@@ -334,7 +281,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::ElementHasBitWidth(const Shape& shape, int bits) {
-  if (!IsArray(shape)) {
+  if (!shape.IsArray()) {
     return false;
   }
   return primitive_util::BitWidth(shape.element_type()) == bits;
@@ -358,6 +305,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     case U32:
     case U64:
     case C64:
+    case C128:
     case TUPLE:
     case OPAQUE:
     case TOKEN:
@@ -376,27 +324,24 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return primitive_util::IsFloatingPointType(shape.element_type());
 }
 
-/* static */ bool ShapeUtil::IsArray(const Shape& shape) {
-  return IsArrayPrimitiveType(shape.element_type());
-}
-
 /* static */ bool ShapeUtil::IsNestedTuple(const Shape& shape) {
-  return IsTuple(shape) && std::any_of(shape.tuple_shapes().begin(),
-                                       shape.tuple_shapes().end(), IsTuple);
+  return shape.IsTuple() &&
+         absl::c_any_of(shape.tuple_shapes(),
+                        [](const Shape& s) { return s.IsTuple(); });
 }
 
 /* static */ bool ShapeUtil::IsEmptyTuple(const Shape& shape) {
-  return IsTuple(shape) && TupleElementCount(shape) == 0;
+  return shape.IsTuple() && TupleElementCount(shape) == 0;
 }
 
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
-  CHECK(IsTuple(shape)) << HumanString(shape);
+  CHECK(shape.IsTuple()) << HumanString(shape);
   return shape.tuple_shapes_size();
 }
 
 /* static */ const Shape& ShapeUtil::GetTupleElementShape(const Shape& shape,
                                                           int64 index) {
-  CHECK(IsTuple(shape));
+  CHECK(shape.IsTuple());
   CHECK_GT(TupleElementCount(shape), index);
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape.tuple_shapes(index)));
   return shape.tuple_shapes(index);
@@ -412,7 +357,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ Shape ShapeUtil::SliceTuple(const Shape& tuple, int64 start,
                                          int64 limit) {
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(tuple));
-  CHECK(IsTuple(tuple));
+  CHECK(tuple.IsTuple());
   CHECK_LE(start, TupleElementCount(tuple));
   CHECK_LE(limit, TupleElementCount(tuple));
 
@@ -429,15 +374,9 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                               complex_shape.element_type()));
 }
 
-/* static */ bool ShapeUtil::ShapeIs(const Shape& shape,
-                                     PrimitiveType element_type,
-                                     std::initializer_list<int64> dimensions) {
-  return Equal(shape, MakeShape(element_type, dimensions));
-}
-
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
-  DCHECK(IsArray(shape)) << ShapeUtil::HumanString(shape);
-  DCHECK_EQ(shape.dimensions_size(), Rank(shape));
+  DCHECK(shape.IsArray()) << ShapeUtil::HumanString(shape);
+  DCHECK_EQ(shape.dimensions_size(), shape.rank());
   if (shape.dimensions().size() == 1) {
     return shape.dimensions()[0];
   }
@@ -447,8 +386,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ int64 ShapeUtil::ElementsInRecursive(const Shape& shape) {
-  CHECK(IsArray(shape) || IsTuple(shape));
-  if (IsArray(shape)) {
+  CHECK(shape.IsArray() || shape.IsTuple());
+  if (shape.IsArray()) {
     return ElementsIn(shape);
   }
   int64 count = 0;
@@ -472,7 +411,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::IsZeroElementArray(const Shape& shape) {
-  return ShapeUtil::IsArray(shape) && ElementsIn(shape) == 0;
+  return shape.IsArray() && ElementsIn(shape) == 0;
 }
 
 /* static */ bool ShapeUtil::IsScalarWithElementType(
@@ -481,7 +420,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ string ShapeUtil::HumanString(const Shape& shape) {
-  if (IsTuple(shape)) {
+  if (shape.IsTuple()) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -491,13 +430,21 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     text += ")";
     return text;
   }
+  std::vector<string> dim_elements;
+  for (int i = 0; i < shape.dimensions_size(); ++i) {
+    if (shape.is_dynamic_dimension(i)) {
+      dim_elements.push_back(StrCat("<=", shape.dimensions(i)));
+    } else {
+      dim_elements.push_back(StrCat(shape.dimensions(i)));
+    }
+  }
   return StrCat(
       primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "[",
-      absl::StrJoin(shape.dimensions(), ","), "]");
+      absl::StrJoin(dim_elements, ","), "]");
 }
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
-  if (IsTuple(shape)) {
+  if (shape.IsTuple()) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -510,10 +457,11 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   string result = StrCat(
       primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "[");
   for (int i = 0; i < shape.dimensions().size(); i++) {
-    StrAppend(&result, (i > 0) ? "," : "", shape.dimensions(i));
+    StrAppend(&result, (i > 0) ? "," : "",
+              shape.is_dynamic_dimension(i) ? "<=" : "", shape.dimensions(i));
   }
   result += "]";
-  if (!IsScalar(shape) && IsArray(shape)) {
+  if (!IsScalar(shape) && shape.IsArray()) {
     if (LayoutUtil::HasLayout(shape)) {
       StrAppend(&result, LayoutUtil::HumanString(shape.layout()));
     }
@@ -536,43 +484,23 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ bool ShapeUtil::SameDimensions(const Shape& lhs,
                                             const Shape& rhs) {
-  CHECK(ShapeUtil::IsArray(lhs));
-  CHECK(ShapeUtil::IsArray(rhs));
+  CHECK(lhs.IsArray());
+  CHECK(rhs.IsArray());
   return absl::c_equal(lhs.dimensions(), rhs.dimensions());
 }
 
 /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) {
-  return CompareShapes(lhs, rhs, /*compare_layouts=*/false,
-                       /*ignore_fp_precision=*/false);
+  return Shape::Equal().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (IsArray(lhs)) {
-    return IsArray(rhs) && SameDimensions(lhs, rhs);
-  } else if (lhs.element_type() == TUPLE) {
-    return rhs.element_type() == TUPLE &&
-           absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         CompatibleIgnoringElementType);
-  } else {
-    // Opaque, token, etc types are vacuously compatible.
-    return lhs.element_type() == rhs.element_type();
-  }
+  return Shape::Equal().IgnoreElementType().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (IsArray(lhs)) {
-    return IsArray(rhs) && SameElementTypeIgnoringFpPrecision(lhs, rhs) &&
-           CompatibleIgnoringElementType(lhs, rhs);
-  } else if (lhs.element_type() == TUPLE) {
-    return rhs.element_type() == TUPLE &&
-           absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         CompatibleIgnoringFpPrecision);
-  } else {
-    // Opaque, token, etc types are vacuously compatible.
-    return lhs.element_type() == rhs.element_type();
-  }
+  return Shape::Equal().IgnoreFpPrecision().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
@@ -583,7 +511,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ int64 ShapeUtil::GetDimensionNumber(const Shape& shape,
                                                  int64 dimension_number) {
   if (dimension_number < 0) {
-    dimension_number += Rank(shape);
+    dimension_number += shape.rank();
   }
   CHECK_GE(dimension_number, 0);
   return dimension_number;
@@ -620,6 +548,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
       return sizeof(double);
     case C64:
       return sizeof(complex64);
+    case C128:
+      return sizeof(complex128);
     case TOKEN:
       // Tokens require no space.
       return 0;
@@ -637,7 +567,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   TF_DCHECK_OK(ValidateShape(shape));
   if (shape.element_type() == TUPLE) {
     return ByteSizeOfTupleIndexTable(shape, pointer_size);
-  } else if (IsArray(shape)) {
+  } else if (shape.IsArray()) {
     int64 byte_size = ByteSizeOfElements(shape);
     if (LayoutUtil::IsSparseArray(shape)) {
       byte_size += ByteSizeOfSparseIndices(shape);
@@ -663,7 +593,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ int64 ShapeUtil::ByteSizeOfElements(const Shape& shape) {
   TF_DCHECK_OK(ValidateShape(shape));
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   int64 allocated_element_count;
 
   if (LayoutUtil::IsSparseArray(shape)) {
@@ -679,8 +609,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ int64 ShapeUtil::ByteSizeOfSparseIndices(const Shape& shape) {
   TF_DCHECK_OK(ValidateShape(shape));
   CHECK(LayoutUtil::IsSparseArray(shape));
-  return LayoutUtil::MaxSparseElements(shape.layout()) *
-         ShapeUtil::Rank(shape) * sizeof(int64);
+  return LayoutUtil::MaxSparseElements(shape.layout()) * shape.rank() *
+         sizeof(int64);
 }
 
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal(
@@ -723,10 +653,10 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     return Status::OK();
   }
 
-  if (LayoutUtil::IsSparseArray(shape) && Rank(shape) == 0) {
+  if (LayoutUtil::IsSparseArray(shape) && shape.rank() == 0) {
     return InvalidArgument("sparse arrays must have rank > 0");
   }
-  for (int64 i = 0; i < Rank(shape); ++i) {
+  for (int64 i = 0; i < shape.rank(); ++i) {
     int64 dimension = shape.dimensions(i);
     if (dimension < 0) {
       return InvalidArgument(
@@ -742,7 +672,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ Status ShapeUtil::ValidateShapeSize(const Shape& shape) {
   VLOG(3) << "Validating shape size: " << ShapeUtil::HumanString(shape);
 
-  if (!IsArray(shape)) {
+  if (!shape.IsArray()) {
     return Status::OK();
   }
 
@@ -763,7 +693,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
         return sparse_elements_size;
       }
       int64 sparse_indices_size =
-          MultiplyWithoutOverflow(max_sparse_elements, ShapeUtil::Rank(shape));
+          MultiplyWithoutOverflow(max_sparse_elements, shape.rank());
       if (sparse_indices_size < 0) {
         return sparse_indices_size;
       }
@@ -835,7 +765,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                           ShapeIndexView index) {
   const Shape* subshape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size() || i < 0) {
+    if (!subshape->IsTuple() || i >= subshape->tuple_shapes_size() || i < 0) {
       return false;
     }
     subshape = &subshape->tuple_shapes(i);
@@ -847,7 +777,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape))
+    CHECK(return_shape->IsTuple())
         << "Invalid index " << index << " for shape " << shape;
     return_shape = &return_shape->tuple_shapes(i);
   }
@@ -858,7 +788,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     const Shape& shape, ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*return_shape) || i < 0 ||
+    if (!return_shape->IsTuple() || i < 0 ||
         i >= return_shape->tuple_shapes_size()) {
       return InvalidArgument(
           "Shape index %s not a valid subshape index for tuple with shape %s",
@@ -873,7 +803,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                                   ShapeIndexView index) {
   Shape* return_shape = shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape));
+    CHECK(return_shape->IsTuple());
     return_shape = return_shape->mutable_tuple_shapes(i);
   }
   return return_shape;
@@ -881,11 +811,11 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */
 bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
-  return !IsTuple(GetSubshape(shape, index));
+  return !GetSubshape(shape, index).IsTuple();
 }
 
 /* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) {
-  if (!IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return 1;
   }
   int64 count = 0;
@@ -907,7 +837,7 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
 }
 
 /* static */ bool ShapeUtil::HasDegenerateDimensions(const Shape& shape) {
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   return absl::c_linear_search(shape.dimensions(), 1);
 }
 
@@ -924,7 +854,7 @@ Status ForEachSubshapeHelper(const Shape& shape,
                              const ShapeUtil::StatusVisitorFunction& func,
                              ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
       index->push_back(i);
       TF_RETURN_IF_ERROR(ForEachSubshapeHelper(
@@ -941,7 +871,7 @@ Status ForEachMutableSubshapeHelper(
     Shape* shape, const ShapeUtil::MutatingStatusVisitorFunction& func,
     ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
-  if (ShapeUtil::IsTuple(*shape)) {
+  if (shape->IsTuple()) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(*shape); ++i) {
       index->push_back(i);
       TF_RETURN_IF_ERROR(ForEachMutableSubshapeHelper(
@@ -999,6 +929,10 @@ Status ForEachMutableSubshapeHelper(
   for (auto dim : Permute(permutation, shape.dimensions())) {
     new_shape.add_dimensions(dim);
   }
+  for (int64 i = 0; i < shape.rank(); i++) {
+    new_shape.set_dynamic_dimension(permutation[i],
+                                    shape.is_dynamic_dimension(i));
+  }
 
   // If `shape` has a layout, by contract we choose a new layout such that the
   // transpose defined by this permutation is a bitcast.
@@ -1049,8 +983,8 @@ Status ForEachMutableSubshapeHelper(
 /* static */ std::tuple<bool, std::vector<int64>, std::vector<int64>>
 ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
                                              const Shape& shape_post) {
-  CHECK(IsArray(shape_pre));
-  CHECK(IsArray(shape_post));
+  CHECK(shape_pre.IsArray());
+  CHECK(shape_post.IsArray());
 
   auto nil = std::make_tuple(false, std::vector<int64>(), std::vector<int64>());
 
@@ -1097,7 +1031,7 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
     auto unmodified_dim_pair =
         i < unmodified_dims.size()
             ? unmodified_dims[i]
-            : std::make_pair(Rank(shape_pre), Rank(shape_post));
+            : std::make_pair(shape_pre.rank(), shape_post.rank());
     if (!check_modified_dims(prior_unmodified_dim_pair, unmodified_dim_pair)) {
       return nil;
     }
@@ -1109,8 +1043,8 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
 /* static */ std::vector<std::pair<int64, int64>>
 ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
                                          const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
 
   // Unmodified dimensions are merely common factors of rank 1.
   auto common_factors = CommonFactors(AsInt64Slice(input_shape.dimensions()),
@@ -1160,8 +1094,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& input_shape,
                                               const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
   CHECK(LayoutUtil::HasLayout(input_shape));
   CHECK(LayoutUtil::HasLayout(output_shape));
 
@@ -1289,12 +1223,12 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     Shape output_shape_dim0_major = MakeShapeWithDescendingLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()));
 
-    for (int64 input_dim = 0; input_dim < Rank(input_shape); ++input_dim) {
+    for (int64 input_dim = 0; input_dim < input_shape.rank(); ++input_dim) {
       if (input_shape.dimensions(input_dim) <= 1) {
         continue;
       }
 
-      std::vector<int64> input_unit_index(Rank(input_shape), 0);
+      std::vector<int64> input_unit_index(input_shape.rank(), 0);
       input_unit_index[input_dim] = 1;
       int64 logical_linear_index =
           IndexUtil::MultidimensionalIndexToLinearIndex(input_shape_dim0_major,
@@ -1320,11 +1254,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ absl::optional<Shape> ShapeUtil::AlignLayouts(
     const Shape& input_shape, const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
 
-  int64 input_rank = Rank(input_shape);
-  int64 output_rank = Rank(output_shape);
+  int64 input_rank = input_shape.rank();
+  int64 output_rank = output_shape.rank();
 
   // First, calculate an alignment of the dimensions. A consecutive sequence of
   // input dimensions and output dimensions belong to the same alignment part if
@@ -1461,30 +1395,14 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
-  CHECK(IsArray(shape));
-  shape.mutable_dimensions()->erase(shape.mutable_dimensions()->begin() +
-                                    dim_to_delete);
-  if (LayoutUtil::HasLayout(shape)) {
-    Layout* layout = shape.mutable_layout();
-    layout->set_format(DENSE);
-    for (int64 i = 0; i < layout->minor_to_major().size();) {
-      if (layout->minor_to_major(i) == dim_to_delete) {
-        layout->mutable_minor_to_major()->erase(
-            layout->mutable_minor_to_major()->begin() + i);
-        continue;
-      }
-      if (layout->minor_to_major(i) > dim_to_delete) {
-        (*layout->mutable_minor_to_major())[i] -= 1;
-      }
-      ++i;
-    }
-  }
+  CHECK(shape.IsArray());
+  shape.DeleteDimension(dim_to_delete);
   return shape;
 }
 
 /* static */ Shape ShapeUtil::FilterDimensions(
     const std::function<bool(int64)>& p, Shape shape) {
-  CHECK(IsArray(shape));
+  CHECK(shape.IsArray());
   std::vector<int64> dims_to_delete;
   for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) {
     if (!p(i)) {
@@ -1504,8 +1422,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   size_t hash_value = hash<PrimitiveType>()(shape.element_type());
 
   if (shape.tuple_shapes().empty()) {
-    for (int64 dim : shape.dimensions()) {
-      hash_value = Hash64Combine(hash_value, hash<int64>()(dim));
+    for (int i = 0; i < shape.dimensions_size(); ++i) {
+      hash_value =
+          Hash64Combine(hash_value, hash<int64>()(shape.dimensions(i)));
+      hash_value = Hash64Combine(hash_value,
+                                 hash<bool>()(shape.is_dynamic_dimension(i)));
     }
 
     hash_value = Hash64Combine(hash_value, LayoutUtil::Hash(shape.layout()));
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 6b7a9cd34f25f2088bdb8d2c7f0412e5d8519d23..fb6da7460e2475732d6f02758e5519fbdb7c0f8d 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -185,7 +185,7 @@ class ShapeUtil {
   // may not actually be able to store this number of elements. See
   // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of
   // elements that can be stored in a sparse shape.
-  // Precondition: IsArray(shape)
+  // Precondition: shape.IsArray()
   static int64 ElementsIn(const Shape& shape);
 
   // As ElementsIn(), but recurses through tuples.
@@ -207,7 +207,7 @@ class ShapeUtil {
 
   // Returns the number of bytes used to store the primitive_type.
   //
-  // Precondition: ShapeUtil::IsArray(shape)
+  // Precondition: shape.IsArray()
   static int64 ByteSizeOfPrimitiveType(PrimitiveType primitive_type);
 
   // Returns the number of bytes required to store the tuple member pointers for
@@ -262,7 +262,7 @@ class ShapeUtil {
   }
 
   // Returns the higher-precision element type if a and b are both floating
-  // point types; otherwise, checks that that they have the same element type
+  // point types; otherwise, checks that they have the same element type
   // and returns it.
   static PrimitiveType HigherPrecisionElementType(const Shape& a,
                                                   const Shape& b) {
@@ -290,16 +290,12 @@ class ShapeUtil {
   // being F32. Tuple elements are compared recursively for compatibility.
   static bool CompatibleIgnoringFpPrecision(const Shape& lhs, const Shape& rhs);
 
-  // Returns whether the lhs and rhs shapes are identical protobufs.
+  // Returns whether the lhs and rhs shapes are identical.
   static bool Equal(const Shape& lhs, const Shape& rhs);
 
   // As Equal, but allow one of lhs and rhs to be F16 while the other is F32.
   static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs);
 
-  // Returns the rank (number of dimensions) of the given shape.
-  // Precondition: !IsTuple(shape)
-  static int64 Rank(const Shape& shape);
-
   // Returns the number of dimensions for which the dimension is not (trivially)
   // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just
   // fluff. Note that zero dimensions are included in the true rank, e.g.,
@@ -313,10 +309,10 @@ class ShapeUtil {
   // Scalar-specific
 
   static bool IsScalar(const Shape& shape) {
-    return IsArray(shape) && Rank(shape) == 0;
+    return shape.IsArray() && shape.rank() == 0;
   }
   static bool IsEffectiveScalar(const Shape& shape) {
-    return IsArray(shape) && TrueRank(shape) == 0;
+    return shape.IsArray() && TrueRank(shape) == 0;
   }
 
   // Returns whether "shape" is a scalar (array) with the given element_type.
@@ -371,11 +367,24 @@ class ShapeUtil {
   static Shape MakeShape(PrimitiveType element_type,
                          absl::Span<const int64> dimensions);
 
+  // Constructs a new shape with the given element type and sequence of
+  // potentially dynamic dimensions. The argument 'dynamic_dimensions' indicates
+  // with a true value that the respective dimension is dynamic. If the
+  // dimension is dynamic then the respective value in 'dimension' is an upper
+  // bound on the dimension size. 'dimensions' and 'dynamic_dimensions' must be
+  // the same size.
+  static Shape MakeShape(PrimitiveType element_type,
+                         absl::Span<const int64> dimensions,
+                         const std::vector<bool>& dynamic_dimensions);
+
   // Constructs a new shape with the given element type and sequence of
   // dimensions. Method checks if the element type is valid and the shape's
   // size fits in std::numeric_limits<int64>::max().
   static StatusOr<Shape> MakeValidatedShape(PrimitiveType element_type,
                                             absl::Span<const int64> dimensions);
+  static StatusOr<Shape> MakeValidatedShape(
+      PrimitiveType element_type, absl::Span<const int64> dimensions,
+      const std::vector<bool>& dynamic_dimensions);
 
   // Creates a Shape with element type corresponding to T and the given
   // dimensions
@@ -443,27 +452,6 @@ class ShapeUtil {
   // that floating point numbers are signed.
   static bool ElementIsSigned(const Shape& shape);
 
-  // Returns whether the shape is a tuple.
-  static bool IsTuple(const Shape& shape) {
-    return shape.element_type() == TUPLE;
-  }
-
-  // Returns whether the shape is an opaque value (i.e. an 'existential' typed
-  // value that is passed to CustomCall operations).
-  static bool IsOpaque(const Shape& shape) {
-    return shape.element_type() == OPAQUE;
-  }
-
-  // Returns whether the shape is an token value used for ordering
-  // side-effecting operations.
-  static bool IsToken(const Shape& shape) {
-    return shape.element_type() == TOKEN;
-  }
-
-  // Returns whether the shape is an array.  Note that scalars are considered
-  // arrays.
-  static bool IsArray(const Shape& shape);
-
   // Returns whether the given primitive type corresponds to an array shape.
   static bool IsArrayPrimitiveType(PrimitiveType primitive_type);
 
@@ -493,12 +481,6 @@ class ShapeUtil {
   // shape.
   static Shape ComplexComponentShape(const Shape& complex_shape);
 
-  // Shorthand for testing whether a shape is of a given element type and
-  // sequence of dimensions.
-  ABSL_DEPRECATED("Use Equal() instead.")
-  static bool ShapeIs(const Shape& shape, PrimitiveType element_type,
-                      std::initializer_list<int64> dimensions);
-
   // Returns true if the given shape has a subshape at the given index.
   static bool IndexIsValid(const Shape& shape, ShapeIndexView index);
 
@@ -693,11 +675,9 @@ class ShapeUtil {
 
   template <typename FnType>
   static void ForEachIndex(const Shape& shape, const FnType& visitor_function) {
-    ForEachIndexWithStatus(shape,
-                           [&](absl::Span<const int64> indices) {
-                             return StatusOr<bool>(visitor_function(indices));
-                           })
-        .IgnoreError();
+    ForEachIndexWithStatus(shape, [&](absl::Span<const int64> indices) {
+      return StatusOr<bool>(visitor_function(indices));
+    }).IgnoreError();
   }
 
   // A parallel version of ForEachIndex(WithStatus). This can only be used if
@@ -746,7 +726,7 @@ class ShapeUtil {
     if (ShapeUtil::IsZeroElementArray(shape)) {
       return Status::OK();
     }
-    CHECK_EQ(Rank(shape), base.size());
+    CHECK_EQ(shape.rank(), base.size());
     CHECK_EQ(incr.size(), base.size());
     CHECK_EQ(count.size(), base.size());
     const int64 rank = LayoutUtil::MinorToMajor(shape).size();
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 0a3081f5161f80ac97e864ba08d186df4fbdb55d..126ae58293d12182e9b6e30f779f681829729526 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -176,6 +176,28 @@ TEST(ShapeUtilTest, UnequalIgnoringFpPrecision) {
       ShapeUtil::MakeShapeWithLayout(PRED, {4, 3}, {0, 1})));
 }
 
+TEST(ShapeUtilTest, EqualDynamicShapes) {
+  EXPECT_TRUE(
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 3}, {true, false}),
+                       ShapeUtil::MakeShape(F32, {4, 3}, {true, false})));
+  EXPECT_FALSE(
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 3}, {true, false}),
+                       ShapeUtil::MakeShape(F32, {4, 3}, {false, false})));
+}
+
+TEST(ShapeUtilTest, CompatibleDynamicShapes) {
+  Shape shape_a = ShapeUtil::MakeShape(F32, {4, 3}, {true, false});
+  *shape_a.mutable_layout() = Layout({1, 0});
+  Shape shape_b = ShapeUtil::MakeShape(F32, {4, 3}, {true, false});
+  *shape_b.mutable_layout() = Layout({0, 1});
+  Shape shape_c = ShapeUtil::MakeShape(F32, {4, 3}, {false, true});
+  *shape_c.mutable_layout() = Layout({0, 1});
+
+  EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_a));
+  EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_b));
+  EXPECT_FALSE(ShapeUtil::Compatible(shape_a, shape_c));
+}
+
 TEST(ShapeUtilTest, CompatibleTuples) {
   Shape tuple1 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(PRED, {4, 5})});
@@ -516,10 +538,6 @@ TEST(ShapeUtilTest, InsertedOrDeleted1SizedDimensions) {
       ShapeUtil::InsertedOrDeleted1SizedDimensions(shape0, shape2)));
 }
 
-TEST(ShapeUtilTest, ShapeIs) {
-  EXPECT_FALSE(ShapeUtil::ShapeIs(ShapeUtil::MakeShape(PRED, {2}), PRED, {}));
-}
-
 TEST(ShapeUtilTest, ForEachIndex) {
   struct ShapeDimensionAndNumberInvocations {
     std::vector<int64> dimensions;
@@ -692,6 +710,26 @@ TEST(ShapeUtilTest, PermuteDimensionsLayout) {
   } while (std::next_permutation(layout.begin(), layout.end()));
 }
 
+TEST(ShapeUtilTest, PermuteDynamicDimensions) {
+  Shape shape =
+      ShapeUtil::MakeShape(F32, {10, 100, 1000},
+                           /*dynamic_dimensions*/ {false, true, true});
+  SCOPED_TRACE(absl::StrCat("shape=", shape.ToString()));
+
+  std::vector<int64> permutation(3);
+  std::iota(permutation.begin(), permutation.end(), 0);
+  do {
+    SCOPED_TRACE(absl::StrCat("permutation=", absl::StrJoin(permutation, ",")));
+
+    auto permuted = ShapeUtil::PermuteDimensions(permutation, shape);
+    for (int i = 0; i < shape.rank(); i++) {
+      EXPECT_EQ(permuted.dimensions(permutation[i]), shape.dimensions(i));
+      EXPECT_EQ(permuted.is_dynamic_dimension(permutation[i]),
+                shape.is_dynamic_dimension(i));
+    }
+  } while (std::next_permutation(permutation.begin(), permutation.end()));
+}
+
 TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) {
   EXPECT_FALSE(ShapeUtil::ReshapeIsBitcast(
       ShapeUtil::MakeShapeWithLayout(F32, {3, 2, 2}, {0, 1, 2}),
diff --git a/tensorflow/compiler/xla/sparse_index_array.cc b/tensorflow/compiler/xla/sparse_index_array.cc
index a40bb7875e7ea53a8959a9a67ec09ec260ba9c37..82091bdee65c709bb6020f40acc15f13d8599c1d 100644
--- a/tensorflow/compiler/xla/sparse_index_array.cc
+++ b/tensorflow/compiler/xla/sparse_index_array.cc
@@ -79,7 +79,7 @@ void SparseIndexArray::Resize(int64 num_indices) {
 }
 
 bool SparseIndexArray::Validate(const Shape& shape) const {
-  if (rank_ == 0 || rank_ != ShapeUtil::Rank(shape)) {
+  if (rank_ == 0 || rank_ != shape.rank()) {
     return false;
   }
   int64 num_indices = index_count();
diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h
index a96d483462efd77ae4761541e8c79b2c84fa49f3..0c25355467da3fd346d80db790d78252869975ef 100644
--- a/tensorflow/compiler/xla/sparse_index_array.h
+++ b/tensorflow/compiler/xla/sparse_index_array.h
@@ -135,7 +135,7 @@ void SparseIndexArray::SortWithValues(absl::Span<NativeT> values) {
   auto sort_order_less = [this](int64 lhs, int64 rhs) {
     return IndexUtil::CompareIndices(At(lhs), At(rhs)) < 0;
   };
-  std::sort(sort_order.begin(), sort_order.end(), sort_order_less);
+  absl::c_sort(sort_order, sort_order_less);
 
   // Reorder the array elements according to sort_order.  Work through the array
   // and follow cycles so we can do the reorder in-place.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index ee24d4d99cb1f7ce51a72c6258cbadd6adf12f81..8fb674255020ced6bfaf5f004758ed48f8621a65 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -71,6 +71,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
@@ -276,9 +277,6 @@ cc_library(
 xla_test(
     name = "bad_rng_shape_validation_test",
     srcs = ["bad_rng_shape_validation_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
@@ -315,6 +313,26 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conv_depthwise_backprop_filter_test",
+    timeout = "long",
+    srcs = ["conv_depthwise_backprop_filter_test.cc"],
+    shard_count = 6,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 xla_test(
     name = "grouped_convolution_test",
     timeout = "long",
@@ -344,9 +362,6 @@ xla_test(
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -367,9 +382,6 @@ xla_test(
 xla_test(
     name = "query_inferred_shape_test",
     srcs = ["query_inferred_shape_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -387,9 +399,6 @@ xla_test(
 xla_test(
     name = "while_test",
     srcs = ["while_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -413,6 +422,10 @@ xla_test(
 xla_test(
     name = "xla_hlo_profile_test",
     srcs = ["xla_hlo_profile_test.cc"],
+    blacklisted_backends = [
+        # Hlo profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_util",
@@ -436,9 +449,6 @@ xla_test(
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -453,7 +463,6 @@ xla_test(
 xla_test(
     name = "map_test",
     srcs = ["map_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -506,9 +515,6 @@ xla_test(
 xla_test(
     name = "pred_test",
     srcs = ["pred_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla/client:local_client",
@@ -524,9 +530,6 @@ xla_test(
 xla_test(
     name = "select_test",
     srcs = ["select_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -544,7 +547,6 @@ xla_test(
 xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -562,7 +564,6 @@ xla_test(
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -623,9 +624,6 @@ xla_test(
 xla_test(
     name = "deconstruct_tuple_test",
     srcs = ["deconstruct_tuple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -648,7 +646,6 @@ xla_test(
     name = "array_elementwise_ops_test",
     srcs = ["array_elementwise_ops_test.cc"],
     shard_count = 25,
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -698,7 +695,6 @@ xla_test(
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -725,7 +721,6 @@ xla_test(
     srcs = ["dot_operation_test.cc"],
     shard_count = 20,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -735,7 +730,9 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -792,7 +789,9 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -806,9 +805,6 @@ xla_test(
 xla_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -828,9 +824,6 @@ xla_test(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -841,6 +834,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -951,6 +945,11 @@ xla_test(
 xla_test(
     name = "batch_normalization_test",
     srcs = ["batch_normalization_test.cc"],
+    blacklisted_backends = [
+        # BatchNorm HLOs are not handled by the interpreter backend, and the
+        # BatchNorm expander is not run on the interpreter.
+        "interpreter",
+    ],
     shard_count = 40,
     deps = [
         ":test_utils",
@@ -1042,9 +1041,6 @@ xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
     shard_count = 40,
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -1065,9 +1061,6 @@ xla_test(
 xla_test(
     name = "multidimensional_slice_test",
     srcs = ["multidimensional_slice_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1085,9 +1078,6 @@ xla_test(
     name = "dynamic_ops_test",
     timeout = "moderate",
     srcs = ["dynamic_ops_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -1113,9 +1103,6 @@ xla_test(
 xla_test(
     name = "tuple_test",
     srcs = ["tuple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -1139,9 +1126,6 @@ xla_test(
 xla_test(
     name = "vector_ops_reduce_test",
     srcs = ["vector_ops_reduce_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1162,7 +1146,6 @@ xla_test(
     srcs = ["reduce_test.cc"],
     shard_count = 40,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -1229,7 +1212,6 @@ xla_test(
     srcs = [],
     shard_count = 20,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     xla_test_library_deps = [":reduce_window_test_library"],
@@ -1241,7 +1223,6 @@ xla_test(
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -1267,9 +1248,6 @@ xla_test(
 xla_test(
     name = "copy_test",
     srcs = ["copy_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla:array2d",
@@ -1290,9 +1268,6 @@ xla_test(
 xla_test(
     name = "reduce_hlo_test",
     srcs = ["reduce_hlo_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1306,9 +1281,6 @@ xla_test(
 xla_test(
     name = "token_hlo_test",
     srcs = ["token_hlo_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_verifier",
@@ -1323,9 +1295,6 @@ xla_test(
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -1368,9 +1337,6 @@ xla_test(
 xla_test(
     name = "binop_scaling_test",
     srcs = ["binop_scaling_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1388,9 +1354,6 @@ xla_test(
 xla_test(
     name = "broadcast_simple_test",
     srcs = ["broadcast_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1410,9 +1373,6 @@ xla_test(
 xla_test(
     name = "pad_test",
     srcs = ["pad_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1434,9 +1394,6 @@ xla_test(
 xla_test(
     name = "fmax_test",
     srcs = ["fmax_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1451,9 +1408,6 @@ xla_test(
 xla_test(
     name = "log_test",
     srcs = ["log_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1468,9 +1422,6 @@ xla_test(
 xla_test(
     name = "matrix_ops_simple_test",
     srcs = ["matrix_ops_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -1517,9 +1468,6 @@ xla_test(
     name = "reshape_test",
     srcs = ["reshape_test.cc"],
     shard_count = 30,
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1545,9 +1493,6 @@ xla_test(
 xla_test(
     name = "reverse_test",
     srcs = ["reverse_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1566,9 +1511,6 @@ xla_test(
 xla_test(
     name = "vector_ops_simple_test",
     srcs = ["vector_ops_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:shape_util",
@@ -1592,9 +1534,6 @@ xla_test(
 xla_test(
     name = "concat_test",
     srcs = ["concat_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1615,9 +1554,6 @@ xla_test(
 xla_test(
     name = "convert_test",
     srcs = ["convert_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1637,6 +1573,10 @@ xla_test(
 xla_test(
     name = "all_reduce_test",
     srcs = ["all_reduce_test.cc"],
+    blacklisted_backends = [
+        # All reduce is not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1661,9 +1601,6 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1703,9 +1640,6 @@ xla_test(
 xla_test(
     name = "floor_ceil_test",
     srcs = ["floor_ceil_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1767,6 +1701,10 @@ xla_test(
 xla_test(
     name = "execution_profile_test",
     srcs = ["execution_profile_test.cc"],
+    blacklisted_backends = [
+        # Execution profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1781,6 +1719,10 @@ xla_test(
     name = "execution_profile_test_with_xla_hlo_profile",
     srcs = ["execution_profile_test.cc"],
     args = ["--xla_hlo_profile"],
+    blacklisted_backends = [
+        # Hlo profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1794,9 +1736,6 @@ xla_test(
 xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -1819,9 +1758,6 @@ xla_test(
 xla_test(
     name = "broadcast_test",
     srcs = ["broadcast_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1883,9 +1819,6 @@ xla_test(
 xla_test(
     name = "fusion_test",
     srcs = ["fusion_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -2003,6 +1936,10 @@ xla_test(
 xla_test(
     name = "outfeed_in_nested_computation_test",
     srcs = ["outfeed_in_nested_computation_test.cc"],
+    blacklisted_backends = [
+        # Outfeed ops are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/tests:local_client_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -2179,7 +2116,6 @@ xla_test(
     srcs = ["iota_test.cc"],
     shard_count = 30,
     tags = [
-        "enable_for_xla_interpreter",
         # Require optimized builds, iota_test_cpu is very slow in fastbuild.
         "optonly",
     ],
@@ -2207,3 +2143,18 @@ tf_cc_test(
         "@com_google_absl//absl/synchronization",
     ],
 )
+
+xla_test(
+    name = "ptxas_bug_120501638",
+    srcs = ["ptxas_bug_120501638.cc"],
+    tags = [
+        # Disabled in OSS until nvidia publicly releases a fixed ptxas.
+        "no_oss",
+    ],
+    deps = [
+        ":hlo_test_base",
+        ":xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 915b456b52215f8d6a9eb6c5b933f3502f1d3d2c..7379fbcc22745f46f2a29732c4bda46f352d07e7 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1443,6 +1443,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, PowC64s) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto lhs =
+      ConstantR1<complex64>(&builder, {-2.0f, -0.6f, -0.6f, 0.0f, 0.0f, 0.0f});
+  auto rhs =
+      ConstantR1<complex64>(&builder, {0.5f, 0.6f, -0.6f, 0.5f, 0.6f, 0.0f});
+  Pow(lhs, rhs);
+
+  ComputeAndCompareR1<complex64>(&builder,
+                                 {
+                                     {0, 1.41421356},
+                                     {-2.27443288e-01, 0.69999846},
+                                     {-4.19847531e-01, -1.29215783},
+                                     {0, 0},
+                                     {0, 0},
+                                     {1, 0},
+                                 },
+                                 {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
   XlaBuilder builder(TestName());
   auto lhs = ConstantR1<float>(&builder, {});
@@ -2047,6 +2068,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampF32) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto minimum = ConstantR1<float>(&builder, {1.0f, -6.5f, 1.0f, 2.25f, NAN});
+  auto argument =
+      ConstantR1<float>(&builder, {2.0f, 10.0f, -5.0f, 1.0f, 10.0f});
+  auto maximum = ConstantR1<float>(&builder, {3.0f, 0.5f, 25.5f, NAN, 123.0f});
+  Clamp(minimum, argument, maximum);
+
+  ComputeAndCompareR1<float>(&builder, {2.0f, 0.5f, 1.0f, NAN, NAN}, {},
+                             error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
   XlaBuilder builder(TestName());
   auto minimum = ConstantR0<float>(&builder, 0.0f);
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index e9728e636f0ee032416b2da17a3ea83c5bb18083..63e48117056dec4af603cbc85e478fcb15ad0cec 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -76,7 +76,9 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
                                 error_spec_);
 }
 
-XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
+// Disabled on interpreter since BatchNormExanper is not run by default on the
+// intepreter backend.
+XLA_TEST_F(Bfloat16Test, DISABLED_ON_INTERPRETER(BatchNormTraining)) {
   const int kFeatureIndex = 2;
   XlaBuilder builder(TestName());
 
@@ -110,7 +112,9 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01, 0.02));
 }
 
-XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
+// Disabled on interpreter since BatchNormExanper is not run by default on the
+// intepreter backend.
+XLA_TEST_F(Bfloat16Test, DISABLED_ON_INTERPRETER(BatchNormGrad)) {
   const int kFeatureIndex = 2;
   XlaBuilder builder(TestName());
 
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index a350715597044730429ee9fa268ecd6f2bf26b66..edb95c973b70e30702ed8490c15a48d4d5604170 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -191,7 +191,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
   verify_output(actual, "");
 
   // Try with all output layouts.
-  std::vector<int64> minor_to_major(ShapeUtil::Rank(expected.shape()));
+  std::vector<int64> minor_to_major(expected.shape().rank());
   std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
   do {
     auto layout = ShapeUtil::MakeShapeWithLayout(
@@ -224,7 +224,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
       TF_ASSIGN_OR_RETURN(auto literal,
                           client_->Transfer(*arguments[index], nullptr));
       // Skip tuples because they don't have a rank.
-      if (ShapeUtil::IsTuple(literal.shape())) {
+      if (literal.shape().IsTuple()) {
         layout_strings.push_back(
             ShapeUtil::HumanStringWithLayout(literal.shape()));
         arguments_with_layout.push_back(arguments[index]);
@@ -234,7 +234,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
         return Status::OK();
       }
 
-      std::vector<int64> minor_to_major(ShapeUtil::Rank(literal.shape()));
+      std::vector<int64> minor_to_major(literal.shape().rank());
       std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
       do {
         auto literal_relayout =
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 65a23dd883594b9bf9c37494a37e9be39b197788..3f65ed7fce4ff4b5c3781ac2581935bfacc69ce1 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -431,7 +431,8 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal = LiteralUtil::CreateR0<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, expected_literal,
@@ -455,7 +456,8 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal = LiteralUtil::CreateR1<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, expected_literal,
@@ -480,7 +482,8 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal =
       LiteralUtil::CreateR2FromArray2D<NativeT>(expected);
@@ -506,7 +509,8 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal =
       LiteralUtil::CreateR3FromArray3D<NativeT>(expected);
@@ -532,7 +536,8 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal =
       LiteralUtil::CreateR4FromArray4D<NativeT>(expected);
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 363dee74b2755a6bdc3c5a5164a85378581c21d2..247328b730f3af936d933f824da491b593b27c90 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -96,7 +96,7 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
   LiteralTestUtil::ExpectR2Equal<int32>({{10, 20}, {30, 40}},
                                         LiteralSlice(result, {1}));
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.shape()));
+  EXPECT_TRUE(result.shape().IsTuple());
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.shape()));
 
   EXPECT_TRUE(ShapeUtil::Equal(
@@ -109,7 +109,10 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
-XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) {
+// Disabled for interpreter since ExecuteAsyncOnStream is not implemented on
+// interpreter backend.
+XLA_TEST_F(ClientTest,
+           DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(ExecuteParallel))) {
   XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 3b0414a6045a7c5f4f75948d8ccf2775c575626e..ef800b8ef624bf1020ff1e6857c13b0387482cd3 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -151,19 +151,35 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
   }
 }
 
-TEST_F(ComputeConstantTest, IndirectParamMissing) {
+TEST_F(ComputeConstantTest, GetDimensionSize) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
     XlaBuilder b(TestName());
-    auto computation =
-        Add(ConstantR0<float>(&b, 1.0f),
-            Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "param"));
-    EXPECT_FALSE(IsConstant(computation, &b));
+    auto add =
+        Add(ConstantR1<float>(&b, {1.0f}), ConstantR1<float>(&b, {1.0f}));
+    auto get_dimension_size = GetDimensionSize(add, 0);
+    EXPECT_TRUE(IsConstant(get_dimension_size, &b));
+
+    TF_ASSERT_OK_AND_ASSIGN(auto value, ComputeConstantScalar<uint32>(
+                                            client, get_dimension_size, &b));
+    EXPECT_EQ(value, 1);
+  }
+}
 
-    auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(
-        absl::StrContains(value.status().ToString(), "depends on a parameter"))
-        << value.status();
+TEST_F(ComputeConstantTest, MultipleGetDimensionSize) {
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    XlaBuilder b(TestName());
+    auto add =
+        Add(ConstantR2<float>(&b, {{1.0f}}), ConstantR2<float>(&b, {{1.0f}}));
+    auto get_dimension_size = GetDimensionSize(add, 0);
+    auto get_dimension_size_2 = GetDimensionSize(add, 0);
+    auto add_2 = Add(get_dimension_size, get_dimension_size_2);
+    EXPECT_TRUE(IsConstant(add_2, &b));
+
+    TF_ASSERT_OK_AND_ASSIGN(auto value,
+                            ComputeConstantScalar<uint32>(client, add_2, &b));
+    EXPECT_EQ(value, 2);
   }
 }
 
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 9174f2651cb90b364f869364fe108cf208c11a84..6530007871ced1d0bbffe2b44ccc8cf9bddd79e1 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -180,6 +181,29 @@ TEST_F(ConstantsTest, Token) {
   TF_ASSERT_OK(Execute(&builder, {}).status());
 }
 
+TEST_F(ConstantsTest, FullLike) {
+  XlaBuilder b(TestName());
+  auto val1 = Iota(&b, F32, 3);
+  auto val2 = FullLike(val1, 10);
+  val1 + val2;
+  ComputeAndCompareR1<float>(&b, {10, 11, 12}, {}, error_spec_);
+}
+
+TEST_F(ConstantsTest, IllegalFullLikeOnTuple) {
+  XlaBuilder b(TestName());
+  auto tuple = Tuple(&b, {Iota(&b, F32, 3), Iota(&b, F32, 1)});
+  FullLike(tuple, 10);  // Illegal; can't do FullLike on a tuple.
+  EXPECT_FALSE(b.Build().ok());
+}
+
+TEST_F(ConstantsTest, FullLikeScalar) {
+  XlaBuilder b(TestName());
+  auto scalar1 = ConstantR0WithType(&b, F32, 1);
+  auto scalar2 = FullLike(scalar1, 2);
+  scalar1 - scalar2;
+  ComputeAndCompareR0<float>(&b, -1, {}, error_spec_);
+}
+
 class ConstantsHloTest : public HloTestBase {};
 
 // TODO(b/121147351): Fails on GPU. Not clear if this is expected behavior.
@@ -200,9 +224,7 @@ XLA_TEST_F(ConstantsHloTest, DISABLED_ON_GPU(BitcastOfConstant)) {
       ROOT result = s32[] call(parameter.0, constant-as-scalar), to_apply=func
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR0<int32>(1);
   auto result = ExecuteNoHloPasses(std::move(module), {&param});
   EXPECT_TRUE(LiteralTestUtil::Equal(param, result));
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c78d3f3d9ee2115206e6c4aeeb2991c07e57392a
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
@@ -0,0 +1,154 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct DepthwiseConvolution2DSpec {
+  int64 output_batch, window, window_dilation;
+  std::vector<int64> activation_dims;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> output_dims;
+};
+
+class DepthwiseConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<DepthwiseConvolution2DSpec, bool>> {};
+
+static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<DepthwiseConvolution2DSpec> config_set;
+  std::vector<std::vector<int64>> config_options = {
+      {8, 5, 3, 2},   {4, 5, 5, 2},   {8, 7, 4, 128},  {16, 20, 20, 256},
+      {256, 7, 5, 4}, {256, 6, 6, 4}, {256, 8, 8, 512}};
+
+  for (auto option : config_options) {
+    int64 feature = option[3];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[0];
+
+    DepthwiseConvolution2DSpec config;
+    config.window_dilation = 1;
+    config.output_batch = feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size, feature};
+
+    config.kernel_dims = {batch, kernel_size, kernel_size, feature};
+
+    int64 output_space_size = 3 + activation_size - kernel_size;
+    config.output_dims = {output_space_size, output_space_size, feature, 1};
+
+    config_set.push_back(config);
+
+    // Add configurations for window dilation cases.
+    if (activation_size % 2 == 0 && activation_size == kernel_size) {
+      DepthwiseConvolution2DSpec config;
+      config.window_dilation = 2;
+      config.output_batch = feature;
+      config.window = kernel_size / 2;
+      config.activation_dims = {batch, activation_size, activation_size,
+                                feature};
+      config.kernel_dims = {batch, kernel_size / 2, kernel_size / 2, feature};
+
+      int64 output_space_size = 5;
+      config.output_dims = {output_space_size, output_space_size, feature, 1};
+
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string DepthwiseConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<DepthwiseConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), data_type);
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextDepthwiseConvolution2D(
+    const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  return absl::StrFormat(
+      R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s] parameter(0)
+      kernel = %s[%s] parameter(1)
+      ROOT conv = %s[%s] convolution(%s[%s] activation, %s[%s] kernel),
+          window={size=%dx%d pad=1_%dx1_%d rhs_dilate=%dx%d}, dim_labels=f01b_i01o->01fb,
+          batch_group_count=%d
+    }
+    )",
+      data_type, absl::StrJoin(spec.activation_dims, ","), data_type,
+      absl::StrJoin(spec.kernel_dims, ","), data_type,
+      absl::StrJoin(spec.output_dims, ","), data_type,
+      absl::StrJoin(spec.activation_dims, ","), data_type,
+      absl::StrJoin(spec.kernel_dims, ","), spec.window, spec.window,
+      spec.window_dilation, spec.window_dilation, spec.window_dilation,
+      spec.window_dilation, spec.output_batch);
+}
+
+XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) {
+  const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text =
+      BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    DepthwiseConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 249693891290e14645ee5b4b4d97b2d506a01302..9db9f2563b636c4f929585eb13a9c7f809833eda 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -467,8 +467,8 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
 // servers. The error message is missing the operator ++.
 template <typename T>
 void iota_int_init_value(std::vector<T>& values, int init_value) {
-  std::for_each(values.begin(), values.end(),
-                [&](T& value) { value = static_cast<T>(init_value++); });
+  absl::c_for_each(values,
+                   [&](T& value) { value = static_cast<T>(init_value++); });
 }
 
 template <typename T>
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index c5d8b663f4abe77e05ec213d2e4e075c260a8655..f740f4815810727890583405b2244fceaec0bd3f 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -19,12 +19,14 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
@@ -918,8 +920,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {1, 6});
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -945,8 +948,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -974,8 +978,9 @@ XLA_TEST_F(DotOperationTest,
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1001,8 +1006,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSReverseMM) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {1, 6});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1033,8 +1039,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSRows) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1065,8 +1072,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSRows) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1089,8 +1097,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSCols) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {1, 6});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -1113,8 +1122,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSCols) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {1, 6});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -1147,5 +1157,105 @@ XLA_TEST_F(DotOperationTest, DotRank2AndRank2NonDefaultContractionDims) {
 
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
+
+using EinsumParamType =
+    std::tuple<std::vector<int64>, std::vector<int64>, string>;
+class EinsumTest : public DotOperationTest,
+                   public ::testing::WithParamInterface<EinsumParamType> {};
+XLA_TEST_P(EinsumTest, SimpleEinsumTest) {
+  XlaBuilder builder(TestName());
+  auto x = AddParam(
+      MakeFakeLiteral(ShapeUtil::MakeShape(F32, std::get<0>(GetParam())))
+          .ValueOrDie(),
+      &builder);
+  auto y = AddParam(
+      MakeFakeLiteral(ShapeUtil::MakeShape(F32, std::get<1>(GetParam())))
+          .ValueOrDie(),
+      &builder);
+  Einsum(x, y, std::get<2>(GetParam()));
+  ComputeAndCompare(&builder, {}, ErrorSpec{1e-3, 1e-3});
+}
+
+std::vector<EinsumParamType> GetEinsumTestCases() {
+  using v = std::vector<int64>;
+  using p = EinsumParamType;
+  std::vector<p> test_cases = {
+      p{v{5, 6}, v{6, 7}, "mk,kn->mn"},
+      p{v{5, 6}, v{6, 7}, "mk,kn->nm"},
+      p{v{5, 6, 11}, v{6, 11, 7}, "mkB,kBn->nmB"},
+      p{v{31, 55, 11}, v{55, 11, 29}, "mkB,kBn->nmB"},
+      p{v{31, 55, 11}, v{55, 11, 29}, "mkB,kBn->Bnm"},
+      p{v{8, 55, 11, 3}, v{55, 11, 3, 29}, "mkBC,kBCn->BCnm"},
+      p{v{5, 6}, v{6, 7}, "ab,cd->dcba"},
+      p{v{6}, v{6, 7}, "b,bc->c"},
+  };
+  return test_cases;
+}
+
+INSTANTIATE_TEST_CASE_P(Einsum, EinsumTest,
+                        ::testing::ValuesIn(GetEinsumTestCases()));
+
+class DotOperationTextTest : public HloTestBase {};
+
+XLA_TEST_F(DotOperationTextTest, DotReorderedDotDims) {
+  absl::string_view hlo_string =
+      R"(
+HloModule ComplexDotMultipleNonContracting
+
+ENTRY %test {
+  %lhs = f32[7,17,10,13]{3,2,1,0} parameter(0)
+  %rhs = f32[7,9,10,13,6]{4,3,2,1,0} parameter(1)
+  ROOT %dot = f32[10,7,17,9,6]{4,3,2,1,0} dot(%lhs, %rhs), lhs_batch_dims={2,0}, rhs_batch_dims={2,0}, lhs_contracting_dims={3}, rhs_contracting_dims={3}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DotReorderedDotDimsAndMultipleContracting) {
+  absl::string_view hlo_string =
+      R"(
+HloModule ComplexDotMultipleNonContracting
+
+ENTRY %test {
+  %lhs = f32[7,5,17,10,13]{4,3,2,1,0} parameter(0)
+  %rhs = f32[7,9,10,13,6,5]{5,4,3,2,1,0} parameter(1)
+  ROOT %dot = f32[10,7,17,9,6]{4,3,2,1,0} dot(%lhs, %rhs), lhs_batch_dims={3,0}, rhs_batch_dims={2,0}, lhs_contracting_dims={1,4}, rhs_contracting_dims={5,3}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DotWithNoDnums) {
+  absl::string_view hlo_string =
+      R"(
+HloModule DotWithNoDnums
+
+ENTRY %test {
+  %lhs = f32[2,3]{1,0} parameter(0)
+  %rhs = f32[4,5]{1,0} parameter(1)
+  ROOT %dot = f32[2,3,4,5]{3,2,1,0} dot(%lhs, %rhs)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, Einsum) {
+  absl::string_view hlo_string =
+      R"(
+HloModule Einsum
+
+ENTRY %test {
+  %lhs = f32[8,64,96]{2,1,0} parameter(0)
+  %rhs = f32[96,32,4]{2,1,0} parameter(1)
+  ROOT %dot = f32[8,64,32,4]{3,2,1,0}  dot(%lhs, %rhs), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 7501c6d957e7afe99b8c530e5f0d575f818367da..82e2db36143b2552472fedae701f32389a9be108 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -135,11 +135,11 @@ class DynamicSliceTest : public ClientLibraryTestBase {
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::unique_ptr<GlobalData> start_data = CreateR0Parameter<IndexT>(
+        slice_starts[0], 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
-    DynamicSlice(input, starts, slice_sizes);
+    DynamicSlice(input, absl::Span<const XlaOp>({starts}), slice_sizes);
     // Run computation and compare against expected values.
     ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
@@ -160,14 +160,23 @@ class DynamicSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(2);
+    std::vector<std::unique_ptr<GlobalData>> start_data(2);
+    for (int i = 0; i < 2; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
+
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 
   template <typename IndexT, typename DataT>
@@ -186,14 +195,22 @@ class DynamicSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(3);
+    std::vector<std::unique_ptr<GlobalData>> start_data(3);
+    for (int i = 0; i < 3; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 };
 
@@ -372,16 +389,12 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                       .ValueOrDie());
 
     XlaBuilder builder(TestName());
-    // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_value);
     auto update = ConstantLiteral(&builder, update_value);
-    DynamicUpdateSlice(input, update, starts);
+    DynamicUpdateSlice(input, update, absl::Span<const XlaOp>({}));
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_value, {});
   }
 
   template <typename IndexT, typename DataT>
@@ -405,12 +418,12 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::unique_ptr<GlobalData> start_data = CreateR0Parameter<IndexT>(
+        slice_starts[0], 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     auto update = ConstantLiteral(&builder, update_values);
-    DynamicUpdateSlice(input, update, starts);
+    DynamicUpdateSlice(input, update, absl::Span<const XlaOp>({starts}));
     // Run computation and compare against expected values.
     ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
@@ -435,15 +448,23 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(2);
+    std::vector<std::unique_ptr<GlobalData>> start_data(2);
+    for (int i = 0; i < 2; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     auto update = ConstantLiteral(&builder, update_values);
     DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 
   template <typename IndexT, typename DataT>
@@ -466,15 +487,24 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(3);
+    std::vector<std::unique_ptr<GlobalData>> start_data(3);
+    for (int i = 0; i < 3; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
+
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     auto update = ConstantLiteral(&builder, update_values);
     DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 
   template <class T>
@@ -518,8 +548,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     XlaOp update;
     std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
-    auto starts = ConstantR1<int32>(&builder, {index, 0, 0});
-    DynamicUpdateSlice(input, update, starts);
+    auto constant_index = ConstantR0<int32>(&builder, index);
+    auto zero = ConstantR0<int32>(&builder, 0);
+    DynamicUpdateSlice(input, update, {constant_index, zero, zero});
 
     // Run computation and compare against expected values.
     ComputeAndCompareR3<T>(&builder, expected_values,
@@ -720,46 +751,55 @@ void BM_DynamicSlice(int num_iters) {
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
   auto input = ConstantLiteral(&builder, input_literal);
 
+  auto stream =
+      client->mutable_backend()->BorrowStream(device_ordinal).ValueOrDie();
+
   // Create dynamic slice start indices as a parameter: shape [4]
-  auto start_indices_shape = ShapeUtil::MakeShape(S32, {4});
-  auto start_indices =
-      Parameter(&builder, 0, start_indices_shape, "start_indices");
+  auto start_indices_shape = ShapeUtil::MakeShape(S32, {});
+  std::vector<XlaOp> start_indices(4);
+  std::vector<ScopedShapedBuffer> shaped_buffers;
+  std::vector<const Shape*> host_shapes(4);
+  for (int i = 0; i < 4; ++i) {
+    start_indices[i] =
+        Parameter(&builder, i, start_indices_shape, "start_indices");
+    auto start_index_literal = LiteralUtil::CreateR0<int32>(i + 1);
+    // Initialize and transfer parameter buffer.
+    shaped_buffers.emplace_back(
+        client->backend()
+            .transfer_manager()
+            ->AllocateScopedShapedBuffer(start_indices_shape, &allocator,
+                                         /*device_ordinal=*/0)
+            .ConsumeValueOrDie());
+    host_shapes[i] = &shaped_buffers[i].on_host_shape();
+    ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
+        stream.get(), start_index_literal, shaped_buffers[i]));
+  }
+
   // Add DynamicSlice op to the computatation.
   DynamicSlice(input, start_indices, {1, 1, 1, 1});
   auto computation = builder.Build().ConsumeValueOrDie();
 
-  // Initialize and transfer parameter buffer.
-  auto buffer = client->backend()
-                    .transfer_manager()
-                    ->AllocateScopedShapedBuffer(
-                        start_indices_shape, &allocator, /*device_ordinal=*/0)
-                    .ConsumeValueOrDie();
-
-  auto start_indices_literal = LiteralUtil::CreateR1<int32>({0, 1, 2, 3});
-  auto stream =
-      client->mutable_backend()->BorrowStream(device_ordinal).ValueOrDie();
-  ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      stream.get(), start_indices_literal, buffer));
-
   std::unique_ptr<LocalExecutable> executable =
-      client
-          ->Compile(computation, {&buffer.on_host_shape()},
-                    ExecutableBuildOptions())
+      client->Compile(computation, host_shapes, ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
   // Run some warm-up executions.
   ExecutableRunOptions options;
   options.set_allocator(&allocator);
   const int kWarmups = 2;
+  std::vector<const ShapedBuffer*> shaped_buffer_ptrs;
+  absl::c_transform(shaped_buffers, std::back_inserter(shaped_buffer_ptrs),
+                    [](const ScopedShapedBuffer& buffer) { return &buffer; });
+
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({&buffer}, options);
+    auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
 
   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({&buffer}, options);
+    auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index c84973e17b234c24c84f02a369ce0185f5772cca..139d5c70b8cbcf14670abcb064fcca2f0ba853c6 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -40,14 +40,15 @@ class ExhaustiveF32ElementwiseOpTest
 
     Literal input_literal =
         LiteralUtil::CreateFromDimensions(F32, {input_size});
+    absl::Span<float> input_arr = input_literal.data<float>();
     for (int64 i = begin; i < end; i++) {
       if (i >= known_incorrect_range.first &&
           i < known_incorrect_range.second) {
         // If the operation is known to be buggy on a specific input clamp that
         // input to 0 under the assumption that the op is at least correct on 0.
-        input_literal.Set({i - begin}, 0.0f);
+        input_arr[i - begin] = 0;
       } else {
-        input_literal.Set({i - begin}, absl::bit_cast<float, int>(i));
+        input_arr[i - begin] = absl::bit_cast<float, int>(i);
       }
     }
 
@@ -60,7 +61,7 @@ class ExhaustiveF32ElementwiseOpTest
     std::vector<float> expected_result;
     expected_result.reserve(input_size);
     for (int64 i = 0; i < input_size; i++) {
-      expected_result.push_back(evaluate_op(input_literal.Get<float>({i})));
+      expected_result.push_back(evaluate_op(input_arr[i]));
     }
 
     ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc
index dcb469087e0064d17ce3b04fdeaf0b6136069a55..1b0bebe2d03a9a153cd0c80329ed0c49c91333a3 100644
--- a/tensorflow/compiler/xla/tests/filecheck.cc
+++ b/tensorflow/compiler/xla/tests/filecheck.cc
@@ -48,7 +48,7 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
 
   tensorflow::SubProcess file_check_process;
   file_check_process.SetProgram(file_check_path,
-                                {file_check_path, pattern_path});
+                                {file_check_path, "-v", pattern_path});
   file_check_process.SetChannelAction(tensorflow::CHAN_STDIN,
                                       tensorflow::ACTION_PIPE);
   file_check_process.SetChannelAction(tensorflow::CHAN_STDERR,
@@ -71,9 +71,7 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
       LOG(WARNING) << "NOTE: FileCheck binary does not exist!";
     }
 
-    LOG(WARNING) << "FileCheck error: " << standard_error;
-    LOG(WARNING) << "FileCheck input was:";
-    XLA_LOG_LINES(tensorflow::WARNING, input);
+    LOG(WARNING) << "FileCheck error:\n" << standard_error;
     LOG(WARNING) << "FileCheck pattern was:";
     XLA_LOG_LINES(tensorflow::WARNING, pattern);
   } else if (!standard_error.empty()) {
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index d1fddf9d6b494a822610e41307fa103dc90bdef3..2178c9b3f3d39ac034c59585c6836d2bc59162c1 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -523,10 +523,10 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) {
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
   auto dynamic_slice2 =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-          ShapeUtil::MakeShape(S32, {2}), const0, const1, {2}));
+          ShapeUtil::MakeShape(S32, {2}), const0, {const1}, {2}));
   auto negate3 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, dynamic_slice2));
   hlo_module->AddEntryComputation(builder.Build())
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index daa89398a697af9149797d621c3bdca80a00aedd..d65b67a535d43553a3a94f76482ad4618f9b8aab 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -600,7 +600,9 @@ ENTRY main {
 
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
+// Disabled on interpreter since ExectuteAsyncOnStream is not supported.
+XLA_TEST_F(GatherClientLibraryTest,
+           DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(Basic))) {
   // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index d57846e19bb80c5b9c87d50596da2915f9aef317..66f72ba8d20b8ef1f436da4425b2bb6518ee9a94 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -139,7 +139,8 @@ std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
     const string& name) {
   return absl::make_unique<VerifiedHloModule>(
       name, GetModuleConfigForTest(), verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
+      allow_mixed_precision_in_hlo_verifier_,
+      backend().compiler()->ShapeSizeBytesFunction());
 }
 
 StatusOr<std::unique_ptr<VerifiedHloModule>>
@@ -147,7 +148,8 @@ HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
                                           const HloModuleConfig& config) {
   auto module = absl::make_unique<VerifiedHloModule>(
       TestName(), config, verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
+      allow_mixed_precision_in_hlo_verifier_,
+      backend().compiler()->ShapeSizeBytesFunction());
   TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
   TF_RETURN_IF_ERROR(module->Verify());
   return std::move(module);
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 1d1e7f437296a7493ef7da07039fcf6d273f35bc..69a4f96288c7285010e9adbdc33f1b394f58d8d2 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -46,10 +46,12 @@ class VerifiedHloModule : public HloModule {
  public:
   VerifiedHloModule(const string& name, const HloModuleConfig& config,
                     bool verifier_layout_sensitive,
-                    bool allow_mixed_precision_in_hlo_verifier)
+                    bool allow_mixed_precision_in_hlo_verifier,
+                    std::function<int64(const Shape&)> shape_size_function)
       : HloModule(name, config),
-        verifier_(verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
+        verifier_(
+            verifier_layout_sensitive, allow_mixed_precision_in_hlo_verifier,
+            /*instruction_can_change_layout_func=*/{}, shape_size_function) {}
 
   ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
 
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 554eb24d44168caa7d7252015e3d99f2d567df9b..a2fd6070731943f15c773265f428b16f520d02ee 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -86,7 +86,7 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const LiteralSlice& expected, const LiteralSlice& actual,
-    const ErrorSpec& error_spec, bool detailed_message) {
+    const ErrorSpec& error_spec, absl::optional<bool> detailed_message) {
   return StatusToAssertion(literal_comparison::Near(
       expected, actual, error_spec, detailed_message, &OnMiscompare));
 }
@@ -97,7 +97,8 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
   if (error.has_value()) {
     VLOG(1) << "Expects near";
     return StatusToAssertion(literal_comparison::Near(
-        expected, actual, *error, /*detailed_message=*/false, &OnMiscompare));
+        expected, actual, *error, /*detailed_message=*/absl::nullopt,
+        &OnMiscompare));
   }
   VLOG(1) << "Expects equal";
   return StatusToAssertion(literal_comparison::Equal(expected, actual));
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 43cca91f64b2c0fbfde5054a361cf0f95302c23d..d7cf9bed98a3eb7479b6deb6838dc388a0869360 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -93,7 +93,7 @@ class LiteralTestUtil {
   static ::testing::AssertionResult Near(
       const LiteralSlice& expected, const LiteralSlice& actual,
       const ErrorSpec& error_spec,
-      bool detailed_message = false) TF_MUST_USE_RESULT;
+      absl::optional<bool> detailed_message = absl::nullopt) TF_MUST_USE_RESULT;
 
   // Asserts the given literal are within the given error bound of the given
   // expected values. Only supported for floating point values.
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index a99b43f4690b3063f76e2cda1e58c9b4ba9a1df4..96527886b718bc1ea4ce8cc2d7dbeb2e3ef1d1eb 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -205,7 +205,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   ScopedShapedBuffer result =
       ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_TRUE(result.on_host_shape().IsTuple());
   EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
   Literal result_literal = ShapedBufferToLiteral(result);
@@ -233,7 +233,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   ScopedShapedBuffer result =
       ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_TRUE(result.on_host_shape().IsTuple());
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
   Literal result_literal = ShapedBufferToLiteral(result);
@@ -311,7 +311,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   ScopedShapedBuffer result =
       ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_TRUE(result.on_host_shape().IsTuple());
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
   Literal result_literal = ShapedBufferToLiteral(result);
@@ -842,7 +842,8 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
        LiteralUtil::CreateR0<int64>(123456789000LL)}));
 }
 
-XLA_TEST_F(LocalClientExecuteTest, InfeedTest) {
+// Disabled on interpreter backend since infeed HLO is unsupported.
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedTest)) {
   XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = Infeed(&builder, shape);
@@ -867,7 +868,8 @@ XLA_TEST_F(LocalClientExecuteTest, InfeedTest) {
   LiteralTestUtil::ExpectR1Equal<float>({-4.0, 125.0, 45.0}, result);
 }
 
-XLA_TEST_F(LocalClientExecuteTest, InfeedOutfeedTest) {
+// Disabled on interpreter backend since infeed/outfeed HLOs are unsupported.
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) {
   XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = Infeed(&builder, shape);
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 3f5135438fc59bea98527b1be30ee49339edd455..1fd9cb055c0bebc0f31496eb82f53a7b7a6cbfba 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -208,9 +208,7 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
       ROOT fusion = (s32[]) fusion(x), kind=kLoop, calls=fused_computation
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::MakeTupleOwned(
       LiteralUtil::MakeTupleOwned(
           LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0<int32>(42)),
@@ -241,9 +239,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
       const = f32[4] constant({0, 0, 0, 0})
       ROOT select = f32[4] select(gte0, gte1, const)
     })";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, -1.0});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
   LiteralTestUtil::ExpectR1Equal<float>({0.0, 4.0, 9.0, 1.0}, result);
@@ -273,9 +269,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
       p1 = f32[3] parameter(0)
       ROOT map = f32[3] map(p1), to_apply=map_computation
     })";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
   LiteralTestUtil::ExpectR1Equal<float>({0.0, 4.0, 9.0}, result);
@@ -315,9 +309,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -346,9 +338,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -378,9 +368,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2]{0}, f32[2]{0}, f32[2]{0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -410,9 +398,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -443,9 +429,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -478,9 +462,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -513,9 +495,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p, i, j), kind=kInput,
                                                               calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{0, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   auto init1 = LiteralUtil::CreateR0<float>(5);
@@ -549,9 +529,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) fusion(p),
                     kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR3<Eigen::half>(
       {{{Eigen::half(1), Eigen::half(2)}, {Eigen::half(3), Eigen::half(4)}},
        {{Eigen::half(5), Eigen::half(6)}, {Eigen::half(7), Eigen::half(8)}}});
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 8f2c26f0eea9c7a3b33cd77e5977924c1659535a..e49bcf26bd6e50f8fb36c86f217907b5d4901eae 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -80,7 +80,9 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
+// TODO(b/122047800): Interpreter does not support BF16 for RNG ops.
+XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(
+                         DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests)))) {
   for (int64 seed = 0; seed < 100; ++seed) {
     // The largest negative number smaller than zero in bf16 that's not
     // denormalized.
@@ -103,7 +105,9 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
 }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) {
+// TODO(b/122047800): Interpreter does not support BF16 for RNG ops.
+XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(
+                         DISABLED_ON_CPU(ScalarBF16CountTests)))) {
   // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75,
   // they should get similar counts.
   bfloat16 low = static_cast<bfloat16>(32.25);
@@ -276,6 +280,39 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
   EXPECT_FALSE(LiteralTestUtil::Equal(result5, result6));
 }
 
+// This test verifies that the two RNG instructions with the same parameters in
+// the same HloComputation produces different values.
+XLA_TEST_F(PrngTest, DifferentValuesForIdenticalRngNodesInSameComputation) {
+  // Build a U[0,1) computation.
+  auto build_computation = [this]() {
+    XlaBuilder builder(TestName());
+    auto a = RngUniform(ConstantR0<int32>(&builder, 0),
+                        ConstantR0<int32>(&builder, 100),
+                        ShapeUtil::MakeShape(S32, {10}));
+    auto b = RngUniform(ConstantR0<int32>(&builder, 0),
+                        ConstantR0<int32>(&builder, 100),
+                        ShapeUtil::MakeShape(S32, {10}));
+    Tuple(&builder, {a, b});
+    return builder.Build();
+  };
+
+  ExecutionOptions execution_options = execution_options_;
+  execution_options.set_seed(42);
+
+  Literal result_tuple;
+  {
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, build_computation());
+    TF_ASSERT_OK_AND_ASSIGN(
+        result_tuple, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                                  &execution_options));
+  }
+
+  auto results = result_tuple.DecomposeTuple();
+  ASSERT_EQ(results.size(), 2);
+
+  EXPECT_FALSE(LiteralTestUtil::Equal(results[0], results[1]));
+}
+
 XLA_TEST_F(PrngTest, TenValuesN01) {
   XlaBuilder builder(TestName());
   RngNormal(ConstantR0<float>(&builder, 0), ConstantR0<float>(&builder, 1),
diff --git a/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc b/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e5d7db97e88936e7336ed02a5c7a1171254b0cf
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class PtxasBugTest : public HloTestBase {};
+
+// Checks for a bug in ptxas, tracked as Google bug 120501638, and nvidia bug
+// 2459377.  We never received an explanation of what exactly was going wrong
+// here in ptxas.  Known-bad in ptxas 10.0.145, known-good in ptxas 10.0.249.
+TEST_F(PtxasBugTest, DoIt) {
+  const char* const kModuleStr = R"(
+HloModule test
+
+add_F32.14 {
+  lhs.15 = f32[] parameter(0)
+  rhs.16 = f32[] parameter(1)
+  ROOT add.17 = f32[] add(lhs.15, rhs.16)
+}
+
+ENTRY testcase {
+  arg0.1 = f32[2,5,2]{2,1,0} parameter(0)
+  reshape.2 = f32[2,5,2]{2,1,0} reshape(arg0.1)
+  constant.3 = f32[] constant(0)
+  pad.4 = f32[2,6,2]{2,1,0} pad(reshape.2, constant.3), padding=0_0x0_1x0_0
+  reshape.5 = f32[2,3,2,2]{3,2,1,0} reshape(pad.4)
+  transpose.6 = f32[2,2,3,2]{3,0,2,1} transpose(reshape.5), dimensions={2,0,1,3}
+  reshape.7 = f32[4,3,2]{2,1,0} reshape(transpose.6)
+  reshape.8 = f32[4,1,3,2]{3,2,1,0} reshape(reshape.7)
+  transpose.9 = f32[4,2,1,3]{1,3,2,0} transpose(reshape.8), dimensions={0,3,1,2}
+  convert.10 = f32[4,2,1,3]{1,3,2,0} convert(transpose.9)
+  constant.12 = f32[] constant(0)
+  pad.13 = f32[4,2,1,3]{3,2,1,0} pad(convert.10, constant.12), padding=0_0x0_0x0_0x0_0
+  constant.11 = f32[] constant(0)
+  reduce-window.18 = f32[4,2,1,3]{3,2,1,0} reduce-window(pad.13, constant.11),
+    window={size=1x1x1x1}, to_apply=add_F32.14
+  constant.19 = f32[] constant(1)
+  broadcast.20 = f32[4,2,1,3]{3,2,1,0} broadcast(constant.19), dimensions={}
+  divide.21 = f32[4,2,1,3]{3,2,1,0} divide(reduce-window.18, broadcast.20)
+  convert.22 = f32[4,2,1,3]{3,2,1,0} convert(divide.21)
+  transpose.23 = f32[4,1,3,2]{2,1,3,0} transpose(convert.22), dimensions={0,2,3,1}
+  reshape.24 = f32[4,3,2]{2,1,0} reshape(transpose.23)
+  reshape.25 = f32[2,2,3,2]{3,2,1,0} reshape(reshape.24)
+  transpose.26 = f32[2,3,2,2]{3,1,0,2} transpose(reshape.25), dimensions={1,2,0,3}
+  reshape.27 = f32[2,6,2]{2,1,0} reshape(transpose.26)
+  slice.28 = f32[2,5,2]{2,1,0} slice(reshape.27), slice={[0:2], [0:5], [0:2]}
+  reshape.29 = f32[2,5,2]{2,1,0} reshape(slice.28)
+  tuple.30 = (f32[2,5,2]{2,1,0}) tuple(reshape.29)
+  ROOT get-tuple-element.31 = f32[2,5,2]{2,1,0} get-tuple-element(tuple.30), index=0
+})";
+
+  // Create a module with the true-default flags, not the default-for-testing
+  // flags.  In particular, true-default flags enable unrolling, whereas for
+  // testing we disable unrolling, and this bug doesn't trigger without
+  // unrolling.
+  HloModuleConfig config;
+  config.set_debug_options(DefaultDebugOptionsIgnoringFlags());
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr, config));
+  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{0.01, 0.01}));
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 22fe4a2670e2e0e1fedc45036a1ceec19f44e42e..16c67d94c76bcf8984a2b3e4cb092026a6924aeb 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -607,7 +607,10 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
 
     Array4D<float> input(param.base_bounds[0], param.base_bounds[1],
                          param.base_bounds[2], param.base_bounds[3]);
-    input.FillRandom(0.1f, 0.1f);
+    // Choose a prime iota length so that each window sees a unique set of
+    // values. (Technically, the requirement is that the iota length is
+    // relatively prime to all of the dimensions involved in the reduce-window.)
+    input.FillRepeatedIota(0, 137);
     Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout(
         input, LayoutUtil::MakeLayout(param.layout));
     XlaOp parameter;
@@ -623,9 +626,9 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
         CreateConstantFromLiteral(LiteralUtil::CreateR0(kInitValue), &b);
     CHECK(param.reducer == kAdd || param.reducer == kMax);
     auto reducer = param.reducer;
-    if (use_bfloat16() && Product(param.window_bounds) > 128) {
-      // To avoid numerical issues, force the reducer to be kMax for large bf16
-      // windows.
+    if (use_bfloat16()) {
+      // To avoid numerical issues, force the reducer to be kMax for bf16
+      // inputs.
       reducer = kMax;
     }
 
@@ -949,16 +952,16 @@ struct R3ReduceWindowTestData {
      /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{95, 202, 251}, /*window_bounds=*/{95, 202, 251},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
     {/*base_bounds=*/{999, 57, 3}, /*window_bounds=*/{999, 57, 3},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
      /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{178, 302, 64}, /*window_bounds=*/{178, 302, 64},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
     {/*base_bounds=*/{63, 261, 257}, /*window_bounds=*/{63, 261, 257},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
     {/*base_bounds=*/{10003, 10, 5}, /*window_bounds=*/{9999, 7, 3},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
      /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
@@ -1001,17 +1004,19 @@ TEST_P(R3ReduceWindowTest, DoIt) {
   const float kInitValue = 0.0f;
   Array3D<float> input(param.base_bounds[0], param.base_bounds[1],
                        param.base_bounds[2]);
-  input.FillRandom(0.1f, 0.1f);
+  // Choose a prime iota length so that each window sees a unique set of values.
+  // (Technically, the requirement is that the iota length is relatively prime
+  // to all of the dimensions involved in the reduce-window.)
+  input.FillRepeatedIota(0, 137);
   Literal input_literal = LiteralUtil::CreateR3FromArray3DWithLayout(
       input, LayoutUtil::MakeLayout(param.layout));
   auto reducer = param.reducer;
   if (use_bfloat16()) {
     input_literal = LiteralUtil::ConvertF32ToBF16(input_literal);
-    if (Product(param.window_bounds) > 128) {
-      // To avoid numerical issues, force the reducer to be kMax for large bf16
-      // windows.
-      reducer = kMax;
-    }
+
+    // To avoid numerical issues, force the reducer to be kMax for bf16
+    // inputs.
+    reducer = kMax;
   }
 
   XlaOp parameter = Parameter(&b, 0, input_literal.shape(), "input");
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index 7ca99a91635e85cd0888e59ecde31e47fec21844..80a6868485c9162d1cb0de24f0adf3f1c1d2503a 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -79,30 +79,28 @@ string PrependDisabledIfIndicated(const string& test_case_name,
 // heuristic to decide whether the test case should be disabled, and we
 // determine whether the test case should be disabled by resolving the (test
 // case name, test name) in a manifest file.
-#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)   \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                     \
-      : public parent_class {                                                 \
-   public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                    \
-                                                                              \
-   private:                                                                   \
-    virtual void TestBody();                                                  \
-    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,    \
-                                                           test_name));       \
-  };                                                                          \
-                                                                              \
-  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name,           \
-                                                    test_name)::test_info_ =  \
-      ::testing::internal::MakeAndRegisterTestInfo(                           \
-          #test_case_name,                                                    \
-          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)      \
-              .c_str(),                                                       \
-          nullptr, nullptr,                                                   \
-          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
-          parent_class::SetUpTestCase, parent_class::TearDownTestCase,        \
-          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(    \
-              test_case_name, test_name)>);                                   \
+#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class)             \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                    \
+      : public parent_class {                                                \
+   public:                                                                   \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                   \
+                                                                             \
+   private:                                                                  \
+    virtual void TestBody();                                                 \
+    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;    \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,   \
+                                                           test_name));      \
+  };                                                                         \
+                                                                             \
+  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name,          \
+                                                    test_name)::test_info_ = \
+      ::testing::RegisterTest(                                               \
+          #test_case_name,                                                   \
+          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)     \
+              .c_str(),                                                      \
+          nullptr, nullptr, __FILE__, __LINE__, []() -> parent_class* {      \
+            return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name)();  \
+          });                                                                \
   void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
 
 // This is identical to the TEST_F macro from "gtest", but it potentially
@@ -111,9 +109,8 @@ string PrependDisabledIfIndicated(const string& test_case_name,
 // Per usual, you can see what tests are available via --gunit_list_tests and
 // choose to run tests that have been disabled via the manifest via
 // --gunit_also_run_disabled_tests.
-#define XLA_TEST_F(test_fixture, test_name)              \
-  XLA_GTEST_TEST_(test_fixture, test_name, test_fixture, \
-                  ::testing::internal::GetTypeId<test_fixture>())
+#define XLA_TEST_F(test_fixture, test_name) \
+  XLA_GTEST_TEST_(test_fixture, test_name, test_fixture)
 
 // Likewise, this is identical to the TEST_P macro from "gtest", but
 // potentially disables the test based on the DISABLED_MANIFEST file.
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index eafa48ed7b8cf2bd67fe767ad36082661dbbd66e..95c89b0ba6f29c453abab88e29bca13ee006455a 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -168,7 +169,7 @@ void PopulateWithRandomIntegralData(Literal* literal, std::minstd_rand0* engine,
 StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
                                           std::minstd_rand0* engine,
                                           bool no_duplicates) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     std::vector<Literal> elements;
     for (const Shape& element_shape : shape.tuple_shapes()) {
       TF_ASSIGN_OR_RETURN(
@@ -274,16 +275,9 @@ bool NeedsInitValue(const HloUse& use) {
 
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
-Literal MakeRandomIndex(absl::Span<const int64> index_space,
-                        std::minstd_rand0* engine) {
-  std::vector<int32> start_indices(index_space.size());
-  if (engine != nullptr) {
-    for (int i = 0; i < index_space.size(); ++i) {
-      std::uniform_int_distribution<int32> generator(0, index_space[i]);
-      start_indices[i] = generator(*engine);
-    }
-  }
-  return LiteralUtil::CreateR1<int32>(start_indices);
+Literal MakeRandomIndex(int64 index_bound, std::minstd_rand0* engine) {
+  std::uniform_int_distribution<int32> generator(0, index_bound);
+  return LiteralUtil::CreateR0<int32>(generator(*engine));
 }
 
 // Use dataflow analysis on each parameter to see if there are uses that would
@@ -300,8 +294,8 @@ std::vector<HloInstruction*> FindConstrainedUses(
       HloInstruction* instruction = use.instruction;
       const HloOpcode opcode = instruction->opcode();
       const int64 op_num = use.operand_number;
-      if ((opcode == HloOpcode::kDynamicSlice && op_num == 1) ||
-          (opcode == HloOpcode::kDynamicUpdateSlice && op_num == 2)) {
+      if ((opcode == HloOpcode::kDynamicSlice && op_num >= 1) ||
+          (opcode == HloOpcode::kDynamicUpdateSlice && op_num >= 2)) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kFusion) {
         const HloInstruction* const to_analyze =
@@ -336,7 +330,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
 StatusOr<Literal> CreateLiteralForConstrainedUses(
     const absl::Span<HloInstruction* const> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
-  std::vector<int64> index_space;
+  int64 index_bound = INT64_MAX;
   bool no_duplicates = false;
   bool needs_constant = false;
   ConstantType constant_type = ConstantType::kUnknown;
@@ -348,19 +342,16 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
         const Shape& slice_shape = use->opcode() == HloOpcode::kDynamicSlice
                                        ? use->shape()
                                        : use->operand(1)->shape();
-        const int64 rank = ShapeUtil::Rank(indexed_shape);
-        if (!index_space.empty()) {
-          TF_RET_CHECK(rank == index_space.size());
-          for (int64 i = 0; i < rank; ++i) {
-            index_space[i] = std::min(
-                index_space[i], ShapeUtil::GetDimension(indexed_shape, i) -
-                                    ShapeUtil::GetDimension(slice_shape, i));
-          }
-        } else {
-          index_space.resize(rank);
-          for (int64 i = 0; i < rank; ++i) {
-            index_space[i] = ShapeUtil::GetDimension(indexed_shape, i) -
-                             ShapeUtil::GetDimension(slice_shape, i);
+        const int64 first_index =
+            Cast<HloDynamicIndexInstruction>(use)->first_index_operand_number();
+        for (int64 operand = first_index; operand < use->operand_count();
+             ++operand) {
+          if (use->operand(operand) == &param) {
+            index_bound = std::min(
+                index_bound,
+                ShapeUtil::GetDimension(indexed_shape, operand - first_index) -
+                    ShapeUtil::GetDimension(slice_shape,
+                                            operand - first_index));
           }
         }
         break;
@@ -388,13 +379,14 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
   }
   int constraint_count = 0;
   constraint_count += no_duplicates ? 1 : 0;
-  constraint_count += !index_space.empty() ? 1 : 0;
+  constraint_count += (index_bound != INT64_MAX) ? 1 : 0;
   constraint_count += needs_constant ? 1 : 0;
   if (constraint_count > 1) {
     return Unimplemented("Conflicting operand generation constraints.");
   }
-  if (!index_space.empty()) {
-    return MakeRandomIndex(index_space, engine);
+  if (index_bound != INT64_MAX) {
+    return MakeRandomIndex(index_bound, engine)
+        .Reshape(param.shape().dimensions());
   } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
@@ -459,8 +451,8 @@ Status VerifyHloModule(HloModule* const module, bool layout_sensitive,
 std::unique_ptr<HloDotInstruction> CreateCanonicalDot(const Shape& shape,
                                                       HloInstruction* lhs,
                                                       HloInstruction* rhs) {
-  CHECK_EQ(ShapeUtil::Rank(lhs->shape()), 2);
-  CHECK_EQ(ShapeUtil::Rank(rhs->shape()), 2);
+  CHECK_EQ(lhs->shape().rank(), 2);
+  CHECK_EQ(rhs->shape().rank(), 2);
   PrecisionConfig precision_config;
   precision_config.mutable_operand_precision()->Resize(
       2, PrecisionConfig::DEFAULT);
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 448a66cfdd897b17cce1c87c050520a2f2eb0ea2..591d6c19228a313f530cdae18f4be37e7b517601 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -79,25 +79,26 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
                     R"(HloModule index_space_module
 
     ENTRY IndexSpace {
-      index_param = s32[3]{0} parameter(0)
-      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
-      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
-      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param), dynamic_slice_sizes={1,2,3}
-      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param), dynamic_slice_sizes={3,2,2}
+      index_param.0 = s32[] parameter(0)
+      index_param.1 = s32[] parameter(1)
+      index_param.2 = s32[] parameter(2)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(3)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(4)
+      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param.0, index_param.1, index_param.2), dynamic_slice_sizes={1,2,3}
+      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param.0, index_param.1, index_param.2), dynamic_slice_sizes={3,2,2}
     })")
                     .ValueOrDie();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
                           MakeFakeArguments(module.get()));
-  ASSERT_EQ(args.size(), 3);
-  const Literal& index_arg = args[0];
+  ASSERT_EQ(args.size(), 5);
 
-  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+  EXPECT_EQ(args[0].Get<int32>({}), 0);
 
-  EXPECT_GE(index_arg.Get<int32>({1}), 0);
-  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), 0);
+  EXPECT_LE(args[0].Get<int32>({}), 2);
 
-  EXPECT_GE(index_arg.Get<int32>({2}), 0);
-  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
 XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
@@ -105,28 +106,29 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
                     R"(HloModule index_space_module
 
     ENTRY IndexSpace {
-      index_param = s32[3]{0} parameter(0)
-      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
-      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
-      update_param.1 = f32[1,2,3]{0,1,2} parameter(3)
-      update_param.2 = f32[3,2,2]{0,1,2} parameter(4)
-
-      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param)
-      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param)
+      index_param.0 = s32[] parameter(0)
+      index_param.1 = s32[] parameter(1)
+      index_param.2 = s32[] parameter(2)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(3)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(4)
+      update_param.1 = f32[1,2,3]{0,1,2} parameter(5)
+      update_param.2 = f32[3,2,2]{0,1,2} parameter(6)
+
+      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param.0, index_param.1, index_param.2)
+      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param.0, index_param.1, index_param.2)
     })")
                     .ValueOrDie();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
                           MakeFakeArguments(module.get()));
-  ASSERT_EQ(args.size(), 5);
-  const Literal& index_arg = args[0];
+  ASSERT_EQ(args.size(), 7);
 
-  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+  EXPECT_EQ(args[0].Get<int32>({}), 0);
 
-  EXPECT_GE(index_arg.Get<int32>({1}), 0);
-  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), 0);
+  EXPECT_LE(args[0].Get<int32>({}), 2);
 
-  EXPECT_GE(index_arg.Get<int32>({2}), 0);
-  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
 XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
@@ -198,5 +200,33 @@ ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,14
   }
 }
 
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsR0InputToDynamicSlice) {
+  auto module = ParseHloString(R"(
+HloModule Test
+
+ENTRY %module (parameter.0: s32[], parameter.1: f32[20,20]) -> f32[] {
+  %parameter.1 = f32[20,20]{1,0} parameter(1)
+  %constant.1 = s32[1]{0} constant({0})
+  %parameter.0 = s32[] parameter(0)
+  %bitcast.3 = s32[1]{0} bitcast(s32[] %parameter.0)
+  %concatenate.1 = s32[2]{0} concatenate(s32[1]{0} %constant.1, s32[1]{0} %bitcast.3), dimensions={0}
+  %dynamic-slice.2 = f32[20,1]{1,0} dynamic-slice(f32[20,20]{1,0} %parameter.1, s32[2]{0} %concatenate.1), dynamic_slice_sizes={20,1}
+  %bitcast.4 = f32[20]{0} bitcast(f32[20,1]{1,0} %dynamic-slice.2)
+  %dynamic-slice.3 = f32[1]{0} dynamic-slice(f32[20]{0} %bitcast.4, s32[1]{0} %bitcast.3), dynamic_slice_sizes={1}
+  ROOT %bitcast.5 = f32[] bitcast(f32[1]{0} %dynamic-slice.3)
+}
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  EXPECT_TRUE(ShapeUtil::Equal(args[0].shape(), ShapeUtil::MakeShape(S32, {})))
+      << ShapeUtil::HumanString(args[0].shape());
+  EXPECT_TRUE(
+      ShapeUtil::Equal(args[1].shape(), ShapeUtil::MakeShape(F32, {20, 20})))
+      << ShapeUtil::HumanString(args[1].shape());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 9c586bdeb05afb7378e92caed1f3edc408e051bf..cdf2c34fcc3cc005e84626c39c8ab301a9040529 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -176,8 +176,9 @@ XLA_TEST_F(TupleTest, AddTupleElements) {
       {2.f, 4.f, 6.f},  // row 0
       {5.f, 7.f, 9.f},  // row 1
   });
-  ASSERT_TRUE(ShapeUtil::ShapeIs(vector_shape, F32, {3}));
-  ASSERT_TRUE(ShapeUtil::ShapeIs(matrix_shape, F32, {/*y=*/2, /*x=*/3}));
+  ASSERT_TRUE(ShapeUtil::Equal(vector_shape, ShapeUtil::MakeShape(F32, {3})));
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_shape,
+                               ShapeUtil::MakeShape(F32, {/*y=*/2, /*x=*/3})));
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
@@ -512,8 +513,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
 
 class TupleHloTest : public HloTestBase {};
 
-// Disabled on the interpreter because bitcast doesn't exist on the interpreter.
-XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
+XLA_TEST_F(TupleHloTest, BitcastAfterGTE) {
   const char* testcase = R"(
     HloModule m, is_scheduled=true
 
@@ -525,9 +525,7 @@ XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
       ROOT tuple.4 = (f32[1,3]{1,0}) tuple(copy)
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1<float>({1, 2, 3}));
   auto result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -559,9 +557,7 @@ XLA_TEST_F(TupleHloTest,
       ROOT outfeed = token[] outfeed(tuple, token0)
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param0 = LiteralUtil::CreateR1<float>({1, 2});
   auto param1 = LiteralUtil::CreateR1<float>({2, 3});
   auto param4 = LiteralUtil::CreateR0<bool>(false);
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 6d5f276e82087cedc356691b0ff08df24cec8d20..85212fa56d71088156d2f3edda17f71cdab56da2 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -861,7 +861,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
     // Update.
     auto update = ConvertElementType(Broadcast(out0, {2}), F32);
     // Starts = iteration * 2;
-    auto starts = Reshape(Mul(iteration, ConstantR0<int32>(&builder, 2)), {1});
+    auto starts = Mul(iteration, ConstantR0<int32>(&builder, 2));
     // UpdateSlice.
     auto out1 = DynamicUpdateSlice(input, update, starts);
 
@@ -901,7 +901,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 // Per backend the values generated can be different as the different backends
 // use different random number generators.
 // TODO(b/32240857): Extend test to verify outputs.
-XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
+XLA_TEST_F(WhileTest, WhileWithPrngScalarResult) {
   auto v6s32 = ShapeUtil::MakeShape(S32, {6});
 
   // Create a computation for the condition: repeat for count iterations.
@@ -1146,7 +1146,7 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 // while (f(result).get<0>()) {
 //   result = result + 1;
 // }
-XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
+XLA_TEST_F(WhileTest, WhileWithCallInsideCondition) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
@@ -1299,9 +1299,9 @@ void BM_WhileLoop(int num_iters) {
     auto one = ConstantR0<float>(&builder, 1.0);
     auto update = Broadcast(one, {1, 1024, 1024});
     // Starts = iteration * 2;
-    auto starts = ConstantR1<int32>(&builder, {0, 0, 0});
+    auto zero = ConstantR0<int32>(&builder, 0);
     // UpdateSlice.
-    auto out1 = DynamicUpdateSlice(input, update, starts);
+    auto out1 = DynamicUpdateSlice(input, update, {zero, zero, zero});
     Tuple(&builder, {out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index e57d072a0632b492b8b6e34439f4e80332b843b6..c7337e8caae8f2ee25f4b25dc22439e08d2ecc25 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -174,9 +174,8 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   exec_run_options.set_allocator(backend->memory_allocator());
   exec_run_options.set_intra_op_thread_pool(
       backend->eigen_intra_op_thread_pool_device());
-  ServiceExecutableRunOptions run_options(
-      exec_run_options, /*borrow_stream=*/nullptr,
-      backend->eigen_intra_op_thread_pool());
+  ServiceExecutableRunOptions run_options(exec_run_options,
+                                          /*borrow_stream=*/nullptr);
   std::vector<const ShapedBuffer*> args = {&lhs_arg, &rhs_arg};
   TF_ASSERT_OK_AND_ASSIGN(
       auto execution_result,
@@ -225,14 +224,17 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
 
   line_no++;  // Skip 'Execution profile for ....'
 
+  ASSERT_LT(line_no, profile_output_lines.size());
   TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
                                          /*expect_hlo=*/false,
                                          &parsed_profile_lines));
 
+  ASSERT_LT(line_no, profile_output_lines.size());
   TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
                                          /*expect_hlo=*/true,
                                          &parsed_profile_lines));
 
+  ASSERT_LT(line_no, profile_output_lines.size());
   TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
                                          /*expect_hlo=*/true,
                                          &parsed_profile_lines));
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 99b32c19a52bf2a1f02047a1ceea626947d994fc..52fee4770ab940741723514d742e998b25765f24 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -29,33 +29,6 @@ tf_cc_binary(
     ],
 )
 
-cc_library(
-    name = "dumped_computation_to_graphviz_library",
-    srcs = ["dumped_computation_to_graphviz.cc"],
-    deps = [
-        "//tensorflow/compiler/xla:debug_options_flags",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-tf_cc_binary(
-    name = "dumped_computation_to_graphviz",
-    deps = [
-        ":dumped_computation_to_graphviz_library",
-        "//tensorflow/compiler/xla/service:interpreter_plugin",
-    ],
-)
-
 tf_cc_binary(
     name = "show_signature",
     srcs = ["show_signature.cc"],
@@ -95,6 +68,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service/gpu:infeed_manager",
+        "//tensorflow/compiler/xla/service/gpu:outfeed_manager",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
@@ -281,3 +255,9 @@ tf_cc_binary(
         "@com_google_absl//absl/strings",
     ],
 )
+
+sh_test(
+    name = "interactive_graphviz_build_only_test",
+    srcs = ["interactive_graphviz_test.sh"],
+    data = [":interactive_graphviz"],
+)
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
deleted file mode 100644
index b623556468fb4a5d96be614b6c067d5a1df51a6f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Usage: dumped_computation_to_graphviz some_binary_snapshot_proto*
-//
-// Dumps a graphviz URL for a snapshot computation to the command line.
-//
-// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from
-// ServiceInterface::SnapshotComputation to disk.
-//
-// The GraphViz URL is placed into the log stderr, whereas computation
-// statistics are printed on stdout (implementation note: getting computation
-// statistics is how we trigger compilation to split out a GraphViz URL).
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/debug_options_flags.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-namespace tools {
-
-void RealMain(absl::Span<char* const> args) {
-  Client* client = ClientLibrary::LocalClientOrDie();
-  for (char* arg : args) {
-    HloSnapshot module;
-    TF_CHECK_OK(
-        tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    XlaComputation computation =
-        client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = GetDebugOptionsFromFlags();
-    debug_options.set_xla_generate_hlo_graph(".*");
-    ComputationStats stats =
-        client->GetComputationStats(computation, debug_options)
-            .ConsumeValueOrDie();
-    fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
-  }
-}
-
-}  // namespace tools
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  std::vector<tensorflow::Flag> flag_list;
-  xla::AppendDebugOptionsFlags(&flag_list);
-  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  absl::Span<char* const> args(argv, argc);
-  args.remove_prefix(1);  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/tools/hlo_extractor_test.cc b/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
index c187222a11ee721b006194a68620c58749707193..4beb099b330cadf4540944979f38681bae07103c 100644
--- a/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
+++ b/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
@@ -36,9 +36,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   {
     auto extracted_module =
@@ -75,9 +74,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   {
     auto extracted_module =
@@ -120,9 +118,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   {
     auto extracted_module =
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
index 6c90cde5a75a93837ee149fd9b5a60e6413c2ac4..ac865707f8697e0b94173a2a33e7be52a9564867 100644
--- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -29,8 +29,7 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
-#include "absl/strings/string_view_utils.h"
-#include "absl/strings/util.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -56,7 +55,8 @@ bool ReadLine(const char *prompt, string *line) {
   return util::ReadLine(prompt, line);
 #else
   std::cout << prompt;
-  return std::getline(std::cin, *line);
+  std::getline(std::cin, *line);
+  return std::cin.good();
 #endif
 }
 
@@ -391,9 +391,9 @@ void DisplayGraphHandle(const Options &opts, const string& handle) {
   std::cout << handle << std::endl;
 
   // If it is a url, try to open it up in the user's browser too.
-  if (strings::StartsWithIgnoreCase(handle, "http://") ||
-      strings::StartsWithIgnoreCase(handle, "https://") ||
-      strings::StartsWithIgnoreCase(handle, "file://")) {
+  if (absl::StartsWithIgnoreCase(handle, "http://") ||
+      absl::StartsWithIgnoreCase(handle, "https://") ||
+      absl::StartsWithIgnoreCase(handle, "file://")) {
     const char* browser_bin = opts.browser.empty() ? "/usr/bin/sensible-browser"
                                                    : opts.browser.c_str();
     tensorflow::SubProcess p;
@@ -515,7 +515,7 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) {
                 << std::endl;
       continue;
     }
-    std::vector<string> tokens = strings::Split(line, ' ');
+    std::vector<string> tokens = absl::StrSplit(line, ' ');
     if (tokens[0] == "quit" || tokens[0] == "exit") {
       break;
     } else if (tokens[0] == "help") {
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh b/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3e43aa7da062547fb5f187b885e997fc44bbb65
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+# /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================*/
+
+# This is a placeholder for a compile-only test for intractive_graphviz tool.
+
+exit 0
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 27a8dd13308b29da9a5013ac9f696613981d68bb..c01a47b510c0e4252e350960b995643b39b70d4a 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
+#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -73,7 +74,17 @@ namespace {
 // fields.
 struct Options {
   string fake_infeed_shape;
-  bool generate_fake_infeed = false;
+  string fake_outfeed_shape;
+
+  // generate_fake_infeed == true is a safe default: If the model has 0 or 1
+  // infeeds, then it will work like normal.  If the model has more than one
+  // infeed, it will be an error, but that wouldn't have worked anyway if you
+  // hadn't passed generate_fake_infeed.
+  //
+  // Same for generate_fake_outfeed.
+  bool generate_fake_infeed = true;
+  bool generate_fake_outfeed = true;
+
   bool use_fake_data = false;
   bool print_result = true;
   int num_runs = 1;
@@ -96,6 +107,83 @@ std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
       .ValueOrDie();
 }
 
+absl::optional<Shape> GetXfeedShape(bool is_infeed,
+                                    const HloModuleProto& module,
+                                    const Options& opts) {
+  std::vector<HloInstructionProto> xfeed_instrs;
+  for (const auto& comp : module.computations()) {
+    for (const auto& instruction : comp.instructions()) {
+      if (instruction.opcode() == HloOpcodeString(is_infeed
+                                                      ? HloOpcode::kInfeed
+                                                      : HloOpcode::kOutfeed)) {
+        xfeed_instrs.push_back(instruction);
+      }
+    }
+  }
+
+  auto log_xfeed_instrs = [&] {
+    for (const auto& infeed : xfeed_instrs) {
+      LOG(ERROR) << "  " << ShapeUtil::HumanString(Shape(infeed.shape())) << " "
+                 << infeed.name();
+    }
+  };
+
+  auto find_instruction_from_id_or_die = [&](int64 id) {
+    for (const auto& comp : module.computations()) {
+      for (const auto& instruction : comp.instructions()) {
+        if (instruction.id() == id) {
+          return instruction;
+        }
+      }
+    }
+    LOG(FATAL) << "No instruction with id " << id;
+  };
+
+  absl::optional<Shape> xfeed_shape;
+  string xfeed_name = is_infeed ? "infeed" : "outfeed";
+  string fake_xfeed_shape =
+      is_infeed ? opts.fake_infeed_shape : opts.fake_outfeed_shape;
+  bool generate_fake_xfeed =
+      is_infeed ? opts.generate_fake_infeed : opts.generate_fake_outfeed;
+  if (!fake_xfeed_shape.empty()) {
+    xfeed_shape = std::move(ParseShape(fake_xfeed_shape)).ValueOrDie();
+  } else if (generate_fake_xfeed) {
+    CHECK_LT(xfeed_instrs.size(), 2)
+        << "--generate_fake_" << xfeed_name
+        << " only works if the model has 0 or 1 " << xfeed_name << " ops.";
+    if (xfeed_instrs.empty()) {
+      LOG(INFO) << "Not generating fake " << xfeed_name
+                << " shape; model has no " << xfeed_name << "s.";
+    } else if (xfeed_instrs.size() == 1) {
+      // kInfeed instructions should have a shape (buffer, token).  kOutfeed
+      // instructions should have operand 0 of shape `buffer`. We want to xfeed
+      // just `buffer`.
+      xfeed_shape = is_infeed
+                        ? Shape(xfeed_instrs.front().shape()).tuple_shapes(0)
+                        : Shape(find_instruction_from_id_or_die(
+                                    xfeed_instrs.front().operand_ids(0))
+                                    .shape());
+      LOG(INFO) << "Generating fake " << xfeed_name << " with inferred shape: "
+                << ShapeUtil::HumanString(*xfeed_shape);
+    } else {
+      LOG(ERROR) << "--generate_fake_" << xfeed_name
+                 << " only works if the model has 0 or 1 " << xfeed_name
+                 << " ops, but this model has " << xfeed_instrs.size()
+                 << " of them:";
+      log_xfeed_instrs();
+      LOG(FATAL) << "Can't run model with --generate_fake_infeed.";
+    }
+  } else if (!xfeed_instrs.empty()) {
+    LOG(ERROR) << "Model contains " << xfeed_instrs.size() << " " << xfeed_name
+               << " instruction(s), but neither --generate_fake_" << xfeed_name
+               << " nor --fake_" << xfeed_name
+               << "_shape was specified.  Execution will likely hang.";
+    log_xfeed_instrs();
+  }
+
+  return xfeed_shape;
+}
+
 // Invokes the given computation passing arbitrary data for every (unbound)
 // parameter if use_fake_data, Otherwise use recorded data if available.
 //
@@ -142,54 +230,37 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
     }
   }
 
-  bool provide_infeed = false;
-  Shape infeed_shape;
-  if (!opts.fake_infeed_shape.empty()) {
-    StatusOr<Shape> shape_status = ParseShape(opts.fake_infeed_shape);
-    TF_CHECK_OK(shape_status.status());
-    infeed_shape = std::move(shape_status).ValueOrDie();
-    provide_infeed = true;
-  } else if (opts.generate_fake_infeed) {
-    for (const auto& comp : computation.proto().computations()) {
-      for (const auto& instruction : comp.instructions()) {
-        if (instruction.opcode() == HloOpcodeString(HloOpcode::kInfeed)) {
-          CHECK(!provide_infeed)
-              << "--generate_fake_infeed only works if the model has 0 or 1 "
-                 "infeed ops, but this one has >= 2.";
-          provide_infeed = true;
-          infeed_shape = Shape(instruction.shape());
-          LOG(INFO) << "Generating fake infeed shape for inferred shape: "
-                    << ShapeUtil::HumanString(infeed_shape);
-        }
-      }
-    }
+  if (absl::optional<Shape> infeed_shape = GetXfeedShape(
+          /*is_infeed=*/true, computation.proto(), opts)) {
+    auto infeed_data = std::make_shared<Literal>(
+        std::move(MakeFakeLiteral(*infeed_shape)).ValueOrDie());
+    xla::gpu::GetOrCreateInfeedManager()
+        ->RegisterBeforeGetNextDestinationCallback([infeed_data, client] {
+          TF_CHECK_OK(client->TransferToInfeed(*infeed_data));
+        });
   }
-  // We only instantiate the thread pool if the user has requested that a
-  // concurrent infeed occur via the fake_infeed_shape, or when
-  // --generate_fake_infeed is passed and there exists an infeed operation in
-  // the HloSnapshot.
-  absl::optional<tensorflow::thread::ThreadPool> pool;
-  Literal data;
-  if (provide_infeed) {
-    data = std::move(MakeFakeLiteral(infeed_shape)).ValueOrDie();
-  }
-  auto transfer_infeed = [&data, client]() {
-    TF_CHECK_OK(client->TransferToInfeed(data));
-  };
-  if (provide_infeed) {
-    pool.emplace(tensorflow::Env::Default(), "infeed",
-                 /*num_threads=*/1);
-    pool->Schedule([transfer_infeed]() {
-      // There may be several infeed buffers needed, however we don't know how
-      // many. If we proactively transfer too many infeed buffers, we may run
-      // out of memory. If we transfer too few infeed buffers, the program will
-      // hang. Therefore, we register a callback that is called when the infeed
-      // becomes empty, and in this callback we will transfer another fake
-      // infeed.
-      auto infeed_manager = xla::gpu::GetOrCreateInfeedManager();
-      infeed_manager->RegisterOnEmptyCallback(transfer_infeed);
-      transfer_infeed();
-    });
+
+  absl::optional<tensorflow::thread::ThreadPool> outfeed_thread_pool;
+  if (absl::optional<Shape> outfeed_shape = GetXfeedShape(
+          /*is_infeed=*/false, computation.proto(), opts)) {
+    // For each an outfeed that runs, enqueue a task that will consume it.  We
+    // need a thread pool because the act of running an outfeed blocks on there
+    // being a destination available, and the act of making a destination
+    // available blocks on there being outfeed data available.
+    outfeed_thread_pool.emplace(tensorflow::Env::Default(), "infeed",
+                                /*num_threads=*/1);
+    auto consume_outfeed = [client, outfeed_shape] {
+      TF_CHECK_OK(
+          client->TransferFromOutfeedLocal(*outfeed_shape, /*device_ordinal=*/0)
+              .status());
+      VLOG(1) << "Received outfeed data of shape "
+              << ShapeUtil::HumanStringWithLayout(*outfeed_shape);
+    };
+    xla::gpu::GetOrCreateOutfeedManager()
+        ->RegisterBeforeGetNextDestinationCallback(
+            [consume_outfeed, &outfeed_thread_pool] {
+              outfeed_thread_pool->Schedule(consume_outfeed);
+            });
   }
 
   // Do not attempt to run the executable if num_runs is less than 1.
@@ -304,8 +375,10 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
 
   for (int64 i = 0; i < executables.size(); ++i) {
     LocalExecutable* executable = executables[i].get();
+    LOG(ERROR) << "Running iteration " << i;
     StatusOr<Literal> result_status =
         ReplayComputation(snapshots[i], executable, client, opts);
+    LOG(ERROR) << "iteration complete.";
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", args[i],
               result_status.status().ToString().c_str());
@@ -350,9 +423,14 @@ int main(int argc, char** argv) {
                        "Number of times to run each computation"),
       tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
+      tensorflow::Flag("fake_outfeed_shape", &opts.fake_outfeed_shape,
+                       "Shape of fake data to outfeed from computation"),
       tensorflow::Flag("generate_fake_infeed", &opts.generate_fake_infeed,
-                       "Whether a fake infeed shape should be generated "
-                       "derived from the computation"),
+                       "Whether a fake infeed shape should be derived "
+                       "from the computation"),
+      tensorflow::Flag("generate_fake_outfeed", &opts.generate_fake_outfeed,
+                       "Whether a fake outfeed shape should be derived "
+                       "from the computation"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index b645acb700b0f168112a40c9c72b4669435f717d..daf678f69017b9eb86cbc464a1f33b434021901d 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -41,6 +41,7 @@ using ::tensorflow::uint32;
 using ::tensorflow::uint64;
 
 using complex64 = std::complex<float>;
+using complex128 = std::complex<double>;
 
 using ::Eigen::half;
 
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 68cab7387cf1576072f96878b50f07def6862d8b..34b73b5206fa20d6dff7567afd78fd89897c8c33 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -86,7 +86,7 @@ bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
     CHECK_LT(index, rank);
     output[index] = 0;
   }
-  return std::find(output.begin(), output.end(), -1) == output.end();
+  return !absl::c_linear_search(output, -1);
 }
 
 std::vector<int64> InversePermutation(
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 6722641e9d2c177440361e6f0d1f6c0804eb7cda..f2fd17dc99455a921bf875aad2a3661b4d456823 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -324,8 +324,7 @@ bool IsIdentityPermutation(absl::Span<const int64> permutation);
 
 template <typename Container>
 int64 PositionInContainer(const Container& container, int64 value) {
-  return std::distance(container.begin(),
-                       std::find(container.begin(), container.end(), value));
+  return std::distance(container.begin(), absl::c_find(container, value));
 }
 
 // Formats the container as a comma-separated string. StrAppend must support
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 51c73b3d17e4c32d9a8a14d3055ab56f02922af3..e001cc35f9fcea2783b3952e825838af6bbece72 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -137,25 +138,23 @@ bool HasPadding(const Window& window) {
 }
 
 bool HasSymmetricPadding(const Window& window) {
-  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
-                     [](const WindowDimension& dim) {
-                       return dim.padding_low() == dim.padding_high();
-                     });
+  return absl::c_all_of(window.dimensions(), [](const WindowDimension& dim) {
+    return dim.padding_low() == dim.padding_high();
+  });
 }
 
 bool HasSymmetricPadding(const PaddingConfig& padding_config) {
-  return std::all_of(padding_config.dimensions().begin(),
-                     padding_config.dimensions().end(),
-                     [](const PaddingConfig::PaddingConfigDimension& dim) {
-                       return dim.edge_padding_low() == dim.edge_padding_high();
-                     });
+  return absl::c_all_of(padding_config.dimensions(),
+                        [](const PaddingConfig::PaddingConfigDimension& dim) {
+                          return dim.edge_padding_low() ==
+                                 dim.edge_padding_high();
+                        });
 }
 
 bool HasNegativePadding(const Window& window) {
-  return std::any_of(window.dimensions().begin(), window.dimensions().end(),
-                     [](const WindowDimension& dim) {
-                       return dim.padding_low() < 0 || dim.padding_high() < 0;
-                     });
+  return absl::c_any_of(window.dimensions(), [](const WindowDimension& dim) {
+    return dim.padding_low() < 0 || dim.padding_high() < 0;
+  });
 }
 
 bool HasBaseDilation(const Window& window) {
@@ -190,10 +189,9 @@ bool AllOrNoneReversed(const Window& window) {
     return true;
   }
   bool reversed = window.dimensions()[0].window_reversal();
-  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
-                     [&](const WindowDimension& dim) {
-                       return dim.window_reversal() == reversed;
-                     });
+  return absl::c_all_of(window.dimensions(), [&](const WindowDimension& dim) {
+    return dim.window_reversal() == reversed;
+  });
 }
 
 bool HasDilation(const Window& window) {
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 1439f1bcc5cec39203a7cb4b1f8604e7349382c6..60adea5a4a242e5843b41927ba77c197e8fac444 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -1,30 +1,40 @@
 """Wrapper around cc_proto_library used inside the XLA codebase."""
 
-load("//tensorflow/core:platform/default/build_config.bzl",
-     "cc_proto_library")
-load("//tensorflow/core:platform/default/build_config_root.bzl",
-     "if_static")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "cc_proto_library",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
+)
+load("//tensorflow:tensorflow.bzl", "if_cuda_is_configured")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
-  if kwargs.get('use_grpc_plugin'):
-    kwargs['use_grpc_namespace'] = True
-  cc_proto_library(name=name,
-                   srcs=srcs,
-                   deps=deps,
-                   cc_libs = if_static(
-                       ["@protobuf_archive//:protobuf"],
-                       otherwise=["@protobuf_archive//:protobuf_headers"],
-                   ),
-                   protoc="@protobuf_archive//:protoc",
-                   testonly=testonly,
-                   visibility=visibility,
-                   **kwargs)
+def xla_proto_library(name, srcs = [], deps = [], visibility = None, testonly = 0, **kwargs):
+    if kwargs.get("use_grpc_plugin"):
+        kwargs["use_grpc_namespace"] = True
+    cc_proto_library(
+        name = name,
+        srcs = srcs,
+        deps = deps,
+        cc_libs = if_static(
+            ["@protobuf_archive//:protobuf"],
+            otherwise = ["@protobuf_archive//:protobuf_headers"],
+        ),
+        protoc = "@protobuf_archive//:protoc",
+        testonly = testonly,
+        visibility = visibility,
+        **kwargs
+    )
 
 def xla_py_grpc_library(**kwargs):
-  # Note: we don't currently define any special targets for Python GRPC in OSS.
-  _ignore = kwargs
-  pass
-
+    # Note: we don't currently define any special targets for Python GRPC in OSS.
+    _ignore = kwargs
+    pass
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
+
+# We link the GPU plugin into the XLA Python extension if CUDA is enabled.
+def xla_python_default_plugins():
+    return if_cuda_is_configured(["//tensorflow/compiler/xla/service:gpu_plugin"])
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 0e8fa73f8170addfa5061b33f3d6882a13890bce..92834dbb02cdcd6383ceec3ffd079834b163ee6a 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -230,7 +230,11 @@ message DebugOptions {
   // Enable fast math with eigen in the HLO evaluator.
   bool xla_hlo_evaluator_use_fast_path = 106;
 
-  // Next id: 107
+  // Temporary option to allow support for both the R1 and the scalar index
+  // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
+  bool xla_allow_scalar_index_dynamic_ops = 107;
+
+  // Next id: 108
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
@@ -261,6 +265,10 @@ message ExecutionOptions {
   // computation on. The computation will be partitioned across these devices.
   // If not provided, the default device will be chosen.
   repeated DeviceHandle device_handles = 5;
+
+  // Number of replicas of the computation to run. If zero, uses the default
+  // number of replicas for the XLA service.
+  int32 num_replicas = 6;
 }
 
 message GetDeviceHandlesRequest {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index e9c86abe5094244988d3465ef7c949509deaec37..a64e2f5df5cacca05e83f31c941c57abd5ccf4de 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -56,6 +56,7 @@ enum PrimitiveType {
 
   // Complex values of fixed width.
   C64 = 15;  // Paired F32 (real, imag), as in std::complex<float>.
+  C128 = 18;  // Paired F64 (real, imag), as in std::complex<double>.
 
   // A tuple is a polymorphic sequence; e.g. a shape that holds different
   // sub-shapes. They are used for things like returning multiple values from a
@@ -75,7 +76,7 @@ enum PrimitiveType {
   // primitive type will have empty dimensions and tuple_shapes fields.
   TOKEN = 17;
 
-  // Next = 18
+  // Next = 19
 }
 
 // Describes the padding configuration for Pad operation. The padding amount on
@@ -188,11 +189,14 @@ message ShapeProto {
   // The element type for this shape.
   PrimitiveType element_type = 2;
 
-  // The size (number of elements) for each dimension.
-  // In XLA, dimensions are numbered from 0 to N-1 for an
-  // N-dimensional array. The first element of 'dimensions' is the size of
-  // dimension 0, the second element is the size of dimension 1, and so forth.
-  // Empty list indicates a scalar.
+  // The size (number of elements) for each dimension, or an upper bound on the
+  // size if the dimension is dynamic.  In XLA, dimensions are numbered from 0
+  // to N-1 for an N-dimensional array. The first element of 'dimensions' is the
+  // size of dimension 0, the second element is the size of dimension 1, and so
+  // forth.  Empty list indicates a scalar.
+  //
+  // If the respective element in 'is_dimension_dynamic' is true then the value
+  // in this field represents an upper bound on the size of the dimension.
   repeated int64 dimensions = 3;
 
   // For tuples only, the shapes of constitutent shapes in the tuple sequence.
@@ -201,6 +205,12 @@ message ShapeProto {
   // The layout used to back this shape.
   LayoutProto layout = 5;
 
+  // For arrays, this indicates whether or not each dimension is
+  // dynamically-sized. The number of elements in this repeated field should be
+  // zero (indicating that no dimensions are dynamic) or equal to the number of
+  // elements in the 'dimensions' field.
+  repeated bool is_dynamic_dimension = 6;
+
   // Important: if any field is added, be sure to modify ShapeUtil::Equal(),
   // ShapeUtil::Compatible() and ShapeUtil::Hash() appropriately to account for
   // the new field.
@@ -358,6 +368,7 @@ message LiteralProto {
   repeated float f32s = 8;
   repeated double f64s = 9;
   repeated float c64s = 12;  // Stored as interleaved real, imag floats.
+  repeated double c128s = 18;  // Stored as interleaved real, imag doubles.
   repeated LiteralProto tuple_literals = 10;
   // The F16s, BF16s, U16s and S16s are encoded in little endian byte order
   bytes f16s = 11;
@@ -365,7 +376,7 @@ message LiteralProto {
   bytes u16s = 16;
   bytes s16s = 17;
   repeated int64 sparse_indices = 14;
-  // Next = 18
+  // Next = 19
 }
 
 message WindowDimension {
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index 67f475846e5f16060c1080759b0acb4216c4e72b..dc02fd272fd8700c7f8fa64adf7ab57c88bab706 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -11,20 +11,15 @@ cc_library(
     name = "xrt_state_ops",
     hdrs = ["xrt_state_ops.h"],
     deps = [
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt:xrt_utils",
         "//tensorflow/core:core_cpu_internal",
@@ -55,6 +50,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt:xrt_utils",
         "//tensorflow/core:core_cpu_internal",
@@ -62,7 +58,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/stream_executor:stream_executor_headers_lib",
+        "//tensorflow/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index 2ccdf0f02d840600d5e0649c4805e3672d4a1286..2ee1a6cd1aebcdbd65892b33e5044489070ab5c4 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -215,11 +215,6 @@ XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default;
 void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XRTReleaseCompilationRefOp::Compute";
 
-  const Tensor& key_tensor = ctx->input(0);
-  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor.shape()),
-              errors::Internal("computation key should be a string scalar"));
-  int64 uid = key_tensor.scalar<int64>()();
-
   ResourceMgr* rm;
   OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm));
 
@@ -230,9 +225,13 @@ void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
                           kXRTCompilationCacheResourceName, &cache));
   core::ScopedUnref cache_unref(cache);
 
-  OP_REQUIRES_OK(ctx, cache->Release(uid));
-
-  VLOG(2) << "Released computation handle " << uid;
+  const Tensor& keys_tensor = ctx->input(0);
+  auto flat_keys = keys_tensor.flat<int64>();
+  for (int64 i = 0; i < flat_keys.size(); ++i) {
+    int64 key = flat_keys(i);
+    OP_REQUIRES_OK(ctx, cache->Release(key));
+    VLOG(2) << "Released computation handle " << key;
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 751329eefc33f3372335c805233dafabbf42bf36..116c193cab65410a5a7c3058f98cc2be2cbe9e67 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -228,8 +229,27 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) {
   TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
       shaped_buffer, device_ref.backend(), device_ref.device_ordinal(),
       &output_tuple));
+
+  // The ScopedShapedBuffer returned by the executable Run() API, in case of
+  // input/output buffer aliasing, might have holes in it, which need to be
+  // filled using the proper input tuples buffers which are the source of
+  // aliasing.
+  const xla::HloInputOutputAliasConfig& input_output_alias =
+      executable->executable()->module().input_output_alias_config();
+  auto alias_function =
+      [&](const xla::ShapeIndex& output_index,
+          const xla::HloInputOutputAliasConfig::Alias& alias) -> Status {
+    TF_RET_CHECK(alias.parameter_number < input_tuples.size());
+    return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias
+               ? output_tuple->AliasBufferFrom(
+                     *input_tuples[alias.parameter_number],
+                     alias.parameter_index, output_index)
+               : Status::OK();
+  };
+  TF_RETURN_IF_ERROR(input_output_alias.ForEachAliasWithStatus(alias_function));
+
   if (config_proto.return_exploded_tuple() &&
-      xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) {
+      output_tuple->on_device_shape().IsTuple()) {
     int64 tuple_element_count =
         xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
     Tensor* output_tensor;
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index 1a5bfac337baf773b84b92af5f88ef7a4c8ba81f..6a7f10652533920ba3fa48fba1d5161f7c4d4530 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -37,6 +37,17 @@ REGISTER_KERNEL_BUILDER(Name("XRTAllocate")
                             .HostMemory("handle"),
                         XRTAllocateOp<XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("inputs")
+                            .HostMemory("handle"),
+                        XRTAllocateFromTensorOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("inputs")
+                            .HostMemory("handle"),
+                        XRTAllocateFromTensorOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTSubTuple")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("base_handle")
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index 2e2f3ff116a7b331df8dbd58a9fe40096f524140..e2c223b3dbb2311d0f42e1a36e316fd9d5f66040 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -19,10 +19,14 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
 #define TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
 
+#include <functional>
 #include <memory>
 #include <string>
 
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -30,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
 #include "tensorflow/compiler/xrt/xrt_state.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -200,6 +205,109 @@ class XRTAllocateOp : public OpKernel {
   }
 };
 
+// Op that allocates memory for a tensor (with optional layout) and transfers it
+// to the device, returning an allocation handle.
+template <class DeviceAccessor>
+class XRTAllocateFromTensorOp : public OpKernel {
+ public:
+  explicit XRTAllocateFromTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    bool make_tuple = false;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &tf_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("make_tuple", &make_tuple));
+    if (ctx->HasAttr("layouts")) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major_));
+    }
+    OP_REQUIRES(
+        ctx, tf_shapes_.size() == dtypes_.size(),
+        errors::InvalidArgument("shapes and dtypes must be the same length"));
+    std::vector<xla::Shape> xla_shapes;
+    for (int i = 0; i < tf_shapes_.size(); i++) {
+      xla::Shape xla_shape;
+      OP_REQUIRES_OK(
+          ctx, TensorShapeToXLAShape(dtypes_[i], tf_shapes_[i], &xla_shape));
+      xla_shapes.push_back(xla_shape);
+    }
+    if (xla_shapes.size() > 1 || make_tuple) {
+      shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes);
+    } else {
+      shape_.Swap(&xla_shapes.front());
+    }
+    if (!minor_to_major_.empty()) {
+      xla::Shape shape_with_layouts;
+      OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major_,
+                                             /*layout_func=*/nullptr,
+                                             &shape_with_layouts));
+      shape_.Swap(&shape_with_layouts);
+    }
+  }
+
+  ~XRTAllocateFromTensorOp() override = default;
+  XRTAllocateFromTensorOp(const XRTAllocateFromTensorOp&) = delete;
+  XRTAllocateFromTensorOp& operator=(const XRTAllocateFromTensorOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTAllocateFromTensorOp::Compute";
+
+    OpInputList values;
+    OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &values));
+    OP_REQUIRES(ctx, values.size() == tf_shapes_.size(),
+                errors::InvalidArgument(
+                    "Wrong number of inputs to XRTAllocateFromTensor: ",
+                    values.size(), " vs. ", tf_shapes_.size()));
+
+    std::vector<const char*> tensors_data;
+    for (size_t i = 0; i < values.size(); ++i) {
+      const Tensor& input_tensor = values[i];
+      OP_REQUIRES(ctx, input_tensor.dtype() == dtypes_[i],
+                  errors::InvalidArgument(
+                      "Input tensor type and input dtype do not match"));
+      // We allow the requested on-device shape to differ from the shape of the
+      // input tensor, as long as they have the same number of elements.
+      OP_REQUIRES(
+          ctx,
+          input_tensor.shape().num_elements() == tf_shapes_[i].num_elements(),
+          errors::InvalidArgument(
+              "Input tensor must have the number of elements specified "
+              "in the matching input shape: ",
+              input_tensor.shape().num_elements(), " vs. ",
+              tf_shapes_[i].num_elements(), " at index ", i));
+      tensors_data.push_back(
+          static_cast<const char*>(DMAHelper::base(&input_tensor)));
+    }
+    // Use the buffer straight out of the input tensors to create the literal.
+    xla::BorrowingLiteral literal =
+        shape_.IsTuple() ? xla::BorrowingLiteral(tensors_data, shape_)
+                         : xla::BorrowingLiteral(tensors_data.front(), shape_);
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    class DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer(
+                            literal, device_ref.backend(),
+                            device_ref.device_ordinal(), &allocation));
+
+    // Intern takes ownership of our reference to allocation.
+    int64 key;
+    OP_REQUIRES_OK(ctx, allocation->Intern(rm, &key));
+
+    Tensor output(DT_INT64, TensorShape({}));
+    output.scalar<int64>()() = key;
+    ctx->set_output(0, output);
+  }
+
+ private:
+  std::vector<TensorShape> tf_shapes_;
+  DataTypeVector dtypes_;
+  std::vector<int64> minor_to_major_;
+  xla::Shape shape_;
+};
+
 // Op that takes a tuple handle input and returns a handle to a sub-tuple of the
 // input.
 template <bool discard_, class DeviceAccessor>
@@ -453,17 +561,17 @@ class XRTReleaseAllocationOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     VLOG(1) << "XRTReleaseAllocationOp::Compute";
 
-    const Tensor& allocation_handle = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(allocation_handle.shape()),
-                errors::Internal("handle input should be an int64 scalar"));
-    int64 key = allocation_handle.scalar<int64>()();
-
     ResourceMgr* rm;
     OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
 
-    OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager(rm, key));
-
-    VLOG(2) << "Released allocation handle " << key;
+    const Tensor& allocation_handle = ctx->input(0);
+    auto flat_keys = allocation_handle.flat<int64>();
+    for (int64 i = 0; i < flat_keys.size(); ++i) {
+      int64 key = flat_keys(i);
+      OP_REQUIRES_OK(ctx,
+                     XRTTupleAllocation::DeleteFromResourceManager(rm, key));
+      VLOG(2) << "Released allocation handle " << key;
+    }
   }
 };
 
diff --git a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
index 7b3b50c69559f6003a108fdf6a1325dbdbaa80a6..9dd964e5467cd855d67764a512e95a6a18f482e1 100644
--- a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
@@ -44,10 +44,10 @@ REGISTER_OP("XRTReleaseCompilationHandle")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(
         R"(
-Discards a computation from the compilation cache. The handle cannot be
-subsequently used.
+Discards one or more computation handles from the compilation cache.
+The handle(s) cannot be subsequently used.
 
-'handle' is an id returned from a XRTCompile Op.
+'handle' is an ID (or vector of IDs) returned from a XRTCompile Op.
 )");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index fe6bee0dacf5dc2050613fc9ad34d3235b5a7b63..2e743fec4963a52ee1abf64525f26e3d89479670 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -26,12 +26,41 @@ REGISTER_OP("XRTAllocate")
     .SetShapeFn(tensorflow::shape_inference::ScalarShape)
     .Doc(
         R"(
-Reads a literal proto and transfers it to TPU device memory.
+Reads a literal proto and transfers it to device memory.
 
-'allocation' is a serialized xrt::TPUAllocation proto.
+'allocation' is a serialized xrt::XLAAllocation proto.
 'handle' is an id that can be used in other ops to refer to the allocation.
 )");
 
+REGISTER_OP("XRTAllocateFromTensor")
+    .Input("inputs: dtypes")
+    .Output("handle: int64")
+    .Attr("dtypes: list(type)")
+    .Attr("shapes: list(shape)")
+    .Attr("layouts: list(int) = []")
+    .Attr("make_tuple: bool = false")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Reads a list of tensors with optional layouts, and transfers it to device
+memory.
+
+inputs: The tensors holding the input data.
+shapes: The shapes which the tensors should have on device. The i-th shape
+corresponds to the i-th input. The shapes, together with the (optional)
+layouts, helps creating the fully qualified shape of the data on the device.
+The shapes can differ from the corresponding input one, as long as the total
+number of elements matches. In other words, it is possible to feed an input
+tensor with shape {8} and have a corresponding shape {2,2,2}.
+layouts: A vector holding the requested layout in minor-to-major sequence.
+If empty, the default layout wil be used.
+For a tuple, the layouts vector holds a linearized minor-to-major numbers
+for all the tuple leaves, in the order they appear within the tuple.
+The elements within the layouts sequence corresponding to a given tuple
+subshape can be set to -1, to leave such subshape to the default shape.
+handle: An id that can be used in other ops to refer to the allocation.
+)");
+
 REGISTER_OP("XRTSubTuple")
     .Input("base_handle: int64")
     .Input("shape_index: int32")
@@ -127,10 +156,11 @@ REGISTER_OP("XRTReleaseAllocationHandle")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(
         R"(
-Discards an allocation from device memory. The handle cannot be subsequently
+Discards one or more device memory handles. The handle(s) cannot be subsequently
 used.
 
-'handle' is the id returned from the Op that produced the on-device allocation.
+'handle' is the ID (or a vector of IDs) returned from the Op that produced the
+on-device allocation.
 )");
 
 REGISTER_OP("XRTReleaseAllAllocations")
diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD
index be44a3474acdeb9905c1d21b932fa0dd10b5a212..3a19327e5b5d8072fbecdbe10e9959c8491780eb 100644
--- a/tensorflow/compiler/xrt/tests/BUILD
+++ b/tensorflow/compiler/xrt/tests/BUILD
@@ -24,6 +24,7 @@ cc_library(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index 5f8121703e108f26b048feb7a0412a282f52892c..1111f8240512e81c10a42a28c09f5b0a94daf1ee 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -53,6 +55,14 @@ string DeviceFromFlag() {
   return absl::StrCat("/device:", xla_test_device, ":0");
 }
 
+std::vector<int> GetAttrLayout(absl::Span<const int64> minor_to_mayor) {
+  std::vector<int> layout;
+  for (auto dim : minor_to_mayor) {
+    layout.push_back(static_cast<int>(dim));
+  }
+  return layout;
+}
+
 xla::LiteralProto TwoElementTuple() {
   auto array = xla::LiteralUtil::CreateR1<float>({1.0f, 3.0f});
   auto matrix = xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}});
@@ -96,14 +106,21 @@ xla::LiteralProto FloatMatrix(
   return array.ToProto();
 }
 
+xla::Literal ReadOutputLiteral(const std::vector<Tensor>& outputs, size_t idx) {
+  xla::LiteralProto response;
+  CHECK(response.ParseFromString(outputs[idx].scalar<string>()()));
+  return xla::Literal::CreateFromProto(response).ValueOrDie();
+}
+
 bool CompareLiteralProtos(const xla::LiteralProto& a,
                           const xla::LiteralProto& b) {
   auto l_a = xla::Literal::CreateFromProto(a).ValueOrDie();
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = l_a == l_b;
   if (!equal) {
-    LOG(INFO) << "LiteralProtos don't match: " << a.DebugString()
-              << " != " << b.DebugString();
+    LOG(INFO) << "LiteralProtos don't match:\n"
+              << a.DebugString() << "\n!=\n"
+              << b.DebugString();
   }
   return equal;
 }
@@ -113,8 +130,19 @@ bool CompareLiteralToLiteralProto(const xla::Literal& a,
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = a == l_b;
   if (!equal) {
-    LOG(INFO) << "Literal and LiteralProto don't match "
-              << a.ToProto().DebugString() << " != " << b.DebugString();
+    LOG(INFO) << "Literal and LiteralProto don't match:\n"
+              << a.ToProto().DebugString() << "\n!=\n"
+              << b.DebugString();
+  }
+  return equal;
+}
+
+bool CompareLiterals(const xla::Literal& a, const xla::Literal& b) {
+  bool equal = a == b;
+  if (!equal) {
+    LOG(INFO) << "Literals don't match:\n"
+              << a.ToProto().DebugString() << "\n!=\n"
+              << b.ToProto().DebugString();
   }
   return equal;
 }
@@ -215,6 +243,120 @@ xla::ProgramShape XlaCompiledProgramShape(
       ->ComputeProgramShape();
 }
 
+TEST(RawApiTest, AllocFromTensor) {
+  xla::Literal literal =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  Tensor tensor;
+  TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  std::vector<int> layout =
+      GetAttrLayout(literal.shape().layout().minor_to_major());
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout);
+  auto handle =
+      ops::XRTAllocateFromTensor(root, {tensor}, {tensor.shape()}, alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response));
+}
+
+TEST(RawApiTest, AllocFromTensorTuple) {
+  xla::Literal literal0 =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  xla::Literal literal1 =
+      xla::LiteralUtil::CreateR2<float>({{14.0f, -5.0f}, {16.0f, 17.0f}});
+  xla::Literal literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1});
+  Tensor tensor0;
+  TF_ASSERT_OK(LiteralToHostTensor(literal0, DT_FLOAT, &tensor0));
+  Tensor tensor1;
+  TF_ASSERT_OK(LiteralToHostTensor(literal1, DT_FLOAT, &tensor1));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  std::vector<int> layout = GetShapeLayoutVector(literal.shape()).ValueOrDie();
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout);
+  auto handle = ops::XRTAllocateFromTensor(root, {tensor0, tensor1},
+                                           {tensor0.shape(), tensor1.shape()},
+                                           alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response));
+}
+
+TEST(RawApiTest, AllocFromTensorTupleSingle) {
+  xla::Literal literal0 =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  xla::Literal literal = xla::LiteralUtil::MakeTuple({&literal0});
+  Tensor tensor0;
+  TF_ASSERT_OK(LiteralToHostTensor(literal0, DT_FLOAT, &tensor0));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  std::vector<int> layout = GetShapeLayoutVector(literal.shape()).ValueOrDie();
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout).MakeTuple(true);
+  auto handle = ops::XRTAllocateFromTensor(root, {tensor0}, {tensor0.shape()},
+                                           alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response));
+}
+
+TEST(RawApiTest, AllocFromTensorRelayout) {
+  xla::Literal literal =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  Tensor tensor;
+  TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  // Use inverse array layout with the tensor data above.
+  std::vector<int> layout({0, 1});
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout);
+  auto handle =
+      ops::XRTAllocateFromTensor(root, {tensor}, {tensor.shape()}, alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  // We have sent literal's data (in array layout) with a attribute layout
+  // {0,1}, so the expected literal read from device needs to be changed
+  // accordingly.
+  xla::Literal expected_literal =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 6.0f}, {5.0f, 7.0f}});
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected_literal, response));
+}
+
 TEST(RawApiTest, AllocAndRewrite) {
   xrt::XLAAllocation alloc;
   *alloc.mutable_value() =
@@ -258,8 +400,102 @@ TEST(RawApiTest, AllocAndRewrite) {
   EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar<string>()()));
   EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response));
 
-  auto release =
-      ops::XRTReleaseAllocationHandle(root, Input(allocation_handle));
+  Tensor release_tensor(DT_INT64, TensorShape({1}));
+  release_tensor.flat<int64>()(0) = allocation_handle;
+
+  auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, AllocReleaseMany) {
+  xrt::XLAAllocation alloc1;
+  *alloc1.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+  xrt::XLAAllocation alloc2;
+  *alloc2.mutable_value() =
+      xla::LiteralUtil::CreateR2({{6, 7}, {4, 5}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value1 =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc1.SerializeAsString());
+  auto value2 =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc2.SerializeAsString());
+  auto handle1 = ops::XRTAllocate(root, value1);
+  auto handle2 = ops::XRTAllocate(root, value2);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({handle1, handle2}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 allocation_handle1 = outputs[0].scalar<int64>()();
+  int64 allocation_handle2 = outputs[1].scalar<int64>()();
+
+  Tensor release_tensor(DT_INT64, TensorShape({2}));
+  release_tensor.flat<int64>()(0) = allocation_handle1;
+  release_tensor.flat<int64>()(1) = allocation_handle2;
+
+  auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, CompileAndReleaseMany) {
+  xrt::XLAComputation c1;
+  auto config1 = c1.mutable_config();
+  auto shapes1 = config1->mutable_program_shape();
+  *shapes1->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes1->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes1->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  StoreComputationSnapshot(AddAndScale(), c1.mutable_hlo_snapshot());
+
+  xrt::XLAComputation c2;
+  auto config2 = c2.mutable_config();
+  auto shapes2 = config2->mutable_program_shape();
+  *shapes2->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes2->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes2->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndTuple(), c2.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(false);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation1 =
+      ops::Const(root.WithDevice("/device:CPU:0"), c1.SerializeAsString());
+  auto c_handle1 = ops::XRTCompile(root, computation1);
+  auto computation2 =
+      ops::Const(root.WithDevice("/device:CPU:0"), c2.SerializeAsString());
+  auto c_handle2 = ops::XRTCompile(root, computation2);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({c_handle1.handle, c_handle2.handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 compilation_handle1 = outputs[0].scalar<int64>()();
+  int64 compilation_handle2 = outputs[1].scalar<int64>()();
+
+  Tensor release_tensor(DT_INT64, TensorShape({2}));
+  release_tensor.flat<int64>()(0) = compilation_handle1;
+  release_tensor.flat<int64>()(1) = compilation_handle2;
+
+  auto release = ops::XRTReleaseCompilationHandle(root, release_tensor);
+  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
                            &outputs));
 }
@@ -845,6 +1081,107 @@ TEST(RawApiTest, LeakCompilationReference) {
   TF_EXPECT_OK(session.Run({c_handle.handle}, &outputs));
 }
 
+TEST(RawApiTest, CompileAndExecuteWithReusedBuffers) {
+  xla::Shape element_shape = xla::ShapeUtil::MakeShape(xla::F32, {2});
+  xla::Shape shape =
+      xla::ShapeUtil::MakeTupleShape({element_shape, element_shape});
+  xla::Shape return_shape = xla::ShapeUtil::MakeTupleShape(
+      {element_shape, element_shape, element_shape, element_shape});
+  xla::XlaBuilder builder("ReuseBuffer");
+  auto param = xla::Parameter(&builder, 0, shape, "param");
+  auto p0 = xla::GetTupleElement(param, 0);
+  auto p1 = xla::GetTupleElement(param, 1);
+  auto add = xla::Add(p0, p1);
+  auto sub = xla::Sub(p0, p1);
+  xla::Tuple(&builder, {add, sub, p0, p1});
+
+  // Flip the tuple literals in the input handle.
+  builder.SetUpAlias({1}, 0, {0});
+  builder.SetUpAlias({0}, 0, {1});
+
+  auto computation = builder.Build().ValueOrDie();
+
+  auto literal0 = xla::LiteralUtil::CreateR1<float>({1.0f, 2.0f});
+  auto literal1 = xla::LiteralUtil::CreateR1<float>({5.0f, 9.0f});
+  auto literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1});
+
+  xrt::XLAAllocation param_alloc;
+  *param_alloc.mutable_value() = literal.ToProto();
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = shape.ToProto();
+  *shapes->mutable_result() = return_shape.ToProto();
+  StoreComputationSnapshot(computation, c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  ClientSession session(root);
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto c_data =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, c_data);
+  auto param_value = ops::Const(root.WithDevice("/device:CPU:0"),
+                                param_alloc.SerializeAsString());
+  auto param_handle = ops::XRTAllocate(root, param_value);
+  TF_ASSERT_OK(root.status());
+
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({param_handle}, &outputs));
+
+  int64 alloc_handle = outputs[0].scalar<int64>()();
+
+  // Note that we release the result handle immediately, but since we aliased
+  // the output buffers onto the input allocation ones (held in alloc_handle),
+  // we can fetch the result from there.
+  auto result =
+      ops::XRTExecute(root, c_handle.handle, e_config, {Input(alloc_handle)});
+  auto read_back = ops::XRTReadLiteral(root, result);
+  auto release = ops::XRTReleaseAllocationHandle(
+      root.WithControlDependencies(read_back), result);
+  TF_ASSERT_OK(root.status());
+
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_back},
+                           {release}, &outputs));
+
+  xla::Literal exec_literal = ReadOutputLiteral(outputs, 0);
+  auto exec_literal_parts = exec_literal.DecomposeTuple();
+  ASSERT_EQ(exec_literal_parts.size(), 4);
+
+  EXPECT_TRUE(CompareLiterals(exec_literal_parts[2], literal0));
+  EXPECT_TRUE(CompareLiterals(exec_literal_parts[3], literal1));
+
+  // Now we read back the original input handle values, which at this point
+  // should contain the result of the XLA computation.
+  auto read_handle = ops::XRTReadLiteral(root, Input(alloc_handle));
+  TF_ASSERT_OK(root.status());
+  auto release_handle = ops::XRTReleaseAllocationHandle(
+      root.WithControlDependencies(read_handle), Input(alloc_handle));
+  TF_ASSERT_OK(root.status());
+
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_handle},
+                           {release_handle}, &outputs));
+
+  xla::Literal return_literal = ReadOutputLiteral(outputs, 0);
+
+  auto expected_literal0 = xla::LiteralUtil::CreateR1<float>({6.0f, 11.0f});
+  auto expected_literal1 = xla::LiteralUtil::CreateR1<float>({-4.0f, -7.0f});
+  // The first element of the computation returned tuple would be the add
+  // (expected_literal0), but since we flipped the buffers, the sub
+  // (expected_literal1) should come first.
+  auto expected_literal =
+      xla::LiteralUtil::MakeTuple({&expected_literal1, &expected_literal0});
+
+  EXPECT_TRUE(CompareLiterals(return_literal, expected_literal));
+}
+
 TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   xrt::XLAAllocation p0;
   *p0.mutable_value() = xla::LiteralUtil::CreateR0<int64>(11031965).ToProto();
@@ -862,6 +1199,7 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   xrt::XRTExecutionConfig e;
   e.set_release_input_handles(true);
   e.set_release_compilation_handle(true);
+  e.set_return_exploded_tuple(true);
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
   auto e_config =
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.cc b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
index d1405eae468492748ae88d842334a922dce272c6..8bf0f28d2233d9e7593365bc42187e327a1c4ac4 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.cc
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
@@ -273,6 +273,8 @@ Status XRTCompilationCache::Lookup(
   return Status::OK();
 }
 
-string XRTCompilationCache::DebugString() { return "XRTCompilationCache"; }
+string XRTCompilationCache::DebugString() const {
+  return "XRTCompilationCache";
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.h b/tensorflow/compiler/xrt/xrt_compilation_cache.h
index c43d0fc47873abdc82ee937c155bebc346a05f17..7398e847d8b744f947adb03e1bcfd5c0a5b2cc55 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.h
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.h
@@ -118,7 +118,7 @@ class XRTCompilationCache : public ResourceBase {
   // EntryRef holding the program is returned in entry.
   Status Lookup(int64 uid, std::unique_ptr<XRTCompilationCacheEntryRef>* entry);
 
-  string DebugString() override;
+  string DebugString() const override;
 
  private:
   // An entry in the compilation cache. The entry is deleted once it has been
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 343460ff107fa81be127950837f786fe4eeadf26..1e2a9584f88b73d7c92a929e93af60376a59170b 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -133,7 +133,8 @@ Status AllocateScopedShapedBuffer(
 XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          int device_ordinal,
                                          xla::DeviceMemoryAllocator* allocator)
-    : allocation_(allocation),
+    : size_(allocation.size()),
+      allocation_(allocation),
       device_ordinal_(device_ordinal),
       allocator_(allocator) {
   if (VLOG_IS_ON(2)) {
@@ -181,7 +182,7 @@ XRTTupleAllocation::~XRTTupleAllocation() {
 }
 
 /*static*/ Status XRTTupleAllocation::CreateAndTransfer(
-    const xla::Literal& literal, xla::Backend* backend, int device_ordinal,
+    const xla::LiteralBase& literal, xla::Backend* backend, int device_ordinal,
     XRTTupleAllocation** allocation) {
   auto transfer_manager = backend->transfer_manager();
   auto allocator = backend->memory_allocator();
@@ -223,8 +224,19 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
                                      xla::Literal* literal) {
   auto transfer_manager = backend->transfer_manager();
   TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal));
+
+  // Validate the allocation buffers as if nulls gets to
+  // TransferLiteralFromDevice() a CHECK is issued.
+  xla::ShapedBuffer shaped_buffer = ToShapedBuffer();
+  for (auto& index_buffer : shaped_buffer.buffers()) {
+    if (index_buffer.second.is_null()) {
+      return errors::InvalidArgument("Literal buffer at index ",
+                                     index_buffer.first.ToString(),
+                                     " has been released");
+    }
+  }
   TF_ASSIGN_OR_RETURN(*literal, transfer_manager->TransferLiteralFromDevice(
-                                    stream.get(), ToShapedBuffer()));
+                                    stream.get(), shaped_buffer));
   return Status::OK();
 }
 
@@ -505,11 +517,34 @@ xla::ShapedBuffer XRTTupleAllocation::ToShapedBuffer() {
   return shaped_buffer;
 }
 
+Status XRTTupleAllocation::AliasBufferFrom(const XRTTupleAllocation& source,
+                                           const xla::ShapeIndex& source_index,
+                                           const xla::ShapeIndex& dest_index) {
+  XRTBufferAllocation* source_buffer = source.buffers_.element(source_index);
+  XRTBufferAllocation* dest_buffer = buffers_.element(dest_index);
+  // We allow the destination size being zero, because there are cases where we
+  // are coming in later filling in null/uninitialized device buffers.
+  // In all other cases, the size of the new buffer must match.
+  if (source_buffer->size() != dest_buffer->size() &&
+      dest_buffer->size() != 0) {
+    return errors::InvalidArgument(
+        "Source buffer at index ", source_index.ToString(),
+        " does not match the size of destination buffer at index ",
+        dest_index.ToString(), ": ", source_buffer->size(), " vs ",
+        dest_buffer->size());
+  }
+  *buffers_.mutable_element(dest_index) = source_buffer;
+  source_buffer->Ref();
+  dest_buffer->Unref();
+  return Status::OK();
+}
+
 xla::ShapeTree<xla::MaybeOwningDeviceMemory>
-XRTTupleAllocation::ToDeviceMemoryTree(bool release) {
+XRTTupleAllocation::ToDeviceMemoryTree(
+    const std::function<bool(const xla::ShapeIndex&)>& release_checker) {
   xla::ShapeTree<xla::MaybeOwningDeviceMemory> shaped_tree(on_device_shape());
   for (const auto& buffer : buffers_) {
-    if (!release) {
+    if (!release_checker(buffer.first)) {
       *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation();
     } else {
       *shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory(
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 3e3d5024124e13b87eed6f79596d50cd64325914..ddf2656e6f51775024a6d1cd0d7a387605faae6f 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XRT_XRT_STATE_H_
 #define TENSORFLOW_COMPILER_XRT_XRT_STATE_H_
 
+#include <functional>
 #include <memory>
 #include <string>
 #include <vector>
@@ -58,7 +59,14 @@ class XRTBufferAllocation : public core::RefCounted {
   // freed when the reference count drops to zero.
   void DiscardAllocation();
 
+  // Returns the expected size of the allocation. Since DiscardAllocation() will
+  // set allocation_ to {null,0}, and since later we might want to replace the
+  // discarded buffer with a new one, we need to be able to verify the size
+  // compatibility.
+  uint64 size() const { return size_; }
+
  private:
+  uint64 size_ = 0;
   se::DeviceMemoryBase allocation_;
   int device_ordinal_;
   xla::DeviceMemoryAllocator* allocator_;
@@ -80,7 +88,7 @@ class XRTTupleAllocation : public ResourceBase {
   // Allocates new device memory buffers sufficient to store literal, transfers
   // literal to that memory, and returns a XRTTupleAllocation handle to the
   // allocated buffers.
-  static Status CreateAndTransfer(const xla::Literal& literal,
+  static Status CreateAndTransfer(const xla::LiteralBase& literal,
                                   xla::Backend* backend, int device_ordinal,
                                   XRTTupleAllocation** allocation);
 
@@ -168,11 +176,20 @@ class XRTTupleAllocation : public ResourceBase {
   // the same shape as on_host_shape.
   xla::ShapedBuffer ToShapedBuffer();
 
-  // Returns the device memory tree of this allocation. If 'release' is set, the
-  // ownership of the device memory is transferred to the result.
-  xla::ShapeTree<xla::MaybeOwningDeviceMemory> ToDeviceMemoryTree(bool release);
+  // Aliases the source buffer at source_index into the current tuple allocation
+  // dest_index.
+  Status AliasBufferFrom(const XRTTupleAllocation& source,
+                         const xla::ShapeIndex& source_index,
+                         const xla::ShapeIndex& dest_index);
+
+  // Returns the device memory tree of this allocation. If the release_checker
+  // function returns true for a given index, the ownership of the device memory
+  // at that index is transferred to the result. Every attempt to read the value
+  // at that index will fail.
+  xla::ShapeTree<xla::MaybeOwningDeviceMemory> ToDeviceMemoryTree(
+      const std::function<bool(const xla::ShapeIndex&)>& release_checker);
 
-  string DebugString() override { return "XLA allocation handle"; }
+  string DebugString() const override { return "XLA allocation handle"; }
 
  private:
   // Creates a new handle with (tuple) shape.
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 832db0f4ab46911e067d17b4a125706c276cf798..a4c3d9623adfe3133af0c6ea055586b9544e659b 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -63,7 +63,6 @@ py_library(
         "//tensorflow/contrib/libsvm",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
-        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -197,7 +196,7 @@ cc_library(
             "//tensorflow/contrib/kinesis:dataset_kernels",
         ],
     }) + if_not_windows([
-        "//tensorflow/contrib/tensorrt:trt_engine_op_kernel",
+        "//tensorflow/contrib/tensorrt:trt_op_kernels",
     ]),
 )
 
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 4f1a2a5693235183c8f486817b82c8c81fa389ec..48d5296c71cbdb470fa405b30547a32b7022f29b 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -20,13 +20,14 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import platform
 
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import autograph
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
 from tensorflow.contrib import checkpoint
-if os.name != "nt":
+if os.name != "nt" and platform.machine() != "s390x":
   from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
@@ -91,7 +92,6 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.recurrent.python import recurrent_api as recurrent
@@ -103,6 +103,8 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 ffmpeg = LazyLoader("ffmpeg", globals(),
                     "tensorflow.contrib.ffmpeg")
 del os
+del platform
+
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
index 44532cb078f9bd1578172f8a7d8a4b55cd21a7cb..831c613f2c8c9a4fcc2cb9d313077fe79ee96fd7 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
@@ -186,8 +186,8 @@
         "\n",
         "  def __init__(self):\n",
         "    super(RnnColorbot, self).__init__()\n",
-        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
-        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256, dtype=tf.float32)\n",
+        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128, dtype=tf.float32)\n",
         "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
         "\n",
         "  def _rnn_layer(self, chars, cell, batch_size, training):\n",
@@ -241,7 +241,7 @@
         "    seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n",
         "\n",
         "    # Grab just the end-of-sequence from each output.\n",
-        "    indices = (length - 1, range(batch_size))\n",
+        "    indices = (length - 1, list(range(batch_size)))\n",
         "    indices = tf.stack(indices, 1)\n",
         "    sequence_ends = tf.gather_nd(seq, indices)\n",
         "    return self.relu_layer(sequence_ends)\n",
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index 648f3ebb05646a66144bcb118347cbc391909409..5174afe0a63d37e3ea3e19ac9bab644d1d83ecf1 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -37,6 +37,7 @@ py_library(
 cc_library(
     name = "batch_ops_kernels",
     deps = [
+        "//tensorflow/core:batch_ops_op_lib",
         "//tensorflow/core/kernels:batch_kernels",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
index 4652021fecabfa11fa6a8754dc884d89e151b590..e3b4535bac4a01a1277290e0d1ea6d3c7613731c 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
@@ -42,7 +42,7 @@ class BigtableClientResource : public ResourceBase {
     return client_;
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("BigtableClientResource(project_id: ", project_id_,
                            ", instance_id: ", instance_id_, ")");
   }
@@ -67,7 +67,7 @@ class BigtableTableResource : public ResourceBase {
 
   ::google::cloud::bigtable::noex::Table& table() { return table_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(
         "BigtableTableResource(client: ", client_->DebugString(),
         ", table: ", table_name_, ")");
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index 3fe71a2ea730cc9b60b2e2088a0d80a08b38d1a9..e6fda9e61757f1441b3691c2a3d57c6f1a5a0d42 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h"
 
-#include "google/bigtable/v2/data.pb.h"
+#include "external/com_github_googleapis_googleapis/google/bigtable/v2/data.pb.h"
 #include "google/protobuf/wrappers.pb.h"
 #include "re2/re2.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -410,6 +410,17 @@ BigtableTestClient::AsyncCheckAndMutateRow(
   return nullptr;
 }
 
+std::unique_ptr<
+    grpc::ClientAsyncReaderInterface<google::bigtable::v2::ReadRowsResponse>>
+BigtableTestClient::AsyncReadRows(
+    grpc::ClientContext* context,
+    const google::bigtable::v2::ReadRowsRequest& request,
+    grpc::CompletionQueue* cq, void* tag) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() {
   LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely "
                   "cause a crash!";
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index 85705904573e9e7710912e3f4ff30dd8fed5bf85..8e1326f2ce841368ea81fc7194a0588e5d6cd637 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -87,6 +87,12 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
       const google::bigtable::v2::CheckAndMutateRowRequest& request,
       grpc::CompletionQueue* cq) override;
 
+  std::unique_ptr<
+      grpc::ClientAsyncReaderInterface<google::bigtable::v2::ReadRowsResponse>>
+  AsyncReadRows(grpc::ClientContext* context,
+                const google::bigtable::v2::ReadRowsRequest& request,
+                grpc::CompletionQueue* cq, void* tag) override;
+
   std::shared_ptr<grpc::Channel> Channel() override;
 
  private:
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index ee052ac60387d8f993e4942dd7dff39e191dd3a4..47d910d42a27db4b857eeb12209dfbb429dd1be2 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -487,8 +487,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper_0 <= 0.98)
     self.assertTrue(frac_below_upper_1 >= 0.92)
     self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.92)
-    self.assertTrue(frac_both_below_upper <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.91)
+    self.assertTrue(frac_both_below_upper <= 0.99)
 
     train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
         two_dimension=True)
@@ -516,8 +516,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_above_lower_0 <= 0.98)
     self.assertTrue(frac_above_lower_1 >= 0.92)
     self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.92)
-    self.assertTrue(frac_both_above_lower <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.91)
+    self.assertTrue(frac_both_above_lower <= 0.99)
 
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
@@ -806,8 +806,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper_0 <= 0.98)
     self.assertTrue(frac_below_upper_1 >= 0.92)
     self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.92)
-    self.assertTrue(frac_both_below_upper <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.91)
+    self.assertTrue(frac_both_below_upper <= 0.99)
 
     train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
         two_dimension=True)
@@ -835,8 +835,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     self.assertTrue(frac_above_lower_0 <= 0.98)
     self.assertTrue(frac_above_lower_1 >= 0.92)
     self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.92)
-    self.assertTrue(frac_both_above_lower <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.91)
+    self.assertTrue(frac_both_above_lower <= 0.99)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index e446c411a8d5075563b8f8b912b29df310e16c8c..6faf6963011b698a3b233329d87471da7608e44a 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -96,7 +96,7 @@ class StatsAccumulatorResource : public boosted_trees::StampedResource {
              TensorShapeUtils::IsScalar(hessian_shape));
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("StatsAccumulatorResource[size=", values_.size(),
                            "]");
   }
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
index 42d69645acaae063fcd46bd1f6c819ccb68f48bd..aa3f24f08a0f762507df83def72e7d595265221f 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
@@ -227,7 +227,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
             tree_ensemble_config=tree_ensemble_config.SerializeToString(),
             name="restore_tree")
         resources.initialize_resources(resources.shared_resources()).run()
-        variables.initialize_all_variables().run()
+        variables.global_variables_initializer().run()
         my_saver = saver.Saver()
 
         # Add the second tree and replace the ensemble of the handle.
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index fca22c71a83459cb290eaebcf107cf1c14c222b7..c3685b54e201f73039f6623443c67ba2b217a51e 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -62,8 +62,8 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject):
         saver.BaseSaverBuilder.SaveSpec(ensemble_config, slice_spec,
                                         name + "_config"),
     ]
-    super(TreeEnsembleVariableSavable,
-          self).__init__(tree_ensemble_handle, specs, name)
+    super(TreeEnsembleVariableSavable, self).__init__(tree_ensemble_handle,
+                                                      specs, name)
     self._tree_ensemble_handle = tree_ensemble_handle
     self._create_op = create_op
 
@@ -115,7 +115,7 @@ class TreeEnsembleVariable(tracking.TrackableResource):
 
   def _gather_saveables_for_checkpoint(self):
     return {
-        "tree_ensemble_variable":
+        self.resource_handle.op.name + "/tree_ensemble_variable":
             functools.partial(
                 TreeEnsembleVariableSavable,
                 tree_ensemble_handle=self.resource_handle,
@@ -131,8 +131,8 @@ def tree_ensemble_variable(stamp_token,
 
   Args:
     stamp_token: The initial stamp token value for the ensemble resource.
-    tree_ensemble_config: A `Tensor` of type `string`.
-      Serialized proto of the tree ensemble.
+    tree_ensemble_config: A `Tensor` of type `string`. Serialized proto of the
+      tree ensemble.
     name: A name for the ensemble variable.
     container: An optional `string`. Defaults to `""`.
 
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index a5951fb7377d48748f5eb578c034176517df7749..e78ec476ab3b43e5eb56a2502008bb8020ae97e0 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -566,9 +566,10 @@ class GradientBoostedDecisionTreeModel(object):
     # Determine if ensemble is colocated with the inputs.
     if self._ensemble_handle.device != input_deps[0].device:
       # Create a local ensemble and get its local stamp.
-      with ops.name_scope("local_ensemble", "TreeEnsembleVariable") as name:
+      with ops.name_scope("local_ensemble", "TreeEnsembleVariable"):
         local_ensemble_handle = (
-            gen_model_ops.decision_tree_ensemble_resource_handle_op(name=name))
+            gen_model_ops.decision_tree_ensemble_resource_handle_op(
+                self._ensemble_handle.op.name + "/local_ensemble"))
         create_op = gen_model_ops.create_tree_ensemble_variable(
             local_ensemble_handle, stamp_token=-1, tree_ensemble_config="")
         with ops.control_dependencies([create_op]):
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 92068e88a76cb8bfdd394c1093347a8fb8a63449..7e45d0b2cecefa4bdec77d6cf7cfca7dba04db9c 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -43,7 +43,7 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keepdims=True)
+      math_ops.squared_difference(predictions, label), 1, keepdims=True)
   return loss
 
 
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index 220e981618b7c0bfb1e4e98c087d83b451b9b3cf..1ad40aca2880940c78d746674c7378ff0427c057 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -166,7 +166,7 @@ def per_example_squared_loss(labels, weights, predictions):
     update_op: An update operation to update the loss's internal state.
   """
   unweighted_loss = math_ops.reduce_sum(
-      math_ops.square(predictions - labels), 1, keepdims=True)
+      math_ops.squared_difference(predictions, labels), 1, keepdims=True)
 
   return unweighted_loss * weights, control_flow_ops.no_op()
 
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
index 94aeb2c7bb48c6eddb6c7894f8bf6f1567470113..0fe57c0a4e8375cc7ec7aca9553bded87e238b33 100644
--- a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -34,7 +34,7 @@ class DecisionTreeEnsembleResource : public StampedResource {
             protobuf::Arena::CreateMessage<
                 boosted_trees::trees::DecisionTreeEnsembleConfig>(&arena_)) {}
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("GTFlowDecisionTreeEnsemble[size=",
                            decision_tree_ensemble_->trees_size(), "]");
   }
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
index fdaaae7f472c8f564ab45a8366d3746cbf1158ee..574e3065e7f46049815897ef73e44d33f0d23f0f 100644
--- a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -43,7 +43,7 @@ class QuantileStreamResource : public StampedResource {
     set_stamp(stamp_token);
   }
 
-  string DebugString() override { return "QuantileStreamResource"; }
+  string DebugString() const override { return "QuantileStreamResource"; }
 
   tensorflow::mutex* mutex() { return &mu_; }
 
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 94b7f4f867655bf7fdf94e8488eeae7088c41622..99ed4959fad9699f265183d71a1f3b609d7e6d30 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -51,11 +51,11 @@ from tensorflow.contrib.checkpoint.python.split_dependency import split_dependen
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
 from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
 from tensorflow.python.training.checkpoint_management import CheckpointManager
-from tensorflow.python.training.checkpointable.base import CheckpointableBase
+from tensorflow.python.training.checkpointable.base import Checkpointable as CheckpointableBase
 from tensorflow.python.training.checkpointable.data_structures import List
 from tensorflow.python.training.checkpointable.data_structures import Mapping
 from tensorflow.python.training.checkpointable.data_structures import NoDependency
-from tensorflow.python.training.checkpointable.tracking import Checkpointable
+from tensorflow.python.training.checkpointable.tracking import AutoCheckpointable as Checkpointable
 from tensorflow.python.training.checkpointable.util import capture_dependencies
 from tensorflow.python.training.checkpointable.util import list_objects
 from tensorflow.python.training.checkpointable.util import object_metadata
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index ada41687261ab63286933d01da4e286173042e0c..4e529322c7c76797938468b405cd175609dc0a73 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -2,7 +2,7 @@ licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//tensorflow:internal"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "checkpoint",
@@ -27,17 +27,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "containers_test",
     srcs = ["containers_test.py"],
-    deps = [
+    additional_deps = [
         ":containers",
+        "@six_archive//:six",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/training/checkpointable:base",
         "//tensorflow/python/training/checkpointable:util",
-        "@six_archive//:six",
     ],
 )
 
@@ -53,18 +53,18 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "python_state_test",
     srcs = ["python_state_test.py"],
-    deps = [
+    additional_deps = [
         ":python_state",
+        "//third_party/py/numpy",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/training/checkpointable:util",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -80,10 +80,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "split_dependency_test",
     srcs = ["split_dependency_test.py"],
-    deps = [
+    additional_deps = [
         ":split_dependency",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
@@ -106,10 +106,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "visualize_test",
     srcs = ["visualize_test.py"],
-    deps = [
+    additional_deps = [
         ":visualize",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:resource_variable_ops",
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
index 5418e2605b724edb60878e250d2c50fcc6ff5633..97936d9e9dfd5d6e62fdf8312707a276b63e1267 100644
--- a/tensorflow/contrib/checkpoint/python/containers.py
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -63,7 +63,7 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
       ValueError: If `checkpointable` is not a checkpointable object.
     """
 
-    if not isinstance(checkpointable, checkpointable_lib.CheckpointableBase):
+    if not isinstance(checkpointable, checkpointable_lib.Checkpointable):
       raise ValueError(
           ("Expected a checkpointable value, got %s which does not inherit "
            "from CheckpointableBase.") % (checkpointable,))
diff --git a/tensorflow/contrib/checkpoint/python/containers_test.py b/tensorflow/contrib/checkpoint/python/containers_test.py
index ac85c7be803cd4c2f8ba19d3ef887a3c65a15933..a2d453ec6eb3dcf9aba4c52fe866756a92673c63 100644
--- a/tensorflow/contrib/checkpoint/python/containers_test.py
+++ b/tensorflow/contrib/checkpoint/python/containers_test.py
@@ -52,7 +52,7 @@ class UniqueNameTrackerTests(test.TestCase):
     save_root = util.Checkpoint(slots=slots)
     save_path = save_root.save(checkpoint_prefix)
 
-    restore_slots = tracking.Checkpointable()
+    restore_slots = tracking.AutoCheckpointable()
     restore_root = util.Checkpoint(
         slots=restore_slots)
     status = restore_root.restore(save_path)
@@ -68,7 +68,7 @@ class UniqueNameTrackerTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testExample(self):
-    class SlotManager(tracking.Checkpointable):
+    class SlotManager(tracking.AutoCheckpointable):
 
       def __init__(self):
         self.slotdeps = containers.UniqueNameTracker()
diff --git a/tensorflow/contrib/checkpoint/python/python_state.py b/tensorflow/contrib/checkpoint/python/python_state.py
index 302d5cfb79a08b6adf52ebd44533152c5454eadc..969c90c78871ebff02b360f8f09623df56c9c077 100644
--- a/tensorflow/contrib/checkpoint/python/python_state.py
+++ b/tensorflow/contrib/checkpoint/python/python_state.py
@@ -34,7 +34,7 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
-class NumpyState(base.CheckpointableBase):
+class NumpyState(base.Checkpointable):
   """A checkpointable object whose NumPy array attributes are saved/restored.
 
   Example usage:
@@ -130,7 +130,7 @@ class NumpyState(base.CheckpointableBase):
 
 
 @six.add_metaclass(abc.ABCMeta)
-class PythonStateWrapper(base.CheckpointableBase):
+class PythonStateWrapper(base.Checkpointable):
   """Wraps a Python object for storage in an object-based checkpoint."""
 
   @abc.abstractmethod
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
index 7e77453f3d848c2e321ed2ba66917a742d95459a..3e9700ad74618e24843181d169f3fb39ac96bff6 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
     return self._restore_callback(tensor)
 
 
-class _SplitDependency(checkpointable.CheckpointableBase):
+class _SplitDependency(checkpointable.Checkpointable):
   """Looks like a regular variable while synchronizing save/restores."""
 
   def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index 00a805af25d5d0ea723db5d015fb12bf45c53857..664a4e76ab31bf31c7a57924e4af866f2d746804 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -44,7 +44,7 @@ def _combine_variable_closure(variable):
   return _consume_restore_buffer_fn
 
 
-class SaveTensorSlicesAsDeps(base.CheckpointableBase):
+class SaveTensorSlicesAsDeps(base.Checkpointable):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
@@ -59,14 +59,14 @@ class SaveTensorSlicesAsDeps(base.CheckpointableBase):
       self._track_checkpointable(dep, name=name)
 
 
-class HasRegularDeps(tracking.Checkpointable):
+class HasRegularDeps(tracking.AutoCheckpointable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
     self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
 
 
-class OnlyOneDep(tracking.Checkpointable):
+class OnlyOneDep(tracking.AutoCheckpointable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index 1311063ec023bdaa2588d6f1c826bf900f7dea09..20f8c2b2453a58fdbe5a3587fa6687debd9c06d3 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -27,7 +27,6 @@ tf_kernel_library(
     deps = [
         ":bigquery_table_accessor",
         ":bigquery_table_partition_proto_cc",
-        "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:reader_base",
@@ -79,7 +78,6 @@ tf_kernel_library(
     srcs = ["gcs_config_ops.cc"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/platform/cloud:curl_http_request",
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index df8b48dfc46124d3b9454d92ffb70dbcf1bc4217..60ee1b4b3fd7d0b6afaefcc05effd3bbae00cf2c 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -147,19 +147,19 @@ suitable interface for project configuration and dependency setting.
     *   Go (required if you need ssl support, optional)
     *   NASM/YASM (required by grpc for ssl support, optional)
 2.  Start CMake GUI
-3.  Click on `Browse Source` and direct to the the folder
+3.  Click on `Browse Source` and direct to the folder
     `<tensorflow-source>/tensorflow/contrib/cmake`
 4.  Click on `Browse Build` and spectify a location that you want tensorflow to
     be build
 5.  Click on `Configure`, a new window will be prompted out, specify the
     generator mode for the project generation. For Windows, choose `Visual
     Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then
-    press `Finish`. Wait for a moment, the default project dependecy would
+    press `Finish`. Wait for a moment, the default project dependency would
     automatically generate.
 6.  There are a few options that you can customize your own build. **The setting
-    here is crucial for a sucessful build, please check all items carefully.**
+    here is crucial for a successful build, please check all items carefully.**
 
-    *   `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
+    *   `tensorflow_BUILD_ALL_KERNELS` should always be `on`
     *   `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you
         to test build (optional)
     *   `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't
@@ -278,7 +278,7 @@ suitable interface for project configuration and dependency setting.
     `make -sj<number-of-threads> install`
 
     Where `<number-of-threads>` is the threads used for the compilation, change
-    to any integer less or equal to your computer's maxiumum thread number.
+    to any integer less or equal to your computer's maximum thread number.
 
     Headers are discretely located in the build folders. Tensorflow library can
     be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
index 46a193971c5084523d432065f265fa7a9909f595..6c6a5df7f76723800740a81ccdcb137a0ec33846 100644
--- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -31,17 +31,17 @@ if (systemlib_ABSEIL_CPP)
   message(STATUS "  abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}")
   message(STATUS "  abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}")
 
-  add_custom_target(abseil_cpp)
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+  add_custom_target(abseil_cpp_build)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
 
 else (systemlib_ABSEIL_CPP)
 
   include (ExternalProject)
 
-  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp)
-  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz)
-  set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e)
-  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build)
+  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
+  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp.git)
+  set(abseil_cpp_TAG master)
+  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
 
   if(WIN32)
     if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -49,8 +49,11 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/base/Release/absl_base.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_malloc_internal.lib
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_throw_delegate.lib
+          ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/time/Release/absl_time.lib
           ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib)
     else()
       set(abseil_cpp_STATIC_LIBRARIES
@@ -62,6 +65,7 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/time/absl_time.lib
           ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib)
     endif()
   else()
@@ -74,15 +78,18 @@ else (systemlib_ABSEIL_CPP)
         ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a
         ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a
         ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a
+        ${abseil_cpp_BUILD}/absl/time/libabsl_time.a
         ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
   endif()
 
-  ExternalProject_Add(abseil_cpp
+  ExternalProject_Add(abseil_cpp_build
       PREFIX abseil_cpp
-      URL ${abseil_cpp_URL}
-      URL_HASH ${abseil_cpp_HASH}
+      GIT_REPOSITORY ${abseil_cpp_URL}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
       BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES}
+      BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release
+      COMMAND ${CMAKE_COMMAND} --build . --config Release
       INSTALL_COMMAND ""
       CMAKE_CACHE_ARGS
           -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
@@ -91,8 +98,10 @@ else (systemlib_ABSEIL_CPP)
   )
 
   include_directories(${abseil_cpp_INCLUDE_DIR})
+  message(STATUS ${abseil_cpp_INCLUDE_DIR})
+
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES})
 
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
 
 endif (systemlib_ABSEIL_CPP)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 479609458c64f7c7bd7b3ce6b23aceaa3db17f21..b15143bfc1cd787b156c9d6dd724a17730f0f8fb 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 1.20.1)
+set(nsync_TAG 1.20.2)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 96160568fa79291a7b391761373e1eaf0f70974e..21ae9a08a6bb8f71e5935ddde2d7bb3ed0cd8bbc 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -1,6 +1,9 @@
 # python_sanity_test.py will complain about invalid or missing entries
 # problematic entries can be commented for temporary whitelisting
 tensorflow
+tensorflow/compiler
+tensorflow/compiler/xla
+tensorflow/compiler/xla/service
 tensorflow/core
 tensorflow/core/example
 tensorflow/core/framework
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index d7b2a1339e047aba0a9424a53a63726805e89721..d8d1cc3aa2ca4fff3c950654b7cbd7085c76010c 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -147,7 +147,6 @@ set(tf_proto_text_srcs
     "tensorflow/core/framework/function.proto"
     "tensorflow/core/framework/graph.proto"
     "tensorflow/core/framework/graph_transfer_info.proto"
-    "tensorflow/core/framework/iterator.proto"
     "tensorflow/core/framework/kernel_def.proto"
     "tensorflow/core/framework/log_memory.proto"
     "tensorflow/core/framework/node_def.proto"
@@ -302,8 +301,8 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.h"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*.h"
     "${tensorflow_source_dir}/public/*.h"
 )
 
@@ -317,14 +316,14 @@ file(GLOB_RECURSE tf_core_framework_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*test*.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/loader.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/vacuum.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*test*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/loader.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/vacuum.cc"
 )
 
 # TODO(jart): Why doesn't this work?
 # set_source_files_properties(
-#     ${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/snapfn.cc
+#     ${tensorflow_source_dir}/tensorflow/core/lib/db/snapfn.cc
 #     PROPERTIES COMPILE_FLAGS -DSQLITE_OMIT_LOAD_EXTENSION)
 
 list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8faccf8d55902e6701ebb4ce534b84705304fd5f..1fe8795ddf00232eba5a60a130e0845a6f6a8e17 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -802,6 +802,7 @@ add_custom_command(
       # tensorflow/__init__.py depends on files generated in this step. So, remove it while
       # this step is running since the files aren't there yet.
       COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
 
       # Run create_python_api.py to generate API init files.
       COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE}
diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index f867cd15b67dbd43650d8012b4299845af7200a8..0f1be500f499ebba7e1907de663f8bbfa889bb17 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import function_utils
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -76,10 +77,22 @@ def compile(computation, inputs=None):  # pylint: disable=redefined-builtin
 
       All `Operation`s returned from `computation` will be executed when
       evaluating any of the returned output tensors.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    inputs: A list of inputs or `None` (equivalent to an empty list). Each input
+      can be a nested structure containing values that are convertible to
+      tensors. Note that passing an N-dimension list of compatible values will
+      result in a N-dimention list of scalar tensors rather than a single Rank-N
+      tensors. If you need different behavior, convert part of inputs to tensors
+      with `tf.convert_to_tensor`.
 
   Returns:
-    A list of output tensors.
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
   """
   # pylint: disable=protected-access
   return _compile_internal(computation, inputs)
@@ -245,13 +258,21 @@ def _compile_internal(computation, inputs=None):
   Args:
     computation: A Python function that builds the computation to compile and
       execute.
-    inputs: A list of input tensors or `None` (equivalent to `[]`). Its order
-      should match ordering of computation arguments.
+    inputs: A list of inputs or `None` (equivalent to an empty list). Each input
+      can be a nested structure containing values that are convertible to
+      tensors. Note that passing an N-dimension list of compatible values will
+      result in a N-dimension list of scalar tensors rather than a single Rank-N
+      tensors. If you need different behavior, convert part of inputs to tensors
+      with `tf.convert_to_tensor`.
+
   Returns:
-    A list of output tensors from computation.
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include: 1) None output 2) Single
+    value output 3) Operation-only outputs
   Raises:
     ValueError: If any element in computation outputs is neither an operations
       or a value that can be converted to tensor.
+    ValueError: If computation outputs is non-flat and contains any Operations.
     TypeError: If `inputs` is not a list or tuple.
   """
   if inputs is None:
@@ -260,17 +281,10 @@ def _compile_internal(computation, inputs=None):
   if not isinstance(inputs, collections.Sequence):
     raise TypeError('inputs must be a list')
 
+  # Flatten inputs.
+  flat_inputs = nest.flatten(inputs)
   # Converts inputs to Tensors.
-  inputs = [ops.convert_to_tensor(x) for x in inputs]
-  input_arity = len(inputs)
-
-  arg_error = check_function_argument_count(
-      computation, input_arity, infeed_queue=None)
-  if arg_error is not None:
-    raise TypeError(
-        'Supplied computation cannot be called with the specified inputs. You '
-        'specified %d inputs: %s, but the computation needs %s' %
-        (input_arity, str([i.name for i in inputs]), arg_error))
+  flat_inputs = [ops.convert_to_tensor(x) for x in flat_inputs]
 
   cluster_name = ops.get_default_graph().unique_name('cluster')
   pivot = control_flow_ops.no_op(name=cluster_name + '/pivot')
@@ -280,11 +294,15 @@ def _compile_internal(computation, inputs=None):
 
     # Add identity ops so even unused inputs are 'consumed' by the
     # computation.
-    computation_inputs = [
+    flat_inputs = [
         array_ops.identity(x, name='input_{}'.format(i))
-        for i, x in enumerate(inputs)
+        for i, x in enumerate(flat_inputs)
     ]
 
+    # Re-pack flat_inputs in same structure as 'inputs'.
+    computation_inputs = nest.pack_sequence_as(
+        structure=inputs, flat_sequence=flat_inputs)
+
     # Only resource variables work inside an XLA computation, so turn on
     # resource variables for the computation.
     vscope = variable_scope.get_variable_scope()
@@ -297,66 +315,166 @@ def _compile_internal(computation, inputs=None):
     # Restore variable scope after computation.
     vscope.set_use_resource(saved_use_resource)
 
-    # If the computation returns `None`, make it an empty tuple.
-    if outputs is None:
-      outputs = tuple()
-    # If the computation only returned one value, make it a tuple.
-    if not isinstance(outputs, collections.Sequence):
-      outputs = (outputs,)
-
-    # Append `no_op` here so that return value of this function always contains
-    # at least one op that can trigger XlaLaunch node.
-    outputs += (control_flow_ops.no_op(),)
-    try:
-      outputs = [
-          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-          for o in outputs
-      ]
-    except Exception as e:
-      raise ValueError(
-          'XLA computation function return values must all either be Operations'
-          ' or convertible to Tensors. Got error: "%s"' % str(e))
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          'XLA computation function must return zero or more Tensor values '
-          'followed by zero or more Operations.')
-    output_arity = len(output_tensors)
-
-    new_output_tensors = []
-    for t in output_tensors:
-      with ops.device(t.device if t.device else ''):
-        new_output_tensors.append(array_ops.identity(t))
+    outputs_is_flat = is_flat(outputs)
+    if outputs_is_flat:
+      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
+    else:
+      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
 
-    output_tensors = new_output_tensors
     context.ExitResult(output_tensors)
   finally:
     context.report_unsupported_operations()
     context.Exit()
 
-  outputs = [
-      xla_ops.xla_cluster_output(output_tensors[i], name='output{}'.format(i))
-      for i in xrange(output_arity)
+  # When XLA computation returns only operations and no tensors, a NoOp
+  # dependent on the operations in outputs is returned. Otherwise final
+  # outputs would be empty and there is no way to trigger returned
+  # operations.
+  if not output_tensors:
+    return control_flow_ops.group(control_deps, name='output_0')
+
+  output_tensors = [
+      xla_ops.xla_cluster_output(o, name='output{}'.format(i))
+      for i, o in enumerate(output_tensors)
   ]
 
-  with ops.control_dependencies(output_operations):
-    if output_arity == 0:
-      # When XLA computation returns only operations and no tensors, a NoOp
-      # dependent on the operations in outputs is returned. Otherwise final
-      # outputs would be empty and there is no way to trigger returned
-      # operations.
-      return control_flow_ops.no_op(name='output_0')
-    else:
-      # Wraps the outputs in identity operators that carries control
-      # dependencies.
-      return [
-          array_ops.identity(outputs[i], name='output_%d' % i)
-          for i in xrange(output_arity)
-      ]
+  with ops.control_dependencies(control_deps):
+    # Wraps the outputs in identity operators that carries control
+    # dependencies.
+    output_tensors = [
+        array_ops.identity(o, name='output_%d' % i)
+        for i, o in enumerate(output_tensors)
+    ]
+
+  # If `computation` returned non-flat output structure, pack output tensors
+  # back into same structure.
+  if not outputs_is_flat:
+    output_tensors = nest.pack_sequence_as(
+        structure=outputs, flat_sequence=output_tensors)
+
+  return output_tensors
+
+
+def is_flat(outputs):
+  """Checks if outputs is a flat structure.
+
+    Following structures and values are considered flat:
+    1) None
+    2) A single object
+    3) A list or tuple of Tensors/Operations
+
+    The only structures that this function understands are sequences and
+    dictionaries.  E.g. this means that if outputs contains a single
+    user-defined Object, it is considered to be flat. Errors are raised later on
+    if that Object cannot be converted to a Tensor.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    A boolean indicates whether outputs is flat.
+  """
+  # If outputs is a list or tuple, check if it has any nested structure. If
+  # there is, then outputs is non-flat.
+  if isinstance(outputs, collections.Sequence):
+    for o in outputs:
+      if isinstance(o, collections.Sequence) or isinstance(o, dict):
+        return False
+
+  # If outputs is a dict, it is non-flat.
+  if isinstance(outputs, dict):
+    return False
+
+  # Getting here means either outputs itself is a single non-structured value
+  # or it is a flat list of single non-structured values.
+  return True
+
+
+def _postprocess_flat_outputs(outputs):
+  """Validates flat outputs and adds back device assignments.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    Tensors and Operations extracted from outputs.
+  """
+  # Following code segment is to preserve legacy behavior. Previously we only
+  # supported flat outputs and thus for consistency it was nice to convert even
+  # single element into a tuple. But now that we support arbitrary output
+  # structure, this is no longer necessary.
+  # TODO(b/121383831): Migrate all legacy use cases and delete this special
+  # case.
+  # If the computation returns `None`, make it an empty tuple.
+  if outputs is None:
+    outputs = tuple()
+  # If the computation only returned one value, make it a tuple.
+  if not isinstance(outputs, collections.Sequence):
+    outputs = (outputs,)
+
+  # Append `no_op` here so that return value of this function always contains
+  # at least one op that can trigger XlaLaunch node.
+  outputs += (control_flow_ops.no_op(),)
+  try:
+    outputs = [
+        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+        for o in outputs
+    ]
+  except Exception as e:
+    raise ValueError(
+        'XLA computation function return values must all either be Operations'
+        ' or convertible to Tensors. Got error: "%s"' % str(e))
+
+  # Separates the returned Operations and Tensors.
+  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
+
+  if outputs != output_tensors + output_operations:
+    raise ValueError(
+        'XLA computation function must return zero or more Tensor values '
+        'followed by zero or more Operations.')
+
+  new_output_tensors = []
+  for t in output_tensors:
+    with ops.device(t.device if t.device else ''):
+      new_output_tensors.append(array_ops.identity(t))
+
+  return new_output_tensors, output_operations
+
+
+def _postprocess_non_flat_outputs(outputs):
+  """Validates non-flat outputs and adds back device assignments.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    Tensors extracted from outputs and an empty list because Operations are not
+    allowed in non-flat outputs..
+  """
+  # Convert all non-Operation outputs to Tensors.
+  new_output_tensors = []
+  for o in nest.flatten(outputs):
+    if isinstance(o, ops.Operation):
+      raise ValueError(
+          'xla.compile does not support Operation as return value in non-flat '
+          'output structure. You can set returned Operations as control '
+          'dependencies of returned Tensors so Operations are triggered when '
+          'Tensors are evaluated. Operation found: "%s"' % o.name)
+
+    try:
+      o = ops.convert_to_tensor(o)
+    except Exception as e:
+      raise ValueError(
+          'XLA computation function return values must all either be '
+          'Operations or convertible to Tensors. Got error: "%s"' % str(e))
+
+    # Makes sure even pass-through inputs/outputs are touched in compile
+    # context by creating an Identity node inside compile context.
+    with ops.device(o.device if o.device else ''):
+      new_output_tensors.append(array_ops.identity(o))
+
+  return new_output_tensors, []
 
 
 @contextlib.contextmanager
diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD
index eee4329acbeb38c9f37f79227aeb3acd46dce5e7..619153df67c90cea5a5082a411972948bac5fe90 100644
--- a/tensorflow/contrib/constrained_optimization/BUILD
+++ b/tensorflow/contrib/constrained_optimization/BUILD
@@ -42,11 +42,6 @@ py_test(
     name = "candidates_test",
     srcs = ["python/candidates_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        # TODO(b/121223093): Re-enable this test after fixing "Distribution
-        # should match known solution" errors.
-        "no_mac",
-    ],
     deps = [
         ":constrained_optimization",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
index a4c49d48bc5c763489215261a909573af0f19055..280e9acd88638a9385bfd9128ba6d3739879aab2 100644
--- a/tensorflow/contrib/constrained_optimization/python/candidates_test.py
+++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
@@ -52,12 +52,12 @@ class CandidatesTest(test.TestCase):
     distribution = candidates.find_best_candidate_distribution(
         objective_vector, constraints_matrix)
     # Verify that the solution is a probability distribution.
-    self.assertTrue(np.all(distribution >= 0))
+    self.assertTrue(np.all(distribution >= -1e-6))
     self.assertAlmostEqual(np.sum(distribution), 1.0)
     # Verify that the solution satisfies the constraints.
     maximum_constraint_violation = np.amax(
         np.dot(constraints_matrix, distribution))
-    self.assertLessEqual(maximum_constraint_violation, 0)
+    self.assertLessEqual(maximum_constraint_violation, 1e-6)
     # Verify that the solution matches that which we expect.
     expected_distribution = np.array([0.37872711, 0.62127289, 0, 0])
     self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 7e1b4062ce435f3ab4216e90b4f5fcbab984c1dc..ca92c31236a7a3882415834eb32a994a120b6d2d 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -1023,7 +1023,7 @@ class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase):
           outputs_v, output_state_v = sess.run(
               [outputs, output_state],
               feed_dict={cell_inputs: inference_input})
-          self.assertAllClose(cudnn_outputs_v, outputs_v, atol=2e-5, rtol=2e-5)
+          self.assertAllClose(cudnn_outputs_v, outputs_v, atol=1e-4, rtol=2e-4)
           (cudnn_output_h_v,) = cudnn_output_states_v
           self.assertAllClose(cudnn_output_h_v, output_state_v, atol=2e-5,
                               rtol=2e-5)
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 1facc83972faf229f243af5bc534bcb98aff5440..f36e8d5022bc7e3f8268a161089153e5510dffc6 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -837,7 +837,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
       checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
     assert len(biases) == len(weights)
     for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
-      cell = checkpointable_lib.Checkpointable()
+      cell = checkpointable_lib.AutoCheckpointable()
       checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
       cell.bias = bias
       cell.kernel = kernel
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 8a8dc159ade6f2a4a9b5ec29055ea4848492b29f..dbcaf8185fb7a9d2bcf22376439c0ebd49accb1a 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -43,28 +43,19 @@ the workers.
 
 Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy` with [tf.keras] (https://www.tensorflow.org/guide/keras).
 
-Take a very simple model consisting of a single layer:
+Let's define a simple input dataset for training this model. Note that currently we require using
+[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
+with `DistributionStrategy`.
 
 ```python
 import tensorflow as tf
 from tensorflow import keras
 
-inputs = tf.keras.layers.Input(shape=(1,))
-predictions = tf.keras.layers.Dense(1)(inputs)
-model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
-```
-
-Let's also define a simple input dataset for training this model. Note that currently we require using
-[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
-with `DistributionStrategy`.
-
-```python
 features = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10)
 labels = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10)
 train_dataset = tf.data.Dataset.zip((features, labels))
 ```
 
-
 To distribute this Keras model on multiple GPUs using `MirroredStrategy` we
 first instantiate a `MirroredStrategy` object.
 
@@ -72,14 +63,17 @@ first instantiate a `MirroredStrategy` object.
 distribution = tf.contrib.distribute.MirroredStrategy()
 ```
 
-We then compile the Keras model and pass the `MirroredStrategy` object in the
-`distribute` argument (apart from other usual arguments like `loss` and
-`optimizer`).
+Take a very simple model consisting of a single layer. We need to create and compile
+the model under the distribution strategy scope.
 
 ```python
-model.compile(loss='mean_squared_error',
-              optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2),
-              distribute=distribution)
+with distribution.scope():
+  inputs = tf.keras.layers.Input(shape=(1,))
+  predictions = tf.keras.layers.Dense(1)(inputs)
+  model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
+
+  model.compile(loss='mean_squared_error',
+                optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2))
 ```
 
 To train the model we call Keras `fit` API using the input dataset that we
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index 8ec73654e30e4967f318c558ba94301e84a206e4..59d76f5d1c817d7f2cc8ad285b9fb517fe994a81 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -30,12 +30,13 @@ from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
+from tensorflow.contrib.distribute.python.tpu_strategy import initialize_tpu_system
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
 from tensorflow.python.distribute.cross_device_ops import *
 from tensorflow.python.distribute.distribute_config import DistributeConfig
 from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server
-from tensorflow.python.training.distribute import *
-from tensorflow.python.training.distribution_strategy_context import *
+from tensorflow.python.distribute.distribute_lib import *
+from tensorflow.python.distribute.distribution_strategy_context import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -58,11 +59,14 @@ _allowed_symbols = [
     'StandardSingleLossStep',
     'ReplicaContext',
     'TPUStrategy',
+    'initialize_tpu_system',
     'get_cross_replica_context',
     'get_distribution_strategy',
     'get_loss_reduction',
     'get_replica_context',
+    'get_strategy',
     'has_distribution_strategy',
+    'has_strategy',
     'in_cross_replica_context',
     'require_replica_context',
     'run_standard_tensorflow_server',
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index d2fb878f96f55200d870447b45f3d0a37c6b0f86..1b455a4e644417561a7556e66465f2cb093776d5 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -1,5 +1,7 @@
 # Implementation of a prototype TF distributed computation library.
 
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
+load("//tensorflow/core:platform/default/distribute.bzl", "distribute_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
@@ -13,8 +15,18 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-# TODO(priyag): Figure out testonly issues that are preventing us from
-# including our tests in pip for now.
+py_library(
+    name = "distribute_test_lib_pip",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":combinations",
+        ":keras_correctness_test_lib",
+        ":keras_test_lib",
+        ":multi_worker_test_base",
+        ":single_loss_example",
+        ":strategy_test_lib",
+    ],
+)
 
 cuda_py_test(
     name = "values_test",
@@ -22,25 +34,36 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         ":mirrored_strategy",
-        ":multi_worker_test_base",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/distribute:device_util",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
     ],
-    tags = [
-        "no_pip",
+)
+
+cuda_py_test(
+    name = "input_lib_test",
+    srcs = ["input_lib_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":mirrored_strategy",
+        ":multi_worker_test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
     ],
 )
 
@@ -50,8 +73,8 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:mirrored_strategy",
-        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -60,18 +83,10 @@ py_library(
     srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":mirrored_strategy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/distribute:cross_device_ops",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:parameter_server_strategy",
         "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
     ],
 )
 
@@ -104,7 +119,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -118,6 +132,8 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
@@ -138,7 +154,9 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:cross_device_utils",
+        "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
@@ -146,12 +164,8 @@ py_library(
 
 py_library(
     name = "strategy_test_lib",
-    testonly = 1,
     srcs = ["strategy_test_lib.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -164,17 +178,14 @@ py_library(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
     ],
 )
 
 py_library(
     name = "combinations",
-    testonly = 1,
     srcs = ["combinations.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":mirrored_strategy",
         ":one_device_strategy",
@@ -186,6 +197,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras/optimizer_v2",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -193,9 +205,6 @@ py_library(
 py_test(
     name = "combinations_test",
     srcs = ["combinations_test.py"],
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":combinations",
         "//tensorflow/python/eager:test",
@@ -206,9 +215,6 @@ py_test(
     name = "one_device_strategy_test",
     srcs = ["one_device_strategy_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":one_device_strategy",
         ":strategy_test_lib",
@@ -242,18 +248,13 @@ cuda_py_test(
     tags = [
         "guitar",
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
 py_library(
     name = "multi_worker_test_base",
-    testonly = 1,
     srcs = ["multi_worker_test_base.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -288,6 +289,8 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:values",
     ],
@@ -320,14 +323,16 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
-py_library(
-    name = "minimize_loss_test_lib",
-    testonly = 1,
+distribute_py_test(
+    name = "minimize_loss_test",
     srcs = ["minimize_loss_test.py"],
+    main = "minimize_loss_test.py",
+    tags = [
+        "multi_and_single_gpu",
+    ],
     deps = [
         ":combinations",
         ":mirrored_strategy",
@@ -347,18 +352,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "minimize_loss_test",
-    srcs = ["minimize_loss_test.py"],
-    additional_deps = [
-        ":minimize_loss_test_lib",
-    ],
-    tags = [
-        "multi_and_single_gpu",
-        "no_pip",
-    ],
-)
-
 cuda_py_test(
     name = "moving_averages_test",
     srcs = ["moving_averages_test.py"],
@@ -372,9 +365,6 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 cuda_py_test(
@@ -392,7 +382,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -415,7 +404,6 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
-        "no_pip",
         "tf_integration_test",
     ],
 )
@@ -429,7 +417,6 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
-        "no_pip",
         "tf_integration_test",
     ],
 )
@@ -459,7 +446,6 @@ cuda_py_test(
     shard_count = 48,
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
         # TODO(b/118768923): Re-enable {a,m,t}san test.
         "noasan",
         "nomsan",
@@ -481,10 +467,13 @@ py_library(
     ],
 )
 
-py_library(
-    name = "step_fn_test_lib",
-    testonly = 1,
+distribute_py_test(
+    name = "step_fn_test",
     srcs = ["step_fn_test.py"],
+    main = "step_fn_test.py",
+    tags = [
+        "multi_and_single_gpu",
+    ],
     deps = [
         ":combinations",
         ":single_loss_example",
@@ -497,18 +486,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "step_fn_test",
-    srcs = ["step_fn_test.py"],
-    additional_deps = [
-        ":step_fn_test_lib",
-    ],
-    tags = [
-        "multi_and_single_gpu",
-        "no_pip",
-    ],
-)
-
 py_library(
     name = "monitor",
     srcs = ["monitor.py"],
@@ -536,7 +513,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -553,9 +529,6 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 cuda_py_test(
@@ -577,13 +550,11 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
 py_library(
     name = "keras_test_lib",
-    testonly = 1,
     srcs = [
         "keras_backward_compat_test.py",
         "keras_test.py",
@@ -602,43 +573,50 @@ py_library(
     ],
 )
 
-cuda_py_test(
+distribute_py_test(
     name = "keras_test",
     srcs = ["keras_test.py"],
-    additional_deps = [
-        ":keras_test_lib",
-    ],
+    main = "keras_test.py",
     shard_count = 16,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
+    deps = [
+        ":keras_test_lib",
+    ],
 )
 
 # TODO(b/121200287): Remove this in 2.0
-cuda_py_test(
+distribute_py_test(
     name = "keras_backward_compat_test",
     srcs = ["keras_backward_compat_test.py"],
-    additional_deps = [
-        ":keras_test_lib",
-    ],
-    shard_count = 16,
+    full_precision = True,
+    main = "keras_backward_compat_test.py",
+    shard_count = 31,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
+    deps = [
+        ":keras_test_lib",
+    ],
 )
 
 py_library(
     name = "keras_correctness_test_lib",
-    testonly = 1,
-    srcs = ["keras_correctness_test.py"],
+    srcs = [
+        "keras_correctness_test_base.py",
+        "keras_dnn_correctness_test.py",
+        "keras_embedding_model_correctness_test.py",
+        "keras_image_model_correctness_test.py",
+        "keras_lstm_model_correctness_test.py",
+        "keras_stateful_lstm_model_correctness_test.py",
+    ],
     deps = [
         ":combinations",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
@@ -653,13 +631,95 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "keras_correctness_test",
-    srcs = ["keras_correctness_test.py"],
-    additional_deps = [
+distribute_py_test(
+    name = "keras_dnn_correctness_test",
+    size = "medium",
+    srcs = ["keras_dnn_correctness_test.py"],
+    full_precision = True,
+    main = "keras_dnn_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 19,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
         ":keras_correctness_test_lib",
     ],
-    shard_count = 16,
+)
+
+distribute_py_test(
+    name = "keras_image_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_image_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_image_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_embedding_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_embedding_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_embedding_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_lstm_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_lstm_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_lstm_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_stateful_lstm_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_stateful_lstm_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_stateful_lstm_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # TODO(b/117919883): Fix python error.
@@ -667,12 +727,18 @@ cuda_py_test(
         "no_windows_gpu",
         "notsan",
     ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
 )
 
-py_library(
-    name = "metrics_v1_test_lib",
-    testonly = 1,
+distribute_py_test(
+    name = "metrics_v1_test",
     srcs = ["metrics_v1_test.py"],
+    main = "metrics_v1_test.py",
+    tags = [
+        "multi_and_single_gpu",
+    ],
     deps = [
         ":combinations",
         "//tensorflow/python:math_ops",
@@ -684,18 +750,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "metrics_v1_test",
-    srcs = ["metrics_v1_test.py"],
-    additional_deps = [
-        ":metrics_v1_test_lib",
-    ],
-    tags = [
-        "multi_and_single_gpu",
-        "no_pip",
-    ],
-)
-
 cuda_py_test(
     name = "warm_starting_util_test",
     size = "medium",
@@ -710,7 +764,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -729,6 +782,25 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
+    ],
+)
+
+tf_xla_py_test(
+    name = "checkpointing_test",
+    srcs = ["checkpointing_test.py"],
+    disabled_backends = [
+        # Only makes sense on TPUs
+        "cpu",
+        "gpu",
+        "cpu_ondemand",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":tpu_strategy",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/checkpointable:util",
     ],
 )
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index 31bd0e996a247a2fc01405fb3b8172a40853d698..3ef8b9574a36730dcc1d8fd42b6c7f364d84bbed 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -71,7 +71,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
 
     with ops.Graph().as_default() as g, distribution.scope():
       if in_replica_mode:
-        distribution.call_for_each_replica(init_and_verify, args=[g])
+        distribution.extended.call_for_each_replica(init_and_verify, args=[g])
       else:
         init_and_verify(g)
 
diff --git a/tensorflow/contrib/distribute/python/checkpointing_test.py b/tensorflow/contrib/distribute/python/checkpointing_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa5b9f57b8a5bc12ee94399ec1fc5a55177a5b5d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/checkpointing_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam as adam_v1
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+
+
+class NonLayerCheckpointable(tracking.AutoCheckpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class Subclassed(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class TrainingCheckpointTests(xla_test.XLATestCase):
+
+  def testEagerTPUDistributionStrategy(self):
+    self.skipTest("b/121387144")
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    def _train_fn(optimizer, model):
+      input_value = constant_op.constant([[3.]])
+      optimizer.minimize(
+          functools.partial(model, input_value),
+          global_step=root.optimizer_step)
+
+    for training_continuation in range(3):
+      strategy = tpu_strategy.TPUStrategy()
+      with strategy.scope():
+        model = Subclassed()
+        optimizer = adam_v1.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            optimizer_step=training_util.get_or_create_global_step())
+        root.restore(checkpoint_management.latest_checkpoint(
+            checkpoint_directory))
+
+        for _ in range(num_training_steps):
+          strategy.extended.call_for_each_replica(
+              functools.partial(_train_fn, optimizer, model))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer_step.numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index e6bbf0c308a6abb6bbddb5ca9291a641ad518501..aa4d82b4d0c0dffc66115346d5f82a9d64bcfa56 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -26,9 +26,12 @@ from tensorflow.python.distribute import cross_device_ops as cross_device_ops_li
 from tensorflow.python.distribute import cross_device_utils
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
@@ -85,9 +88,11 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
     else:
       local_devices = ("/device:CPU:0",)
     self._worker_device = device_util.canonicalize("/device:CPU:0")
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
 
     self._collective_keys = cross_device_utils.CollectiveKeys()
     self._initialize_local(local_devices)
+    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
     self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
         num_workers=self._num_workers,
         num_gpus_per_worker=num_gpus_per_worker,
@@ -120,6 +125,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
                                                 task_id)
 
     self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
     if num_gpus_per_worker:
       local_devices = tuple(
           "%s/device:GPU:%d" % (self._worker_device, i)
@@ -130,7 +136,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
 
     self._collective_keys = cross_device_utils.CollectiveKeys()
     self._initialize_local(local_devices)
-    self._input_workers = values.InputWorkers(
+    self._input_workers = input_lib.InputWorkers(
         self._device_map, [(self._worker_device, self.worker_devices)])
     self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
         num_workers=self._num_workers,
@@ -156,19 +162,23 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
     if colocate_with is None:
       device_map = self._device_map
       logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
     else:
       device_map = colocate_with.device_map
       logical_device = colocate_with.logical_device
-    group_size = device_map.num_replicas_in_graph * self._num_workers
-    group_key = self._collective_keys.get_group_key(self.worker_devices)
 
     def _real_mirrored_creator(devices, *args, **kwargs):
       """Creates one MirroredVariable on the current worker."""
-      value_list = []
       unique_var_name = ops.get_default_graph().unique_name(
           kwargs["name"], mark_as_used=False).rstrip("/")
+      # pylint: disable=protected-access
       collective_instance_key = self._collective_keys.get_instance_key(
           key_id=unique_var_name)
+      # Only the first device participles in the broadcast of initial values.
+      group_key = self._collective_keys.get_group_key([devices[0]])
+      group_size = self._num_workers
       if "initial_value" not in kwargs:
         raise ValueError("Initial value must be specified.")
       initial_value = kwargs["initial_value"]
@@ -177,9 +187,33 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
       else:
         initial_value_fn = lambda: initial_value
 
+      value_list = []
       for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
+        with ops.init_scope(), ops.device(d):
+          if i == 0:
+            # The initial value fn makes sure variables all initialized to
+            # same values. The first device of the chief worker will send their
+            # variable values to other workers.
+            def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
+              with ops.device(device):
+                initial_value = initial_value_fn()
+                assert not callable(initial_value)
+                initial_value = ops.convert_to_tensor(initial_value)
+
+                assert index == 0, index
+                if self._num_workers > 1:
+                  if self._is_chief:
+                    bcast_send = collective_ops.broadcast_send(
+                        initial_value, initial_value.shape, initial_value.dtype,
+                        group_size, group_key, collective_instance_key)
+                    with ops.control_dependencies([bcast_send]):
+                      return array_ops.identity(initial_value)
+                  else:
+                    return collective_ops.broadcast_recv(
+                        initial_value.shape, initial_value.dtype, group_size,
+                        group_key, collective_instance_key)
+                return initial_value
+          else:
             # Give replicas meaningful distinct names:
             var0name = value_list[0].name.split(":")[0]
             # We append a / to variable names created on replicas with id > 0 to
@@ -187,30 +221,22 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
             # name as the absolute name of the variable.
             kwargs["name"] = "%s/replica_%d/" % (var0name, i)
 
-          # The initial value fn makes sure variables all initialized to
-          # same values. The first device of the chief worker will send their
-          # variable values to other devices and other workers.
-          def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
-            with ops.device(device):
-              initial_value = initial_value_fn()
-              assert not callable(initial_value)
-              initial_value = ops.convert_to_tensor(initial_value)
-
-              if self._is_chief and index == 0:
-                bcast_send = collective_ops.broadcast_send(
-                    initial_value, initial_value.shape, initial_value.dtype,
-                    group_size, group_key, collective_instance_key)
-                with ops.control_dependencies([bcast_send]):
-                  return array_ops.identity(initial_value)
-              else:
-                return collective_ops.broadcast_recv(
-                    initial_value.shape, initial_value.dtype, group_size,
-                    group_key, collective_instance_key)
+            # Variables on non-first replica get initial values from the
+            # variables created on the first device of each worker.
+            def _overridden_initial_value_fn(device=d, index=i):
+              assert index > 0
+              with ops.device(device):
+                if context.executing_eagerly():
+                  return array_ops.identity(value_list[0].value())
+                else:
+                  return array_ops.identity(value_list[0].initial_value)
 
           kwargs["initial_value"] = _overridden_initial_value_fn
-
           with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            v = next_creator(*args, **kwargs)
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
 
           if i == 0:
             actual_var_name = v.name.split(":")[0]
@@ -222,19 +248,12 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
 
     # pylint: disable=protected-access
     return mirrored_strategy._create_mirrored_variable(
-        device_map, logical_device, _real_mirrored_creator, *args, **kwargs)
-
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    # TODO(yuefengz): shard the dataset.
-    worker_index = 0
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._input_workers, worker_index,
-        prefetch_on_device=True)
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
 
   def _make_dataset_iterator(self, dataset):
-    return values.DatasetIterator(dataset, self._input_workers,
-                                  self._num_replicas_in_sync)
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
 
   def _make_input_fn_iterator(
       self,
@@ -251,7 +270,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
         input_pipeline_id=input_pipeline_id,
         num_replicas_in_sync=self._num_replicas_in_sync)
 
-    return values.InputFunctionIterator(
+    return input_lib.InputFunctionIterator(
         input_fn, self._input_workers, [input_context])
 
   def _configure(self,
@@ -345,4 +364,11 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
-    return False
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 0fb672dded7624e798592d2f5c01945aa830021e..9b6236fd9f89ec30c1234c846930a05d9c32e99d 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -123,7 +123,7 @@ class CollectiveAllReduceStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=[one])
+        g_v = d.extended.call_for_each_replica(grad_fn, args=[one])
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
@@ -135,7 +135,7 @@ class CollectiveAllReduceStrategyTestBase(
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
-                d.update(v, update, g, grouped=False)):
+                d.extended.update(v, update, args=(g,), group=False)):
               after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
@@ -192,6 +192,7 @@ class CollectiveAllReduceStrategyTestBase(
       image = random_ops.random_uniform([2, 28, 28])
       label = random_ops.random_uniform([2, 1], maxval=10, dtype=dtypes.int32)
       logits = model(image, training=True)
+      # TODO(yuefengz): make loss a callable for eager mode.
       loss = losses.sparse_softmax_cross_entropy(labels=label, logits=logits)
       optimizer = adam.AdamOptimizer(learning_rate=1e-4)
       train_op = optimizer.minimize(loss,
@@ -202,7 +203,7 @@ class CollectiveAllReduceStrategyTestBase(
          self.cached_session(config=config,
                              target=master_target) as sess:
       with d.scope():
-        train_op = d.call_for_each_replica(model_fn)
+        train_op = d.extended.call_for_each_replica(model_fn)
         train_op = d.group(d.unwrap(train_op))
 
       sess.run(variables.global_variables_initializer())
@@ -225,7 +226,7 @@ class CollectiveAllReduceStrategyTestBase(
                 1.0, 10.0, dtype=dtypes.float32))
         return array_ops.identity(x)
 
-      x = distribution.call_for_each_replica(model_fn)
+      x = distribution.extended.call_for_each_replica(model_fn)
       reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x)
       x = distribution.unwrap(x)[0]
 
@@ -397,28 +398,38 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
 
-class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
-                                       strategy_test_lib.DistributionTestBase,
-                                       parameterized.TestCase):
+class LocalCollectiveAllReduceStrategy(
+    CollectiveAllReduceStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
-  def testMinimizeLossGraph(self, num_gpus=2):
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph', 'eager'], num_gpus=[2, 4], required_gpus=2))
+  def testMinimizeLoss(self, num_gpus):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    self._test_minimize_loss_graph(None, None, num_gpus)
+    if context.executing_eagerly():
+      strategy, _, _ = self._get_test_object(None, None, num_gpus)
+      self._test_minimize_loss_eager(strategy)
+    else:
+      self._test_minimize_loss_graph(None, None, num_gpus)
 
-  def testComplexModel(self, num_gpus=2):
-    # Collective ops doesn't support strategy with one device.
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[2, 4], required_gpus=2))
+  def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     self._test_complex_model(None, None, num_gpus)
 
-  def testMakeInputFnIterator(self, num_gpus=2):
-    # Collective ops doesn't support strategy with one device.
-    if context.num_gpus() < num_gpus:
-      self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+  @combinations.generate(
+      combinations.combine(mode=['graph', 'eager'], required_gpus=2))
+  def testMakeInputFnIterator(self):
+    num_gpus = 2
+    dataset_fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
+    expected_values = [range(i, i + num_gpus) for i in range(0, 10, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
         dataset_fn,
@@ -428,6 +439,49 @@ class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
     self._test_input_fn_iterator(None, None, num_gpus,
                                  input_fn, expected_values)
 
+  def testAllReduceSum(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean_gradient_tape(distribution)
+
+  def testNumpyIterator(self):
+    num_gpus = 2
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    strategy, _, _ = self._get_test_object(None, None, num_gpus)
+    self._test_numpy_iterator(strategy)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 4a934953ad2d4c6ecbe2bde2333a49bf8fd72821..db79d6c0d8ac4590b0e16598a3fde6f89d4759a6 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -46,16 +46,21 @@ import unittest
 from absl.testing import parameterized
 import six
 
-from tensorflow.contrib.cluster_resolver import TPUClusterResolver
+from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
 from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
 from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.contrib.tpu.python.tpu import device_assignment as device_assignment_lib
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_keras_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_keras_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_keras_v2
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
@@ -321,6 +326,23 @@ class NamedDistribution(object):
     return self._required_tpu
 
 
+def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs):
+  def _create_tpu_strategy():
+    resolver = cluster_resolver.TPUClusterResolver("")
+    topology = tpu_lib.initialize_tpu_system(resolver)
+    device_assignment = None
+    if use_single_core:
+      device_assignment = device_assignment_lib.DeviceAssignment(
+          topology, core_assignment=device_assignment_lib.
+          SINGLE_CORE_ASSIGNMENT)
+
+    strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
+                                   device_assignment=device_assignment,
+                                   **kwargs)
+    return strategy
+  return _create_tpu_strategy
+
+
 # pylint: disable=g-long-lambda
 default_strategy = NamedDistribution(
     "Default",
@@ -330,13 +352,31 @@ one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
 tpu_strategy = NamedDistribution(
-    "TPU", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=2),
+    "TPU", _get_tpu_strategy_creator(steps_per_run=2),
     required_tpu=True)
 tpu_strategy_one_step = NamedDistribution(
-    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=1),
+    "TPUOneStep", _get_tpu_strategy_creator(steps_per_run=1),
+    required_tpu=True)
+tpu_strategy_loop_on_device_one_core = NamedDistribution(
+    "TPULoopOnDeviceOneCore", _get_tpu_strategy_creator(
+        steps_per_run=2, use_single_core=True,
+        _disable_training_loop_on_host=True),
+    required_tpu=True)
+tpu_strategy_one_step_loop_on_device_one_core = NamedDistribution(
+    "TPUOneStepLoopOnDeviceOneCore", _get_tpu_strategy_creator(
+        steps_per_run=1, use_single_core=True,
+        _disable_training_loop_on_host=True),
     required_tpu=True)
+# TODO(b/122327153): Remove below two NamedDistributions.
+tpu_strategy_loop_on_device = NamedDistribution(
+    "TPULoopOnDevice", _get_tpu_strategy_creator(
+        steps_per_run=2, _disable_training_loop_on_host=True),
+    required_tpu=True)
+tpu_strategy_one_step_loop_on_device = NamedDistribution(
+    "TPUOneStepLoopOnDevice", _get_tpu_strategy_creator(
+        steps_per_run=1, _disable_training_loop_on_host=True),
+    required_tpu=True)
+
 mirrored_strategy_with_one_cpu = NamedDistribution(
     "Mirrored1CPU",
     lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
@@ -386,10 +426,20 @@ gradient_descent_optimizer_v2_fn = NamedObject(
 adagrad_optimizer_v2_fn = NamedObject(
     "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))
 adam_optimizer_v2_fn = NamedObject(
-    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1.0))
 
 optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]
 
+gradient_descent_optimizer_keras_v2_fn = NamedObject(
+    "GradientDescentKerasV2",
+    lambda: gradient_descent_keras_v2.SGD(0.2))
+adagrad_optimizer_keras_v2_fn = NamedObject(
+    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
+adam_optimizer_keras_v2_fn = NamedObject(
+    "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0))
+rmsprop_optimizer_keras_v2_fn = NamedObject(
+    "RmsPropKerasV2", lambda: rmsprop_keras_v2.RMSprop(0.001))
+
 graph_and_eager_modes = ["graph", "eager"]
 
 
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
index 84b106545e1326fddd3ed299462534af982dc102..5f89df5824a8d03198987a6fa3d21e2330deedf0 100644
--- a/tensorflow/contrib/distribute/python/examples/BUILD
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -31,6 +31,12 @@ py_binary(
 
 py_binary(
     name = "keras_mnist",
+    srcs = ["keras_mnist.py"],
+    deps = [":keras_mnist_lib"],
+)
+
+py_library(
+    name = "keras_mnist_lib",
     srcs = [
         "keras_mnist.py",
     ],
@@ -39,3 +45,14 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
+
+py_binary(
+    name = "mnist_eager_multigpu",
+    srcs = [
+        "mnist_eager_multigpu.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index 60fda996642464135fe1fb8c314bcf7f04d19362..1ce91ecaf22a80a53124c8f00fac05c6b4711ed9 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -109,22 +109,21 @@ def main(_):
   tf.enable_eager_execution()
 
   train_ds, eval_ds, input_shape = get_input_datasets()
-  model = get_model(input_shape)
 
   # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
   # the `devices` argument then all the GPUs available on the machine are used.
   # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available.
   strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0'])
 
-  optimizer = rmsprop.RMSProp(learning_rate=0.001)
-
-  # Compile the model by passing the distribution strategy object to the
-  # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
-  # based on the strategy instantiated.
-  model.compile(loss=tf.keras.losses.categorical_crossentropy,
-                optimizer=optimizer,
-                metrics=['accuracy'],
-                distribute=strategy)
+  # Create and compile the model under Distribution strategy scope.
+  # `fit`, `evaluate` and `predict` will be distributed based on the strategy
+  # model was compiled with.
+  with strategy.scope():
+    model = get_model(input_shape)
+    optimizer = rmsprop.RMSProp(learning_rate=0.001)
+    model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
 
   # Train the model with the train dataset.
   model.fit(x=train_ds, epochs=20, steps_per_epoch=468)
diff --git a/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a3b5e91a52a6881d48a0aceadbd901a46e86b2
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py
@@ -0,0 +1,151 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run MNIST on multiple GPUs on using MirroredStrategy with eager execution.
+
+By default, runs on all available GPUs, or CPU if no GPUs are available.
+
+NOTE: Currently, this takes more time than when running MNIST in eager without
+MirroredStrategy because of a number overheads. Therefore, this is just a
+proof of concept right now and cannot be used to actually scale up training.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+tf.flags.DEFINE_integer("num_gpus", None, "How many GPUs should we run on?"
+                        "Defaults to all available GPUs, otherwise CPU.")
+tf.flags.DEFINE_integer("batch_size", 64,
+                        "What should be the size of each batch?")
+tf.flags.DEFINE_integer("num_epochs", 10, "How many epochs to run?")
+tf.flags.DEFINE_float("learning_rate", 0.01, "Learning Rate")
+tf.flags.DEFINE_float("momentum", 0.5, "SGD momentum")
+
+FLAGS = tf.flags.FLAGS
+NUM_TRAIN_IMAGES = 60000
+NUM_TEST_IMAGES = 10000
+
+
+def create_model():
+  max_pool = tf.keras.layers.MaxPooling2D((2, 2), (2, 2), padding="same")
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  return tf.keras.Sequential([
+      tf.keras.layers.Reshape(
+          target_shape=[28, 28, 1],
+          input_shape=(28, 28,)),
+      tf.keras.layers.Conv2D(2, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Conv2D(4, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Flatten(),
+      tf.keras.layers.Dense(32, activation=tf.nn.relu),
+      tf.keras.layers.Dropout(0.4),
+      tf.keras.layers.Dense(10)])
+
+
+def compute_loss(logits, labels):
+  loss = tf.reduce_sum(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=labels))
+  # Scale loss by global batch size.
+  return loss * (1. / FLAGS.batch_size)
+
+
+def mnist_datasets():
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32.
+  x_train, x_test = x_train / np.float32(255), x_test / np.float32(255)
+  y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
+  # TODO(priyag): `strategy.make_numpy_iterator` can be used directly instead of
+  # converting to datasets.
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  return train_dataset, test_dataset
+
+
+def main(unused_argv):
+  """Run a CNN model on MNIST data to demonstrate DistributedStrategies."""
+
+  tf.enable_eager_execution()
+
+  num_gpus = FLAGS.num_gpus
+  if num_gpus is None:
+    devices = None
+  elif num_gpus == 0:
+    devices = ["/device:CPU:0"]
+  else:
+    devices = ["/device:GPU:{}".format(i) for i in range(num_gpus)]
+  strategy = tf.distribute.MirroredStrategy(devices)
+
+  with strategy.scope():
+    train_ds, test_ds = mnist_datasets()
+    train_ds = train_ds.shuffle(NUM_TRAIN_IMAGES).batch(FLAGS.batch_size)
+    test_ds = test_ds.batch(FLAGS.batch_size)
+
+    model = create_model()
+    optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum)
+    training_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
+    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "training_accuracy", dtype=tf.float32)
+    test_loss = tf.keras.metrics.Mean("test_loss", dtype=tf.float32)
+    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "test_accuracy", dtype=tf.float32)
+
+    def train_step(inputs):
+      images, labels = inputs
+      with tf.GradientTape() as tape:
+        logits = model(images, training=True)
+        loss = compute_loss(logits, labels)
+      grads = tape.gradient(loss, model.variables)
+      optimizer.apply_gradients(zip(grads, model.variables))
+      training_loss.update_state(loss)
+      training_accuracy.update_state(labels, logits)
+
+    def test_step(inputs):
+      images, labels = inputs
+      logits = model(images, training=False)
+      loss = compute_loss(logits, labels)
+      test_loss.update_state(loss)
+      test_accuracy.update_state(labels, logits)
+
+    train_iterator = strategy.make_dataset_iterator(train_ds)
+    test_iterator = strategy.make_dataset_iterator(test_ds)
+    for epoch in range(0, FLAGS.num_epochs):
+      # Train
+      print("Starting epoch {}".format(epoch))
+      train_iterator.initialize()
+      for _ in range(NUM_TRAIN_IMAGES // FLAGS.batch_size):
+        strategy.experimental_run(train_step, train_iterator)
+      print("Training loss: {:0.4f}, accuracy: {:0.2f}%".format(
+          training_loss.result(), training_accuracy.result() * 100))
+      training_loss.reset_states()
+      training_accuracy.reset_states()
+
+      # Test
+      test_iterator.initialize()
+      for _ in range(NUM_TEST_IMAGES // FLAGS.batch_size):
+        strategy.experimental_run(test_step, test_iterator)
+      print("Test loss: {:0.4f}, accuracy: {:0.2f}%".format(
+          test_loss.result(), test_accuracy.result() * 100))
+      test_loss.reset_states()
+      test_accuracy.reset_states()
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensorflow/contrib/distribute/python/input_lib_test.py b/tensorflow/contrib/distribute/python/input_lib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..10a58316ec5b3d9d968a88c5c39ff70c277daa65
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/input_lib_test.py
@@ -0,0 +1,246 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the input_lib library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
+
+
+class InputIteratorTestBase(test.TestCase):
+
+  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                     expected_values, sess=None, split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+    device_map = values.ReplicaDeviceMap(devices)
+    input_workers = input_lib.InputWorkers(device_map, worker_device_pairs)
+
+    if input_type == "input_fn":
+      input_contexts = [
+          distribute_lib.InputContext() for _ in worker_device_pairs]
+      input_fn = lambda _: dataset_fn()
+      iterator = input_lib.InputFunctionIterator(
+          input_fn, input_workers, input_contexts)
+    else:
+      iterator = input_lib.DatasetIterator(
+          dataset_fn(), input_workers, split_batch_by)
+
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
+      self.assertAllEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_replica(r, next_element)
+                for r in range(len(devices))])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
+      self.assertAllEqual(expected_value, computed_value)
+
+
+class InputIteratorSingleWorkerTest(InputIteratorTestBase,
+                                    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDeviceCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesOneGPUOneCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTupleDataset(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    def dataset_fn():
+      dataset1 = dataset_ops.Dataset.range(10)
+      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+      return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["dataset"],
+      split_batch_by=[None, 2],
+      required_gpus=1))
+  def testBatchSplitting(self, input_type, split_batch_by):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    batch_size = 10
+    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+
+    updated_batch_size = (
+        batch_size // split_batch_by if split_batch_by else batch_size)
+    expected_values = [[range(i, i+updated_batch_size),
+                        range(i+updated_batch_size, i+2*updated_batch_size)]
+                       for i in range(0, 100, updated_batch_size*2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values, sess=None,
+                        split_batch_by=split_batch_by)
+
+
+class InputIteratorMultiWorkerTest(
+    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
+    parameterized.TestCase):
+
+  def _cpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
+
+  def _cpu_and_one_gpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0", [
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        ]),
+        ("/job:worker/replica:0/task:1", [
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ])
+    ]
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDevicePerWorker(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesPerWorker(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testTupleDataset(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(4)
+        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+
+class SplitDatasetBatchTest(test.TestCase):
+
+  def testBatchDataset(self):
+    dataset = dataset_ops.Dataset.range(100).batch(20)
+    split_batch_by = 2
+    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
+    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
+    result = [self.evaluate(el) for el in result_dataset]
+    self.assertAllEqual(expected_values, result)
+
+  def testMapAndBatchDataset(self):
+    dataset = dataset_ops.Dataset.range(100)
+    dataset = dataset.apply(batching.map_and_batch(lambda x: x, 20))
+    split_batch_by = 2
+    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
+    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
+    result = [self.evaluate(el) for el in result_dataset]
+    self.assertAllEqual(expected_values, result)
+
+  def testPrefetchDataset(self):
+    dataset = dataset_ops.Dataset.range(100).batch(20).prefetch(1)
+    split_batch_by = 2
+    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
+    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
+    result = [self.evaluate(el) for el in result_dataset]
+    self.assertAllEqual(expected_values, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
index 93c0280c8215712071457cafb9c6040f7d97fa60..3bc84dc009bf91493d10d28ef7c3b718ef17ba91 100644
--- a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
+++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 from absl.testing import parameterized
 import numpy as np
 
@@ -27,20 +26,12 @@ from tensorflow.contrib.distribute.python import tpu_strategy
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import values
 from tensorflow.python.eager import test
-from tensorflow.python.estimator import keras as keras_lib
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
 
@@ -325,15 +316,20 @@ def all_strategy_combinations():
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-# TODO(priyag): Add v2 optimizers here.
 def strategy_and_optimizer_combinations():
+  # TODO(b/122372746): Uncomment optimizers after they pass tests.
   return combinations.times(
       all_strategy_combinations(),
-      combinations.combine(
-          optimizer=[combinations.adagrad_optimizer_v1_fn,
-                     combinations.adam_optimizer_v1_fn,
-                     combinations.gradient_descent_optimizer_v1_fn,
-                     combinations.rmsprop_optimizer_v1_fn]))
+      combinations.combine(optimizer=[
+          combinations.adagrad_optimizer_v1_fn,
+          # combinations.adagrad_optimizer_keras_v2_fn,
+          combinations.adam_optimizer_v1_fn,
+          combinations.adam_optimizer_keras_v2_fn,
+          combinations.gradient_descent_optimizer_v1_fn,
+          combinations.gradient_descent_optimizer_keras_v2_fn,
+          combinations.rmsprop_optimizer_v1_fn,
+          # combinations.rmsprop_optimizer_keras_v2_fn
+      ]))
 
 
 def strategy_and_input_combinations():
@@ -359,298 +355,9 @@ def strategy_for_numpy_input_combinations():
       mode=['graph'])
 
 
-class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
-                                        parameterized.TestCase):
-
-  def setUp(self):
-    self._base_dir = os.path.join(self.get_temp_dir(),
-                                  'keras_mirrored_strategy_test')
-    gfile.MakeDirs(self._base_dir)
-    self._config = run_config_lib.RunConfig(
-        tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
-
-  def tearDown(self):
-    writer_cache.FileWriterCache.clear()
-    if os.path.isdir(self._base_dir):
-      gfile.DeleteRecursively(self._base_dir)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
-  def test_train_functional_with_distribution_strategy(self, distribution):
-    keras_model = simple_functional_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        metrics=[keras.metrics.CategoricalAccuracy()],
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
-    config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
-                                      model_dir=self._base_dir,
-                                      train_distribute=distribution,
-                                      eval_distribute=distribution)
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=config)
-      before_eval_results = est_keras.evaluate(
-          input_fn=get_ds_test_input_fn, steps=1)
-      est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16)
-      after_eval_results = est_keras.evaluate(input_fn=get_ds_test_input_fn,
-                                              steps=1)
-      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-    writer_cache.FileWriterCache.clear()
-    gfile.DeleteRecursively(self._config.model_dir)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph']))
-  def test_train_sequential_with_distribution_strategy(self, distribution):
-    keras_model = simple_sequential_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        metrics=[keras.metrics.CategoricalAccuracy()],
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
-    config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
-                                      model_dir=self._base_dir,
-                                      train_distribute=distribution)
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=config)
-      before_eval_results = est_keras.evaluate(
-          input_fn=get_ds_test_input_fn, steps=1)
-      est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16)
-      after_eval_results = est_keras.evaluate(input_fn=get_ds_test_input_fn,
-                                              steps=1)
-      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-    writer_cache.FileWriterCache.clear()
-    gfile.DeleteRecursively(self._config.model_dir)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
-  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution):
-    train_data, test_data = get_multi_inputs_multi_outputs_data()
-
-    def train_input_fn():
-      input_dict = {
-          'input_a': train_data['input_a'],
-          'input_b': train_data['input_b'],
-          'input_m': train_data['input_m'].astype(np.str)
-      }
-      output_dict = {
-          'dense_2': train_data['output_c'],
-          'dense_3': train_data['output_d']
-      }
-      return dataset_ops.Dataset.from_tensor_slices((input_dict,
-                                                     output_dict)).batch(16)
-
-    def eval_input_fn():
-      input_dict = {
-          'input_a': test_data['input_a'],
-          'input_b': test_data['input_b'],
-          'input_m': test_data['input_m'].astype(np.str)
-      }
-      output_dict = {
-          'dense_2': test_data['output_c'],
-          'dense_3': test_data['output_d']
-      }
-      return dataset_ops.Dataset.from_tensor_slices((input_dict,
-                                                     output_dict)).batch(16)
-
-    self.do_test_multi_inputs_multi_outputs_with_input_fn(
-        distribution, train_input_fn, eval_input_fn)
-
-  def do_test_multi_inputs_multi_outputs_with_input_fn(
-      self, distribution, train_input_fn, eval_input_fn):
-    config = run_config_lib.RunConfig(
-        tf_random_seed=_RANDOM_SEED,
-        model_dir=self._base_dir,
-        train_distribute=distribution)
-    with self.cached_session():
-      model = multi_inputs_multi_outputs_model()
-      est_keras = keras_lib.model_to_estimator(keras_model=model, config=config)
-      baseline_eval_results = est_keras.evaluate(
-          input_fn=eval_input_fn, steps=1)
-      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-      eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-      self.assertLess(eval_results['loss'], baseline_eval_results['loss'])
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
-  def test_keras_optimizer_with_distribution_strategy(self, distribution):
-    keras_model = simple_sequential_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=keras.optimizers.rmsprop(lr=0.01))
-
-    config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
-                                      model_dir=self._base_dir,
-                                      train_distribute=distribution)
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
-                                               config=config)
-      with self.assertRaisesRegexp(ValueError,
-                                   'Only TensorFlow native optimizers are '
-                                   'supported with DistributionStrategy.'):
-        est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16)
-
-    writer_cache.FileWriterCache.clear()
-    gfile.DeleteRecursively(self._config.model_dir)
-
-
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_creating_var_with_numpy_arrays(self, distribution):
-    with self.cached_session():
-      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
-      var_x = distributed_training_utils.get_var_for_numpy(distribution, x)
-      val = self.evaluate(var_x.value())
-      # Verify that the numpy value is copied to the variable.
-      self.assertAllEqual(x, val)
-
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
-    # Calculate the per_replica_batch_size scaling factor for strategies
-    # that use per_core_batch_size
-    replica_scale_factor = 1.0
-    if not distributed_training_utils.global_batch_size_supported(distribution):
-      replica_scale_factor = distribution.num_replicas_in_sync
-
-    with self.cached_session():
-      # Input samples of different sizes
-      input_20_samples = np.zeros((20, 3), dtype=np.float32)
-      input_63_samples = np.zeros((63, 3), dtype=np.float32)
-      input_64_samples = np.zeros((64, 3), dtype=np.float32)
-
-      # Default global batch size 32 for input with 64 samples run in 2 steps
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_64_samples, steps=None, batch_size=None)
-      self.assertEqual(batch_size, 32 // replica_scale_factor)
-      self.assertEqual(steps, 2)
-
-      # Computed global batch size 20 is lower than 32 if we pass less samples.
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_20_samples, steps=None, batch_size=None)
-      self.assertEqual(batch_size, 20 // replica_scale_factor)
-      self.assertEqual(steps, 1)
-
-      #  Default global batch size 32 cannot be used with 63 samples.
-      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
-        distributed_training_utils.get_input_params(
-            distribution, input_63_samples, steps=None, batch_size=None)
-
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_calculating_input_params_with_steps_no_batch_size(self,
-                                                             distribution):
-    # Calculate the per_replica_batch_size scaling factor for strategies
-    # that use per_core_batch_size
-    replica_scale_factor = 1.0
-    if not distributed_training_utils.global_batch_size_supported(distribution):
-      replica_scale_factor = distribution.num_replicas_in_sync
-
-    with self.cached_session():
-      # Input samples of different sizes
-      input_63_samples = np.zeros((63, 3), dtype=np.float32)
-      input_64_samples = np.zeros((64, 3), dtype=np.float32)
-
-      # Computed global batch size is correct for number of specified 1 step
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_64_samples, steps=1, batch_size=None)
-      self.assertEqual(batch_size, 64 // replica_scale_factor)
-      self.assertEqual(steps, 1)
-
-      # Computed global batch size is correct for number of specified 2 steps
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_64_samples, steps=2, batch_size=None)
-      self.assertEqual(batch_size, 32 // replica_scale_factor)
-      self.assertEqual(steps, 2)
-
-      # All samples can not be consumed in specified number of steps
-      with self.assertRaisesRegexp(ValueError, 'not divisible by steps'):
-        distributed_training_utils.get_input_params(
-            distribution, input_63_samples, steps=2, batch_size=None)
-
-      # This cases is different for different strategies due to the
-      # difference in supported batch size being global or per-replica.
-      if replica_scale_factor == 1:
-        # Computed global batch size is correct even if not sharadable
-        steps, batch_size = distributed_training_utils.get_input_params(
-            distribution, input_63_samples, steps=3, batch_size=None)
-        self.assertEqual(batch_size, 21)
-        self.assertEqual(steps, 3)
-      else:
-        # Computed global batch size can not be sharded across replicas
-        with self.assertRaisesRegexp(ValueError, 'could not be sharded evenly '
-                                     'across the sync replicas'):
-          distributed_training_utils.get_input_params(
-              distribution, input_63_samples, steps=1, batch_size=None)
-
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_calculating_input_params_no_steps_with_batch_size(self,
-                                                             distribution):
-    # Calculate the per_replica_batch_size scaling factor for strategies
-    # that use per_core_batch_size
-    replica_scale_factor = 1.0
-    if not distributed_training_utils.global_batch_size_supported(distribution):
-      replica_scale_factor = distribution.num_replicas_in_sync
-
-    with self.cached_session():
-      input_64_samples = np.zeros((64, 3), dtype=np.float32)
-
-      # Computed steps is correct for specified batch size
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_64_samples, steps=None, batch_size=16)
-      self.assertEqual(batch_size, 16)
-      self.assertEqual(steps, 4 // replica_scale_factor)
-
-      # Computed steps is correct for specified batch size
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_64_samples, steps=None, batch_size=32)
-      self.assertEqual(batch_size, 32)
-      self.assertEqual(steps, 2 // replica_scale_factor)
-
-      # Number of samples is not divisible by the global batch size
-      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
-        distributed_training_utils.get_input_params(
-            distribution, input_64_samples, steps=None, batch_size=20)
-
-      # Number of samples is not divisible by the global batch size
-      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
-        distributed_training_utils.get_input_params(
-            distribution, input_64_samples, steps=None, batch_size=3)
-
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_calculating_input_params_with_steps_with_batch_size(self,
-                                                               distribution):
-    with self.cached_session():
-      input_64_samples = np.zeros((64, 3), dtype=np.float32)
-
-      # No change to steps and batch size if both specified and feasible
-      steps, batch_size = distributed_training_utils.get_input_params(
-          distribution, input_64_samples, steps=5, batch_size=3)
-      self.assertEqual(batch_size, 3)
-      self.assertEqual(steps, 5)
-
-      # Number of samples is less than global batch size * steps
-      with self.assertRaisesRegexp(ValueError, 'less than samples required'):
-        distributed_training_utils.get_input_params(
-            distribution, input_64_samples, steps=10, batch_size=13)
-
   @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
@@ -1039,7 +746,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                 callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
-      grouped_models = distribution.unwrap(model._distributed_model)
+      grouped_models = distribution.unwrap(model._distributed_model_train)
       with distribution.scope():
         for m in grouped_models:
           self.assertAllClose(0.001, keras.backend.get_value(
@@ -1048,54 +755,6 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
 class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2))
-      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
-      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
-      x = values.DistributedValues(device_map, (a, b))
-      y = values.DistributedValues(device_map, (a, a))
-      with distribution.scope():
-        # Removed device and input tensor shape details from the error message
-        # since the order of the device and the corresponding input tensor shape
-        # is not deterministic over different runs.
-        with self.assertRaisesRegexp(ValueError,
-                                     'Input tensor shapes do not match for '
-                                     'distributed tensor inputs '
-                                     'DistributedValues:.+'):
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
-      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
-      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
-      x = values.DistributedValues(device_map, (a, b))
-      y = values.DistributedValues(device_map, (a, a))
-      with distribution.scope():
-        # Removed device and input tensor dtype details from the error message
-        # since the order of the device and the corresponding input tensor dtype
-        # is not deterministic over different runs.
-        with self.assertRaisesRegexp(ValueError,
-                                     'Input tensor dtypes do not match for '
-                                     'distributed tensor inputs '
-                                     'DistributedValues:.+'):
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
-
   @combinations.generate(combinations.combine(
       distribution=[
           combinations.mirrored_strategy_with_gpu_and_cpu,
@@ -1135,14 +794,14 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
       # Test with not specifying the `steps` argument.
       with self.assertRaisesRegexp(
-          ValueError, 'you should specify the `steps_per_epoch` argument'):
+          ValueError, 'the `steps_per_epoch` argument'):
         model.fit(dataset, epochs=1, verbose=0)
       with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
+                                   'the `steps` argument'):
         model.evaluate(dataset, verbose=0)
 
       with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
+                                   'the `steps` argument'):
         model.predict(dataset, verbose=0)
 
   @combinations.generate(combinations.combine(
diff --git a/tensorflow/contrib/distribute/python/keras_correctness_test.py b/tensorflow/contrib/distribute/python/keras_correctness_test.py
deleted file mode 100644
index e078731610882bfe6d5a97b1636d9a4a1325b047..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/keras_correctness_test.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Correctness tests for tf.keras using DistributionStrategy."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import tpu_strategy
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.eager import test
-from tensorflow.python.framework import random_seed
-from tensorflow.python.keras.engine import distributed_training_utils
-from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
-from tensorflow.python.training import gradient_descent
-
-_RANDOM_SEED = 1337
-
-# Note: Please make sure the tests in this file are also covered in
-# keras_backward_compat_test for features that are supported with both APIs.
-
-
-def batch_wrapper(dataset, batch_size, distribution, repeat=None):
-  if repeat:
-    dataset = dataset.repeat(repeat)
-  # TPUs currently require fully defined input shapes, drop_remainder ensures
-  # the input will have fully defined shapes.
-  if isinstance(distribution, tpu_strategy.TPUStrategy):
-    return dataset.batch(batch_size, drop_remainder=True)
-  else:
-    return dataset.batch(batch_size)
-
-
-def get_correctness_test_inputs(use_numpy, use_validation_data,
-                                with_distribution,
-                                x_train, y_train, x_predict):
-  """Generates the inputs for correctness check when enable Keras with DS."""
-  training_epochs = 2
-  global_batch_size = 64
-  batch_size = global_batch_size
-  # TODO(b/118776054): Use global batch size for Keras/DS support.
-  use_per_core_batch_size = (
-      with_distribution and
-      not distributed_training_utils.global_batch_size_supported(
-          with_distribution))
-  if use_per_core_batch_size:
-    batch_size //= with_distribution.num_replicas_in_sync
-
-  if use_numpy:
-    training_inputs = {
-        'batch_size': batch_size,
-        'x': x_train,
-        'y': y_train,
-        'epochs': training_epochs,
-        'shuffle': False,
-    }
-
-    if use_validation_data:
-      eval_inputs = None
-      training_inputs['validation_data'] = (x_train, y_train)
-    else:
-      eval_inputs = {
-          'batch_size': batch_size,
-          'x': x_train,
-          'y': y_train,
-      }
-    predict_inputs = {
-        'x': np.array(x_predict, dtype=np.float32),
-    }
-  else:
-    # For dataset inputs, we do not pass batch_size to
-    # keras.fit/evaluate/predict. The batch size is part of the dataset.
-    train_dataset = dataset_ops.Dataset.from_tensor_slices(
-        (x_train, y_train))
-    x = batch_wrapper(
-        train_dataset, batch_size, with_distribution, repeat=training_epochs)
-
-    training_inputs = {
-        'batch_size': None,
-        'x': x,
-        'y': None,
-        'epochs': training_epochs,
-        'shuffle': False,
-        'steps_per_epoch': len(x_train) // global_batch_size,
-    }
-    if use_validation_data:
-      eval_inputs = None  # Remove the eval_inputs
-      eval_dataset = dataset_ops.Dataset.from_tensor_slices(
-          (x_train, y_train))
-      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
-      training_inputs['validation_data'] = x
-      training_inputs['validation_steps'] = 5
-    else:
-      eval_inputs = {
-          'batch_size': None,
-          'x': x,
-          'y': None,
-          'steps': 20,
-      }
-
-    predict_batch_size = len(x_predict)
-    if use_per_core_batch_size:
-      predict_batch_size //= with_distribution.num_replicas_in_sync
-    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
-    predict_dataset = batch_wrapper(predict_dataset,
-                                    predict_batch_size, with_distribution)
-    predict_inputs = {
-        'steps': 1,
-        'x': predict_dataset,
-    }
-
-  return training_inputs, eval_inputs, predict_inputs
-
-
-strategies_minus_tpu = [
-    combinations.default_strategy,
-    combinations.one_device_strategy,
-    combinations.mirrored_strategy_with_gpu_and_cpu,
-    combinations.mirrored_strategy_with_two_gpus,
-    combinations.core_mirrored_strategy_with_gpu_and_cpu,
-    combinations.core_mirrored_strategy_with_two_gpus]
-
-tpu_strategies = [
-    combinations.tpu_strategy,  # steps_per_run=2
-    combinations.tpu_strategy_one_step]
-
-
-def strategy_minus_tpu_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu,
-      mode=['graph', 'eager'])
-
-
-def tpu_strategy_combinations():
-  return combinations.combine(
-      distribution=tpu_strategies,
-      mode=['graph'])
-
-
-def all_strategy_combinations():
-  return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
-
-
-def strategy_and_input_combinations():
-  return (
-      combinations.times(
-          combinations.combine(distribution=strategies_minus_tpu),
-          combinations.combine(mode=['graph'],
-                               use_numpy=[True, False],
-                               use_validation_data=[True, False])
-          + combinations.combine(mode=['eager'],
-                                 use_numpy=[False],
-                                 use_validation_data=[False])) +
-      combinations.times(
-          combinations.combine(distribution=tpu_strategies),
-          combinations.combine(mode=['graph'],
-                               use_numpy=[True, False],
-                               use_validation_data=[True, False])))
-
-
-class TestDistributionStrategyCorrectness(test.TestCase,
-                                          parameterized.TestCase):
-
-  @combinations.generate(all_strategy_combinations())
-  def test_metric_correctness(self, distribution):
-    with self.cached_session():
-      keras.backend.set_image_data_format('channels_last')
-      num_samples = 10000
-
-      x_train = np.random.randint(0, 2, num_samples)
-      x_train = np.reshape(x_train, (num_samples, 1))
-      y_train = x_train
-      x_train = x_train.astype('float32')
-      y_train = y_train.astype('float32')
-
-      # Create identity model.
-      with distribution.scope():
-        model = keras.Sequential()
-        model.add(
-            keras.layers.Dense(1, input_shape=(1,), kernel_initializer='ones'))
-        model.compile(
-            loss=keras.losses.mean_squared_error,
-            optimizer=gradient_descent.GradientDescentOptimizer(0.5),
-            metrics=[keras.metrics.BinaryAccuracy()])
-
-      batch_size = 64
-      if not distributed_training_utils.global_batch_size_supported(
-          distribution):
-        batch_size //= distribution.num_replicas_in_sync
-      train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
-      train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
-
-      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
-      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
-
-  @combinations.generate(all_strategy_combinations())
-  def test_eval_metrics_correctness(self, distribution):
-    with self.cached_session():
-      with distribution.scope():
-        model = keras.Sequential()
-        model.add(
-            keras.layers.Dense(
-                3, activation='relu', input_dim=4, kernel_initializer='ones'))
-        model.add(
-            keras.layers.Dense(
-                1, activation='sigmoid', kernel_initializer='ones'))
-        model.compile(
-            loss='mae',
-            metrics=['accuracy', keras.metrics.BinaryAccuracy()],
-            optimizer=gradient_descent.GradientDescentOptimizer(0.001))
-
-      # verify correctness of stateful and stateless metrics.
-      x = np.ones((100, 4)).astype('float32')
-      y = np.ones((100, 1)).astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
-      dataset = batch_wrapper(dataset, 4, distribution)
-      outs = model.evaluate(dataset, steps=10)
-      self.assertEqual(outs[1], 1.)
-      self.assertEqual(outs[2], 1.)
-
-      y = np.zeros((100, 1)).astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
-      dataset = batch_wrapper(dataset, 4, distribution)
-      outs = model.evaluate(dataset, steps=10)
-      self.assertEqual(outs[1], 0.)
-      self.assertEqual(outs[2], 0.)
-
-  @combinations.generate(strategy_and_input_combinations())
-  def test_correctness(self, distribution, use_numpy, use_validation_data):
-    with self.cached_session():
-      default_tolerance = 1e-5
-      tol_table = {}
-
-      if isinstance(distribution, (
-          mirrored_strategy.MirroredStrategy,
-          mirrored_strategy.CoreMirroredStrategy,
-          distribute_lib._DefaultDistributionStrategy)):  # pylint: disable=protected-access
-        # TODO(b/119257215): Weights are not exactly the same, so use larger
-        # tolerance for now. Predict should be related to weights.
-        tol_table = {
-            'weights_1': 1e-4,
-            'weights_2': 1e-4,
-            'predict_result_1': 1e-4,
-        }
-
-      keras.backend.set_image_data_format('channels_last')
-      np.random.seed(_RANDOM_SEED)
-      random_seed.set_random_seed(_RANDOM_SEED)
-
-      # Train, eval, and predict datasets are created with the same input numpy
-      # arrays.
-      # TODO(xiejw): Change this back to 10000, once we support final partial
-      # batch.
-      num_samples = 9984
-      x_train = np.random.rand(num_samples, 1)
-      y_train = 3 * x_train
-      x_train = x_train.astype('float32')
-      y_train = y_train.astype('float32')
-      x_predict = [[1.], [2.], [3.], [4.]]
-
-      # The model is built once and the initial weights are saved.
-      # This is used to initialize the model for both the distribution and
-      # non-distribution run. In addition, we add few non-linear layers to make
-      # it non-trivial.
-      def _create_model():
-        model = keras.Sequential()
-        model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
-        model.add(keras.layers.Dense(10, activation='relu'))
-        model.add(keras.layers.Dense(10, activation='relu'))
-        model.add(keras.layers.Dense(1))
-        return model
-
-      model = _create_model()
-      initial_weights = model.get_weights()
-      del model  # avoid accident usage.
-
-      def _build_and_compile_model():
-        # We have initialized the model to the same weight for the distribution
-        # and non-distribution run.
-        model = _create_model()
-        model.set_weights(initial_weights)
-        model.compile(
-            loss=keras.losses.mean_squared_error,
-            optimizer=gradient_descent_keras.SGD(0.5),
-            metrics=['mse'])
-        return model
-
-      def fit_eval_and_predict(with_distribution=None):
-        if with_distribution:
-          with with_distribution.scope():
-            model = _build_and_compile_model()
-        else:
-          model = _build_and_compile_model()
-
-        training_inputs, eval_inputs, predict_inputs = (
-            get_correctness_test_inputs(use_numpy, use_validation_data,
-                                        with_distribution,
-                                        x_train, y_train, x_predict))
-
-        result = {}
-        result['training_history_1'] = model.fit(**training_inputs).history
-
-        if eval_inputs is not None:
-          result['eval_result_1'] = model.evaluate(**eval_inputs)
-
-        result['weights_1'] = model.get_weights()
-        result['predict_result_1'] = model.predict(**predict_inputs)
-
-        # Train and eval again to mimic user's flow.
-
-        result['training_history_2'] = model.fit(**training_inputs).history
-
-        if eval_inputs is not None:
-          result['eval_result_2'] = model.evaluate(**eval_inputs)
-
-        result['weights_2'] = model.get_weights()
-
-        return result
-
-      results_with_ds = fit_eval_and_predict(with_distribution=distribution)
-      results_without_ds = fit_eval_and_predict(with_distribution=None)
-
-      # Verify that the weights, training history, eval results, predict outputs
-      # are the same within some limits of tolerance.
-      for key in results_with_ds:
-        if (key.startswith('training_history') and
-            isinstance(distribution, tpu_strategy.TPUStrategy) and
-            distribution.extended.steps_per_run > 1):
-          # TODO(b/119894254): Enable this test for all cases once the
-          # underlying bug is fixed.
-          continue
-
-        tolerance = tol_table.get(key, default_tolerance)
-
-        self.assertAllClose(
-            results_with_ds[key],
-            results_without_ds[key],
-            atol=tolerance,
-            rtol=tolerance,
-            msg='Fail to assert {}.'.format(key))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_correctness_test_base.py b/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c783099b5267d6f57f755ca67dae05099e874d8
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
@@ -0,0 +1,491 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import random_seed
+from tensorflow.python.keras.engine import distributed_training_utils
+
+_RANDOM_SEED = 1337
+_EVAL_STEPS = 20
+_GLOBAL_BATCH_SIZE = 64
+
+# Note: Please make sure the tests in this file are also covered in
+# keras_backward_compat_test for features that are supported with both APIs.
+
+
+all_strategies = [
+    combinations.default_strategy,
+    combinations.one_device_strategy,
+    combinations.mirrored_strategy_with_gpu_and_cpu,
+    combinations.mirrored_strategy_with_two_gpus,
+    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+    combinations.core_mirrored_strategy_with_two_gpus,
+    combinations.tpu_strategy,  # steps_per_run=2
+    combinations.tpu_strategy_one_step,
+]
+
+
+def eager_mode_test_configuration():
+  return combinations.combine(mode='eager',
+                              use_numpy=False,
+                              use_validation_data=False)
+
+
+def graph_mode_test_configuration():
+  return combinations.combine(mode='graph',
+                              use_numpy=[True, False],
+                              use_validation_data=[True, False])
+
+
+def all_strategy_and_input_config_combinations():
+  return (
+      combinations.times(
+          combinations.combine(distribution=all_strategies),
+          eager_mode_test_configuration() + graph_mode_test_configuration()))
+
+
+def strategies_for_embedding_models():
+  """Returns distribution strategies to test for embedding models.
+
+  Since embedding models take longer to train, we disregard OneDeviceStrategy
+  and DefaultStrategy in order to prevent testing timeouts.
+  """
+
+  strategies = [s for s in all_strategies
+                if not s.required_tpu and s.required_gpus is not None]
+  strategies.append(combinations.tpu_strategy_loop_on_device)
+  strategies.append(combinations.tpu_strategy_one_step_loop_on_device)
+  return strategies
+
+
+def test_combinations_for_embedding_model():
+  return (
+      combinations.times(
+          combinations.combine(distribution=
+                               strategies_for_embedding_models()),
+          (graph_mode_test_configuration() +
+           eager_mode_test_configuration())))
+
+
+def test_combinations_with_tpu_strategies():
+  tpu_strategies = [combinations.tpu_strategy_loop_on_device,
+                    combinations.tpu_strategy_one_step_loop_on_device]
+
+  return (
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          graph_mode_test_configuration()))
+
+
+class MaybeDistributionScope(object):
+  """Provides a context allowing no distribution strategy."""
+
+  def __init__(self, distribution):
+    self._distribution = distribution
+    self._scope = None
+
+  def __enter__(self):
+    if self._distribution:
+      self._scope = self._distribution.scope()
+      self._scope.__enter__()
+
+  def __exit__(self, exc_type, value, traceback):
+    if self._distribution:
+      self._scope.__exit__(exc_type, value, traceback)
+      self._scope = None
+
+
+def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+  if repeat:
+    dataset = dataset.repeat(repeat)
+  # TPUs currently require fully defined input shapes, drop_remainder ensures
+  # the input will have fully defined shapes.
+  if isinstance(distribution, tpu_strategy.TPUStrategy):
+    return dataset.batch(batch_size, drop_remainder=True)
+  else:
+    return dataset.batch(batch_size)
+
+
+def get_batch_size(global_batch_size, distribution):
+  batch_size = global_batch_size
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  use_per_core_batch_size = (
+      distribution and
+      not distributed_training_utils.global_batch_size_supported(distribution))
+  if use_per_core_batch_size:
+    batch_size //= distribution.num_replicas_in_sync
+  return batch_size
+
+
+def get_data_size(data):
+  """Gets the size of data in list, tuple, dict, or a numpy array."""
+  assert isinstance(data, (np.ndarray, list, dict, tuple))
+
+  if isinstance(data, np.ndarray):
+    return len(data)
+
+  if isinstance(data, (list, tuple)):
+    return len(data[0])
+
+  return len(six.next(six.itervalues(data)))
+
+
+def get_correctness_test_inputs(use_numpy, use_validation_data,
+                                with_distribution, x_train, y_train, x_predict):
+  """Generates the inputs for correctness check when enable Keras with DS."""
+  training_epochs = 2
+  global_batch_size = _GLOBAL_BATCH_SIZE
+  batch_size = get_batch_size(global_batch_size, with_distribution)
+
+  if use_numpy:
+    training_inputs = {
+        'batch_size': batch_size,
+        'x': x_train,
+        'y': y_train,
+        'epochs': training_epochs,
+        'shuffle': False,
+    }
+
+    if use_validation_data:
+      eval_inputs = None
+      training_inputs['validation_data'] = (x_train, y_train)
+    else:
+      eval_inputs = {
+          'batch_size': batch_size,
+          'x': x_train,
+          'y': y_train,
+      }
+    predict_inputs = {
+        'x': x_predict
+    }
+  else:
+    training_data_size = get_data_size(x_train)
+    if training_data_size < _GLOBAL_BATCH_SIZE * _EVAL_STEPS:
+      # Currently, we cannot detect the size of a dataset. So, the eval steps is
+      # hard coded.
+      raise ValueError('x_train must have at least '
+                       '_GLOBAL_BATCH_SIZE * _EVAL_STEPS samples')
+    # For dataset inputs, we do not pass batch_size to
+    # keras.fit/evaluate/predict. The batch size is part of the dataset.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+    x = batch_wrapper(train_dataset, batch_size, with_distribution,
+                      repeat=training_epochs)
+
+    training_inputs = {
+        'batch_size': None,
+        'x': x,
+        'y': None,
+        'epochs': training_epochs,
+        'shuffle': False,
+        'steps_per_epoch': training_data_size // global_batch_size,
+    }
+    if use_validation_data:
+      eval_inputs = None  # Remove the eval_inputs
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      training_inputs['validation_data'] = x
+      training_inputs['validation_steps'] = 5
+    else:
+      eval_inputs = {
+          'batch_size': None,
+          'x': x,
+          'y': None,
+          'steps': _EVAL_STEPS,
+      }
+
+    predict_batch_size = get_batch_size(get_data_size(x_predict),
+                                        with_distribution)
+    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
+    predict_dataset = batch_wrapper(predict_dataset, predict_batch_size,
+                                    with_distribution)
+    predict_inputs = {
+        'steps': 1,
+        'x': predict_dataset,
+    }
+
+  return training_inputs, eval_inputs, predict_inputs
+
+
+def fit_eval_and_predict(initial_weights, input_fn, model_fn,
+                         distribution=None, is_stateful_model=False):
+  """Generates results for fit/predict/evaluate for given model."""
+  model = model_fn(initial_weights=initial_weights, distribution=distribution)
+  training_inputs, eval_inputs, predict_inputs = input_fn(distribution)
+
+  result = {}
+  result['training_history_1'] = model.fit(**training_inputs).history
+
+  if eval_inputs is not None:
+    result['eval_result_1'] = model.evaluate(**eval_inputs)
+
+  result['weights_1'] = model.get_weights()
+
+  if predict_inputs is not None:
+    # Check correctness of the result of predict() invoked
+    # multiple times -- as for stateful models, result of
+    # predict may differ for each batch.
+    predict_length = 1
+    if is_stateful_model:
+      predict_length = 3
+    for i in range(predict_length):
+      result_key = 'predict_result_{}'.format(i)
+      result[result_key] = model.predict(**predict_inputs)
+
+  # Train and eval again to mimic user's flow.
+
+  result['training_history_2'] = model.fit(**training_inputs).history
+
+  if eval_inputs is not None:
+    result['eval_result_2'] = model.evaluate(**eval_inputs)
+
+  result['weights_2'] = model.get_weights()
+
+  return result
+
+
+def compare_results(results_with_ds, results_without_ds, distribution,
+                    testcase):
+  """Compares results of model compiled with/without distribution strategy."""
+
+  default_tolerance = 1e-5
+  relaxed_tolerance = 1e-4
+
+  def _get_compare_result_tolerance(key):
+    """Returns tolerance to compare results."""
+    # TODO(b/119257215): For MirroredStrategy, weights are not exactly the same,
+    # so use larger tolerance for now. Predict should be related to weights.
+    if (isinstance(distribution, (
+        mirrored_strategy.MirroredStrategy,
+        mirrored_strategy.CoreMirroredStrategy,
+        distribute_lib._DefaultDistributionStrategy)) and  # pylint: disable=protected-access
+        key.startswith(('weights_1', 'weights_2', 'predict_result'))):
+      return relaxed_tolerance
+
+    return default_tolerance
+
+  for key in results_with_ds:
+    if (key.startswith('training_history') and
+        isinstance(distribution, tpu_strategy.TPUStrategy) and
+        distribution.extended.steps_per_run > 1):
+      # TODO(b/119894254): Enable this test for all cases once the
+      # underlying bug is fixed.
+      continue
+
+    tolerance = _get_compare_result_tolerance(key)
+    testcase.assertAllClose(
+        results_with_ds[key],
+        results_without_ds[key],
+        atol=tolerance,
+        rtol=tolerance,
+        msg='Fail to assert {}.'.format(key))
+
+
+def should_skip_tpu_with_eager(distribution):
+  return (context.executing_eagerly() and
+          isinstance(distribution, tpu_strategy.TPUStrategy))
+
+
+class LearningRateBatchScheduler(keras.callbacks.Callback):
+  """Scheduler that dynamically sets the learning rate of model."""
+
+  def __init__(self, update_freq=None):
+    self._update_freq = update_freq
+
+  def on_batch_begin(self, batch, logs=None):
+    if self._update_freq and batch % self._update_freq != 0:
+      return
+
+    # To avoid divergence, limit the value range.
+    lr = 0.001 * (batch % 10)
+    keras.backend.set_value(self.model.optimizer.lr, lr)
+
+
+class TestDistributionStrategyCorrectnessBase(test.TestCase,
+                                              parameterized.TestCase):
+  """Model agnostic testing infra to test correctness of Keras models."""
+
+  def set_up_test_config(self, use_numpy=False,
+                         use_validation_data=False,
+                         with_batch_norm=False):
+    self.use_numpy = use_numpy
+    self.use_validation_data = use_validation_data
+    self.with_batch_norm = with_batch_norm
+
+    keras.backend.set_image_data_format('channels_last')
+    np.random.seed(_RANDOM_SEED)
+    random_seed.set_random_seed(_RANDOM_SEED)
+
+  def get_data(self):
+    num_samples = 10000
+    x_train = np.random.randint(0, 2, num_samples)
+    x_train = np.reshape(x_train, (num_samples, 1))
+    y_train = x_train
+    return (x_train.astype('float32'), y_train.astype('float32'), None)
+
+  def get_model(self, distribution=None):
+    raise NotImplementedError
+
+  def skip_unsupported_test_configuration(self, distribution):
+    if should_skip_tpu_with_eager(distribution):
+      self.skipTest('TPUStrategy does not support eager mode now.')
+
+    if context.executing_eagerly() and self.use_numpy:
+      self.skipTest('Numpy as inputs is not supported with strategy in eager.')
+
+    if context.executing_eagerly() and self.use_validation_data:
+      self.skipTest('TODO(hongjunchoi): Add test logic for using validation '
+                    'data for eager execution.')
+    return
+
+  def run_correctness_test(self,
+                           distribution,
+                           use_numpy,
+                           use_validation_data,
+                           with_batch_norm=False,
+                           is_stateful_model=False):
+    with self.cached_session():
+      self.set_up_test_config(use_numpy, use_validation_data, with_batch_norm)
+      self.skip_unsupported_test_configuration(distribution)
+
+      # Train, eval, and predict datasets are created with the same input numpy
+      # arrays.
+      x_train, y_train, x_predict = self.get_data()
+
+      # The model is built once and the initial weights are saved.
+      # This is used to initialize the model for both the distribution and
+      # non-distribution run.
+      model = self.get_model()
+      initial_weights = model.get_weights()
+
+      def input_fn(dist):
+        return get_correctness_test_inputs(
+            use_numpy, use_validation_data, dist, x_train, y_train, x_predict)
+
+      results_with_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=distribution, is_stateful_model=is_stateful_model)
+      results_without_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=None, is_stateful_model=is_stateful_model)
+
+      # First, special case, for multi-replica distributed training, batch norm
+      # is not aggregated globally. So it is expected to have different weights.
+      if (self.with_batch_norm and
+          distribution.num_replicas_in_sync > 1):
+        with self.assertRaises(AssertionError):
+          compare_results(results_with_ds, results_without_ds, distribution,
+                          testcase=self)
+      else:
+        compare_results(results_with_ds, results_without_ds, distribution,
+                        testcase=self)
+
+  def run_dynamic_lr_test(self, distribution):
+    with self.cached_session():
+      self.set_up_test_config()
+      self.skip_unsupported_test_configuration(distribution)
+
+      x_train, y_train, _ = self.get_data()
+      model = self.get_model()
+      initial_weights = model.get_weights()
+      update_freq = None
+
+      if (isinstance(distribution, tpu_strategy.TPUStrategy) and
+          distribution.extended.steps_per_run > 1):
+        # For TPUStrategy with steps_per_run > 1, the callback is not invoked
+        # every step. So, to compare the CPU/TPU, we let the CPU to behave the
+        # same as TPU.
+        update_freq = distribution.extended.steps_per_run
+
+      def input_fn(dist):
+        """Generates training test given test configuration."""
+        training_epochs = 2
+        global_batch_size = 64
+        batch_size = get_batch_size(global_batch_size, dist)
+
+        training_inputs = {
+            'batch_size': batch_size,
+            'x': x_train,
+            'y': y_train,
+            'epochs': training_epochs,
+            'shuffle': False,
+            'callbacks': [LearningRateBatchScheduler(update_freq)],
+            'validation_data': (x_train, y_train)
+        }
+        # In this test case, we do not care eval and predict.
+        eval_inputs, predict_inputs = None, None
+        return training_inputs, eval_inputs, predict_inputs
+
+      results_with_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=distribution)
+      results_without_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=None)
+      compare_results(results_with_ds, results_without_ds, distribution,
+                      testcase=self)
+
+
+class TestDistributionStrategyEmbeddingModelCorrectnessBase(
+    TestDistributionStrategyCorrectnessBase):
+  """Base class to test correctness of Keras models with embedding layers."""
+
+  def get_data(self,
+               count=(_GLOBAL_BATCH_SIZE * _EVAL_STEPS),
+               min_words=5,
+               max_words=10,
+               max_word_id=19,
+               num_classes=2):
+    distribution = []
+    for _ in range(num_classes):
+      dist = np.abs(np.random.randn(max_word_id))
+      dist /= np.sum(dist)
+      distribution.append(dist)
+
+    features = []
+    labels = []
+    for _ in range(count):
+      label = np.random.randint(0, num_classes, size=1)[0]
+      num_words = np.random.randint(min_words, max_words, size=1)[0]
+      word_ids = np.random.choice(
+          max_word_id, size=num_words, replace=True, p=distribution[label])
+      word_ids = word_ids
+      labels.append(label)
+      features.append(word_ids)
+
+    features = keras.preprocessing.sequence.pad_sequences(
+        features, maxlen=max_words)
+    x_train = np.asarray(features, dtype=np.float32)
+    y_train = np.asarray(labels, dtype=np.int32).reshape((count, 1))
+    x_predict = x_train[:_GLOBAL_BATCH_SIZE]
+    return x_train, y_train, x_predict
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae32188917cce9209b8e51032ef808352bc257c
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
@@ -0,0 +1,171 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras DNN model using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import test
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.training import gradient_descent
+
+
+def all_strategy_combinations_with_eager_and_graph_modes():
+  return combinations.combine(distribution=keras_correctness_test_base.
+                              all_strategies,
+                              mode=['graph', 'eager'])
+
+
+def all_strategy_combinations_with_graph_mode():
+  return combinations.combine(distribution=keras_correctness_test_base.
+                              all_strategies, mode=['graph'])
+
+
+class TestDistributionStrategyDnnCorrectness(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      # We add few non-linear layers to make it non-trivial.
+      model = keras.Sequential()
+      model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
+      model.add(keras.layers.Dense(10, activation='relu'))
+      model.add(keras.layers.Dense(10, activation='relu'))
+      model.add(keras.layers.Dense(1))
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          loss=keras.losses.mean_squared_error,
+          optimizer=gradient_descent_keras.SGD(0.5),
+          metrics=['mse'])
+      return model
+
+  def get_data(self):
+    # TODO(xiejw): Change this back to 10000, once we support final partial
+    # batch.
+    num_samples = 9984
+    x_train = np.random.rand(num_samples, 1)
+    y_train = 3 * x_train
+    x_train = x_train.astype('float32')
+    y_train = y_train.astype('float32')
+    x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32)
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         all_strategy_and_input_config_combinations())
+  def test_dnn_correctness(self, distribution, use_numpy, use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+  @combinations.generate(all_strategy_combinations_with_graph_mode())
+  def test_dnn_with_dynamic_learning_rate(self, distribution):
+    self.run_dynamic_lr_test(distribution)
+
+
+class TestDistributionStrategyDnnMetricCorrectness(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, distribution=None):
+    with distribution.scope():
+      model = keras.Sequential()
+      model.add(keras.layers.Dense(1,
+                                   input_shape=(1,),
+                                   kernel_initializer='ones'))
+      model.compile(
+          loss=keras.losses.mean_squared_error,
+          optimizer=gradient_descent.GradientDescentOptimizer(0.5),
+          metrics=[keras.metrics.BinaryAccuracy()])
+    return model
+
+  def run_metric_correctness_test(self, distribution):
+    with self.cached_session():
+      self.set_up_test_config()
+      self.skip_unsupported_test_configuration(distribution)
+
+      x_train, y_train, _ = self.get_data()
+      model = self.get_model(distribution=distribution)
+
+      batch_size = 64
+      batch_size = (keras_correctness_test_base.
+                    get_batch_size(batch_size, distribution))
+      train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+      train_dataset = (keras_correctness_test_base.
+                       batch_wrapper(train_dataset, batch_size, distribution))
+
+      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
+      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
+
+  @combinations.generate(all_strategy_combinations_with_eager_and_graph_modes())
+  def test_simple_dnn_metric_correctness(self, distribution):
+    self.run_metric_correctness_test(distribution)
+
+
+class TestDistributionStrategyDnnMetricEvalCorrectness(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, distribution=None):
+    with distribution.scope():
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              3, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='mae',
+          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001))
+    return model
+
+  def run_eval_metrics_correctness_test(self, distribution):
+    with self.cached_session():
+      self.set_up_test_config()
+      self.skip_unsupported_test_configuration(distribution)
+
+      model = self.get_model(distribution=distribution)
+
+      # verify correctness of stateful and stateless metrics.
+      x = np.ones((100, 4)).astype('float32')
+      y = np.ones((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = (keras_correctness_test_base.
+                 batch_wrapper(dataset, 4, distribution))
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
+
+      y = np.zeros((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = (keras_correctness_test_base.
+                 batch_wrapper(dataset, 4, distribution))
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
+
+  @combinations.generate(all_strategy_combinations_with_eager_and_graph_modes())
+  def test_identity_model_metric_eval_correctness(self, distribution):
+    self.run_eval_metrics_correctness_test(distribution)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e881bb70ecc428e3f972cde5f19c1b61b1dc0f0b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
@@ -0,0 +1,150 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness test for tf.keras Embedding models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+class DistributionStrategyEmbeddingModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      if self.use_distributed_dense:
+        word_embed = keras.layers.TimeDistributed(keras.layers.Dense(4))(
+            word_embed)
+      avg = keras.layers.GlobalAveragePooling1D()(word_embed)
+      preds = keras.layers.Dense(2, activation='softmax')(avg)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_embedding_model_correctness(self, distribution, use_numpy,
+                                       use_validation_data):
+
+    self.use_distributed_dense = False
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_embedding_time_distributed_model_correctness(self,
+                                                        distribution,
+                                                        use_numpy,
+                                                        use_validation_data):
+    self.use_distributed_dense = True
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
+class DistributionStrategySiameseEmbeddingModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids_a = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words_a')
+      word_ids_b = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words_b')
+
+      def submodel(embedding, word_ids):
+        word_embed = embedding(word_ids)
+        rep = keras.layers.GlobalAveragePooling1D()(word_embed)
+        return keras.Model(inputs=[word_ids], outputs=[rep])
+
+      word_embed = keras.layers.Embedding(
+          input_dim=20,
+          output_dim=10,
+          input_length=max_words,
+          embeddings_initializer=keras.initializers.RandomUniform(0, 1))
+
+      a_rep = submodel(word_embed, word_ids_a).outputs[0]
+      b_rep = submodel(word_embed, word_ids_b).outputs[0]
+      sim = keras.layers.Dot(axes=1, normalize=True)([a_rep, b_rep])
+
+      model = keras.Model(inputs=[word_ids_a, word_ids_b], outputs=[sim])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='mse',
+          metrics=['mse'])
+    return model
+
+  def get_data(self,
+               count=(keras_correctness_test_base._GLOBAL_BATCH_SIZE *
+                      keras_correctness_test_base._EVAL_STEPS),
+               min_words=5,
+               max_words=10,
+               max_word_id=19,
+               num_classes=2):
+    features_a, labels_a, _ = (super(
+        DistributionStrategySiameseEmbeddingModelCorrectnessTest, self).
+                               get_data(count, min_words, max_words,
+                                        max_word_id, num_classes))
+
+    features_b, labels_b, _ = (super(
+        DistributionStrategySiameseEmbeddingModelCorrectnessTest, self).
+                               get_data(count, min_words, max_words,
+                                        max_word_id, num_classes))
+
+    y_train = np.zeros((count, 1), dtype=np.float32)
+    y_train[labels_a == labels_b] = 1.0
+    y_train[labels_a != labels_b] = -1.0
+    # TODO(b/123360757): Add tests for using list as inputs for multi-input
+    # models.
+    x_train = {
+        'words_a': features_a,
+        'words_b': features_b,
+    }
+    x_predict = x_train
+
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_siamese_embedding_model_correctness(self, distribution, use_numpy,
+                                               use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f625664372dfb6814ccbe9539f6abe018d2a4447
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
@@ -0,0 +1,92 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras CNN models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+class DistributionStrategyCnnCorrectnessTest(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      image = keras.layers.Input(shape=(28, 28, 3), name='image')
+      c1 = keras.layers.Conv2D(
+          name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4))(
+              image)
+      if self.with_batch_norm:
+        c1 = keras.layers.BatchNormalization(name='bn1')(c1)
+      c1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(c1)
+      logits = keras.layers.Dense(
+          10, activation='softmax', name='pred')(
+              keras.layers.Flatten()(c1))
+      model = keras.Model(inputs=[image], outputs=[logits])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+
+    return model
+
+  def get_data(self,
+               count=keras_correctness_test_base._GLOBAL_BATCH_SIZE
+               * keras_correctness_test_base._EVAL_STEPS,
+               shape=(28, 28, 3),
+               num_classes=10):
+    centers = np.random.randn(num_classes, *shape)
+
+    features = []
+    labels = []
+    for _ in range(count):
+      label = np.random.randint(0, num_classes, size=1)[0]
+      offset = np.random.normal(loc=0, scale=0.1, size=np.prod(shape))
+      offset = offset.reshape(shape)
+      labels.append(label)
+      features.append(centers[label] + offset)
+
+    x_train = np.asarray(features, dtype=np.float32)
+    y_train = np.asarray(labels, dtype=np.float32).reshape((count, 1))
+    x_predict = x_train
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         all_strategy_and_input_config_combinations())
+  def test_cnn_correctness(self, distribution, use_numpy, use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+  @combinations.generate(keras_correctness_test_base.
+                         all_strategy_and_input_config_combinations())
+  def test_cnn_with_batch_norm_correctness(self, distribution, use_numpy,
+                                           use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              with_batch_norm=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_lstm_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_lstm_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ed2dfa206cdf4be24a88b1d54090487c1873399
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_lstm_model_correctness_test.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras LSTM model using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+class DistributionStrategyLstmModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      lstm_embed = keras.layers.LSTM(units=4,
+                                     return_sequences=False)(word_embed)
+
+      preds = keras.layers.Dense(2, activation='softmax')(lstm_embed)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_lstm_model_correctness(self,
+                                  distribution,
+                                  use_numpy,
+                                  use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index cce93b3c10a2ac7bd1c594a5027b9d51629bb915..5349794334b7f6ea3d718343fa84c693dd3d7a3c 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -65,7 +65,8 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
 
     devices = ['/device:GPU:0', '/device:CPU:0']
     with distribution.scope():
-      (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn)
+      (var, m, v, op,
+       counter) = distribution.extended.call_for_each_replica(create_fn)
       self.evaluate(variables.global_variables_initializer())
       var_val = [2.0, 2.0, 2.0]
       self.assertAllClose(
diff --git a/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab56c01d862354bd74330f769502692bd8a8b982
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py
@@ -0,0 +1,99 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateful tf.keras LSTM models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+def strategies_for_stateful_embedding_model():
+  """Returns TPUStrategy with single core device assignment."""
+
+  return [combinations.tpu_strategy_loop_on_device_one_core,
+          combinations.tpu_strategy_one_step_loop_on_device_one_core]
+
+
+def test_combinations_for_stateful_embedding_model():
+  return (
+      combinations.combine(
+          distribution=strategies_for_stateful_embedding_model(),
+          mode='graph',
+          use_numpy=False,
+          use_validation_data=False
+      ))
+
+
+class DistributionStrategyStatefulLstmModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    batch_size = keras_correctness_test_base._GLOBAL_BATCH_SIZE
+
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,),
+          batch_size=batch_size,
+          dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      lstm_embed = keras.layers.LSTM(units=4,
+                                     return_sequences=False,
+                                     stateful=True)(word_embed)
+
+      preds = keras.layers.Dense(2, activation='softmax')(lstm_embed)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(test_combinations_for_stateful_embedding_model())
+  def test_stateful_lstm_model_correctness(self,
+                                           distribution,
+                                           use_numpy,
+                                           use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              is_stateful_model=True)
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_with_tpu_strategies())
+  def test_incorrectly_use_multiple_cores_for_stateful_lstm_model(
+      self, distribution, use_numpy, use_validation_data):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Single core must be used for computation '
+                                 'on stateful models. Consider adding '
+                                 '`device_assignment` parameter to '
+                                 'TPUStrategy'):
+      self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                                is_stateful_model=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 84e9aea228352e0a6010fe95529407818d020b5f..17ed87145984af96073c78cf4974527e558d3842 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import tempfile
 from absl.testing import parameterized
 import numpy as np
 
@@ -245,15 +246,32 @@ def all_strategy_combinations():
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-# TODO(priyag): Add v2 optimizers here.
+def all_strategy_combinations_minus_default():
+  strategy_minus_default_combinations = combinations.combine(
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager'])
+  return strategy_minus_default_combinations + tpu_strategy_combinations()
+
+
 def strategy_and_optimizer_combinations():
+  # TODO(b/122372746): Uncomment optimizers after they pass tests.
   return combinations.times(
       all_strategy_combinations(),
-      combinations.combine(
-          optimizer=[combinations.adagrad_optimizer_v1_fn,
-                     combinations.adam_optimizer_v1_fn,
-                     combinations.gradient_descent_optimizer_v1_fn,
-                     combinations.rmsprop_optimizer_v1_fn]))
+      combinations.combine(optimizer=[
+          combinations.adagrad_optimizer_v1_fn,
+          # combinations.adagrad_optimizer_keras_v2_fn,
+          combinations.adam_optimizer_v1_fn,
+          combinations.adam_optimizer_keras_v2_fn,
+          combinations.gradient_descent_optimizer_v1_fn,
+          combinations.gradient_descent_optimizer_keras_v2_fn,
+          combinations.rmsprop_optimizer_v1_fn,
+          # combinations.rmsprop_optimizer_keras_v2_fn
+      ]))
 
 
 def strategy_for_numpy_input_combinations():
@@ -417,15 +435,6 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_creating_var_with_numpy_arrays(self, distribution):
-    with self.cached_session():
-      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
-      var_x = distributed_training_utils.get_var_for_numpy(distribution, x)
-      val = self.evaluate(var_x.value())
-      # Verify that the numpy value is copied to the variable.
-      self.assertAllEqual(x, val)
-
   @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -564,26 +573,26 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         metrics = ['mae']
         model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      targets = np.zeros((64, 4), dtype=np.float32)
+        inputs = np.zeros((64, 3), dtype=np.float32)
+        targets = np.zeros((64, 4), dtype=np.float32)
 
-      # Call fit with validation data
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
-                validation_data=(inputs, targets))
+        # Call fit with validation data
+        model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
+                  validation_data=(inputs, targets))
 
-      # TODO(anjalisridhar): We need tests for when the batch size and steps are
-      # smaller and results in a 0 batch_size and steps value.
-      model.evaluate(inputs, targets)
-      # with steps
-      model.evaluate(inputs, targets, steps=2)
-      # with batch_size
-      model.evaluate(inputs, targets, batch_size=8)
+        # TODO(anjalisridhar): We need tests for when the batch size and steps
+        # are smaller and results in a 0 batch_size and steps value.
+        model.evaluate(inputs, targets)
+        # with steps
+        model.evaluate(inputs, targets, steps=2)
+        # with batch_size
+        model.evaluate(inputs, targets, batch_size=8)
 
-      model.predict(inputs)
-      # with steps
-      model.predict(inputs, steps=2)
-      # with batch_size
-      model.predict(inputs, batch_size=8)
+        model.predict(inputs)
+        # with steps
+        model.predict(inputs, steps=2)
+        # with batch_size
+        model.predict(inputs, batch_size=8)
 
   @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
@@ -937,9 +946,6 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   @combinations.generate(all_strategy_combinations())
   def testOptimizerWithCallbacks(self, distribution):
     with self.cached_session():
-      # TODO(b/120946189): Investigate why default strategy + eager fails.
-      if '_Default' in distribution.__class__.__name__:
-        self.skipTest('Disable the test for default strategy.')
       with distribution.scope():
         model = get_model()
         optimizer = gradient_descent_keras.SGD(0.01)
@@ -1045,14 +1051,12 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
       # Test with not specifying the `steps` argument.
       with self.assertRaisesRegexp(
-          ValueError, 'you should specify the `steps_per_epoch` argument'):
+          ValueError, 'the `steps_per_epoch` argument'):
         model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
+      with self.assertRaisesRegexp(ValueError, 'the `steps` argument'):
         model.evaluate(dataset, verbose=0)
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
+      with self.assertRaisesRegexp(ValueError, 'the `steps` argument'):
         model.predict(dataset, verbose=0)
 
   @combinations.generate(combinations.combine(
@@ -1119,12 +1123,15 @@ class TestDistributionStrategyWithLossMasking(test.TestCase,
 class TestDistributionStrategyWithNormalizationLayer(
     test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(all_strategy_combinations())
-  def test_batchnorm_correctness(self, distribution):
+  @combinations.generate(combinations.times(
+      all_strategy_combinations(),
+      combinations.combine(fused=[True, False])))
+  def test_batchnorm_correctness(self, distribution, fused):
     with self.cached_session():
       with distribution.scope():
         model = keras.models.Sequential()
-        norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
+        norm = keras.layers.BatchNormalization(
+            input_shape=(10,), momentum=0.8, fused=fused)
         model.add(norm)
         model.compile(loss='mse',
                       optimizer=gradient_descent.GradientDescentOptimizer(0.01))
@@ -1148,5 +1155,78 @@ class TestDistributionStrategyWithNormalizationLayer(
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
 
+class TestDistributionStrategySaveLoadWeights(test.TestCase,
+                                              parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_save_load_h5(self, distribution):
+    with self.cached_session():
+      dataset = get_dataset(distribution)
+      with distribution.scope():
+        model = get_model()
+        model.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp('.h5')
+        model.save_weights(weights_file)
+
+        model_2 = get_model()
+        model_2.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_save_load_checkpointable(self, distribution):
+    # TODO(sourabhbajaj): Test fails with optimizer v2 without h5
+    with self.cached_session():
+      dataset = get_dataset(distribution)
+      with distribution.scope():
+        model = get_model()
+        model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp()
+        model.save_weights(weights_file)
+
+        model_2 = get_model()
+        model_2.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+
+class TestDistributionStrategyValidation(test.TestCase,
+                                         parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_layer_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        with distribution.scope():
+          model = keras.Model(x, y)
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_model_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        model = keras.Model(x, y)
+        with distribution.scope():
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 32a0d199434e0627122fd4e47cf8894079ef3a1e..a663e809dd45ea099e1d8a08e681d07b05bee3c9 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -95,16 +95,15 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
 
   def _test_metric(self, distribution, dataset_fn, metric_fn, expected_fn):
     with ops.Graph().as_default(), distribution.scope():
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_initializable_iterator()
+      iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn())
       if isinstance(distribution, tpu_strategy.TPUStrategy):
         def step_fn(ctx, inputs):
-          value, update = distribution.call_for_each_replica(
+          value, update = distribution.extended.call_for_each_replica(
               metric_fn, args=(inputs,))
           ctx.set_non_tensor_output(name="value", output=value)
           return distribution.group(update)
 
-        ctx = distribution.run_steps_on_dataset(
+        ctx = distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=distribution.extended.steps_per_run)
         update = ctx.run_op
         value = ctx.non_tensor_outputs["value"]
@@ -114,15 +113,14 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
             distribution.num_replicas_in_sync *
             distribution.extended.steps_per_run)
       else:
-        value, update = distribution.call_for_each_replica(
+        value, update = distribution.extended.call_for_each_replica(
             metric_fn, args=(iterator.get_next(),))
         update = distribution.group(update)
         # TODO(josh11b): Once we switch to using a global batch size for input,
         # replace "distribution.num_replicas_in_sync" with "1".
         batches_per_update = distribution.num_replicas_in_sync
 
-      self.evaluate(iterator.initializer)
-      self.evaluate(distribution.initialize())
+      self.evaluate(iterator.initialize())
       self.evaluate(variables.local_variables_initializer())
 
       batches_consumed = 0
@@ -136,8 +134,6 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
         if batches_consumed >= 4:  # Consume 4 input batches in total.
           break
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(all_combinations() + tpu_combinations())
   def testMean(self, distribution):
     def _dataset_fn():
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index 824c4b09371fcc8d590f2d2b2be8f39b4a585b27..f06c9b75644b2890b7657f75e74e4e20a6f15705 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -41,12 +41,9 @@ from tensorflow.python.ops.losses import losses_impl
 
 class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
-  def _get_iterator(self, ds):
-    if context.executing_eagerly():
-      iterator = ds.make_one_shot_iterator()
-    else:
-      iterator = ds.make_initializable_iterator()
-      self.evaluate(iterator.initializer)
+  def _get_iterator(self, strategy, input_fn):
+    iterator = strategy.make_input_fn_iterator(lambda _: input_fn())
+    self.evaluate(iterator.initialize())
     return iterator
 
   @combinations.generate(
@@ -67,15 +64,15 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=(inputs,)))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=2).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -84,12 +81,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       weights, biases = [], []
       for _ in range(5):
         run_step()
-
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
@@ -105,11 +99,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
         return distribution.group(
-            distribution.call_for_each_replica(
+            distribution.extended.call_for_each_replica(
                 model_fn, args=(iterator.get_next(),)))
 
       if not context.executing_eagerly():
@@ -152,7 +146,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     # `distribution.scope`.
     with variable_scope.variable_creator_scope(
         appending_creator), distribution.scope():
-      model_fn, dataset_fn, layer = minimize_loss_example(
+      model_fn, dataset_fn, _ = minimize_loss_example(
           optimizer_fn,
           use_bias=True,
           use_callable_loss=True,
@@ -161,24 +155,21 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=(inputs,)))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
-
       run_step()
 
-      self.evaluate(distribution.finalize())
-
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
@@ -197,7 +188,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.assertEqual(
           get_expected_variables(optimizer_fn,
-                                 len(distribution.parameter_devices)),
+                                 len(distribution.extended.parameter_devices)),
           set(created_variables))
 
   @combinations.generate(
@@ -230,18 +221,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         fetches = distribution.unwrap(
-            distribution.call_for_each_replica(model_fn, args=(inputs,)))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
         if update_ops_in_cross_replica_mode:
           fetches += tuple(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
         return control_flow_ops.group(fetches)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -267,8 +258,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
               expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
           self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(
       combinations.times(
           combinations.combine(
@@ -327,15 +316,15 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=(inputs,)))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -370,8 +359,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
@@ -412,7 +399,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         return (train_op, loss)
 
       def step_fn(output_context, inputs):
-        (train_op, loss) = distribution.call_for_each_replica(
+        (train_op, loss) = distribution.extended.call_for_each_replica(
             model_fn, args=(output_context, inputs))
         output_context.set_last_step_output(
             name="cross_replica_loss_reduced",
@@ -423,7 +410,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             output=loss)
         return distribution.group(train_op)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
         initial_loss = lambda: constant_op.constant(1e7)
@@ -439,7 +426,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             "cross_replica_loss_not_reduced":
             distribution.unwrap(distribution.broadcast(initial_loss()))
         }
-        ctx = distribution.run_steps_on_dataset(
+        ctx = distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=2,
             initial_loop_values=initial_loop_values)
 
@@ -458,7 +445,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             reduced=False, distribution=distribution)
         return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -471,8 +457,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       loss_is_not_increasing = all(y <= x for x, y in zip(losses, losses[1:]))
       self.assertTrue(loss_is_not_increasing)
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 71e50b83b079bc73a7b178356f0f26adbd98638f..5391e083fc9b3ed99cc64bbed11bdeb8dea07f93 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -18,11 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import mirrored_strategy
-from tensorflow.python.distribute import values
 
 
 # pylint: disable=protected-access,invalid-name
@@ -48,8 +46,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   distributed environment.
 
   There are several important concepts for distributed TensorFlow, e.g.
-  `client`, `job`, 'task', `cluster`, `in-graph replication` and
-  'synchronous training' and they have already been defined in the
+  `client`, `job`, `task`, `cluster`, `in-graph replication` and
+  `synchronous training` and they have already been defined in the
   [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
   The distribution strategy inherits these concepts as well and in addition to
   that we also clarify several more concepts:
@@ -104,6 +102,61 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                                 auto_shard_dataset)
     super(MirroredStrategy, self).__init__(extended)
 
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def make_dataset_iterator(self, dataset):  # pylint: disable=useless-super-delegation
+    """Makes an iterator for input provided via `dataset`.
+
+    NOTE: The batch size of the `dataset` argument is treated differently for
+    this contrib version of `MirroredStrategy`.
+
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    per-replica batch size.
+
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return super(MirroredStrategy, self).make_dataset_iterator(dataset)
+
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def experimental_make_numpy_iterator(  # pylint: disable=useless-super-delegation
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
+
+    NOTE: The `batch_size` argument here has different behavior for this
+    contrib version of `MirroredStrategy`.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the per-replica
+        batch size. The global batch size will be this times
+        `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return super(MirroredStrategy, self).experimental_make_numpy_iterator(
+        numpy_input, batch_size, num_epochs, shuffle, session)
+
 
 class MirroredExtended(CoreMirroredExtended):
   """Implementation of (contrib) MirroredStrategy."""
@@ -135,19 +188,10 @@ class MirroredExtended(CoreMirroredExtended):
     Returns:
       An `InputIterator` which returns inputs for each step of the computation.
     """
-    return values.DatasetIterator(dataset, self._input_workers)
-
-  def _distribute_dataset(self, dataset_fn):
-    if self._local_mode:
-      return values.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._input_workers, 0)
-    else:
-      return values.MultiWorkerDataset(
-          functools.partial(self._call_dataset_fn, dataset_fn),
-          self._input_workers,
-          auto_shard=self._auto_shard_dataset)
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """The contrib version of Mirrored strategy uses per-replica batch size."""
     return False
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index f4becf1d6291cc0c7e2bdbc3911394764412b037..d6337d106fced921b8bda0a2faac99c2a77fab8e 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -66,8 +66,10 @@ GPU_TEST = "test_gpu" in sys.argv[0]
         combinations.core_mirrored_strategy_with_gpu_and_cpu,
         combinations.core_mirrored_strategy_with_two_gpus],
     mode=["graph", "eager"]))
-class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
-                                        parameterized.TestCase):
+class MirroredTwoDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
   def testMinimizeLoss(self, distribution):
     if context.executing_eagerly():
@@ -114,9 +116,30 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
                                  expected_values)
 
+  def testNumpyIterator(self, distribution):
+    self._test_numpy_iterator(distribution)
+
   def testGlobalStepUpdate(self, distribution):
     self._test_global_step_update(distribution)
 
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 def one_device_combinations():
   return combinations.combine(
@@ -128,25 +151,42 @@ def one_device_combinations():
       mode=["graph", "eager"])
 
 
+@combinations.generate(one_device_combinations())
 class MirroredOneDeviceDistributionTest(
     strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.OneDeviceDistributionTestBase,
     parameterized.TestCase):
 
-  @combinations.generate(one_device_combinations())
   def testMinimizeLoss(self, distribution):
     if context.executing_eagerly():
       self._test_minimize_loss_eager(distribution)
     else:
       self._test_minimize_loss_graph(distribution)
 
-  @combinations.generate(one_device_combinations())
   def testReplicaId(self, distribution):
     self._test_replica_id(distribution)
 
-  @combinations.generate(one_device_combinations())
   def testCallAndMergeExceptions(self, distribution):
     self._test_call_and_merge_exceptions(distribution)
 
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 class MirroredStrategyVariableCreatorStackTest(
     test.TestCase, parameterized.TestCase):
@@ -221,11 +261,13 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
   # TODO(priyag): Modify more tests to use this helper and check more
   # properties.
-  def _test_mv_properties(self, var, name):
+  def _test_mv_properties(self, var, name, strategy):
     self.assertIsInstance(var, values.MirroredVariable)
     self.assertEqual(name, var.name)
+    self.assertIs(strategy, var.distribute_strategy)
     for d in var.devices:
       self.assertEqual(d, var.get(d).device)
+      self.assertIs(strategy, var.get(d)._distribute_strategy)  # pylint: disable=protected-access
 
   def testVariableInFuncGraph(self, distribution):
     def model_fn():
@@ -237,8 +279,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       v1 = variable_scope.variable(1.0, name="foo")
       v2 = distribution.extended.call_for_each_replica(model_fn)
 
-    self._test_mv_properties(v1, "foo:0")
-    self._test_mv_properties(v2, "bar:0")
+    self._test_mv_properties(v1, "foo:0", distribution)
+    self._test_mv_properties(v2, "bar:0", distribution)
 
   def testSingleVariable(self, distribution):
     def model_fn():
@@ -251,7 +293,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
-      self._test_mv_properties(result, "foo:0")
+      self._test_mv_properties(result, "foo:0", distribution)
 
   def testUnnamedVariable(self, distribution):
     def model_fn():
@@ -261,7 +303,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
-      self._test_mv_properties(result, "Variable:0")
+      self._test_mv_properties(result, "Variable:0", distribution)
 
   def testMultipleVariables(self, distribution):
     def model_fn():
@@ -274,7 +316,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
       for i, v in enumerate(result):
-        self._test_mv_properties(v, "foo" + str(i) + ":0")
+        self._test_mv_properties(v, "foo" + str(i) + ":0", distribution)
 
   def testMultipleVariablesWithSameCanonicalName(self, distribution):
     def model_fn():
@@ -324,14 +366,9 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
                 (layer2.kernel, layer2.bias),
                 (layer3.kernel, layer3.bias)]
 
-    ds = distribution.distribute_dataset(
-        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
-    if context.executing_eagerly():
-      iterator = ds.make_one_shot_iterator()
-    else:
-      iterator = ds.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
+    iterator = distribution.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
+    self.evaluate(iterator.initialize())
     features = iterator.get_next()
 
     with distribution.scope():
@@ -693,6 +730,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
           distribution.extended.worker_devices[0]).read_value()))
       self.assertEqual(10.0, self.evaluate(ret_v_sum))
 
+  def testVarDistributeStrategy(self, distribution):
+    with distribution.scope():
+      mirrored = variable_scope.variable(1.0)
+      replica_local = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ)
+      self.assertIs(distribution, mirrored.distribute_strategy)
+      self.assertIs(distribution, replica_local.distribute_strategy)
+
 
 @combinations.generate(combinations.combine(
     distribution=[
@@ -1215,7 +1261,7 @@ class MirroredStrategyDefunTest(test.TestCase):
                             self.evaluate(device_result))
 
       for defun in defuns:
-        # PolymorphicFunctions are specialized to the current device stack, so
+        # `Function`s are specialized to the current device stack, so
         # call_for_each has one trace per device. To check that the expected set
         # of variables was accessed on each trace, we first retrieve each
         # device-specific graph function.
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
index 17b7ab74f63f42e1ee14a82d3bffdd1df9b25857..53e35ea6b75088a3de9866973f872da4a4ce25d6 100644
--- a/tensorflow/contrib/distribute/python/monitor.py
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -51,7 +51,7 @@ class Monitor(object):
     else:
       if session is None:
         raise ValueError("Should provide a `session` in Graph mode.")
-      session.run(step_callable._iterator.initializer)  # pylint: disable=protected-access
+      session.run(step_callable.initialize())
       self._run_step = session.make_callable(step_callable())
       session.run(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
index 8f13e9153ea7a951dd722c4549882c97e79b57fe..c4622cdd2af2f6a9c936fe554bcc2eb76f805fdc 100644
--- a/tensorflow/contrib/distribute/python/moving_averages_test.py
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -53,7 +53,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       return var, assign
 
     with distribution.scope(), self.cached_session() as sess:
-      var, assign = distribution.call_for_each_replica(replica_fn)
+      var, assign = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([10.0, 11.0], var.eval())
       sess.run(distribution.unwrap(assign))
@@ -79,7 +79,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       return var, assign.op
 
     with distribution.scope(), self.cached_session() as sess:
-      var, assign_op = distribution.call_for_each_replica(replica_fn)
+      var, assign_op = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([0.0, 0.0], var.eval())
       sess.run(distribution.unwrap(assign_op))
@@ -152,7 +152,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       return var, assign
 
     with distribution.scope(), self.cached_session() as sess:
-      var, assign = distribution.call_for_each_replica(replica_fn)
+      var, assign = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([10.0, 11.0], var.eval())
       sess.run(distribution.unwrap(assign))
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 5986bc4661f2615a16fcd8d5bf503f1f0dd3d504..24d6a443fe15c9b9ff34b7e6d3a5bc5a2bb7abfb 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import values
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -51,41 +51,38 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
     super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device
     self._default_device = device
-    worker = device_util.canonicalize("/device:CPU:0")
-    worker_device_pairs = [(worker, [self._device])]
+    self._input_device = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(self._input_device, [self._device])]
     device_map = values.SingleDeviceMap(device)
-    self._input_workers = values.InputWorkers(device_map, worker_device_pairs)
+    self._input_workers = input_lib.InputWorkers(
+        device_map, worker_device_pairs)
 
   def _create_variable(self, next_creator, *args, **kwargs):
     colocate_with = kwargs.pop("colocate_with", None)
     if colocate_with is None:
       with ops.device(self._device):
         return next_creator(*args, **kwargs)
-    if isinstance(colocate_with, six.string_types):
-      with ops.device(colocate_with):
-        return next_creator(*args, **kwargs)
-    if (isinstance(colocate_with, (list, tuple)) and len(colocate_with) == 1 and
-        isinstance(colocate_with[0], six.string_types)):
-      with ops.device(colocate_with[0]):
-        return next_creator(*args, **kwargs)
     with ops.colocate_with(colocate_with):
       return next_creator(*args, **kwargs)
 
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate(colocate_with_variable, self)
+
   def _make_dataset_iterator(self, dataset):
     """Make iterator from dataset without splitting the batch."""
-    return values.DatasetIterator(dataset, self._input_workers)
-
-  def _distribute_dataset(self, dataset_fn):
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._input_workers, 0)
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   def _make_input_fn_iterator(
       self,
       input_fn,
       replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    return values.InputFunctionIterator(
+    return input_lib.InputFunctionIterator(
         input_fn, self._input_workers, [distribute_lib.InputContext()])
 
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._input_device, session)
+
   def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
@@ -97,7 +94,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
 
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
@@ -198,6 +195,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for OneDeviceStrategy."""
     return True
 
 
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index d46cd6f529e363f76bfa2b22339add63530cfde8..f81466a6c75f1cf287cdb00917872f77383c615e 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -25,7 +25,9 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
 
-class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
+class OneDeviceStrategyTest(
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.OneDeviceDistributionTestBase):
 
   def _get_distribution_strategy(self):
     return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
@@ -57,6 +59,28 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
     self._test_input_fn_iterator(
         iterator, d.extended.worker_devices, expected_values)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testNumpyIterator(self):
+    self._test_numpy_iterator(self._get_distribution_strategy())
+
+  def testAllReduceSum(self):
+    self._test_all_reduce_sum(self._get_distribution_strategy())
+
+  def testAllReduceSumGradients(self):
+    self._test_all_reduce_sum_gradients(self._get_distribution_strategy())
+
+  def testAllReduceSumGradientTape(self):
+    self._test_all_reduce_sum_gradient_tape(self._get_distribution_strategy())
+
+  def testAllReduceMean(self):
+    self._test_all_reduce_mean(self._get_distribution_strategy())
+
+  def testAllReduceMeanGradients(self):
+    self._test_all_reduce_mean_gradients(self._get_distribution_strategy())
+
+  def testAllReduceMeanGradientTape(self):
+    self._test_all_reduce_mean_gradient_tape(self._get_distribution_strategy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index fa4705af7cb592119f56686d1f693a156f7b4b13..e388061b17a9b92dedbbf9839049b13c8575a22c 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -41,21 +41,17 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
     with distribution.scope():
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
-
-      ds = distribution.distribute_dataset(dataset_fn)
-      if context.executing_eagerly():
-        iterator = ds.make_one_shot_iterator()
-      else:
-        iterator = ds.make_initializable_iterator()
+      iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn())
 
       def run_step():
-        return control_flow_ops.group(distribution.unwrap(
-            distribution.call_for_each_replica(
-                model_fn, args=(iterator.get_next(),))))
+        return control_flow_ops.group(
+            distribution.unwrap(
+                distribution.extended.call_for_each_replica(
+                    model_fn, args=(iterator.get_next(),))))
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
-          sess.run(iterator.initializer)
+          sess.run(iterator.initialize())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 5029d59641a25364d02874bd945af15147debc24..e42bc50fdc4e5e93c998708b0790fdea7768faf2 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,34 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
-from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.distribute import values
-from tensorflow.python.eager import context
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import device_setter
-from tensorflow.python.util import nest
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import parameter_server_strategy
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+
+# pylint: disable=protected-access,invalid-name,line-too-long
+CoreParameterServerStrategy = parameter_server_strategy.ParameterServerStrategy
+CoreParameterServerExtended = parameter_server_strategy.ParameterServerStrategyExtended
 
-_LOCAL_CPU = "/device:CPU:0"
-_LOCAL_GPU_0 = "/device:GPU:0"
+# pylint: enable=protected-access,invalid-name,line-too-long
 
 
-# TODO(yuefengz): maybe cache variables on local CPU.
-# TODO(yuefengz): we may want to set session options to disallow communication
-# between workers.
 class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   """A parameter server DistributionStrategy.
 
+  *** contrib version ***
+
   This strategy class works for both local training and between-graph replicated
   training for multiple workers. If `cluster_spec` is specified, either passed
   in to __init__() method or parsed from the
@@ -80,9 +70,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   variables.
 
   3) It is also not recommended to open a colocation scope (i.e. calling
-  `tf.colocate_with`) under the strategy's scope. For colocating variables,
-  use `distribution.colocate_vars_with` instead. Colocation of ops will possibly
-  create conflicts of device assignment.
+  `tf.colocate_with`) under the strategy's scope. For colocating variables, use
+  `strategy.extended.colocate_vars_with` instead. Colocation of ops will
+  possibly create conflicts of device assignment.
   """
 
   def __init__(self, num_gpus_per_worker=0):
@@ -99,433 +89,84 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     super(ParameterServerStrategy, self).__init__(
         ParameterServerExtended(self, num_gpus_per_worker))
 
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def make_dataset_iterator(self, dataset):  # pylint: disable=useless-super-delegation
+    """Makes an iterator for input provided via `dataset`.
 
-class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
-  """Implementation of ParameterServerStrategy."""
+    NOTE: The batch size of the `dataset` argument is treated differently for
+    this contrib version of `ParameterServerStrategy`.
 
-  def __init__(self, container_strategy, num_gpus_per_worker):
-    super(ParameterServerExtended, self).__init__(container_strategy)
-    self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local(num_gpus_per_worker)
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    per-replica batch size.
 
-    # We typically don't need to do all-reduce in this strategy.
-    self._cross_device_ops = (
-        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
-            reduce_to_device=_LOCAL_CPU))
-
-  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
-                               task_type, task_id):
-    """Initialize devices for multiple workers.
-
-    It creates variable devices and compute devices. Variables and operations
-    will be assigned to them respectively. We have one compute device per
-    replica. The variable device is a device function or device string. The
-    default variable device assigns variables to parameter servers in a
-    round-robin fashion.
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
 
     Args:
-      num_gpus_per_worker: number of local GPUs or GPUs per worker.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type.
-      task_id: the current task id.
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
 
-    Raises:
-      ValueError: if the cluster_spec doesn't have ps jobs.
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    assert cluster_spec
-    if not task_type or task_id is None:
-      raise ValueError("When `cluster_spec` is given, you must also specify "
-                       "`task_type` and `task_id`")
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-
-    worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)
-
-    # Define compute devices which is a list of device strings and one for each
-    # replica. When there are GPUs, replicate operations on these GPUs.
-    # Otherwise, place operations on CPU.
-    if num_gpus_per_worker > 0:
-      compute_devices = tuple(
-          "%s/device:GPU:%d" % (worker_device, i)
-          for i in range(num_gpus_per_worker)
-      )
-    else:
-      compute_devices = (worker_device,)
-
-    self._device_map = values.ReplicaDeviceMap(compute_devices)
-    self._input_workers = values.InputWorkers(
-        self._device_map, [(worker_device, compute_devices)])
-
-    # In distributed mode, place variables on ps jobs in a round-robin fashion.
-    # Note that devices returned from `replica_device_setter` are not
-    # canonical and therefore we don't canonicalize all variable devices to
-    # make them consistent.
-    # TODO(yuefengz): support passing a strategy object to control variable
-    # assignment.
-    # TODO(yuefengz): merge the logic of replica_device_setter into this
-    # class.
-    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
-    if num_ps_replicas == 0:
-      raise ValueError("The cluster spec needs to have `ps` jobs.")
-    self._variable_device = device_setter.replica_device_setter(
-        ps_tasks=num_ps_replicas,
-        worker_device=worker_device,
-        merge_devices=True,
-        cluster=cluster_spec)
-
-    # The `_parameter_devices` is needed for the `parameter_devices` property
-    # and is a list of all variable devices. Here parameter devices are all
-    # tasks of the "ps" job.
-    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
-                                        range(num_ps_replicas)))
-
-    # Add a default device so that ops without specified devices will not end up
-    # on other workers.
-    self._default_device = worker_device
-
-    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
-                                                task_id)
-    self._cluster_spec = cluster_spec
-    self._task_type = task_type
-    self._task_id = task_id
-
-    logging.info(
-        "Multi-worker ParameterServerStrategy with "
-        "cluster_spec = %r, task_type = %r, task_id = %r, "
-        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
-        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
-        num_ps_replicas, self._is_chief, self._device_map,
-        self._variable_device)
-
-  def _initialize_local(self, num_gpus_per_worker):
-    """Initialize internal devices for local training."""
-    worker_device = device_util.canonicalize("/device:CPU:0")
-    # Define compute devices which is a list of device strings and one for each
-    # replica. When there are GPUs, replicate operations on these GPUs.
-    # Otherwise, place operations on CPU.
-    if num_gpus_per_worker > 0:
-      compute_devices = tuple(
-          map("/device:GPU:{}".format, range(num_gpus_per_worker)))
-    else:
-      compute_devices = (_LOCAL_CPU,)
-
-    self._device_map = values.ReplicaDeviceMap(compute_devices)
-    self._input_workers = values.InputWorkers(
-        self._device_map, [(worker_device, compute_devices)])
-
-    # If there is only one GPU, put everything on that GPU. Otherwise, place
-    # variables on CPU.
-    if num_gpus_per_worker == 1:
-      assert len(compute_devices) == 1
-      self._variable_device = _LOCAL_GPU_0
-      self._parameter_devices = (_LOCAL_GPU_0,)
-    else:
-      self._variable_device = _LOCAL_CPU
-      self._parameter_devices = (_LOCAL_CPU,)
-
-    self._is_chief = True
-    self._cluster_spec = None
-    self._task_type = None
-    self._task_id = None
-
-    logging.info(
-        "ParameterServerStrategy with compute_devices = %r, "
-        "variable_device = %r", compute_devices, self._variable_device)
-
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._input_workers, 0,
-        prefetch_on_device=True)
-
-  def _make_dataset_iterator(self, dataset):
-    return values.DatasetIterator(dataset, self._input_workers,
-                                  self._num_replicas_in_sync)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    """Distributes the dataset to each local GPU."""
-    if self._cluster_spec:
-      input_pipeline_id = multi_worker_util.id_in_cluster(
-          self._cluster_spec, self._task_type, self._task_id)
-      num_input_pipelines = multi_worker_util.worker_count(
-          self._cluster_spec, self._task_type)
-    else:
-      input_pipeline_id = 0
-      num_input_pipelines = 1
-    input_context = distribute_lib.InputContext(
-        num_input_pipelines=num_input_pipelines,
-        input_pipeline_id=input_pipeline_id,
-        num_replicas_in_sync=self._num_replicas_in_sync)
-    return values.InputFunctionIterator(
-        input_fn, self._input_workers, [input_context])
-
-  def _broadcast_to(self, tensor, destinations):
-    # This is both a fast path for Python constants, and a way to delay
-    # converting Python values to a tensor until we know what type it
-    # should be converted to. Otherwise we have trouble with:
-    #   global_step.assign_add(1)
-    # since the `1` gets broadcast as an int32 but global_step is int64.
-    if isinstance(tensor, (float, int)):
-      return tensor
-    if not cross_device_ops_lib.check_destinations(destinations):
-      # TODO(josh11b): Use current logical device instead of 0 here.
-      destinations = values.LogicalDeviceSpec(
-          device_map=self._device_map, logical_device=0)
-    return self._cross_device_ops.broadcast(tensor, destinations)
-
-  def _allow_variable_partition(self):
-    return not context.executing_eagerly()
-
-  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
-  # this creator, such as "MutableHashTable".
-  def _create_variable(self, next_creator, *args, **kwargs):
-    if self._num_replicas_in_sync > 1:
-      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
-      if aggregation not in (
-          vs.VariableAggregation.NONE,
-          vs.VariableAggregation.SUM,
-          vs.VariableAggregation.MEAN,
-          vs.VariableAggregation.ONLY_FIRST_REPLICA
-      ):
-        raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                         " for variable: " + kwargs["name"])
-
-      def var_creator(*args, **kwargs):
-        """Create an AggregatingVariable and fix up collections."""
-        # Record what collections this variable should be added to.
-        collections = kwargs.pop("collections", None)
-        if collections is None:
-          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-        kwargs["collections"] = []
-
-        # Create and wrap the variable.
-        v = next_creator(*args, **kwargs)
-        wrapped = values.AggregatingVariable(v, aggregation)
-
-        # Add the wrapped variable to the requested collections.
-        # The handling of eager mode and the global step matches
-        # ResourceVariable._init_from_args().
-        if not context.executing_eagerly():
-          g = ops.get_default_graph()
-          # If "trainable" is True, next_creator() will add the contained
-          # variable to the TRAINABLE_VARIABLES collection, so we manually
-          # remove it and replace with the wrapper. We can't set "trainable"
-          # to False for next_creator() since that causes functions like
-          # implicit_gradients to skip those variables.
-          if kwargs.get("trainable", True):
-            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l.remove(v)
-          g.add_to_collections(collections, wrapped)
-        elif ops.GraphKeys.GLOBAL_STEP in collections:
-          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
-
-        return wrapped
-    else:
-      var_creator = next_creator
-
-    if "colocate_with" in kwargs:
-      with ops.device(None):
-        with ops.colocate_with(kwargs["colocate_with"]):
-          return var_creator(*args, **kwargs)
-
-    with ops.colocate_with(None, ignore_existing=True):
-      with ops.device(self._variable_device):
-        return var_creator(*args, **kwargs)
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    # pylint: disable=protected-access
-    return mirrored_strategy._call_for_each_replica(
-        self._container_strategy(), self._device_map, fn, args, kwargs)
-
-  def _verify_destinations_not_different_worker(self, destinations):
-    if not self._cluster_spec:
-      return
-    if destinations is None:
-      return
-    for d in cross_device_ops_lib.get_devices_from(destinations):
-      d_spec = tf_device.DeviceSpec.from_string(d)
-      if d_spec.job == self._task_type and d_spec.task != self._task_id:
-        raise ValueError(
-            "Cannot reduce to another worker: %r, current worker is %r" %
-            (d, self._input_workers.worker_devices[0]))
+    return super(ParameterServerStrategy, self).make_dataset_iterator(dataset)
 
-  def _reduce_to(self, reduce_op, value, destinations):
-    self._verify_destinations_not_different_worker(destinations)
-    if not isinstance(value, values.DistributedValues):
-      # pylint: disable=protected-access
-      return cross_device_ops_lib.reduce_non_distributed_value(
-          reduce_op, self._device_map, value, destinations)
-    return self._cross_device_ops.reduce(
-        reduce_op, value, destinations=destinations)
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def experimental_make_numpy_iterator(  # pylint: disable=useless-super-delegation
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
 
-  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
-    for _, destinations in value_destination_pairs:
-      self._verify_destinations_not_different_worker(destinations)
-    return self._cross_device_ops.batch_reduce(reduce_op,
-                                               value_destination_pairs)
-
-  def _select_single_value(self, structured):
-    """Select any single values in `structured`."""
-
-    def _select_fn(x):  # pylint: disable=g-missing-docstring
-      if isinstance(x, values.Mirrored):
-        if len(x.devices) == 1:
-          return x.primary
-        else:
-          raise ValueError(
-              "You cannot update variable with a Mirrored object with multiple "
-              "components %r when using ParameterServerStrategy. You must "
-              "specify a single value or a Mirrored with a single value." % x)
-      elif isinstance(x, values.PerReplica):
-        raise ValueError(
-            "You cannot update variable with a PerReplica object %r when using "
-            "ParameterServerStrategy. You must specify a single value or a "
-            "Mirrored with a single value" % x)
-      else:
-        return x
-
-    return nest.map_structure(_select_fn, structured)
-
-  def _update(self, var, fn, args, kwargs, group):
-    if isinstance(var, values.AggregatingVariable):
-      var = var.get()
-    if not isinstance(var, resource_variable_ops.ResourceVariable):
-      raise ValueError(
-          "You can not update `var` %r. It must be a Variable." % var)
-    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
-      result = fn(var, *self._select_single_value(args),
-                  **self._select_single_value(kwargs))
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    with ops.device(
-        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
-      result = fn(*args, **kwargs)
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      return val.values
-    return (val,)
-
-  def value_container(self, val):
-    if (hasattr(val, "_aggregating_container") and
-        not isinstance(val, values.AggregatingVariable)):
-      wrapper = val._aggregating_container()  # pylint: disable=protected-access
-      if wrapper is not None:
-        return wrapper
-    return val
-
-  def read_var(self, var):
-    # No need to distinguish between normal variables and replica-local
-    # variables.
-    return array_ops.identity(var)
-
-  def _configure(self,
-                 session_config=None,
-                 cluster_spec=None,
-                 task_type=None,
-                 task_id=None):
-    """Configures the strategy class.
-
-    The strategy object will be re-initialized if `cluster_spec` is given but
-    was not passed in the constructor.
+    NOTE: The `batch_size` argument here has different behavior for this
+    contrib version of `ParameterServerStrategy`.
 
     Args:
-      session_config: not used currently.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type.
-      task_id: the current task id.
-
-    Raises:
-      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
-        not.
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the per-replica
+        batch size. The global batch size will be this times
+        `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    if not self._cluster_spec and cluster_spec:
-      # If a `cluster_spec` is already passed in, do nothing here.
-      # TODO(yuefengz): check `cluster_spec` is the same if this object has
-      # already been initialized with a `cluster_spec`.
-      if task_type is None or task_id is None:
-        raise ValueError("When `cluster_spec` is given, must also specify "
-                         "`task_type` and `task_id`.")
-      self._cluster_spec = multi_worker_util.normalize_cluster_spec(
-          cluster_spec)
-      self._task_type = task_type
-      self._task_id = task_id
-      self._initialize_multi_worker(self._num_gpus_per_worker,
-                                    self._cluster_spec, task_type, task_id)
-
-    if session_config:
-      session_config.CopyFrom(self._update_config_proto(session_config))
-
-  def _update_config_proto(self, config_proto):
-    updated_config = copy.deepcopy(config_proto)
-    if not self._cluster_spec:
-      updated_config.isolate_session_state = True
-      return updated_config
-
-    updated_config.isolate_session_state = False
-
-    assert self._task_type
-    assert self._task_id is not None
+    return super(ParameterServerStrategy,
+                 self).experimental_make_numpy_iterator(
+                     numpy_input, batch_size, num_epochs, shuffle, session)
 
-    # The device filters prevent communication between workers.
-    if self._task_type not in ["chief", "worker"]:
-      return updated_config
-    del updated_config.device_filters[:]
-    updated_config.device_filters.extend(
-        ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
-    return updated_config
 
-  @property
-  def _num_replicas_in_sync(self):
-    return self._device_map.num_replicas_in_graph
-
-  @property
-  def worker_devices(self):
-    return self._device_map.all_devices
-
-  @property
-  def worker_devices_by_replica(self):
-    return self._device_map.devices_by_replica
-
-  @property
-  def parameter_devices(self):
-    return self._parameter_devices
-
-  def non_slot_devices(self, var_list):
-    return min(var_list, key=lambda x: x.name)
-
-  @property
-  def experimental_between_graph(self):
-    # TODO(yuefengz): Should this return False in the local case?
-    return True
-
-  @property
-  def experimental_should_init(self):
-    return self._is_chief
+class ParameterServerExtended(CoreParameterServerExtended):
+  """Implementation of ParameterServerStrategy."""
 
-  @property
-  def should_checkpoint(self):
-    return self._is_chief
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    # Use TFConfigClusterResolver to parse TF_CONFIG. We don't want to change
+    # the constructor's interface to allow customized cluster resolver. Use
+    # SimpleClusterResolver to override num_accelerators.
+    tfconfig = TFConfigClusterResolver()
+    cluster_resolver = SimpleClusterResolver(
+        cluster_spec=tfconfig.cluster_spec(),
+        task_type=tfconfig.task_type,
+        task_id=tfconfig.task_id,
+        num_accelerators=num_gpus_per_worker)
+    super(ParameterServerExtended, self).__init__(
+        container_strategy, cluster_resolver=cluster_resolver)
 
-  @property
-  def should_save_summary(self):
-    return self._is_chief
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """The contrib version of PS strategy uses per-replica batch size."""
     return False
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 805c643e679338467bf576e17baa8bf839f3b292..89dcdbcfc2f1f9d8cd46db9ccf133be08ff89533 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -29,10 +29,13 @@ from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import parameter_server_strategy as core_parameter_server_strategy
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
@@ -45,10 +48,12 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
+from tensorflow.python.training.server_lib import ClusterSpec
 
 CHIEF = run_config.TaskType.CHIEF
 WORKER = run_config.TaskType.WORKER
@@ -62,6 +67,57 @@ def _get_replica_id_integer():
   return replica_id
 
 
+class MockCoreParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """Mock the strategy to allow cluster resolver as an argument."""
+
+  def __init__(self, cluster_resolver):
+    super(MockCoreParameterServerStrategy, self).__init__(
+        core_parameter_server_strategy.ParameterServerStrategyExtended(
+            self, cluster_resolver=cluster_resolver))
+
+
+def create_test_objects(cluster_spec=None,
+                        task_type=None,
+                        task_id=None,
+                        num_gpus=None,
+                        sess_config=None,
+                        use_core_strategy=False):
+  sess_config = sess_config or config_pb2.ConfigProto()
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  if use_core_strategy:
+    if cluster_spec and task_type and task_id is not None:
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators=num_gpus)
+      target = 'grpc://' + cluster_spec[WORKER][task_id]
+    else:
+      cluster_resolver = SimpleClusterResolver(
+          ClusterSpec({}), num_accelerators=num_gpus)
+      target = ''
+
+    distribution = MockCoreParameterServerStrategy(cluster_resolver)
+    sess_config = copy.deepcopy(sess_config)
+    sess_config = distribution.update_config_proto(sess_config)
+  else:
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=num_gpus)
+    if task_type:
+      sess_config = copy.deepcopy(sess_config)
+      distribution.configure(
+          session_config=sess_config,
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id)
+      target = 'grpc://' + cluster_spec[WORKER][task_id]
+    else:
+      target = ''
+
+  return distribution, target, sess_config
+
+
 class ParameterServerStrategyTestBase(
     multi_worker_test_base.MultiWorkerTestBase):
 
@@ -75,24 +131,27 @@ class ParameterServerStrategyTestBase(
     self._sess_config = config_pb2.ConfigProto(allow_soft_placement=True)
     super(ParameterServerStrategyTestBase, self).setUp()
 
-  def _get_test_objects(self, task_type, task_id, num_gpus):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=num_gpus)
-    if not task_type:
-      return distribution, '', self._sess_config
-
-    sess_config = copy.deepcopy(self._sess_config)
-    distribution.configure(
-        session_config=sess_config,
+  def _get_test_objects(self,
+                        task_type,
+                        task_id,
+                        num_gpus,
+                        use_core_strategy=False):
+    return create_test_objects(
         cluster_spec=self._cluster_spec,
         task_type=task_type,
-        task_id=task_id)
-    return (distribution, 'grpc://' + self._cluster_spec[WORKER][task_id],
-            sess_config)
-
-  def _test_device_assignment_distributed(self, task_type, task_id, num_gpus):
+        task_id=task_id,
+        num_gpus=num_gpus,
+        sess_config=self._sess_config,
+        use_core_strategy=use_core_strategy)
+
+  def _test_device_assignment_distributed(self,
+                                          task_type,
+                                          task_id,
+                                          num_gpus,
+                                          use_core_strategy=False):
     worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
-    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
+    d, _, sess_config = self._get_test_objects(
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     with ops.Graph().as_default(), \
          self.cached_session(target=self._default_target,
                              config=sess_config) as sess, \
@@ -131,7 +190,7 @@ class ParameterServerStrategyTestBase(
                          '/job:worker/replica:0/task:0/%s' % last_part_device)
 
         # The colocate_vars_with can override the distribution's device.
-        with d.colocate_vars_with(x):
+        with d.extended.colocate_vars_with(x):
           y = variable_scope.get_variable(
               'y', initializer=20.0,
               aggregation=variable_scope.VariableAggregation.SUM)
@@ -177,7 +236,7 @@ class ParameterServerStrategyTestBase(
         self.assertIn('/job:ps/', h.device)
         return y_add, z_add, f
 
-      y, z, f = d.call_for_each_replica(model_fn)
+      y, z, f = d.extended.call_for_each_replica(model_fn)
       self.assertNotEqual(y, None)
       self.assertNotEqual(z, None)
       self.assertNotEqual(f, None)
@@ -190,9 +249,10 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(f_val, 46.0)
 
   def _test_device_assignment_distributed_enable_partitioner(
-      self, task_type, task_id, num_gpus):
-    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
-    num_shards = len(d.parameter_devices)
+      self, task_type, task_id, num_gpus, use_core_strategy=False):
+    d, _, sess_config = self._get_test_objects(
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
+    num_shards = len(d.extended.parameter_devices)
     partitioner = partitioned_variables.fixed_size_partitioner(num_shards)
     with ops.Graph().as_default(), \
          self.cached_session(target=self._default_target,
@@ -224,39 +284,18 @@ class ParameterServerStrategyTestBase(
           self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
           self.assertEqual(var.device, x_add[part_id].device)
 
-        # The colocate_vars_with can override the distribution's device.
-        with d.colocate_vars_with(x_add[0]):
-          y = variable_scope.get_variable(
-              'y',
-              initializer=constant_op.constant([20.0, 10.0]),
-              aggregation=variable_scope.VariableAggregation.SUM,
-              partitioner=partitioner)
-        y_add = y.assign_add(
-            [array_ops.identity(x_add[0]),
-             array_ops.identity(x_add[1])])
-
-        for part_id, var in enumerate(y):
-          self.assertEqual(var.device, '/job:ps/task:0')
-          self.assertEqual(y_add[part_id].device, var.device)
-          self.assertEqual(var.device, x_add[0].device)
+        return x_add
 
-        return x_add, y_add
-
-      x, y = d.call_for_each_replica(model_fn)
+      x = d.extended.call_for_each_replica(model_fn)
 
       if context.num_gpus() >= 1:
         variables.global_variables_initializer().run()
-        x_val, y_val = sess.run([x, y])
+        x_val = sess.run(x)
         if num_gpus < 1:
           self.assertEqual(x_val, [13.0, 25.0])
-          self.assertEqual(y_val, [33.0, 35.0])
         else:
           x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus]
-          y_expect = [
-              20.0 + x_expect[0] * num_gpus, 10.0 + x_expect[1] * num_gpus
-          ]
           self.assertEqual(x_val, x_expect)
-          self.assertEqual(y_val, y_expect)
 
   def _test_device_assignment_local(self,
                                     d,
@@ -305,7 +344,7 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))
 
         # The colocate_vars_with can override the distribution's device.
-        with d.colocate_vars_with(x):
+        with d.extended.colocate_vars_with(x):
           y = variable_scope.get_variable(
               'y', initializer=20.0,
               aggregation=variable_scope.VariableAggregation.SUM)
@@ -348,7 +387,7 @@ class ParameterServerStrategyTestBase(
             device_util.canonicalize(h.device))
         return y_add, z_add, f
 
-      y, z, f = d.call_for_each_replica(model_fn)
+      y, z, f = d.extended.call_for_each_replica(model_fn)
       self.assertNotEqual(y, None)
       self.assertNotEqual(z, None)
       self.assertNotEqual(f, None)
@@ -360,9 +399,13 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
-  def _test_simple_increment(self, task_type, task_id, num_gpus):
+  def _test_simple_increment(self,
+                             task_type,
+                             task_id,
+                             num_gpus,
+                             use_core_strategy=False):
     d, master_target, sess_config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     if d.extended._cluster_spec:
       num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
       if 'chief' in d.extended._cluster_spec.as_dict():
@@ -395,7 +438,7 @@ class ParameterServerStrategyTestBase(
         train_op = control_flow_ops.group(x_add, y_add, z_add)
         return x, y, z, train_op
 
-      x, y, z, train_op = d.call_for_each_replica(model_fn)
+      x, y, z, train_op = d.extended.call_for_each_replica(model_fn)
       train_op = d.group(train_op)
 
       if context.num_gpus() < d.extended._num_gpus_per_worker:
@@ -430,9 +473,13 @@ class ParameterServerStrategyTestBase(
               y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and
               z_val == 30.0 + 1.0 * num_workers)
 
-  def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
+  def _test_minimize_loss_graph(self,
+                                task_type,
+                                task_id,
+                                num_gpus,
+                                use_core_strategy=False):
     d, master_target, sess_config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     if task_type:
       # Multi-worker
       assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
@@ -472,7 +519,7 @@ class ParameterServerStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=(one,))
+        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
@@ -484,7 +531,7 @@ class ParameterServerStrategyTestBase(
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
-                d.update(v, update, g, grouped=False)):
+                d.extended.update(v, update, args=(g,), group=False)):
               after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
@@ -518,10 +565,15 @@ class ParameterServerStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
-  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
-                              expected_values):
+  def _test_input_fn_iterator(self,
+                              task_type,
+                              task_id,
+                              num_gpus,
+                              input_fn,
+                              expected_values,
+                              use_core_strategy=False):
     distribution, master_target, config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     devices = distribution.extended.worker_devices
 
     with ops.Graph().as_default(), \
@@ -551,9 +603,11 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(expected_value, computed_value)
 
 
-class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
-                                  strategy_test_lib.DistributionTestBase,
-                                  parameterized.TestCase):
+class ParameterServerStrategyTest(
+    ParameterServerStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -561,66 +615,93 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2)
     cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
 
-  def test_num_replicas_in_sync(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def test_num_replicas_in_sync(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
     # All the devices on a given worker are in sync which in this case is the
     # number of gpus on each worker.
-    self.assertEqual(2, distribution.num_replicas_in_sync)
+    self.assertEqual(2, strategy.num_replicas_in_sync)
 
-  def testDeviceAssignmentLocalCPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=0)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalCPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=0, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
+        strategy, compute_device='CPU', variable_device='CPU', num_gpus=0)
 
-  def testDeviceAssignmentLocalOneGPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=1)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalOneGPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=1, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
+        strategy, compute_device='GPU', variable_device='GPU', num_gpus=1)
 
-  def testDeviceAssignmentLocalTwoGPUs(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
+        strategy, compute_device='GPU', variable_device='CPU', num_gpus=2)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributed(self, num_gpus):
-    self._test_device_assignment_distributed('worker', 1, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy):
+    self._test_device_assignment_distributed(
+        'worker', 1, num_gpus, use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus):
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus,
+                                                       use_core_strategy):
     self._test_device_assignment_distributed_enable_partitioner(
-        'worker', 1, num_gpus)
+        'worker', 1, num_gpus, use_core_strategy=use_core_strategy)
 
-  def testSimpleBetweenGraph(self):
-    self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testSimpleBetweenGraph(self, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_simple_increment,
+        self._cluster_spec,
+        context.num_gpus(),
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testLocalSimpleIncrement(self, num_gpus):
-    self._test_simple_increment(None, 0, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testLocalSimpleIncrement(self, num_gpus, use_core_strategy):
+    self._test_simple_increment(None, 0, num_gpus, use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraphDistributed(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraphLocal(self, num_gpus):
-    self._test_minimize_loss_graph(None, None, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
+    self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)
 
   # TODO(priyag): Refactor this and other multi worker tests.
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
-  def testMakeInputFnIteratorDistributed(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     dataset_fn = lambda: dataset_ops.Dataset.range(100)
@@ -632,12 +713,21 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=3,
         expected_input_pipeline_id=1)  # because task_id = 1
-    self._test_input_fn_iterator('worker', 1, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        'worker',
+        1,
+        num_gpus,
+        input_fn,
+        expected_values,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
-  def testMakeInputFnIteratorLocal(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     dataset_fn = lambda: dataset_ops.Dataset.range(100)
@@ -649,23 +739,31 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)  # only one worker and pipeline for local.
-    self._test_input_fn_iterator(None, None, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        None,
+        None,
+        num_gpus,
+        input_fn,
+        expected_values,
+        use_core_strategy=use_core_strategy)
 
-  def testGlobalStepUpdate(self):
-    strategy = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepUpdate(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(use_core_strategy=use_core_strategy)
     self._test_global_step_update(strategy)
 
-  def testUpdateConfigProtoMultiWorker(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProtoMultiWorker(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    strategy.configure(
         cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
 
     config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
 
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify device filters.
     self.assertEqual(['/job:worker/task:1', '/job:ps'],
@@ -674,16 +772,48 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
     # Verify isolate_session_state
     self.assertFalse(new_config.isolate_session_state)
 
-  def testUpdateConfigProtoLocal(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProtoLocal(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
 
     config_proto = config_pb2.ConfigProto()
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify isolate_session_state
     self.assertTrue(new_config.isolate_session_state)
 
+  def testAllReduceSum(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                            parameterized.TestCase):
@@ -694,20 +824,31 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2, has_chief=True)
     cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
 
-  def testSimpleBetweenGraph(self):
-    self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testSimpleBetweenGraph(self, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_simple_increment,
+        self._cluster_spec,
+        context.num_gpus(),
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraph(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraph(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
-  def testGlobalStepIsWrapped(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    with ops.Graph().as_default(), distribution.scope():
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
       created_step = training_util.create_global_step()
       get_step = training_util.get_global_step()
       self.assertEqual(created_step, get_step,
@@ -716,19 +857,55 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                              id(get_step), get_step.__class__.__name__)))
       self.assertIs(values.AggregatingVariable, type(created_step))
       self.assertIs(values.AggregatingVariable, type(get_step))
+      self.assertIs(strategy, created_step.distribute_strategy)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=1, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
+      created_step = training_util.create_global_step()
+      get_step = training_util.get_global_step()
+      self.assertEqual(created_step, get_step,
+                       msg=('created_step %s type %s vs. get_step %s type %s' %
+                            (id(created_step), created_step.__class__.__name__,
+                             id(get_step), get_step.__class__.__name__)))
+      self.assertIs(resource_variable_ops.ResourceVariable, type(created_step))
+      self.assertIs(resource_variable_ops.ResourceVariable, type(get_step))
+      # All variables have an _distribute_strategy parameter. Only variable
+      # subclasses in distribution strategy expose it publicly.
+      self.assertFalse(hasattr(strategy, 'distribute_strategy'))
+      self.assertIs(strategy, created_step._distribute_strategy)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testValueContainer(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
 
-  def testValueContainer(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    with ops.Graph().as_default(), distribution.scope():
       def f():
         with backprop.GradientTape() as tape:
           v = variable_scope.get_variable('v', initializer=10.0)
           _ = v * v
         v, = tape.watched_variables()
-        w = distribution.extended.value_container(v)
+        w = strategy.extended.value_container(v)
         self.assertIs(values.AggregatingVariable, type(w))
-      distribution.extended.call_for_each_replica(f)
+
+      strategy.extended.call_for_each_replica(f)
+
+
+class LocalParameterServerStrategyTest(strategy_test_lib.DistributionTestBase,
+                                       parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(mode=['graph', 'eager'],
+                                              use_core_strategy=[True, False],
+                                              required_gpus=2))
+  def testNumpyIterator(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    self._test_numpy_iterator(strategy)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index faeb96bcb7c516b1e494661ef2cbe8dad476ab55..27aad46b97195aa498d0382f08c04c312cebbe65 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
 from tensorflow.python.training import optimizer as optimizer_lib
 
 
@@ -33,6 +32,9 @@ class Step(object):
   def distribution(self):
     return self._distribution
 
+  def initialize(self):
+    return []
+
   def __call__(self):
     """Perform one step of this training algorithm."""
     raise NotImplementedError("must be implemented in descendants")
@@ -50,12 +52,10 @@ class StandardInputStep(Step):
 
   def __init__(self, dataset_fn, distribution):
     super(StandardInputStep, self).__init__(distribution)
-    self._distributed_input = distribution.distribute_dataset(dataset_fn)
-    if context.executing_eagerly():
-      self._iterator = self._distributed_input.make_one_shot_iterator()
-    else:
-      # TODO(priyag): Expose initializer via some initializer property.
-      self._iterator = self._distributed_input.make_initializable_iterator()
+    self._iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn())
+
+  def initialize(self):
+    return self._iterator.initialize()
 
 
 class StandardSingleLossStep(StandardInputStep):
@@ -99,7 +99,7 @@ class StandardSingleLossStep(StandardInputStep):
         gradients_fn = backprop.implicit_grad(self._loss_fn)
         gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
 
-        grads_and_vars = self.distribution.call_for_each_replica(
+        grads_and_vars = self.distribution.extended.call_for_each_replica(
             gradients_fn, args=(ctx, inputs))
         # If threads use layers, then we need to run the first step
         # sequentially, so that layers.build() is not executed in parallel.
@@ -109,6 +109,6 @@ class StandardSingleLossStep(StandardInputStep):
             self.distribution, grads_and_vars)
 
       # TODO(priyag): Return the outputs, context, etc as well.
-      ctx = self.distribution.run_steps_on_dataset(
+      ctx = self.distribution.extended.experimental_run_steps_on_iterator(
           step_fn, self._iterator, self._iterations_per_step)
       return ctx.run_op
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index 1ff9b9ceec13351b098d47ed3ff62f689a625a31..9f48560b2666036e149a63c98b6529fb24cc5067 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -45,24 +45,21 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
       single_loss_step, layer = single_loss_example(
           optimizer_fn, distribution, use_bias=True, iterations_per_step=2)
 
-      self.evaluate(distribution.initialize())
       if context.executing_eagerly():
+        single_loss_step.initialize()
         run_step = single_loss_step
       else:
         with self.cached_session() as sess:
-          sess.run(single_loss_step._iterator.initializer)
+          sess.run(single_loss_step.initialize())
           run_step = sess.make_callable(single_loss_step())
       self.evaluate(variables.global_variables_initializer())
 
       weights, biases = [], []
       for _ in range(5):
         run_step()
-
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index 6e5280e35632d3f3cb6a4fe172a15fb7f508354c..2e2ee92b6e20471f367895ea53c0864bb3d1dae7 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
@@ -31,6 +34,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -41,25 +45,26 @@ class _TestException(Exception):
   pass
 
 
-# May be the argument to either distribution.call_for_each_replica() or
+# May be the argument to either distribution.extended.call_for_each_replica() or
 # get_replica_context().merge_call()
 def _raise_exception_fn(_=None):
   raise _TestException()
 
 
-# Must be the argument to a distribution.call_for_each_replica() call, calls a
-# get_replica_context().merge_call() that raises an exception.
+# Must be the argument to a distribution.extended.call_for_each_replica() call,
+# calls a get_replica_context().merge_call() that raises an exception.
 def _merge_raises_fn():
   ds_context.get_replica_context().merge_call(_raise_exception_fn)
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
-# dist.call_for_each_replica() with a function that raises an exception.
+# dist.extended.call_for_each_replica() with a function that raises an
+# exception.
 def _call_raises_fn(dist):
-  dist.call_for_each_replica(_raise_exception_fn)
+  dist.extended.call_for_each_replica(_raise_exception_fn)
 
 
-# Must be the argument to a distribution.call_for_each_replica() call,
+# Must be the argument to a distribution.extended.call_for_each_replica() call,
 # calls a get_replica_context().merge_call() that calls a
 # call_for_each_replica() that raises an exception.
 def _merge_call_raises_fn():
@@ -67,15 +72,16 @@ def _merge_call_raises_fn():
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
-# dist.call_for_each_replica() with a function that calls a
+# dist.extended.call_for_each_replica() with a function that calls a
 # get_replica_context().merge_call() that raises an exception.
 def _call_merge_raises_fn(dist):
-  dist.call_for_each_replica(_merge_raises_fn)
+  dist.extended.call_for_each_replica(_merge_raises_fn)
 
 
-# Must be the argument to a distribution.call_for_each_replica() call, calls a
-# get_replica_context().merge_call() that calls a call_for_each_replica() that
-# calls a get_replica_context().merge_call() that raises an exception.
+# Must be the argument to a distribution.extended.call_for_each_replica() call,
+# calls a get_replica_context().merge_call() that calls a
+# call_for_each_replica() that calls a get_replica_context().merge_call() that
+# raises an exception.
 def _merge_call_merge_raises_fn():
   ds_context.get_replica_context().merge_call(_call_merge_raises_fn)
 
@@ -106,7 +112,7 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=(one,))
+        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
@@ -118,8 +124,8 @@ class DistributionTestBase(test.TestCase):
           with ops.control_dependencies([fetched]):
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
-            with ops.control_dependencies(d.update(
-                v, update, g, grouped=False)):
+            with ops.control_dependencies(d.extended.update(
+                v, update, args=(g,), group=False)):
               after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
@@ -162,7 +168,7 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=(one,))
+        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
@@ -173,8 +179,8 @@ class DistributionTestBase(test.TestCase):
           with ops.control_dependencies([fetched]):
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
-            with ops.control_dependencies(d.update(
-                v, update, g, grouped=False)):
+            with ops.control_dependencies(d.extended.update(
+                v, update, args=(g,), group=False)):
               after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
@@ -202,20 +208,20 @@ class DistributionTestBase(test.TestCase):
         self.assertFalse(expected_devices[replica_id])
         expected_devices[replica_id] = True
 
-      d.call_for_each_replica(mark_devices_fn)
+      d.extended.call_for_each_replica(mark_devices_fn)
       self.assertAllEqual(expected_devices,
                           [True] * len(d.extended.worker_devices))
 
   def _test_call_and_merge_exceptions(self, dist):
     with dist.scope():
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_raise_exception_fn)
+        dist.extended.call_for_each_replica(_raise_exception_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_merge_raises_fn)
+        dist.extended.call_for_each_replica(_merge_raises_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_merge_call_raises_fn)
+        dist.extended.call_for_each_replica(_merge_call_raises_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_merge_call_merge_raises_fn)
+        dist.extended.call_for_each_replica(_merge_call_merge_raises_fn)
 
   def _input_fn_to_test_input_context(self,
                                       dataset_fn,
@@ -287,8 +293,195 @@ class DistributionTestBase(test.TestCase):
         value = global_step.read_value()
         return train_op, value
 
-      train_ops, value = strategy.call_for_each_replica(model_fn)
+      train_ops, value = strategy.extended.call_for_each_replica(model_fn)
       self.evaluate(strategy.group(train_ops))
       global_step_tensors = strategy.unwrap(value)
       global_step_values = self.evaluate(global_step_tensors)
       self.assertEqual((1,) * len(global_step_tensors), global_step_values)
+
+  def _test_numpy_iterator(self, strategy):
+    with strategy.scope(), self.cached_session() as sess:
+      x = np.asarray([[1, 2], [6, 12], [2, 4],
+                      [5, 10], [3, 6], [4, 8]])
+      y = np.asarray([5, 4, 3, 2, 1, 0])
+      batch_size = 6
+      if not strategy.extended._global_batch_size:  # pylint: disable=protected-access
+        batch_size = batch_size // strategy.num_replicas_in_sync
+      i = strategy.experimental_make_numpy_iterator(
+          (x, y), batch_size=batch_size, num_epochs=2, shuffle=None,
+          session=sess)
+      self.evaluate(i.initialize())
+
+      def run_and_concatenate(strategy, i):
+        x, y = strategy.experimental_run(lambda z: z, i)
+        x, y = self.evaluate((strategy.unwrap(x), strategy.unwrap(y)))
+        return np.concatenate(x), np.concatenate(y)
+
+      x_1, y_1 = run_and_concatenate(strategy, i)
+      self.assertAllEqual(x, x_1)
+      self.assertAllEqual(y, y_1)
+      x_2, y_2 = run_and_concatenate(strategy, i)
+      self.assertAllEqual(x, x_2)
+      self.assertAllEqual(y, y_2)
+      with self.assertRaises(errors.OutOfRangeError):
+        run_and_concatenate(strategy, i)
+
+
+class OneDeviceDistributionTestBase(test.TestCase):
+  """Some tests that should work with any one-device DistributionStrategy."""
+
+  def _test_all_reduce_sum(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_sum, inputs=(4., [42., 43.]), expected=(4., [42., 43.]))
+
+  def _test_all_reduce_sum_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_sum, inputs=[4.], expected_grads=[4.])
+
+  def _test_all_reduce_sum_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_sum, inputs=[4.], expected_grads=[4.])
+
+  def _test_all_reduce_mean(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_mean, inputs=(2., [21., 22.]), expected=(2., [21., 22.]))
+
+  def _test_all_reduce_mean_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_mean, inputs=[5.], expected_grads=[5.])
+
+  def _test_all_reduce_mean_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_mean, inputs=[5.], expected_grads=[5.])
+
+  def _test_collective_comms(self, strategy, comm_fn, inputs, expected):
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    outputs = self.evaluate(
+        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+    self.assertAllEqual([expected[0]], outputs[0])
+    self.assertAllEqual([expected[1]], outputs[1])
+
+  def _test_collective_comms_gradients(
+      self, strategy, comm_fn, inputs, expected_grads):
+    if context.executing_eagerly():
+      self.skipTest("`tf.gradients` is not supported with eager execution.")
+
+    def step(c):
+      x = constant_op.constant(42.)
+      y = comm_fn(x) * c
+      return gradients_impl.gradients(y, [x])[0]
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+  def _test_collective_comms_gradient_tape(
+      self, strategy, comm_fn, inputs, expected_grads):
+    def step(c):
+      x = constant_op.constant(42.)
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = comm_fn(x) * c
+      return tape.gradient(y, x)
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+
+class TwoDeviceDistributionTestBase(test.TestCase):
+  """Some tests that should work with any two-device DistributionStrategy."""
+
+  def _test_all_reduce_sum(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_sum,
+        inputs=([1., 3.], [[39., 2.], [3., 41.]]),
+        expected=(4., [42., 43.]))
+
+  def _test_all_reduce_sum_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_sum, inputs=[1., 3.], expected_grads=[4., 4.])
+
+  def _test_all_reduce_sum_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_sum, inputs=[1., 3.], expected_grads=[4., 4.])
+
+  def _test_all_reduce_mean(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_mean,
+        inputs=([1., 3.], [[39., 2.], [3., 41.]]),
+        expected=(2., [21., 21.5]))
+
+  def _test_all_reduce_mean_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_mean, inputs=[1., 3.], expected_grads=[2., 2.])
+
+  def _test_all_reduce_mean_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_mean, inputs=[1., 3.], expected_grads=[2., 2.])
+
+  def _test_collective_comms(self, strategy, comm_fn, inputs, expected):
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    outputs = self.evaluate(
+        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+    self.assertAllEqual([expected[0], expected[0]], outputs[0])
+    self.assertAllEqual([expected[1], expected[1]], outputs[1])
+
+  def _test_collective_comms_gradients(
+      self, strategy, comm_fn, inputs, expected_grads):
+    if context.executing_eagerly():
+      self.skipTest("`tf.gradients` is not supported with eager execution.")
+
+    def step(c):
+      x = constant_op.constant(42.)
+      y = comm_fn(x) * c
+      return gradients_impl.gradients(y, [x])[0]
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+  def _test_collective_comms_gradient_tape(
+      self, strategy, comm_fn, inputs, expected_grads):
+    def step(c):
+      x = constant_op.constant(42.)
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = comm_fn(x) * c
+      return tape.gradient(y, x)
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+
+def _all_sum(value):
+  ctx = ds_context.get_replica_context()
+  return ctx.all_reduce(reduce_util.ReduceOp.SUM, value)
+
+
+def _all_mean(value):
+  ctx = ds_context.get_replica_context()
+  return ctx.all_reduce(reduce_util.ReduceOp.MEAN, value)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 7352203fe11b3036229119e06872aed5e160b715..4387210062e42bb1ab7e2351008a45979224ff1a 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,10 +21,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
-import functools
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import device_assignment as device_assignment_lib
+from tensorflow.contrib.tpu.python.tpu import topology
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
@@ -33,12 +35,15 @@ from tensorflow.python.client import session as session_lib
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
 from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -50,6 +55,29 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
+def initialize_tpu_system(cluster_resolver=None):
+  """Initialize the TPU devices in a separate session and graph.
+
+  Args:
+    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
+        which provides information about the TPU cluster.
+  Returns:
+    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
+  """
+  if cluster_resolver is None:
+    cluster_resolver = resolver_lib.TPUClusterResolver("")
+  master = cluster_resolver.master()
+
+  logging.info("Initializing the TPU system.")
+  session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+
+  with ops.Graph().as_default():
+    with session_lib.Session(config=session_config, target=master) as sess:
+      serialized_topology = sess.run(tpu.initialize_system())
+  logging.info("Finished initializing TPU system.")
+  return topology.Topology(serialized=serialized_topology)
+
+
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -68,12 +96,13 @@ def get_tpu_system_metadata(tpu_cluster_resolver):
 
 # TODO(jhseu): Deduplicate with MirroredStrategy?
 def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
-    device_map, logical_device, real_mirrored_creator, *args, **kwargs):
+    strategy, device_map, logical_device, real_mirrored_creator,
+    *args, **kwargs):
   # Figure out what collections this variable should be added to.
   # We'll add the TPUMirroredVariable to those collections instead.
-  collections = kwargs.pop("collections", None)
-  if collections is None:
-    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  var_collections = kwargs.pop("collections", None)
+  if var_collections is None:
+    var_collections = [ops.GraphKeys.GLOBAL_VARIABLES]
   kwargs["collections"] = []
 
   # TODO(jhseu): Should we have different behavior for different
@@ -101,7 +130,8 @@ def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
     devices = device_map.logical_to_actual_devices(logical_device)
     value_list = real_mirrored_creator(devices, *args, **kwargs)
     result = values.TPUMirroredVariable(
-        device_map, value_list, aggregation, logical_device=logical_device)
+        strategy, device_map, value_list, aggregation,
+        logical_device=logical_device)
 
   if not context.executing_eagerly():
     g = ops.get_default_graph()
@@ -111,11 +141,11 @@ def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
     # "trainable" to False for next_creator() since that causes functions
     # like implicit_gradients to skip those variables.
     if kwargs.get("trainable", True):
-      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      var_collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
       l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
       for v in value_list:
         l.remove(v)
-    g.add_to_collections(collections, result)
+    g.add_to_collections(var_collections, result)
   return result
 
 
@@ -125,7 +155,8 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def __init__(self,
                tpu_cluster_resolver=None,
                steps_per_run=None,
-               num_cores=None):
+               device_assignment=None,
+               **kwargs):
     """Initializes the TPUStrategy object.
 
     Args:
@@ -136,31 +167,82 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
           metrics, summaries etc.
           This parameter is only used when Distribution Strategy is used with
           estimator or keras.
-      num_cores: Number of cores to use on the TPU. If None specified, then
-          auto-detect the cores and topology of the TPU system.
+      device_assignment: Optional `tf.contrib.tpu.DeviceAssignment` to specify
+          the placement of replicas on the TPU cluster. Currently only supports
+          the usecase of using a single core within a TPU cluster.
+      **kwargs: Additional experimental flags. Will be removed in future.
     """
+    if len(kwargs) > 1:
+      raise ValueError("TPUStrategy constructor only takes one experimental "
+                       "flag now")
+    elif len(kwargs) == 1 and "_disable_training_loop_on_host" not in kwargs:
+      raise ValueError("TPUStrategy constructor does not support arguments: "
+                       "{}".format(kwargs))
+
     super(TPUStrategy, self).__init__(TPUExtended(
-        self, tpu_cluster_resolver, steps_per_run, num_cores))
+        self, tpu_cluster_resolver, steps_per_run, device_assignment,
+        kwargs.get("_disable_training_loop_on_host", False)))
 
   @property
   def steps_per_run(self):
     """DEPRECATED: use .extended.steps_per_run instead."""
     return self._extended.steps_per_run
 
+  # TODO(cjfj): Modify `_call_for_each_replica` in `TPUExtended` such that this
+  # can use the default implementation.
+  # This implementation runs a single step. It does not use infeed or outfeed.
+  def experimental_run(self, fn, input_iterator=None):
+    """See base class."""
+    if context.executing_eagerly():
+      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
+
+    if self.extended._disable_training_loop_on_host:  # pylint: disable=protected-access
+      raise NotImplementedError(
+          "`experimental_run` is not compatible with "
+          "`_disable_training_loop_on_host=True`")
+
+    if input_iterator is None:
+      inputs = []
+    else:
+      inputs = input_iterator.get_next()
+
+    result = [None]
+    def replicated_fn(replica_id, inputs):
+      """Wraps user function to provide replica ID and `Tensor` inputs."""
+      with _TPUReplicaContext(self, replica_id_in_sync_group=replica_id):
+        if input_iterator is None:
+          result[0] = fn()
+        else:
+          result[0] = fn(inputs)
+      return result[0]
+
+    replicate_inputs = []  # By replica.
+    for i in range(self.num_replicas_in_sync):
+      replicate_inputs.append(
+          [constant_op.constant(i, dtype=dtypes.int32),
+           values.select_replica(i, inputs)])
+
+    with self.scope():
+      replicate_outputs = tpu.replicate(replicated_fn, replicate_inputs)
+
+    # Workaround for `tpu.replicate` behaviour when single `Tensor` returned.
+    replicate_outputs = [
+        nest.pack_sequence_as(result[0], nest.flatten(replica_outputs))
+        for replica_outputs in replicate_outputs]
+
+    device_map = self.extended._device_map  # pylint: disable=protected-access
+    return values.regroup(device_map, replicate_outputs)
+
 
 class TPUExtended(distribute_lib.DistributionStrategyExtended):
   """Implementation of TPUStrategy."""
 
-  # Track what TPU devices have been initialized. This is *intentionally*
-  # shared across all instances of TPUExtended as we want to keep track of which
-  # devices are initialized globally.
-  _initialized_devices = []
-
   def __init__(self,
                container_strategy,
                tpu_cluster_resolver=None,
                steps_per_run=None,
-               num_cores=None):
+               device_assignment=None,
+               disable_training_loop_on_host=False):
     super(TPUExtended, self).__init__(container_strategy)
 
     if tpu_cluster_resolver is None:
@@ -173,8 +255,22 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
-    # TODO(sourabhbajaj): Change this from num_cores to metadata_override
-    self._num_cores_override = num_cores
+    self._device_assignment = device_assignment
+    self._disable_training_loop_on_host = disable_training_loop_on_host
+
+    # Device assignment is currently only supported for 1 core case.
+    if self._device_assignment:
+      assert isinstance(self._device_assignment,
+                        device_assignment_lib.DeviceAssignment)
+      if self._device_assignment.num_replicas != 1:
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+      if self._device_assignment.num_cores_per_replica != 1:
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+      if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
 
     # TODO(jhseu): Switch to DeviceAssignment to support pods and model
     # parallelism.
@@ -188,45 +284,33 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
     self._device_map = values.ReplicaDeviceMap(self._tpu_devices)
 
-    # For input:
-    input_device_map = values.ReplicaDeviceMap(tuple(
-        self.get_host_cpu_device(hid) for hid in range(self.num_hosts)))
-    worker_devices = [
-        (self.get_host(hid), [self.get_host_cpu_device(hid)])
-        for hid in range(self.num_hosts)
-    ]
-    self._input_workers = values.InputWorkers(input_device_map, worker_devices)
+    # If the training loop is on the device, we must use the infeed, with input
+    # on the host. Otherwise, we preload the data onto the TPUs.
+    if disable_training_loop_on_host:
+      input_device_map = values.ReplicaDeviceMap(tuple(
+          self.get_host_cpu_device(hid) for hid in range(self.num_hosts)))
+      worker_devices = [
+          (self.get_host(hid), [self.get_host_cpu_device(hid)])
+          for hid in range(self.num_hosts)
+      ]
+      self._input_workers = input_lib.InputWorkers(
+          input_device_map, worker_devices)
+    else:
+      input_worker_devices = collections.OrderedDict()
+      for tpu_device in self._tpu_devices:
+        host_device = _get_host_for_device(tpu_device)
+        input_worker_devices.setdefault(host_device, [])
+        input_worker_devices[host_device].append(tpu_device)
+      self._input_workers = input_lib.InputWorkers(
+          self._device_map, tuple(input_worker_devices.items()))
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
     self._require_static_shapes = True
 
-    # Initialize the TPU devices.
-    self._initialize_tpu()
-
-  def _initialize_tpu(self):
-    """Initialize the TPU devices in a separate session and graph.
-
-    We keep track of all the TPU devices that we're initialized as we should
-    only be running TPU initialize once for the entire process.
-    """
-    master = self._tpu_cluster_resolver.master()
-    # Verify TPU has not already been initialized in this process.
-    if master in TPUExtended._initialized_devices:
-      logging.info("TPU master %s has already been initialized." % master)
-      return
-
-    logging.info("Initializing the TPU system.")
-    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-    self._configure(session_config)
-    with ops.Graph().as_default():
-      with session_lib.Session(config=session_config, target=master) as sess:
-        sess.run([tpu.initialize_system()])
-    logging.info("Finized initializing TPU system.")
-
-    # Update Strategy state to make sure we can track device initialization.
-    TPUExtended._initialized_devices.append(master)
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate_tpu_variable(colocate_with_variable, self)
 
   def _get_enqueue_op_per_host(self, host_id, multi_worker_iterator,
                                input_shapes, iterations):
@@ -291,20 +375,44 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
   def _make_dataset_iterator(self, dataset):
     """Make iterators for each of the TPU hosts."""
-
-    return values.DatasetIterator(dataset, self._input_workers,
-                                  self._num_replicas_in_sync)
-
-  def _distribute_dataset(self, dataset_fn):
-    return values.MultiWorkerDataset(
-        functools.partial(self._call_dataset_fn, dataset_fn),
-        self._input_workers)
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    input_contexts = []
+    num_workers = self._input_workers.num_workers
+    for i in range(num_workers):
+      input_contexts.append(distribute_lib.InputContext(
+          num_input_pipelines=num_workers,
+          input_pipeline_id=i,
+          num_replicas_in_sync=self._num_replicas_in_sync))
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, input_contexts)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, numpy_dataset.SingleDevice(self.get_host_cpu_device(0)),
+        session)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
   def _experimental_run_steps_on_iterator(
       self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
+    if self._disable_training_loop_on_host:
+      impl = self._run_steps_on_iterator_with_device_loop
+    else:
+      impl = self._run_steps_on_iterator_with_host_loop
+
+    return impl(
+        fn=fn, multi_worker_iterator=multi_worker_iterator,
+        iterations=iterations, initial_loop_values=initial_loop_values)
+
+  def _run_steps_on_iterator_with_host_loop(
+      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
     output_shapes = multi_worker_iterator.output_shapes
     shapes = nest.flatten(output_shapes)
     if any(not s.is_fully_defined() for s in shapes):
@@ -312,26 +420,16 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
           "TPU currently requires fully defined shapes. Either use "
           "set_shape() on the input tensors or use "
           "dataset.batch(..., drop_remainder=True).")
-    types = nest.flatten(multi_worker_iterator.output_types)
-
-    enqueue_ops = [
-        self._get_enqueue_op_per_host(host_id, multi_worker_iterator, shapes,
-                                      iterations)
-        for host_id in range(self.num_hosts)]
-
-    def dequeue_fn():
-      dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
-      return nest.pack_sequence_as(output_shapes, dequeued)
 
     # Wrap `fn` for repeat.
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
 
-    def run_fn():
+    def run_fn(inputs):
       """Single step on the TPU device."""
-      fn_result = fn(ctx, dequeue_fn())
+      fn_result = fn(ctx, inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
         with ops.control_dependencies([fn_result]):
@@ -351,7 +449,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     def rewrite_fn(*args):
       """The rewritten step fn running on TPU."""
       del args
-      replicate_inputs = [[]] * self._num_replicas_in_sync
+
+      per_replica_inputs = multi_worker_iterator.get_next()
+      replicate_inputs = []
+      for replica_id in range(self._num_replicas_in_sync):
+        select_replica = lambda x: values.select_replica(replica_id, x)  # pylint: disable=cell-var-from-loop
+        replicate_inputs.append((nest.map_structure(
+            select_replica, per_replica_inputs),))
+
       replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
 
       # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
@@ -363,8 +468,8 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
       return replicate_outputs
 
-    # TODO(sourabhbajaj): The input to while loop should be based on the output
-    # type of the step_fn
+    # TODO(sourabhbajaj): The input to while loop should be based on the
+    # output type of the step_fn
     assert isinstance(initial_loop_values, list)
     initial_loop_values = initial_loop_values * self._num_replicas_in_sync
 
@@ -374,7 +479,7 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
                                                initial_loop_values)
 
     del self._outer_control_flow_context
-    ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
+    ctx.run_op = control_flow_ops.group(replicate_outputs)
 
     if isinstance(replicate_outputs, list):
       # Filter out any ops from the outputs, typically this would be the case
@@ -399,23 +504,80 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       # no tensors returned.
       last_step_tensor_outputs = []
 
-    # Convert replicate_outputs to the original dict structure of
-    # last_step_outputs.
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
-      output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been reduced, take the first value
-      # from the list as each value should be the same. Else return the full
-      # list of values.
-      # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
-      # value.
-      if reduce_op is not None:
-        # TODO(priyag): Should this return the element or a list with 1 element
-        last_step_tensor_outputs_dict[name] = output[0]
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    _set_last_step_outputs(ctx, last_step_tensor_outputs)
+    return ctx
 
+  def _run_steps_on_iterator_with_device_loop(
+      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
+    output_shapes = multi_worker_iterator.output_shapes
+    shapes = nest.flatten(output_shapes)
+    if any(not s.is_fully_defined() for s in shapes):
+      raise ValueError(
+          "TPU currently requires fully defined shapes. Either use "
+          "set_shape() on the input tensors or use "
+          "dataset.batch(..., drop_remainder=True).")
+    types = nest.flatten(multi_worker_iterator.output_types)
+
+    enqueue_ops = [
+        self._get_enqueue_op_per_host(host_id, multi_worker_iterator, shapes,
+                                      iterations)
+        for host_id in range(self.num_hosts)]
+
+    def dequeue_fn():
+      dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
+      return nest.pack_sequence_as(output_shapes, dequeued)
+
+    # Wrap `fn` for repeat.
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+    ctx = input_lib.MultiStepContext()
+
+    def run_fn(*args, **kwargs):
+      """Single step on the TPU device."""
+      del args, kwargs
+      fn_result = fn(ctx, dequeue_fn())
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      if flat_last_step_outputs:
+        with ops.control_dependencies([fn_result]):
+          return [array_ops.identity(f) for f in flat_last_step_outputs]
+      else:
+        return fn_result
+
+    def iterate_on_tpu():
+      return training_loop.repeat(iterations, run_fn, initial_loop_values)
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop and TPU replicate context. This is useful in cases
+    # where we might need to exit these contexts and get back to the outer
+    # context to do some things, for e.g. create an op which should be
+    # evaluated only once at the end of the loop on the host. One such usage
+    # is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    replicate_inputs = [[]] * self._num_replicas_in_sync
+    replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
+
+    del self._outer_control_flow_context
+    ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
+
+    # Filter out any ops from the outputs, typically this would be the case
+    # when there were no tensor outputs.
+    last_step_tensor_outputs = [x for x in replicate_outputs
+                                if not isinstance(x, ops.Operation)]
+
+    # Outputs are currently of the structure (grouped by device)
+    # [[output0_device0, output1_device0, output2_device0],
+    #  [output0_device1, output1_device1, output2_device1]]
+    # Convert this to the following structure instead: (grouped by output)
+    # [[output0_device0, output0_device1],
+    #  [output1_device0, output1_device1],
+    #  [output2_device0, output2_device1]]
+    last_step_tensor_outputs = [list(x) for x in
+                                zip(*last_step_tensor_outputs)]
+
+    _set_last_step_outputs(ctx, last_step_tensor_outputs)
     return ctx
 
   def _call_for_each_replica(self, fn, args, kwargs):
@@ -424,19 +586,13 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     with _TPUReplicaContext(self._container_strategy()):
       return fn(*args, **kwargs)
 
-  def _initialize(self):
-    if context.executing_eagerly():
-      # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-    else:
-      return []
+  def _experimental_initialize_system(self):
+    """Experimental method added to be used by Estimator.
 
-  def _finalize(self):
-    if context.executing_eagerly():
-      # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-    else:
-      return []
+    This is a private method only to be used by Estimator. Other frameworks
+    should directly be calling `tf.contrib.distribute.initialize_tpu_system`
+    """
+    initialize_tpu_system(self._tpu_cluster_resolver)
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a TPUMirroredVariable. See `DistributionStrategy.scope`."""
@@ -444,6 +600,9 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     if colocate_with is None:
       device_map = self._device_map
       logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
     else:
       device_map = colocate_with.device_map
       logical_device = colocate_with.logical_device
@@ -475,7 +634,8 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       return value_list
 
     return _create_tpu_mirrored_variable(
-        device_map, logical_device, _real_mirrored_creator, *args, **kwargs)
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
 
   def _reduce_to(self, reduce_op, value, destinations):
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
@@ -559,15 +719,34 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
   @property
   def num_hosts(self):
-    return self._tpu_metadata.num_hosts
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_hosts
+
+    return len(set([self._device_assignment.host_device(r)
+                    for r in range(self._device_assignment.num_replicas)]))
 
   @property
   def num_replicas_per_host(self):
-    return self._tpu_metadata.num_of_cores_per_host
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_of_cores_per_host
+
+    # TODO(sourabhbajaj): Remove this method we use inputs and remove infeed
+    # as the computation of num_replicas_per_host is not a constant
+    # when using device_assignment. This is a temporary workaround to support
+    # StatefulRNN as everything is 1 in that case.
+    # This method needs to take host_id as input for correct computation.
+    max_models_per_host = (self._tpu_metadata.num_of_cores_per_host //
+                           self._device_assignment.num_cores_per_replica)
+    models_per_host = min(self._device_assignment.num_replicas,
+                          max_models_per_host)
+    return models_per_host * self._device_assignment.num_cores_per_replica
 
   @property
   def _num_replicas_in_sync(self):
-    return self._num_cores_override or self._tpu_metadata.num_cores
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_cores
+    return (self._device_assignment.num_replicas *
+            self._device_assignment.num_cores_per_replica)
 
   @property
   def experimental_between_graph(self):
@@ -635,6 +814,13 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
     return True
 
 
@@ -642,15 +828,48 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext):
   """Replication Context class for TPU Strategy."""
 
   # TODO(sourabhbajaj): Call for each replica should be updating this.
-  def __init__(self, strategy):
-    # TODO(b/118385803): properly initialize replica_id, instead of always 0
-    replica_id = constant_op.constant(0, dtypes.int32)
+  # TODO(b/118385803): Always properly initialize replica_id.
+  def __init__(self, strategy, replica_id_in_sync_group=None):
+    if replica_id_in_sync_group is None:
+      replica_id_in_sync_group = constant_op.constant(0, dtypes.int32)
     distribute_lib.ReplicaContext.__init__(
-        self, strategy, replica_id_in_sync_group=replica_id)
+        self, strategy, replica_id_in_sync_group=replica_id_in_sync_group)
 
   @property
   def devices(self):
     distribute_lib.require_replica_context(self)
     ds = self._strategy
     replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
-    return (ds.extended.worker_devices[replica_id],)
+
+    if replica_id is None:  # Non-constant `Tensor` inside `tpu.replicate`.
+      # TODO(cjfj): Return other devices when model parallelism is supported.
+      return (tpu.core(0),)
+    else:
+      return (ds.extended.worker_devices[replica_id],)
+
+
+def _get_host_for_device(device):
+  spec = tf_device.DeviceSpec.from_string(device)
+  return tf_device.DeviceSpec(
+      job=spec.job, replica=spec.replica, task=spec.task,
+      device_type="CPU", device_index=0).to_string()
+
+
+def _set_last_step_outputs(ctx, last_step_tensor_outputs):
+  """Sets the last step outputs on the given context."""
+  # Convert replicate_outputs to the original dict structure of
+  # last_step_outputs.
+  last_step_tensor_outputs_dict = nest.pack_sequence_as(
+      ctx.last_step_outputs, last_step_tensor_outputs)
+
+  for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
+    output = last_step_tensor_outputs_dict[name]
+    # For outputs that have already been reduced, take the first value
+    # from the list as each value should be the same. Else return the full
+    # list of values.
+    # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
+    # value.
+    if reduce_op is not None:
+      # TODO(priyag): Should this return the element or a list with 1 element
+      last_step_tensor_outputs_dict[name] = output[0]
+  ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 0e8e86f6b9647ebf06890c9bb343a8f8e0fcc698..51c58b0b2f3dc2ab63e22718825a471b8657f892 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -22,28 +22,20 @@ import os
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.util import nest
 
 
 class DistributedValuesTest(test.TestCase):
@@ -191,7 +183,7 @@ def _make_mirrored():
       v.append(variable_scope.get_variable(
           name=n, initializer=init, use_resource=True))
   device_map = values.ReplicaDeviceMap(devices)
-  mirrored = values.MirroredVariable(device_map, v,
+  mirrored = values.MirroredVariable(None, device_map, v,
                                      variable_scope.VariableAggregation.SUM)
   return v, device_map, mirrored
 
@@ -314,7 +306,7 @@ class RegroupAndSelectDeviceTest(test.TestCase):
       v = variable_scope.get_variable(
           name="v", initializer=1., use_resource=True)
       device_map = values.ReplicaDeviceMap((d,))
-    mirrored = values.MirroredVariable(device_map, (v,),
+    mirrored = values.MirroredVariable(None, device_map, (v,),
                                        variable_scope.VariableAggregation.SUM)
     result = values.regroup(device_map, (v,))
     self.assertIs(mirrored, result)
@@ -354,444 +346,6 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                                                merged_estimator_spec))
 
 
-class PerReplicaDatasetTest(test.TestCase):
-
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _test_iterator(self, devices, dataset, expected_values):
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = values.InputWorkers(device_map)
-    per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
-    if context.executing_eagerly():
-      iterator = per_replica_dataset.make_one_shot_iterator()
-    else:
-      iterator = per_replica_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next_as_list()
-      computed_value = self.evaluate(next_element)
-      self.assertEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next_as_list()
-      self.evaluate(next_element)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testOneDevice(self):
-    devices = ["/device:CPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleDevices(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testTupleDataset(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset1 = dataset_ops.Dataset.range(10)
-    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnevenDatasetBatches(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(devices, dataset, expected_values)
-
-  def testInitializableIterator(self):
-    with context.graph_mode():
-      devices = ["/device:CPU:0"]
-      # Using random input since that is only allowed with initializable
-      # iterator.
-      dataset = dataset_ops.Dataset.from_tensor_slices(
-          random_ops.random_uniform((10,)))
-
-      device_map = values.ReplicaDeviceMap(devices)
-      input_workers = values.InputWorkers(device_map)
-      per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
-      iterator = per_replica_dataset.make_initializable_iterator()
-
-      self.evaluate(iterator.initializer)
-      next_element = iterator.get_next_as_list()
-      for _ in range(10):
-        self.evaluate(next_element)
-
-      # Should fail after the input is finished.
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-      # After re-initializing the iterator, should be able to iterate again.
-      self.evaluate(iterator.initializer)
-      for _ in range(10):
-        self.evaluate(next_element)
-
-
-class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
-
-  def _test_iterator(self, sess, iterator, devices, expected_values):
-    next_element = iterator.get_next()
-    for r, device in enumerate(devices):
-      v = values.select_replica(r, next_element)
-      # The `v` here can be a tuple.
-      for element in nest.flatten(v):
-        self.assertTrue(element.device in device)
-
-    for expected_value in expected_values:
-      t = [values.select_replica(r, next_element) for r in range(len(devices))]
-      actual = sess.run(t)
-      self.assertEqual(expected_value, actual)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      sess.run([values.select_replica(r, next_element)
-                for r in range(len(devices))])
-
-  def _test_dataset(self, dataset_fn, worker_devices, devices,
-                    expected_values):
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = values.InputWorkers(device_map, worker_devices)
-    multi_worker_dataset = values.MultiWorkerDataset(
-        dataset_fn, input_workers)
-    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-    with self.cached_session() as sess:
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
-
-  def _cpu_devices(self):
-    worker_devices = (
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])
-    )
-    devices = [
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def _cpu_and_one_gpu_devices(self):
-    worker_devices = (
-        ("/job:worker/replica:0/task:0", (
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        )),
-        ("/job:worker/replica:0/task:1", (
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ))
-    )
-    devices = [
-        "/job:worker/replica:0/task:0/device:GPU:0",
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:GPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def testDataDistributionOneDevicePerWorker(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(
-          dataset_fn, worker_devices, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-  def testDataDistributionTwoDevicePerWorker(self):
-    if context.num_gpus() < 1:
-      self.skipTest("A GPU is not available for this test.")
-    worker_devices, devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(
-          dataset_fn, worker_devices, devices,
-          [[0, 1, 0, 1], [2, 3, 2, 3], [4, 5, 4, 5], [6, 7, 6, 7]])
-
-  def testTupleDataset(self):
-    worker_devices, devices = self._cpu_devices()
-
-    with context.graph_mode():
-
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(8)
-        dataset2 = dataset_ops.Dataset.range(8).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [[(i, i**2), (i, i**2)] for i in range(8)]
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         expected_values)
-
-  def testInitializableIterator(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      device_map = values.ReplicaDeviceMap(devices)
-      input_workers = values.InputWorkers(device_map, worker_devices)
-      multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, input_workers)
-      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(
-          sess, multi_worker_iterator, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-      # After re-initializing the iterator, should be able to iterate again.
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(
-          sess, multi_worker_iterator, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-  def testValueErrorForIterator(self):
-    # Incompatiable arguments.
-    d1 = "/device:GPU:0"
-    d2 = "/device:GPU:1"
-    device_map = values.ReplicaDeviceMap([d1, d2])
-    input_workers = values.InputWorkers(
-        device_map, (("w1", (d1,)), ("w2", (d2,))))
-    with self.assertRaises(ValueError):
-      values.MultiWorkerDataIterator([("w1", None)], input_workers)
-
-  def testDuplicateDevices(self):
-    _, devices = self._cpu_devices()
-    devices.append("/job:worker/replica:0/task:0/device:CPU:0")
-    with self.assertRaises(ValueError):
-      _ = values.ReplicaDeviceMap(devices)
-
-
-class InputIteratorTestBase(test.TestCase):
-
-  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
-                     expected_values, sess=None, split_batch_by=None):
-    devices = nest.flatten([ds for _, ds in worker_device_pairs])
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = values.InputWorkers(device_map, worker_device_pairs)
-
-    if input_type == "input_fn":
-      input_contexts = [
-          distribute_lib.InputContext() for _ in worker_device_pairs]
-      input_fn = lambda _: dataset_fn()
-      iterator = values.InputFunctionIterator(
-          input_fn, input_workers, input_contexts)
-    else:
-      iterator = values.DatasetIterator(
-          dataset_fn(), input_workers, split_batch_by)
-
-    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
-
-    evaluate(control_flow_ops.group(iterator.initialize()))
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertAllEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next()
-      evaluate([values.select_replica(r, next_element)
-                for r in range(len(devices))])
-
-    # After re-initializing the iterator, should be able to iterate again.
-    evaluate(control_flow_ops.group(iterator.initialize()))
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertAllEqual(expected_value, computed_value)
-
-
-class InputIteratorSingleWorkerTest(InputIteratorTestBase,
-                                    parameterized.TestCase):
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"]))
-  def testOneDeviceCPU(self, input_type):
-    worker_device_pairs = [("", ["/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTwoDevicesOneGPUOneCPU(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTupleDataset(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    def dataset_fn():
-      dataset1 = dataset_ops.Dataset.range(10)
-      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-      return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testUnevenDatasetBatches(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["dataset"],
-      split_batch_by=[None, 2],
-      required_gpus=1))
-  def testBatchSplitting(self, input_type, split_batch_by):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    batch_size = 10
-    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
-
-    updated_batch_size = (
-        batch_size // split_batch_by if split_batch_by else batch_size)
-    expected_values = [[range(i, i+updated_batch_size),
-                        range(i+updated_batch_size, i+2*updated_batch_size)]
-                       for i in range(0, 100, updated_batch_size*2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values, sess=None,
-                        split_batch_by=split_batch_by)
-
-
-class InputIteratorMultiWorkerTest(
-    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
-    parameterized.TestCase):
-
-  def _cpu_devices(self):
-    return [
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])]
-
-  def _cpu_and_one_gpu_devices(self):
-    return [
-        ("/job:worker/replica:0/task:0", [
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        ]),
-        ("/job:worker/replica:0/task:1", [
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ])
-    ]
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"]))
-  def testOneDevicePerWorker(self, input_type):
-    worker_devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTwoDevicesPerWorker(self, input_type):
-    worker_devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"]))
-  def testTupleDataset(self, input_type):
-    worker_devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(4)
-        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          expected_values, sess)
-
-
-class SplitDatasetBatchTest(test.TestCase):
-
-  def testBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20)
-    split_batch_by = 2
-    result_dataset = values._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testMapAndBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100)
-    dataset = dataset.apply(batching.map_and_batch(lambda x: x, 20))
-    split_batch_by = 2
-    result_dataset = values._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testPrefetchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20).prefetch(1)
-    split_batch_by = 2
-    result_dataset = values._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-
 class MirroredVariableTest(test.TestCase, parameterized.TestCase):
 
   config = config_pb2.ConfigProto()
@@ -813,7 +367,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
     v = variable_scope.get_variable(
         name="v", initializer=[1.], use_resource=True)
     device_map = values.ReplicaDeviceMap(("/job:foo/device:CPU:0",))
-    mirrored = values.MirroredVariable(device_map, (v,),
+    mirrored = values.MirroredVariable(None, device_map, (v,),
                                        variable_scope.VariableAggregation.MEAN)
 
     self.assertEqual(v.name, mirrored.name)
@@ -952,7 +506,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
         v = variable_scope.get_variable(
             name="v", initializer=1., use_resource=True)
       mirrored = values.MirroredVariable(
-          values.ReplicaDeviceMap(("/device:GPU:0",)), (v,),
+          distribution, values.ReplicaDeviceMap(("/device:GPU:0",)), (v,),
           variable_scope.VariableAggregation.MEAN)
       sess.run(variables_lib.global_variables_initializer())
       sess.run({"complicated": mirrored})
@@ -961,14 +515,14 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
 _devices = ("/device:GPU:0", "/device:CPU:0")
 
 
-def _make_replica_local(method):
+def _make_replica_local(method, strategy=None):
   device_map = values.ReplicaDeviceMap(_devices)
   v = []
   for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
     with ops.device(d):
       v.append(variable_scope.get_variable(
           name=n, initializer=init, use_resource=True))
-  replica_local = values.ReplicaLocalVariable(device_map, v, method)
+  replica_local = values.ReplicaLocalVariable(strategy, device_map, v, method)
   return v, replica_local
 
 
@@ -996,7 +550,7 @@ class ReplicaLocalVariablePropertiesTest(test.TestCase):
         name="v", initializer=[1.], use_resource=True)
     device_map = values.ReplicaDeviceMap(("/job:foo/device:CPU:0",))
     replica_local = values.ReplicaLocalVariable(
-        device_map, (v,), variable_scope.VariableAggregation.MEAN)
+        None, device_map, (v,), variable_scope.VariableAggregation.MEAN)
 
     self.assertEqual(v.name, replica_local.name)
     self.assertEqual(v.dtype, replica_local.dtype)
@@ -1043,7 +597,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
   def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
     with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.SUM)
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
@@ -1066,7 +620,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.MEAN)
+          variable_scope.VariableAggregation.MEAN, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
@@ -1086,7 +640,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.MEAN)
+          variable_scope.VariableAggregation.MEAN, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
@@ -1102,7 +656,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
   def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, replica_local = _make_replica_local("sum")
+      v, replica_local = _make_replica_local("sum", distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [1.5, 2.])
@@ -1149,7 +703,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.MEAN)
+          variable_scope.VariableAggregation.MEAN, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
@@ -1164,7 +718,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.SUM)
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 452628257ea96713453bf2aa32b5baa9d6d0cb86..1006dfac49f36baa7cf5136f6f2982e3fd965298 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -249,9 +249,9 @@ class InverseGamma(distribution.Distribution):
       `self.allow_nan_stats` is `False`, an exception will be raised rather
       than returning `NaN`.""")
   def _variance(self):
-    var = (math_ops.square(self.rate)
-           / math_ops.square(self.concentration - 1.)
-           / (self.concentration - 2.))
+    var = (
+        math_ops.square(self.rate) / math_ops.squared_difference(
+            self.concentration, 1.) / (self.concentration - 2.))
     if self.allow_nan_stats:
       nan = array_ops.fill(
           self.batch_shape_tensor(),
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 257d02057ae0d280074559aa9e97725bf5cc3fd0..78ab155896cfeda4dd259a8529f4b1f77a12cf0b 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -200,13 +200,6 @@ class IteratorTest(test.TestCase):
         y = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], y.numpy())
 
-  def testGpuDefinedDataset(self):
-    with ops.device(test.gpu_device_name()):
-      ds = Dataset.from_tensors([0., 1.])
-      for x in ds:
-        y = math_ops.add(x, x)
-    self.assertAllEqual([0., 2.], y.numpy())
-
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index 97c299a911c9180bf69faa0fa46527e80eada790..3e0881754c750f4d36e2e4dd8b80835b031c658c 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -6,16 +6,16 @@ package(default_visibility = ["//tensorflow:internal"])
 py_library(
     name = "examples_pip",
     deps = [
-        "//tensorflow/contrib/eager/python/examples/densenet",
-        "//tensorflow/contrib/eager/python/examples/gan:mnist",
+        "//tensorflow/contrib/eager/python/examples/densenet:densenet_lib",
+        "//tensorflow/contrib/eager/python/examples/gan:mnist_lib",
         "//tensorflow/contrib/eager/python/examples/l2hmc",
         "//tensorflow/contrib/eager/python/examples/l2hmc:neural_nets",
-        "//tensorflow/contrib/eager/python/examples/linear_regression",
+        "//tensorflow/contrib/eager/python/examples/linear_regression:linear_regression_lib",
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/revnet",
         "//tensorflow/contrib/eager/python/examples/revnet:config",
-        "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
-        "//tensorflow/contrib/eager/python/examples/rnn_ptb",
+        "//tensorflow/contrib/eager/python/examples/rnn_colorbot:rnn_colorbot_lib",
+        "//tensorflow/contrib/eager/python/examples/rnn_ptb:rnn_ptb_lib",
         "//tensorflow/contrib/eager/python/examples/spinn:data",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index e2154fcc5fcf774dcd52285d9442dfd5073a4992..fbb5daf230bb79f08a3d071062ddc0e8507ab324 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "densenet",
     srcs = ["densenet.py"],
     srcs_version = "PY2AND3",
+    deps = [":densenet_lib"],
+)
+
+py_library(
+    name = "densenet_lib",
+    srcs = ["densenet.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -17,33 +24,37 @@ py_binary(
 
 cuda_py_test(
     name = "densenet_test",
-    size = "large",
+    size = "medium",
     srcs = ["densenet_test.py"],
     additional_deps = [
         ":densenet",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",
         "optonly",
+        "oss_serial",
     ],
 )
 
 cuda_py_test(
     name = "densenet_graph_test",
-    size = "large",
+    size = "medium",
     srcs = ["densenet_graph_test.py"],
     additional_deps = [
         ":densenet",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",
         "noasan",
         "nomsan",
         "notsan",
         "optonly",
+        "oss_serial",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/gan/BUILD b/tensorflow/contrib/eager/python/examples/gan/BUILD
index d64c8eb9ce122fa277567b2fbc632abfbc72df64..d99a519112787bad664232983208279cfb4d0036 100644
--- a/tensorflow/contrib/eager/python/examples/gan/BUILD
+++ b/tensorflow/contrib/eager/python/examples/gan/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "mnist",
     srcs = ["mnist.py"],
     srcs_version = "PY2AND3",
+    deps = [":mnist_lib"],
+)
+
+py_library(
+    name = "mnist_lib",
+    srcs = ["mnist.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -20,7 +27,7 @@ cuda_py_test(
     name = "mnist_test",
     srcs = ["mnist_test.py"],
     additional_deps = [
-        ":mnist",
+        ":mnist_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
@@ -30,7 +37,7 @@ cuda_py_test(
     name = "mnist_graph_test",
     srcs = ["mnist_graph_test.py"],
     additional_deps = [
-        ":mnist",
+        ":mnist_lib",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
index 1a08cc0fd06516be4af5c2b0b46a3ffcf9101e95..e1a02db76f705414a34d232022f50124a5a6a3ed 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -13,11 +13,13 @@
         "\n",
         "# Convolutional VAE: An example with tf.keras and eager\n",
         "\n",
+        "This example has moved:\n",
+        "\n",
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/cvae.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
@@ -28,604 +30,14 @@
       },
       "source": [
         "![evolution of output during training](https://tensorflow.org/images/autoencoders/cvae.gif)\n",
-        "\n",
-        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) by training a Variational Autoencoder. (VAE, [[1]](https://arxiv.org/abs/1312.6114), [[2]](https://arxiv.org/abs/1401.4082)).\n",
         "\n"
       ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "P-JuIu2N_SQf"
-      },
-      "outputs": [],
-      "source": [
-        "# to generate gifs\n",
-        "!pip install imageio"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "source": [
-        "## Import TensorFlow and enable Eager execution"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd"
-      },
-      "outputs": [],
-      "source": [
-        "from __future__ import absolute_import, division, print_function\n",
-        "\n",
-        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "tfe = tf.contrib.eager\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import os\n",
-        "import time\n",
-        "import numpy as np\n",
-        "import glob\n",
-        "import matplotlib.pyplot as plt\n",
-        "import PIL\n",
-        "import imageio\n",
-        "from IPython import display"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "source": [
-        "## Load the MNIST dataset\n",
-        "Each MNIST image is originally a vector of 784 integers, each of which is between 0-255 and represents the intensity of a pixel. We model each pixel with a Bernoulli distribution in our model, and we statically binarize the dataset."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "a4fYMGxGhrna"
-      },
-      "outputs": [],
-      "source": [
-        "(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "NFC2ghIdiZYE"
-      },
-      "outputs": [],
-      "source": [
-        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "test_images = test_images.reshape(test_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "\n",
-        "# Normalizing the images to the range of [0., 1.]\n",
-        "train_images /= 255.\n",
-        "test_images /= 255.\n",
-        "\n",
-        "# Binarization\n",
-        "train_images[train_images \u003e= .5] = 1.\n",
-        "train_images[train_images \u003c .5] = 0.\n",
-        "test_images[test_images \u003e= .5] = 1.\n",
-        "test_images[test_images \u003c .5] = 0."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "S4PIDhoDLbsZ"
-      },
-      "outputs": [],
-      "source": [
-        "TRAIN_BUF = 60000\n",
-        "BATCH_SIZE = 100\n",
-        "\n",
-        "TEST_BUF = 10000"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "source": [
-        "## Use *tf.data* to create batches and shuffle the dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "-yKCCQOoJ7cn"
-      },
-      "outputs": [],
-      "source": [
-        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE)\n",
-        "test_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TEST_BUF).batch(BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "source": [
-        "## Wire up the generative and inference network with *tf.keras.Sequential*\n",
-        "\n",
-        "In our VAE example, we use two small ConvNets for the generative and inference network. Since these neural nets are small, we use `tf.keras.Sequential` to simplify our code. Let $x$ and $z$ denote the observation and latent variable respectively in the following descriptions. \n",
-        "\n",
-        "### Generative Network\n",
-        "This defines the generative model which takes a latent encoding as input, and outputs the parameters for a conditional distribution of the observation, i.e. $p(x|z)$. Additionally, we use a unit Gaussian prior $p(z)$ for the latent variable.\n",
-        "\n",
-        "### Inference Network\n",
-        "This defines an approximate posterior distribution $q(z|x)$, which takes as input an observation and outputs a set of parameters for the conditional distribution of the latent representation. In this example, we simply model this distribution as a diagonal Gaussian. In this case, the inference network outputs the mean and log-variance parameters of a factorized Gaussian (log-variance instead of the variance directly is for numerical stability).\n",
-        "\n",
-        "### Reparameterization Trick\n",
-        "During optimization, we can sample from $q(z|x)$ by first sampling from a unit Gaussian, and then multiplying by the standard deviation and adding the mean. This ensures the gradients could pass through the sample to the inference network parameters.\n",
-        "\n",
-        "### Network architecture\n",
-        "For the inference network, we use two convolutional layers followed by a fully-connected layer. In the generative network, we mirror this architecture by using a fully-connected layer followed by three convolution transpose layers (a.k.a. deconvolutional layers in some contexts). Note, it's common practice to avoid using batch normalization when training VAEs, since the additional stochasticity due to using mini-batches may aggravate instability on top of the stochasticity from sampling."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "VGLbvBEmjK0a"
-      },
-      "outputs": [],
-      "source": [
-        "class CVAE(tf.keras.Model):\n",
-        "  def __init__(self, latent_dim):\n",
-        "    super(CVAE, self).__init__()\n",
-        "    self.latent_dim = latent_dim\n",
-        "    self.inference_net = tf.keras.Sequential(\n",
-        "      [\n",
-        "          tf.keras.layers.InputLayer(input_shape=(28, 28, 1)),\n",
-        "          tf.keras.layers.Conv2D(\n",
-        "              filters=32, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Conv2D(\n",
-        "              filters=64, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Flatten(),\n",
-        "          # No activation\n",
-        "          tf.keras.layers.Dense(latent_dim + latent_dim),\n",
-        "      ]\n",
-        "    )\n",
-        "\n",
-        "    self.generative_net = tf.keras.Sequential(\n",
-        "        [\n",
-        "          tf.keras.layers.InputLayer(input_shape=(latent_dim,)),\n",
-        "          tf.keras.layers.Dense(units=7*7*32, activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Reshape(target_shape=(7, 7, 32)),\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=64,\n",
-        "              kernel_size=3,\n",
-        "              strides=(2, 2),\n",
-        "              padding=\"SAME\",\n",
-        "              activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=32,\n",
-        "              kernel_size=3,\n",
-        "              strides=(2, 2),\n",
-        "              padding=\"SAME\",\n",
-        "              activation=tf.nn.relu),\n",
-        "          # No activation\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=1, kernel_size=3, strides=(1, 1), padding=\"SAME\"),\n",
-        "        ]\n",
-        "    )\n",
-        "\n",
-        "  def sample(self, eps=None):\n",
-        "    if eps is None:\n",
-        "      eps = tf.random_normal(shape=(100, self.latent_dim))\n",
-        "    return self.decode(eps, apply_sigmoid=True)\n",
-        "\n",
-        "  def encode(self, x):\n",
-        "    mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)\n",
-        "    return mean, logvar\n",
-        "\n",
-        "  def reparameterize(self, mean, logvar):\n",
-        "    eps = tf.random_normal(shape=mean.shape)\n",
-        "    return eps * tf.exp(logvar * .5) + mean\n",
-        "\n",
-        "  def decode(self, z, apply_sigmoid=False):\n",
-        "    logits = self.generative_net(z)\n",
-        "    if apply_sigmoid:\n",
-        "      probs = tf.sigmoid(logits)\n",
-        "      return probs\n",
-        "\n",
-        "    return logits"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "source": [
-        "## Define the loss function and the optimizer\n",
-        "\n",
-        "VAEs train by maximizing the evidence lower bound (ELBO) on the marginal log-likelihood:\n",
-        "\n",
-        "$$\\log p(x) \\ge \\text{ELBO} = \\mathbb{E}_{q(z|x)}\\left[\\log \\frac{p(x, z)}{q(z|x)}\\right].$$\n",
-        "\n",
-        "In practice, we optimize the single sample Monte Carlo estimate of this expectation:\n",
-        "\n",
-        "$$\\log p(x| z) + \\log p(z) - \\log q(z|x),$$\n",
-        "where $z$ is sampled from $q(z|x)$.\n",
-        "\n",
-        "**Note**: we could also analytically compute the KL term, but here we incorporate all three terms in the Monte Carlo estimator for simplicity."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7"
-      },
-      "outputs": [],
-      "source": [
-        "def log_normal_pdf(sample, mean, logvar, raxis=1):\n",
-        "  log2pi = tf.log(2. * np.pi)\n",
-        "  return tf.reduce_sum(\n",
-        "      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),\n",
-        "      axis=raxis)\n",
-        "\n",
-        "def compute_loss(model, x):\n",
-        "  mean, logvar = model.encode(x)\n",
-        "  z = model.reparameterize(mean, logvar)\n",
-        "  x_logit = model.decode(z)\n",
-        "\n",
-        "  cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)\n",
-        "  logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])\n",
-        "  logpz = log_normal_pdf(z, 0., 0.)\n",
-        "  logqz_x = log_normal_pdf(z, mean, logvar)\n",
-        "  return -tf.reduce_mean(logpx_z + logpz - logqz_x)\n",
-        "\n",
-        "def compute_gradients(model, x):\n",
-        "  with tf.GradientTape() as tape:\n",
-        "    loss = compute_loss(model, x)\n",
-        "  return tape.gradient(loss, model.trainable_variables), loss\n",
-        "\n",
-        "optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "def apply_gradients(optimizer, gradients, variables, global_step=None):\n",
-        "  optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We start by iterating over the dataset\n",
-        "* During each iteration, we pass the image to the encoder to obtain a set of mean and log-variance parameters of the approximate posterior $q(z|x)$\n",
-        "* We then apply the *reparameterization trick* to sample from $q(z|x)$\n",
-        "* Finally, we pass the reparameterized samples to the decoder to obtain the logits of the generative distribution $p(x|z)$\n",
-        "* **Note:** Since we use the dataset loaded by keras with 60k datapoints in the training set and 10k datapoints in the test set, our resulting ELBO on the test set is slightly higher than reported results in the literature which uses dynamic binarization of Larochelle's MNIST.\n",
-        "\n",
-        "## Generate Images\n",
-        "\n",
-        "* After training, it is time to generate some images\n",
-        "* We start by sampling a set of latent vectors from the unit Gaussian prior distribution $p(z)$\n",
-        "* The generator will then convert the latent sample $z$ to logits of the observation, giving a distribution $p(x|z)$\n",
-        "* Here we plot the probabilities of Bernoulli distributions\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo"
-      },
-      "outputs": [],
-      "source": [
-        "epochs = 100\n",
-        "latent_dim = 50\n",
-        "num_examples_to_generate = 16\n",
-        "\n",
-        "# keeping the random vector constant for generation (prediction) so\n",
-        "# it will be easier to see the improvement.\n",
-        "random_vector_for_generation = tf.random_normal(\n",
-        "    shape=[num_examples_to_generate, latent_dim])\n",
-        "model = CVAE(latent_dim)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy"
-      },
-      "outputs": [],
-      "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  predictions = model.sample(test_input)\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "\n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0], cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "\n",
-        "  # tight_layout minimizes the overlap between 2 sub-plots\n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ"
-      },
-      "outputs": [],
-      "source": [
-        "generate_and_save_images(model, 0, random_vector_for_generation)\n",
-        "\n",
-        "for epoch in range(1, epochs + 1):\n",
-        "  start_time = time.time()\n",
-        "  for train_x in train_dataset:\n",
-        "    gradients, loss = compute_gradients(model, train_x)\n",
-        "    apply_gradients(optimizer, gradients, model.trainable_variables)\n",
-        "  end_time = time.time()\n",
-        "\n",
-        "  if epoch % 1 == 0:\n",
-        "    loss = tfe.metrics.Mean()\n",
-        "    for test_x in test_dataset:\n",
-        "      loss(compute_loss(model, test_x))\n",
-        "    elbo = -loss.result()\n",
-        "    display.clear_output(wait=False)\n",
-        "    print('Epoch: {}, Test set ELBO: {}, '\n",
-        "          'time elapse for current epoch {}'.format(epoch,\n",
-        "                                                    elbo,\n",
-        "                                                    end_time - start_time))\n",
-        "    generate_and_save_images(\n",
-        "        model, epoch, random_vector_for_generation)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
-      },
-      "source": [
-        "### Display an image using the epoch number"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL"
-      },
-      "outputs": [],
-      "source": [
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "5x3q9_Oe5q0A"
-      },
-      "outputs": [],
-      "source": [
-        "display_image(epochs)  # Display images"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
-      },
-      "source": [
-        "### Generate a GIF of all the saved images."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "IGKQgENQ8lEI"
-      },
-      "outputs": [],
-      "source": [
-        "with imageio.get_writer('cvae.gif', mode='I') as writer:\n",
-        "  filenames = glob.glob('image*.png')\n",
-        "  filenames = sorted(filenames)\n",
-        "  last = -1\n",
-        "  for i,filename in enumerate(filenames):\n",
-        "    frame = 2*(i**0.5)\n",
-        "    if round(frame) \u003e round(last):\n",
-        "      last = frame\n",
-        "    else:\n",
-        "      continue\n",
-        "    image = imageio.imread(filename)\n",
-        "    writer.append_data(image)\n",
-        "  image = imageio.imread(filename)\n",
-        "  writer.append_data(image)\n",
-        "    \n",
-        "# this is a hack to display the gif inside the notebook\n",
-        "os.system('cp cvae.gif cvae.gif.png')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "uV0yiKpzNP1b"
-      },
-      "outputs": [],
-      "source": [
-        "display.Image(filename=\"cvae.gif.png\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "yQXO_dlXkKsT"
-      },
-      "source": [
-        "To downlod the animation from Colab uncomment the code below:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "4fSJS3m5HLFM"
-      },
-      "outputs": [],
-      "source": [
-        "#from google.colab import files\n",
-        "#files.download('cvae.gif')"
-      ]
     }
   ],
   "metadata": {
     "accelerator": "GPU",
     "colab": {
       "collapsed_sections": [],
-      "default_view": {},
       "name": "cvae.ipynb",
       "private_outputs": true,
       "provenance": [
@@ -635,8 +47,7 @@
         }
       ],
       "toc_visible": true,
-      "version": "0.3.2",
-      "views": {}
+      "version": "0.3.2"
     },
     "kernelspec": {
       "display_name": "Python 3",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 78fcd397087fd1fd64aebed7ac3b5c6b2f45c450..53767058838459e56215d286e9f8f8eb66287147 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -1,26 +1,11 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "dcgan.ipynb",
-      "version": "0.3.2",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "name": "python2",
-      "display_name": "Python 2"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
+      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "0TD5ZrvEMbhZ"
       },
-      "cell_type": "markdown",
       "source": [
         "**Copyright 2018 The TensorFlow Authors**.\n",
         "\n",
@@ -28,851 +13,39 @@
         "\n",
         "# Generating Handwritten Digits with DCGAN\n",
         "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "ITZuApL56Mny"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "This tutorial demonstrates how to generate images of handwritten digits using a Deep Convolutional Generative Adversarial Network ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)). The code is written in [tf.keras](https://www.tensorflow.org/programmers_guide/keras) with [eager execution](https://www.tensorflow.org/programmers_guide/eager) enabled. "
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "toc",
-        "id": "x2McrO9bMyLN"
-      },
-      "cell_type": "markdown",
-      "source": [
-        ">[Generating Handwritten Digits with DCGAN](#scrollTo=0TD5ZrvEMbhZ)\n",
-        "\n",
-        ">>[What are GANs?](#scrollTo=2MbKJY38Puy9)\n",
-        "\n",
-        ">>>[Import TensorFlow and enable eager execution](#scrollTo=e1_Y75QXJS6h)\n",
-        "\n",
-        ">>>[Load the dataset](#scrollTo=iYn4MdZnKCey)\n",
-        "\n",
-        ">>>[Use tf.data to create batches and shuffle the dataset](#scrollTo=PIGN6ouoQxt3)\n",
-        "\n",
-        ">>[Create the models](#scrollTo=THY-sZMiQ4UV)\n",
-        "\n",
-        ">>>[The Generator Model](#scrollTo=-tEyxE-GMC48)\n",
-        "\n",
-        ">>>[The Discriminator model](#scrollTo=D0IKnaCtg6WE)\n",
-        "\n",
-        ">>[Define the loss functions and the optimizer](#scrollTo=0FMYgY_mPfTi)\n",
-        "\n",
-        ">>>[Generator loss](#scrollTo=Jd-3GCUEiKtv)\n",
-        "\n",
-        ">>>[Discriminator loss](#scrollTo=PKY_iPSPNWoj)\n",
-        "\n",
-        ">>[Set up GANs for Training](#scrollTo=Rw1fkAczTQYh)\n",
-        "\n",
-        ">>[Train the GANs](#scrollTo=dZrd4CdjR-Fp)\n",
-        "\n",
-        ">>[Generated images](#scrollTo=P4M_vIbUi7c0)\n",
+        "This example has moved.\n",
         "\n",
-        ">>[Learn more about GANs](#scrollTo=k6qC-SbjK0yW)\n",
-        "\n"
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/dcgan.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/blob/master/site/en/r2/tutorials/generative/dcgan.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "2MbKJY38Puy9"
       },
-      "cell_type": "markdown",
       "source": [
-        "## What are GANs?\n",
-        "GANs, or [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661), are a framework for estimating generative models. Two models are trained simultaneously by an adversarial process: a Generator, which is responsible for generating data (say, images), and a Discriminator, which is responsible for estimating the probability that an image was drawn from the training data (the image is real), or was produced by the Generator (the image is fake). During training, the Generator becomes progressively better at generating images, until the Discriminator is no longer able to distinguish real images from fake. \n",
-        "\n",
-        "![alt text](https://github.com/margaretmz/tensorflow/blob/margaret-dcgan/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png?raw=1)\n",
-        "\n",
-        "We will demonstrate this process end-to-end on MNIST. Below is an animation that shows a series of images produced by the Generator as it was trained for 50 epochs. Overtime, the generated images become increasingly difficult to distinguish from the training set.\n",
-        "\n",
-        "To learn more about GANs, we recommend MIT's [Intro to Deep Learning](http://introtodeeplearning.com/) course, which includes a lecture on Deep Generative Models ([video](https://youtu.be/JVb54xhEw6Y) | [slides](http://introtodeeplearning.com/materials/2018_6S191_Lecture4.pdf)). Now, let's head to the code!\n",
-        "\n",
         "![sample output](https://tensorflow.org/images/gan/dcgan.gif)"
       ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "dcgan.ipynb",
+      "provenance": [],
+      "version": "0.3.2"
     },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "u_2z-B3piVsw",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# Install imgeio in order to generate an animated gif showing the image generating process\n",
-        "!pip install imageio"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Import TensorFlow and enable eager execution"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import glob\n",
-        "import imageio\n",
-        "import matplotlib.pyplot as plt\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import PIL\n",
-        "import time\n",
-        "\n",
-        "from IPython import display"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Load the dataset\n",
-        "\n",
-        "We are going to use the MNIST dataset to train the generator and the discriminator. The generator will generate handwritten digits resembling the MNIST data."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "a4fYMGxGhrna",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "(train_images, train_labels), (_, _) = tf.keras.datasets.mnist.load_data()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "NFC2ghIdiZYE",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "S4PIDhoDLbsZ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "BUFFER_SIZE = 60000\n",
-        "BATCH_SIZE = 256"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Use tf.data to create batches and shuffle the dataset"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "-yKCCQOoJ7cn",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Create the models\n",
-        "\n",
-        "We will use tf.keras [Sequential API](https://www.tensorflow.org/guide/keras#sequential_model) to define the generator and discriminator models."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "-tEyxE-GMC48"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### The Generator Model\n",
-        "\n",
-        "The generator is responsible for creating convincing images that are good enough to fool the discriminator. The network architecture for the generator consists of [Conv2DTranspose](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2DTranspose) (Upsampling) layers. We start with a fully connected layer and upsample the image two times in order to reach the desired image size of 28x28x1. We increase the width and height, and reduce the depth as we move through the layers in the network. We use [Leaky ReLU](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LeakyReLU) activation for each layer except for the last one where we use a tanh activation."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "6bpTcDqoLWjY",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def make_generator_model():\n",
-        "    model = tf.keras.Sequential()\n",
-        "    model.add(tf.keras.layers.Dense(7*7*256, use_bias=False, input_shape=(100,)))\n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "      \n",
-        "    model.add(tf.keras.layers.Reshape((7, 7, 256)))\n",
-        "    assert model.output_shape == (None, 7, 7, 256) # Note: None is the batch size\n",
-        "    \n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False))\n",
-        "    assert model.output_shape == (None, 7, 7, 128)  \n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "\n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False))\n",
-        "    assert model.output_shape == (None, 14, 14, 64)    \n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "\n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))\n",
-        "    assert model.output_shape == (None, 28, 28, 1)\n",
-        "  \n",
-        "    return model"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "D0IKnaCtg6WE"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### The Discriminator model\n",
-        "\n",
-        "The discriminator is responsible for distinguishing fake images from real images. It's similar to a regular CNN-based image classifier."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "dw2tPLmk2pEP",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def make_discriminator_model():\n",
-        "    model = tf.keras.Sequential()\n",
-        "    model.add(tf.keras.layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "    model.add(tf.keras.layers.Dropout(0.3))\n",
-        "      \n",
-        "    model.add(tf.keras.layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "    model.add(tf.keras.layers.Dropout(0.3))\n",
-        "       \n",
-        "    model.add(tf.keras.layers.Flatten())\n",
-        "    model.add(tf.keras.layers.Dense(1))\n",
-        "     \n",
-        "    return model"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "gDkA05NE6QMs",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "generator = make_generator_model()\n",
-        "discriminator = make_discriminator_model()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Define the loss functions and the optimizer\n",
-        "\n",
-        "Let's define the loss functions and the optimizers for the generator and the discriminator.\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Jd-3GCUEiKtv"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Generator loss\n",
-        "The generator loss is a sigmoid cross entropy loss of the generated images and an array of ones, since the generator is trying to generate fake images that resemble the real images."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "90BIcCKcDMxz",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def generator_loss(generated_output):\n",
-        "    return tf.losses.sigmoid_cross_entropy(tf.ones_like(generated_output), generated_output)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "PKY_iPSPNWoj"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Discriminator loss\n",
-        "\n",
-        "The discriminator loss function takes two inputs: real images, and generated images. Here is how to calculate the discriminator loss:\n",
-        "1. Calculate real_loss which is a sigmoid cross entropy loss of the real images and an array of ones (since these are the real images).\n",
-        "2. Calculate generated_loss which is a sigmoid cross entropy loss of the generated images and an array of zeros (since these are the fake images).\n",
-        "3. Calculate the total_loss as the sum of real_loss and generated_loss."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "wkMNfBWlT-PV",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def discriminator_loss(real_output, generated_output):\n",
-        "    # [1,1,...,1] with real output since it is true and we want our generated examples to look like it\n",
-        "    real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.ones_like(real_output), logits=real_output)\n",
-        "\n",
-        "    # [0,0,...,0] with generated images since they are fake\n",
-        "    generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.zeros_like(generated_output), logits=generated_output)\n",
-        "\n",
-        "    total_loss = real_loss + generated_loss\n",
-        "\n",
-        "    return total_loss"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "MgIc7i0th_Iu"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "The discriminator and the generator optimizers are different since we will train two networks separately."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "generator_optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "discriminator_optimizer = tf.train.AdamOptimizer(1e-4)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "mWtinsGDPJlV"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Checkpoints (Object-based saving)**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "CA1w-7s2POEy",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n",
-        "                                 discriminator_optimizer=discriminator_optimizer,\n",
-        "                                 generator=generator,\n",
-        "                                 discriminator=discriminator)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Set up GANs for Training\n",
-        "\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "5QC5BABamh_c"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Now it's time to put together the generator and discriminator to set up the Generative Adversarial Networks, as you see in the diagam at the beginning of the tutorial."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Ff6oN6PZX27n"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Define training parameters**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "EPOCHS = 50\n",
-        "noise_dim = 100\n",
-        "num_examples_to_generate = 16\n",
-        "\n",
-        "# We'll re-use this random vector used to seed the generator so\n",
-        "# it will be easier to see the improvement over time.\n",
-        "random_vector_for_generation = tf.random_normal([num_examples_to_generate,\n",
-        "                                                 noise_dim])"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "jylSonrqSWfi"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Define training method**\n",
-        "\n",
-        "We start by iterating over the dataset. The generator is given a random vector as an input which is processed to  output an image looking like a handwritten digit. The discriminator is then shown the real MNIST images as well as the generated images.\n",
-        "\n",
-        "Next, we calculate the generator and the discriminator loss. Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "3t5ibNo05jCB",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def train_step(images):\n",
-        "   # generating noise from a normal distribution\n",
-        "      noise = tf.random_normal([BATCH_SIZE, noise_dim])\n",
-        "      \n",
-        "      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n",
-        "        generated_images = generator(noise, training=True)\n",
-        "      \n",
-        "        real_output = discriminator(images, training=True)\n",
-        "        generated_output = discriminator(generated_images, training=True)\n",
-        "         \n",
-        "        gen_loss = generator_loss(generated_output)\n",
-        "        disc_loss = discriminator_loss(real_output, generated_output)\n",
-        "        \n",
-        "      gradients_of_generator = gen_tape.gradient(gen_loss, generator.variables)\n",
-        "      gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.variables)\n",
-        "      \n",
-        "      generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.variables))\n",
-        "      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "6TSZgwc2BUQ-"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "\n",
-        "This model takes about ~30 seconds per epoch to train on a single Tesla K80 on Colab, as of October 2018. \n",
-        "\n",
-        "Eager execution can be slower than executing the equivalent graph as it can't benefit from whole-program optimizations on the graph, and also incurs overheads of interpreting Python code. By using [tf.contrib.eager.defun](https://www.tensorflow.org/api_docs/python/tf/contrib/eager/defun) to create graph functions, we get a ~20 secs/epoch performance boost (from ~50 secs/epoch down to ~30 secs/epoch). This way we get the best of both eager execution (easier for debugging) and graph mode (better performance)."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Iwya07_j5p2A",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_step = tf.contrib.eager.defun(train_step)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def train(dataset, epochs):  \n",
-        "  for epoch in range(epochs):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    for images in dataset:\n",
-        "      train_step(images)\n",
-        "\n",
-        "    display.clear_output(wait=True)\n",
-        "    generate_and_save_images(generator,\n",
-        "                               epoch + 1,\n",
-        "                               random_vector_for_generation)\n",
-        "    \n",
-        "    # saving (checkpoint) the model every 15 epochs\n",
-        "    if (epoch + 1) % 15 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "    \n",
-        "    print ('Time taken for epoch {} is {} sec'.format(epoch + 1,\n",
-        "                                                      time.time()-start))\n",
-        "  # generating after the final epoch\n",
-        "  display.clear_output(wait=True)\n",
-        "  generate_and_save_images(generator,\n",
-        "                           epochs,\n",
-        "                           random_vector_for_generation)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "2aFF7Hk3XdeW"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Generate and save images**\n",
-        "\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  # make sure the training parameter is set to False because we\n",
-        "  # don't want to train the batchnorm layer when doing inference.\n",
-        "  predictions = model(test_input, training=False)\n",
-        "\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "  \n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "        \n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "dZrd4CdjR-Fp"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Train the GANs\n",
-        "We will call the train() method defined above to train the generator and discriminator simultaneously. Note, training GANs can be tricky. It's important that the generator and discriminator do not overpower each other (e.g., that they train at a similar rate).\n",
-        "\n",
-        "At the beginning of the training, the generated images look like random noise. As training progresses, you can see the generated digits look increasingly real. After 50 epochs, they look very much like the MNIST digits."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "Ly3UN0SLLY2l",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "%%time\n",
-        "train(train_dataset, EPOCHS)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "rfM4YcPVPkNO"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Restore the latest checkpoint**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "XhXsd0srPo8c",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Generated images \n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "mLskt7EfXAjr"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "\n",
-        "After training, its time to generate some images! \n",
-        "The last step is to plot the generated images and voila!\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# Display a single image using the epoch number\n",
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "5x3q9_Oe5q0A",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "display_image(EPOCHS)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Generate a GIF of all the saved images**\n",
-        "\n",
-        "We will use imageio to create an animated gif using all the images saved during training."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "IGKQgENQ8lEI",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "with imageio.get_writer('dcgan.gif', mode='I') as writer:\n",
-        "  filenames = glob.glob('image*.png')\n",
-        "  filenames = sorted(filenames)\n",
-        "  last = -1\n",
-        "  for i,filename in enumerate(filenames):\n",
-        "    frame = 2*(i**0.5)\n",
-        "    if round(frame) > round(last):\n",
-        "      last = frame\n",
-        "    else:\n",
-        "      continue\n",
-        "    image = imageio.imread(filename)\n",
-        "    writer.append_data(image)\n",
-        "  image = imageio.imread(filename)\n",
-        "  writer.append_data(image)\n",
-        "    \n",
-        "# this is a hack to display the gif inside the notebook\n",
-        "os.system('cp dcgan.gif dcgan.gif.png')"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "cGhC3-fMWSwl"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Display the animated gif with all the mages generated during the training of GANs."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "uV0yiKpzNP1b",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "display.Image(filename=\"dcgan.gif.png\")"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "6EEG-wePkmJQ"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Download the animated gif**\n",
-        "\n",
-        "Uncomment the code below to download an animated gif from Colab."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "4UJjSnIMOzOJ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "#from google.colab import files\n",
-        "#files.download('dcgan.gif')"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "k6qC-SbjK0yW"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Learn more about GANs\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "xjjkT9KAK6H7"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "We hope this tutorial was helpful! As a next step, you might like to experiment with a different dataset, for example the Large-scale Celeb Faces Attributes (CelebA) dataset [available on Kaggle](https://www.kaggle.com/jessicali9530/celeba-dataset/home).\n",
-        "\n",
-        "To learn more about GANs:\n",
-        "\n",
-        "* Check out MIT's lecture (linked above), or [this](http://cs231n.stanford.edu/slides/2018/cs231n_2018_lecture12.pdf) lecture form Stanford's CS231n. \n",
-        "\n",
-        "* We also recommend the [CVPR 2018 Tutorial on GANs](https://sites.google.com/view/cvpr2018tutorialongans/), and the [NIPS 2016 Tutorial: Generative Adversarial Networks](https://arxiv.org/abs/1701.00160).\n"
-      ]
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2"
     }
-  ]
-}
\ No newline at end of file
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png b/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png
deleted file mode 100644
index b715bd83ef117641c6429e0ac173dbe9b8d5fd88..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png and /dev/null differ
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 12c5eff2b4aa901bdab52bf545e95b1e4dce7468..979772acd3f823a8cc53ab5e026946ad3bb19353 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1,1174 +1,71 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "K2s1A9eLRPEj"
-   },
-   "source": [
-    "##### Copyright 2018 The TensorFlow Authors.\n",
-    "\n",
-    "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "Cffg2i257iMS"
-   },
-   "source": [
-    "# Image Captioning with Attention\n",
-    "\n",
-    "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-    "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
-    "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-    "</td><td>\n",
-    "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "QASbY_HGo4Lq"
-   },
-   "source": [
-    "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
-    "\n",
-    "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
-    "\n",
-    "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
-    "\n",
-    "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
-    "\n",
-    "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
-    "\n",
-    "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
-    "\n",
-    "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
-    "\n",
-    "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
-    "\n",
-    "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
-    "\n",
-    "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "U8l4RJ0XRPEm"
-   },
-   "outputs": [],
-   "source": [
-    "# Import TensorFlow and enable eager execution\n",
-    "# This code requires TensorFlow version >=1.9\n",
-    "import tensorflow as tf\n",
-    "tf.enable_eager_execution()\n",
-    "\n",
-    "# We'll generate plots of attention in order to see which parts of an image\n",
-    "# our model focuses on during captioning\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "# Scikit-learn includes many helpful utilities\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.utils import shuffle\n",
-    "\n",
-    "import re\n",
-    "import numpy as np\n",
-    "import os\n",
-    "import time\n",
-    "import json\n",
-    "from glob import glob\n",
-    "from PIL import Image\n",
-    "import pickle"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "b6qbGw8MRPE5"
-   },
-   "source": [
-    "## Download and prepare the MS-COCO dataset\n",
-    "\n",
-    "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
-    "\n",
-    "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "krQuPYTtRPE7"
-   },
-   "outputs": [],
-   "source": [
-    "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
-    "                                          cache_subdir=os.path.abspath('.'),\n",
-    "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
-    "                                          extract = True)\n",
-    "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
-    "\n",
-    "name_of_zip = 'train2014.zip'\n",
-    "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
-    "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
-    "                                      cache_subdir=os.path.abspath('.'),\n",
-    "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
-    "                                      extract = True)\n",
-    "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
-    "else:\n",
-    "  PATH = os.path.abspath('.')+'/train2014/'"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "aANEzb5WwSzg"
-   },
-   "source": [
-    "## Optionally, limit the size of the training set for faster training\n",
-    "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "4G3b8x8_RPFD"
-   },
-   "outputs": [],
-   "source": [
-    "# read the json file\n",
-    "with open(annotation_file, 'r') as f:\n",
-    "    annotations = json.load(f)\n",
-    "\n",
-    "# storing the captions and the image name in vectors\n",
-    "all_captions = []\n",
-    "all_img_name_vector = []\n",
-    "\n",
-    "for annot in annotations['annotations']:\n",
-    "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
-    "    image_id = annot['image_id']\n",
-    "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
-    "    \n",
-    "    all_img_name_vector.append(full_coco_image_path)\n",
-    "    all_captions.append(caption)\n",
-    "\n",
-    "# shuffling the captions and image_names together\n",
-    "# setting a random state\n",
-    "train_captions, img_name_vector = shuffle(all_captions,\n",
-    "                                          all_img_name_vector,\n",
-    "                                          random_state=1)\n",
-    "\n",
-    "# selecting the first 30000 captions from the shuffled set\n",
-    "num_examples = 30000\n",
-    "train_captions = train_captions[:num_examples]\n",
-    "img_name_vector = img_name_vector[:num_examples]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "mPBMgK34RPFL"
-   },
-   "outputs": [],
-   "source": [
-    "len(train_captions), len(all_captions)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "8cSW4u-ORPFQ"
-   },
-   "source": [
-    "## Preprocess the images using InceptionV3\n",
-    "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
-    "\n",
-    "First, we will need to convert the images into the format inceptionV3 expects by:\n",
-    "* Resizing the image to (299, 299)\n",
-    "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "zXR0217aRPFR"
-   },
-   "outputs": [],
-   "source": [
-    "def load_image(image_path):\n",
-    "    img = tf.read_file(image_path)\n",
-    "    img = tf.image.decode_jpeg(img, channels=3)\n",
-    "    img = tf.image.resize_images(img, (299, 299))\n",
-    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
-    "    return img, image_path"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "MDvIu4sXRPFV"
-   },
-   "source": [
-    "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
-    "\n",
-    "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
-    "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
-    "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
-    "* We avoid doing this during training so it does not become a bottleneck. \n",
-    "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "RD3vW4SsRPFW"
-   },
-   "outputs": [],
-   "source": [
-    "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
-    "                                                weights='imagenet')\n",
-    "new_input = image_model.input\n",
-    "hidden_layer = image_model.layers[-1].output\n",
-    "\n",
-    "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "rERqlR3WRPGO"
-   },
-   "source": [
-    "## Caching the features extracted from InceptionV3\n",
-    "\n",
-    "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
-    "\n",
-    "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
-    "\n",
-    "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
-    "\n",
-    "```for img, path in image_dataset:``` \n",
-    "\n",
-    "to:\n",
-    "\n",
-    "```for img, path in tqdm(image_dataset):```."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Dx_fvbVgRPGQ"
-   },
-   "outputs": [],
-   "source": [
-    "# getting the unique images\n",
-    "encode_train = sorted(set(img_name_vector))\n",
-    "\n",
-    "# feel free to change the batch_size according to your system configuration\n",
-    "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
-    "                                encode_train).map(load_image).batch(16)\n",
-    "\n",
-    "for img, path in image_dataset:\n",
-    "  batch_features = image_features_extract_model(img)\n",
-    "  batch_features = tf.reshape(batch_features, \n",
-    "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
-    "\n",
-    "  for bf, p in zip(batch_features, path):\n",
-    "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
-    "    np.save(path_of_feature, bf.numpy())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "nyqH3zFwRPFi"
-   },
-   "source": [
-    "## Preprocess and tokenize the captions\n",
-    "\n",
-    "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
-    "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
-    "* Finally, we create a word --> index mapping and vice-versa.\n",
-    "* We will then pad all sequences to the be same length as the longest one. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "HZfK8RhQRPFj"
-   },
-   "outputs": [],
-   "source": [
-    "# This will find the maximum length of any caption in our dataset\n",
-    "def calc_max_length(tensor):\n",
-    "    return max(len(t) for t in tensor)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "oJGE34aiRPFo"
-   },
-   "outputs": [],
-   "source": [
-    "# The steps above is a general process of dealing with text processing\n",
-    "\n",
-    "# choosing the top 5000 words from the vocabulary\n",
-    "top_k = 5000\n",
-    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
-    "                                                  oov_token=\"<unk>\", \n",
-    "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
-    "tokenizer.fit_on_texts(train_captions)\n",
-    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "8Q44tNQVRPFt"
-   },
-   "outputs": [],
-   "source": [
-    "tokenizer.word_index['<pad>'] = 0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "0fpJb5ojRPFv"
-   },
-   "outputs": [],
-   "source": [
-    "# creating the tokenized vectors\n",
-    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AidglIZVRPF4"
-   },
-   "outputs": [],
-   "source": [
-    "# padding each vector to the max_length of the captions\n",
-    "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
-    "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "gL0wkttkRPGA"
-   },
-   "outputs": [],
-   "source": [
-    "# calculating the max_length \n",
-    "# used to store the attention weights\n",
-    "max_length = calc_max_length(train_seqs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "M3CD75nDpvTI"
-   },
-   "source": [
-    "## Split the data into training and testing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "iS7DDMszRPGF"
-   },
-   "outputs": [],
-   "source": [
-    "# Create training and validation sets using 80-20 split\n",
-    "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
-    "                                                                    cap_vector, \n",
-    "                                                                    test_size=0.2, \n",
-    "                                                                    random_state=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "XmViPkRFRPGH"
-   },
-   "outputs": [],
-   "source": [
-    "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "uEWM9xrYcg45"
-   },
-   "source": [
-    "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Q3TnZ1ToRPGV"
-   },
-   "outputs": [],
-   "source": [
-    "# feel free to change these parameters according to your system's configuration\n",
-    "\n",
-    "BATCH_SIZE = 64\n",
-    "BUFFER_SIZE = 1000\n",
-    "embedding_dim = 256\n",
-    "units = 512\n",
-    "vocab_size = len(tokenizer.word_index)\n",
-    "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
-    "# these two variables represent that\n",
-    "features_shape = 2048\n",
-    "attention_features_shape = 64"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "SmZS2N0bXG3T"
-   },
-   "outputs": [],
-   "source": [
-    "# loading the numpy files \n",
-    "def map_func(img_name, cap):\n",
-    "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
-    "    return img_tensor, cap"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "FDF_Nm3tRPGZ"
-   },
-   "outputs": [],
-   "source": [
-    "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
-    "\n",
-    "# using map to load the numpy files in parallel\n",
-    "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
-    "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
-    "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
-    "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
-    "\n",
-    "# shuffling and batching\n",
-    "dataset = dataset.shuffle(BUFFER_SIZE)\n",
-    "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
-    "dataset = dataset.batch(BATCH_SIZE)\n",
-    "dataset = dataset.prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "nrvoDphgRPGd"
-   },
-   "source": [
-    "## Model\n",
-    "\n",
-    "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-    "\n",
-    "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
-    "\n",
-    "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
-    "* We squash that to a shape of (64, 2048).\n",
-    "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
-    "* The RNN(here GRU) attends over the image to predict the next word."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AAppCGLKRPGd"
-   },
-   "outputs": [],
-   "source": [
-    "def gru(units):\n",
-    "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
-    "  # significant speedup).\n",
-    "  if tf.test.is_gpu_available():\n",
-    "    return tf.keras.layers.CuDNNGRU(units, \n",
-    "                                    return_sequences=True, \n",
-    "                                    return_state=True, \n",
-    "                                    recurrent_initializer='glorot_uniform')\n",
-    "  else:\n",
-    "    return tf.keras.layers.GRU(units, \n",
-    "                               return_sequences=True, \n",
-    "                               return_state=True, \n",
-    "                               recurrent_activation='sigmoid', \n",
-    "                               recurrent_initializer='glorot_uniform')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "ja2LFTMSdeV3"
-   },
-   "outputs": [],
-   "source": [
-    "class BahdanauAttention(tf.keras.Model):\n",
-    "  def __init__(self, units):\n",
-    "    super(BahdanauAttention, self).__init__()\n",
-    "    self.W1 = tf.keras.layers.Dense(units)\n",
-    "    self.W2 = tf.keras.layers.Dense(units)\n",
-    "    self.V = tf.keras.layers.Dense(1)\n",
-    "  \n",
-    "  def call(self, features, hidden):\n",
-    "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
-    "    \n",
-    "    # hidden shape == (batch_size, hidden_size)\n",
-    "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
-    "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
-    "    \n",
-    "    # score shape == (batch_size, 64, hidden_size)\n",
-    "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
-    "    \n",
-    "    # attention_weights shape == (batch_size, 64, 1)\n",
-    "    # we get 1 at the last axis because we are applying score to self.V\n",
-    "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
-    "    \n",
-    "    # context_vector shape after sum == (batch_size, hidden_size)\n",
-    "    context_vector = attention_weights * features\n",
-    "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
-    "    \n",
-    "    return context_vector, attention_weights"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AZ7R1RxHRPGf"
-   },
-   "outputs": [],
-   "source": [
-    "class CNN_Encoder(tf.keras.Model):\n",
-    "    # Since we have already extracted the features and dumped it using pickle\n",
-    "    # This encoder passes those features through a Fully connected layer\n",
-    "    def __init__(self, embedding_dim):\n",
-    "        super(CNN_Encoder, self).__init__()\n",
-    "        # shape after fc == (batch_size, 64, embedding_dim)\n",
-    "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
-    "        \n",
-    "    def call(self, x):\n",
-    "        x = self.fc(x)\n",
-    "        x = tf.nn.relu(x)\n",
-    "        return x"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "V9UbGQmERPGi"
-   },
-   "outputs": [],
-   "source": [
-    "class RNN_Decoder(tf.keras.Model):\n",
-    "  def __init__(self, embedding_dim, units, vocab_size):\n",
-    "    super(RNN_Decoder, self).__init__()\n",
-    "    self.units = units\n",
-    "\n",
-    "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-    "    self.gru = gru(self.units)\n",
-    "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
-    "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
-    "    \n",
-    "    self.attention = BahdanauAttention(self.units)\n",
-    "        \n",
-    "  def call(self, x, features, hidden):\n",
-    "    # defining attention as a separate model\n",
-    "    context_vector, attention_weights = self.attention(features, hidden)\n",
-    "    \n",
-    "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
-    "    x = self.embedding(x)\n",
-    "    \n",
-    "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
-    "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
-    "    \n",
-    "    # passing the concatenated vector to the GRU\n",
-    "    output, state = self.gru(x)\n",
-    "    \n",
-    "    # shape == (batch_size, max_length, hidden_size)\n",
-    "    x = self.fc1(output)\n",
-    "    \n",
-    "    # x shape == (batch_size * max_length, hidden_size)\n",
-    "    x = tf.reshape(x, (-1, x.shape[2]))\n",
-    "    \n",
-    "    # output shape == (batch_size * max_length, vocab)\n",
-    "    x = self.fc2(x)\n",
-    "\n",
-    "    return x, state, attention_weights\n",
-    "\n",
-    "  def reset_state(self, batch_size):\n",
-    "    return tf.zeros((batch_size, self.units))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Qs_Sr03wRPGk"
-   },
-   "outputs": [],
-   "source": [
-    "encoder = CNN_Encoder(embedding_dim)\n",
-    "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "-bYN7xA0RPGl"
-   },
-   "outputs": [],
-   "source": [
-    "optimizer = tf.train.AdamOptimizer()\n",
-    "\n",
-    "# We are masking the loss calculated for padding\n",
-    "def loss_function(real, pred):\n",
-    "    mask = 1 - np.equal(real, 0)\n",
-    "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
-    "    return tf.reduce_mean(loss_)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "PHod7t72RPGn"
-   },
-   "source": [
-    "## Training\n",
-    "\n",
-    "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
-    "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
-    "* The decoder returns the predictions and the decoder hidden state.\n",
-    "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
-    "* Use teacher forcing to decide the next input to the decoder.\n",
-    "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
-    "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Vt4WZ5mhJE-E"
-   },
-   "outputs": [],
-   "source": [
-    "# adding this in a separate cell because if you run the training cell \n",
-    "# many times, the loss_plot array will be reset\n",
-    "loss_plot = []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "UlA4VIQpRPGo"
-   },
-   "outputs": [],
-   "source": [
-    "EPOCHS = 20\n",
-    "\n",
-    "for epoch in range(EPOCHS):\n",
-    "    start = time.time()\n",
-    "    total_loss = 0\n",
-    "    \n",
-    "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
-    "        loss = 0\n",
-    "        \n",
-    "        # initializing the hidden state for each batch\n",
-    "        # because the captions are not related from image to image\n",
-    "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
-    "\n",
-    "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
-    "        \n",
-    "        with tf.GradientTape() as tape:\n",
-    "            features = encoder(img_tensor)\n",
-    "            \n",
-    "            for i in range(1, target.shape[1]):\n",
-    "                # passing the features through the decoder\n",
-    "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
-    "\n",
-    "                loss += loss_function(target[:, i], predictions)\n",
-    "                \n",
-    "                # using teacher forcing\n",
-    "                dec_input = tf.expand_dims(target[:, i], 1)\n",
-    "        \n",
-    "        total_loss += (loss / int(target.shape[1]))\n",
-    "        \n",
-    "        variables = encoder.variables + decoder.variables\n",
-    "        \n",
-    "        gradients = tape.gradient(loss, variables) \n",
-    "        \n",
-    "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
-    "        \n",
-    "        if batch % 100 == 0:\n",
-    "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
-    "                                                          batch, \n",
-    "                                                          loss.numpy() / int(target.shape[1])))\n",
-    "    # storing the epoch end loss value to plot later\n",
-    "    loss_plot.append(total_loss / len(cap_vector))\n",
-    "    \n",
-    "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
-    "                                         total_loss/len(cap_vector)))\n",
-    "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "1Wm83G-ZBPcC"
-   },
-   "outputs": [],
-   "source": [
-    "plt.plot(loss_plot)\n",
-    "plt.xlabel('Epochs')\n",
-    "plt.ylabel('Loss')\n",
-    "plt.title('Loss Plot')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "xGvOcLQKghXN"
-   },
-   "source": [
-    "## Caption!\n",
-    "\n",
-    "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
-    "* Stop predicting when the model predicts the end token.\n",
-    "* And store the attention weights for every time step."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "RCWpDtyNRPGs"
-   },
-   "outputs": [],
-   "source": [
-    "def evaluate(image):\n",
-    "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
-    "\n",
-    "    hidden = decoder.reset_state(batch_size=1)\n",
-    "\n",
-    "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
-    "    img_tensor_val = image_features_extract_model(temp_input)\n",
-    "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
-    "\n",
-    "    features = encoder(img_tensor_val)\n",
-    "\n",
-    "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
-    "    result = []\n",
-    "\n",
-    "    for i in range(max_length):\n",
-    "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
-    "\n",
-    "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
-    "\n",
-    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
-    "        result.append(tokenizer.index_word[predicted_id])\n",
-    "\n",
-    "        if tokenizer.index_word[predicted_id] == '<end>':\n",
-    "            return result, attention_plot\n",
-    "\n",
-    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
-    "\n",
-    "    attention_plot = attention_plot[:len(result), :]\n",
-    "    return result, attention_plot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "fD_y7PD6RPGt"
-   },
-   "outputs": [],
-   "source": [
-    "def plot_attention(image, result, attention_plot):\n",
-    "    temp_image = np.array(Image.open(image))\n",
-    "\n",
-    "    fig = plt.figure(figsize=(10, 10))\n",
-    "    \n",
-    "    len_result = len(result)\n",
-    "    for l in range(len_result):\n",
-    "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
-    "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
-    "        ax.set_title(result[l])\n",
-    "        img = ax.imshow(temp_image)\n",
-    "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
-    "\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "K2s1A9eLRPEj"
+      },
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors.\n",
+        "\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
+      ]
     },
-    "colab_type": "code",
-    "id": "io7ws3ReRPGv"
-   },
-   "outputs": [],
-   "source": [
-    "# captions on the validation set\n",
-    "rid = np.random.randint(0, len(img_name_val))\n",
-    "image = img_name_val[rid]\n",
-    "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n",
-    "result, attention_plot = evaluate(image)\n",
-    "\n",
-    "print ('Real Caption:', real_caption)\n",
-    "print ('Prediction Caption:', ' '.join(result))\n",
-    "plot_attention(image, result, attention_plot)\n",
-    "# opening the image\n",
-    "Image.open(img_name_val[rid])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "Rprk3HEvZuxb"
-   },
-   "source": [
-    "## Try it on your own images\n",
-    "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Cffg2i257iMS"
+      },
+      "source": [
+        "# Image Captioning with Attention\n",
+        "\n",
+        "This example has moved:\n",
+        "\n",
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/image_captioning.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/image_captioning.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+      ]
     },
-    "colab_type": "code",
-    "id": "9Psd1quzaAWg"
-   },
-   "outputs": [],
-   "source": [
-    "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
-    "image_extension = image_url[-4:]\n",
-    "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
-    "                                     origin=image_url)\n",
-    "\n",
-    "result, attention_plot = evaluate(image_path)\n",
-    "print ('Prediction Caption:', ' '.join(result))\n",
-    "plot_attention(image_path, result, attention_plot)\n",
-    "# opening the image\n",
-    "Image.open(image_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "VJZXyJco6uLO"
-   },
-   "source": [
-    "# Next steps\n",
-    "\n",
-    "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "collapsed_sections": [],
-   "default_view": {},
-   "name": "image_captioning_with_attention.ipynb",
-   "private_outputs": true,
-   "provenance": [
     {
-     "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
-     "timestamp": 1530222436922
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "QASbY_HGo4Lq"
+      },
+      "source": [
+        "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
+        "\n",
+        "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
+        "\n",
+        "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "image_captioning_with_attention.ipynb",
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
+          "timestamp": 1530222436922
+        }
+      ],
+      "toc_visible": true,
+      "version": "0.3.2"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
     }
-   ],
-   "toc_visible": true,
-   "version": "0.3.2",
-   "views": {}
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index bda9e77085e45ae31a228142135425e22a1c6780..c945c753b3ba36d16aa6985d23a5849f8f552304 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -13,633 +13,13 @@
         "\n",
         "# Text Generation using a RNN\n",
         "\n",
+        "This example has moved.\n",
+        "\n",
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/sequences/text_generation.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on Github\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "BwpJ5IffzRG6"
-      },
-      "source": [
-        "This notebook demonstrates how to generate text using an RNN using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). If you like, you can write a similar [model](https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/8.1-text-generation-with-lstm.ipynb) using less code. Here, we show a lower-level impementation that's useful to understand as prework before diving in to deeper examples in a similar, like [Neural Machine Translation with Attention](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-        "\n",
-        "This notebook is an end-to-end example. When you run it, it will download a dataset of Shakespeare's writing. We'll use a collection of plays, borrowed from Andrej Karpathy's excellent [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).  The notebook will train a model, and use it to generate sample output.\n",
-        "  \n",
-        "Here is the output(with start string='w') after training a single layer GRU for 30 epochs with the default settings below:\n",
-        "\n",
-        "```\n",
-        "were to the death of him\n",
-        "And nothing of the field in the view of hell,\n",
-        "When I said, banish him, I will not burn thee that would live.\n",
-        "\n",
-        "HENRY BOLINGBROKE:\n",
-        "My gracious uncle--\n",
-        "\n",
-        "DUKE OF YORK:\n",
-        "As much disgraced to the court, the gods them speak,\n",
-        "And now in peace himself excuse thee in the world.\n",
-        "\n",
-        "HORTENSIO:\n",
-        "Madam, 'tis not the cause of the counterfeit of the earth,\n",
-        "And leave me to the sun that set them on the earth\n",
-        "And leave the world and are revenged for thee.\n",
-        "\n",
-        "GLOUCESTER:\n",
-        "I would they were talking with the very name of means\n",
-        "To make a puppet of a guest, and therefore, good Grumio,\n",
-        "Nor arm'd to prison, o' the clouds, of the whole field,\n",
-        "With the admire\n",
-        "With the feeding of thy chair, and we have heard it so,\n",
-        "I thank you, sir, he is a visor friendship with your silly your bed.\n",
-        "\n",
-        "SAMPSON:\n",
-        "I do desire to live, I pray: some stand of the minds, make thee remedies\n",
-        "With the enemies of my soul.\n",
-        "\n",
-        "MENENIUS:\n",
-        "I'll keep the cause of my mistress.\n",
-        "\n",
-        "POLIXENES:\n",
-        "My brother Marcius!\n",
-        "\n",
-        "Second Servant:\n",
-        "Will't ple\n",
-        "```\n",
-        "\n",
-        "Of course, while some of the sentences are grammatical, most do not make sense. But, consider:\n",
-        "\n",
-        "* Our model is character based (when we began training, it did not yet know how to spell a valid English word, or that words were even a unit of text).\n",
-        "\n",
-        "* The structure of the output resembles a play (blocks begin with a speaker name, in all caps similar to the original text). Sentences generally end with a period. If you look at the text from a distance (or don't read the invididual words too closely, it appears as if it's an excerpt from a play).\n",
-        "\n",
-        "As a next step, you can experiment training the model on a different dataset - any large text file(ASCII) will do, and you can modify a single line of code below to make that change. Have fun!\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "R3p22DBDsaCA"
-      },
-      "source": [
-        "## Install unidecode library\n",
-        "A helpful library to convert unicode to ASCII."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "wZ6LOM12wKGH"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install unidecode"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "WGyKZj3bzf9p"
-      },
-      "source": [
-        "## Import tensorflow and enable eager execution."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "yG_n40gFzf9s"
-      },
-      "outputs": [],
-      "source": [
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "# Note: Once you enable eager execution, it cannot be disabled. \n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import re\n",
-        "import random\n",
-        "import unidecode\n",
-        "import time"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "EHDoRoc5PKWz"
-      },
-      "source": [
-        "## Download the dataset\n",
-        "\n",
-        "In this example, we will use the [shakespeare dataset](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt). You can use any other dataset that you like.\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "pD_55cOxLkAb"
-      },
-      "outputs": [],
-      "source": [
-        "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "UHjdCjDuSvX_"
-      },
-      "source": [
-        "## Read the dataset\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "-E5JvY3wzf94"
-      },
-      "outputs": [],
-      "source": [
-        "text = unidecode.unidecode(open(path_to_file).read())\n",
-        "# length of text is the number of characters in it\n",
-        "print (len(text))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Il9ww98izf-D"
-      },
-      "source": [
-        "Creating dictionaries to map from characters to their indices and vice-versa, which will be used to vectorize the inputs"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "IalZLbvOzf-F"
-      },
-      "outputs": [],
-      "source": [
-        "# unique contains all the unique characters in the file\n",
-        "unique = sorted(set(text))\n",
-        "\n",
-        "# creating a mapping from unique characters to indices\n",
-        "char2idx = {u:i for i, u in enumerate(unique)}\n",
-        "idx2char = {i:u for i, u in enumerate(unique)}"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "1v_qUYfAzf-I"
-      },
-      "outputs": [],
-      "source": [
-        "# setting the maximum length sentence we want for a single input in characters\n",
-        "max_length = 100\n",
-        "\n",
-        "# length of the vocabulary in chars\n",
-        "vocab_size = len(unique)\n",
-        "\n",
-        "# the embedding dimension \n",
-        "embedding_dim = 256\n",
-        "\n",
-        "# number of RNN (here GRU) units\n",
-        "units = 1024\n",
-        "\n",
-        "# batch size \n",
-        "BATCH_SIZE = 64\n",
-        "\n",
-        "# buffer size to shuffle our dataset\n",
-        "BUFFER_SIZE = 10000"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "LFjSVAlWzf-N"
-      },
-      "source": [
-        "## Creating the input and output tensors\n",
-        "\n",
-        "Vectorizing the input and the target text because our model cannot understand strings only numbers.\n",
-        "\n",
-        "But first, we need to create the input and output vectors.\n",
-        "Remember the max_length we set above, we will use it here. We are creating **max_length** chunks of input, where each input vector is all the characters in that chunk except the last and the target vector is all the characters in that chunk except the first.\n",
-        "\n",
-        "For example, consider that the string = 'tensorflow' and the max_length is 9\n",
-        "\n",
-        "So, the `input = 'tensorflo'` and `output = 'ensorflow'`\n",
-        "\n",
-        "After creating the vectors, we convert each character into numbers using the **char2idx** dictionary we created above."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "0UHJDA39zf-O"
-      },
-      "outputs": [],
-      "source": [
-        "input_text = []\n",
-        "target_text = []\n",
-        "\n",
-        "for f in range(0, len(text)-max_length, max_length):\n",
-        "    inps = text[f:f+max_length]\n",
-        "    targ = text[f+1:f+1+max_length]\n",
-        "\n",
-        "    input_text.append([char2idx[i] for i in inps])\n",
-        "    target_text.append([char2idx[t] for t in targ])\n",
-        "    \n",
-        "print (np.array(input_text).shape)\n",
-        "print (np.array(target_text).shape)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "MJdfPmdqzf-R"
-      },
-      "source": [
-        "## Creating batches and shuffling them using tf.data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "p2pGotuNzf-S"
-      },
-      "outputs": [],
-      "source": [
-        "dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)\n",
-        "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "m8gPwEjRzf-Z"
-      },
-      "source": [
-        "## Creating the model\n",
-        "\n",
-        "We use the Model Subclassing API which gives us full flexibility to create the model and change it however we like. We use 3 layers to define our model.\n",
-        "\n",
-        "* Embedding layer\n",
-        "* GRU layer (you can use an LSTM layer here)\n",
-        "* Fully connected layer"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "P3KTiiInzf-a"
-      },
-      "outputs": [],
-      "source": [
-        "class Model(tf.keras.Model):\n",
-        "  def __init__(self, vocab_size, embedding_dim, units, batch_size):\n",
-        "    super(Model, self).__init__()\n",
-        "    self.units = units\n",
-        "    self.batch_sz = batch_size\n",
-        "\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "\n",
-        "    if tf.test.is_gpu_available():\n",
-        "      self.gru = tf.keras.layers.CuDNNGRU(self.units, \n",
-        "                                          return_sequences=True, \n",
-        "                                          return_state=True, \n",
-        "                                          recurrent_initializer='glorot_uniform')\n",
-        "    else:\n",
-        "      self.gru = tf.keras.layers.GRU(self.units, \n",
-        "                                     return_sequences=True, \n",
-        "                                     return_state=True, \n",
-        "                                     recurrent_activation='sigmoid', \n",
-        "                                     recurrent_initializer='glorot_uniform')\n",
-        "\n",
-        "    self.fc = tf.keras.layers.Dense(vocab_size)\n",
-        "        \n",
-        "  def call(self, x, hidden):\n",
-        "    x = self.embedding(x)\n",
-        "\n",
-        "    # output shape == (batch_size, max_length, hidden_size) \n",
-        "    # states shape == (batch_size, hidden_size)\n",
-        "\n",
-        "    # states variable to preserve the state of the model\n",
-        "    # this will be used to pass at every step to the model while training\n",
-        "    output, states = self.gru(x, initial_state=hidden)\n",
-        "\n",
-        "\n",
-        "    # reshaping the output so that we can pass it to the Dense layer\n",
-        "    # after reshaping the shape is (batch_size * max_length, hidden_size)\n",
-        "    output = tf.reshape(output, (-1, output.shape[2]))\n",
-        "\n",
-        "    # The dense layer will output predictions for every time_steps(max_length)\n",
-        "    # output shape after the dense layer == (max_length * batch_size, vocab_size)\n",
-        "    x = self.fc(output)\n",
-        "\n",
-        "    return x, states"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "trpqTWyvk0nr"
-      },
-      "source": [
-        "## Call the model and set the optimizer and the loss function"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "7t2XrzEOzf-e"
-      },
-      "outputs": [],
-      "source": [
-        "model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "dkjWIATszf-h"
-      },
-      "outputs": [],
-      "source": [
-        "optimizer = tf.train.AdamOptimizer()\n",
-        "\n",
-        "# using sparse_softmax_cross_entropy so that we don't have to create one-hot vectors\n",
-        "def loss_function(real, preds):\n",
-        "    return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "3K6s6F79P7za"
-      },
-      "source": [
-        "## Checkpoints (Object-based saving)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "oAGisDdfP9rL"
-      },
-      "outputs": [],
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
-        "                                 model=model)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "lPrP0XMUzf-p"
-      },
-      "source": [
-        "## Train the model\n",
-        "\n",
-        "Here we will use a custom training loop with the help of GradientTape()\n",
-        "\n",
-        "* We initialize the hidden state of the model with zeros and shape == (batch_size, number of rnn units). We do this by calling the function defined while creating the model.\n",
-        "\n",
-        "* Next, we iterate over the dataset(batch by batch) and calculate the **predictions and the hidden states** associated with that input.\n",
-        "\n",
-        "* There are a lot of interesting things happening here.\n",
-        "  * The model gets hidden state(initialized with 0), lets call that **H0** and the first batch of input, lets call that **I0**.\n",
-        "  * The model then returns the predictions **P1** and **H1**.\n",
-        "  * For the next batch of input, the model receives **I1** and **H1**.\n",
-        "  * The interesting thing here is that we pass **H1** to the model with **I1** which is how the model learns. The context learned from batch to batch is contained in the **hidden state**.\n",
-        "  * We continue doing this until the dataset is exhausted and then we start a new epoch and repeat this.\n",
-        "\n",
-        "* After calculating the predictions, we calculate the **loss** using the loss function defined above. Then we calculate the gradients of the loss with respect to the model variables(input)\n",
-        "\n",
-        "* Finally, we take a step in that direction with the help of the optimizer using the apply_gradients function.\n",
-        "\n",
-        "Note:- If you are running this notebook in Colab which has a **Tesla K80 GPU** it takes about 23 seconds per epoch.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "d4tSNwymzf-q"
-      },
-      "outputs": [],
-      "source": [
-        "# Training step\n",
-        "\n",
-        "EPOCHS = 20\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    # initializing the hidden state at the start of every epoch\n",
-        "    hidden = model.reset_states()\n",
-        "    \n",
-        "    for (batch, (inp, target)) in enumerate(dataset):\n",
-        "          with tf.GradientTape() as tape:\n",
-        "              # feeding the hidden state back into the model\n",
-        "              # This is the interesting step\n",
-        "              predictions, hidden = model(inp, hidden)\n",
-        "              \n",
-        "              # reshaping the target because that's how the \n",
-        "              # loss function expects it\n",
-        "              target = tf.reshape(target, (-1,))\n",
-        "              loss = loss_function(target, predictions)\n",
-        "              \n",
-        "          grads = tape.gradient(loss, model.variables)\n",
-        "          optimizer.apply_gradients(zip(grads, model.variables))\n",
-        "\n",
-        "          if batch % 100 == 0:\n",
-        "              print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,\n",
-        "                                                            batch,\n",
-        "                                                            loss))\n",
-        "    # saving (checkpoint) the model every 5 epochs\n",
-        "    if (epoch + 1) % 5 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "\n",
-        "    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n",
-        "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "01AR9vpNQMFF"
-      },
-      "source": [
-        "## Restore the latest checkpoint"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "tyvpYomYQQkF"
-      },
-      "outputs": [],
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "DjGz1tDkzf-u"
-      },
-      "source": [
-        "## Predicting using our trained model\n",
-        "\n",
-        "The below code block is used to generated the text\n",
-        "\n",
-        "* We start by choosing a start string and initializing the hidden state and setting the number of characters we want to generate.\n",
-        "\n",
-        "* We get predictions using the start_string and the hidden state\n",
-        "\n",
-        "* Then we use argmax to calculate the index of the predicted word. **We use this predicted word as our next input to the model**\n",
-        "\n",
-        "* **The hidden state returned by the model is fed back into the model so that it now has more context rather than just one word.** After we predict the next word, the modified hidden states are again fed back into the model, which is how it learns as it gets more context from the previously predicted words.\n",
-        "\n",
-        "* If you see the predictions, the model knows when to capitalize, make paragraphs and the text follows a shakespeare style of writing which is pretty awesome!"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "WvuwZBX5Ogfd"
-      },
-      "outputs": [],
-      "source": [
-        "# Evaluation step(generating text using the model learned)\n",
-        "\n",
-        "# number of characters to generate\n",
-        "num_generate = 1000\n",
-        "\n",
-        "# You can change the start string to experiment\n",
-        "start_string = 'Q'\n",
-        "# converting our start string to numbers(vectorizing!) \n",
-        "input_eval = [char2idx[s] for s in start_string]\n",
-        "input_eval = tf.expand_dims(input_eval, 0)\n",
-        "\n",
-        "# empty string to store our results\n",
-        "text_generated = ''\n",
-        "\n",
-        "# hidden state shape == (batch_size, number of rnn units); here batch size == 1\n",
-        "hidden = [tf.zeros((1, units))]\n",
-        "for i in range(num_generate):\n",
-        "    predictions, hidden = model(input_eval, hidden)\n",
-        "\n",
-        "    # using argmax to predict the word returned by the model\n",
-        "    predicted_id = tf.argmax(predictions[-1]).numpy()\n",
-        "    \n",
-        "    # We pass the predicted word as the next input to the model\n",
-        "    # along with the previous hidden state\n",
-        "    input_eval = tf.expand_dims([predicted_id], 0)\n",
-        "    \n",
-        "    text_generated += idx2char[predicted_id]\n",
-        "\n",
-        "print (start_string + text_generated)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "AM2Uma_-yVIq"
-      },
-      "source": [
-        "## Next steps\n",
-        "\n",
-        "* Change the start string to a different character, or the start of a sentence.\n",
-        "* Experiment with training on a different, or with different parameters. [Project  Gutenberg](http://www.gutenberg.org/ebooks/100), for example, contains a large collection of books.\n",
-        "* Add another RNN layer.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "gtEd86sX5cB2"
-      },
-      "outputs": [],
-      "source": [
-        ""
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/sequences/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     }
   ],
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
index 7bdf9053de749af9d09b12ba7b848e21c1fdb8f0..35d509904211d98f124d2555fc48166e75cb0dd9 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
@@ -28,7 +28,7 @@ py_library(
 
 cuda_py_test(
     name = "l2hmc_test",
-    size = "large",
+    size = "medium",
     srcs = ["l2hmc_test.py"],
     additional_deps = [
         ":l2hmc",
@@ -36,4 +36,8 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index 74ce9e84f013d79b3a33ffa79993980b561e366d..30afef83bc5c6c164c8456ed472f4d6064068a25 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "linear_regression",
     srcs = ["linear_regression.py"],
     srcs_version = "PY2AND3",
+    deps = [":linear_regression_lib"],
+)
+
+py_library(
+    name = "linear_regression_lib",
+    srcs = ["linear_regression.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -20,10 +27,13 @@ cuda_py_test(
     size = "small",
     srcs = ["linear_regression_test.py"],
     additional_deps = [
-        ":linear_regression",
+        ":linear_regression_lib",
         "//tensorflow:tensorflow_py",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
@@ -31,7 +41,7 @@ cuda_py_test(
     size = "small",
     srcs = ["linear_regression_graph_test.py"],
     additional_deps = [
-        ":linear_regression",
+        ":linear_regression_lib",
         "//tensorflow:tensorflow_py",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 099b712fc06d1d3eb9ab4095f8db7283690bda76..206ef9409df7b1dc21de42ba919d2ba97f334a8c 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -56,7 +56,7 @@ class LinearModel(tf.keras.Model):
 
 
 def mean_square_loss(model, xs, ys):
-  return tf.reduce_mean(tf.square(tf.subtract(model(xs), ys)))
+  return tf.reduce_mean(tf.squared_difference(model(xs), ys))
 
 
 def fit(model, dataset, optimizer, verbose=False, logdir=None):
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 66d52a74943d0d81fde05ce51b019558b327978d..436e887736158ec1ba8e46eac8de4ac7b8e6be01 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -1,11 +1,28 @@
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "nmt_with_attention.ipynb",
+      "version": "0.3.2",
+      "provenance": [],
+      "private_outputs": true,
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "accelerator": "GPU"
+  },
   "cells": [
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "AOpGoE2T-YXS"
       },
+      "cell_type": "markdown",
       "source": [
         "##### Copyright 2018 The TensorFlow Authors.\n",
         "\n",
@@ -13,19 +30,19 @@
         "\n",
         "# Neural Machine Translation with Attention\n",
         "\n",
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\n",
-        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
-        "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\">\n",
+        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+        "</td><td>\n",
+        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "CiwtNgENbx2g"
       },
+      "cell_type": "markdown",
       "source": [
         "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n",
         "\n",
@@ -33,24 +50,22 @@
         "\n",
         "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence has the model's attention while translating:\n",
         "\n",
-        "\u003cimg src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\"\u003e\n",
+        "<img src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\">\n",
         "\n",
         "Note: This example takes approximately 10 mintues to run on a single P100 GPU."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "tnxXKDjq3jEL"
+        "id": "tnxXKDjq3jEL",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "from __future__ import absolute_import, division, print_function\n",
         "\n",
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
+        "# Import TensorFlow >= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "\n",
         "tf.enable_eager_execution()\n",
@@ -65,14 +80,16 @@
         "import time\n",
         "\n",
         "print(tf.__version__)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "wfodePkj3jEa"
       },
+      "cell_type": "markdown",
       "source": [
         "## Download and prepare the dataset\n",
         "\n",
@@ -91,14 +108,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "kRVATYOgJs1b"
+        "id": "kRVATYOgJs1b",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Download the file\n",
         "path_to_zip = tf.keras.utils.get_file(\n",
@@ -106,17 +121,17 @@
         "    extract=True)\n",
         "\n",
         "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\""
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "rd0jw-eC3jEh"
+        "id": "rd0jw-eC3jEh",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Converts the unicode file to ascii\n",
         "def unicode_to_ascii(s):\n",
@@ -128,7 +143,7 @@
         "    w = unicode_to_ascii(w.lower().strip())\n",
         "    \n",
         "    # creating a space between a word and the punctuation following it\n",
-        "    # eg: \"he is a boy.\" =\u003e \"he is a boy .\" \n",
+        "    # eg: \"he is a boy.\" => \"he is a boy .\" \n",
         "    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
         "    w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
         "    w = re.sub(r'[\" \"]+', \" \", w)\n",
@@ -140,19 +155,19 @@
         "    \n",
         "    # adding a start and an end token to the sentence\n",
         "    # so that the model know when to start and stop predicting.\n",
-        "    w = '\u003cstart\u003e ' + w + ' \u003cend\u003e'\n",
+        "    w = '<start> ' + w + ' <end>'\n",
         "    return w"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "OHn4Dct23jEm"
+        "id": "OHn4Dct23jEm",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# 1. Remove the accents\n",
         "# 2. Clean the sentences\n",
@@ -163,20 +178,20 @@
         "    word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')]  for l in lines[:num_examples]]\n",
         "    \n",
         "    return word_pairs"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "9xbqO7Iie9bb"
+        "id": "9xbqO7Iie9bb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "# This class creates a word -\u003e index mapping (e.g,. \"dad\" -\u003e 5) and vice-versa \n",
-        "# (e.g., 5 -\u003e \"dad\") for each language,\n",
+        "# This class creates a word -> index mapping (e.g,. \"dad\" -> 5) and vice-versa \n",
+        "# (e.g., 5 -> \"dad\") for each language,\n",
         "class LanguageIndex():\n",
         "  def __init__(self, lang):\n",
         "    self.lang = lang\n",
@@ -192,23 +207,23 @@
         "    \n",
         "    self.vocab = sorted(self.vocab)\n",
         "    \n",
-        "    self.word2idx['\u003cpad\u003e'] = 0\n",
+        "    self.word2idx['<pad>'] = 0\n",
         "    for index, word in enumerate(self.vocab):\n",
         "      self.word2idx[word] = index + 1\n",
         "    \n",
         "    for word, index in self.word2idx.items():\n",
         "      self.idx2word[index] = word"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "eAY9k49G3jE_"
+        "id": "eAY9k49G3jE_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def max_length(tensor):\n",
         "    return max(len(t) for t in tensor)\n",
@@ -244,71 +259,71 @@
         "                                                                  padding='post')\n",
         "    \n",
         "    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "GOi42V79Ydlr"
       },
+      "cell_type": "markdown",
       "source": [
         "### Limit the size of the dataset to experiment faster (optional)\n",
         "\n",
-        "Training on the complete dataset of \u003e100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
+        "Training on the complete dataset of >100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "cnxC7q-j3jFD"
+        "id": "cnxC7q-j3jFD",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Try experimenting with the size of that dataset\n",
         "num_examples = 30000\n",
         "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "4QILQkOs3jFG"
+        "id": "4QILQkOs3jFG",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Creating training and validation sets using an 80-20 split\n",
         "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
         "\n",
         "# Show length\n",
         "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "rgCLkfv5uO3d"
       },
+      "cell_type": "markdown",
       "source": [
         "### Create a tf.data dataset"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "TqHsArVZ3jFS"
+        "id": "TqHsArVZ3jFS",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "BUFFER_SIZE = len(input_tensor_train)\n",
         "BATCH_SIZE = 64\n",
@@ -320,27 +335,29 @@
         "\n",
         "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
         "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "TNfHIF71ulLu"
       },
+      "cell_type": "markdown",
       "source": [
         "## Write the encoder and decoder model\n",
         "\n",
-        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
+        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://github.com/tensorflow/nmt). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://github.com/tensorflow/nmt#background-on-the-attention-mechanism) from the seq2seq tutorial. The following diagram shows that each input word is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
         "\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\"\u003e\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\">\n",
         "\n",
         "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n",
         "\n",
         "Here are the equations that are implemented:\n",
         "\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\"\u003e\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\"\u003e\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\">\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\">\n",
         "\n",
         "We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:\n",
         "\n",
@@ -362,14 +379,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "avyJ_4VIUoHb"
+        "id": "avyJ_4VIUoHb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def gru(units):\n",
         "  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n",
@@ -385,17 +400,17 @@
         "                               return_state=True, \n",
         "                               recurrent_activation='sigmoid', \n",
         "                               recurrent_initializer='glorot_uniform')"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "nZ2rI24i3jFg"
+        "id": "nZ2rI24i3jFg",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "class Encoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
@@ -412,17 +427,17 @@
         "    \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.enc_units))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "yJ_B3mhW3jFk"
+        "id": "yJ_B3mhW3jFk",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "class Decoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n",
@@ -476,41 +491,41 @@
         "        \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.dec_units))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "P5UY8wko3jFp"
+        "id": "P5UY8wko3jFp",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
         "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "_ch_71VbIRfK"
       },
+      "cell_type": "markdown",
       "source": [
         "## Define the optimizer and the loss function"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "WmTHr5iV3jFr"
+        "id": "WmTHr5iV3jFr",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "optimizer = tf.train.AdamOptimizer()\n",
         "\n",
@@ -519,41 +534,43 @@
         "  mask = 1 - np.equal(real, 0)\n",
         "  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
         "  return tf.reduce_mean(loss_)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "DMVWzzsfNl4e"
       },
+      "cell_type": "markdown",
       "source": [
         "## Checkpoints (Object-based saving)"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "Zj8bXQTgNwrF"
+        "id": "Zj8bXQTgNwrF",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "checkpoint_dir = './training_checkpoints'\n",
         "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
         "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
         "                                 encoder=encoder,\n",
         "                                 decoder=decoder)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "hpObfY22IddU"
       },
+      "cell_type": "markdown",
       "source": [
         "## Training\n",
         "\n",
@@ -567,14 +584,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "ddefjBMa3jF0"
+        "id": "ddefjBMa3jF0",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "EPOCHS = 10\n",
         "\n",
@@ -592,7 +607,7 @@
         "            \n",
         "            dec_hidden = enc_hidden\n",
         "            \n",
-        "            dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']] * BATCH_SIZE, 1)       \n",
+        "            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       \n",
         "            \n",
         "            # Teacher forcing - feeding the target as the next input\n",
         "            for t in range(1, targ.shape[1]):\n",
@@ -625,14 +640,16 @@
         "    print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
         "                                        total_loss / N_BATCH))\n",
         "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "mU3Ce8M6I3rz"
       },
+      "cell_type": "markdown",
       "source": [
         "## Translate\n",
         "\n",
@@ -644,14 +661,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "EbQpyYs13jF_"
+        "id": "EbQpyYs13jF_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    attention_plot = np.zeros((max_length_targ, max_length_inp))\n",
@@ -668,7 +683,7 @@
         "    enc_out, enc_hidden = encoder(inputs, hidden)\n",
         "\n",
         "    dec_hidden = enc_hidden\n",
-        "    dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']], 0)\n",
+        "    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)\n",
         "\n",
         "    for t in range(max_length_targ):\n",
         "        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n",
@@ -681,24 +696,24 @@
         "\n",
         "        result += targ_lang.idx2word[predicted_id] + ' '\n",
         "\n",
-        "        if targ_lang.idx2word[predicted_id] == '\u003cend\u003e':\n",
+        "        if targ_lang.idx2word[predicted_id] == '<end>':\n",
         "            return result, sentence, attention_plot\n",
         "        \n",
         "        # the predicted ID is fed back into the model\n",
         "        dec_input = tf.expand_dims([predicted_id], 0)\n",
         "\n",
         "    return result, sentence, attention_plot"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "s5hQWlbN3jGF"
+        "id": "s5hQWlbN3jGF",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# function for plotting the attention weights\n",
         "def plot_attention(attention, sentence, predicted_sentence):\n",
@@ -712,17 +727,17 @@
         "    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n",
         "\n",
         "    plt.show()"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "sl9zUHzg3jGI"
+        "id": "sl9zUHzg3jGI",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n",
@@ -732,91 +747,93 @@
         "    \n",
         "    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n",
         "    plot_attention(attention_plot, sentence.split(' '), result.split(' '))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "n250XbnjOaqP"
       },
+      "cell_type": "markdown",
       "source": [
         "## Restore the latest checkpoint and test"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "UJpT9D5_OgP6"
+        "id": "UJpT9D5_OgP6",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# restoring the latest checkpoint in checkpoint_dir\n",
         "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "WrAM0FDomq3E"
+        "id": "WrAM0FDomq3E",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "zSx2iM36EZQZ"
+        "id": "zSx2iM36EZQZ",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "A3LLCx3ZE0Ls"
+        "id": "A3LLCx3ZE0Ls",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "DUQVLVqUE1YW"
+        "id": "DUQVLVqUE1YW",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# wrong translation\n",
         "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "RTe5P5ioMJwN"
       },
+      "cell_type": "markdown",
       "source": [
         "## Next steps\n",
         "\n",
@@ -824,31 +841,5 @@
         "* Experiment with training on a larger dataset, or using more epochs\n"
       ]
     }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "nmt_with_attention.ipynb",
-      "private_outputs": true,
-      "provenance": [
-        {
-          "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U",
-          "timestamp": 1527858391290
-        },
-        {
-          "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv",
-          "timestamp": 1527776041613
-        }
-      ],
-      "toc_visible": true,
-      "version": "0.3.2"
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
+  ]
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index f3135a9668fc0dc7faa93a5f119b53f3efd34c6e..f2851d97223e483da11120f1fe3f0a2f641dfb81 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -27,7 +27,7 @@ py_library(
 
 cuda_py_test(
     name = "resnet50_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet50_test.py"],
     additional_deps = [
         ":resnet50",
@@ -35,17 +35,19 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "noasan",  # Fix b/118130911
         "nomsan",  # Fix b/118130911
         "notsan",  # Fix b/118130911
         "optonly",
+        "oss_serial",
     ],
 )
 
 cuda_py_test(
     name = "resnet50_graph_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet50_graph_test.py"],
     additional_deps = [
         ":resnet50",
@@ -53,10 +55,12 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "noasan",
         "nomsan",
         "notsan",
         "optonly",
+        "oss_serial",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD
index 4f0d46b1bae3760a63b2abe871034bdedf258f07..cb207b8ddf3641a68a114386f6a95a26ce2b74d6 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD
@@ -67,30 +67,36 @@ py_library(
 # Tests
 cuda_py_test(
     name = "ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["ops_test.py"],
     additional_deps = [
         ":ops",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
+    tags = [
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
     name = "blocks_test",
-    size = "large",
+    size = "medium",
     srcs = ["blocks_test.py"],
     additional_deps = [
         ":blocks",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
+        "no_oss",  # b/123045964
         "optonly",
     ],
 )
 
 cuda_py_test(
     name = "revnet_test",
-    size = "large",
+    size = "medium",
     srcs = ["revnet_test.py"],
     additional_deps = [
         ":blocks_test",
@@ -98,9 +104,11 @@ cuda_py_test(
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",  # depends on blocks_test, which is not available in pip package
         "optonly",
+        "oss_serial",
     ],
 )
 
@@ -127,6 +135,13 @@ py_binary(
     name = "main",
     srcs = ["main.py"],
     srcs_version = "PY2AND3",
+    deps = [":main_lib"],
+)
+
+py_library(
+    name = "main_lib",
+    srcs = ["main.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
         ":config",
@@ -141,7 +156,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
@@ -153,7 +168,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
@@ -165,7 +180,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet.py b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
index 1f2cb14972f0b92d29489adff8f94e790e1ec4ed..7406787ba438345dc485c50e347e40597b2037f5 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
@@ -96,6 +96,7 @@ class RevNet(tf.keras.Model):
   def call(self, inputs, training=True):
     """Forward pass."""
 
+    saved_hidden = None
     if training:
       saved_hidden = [inputs]
 
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
index d500b632ebb97fd12ded3a215b0f1a686194874f..f4dbe7ac16f734f7bee045bc71e9559b630adf81 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "rnn_colorbot",
     srcs = ["rnn_colorbot.py"],
     srcs_version = "PY2AND3",
+    deps = [":rnn_colorbot_lib"],
+)
+
+py_library(
+    name = "rnn_colorbot_lib",
+    srcs = ["rnn_colorbot.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -21,8 +28,11 @@ cuda_py_test(
     name = "rnn_colorbot_test",
     srcs = ["rnn_colorbot_test.py"],
     additional_deps = [
-        ":rnn_colorbot",
+        ":rnn_colorbot_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 74ebb1ec77131a560b1ebfd062c690920c35e261..1c718a5ce3d8e1541656d92fd5e8dad6c6683c4c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -207,7 +207,7 @@ class RNNColorbot(tf.keras.Model):
 
 def loss(labels, predictions):
   """Computes mean squared loss."""
-  return tf.reduce_mean(tf.square(predictions - labels))
+  return tf.reduce_mean(tf.squared_difference(predictions, labels))
 
 
 def test(model, eval_data):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
index 2cc2fcbfeb21ee6218d7912d9a93ea2f7b2ea226..43a6ca526d3a0aecda2c8df865a0487ac28758ab 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "rnn_ptb",
     srcs = ["rnn_ptb.py"],
     srcs_version = "PY2AND3",
+    deps = [":rnn_ptb_lib"],
+)
+
+py_library(
+    name = "rnn_ptb_lib",
+    srcs = ["rnn_ptb.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
@@ -21,18 +28,22 @@ cuda_py_test(
     name = "rnn_ptb_test",
     srcs = ["rnn_ptb_test.py"],
     additional_deps = [
-        ":rnn_ptb",
+        ":rnn_ptb_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    tags = ["no_oss"],  # b/123045964
 )
 
 cuda_py_test(
     name = "rnn_ptb_graph_test",
     srcs = ["rnn_ptb_graph_test.py"],
     additional_deps = [
-        ":rnn_ptb",
+        ":rnn_ptb_lib",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 15776c694e92825895437a4c1547699f6d9269fb..9b5a2c947b153308c83f1a922d06c034ec5f9ddf 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -128,7 +128,7 @@ class PTBModel(tf.keras.Model):
 
     self.linear = layers.Dense(
         vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
-    self._output_shape = [-1, embedding_dim]
+    self._output_shape = [-1, hidden_dim]
 
   def call(self, input_seq, training):
     """Run the forward pass of PTBModel.
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
index 5966f1d4873e8e77b3ad5914da7bfc7e69d4e341..9b0fbaa6793e28d327745767e6ccd3085211ff7d 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/BUILD
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -42,5 +42,6 @@ cuda_py_test(
         "no-internal-py3",  # flaky
         "no_cuda_on_cpu_tap",
         "no_pip",  # because spinn.py is under third_party/.
+        "oss_serial",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 566246de4957c1dc5919c10e22146706f9e50be8..c8d9266672a8b87d32338ea7c4f74fb40d41c767 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -37,7 +37,7 @@ from tensorflow.python.training.checkpointable import base as checkpointable
 _to_replace = re.compile("[^A-Za-z0-9.]")
 
 
-class Metric(checkpointable.CheckpointableBase):
+class Metric(checkpointable.Checkpointable):
   """A metric holds state for aggregating statistics over an evaluation run.
 
   Example use with eager execution:
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 31481d7685c79b76c40b1f8041441a0e71d3b00e..b82e1bb71bce9a28d7bbbf961cc6d5e25dd18acf 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -138,7 +138,7 @@ from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Vari
 from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
-from tensorflow.python.training.checkpointable.tracking import Checkpointable
+from tensorflow.python.training.checkpointable.tracking import AutoCheckpointable as Checkpointable
 from tensorflow.python.training.checkpointable.util import CheckpointableSaver
 from tensorflow.python.training.checkpointable.util import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index e344d7a23b55134612aab430b50cf065bd1095e4..cb86efb8da72f168b54f04773289a6fe421282b1 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -28,7 +28,6 @@ tf_custom_op_py_library(
         "python/ops/wals.py",
     ],
     dso = [
-        ":python/ops/_clustering_ops.so",
         ":python/ops/_factorization_ops.so",
     ],
     kernels = [
@@ -38,12 +37,12 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":factorization_ops_test_utils_py",
-        ":gen_clustering_ops",
         ":gen_factorization_ops",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
+        "//tensorflow/python:clustering_ops_gen",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:embedding_ops",
@@ -77,17 +76,6 @@ py_library(
     ],
 )
 
-# Ops
-tf_custom_op_library(
-    name = "python/ops/_clustering_ops.so",
-    srcs = [
-        "ops/clustering_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/factorization/kernels:clustering_ops",
-    ],
-)
-
 tf_custom_op_library(
     name = "python/ops/_factorization_ops.so",
     srcs = [
@@ -100,26 +88,16 @@ tf_custom_op_library(
 )
 
 tf_gen_op_libs([
-    "clustering_ops",
     "factorization_ops",
 ])
 
 cc_library(
     name = "all_ops",
     deps = [
-        ":clustering_ops_op_lib",
         ":factorization_ops_op_lib",
     ],
 )
 
-tf_gen_op_wrapper_py(
-    name = "gen_clustering_ops",
-    out = "python/ops/gen_clustering_ops.py",
-    deps = [
-        ":clustering_ops_op_lib",
-    ],
-)
-
 tf_gen_op_wrapper_py(
     name = "gen_factorization_ops",
     out = "python/ops/gen_factorization_ops.py",
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index ea8b9a17a27093cb57564861815edd6ecb18a014..23d7e088d067effa446e4bcdc9609db612066568 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -11,7 +11,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 cc_library(
     name = "all_kernels",
     deps = [
-        ":clustering_ops",
         ":masked_matmul_ops",
         ":wals_solver_ops",
         "@protobuf_archive//:protobuf_headers",
@@ -29,17 +28,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "clustering_ops",
-    srcs = ["clustering_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "masked_matmul_ops",
     srcs = ["masked_matmul_ops.cc"],
@@ -51,19 +39,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-tf_cc_test(
-    name = "clustering_ops_test",
-    srcs = ["clustering_ops_test.cc"],
-    deps = [
-        ":clustering_ops",
-        "//tensorflow/contrib/factorization:clustering_ops_op_lib",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
index a8c5d0763c28ba2b54f217405f0da65533f26b91..68078ba8bbb07b4344c19d554012d214229f9c4f 100644
--- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
@@ -19,12 +19,12 @@
 #include <numeric>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
diff --git a/tensorflow/contrib/factorization/ops/clustering_ops.cc b/tensorflow/contrib/factorization/ops/clustering_ops.cc
deleted file mode 100644
index 2686702c1d5768f661dac610c96089eb02e360d7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/factorization/ops/clustering_ops.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not
-// use this file except in compliance with the License.  You may obtain a copy
-// of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
-// License for the specific language governing permissions and limitations under
-// the License.
-// ==============================================================================
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-REGISTER_OP("KmeansPlusPlusInitialization")
-    .Input("points: float32")
-    .Input("num_to_sample: int64")
-    .Input("seed: int64")
-    .Input("num_retries_per_sample: int64")
-    .Output("samples: float32")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"(
-Selects num_to_sample rows of input using the KMeans++ criterion.
-
-Rows of points are assumed to be input points. One row is selected at random.
-Subsequent rows are sampled with probability proportional to the squared L2
-distance from the nearest row selected thus far till num_to_sample rows have
-been sampled.
-
-points: Matrix of shape (n, d). Rows are assumed to be input points.
-num_to_sample: Scalar. The number of rows to sample. This value must not be
-  larger than n.
-seed: Scalar. Seed for initializing the random number generator.
-num_retries_per_sample: Scalar. For each row that is sampled, this parameter
-  specifies the number of additional points to draw from the current
-  distribution before selecting the best. If a negative value is specified, a
-  heuristic is used to sample O(log(num_to_sample)) additional points.
-samples: Matrix of shape (num_to_sample, d). The sampled rows.
-)");
-
-REGISTER_OP("KMC2ChainInitialization")
-    .Input("distances: float32")
-    .Input("seed: int64")
-    .Output("index: int64")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"(
-Returns the index of a data point that should be added to the seed set.
-
-Entries in distances are assumed to be squared distances of candidate points to
-the already sampled centers in the seed set. The op constructs one Markov chain
-of the k-MC^2 algorithm and returns the index of one candidate point to be added
-as an additional cluster center.
-
-distances: Vector with squared distances to the closest previously sampled
-  cluster center for each candidate point.
-seed: Scalar. Seed for initializing the random number generator.
-index: Scalar with the index of the sampled point.
-)");
-
-REGISTER_OP("NearestNeighbors")
-    .Input("points: float32")
-    .Input("centers: float32")
-    .Input("k: int64")
-    .Output("nearest_center_indices: int64")
-    .Output("nearest_center_distances: float32")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"(
-Selects the k nearest centers for each point.
-
-Rows of points are assumed to be input points. Rows of centers are assumed to be
-the list of candidate centers. For each point, the k centers that have least L2
-distance to it are computed.
-
-points: Matrix of shape (n, d). Rows are assumed to be input points.
-centers: Matrix of shape (m, d). Rows are assumed to be centers.
-k: Scalar. Number of nearest centers to return for each point. If k is larger
-  than m, then only m centers are returned.
-nearest_center_indices: Matrix of shape (n, min(m, k)). Each row contains the
-  indices of the centers closest to the corresponding point, ordered by
-  increasing distance.
-nearest_center_distances: Matrix of shape (n, min(m, k)). Each row contains the
-  squared L2 distance to the corresponding center in nearest_center_indices.
-)");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 84e80791f4991ad2b67d0a00ee1e00cf0d0daadc..d48b89cbacce34781819010addbcbd0ba66f9873 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -18,28 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.factorization.python.ops import gen_clustering_ops
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.factorization.python.ops.gen_clustering_ops import *
-# pylint: enable=wildcard-import
-from tensorflow.contrib.util import loader
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_clustering_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.embedding_ops import embedding_lookup
-from tensorflow.python.platform import resource_loader
-
-_clustering_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile('_clustering_ops.so'))
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_clustering_ops import *
+# pylint: enable=wildcard-import
 
 # Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
 # which is the square root of the sum of the absolute squares of the elements
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index d365ad111760247fc18b730657390f07ba6b865e..9f0664dfe5ba7a098b6976388d1cf737dafb4842 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -314,8 +314,7 @@ class GmmAlgorithm(object):
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
         math_ops.log(self._covs + 1e-3), 1, keepdims=True)
-    diff = shard - self._means
-    x2 = math_ops.square(diff)
+    x2 = math_ops.squared_difference(shard, self._means)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
     # num_classes X num_examples
     x2_cov = math_ops.matmul(x2, cov_expanded)
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 4c1d1a29f20b5574b63cf87ecf62db95f92902cd..8fc5f1cfe7800653ef1e43c6d40d1a66e34f2106 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -6,7 +6,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "feature_column_py",
@@ -37,13 +37,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_test",
     srcs = ["python/feature_column/sequence_feature_column_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -53,17 +53,14 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["no_pip"],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_integration_test",
     srcs = ["python/feature_column/sequence_feature_column_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -73,6 +70,7 @@ py_test(
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras:layers",
     ],
+    tags = ["no_pip"],
 )
 
 py_library(
@@ -94,14 +92,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_v2_test",
     srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":sequence_feature_column",
+    additional_deps = [
         ":sequence_feature_column_v2",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -112,7 +109,23 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/feature_column:feature_column_v2_test",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+    ],
+    tags = ["no_pip"],
+)
+
+py_test(
+    name = "sequence_feature_column_v2_integration_test",
+    srcs = ["python/feature_column/sequence_feature_column_v2_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sequence_feature_column_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/keras:layers",
     ],
 )
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
index 83b93ec332044f754f9dcde8d7c5c19b26e53a4a..2f4bda194a41242167e0abfcaeac5044f6026f85 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
@@ -27,6 +27,7 @@ import collections
 
 from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column import feature_column_v2 as fc_v2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -34,107 +35,115 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import variable_scope
 
 # pylint: disable=protected-access
 
 
-def sequence_input_layer(
-    features,
-    feature_columns,
-    weight_collections=None,
-    trainable=True):
-  """"Builds input layer for sequence input.
+class SequenceFeatures(fc_v2._BaseFeaturesLayer):
+  """A layer for sequence input.
 
-  All `feature_columns` must be sequence dense columns with the same
-  `sequence_length`. The output of this method can be fed into sequence
-  networks, such as RNN.
+    All `feature_columns` must be sequence dense columns with the same
+    `sequence_length`. The output of this method can be fed into sequence
+    networks, such as RNN.
 
-  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
-  `T` is the maximum sequence length for this batch, which could differ from
-  batch to batch.
+    The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+    `T` is the maximum sequence length for this batch, which could differ from
+    batch to batch.
 
-  If multiple `feature_columns` are given with `Di` `num_elements` each, their
-  outputs are concatenated. So, the final `Tensor` has shape
-  `[batch_size, T, D0 + D1 + ... + Dn]`.
+    If multiple `feature_columns` are given with `Di` `num_elements` each, their
+    outputs are concatenated. So, the final `Tensor` has shape
+    `[batch_size, T, D0 + D1 + ... + Dn]`.
 
-  Example:
+    Example:
 
-  ```python
-  rating = sequence_numeric_column('rating')
-  watches = sequence_categorical_column_with_identity(
-      'watches', num_buckets=1000)
-  watches_embedding = embedding_column(watches, dimension=10)
-  columns = [rating, watches]
+    ```python
+    rating = sequence_numeric_column('rating')
+    watches = sequence_categorical_column_with_identity(
+        'watches', num_buckets=1000)
+    watches_embedding = embedding_column(watches, dimension=10)
+    columns = [rating, watches]
 
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+    features = tf.parse_example(..., features=make_parse_example_spec(columns))
+    sequence_input_layer = SequenceFeatures(columns)
+    sequence_input, sequence_length = sequence_input_layer(features)
+    sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
-  ```
+    rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+    rnn_layer = tf.keras.layers.RNN(rnn_cell)
+    outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
+    ```
+  """
 
-  Args:
-    features: A dict mapping keys to tensors.
-    feature_columns: An iterable of dense sequence columns. Valid columns are
-      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
-      - `sequence_numeric_column`.
-    weight_collections: A list of collection names to which the Variable will be
-      added. Note that variables will also be added to collections
-      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-    trainable: If `True` also add the variable to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES`.
+  def __init__(
+      self,
+      feature_columns,
+      trainable=True,
+      name=None,
+      **kwargs):
+    """"Constructs a SequenceFeatures layer.
 
-  Returns:
-    An `(input_layer, sequence_length)` tuple where:
-    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
-        `T` is the maximum sequence length for this batch, which could differ
-        from batch to batch. `D` is the sum of `num_elements` for all
-        `feature_columns`.
-    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
-        length for each example.
+    Args:
+      feature_columns: An iterable of dense sequence columns. Valid columns are
+        - `embedding_column` that wraps a `sequence_categorical_column_with_*`
+        - `sequence_numeric_column`.
+      trainable: Boolean, whether the layer's variables will be updated via
+        gradient descent during training.
+      name: Name to give to the SequenceFeatures.
+      **kwargs: Keyword arguments to construct a layer.
+
+    Raises:
+      ValueError: If any of the `feature_columns` is not a
+        `SequenceDenseColumn`.
+    """
+    super(SequenceFeatures, self).__init__(
+        feature_columns=feature_columns,
+        trainable=trainable,
+        name=name,
+        expected_column_type=fc_v2.SequenceDenseColumn,
+        **kwargs)
 
-  Raises:
-    ValueError: If any of the `feature_columns` is the wrong type.
-  """
-  feature_columns = fc_old._normalize_feature_columns(feature_columns)
-  for c in feature_columns:
-    if not isinstance(c, fc_old._SequenceDenseColumn):
-      raise ValueError(
-          'All feature_columns must be of type _SequenceDenseColumn. '
-          'You can wrap a sequence_categorical_column with an embedding_column '
-          'or indicator_column. '
-          'Given (type {}): {}'.format(type(c), c))
-
-  with variable_scope.variable_scope(
-      None, default_name='sequence_input_layer', values=features.values()):
-    builder = fc_old._LazyBuilder(features)
+  def _target_shape(self, input_shape, total_elements):
+    return (input_shape[0], input_shape[1], total_elements)
+
+  def call(self, features):
+    """Returns sequence input corresponding to the `feature_columns`.
+
+    Args:
+      features: A dict mapping keys to tensors.
+
+    Returns:
+      An `(input_layer, sequence_length)` tuple where:
+      - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+          `T` is the maximum sequence length for this batch, which could differ
+          from batch to batch. `D` is the sum of `num_elements` for all
+          `feature_columns`.
+      - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+          length for each example.
+
+    Raises:
+      ValueError: If features are not a dictionary.
+    """
+    if not isinstance(features, dict):
+      raise ValueError('We expected a dictionary here. Instead we got: ',
+                       features)
+    transformation_cache = fc.FeatureTransformationCache(features)
     output_tensors = []
     sequence_lengths = []
-    ordered_columns = []
-
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):
-        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
+
+    for column in self._feature_columns:
+      with ops.name_scope(column.name):
+        dense_tensor, sequence_length = column.get_sequence_dense_tensor(
+            transformation_cache, self._state_manager)
         # Flattens the final dimension to produce a 3D Tensor.
-        num_elements = column._variable_shape.num_elements()
-        shape = array_ops.shape(dense_tensor)
-        target_shape = [shape[0], shape[1], num_elements]
-        output_tensors.append(
-            array_ops.reshape(dense_tensor, shape=target_shape))
+        output_tensors.append(self._process_dense_tensor(column, dense_tensor))
         sequence_lengths.append(sequence_length)
 
-    fc_old._verify_static_batch_size_equality(output_tensors, ordered_columns)
-    fc_old._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
+    # Check and process sequence lengths.
+    fc_v2._verify_static_batch_size_equality(sequence_lengths,
+                                             self._feature_columns)
     sequence_length = _assert_all_equal_and_return(sequence_lengths)
 
-    return array_ops.concat(output_tensors, -1), sequence_length
+    return self._verify_and_concat_tensors(output_tensors), sequence_length
 
 
 def concatenate_context_input(context_input, sequence_input):
@@ -203,12 +212,13 @@ def sequence_categorical_column_with_identity(
   columns = [watches_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -250,12 +260,13 @@ def sequence_categorical_column_with_hash_bucket(
   columns = [tokens_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -296,12 +307,13 @@ def sequence_categorical_column_with_vocabulary_file(
   columns = [states_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -358,12 +370,13 @@ def sequence_categorical_column_with_vocabulary_list(
   columns = [colors_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -415,12 +428,13 @@ def sequence_numeric_column(
   columns = [temperature]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -445,7 +459,7 @@ def sequence_numeric_column(
     ValueError: if any dimension in shape is not a positive integer.
     ValueError: if `dtype` is not convertible to `tf.float32`.
   """
-  shape = fc_old._check_shape(shape=shape, key=key)
+  shape = fc_v2._check_shape(shape=shape, key=key)
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
@@ -540,8 +554,10 @@ class SequenceNumericColumn(
     # For the 2D case, the raw values are grouped according to num_elements;
     # for the 3D case, the grouping happens in the third dimension, and
     # sequence length is not affected.
-    num_elements = (self.variable_shape.num_elements()
-                    if sp_tensor.shape.ndims == 2 else 1)
+    if sp_tensor.shape.ndims == 2:
+      num_elements = self.variable_shape.num_elements()
+    else:
+      num_elements = 1
     seq_length = fc_old._sequence_length_from_sparse_tensor(
         sp_tensor, num_elements=num_elements)
 
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b165a620ae67e855400eb297ec17db80eac7937
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_integration_test.py
@@ -0,0 +1,283 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for sequence feature columns with SequenceExamples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import tempfile
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class SequenceFeatureColumnIntegrationTest(test.TestCase):
+
+  def _make_sequence_example(self):
+    example = example_pb2.SequenceExample()
+    example.context.feature['int_ctx'].int64_list.value.extend([5])
+    example.context.feature['float_ctx'].float_list.value.extend([123.6])
+    for val in range(0, 10, 2):
+      feat = feature_pb2.Feature()
+      feat.int64_list.value.extend([val] * val)
+      example.feature_lists.feature_list['int_list'].feature.extend([feat])
+    for val in range(1, 11, 2):
+      feat = feature_pb2.Feature()
+      feat.bytes_list.value.extend([compat.as_bytes(str(val))] * val)
+      example.feature_lists.feature_list['str_list'].feature.extend([feat])
+
+    return example
+
+  def _build_feature_columns(self):
+    col = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
+    ctx_cols = [
+        fc.embedding_column(col, dimension=10),
+        fc.numeric_column('float_ctx')
+    ]
+
+    identity_col = sfc.sequence_categorical_column_with_identity(
+        'int_list', num_buckets=10)
+    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
+        'bytes_list', hash_bucket_size=100)
+    seq_cols = [
+        fc.embedding_column(identity_col, dimension=10),
+        fc.embedding_column(bucket_col, dimension=20)
+    ]
+
+    return ctx_cols, seq_cols
+
+  def test_sequence_example_into_input_layer(self):
+    examples = [_make_sequence_example().SerializeToString()] * 100
+    ctx_cols, seq_cols = self._build_feature_columns()
+
+    def _parse_example(example):
+      ctx, seq = parsing_ops.parse_single_sequence_example(
+          example,
+          context_features=fc.make_parse_example_spec_v2(ctx_cols),
+          sequence_features=fc.make_parse_example_spec_v2(seq_cols))
+      ctx.update(seq)
+      return ctx
+
+    ds = dataset_ops.Dataset.from_tensor_slices(examples)
+    ds = ds.map(_parse_example)
+    ds = ds.batch(20)
+
+    # Test on a single batch
+    features = ds.make_one_shot_iterator().get_next()
+
+    # Tile the context features across the sequence features
+    sequence_input_layer = sfc.SequenceFeatures(seq_cols)
+    seq_layer, _ = sequence_input_layer(features)
+    input_layer = fc.DenseFeatures(ctx_cols)
+    ctx_layer = input_layer(features)
+    input_layer = sfc.concatenate_context_input(ctx_layer, seq_layer)
+
+    rnn_layer = recurrent.RNN(recurrent.SimpleRNNCell(10))
+    output = rnn_layer(input_layer)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      features_r = sess.run(features)
+      self.assertAllEqual(features_r['int_list'].dense_shape, [20, 3, 6])
+
+      output_r = sess.run(output)
+      self.assertAllEqual(output_r.shape, [20, 10])
+
+
+class SequenceExampleParsingTest(test.TestCase):
+
+  def test_seq_ex_in_sequence_categorical_column_with_identity(self):
+    self._test_parsed_sequence_example(
+        'int_list', sfc.sequence_categorical_column_with_identity,
+        10, [3, 6], [2, 4, 6])
+
+  def test_seq_ex_in_sequence_categorical_column_with_hash_bucket(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_hash_bucket,
+        10, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_list(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_list,
+        list(string.ascii_lowercase), [3, 4],
+        [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_file(self):
+    _, fname = tempfile.mkstemp()
+    with open(fname, 'w') as f:
+      f.write(string.ascii_lowercase)
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_file,
+        fname, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def _test_parsed_sequence_example(
+      self, col_name, col_fn, col_arg, shape, values):
+    """Helper function to check that each FeatureColumn parses correctly.
+
+    Args:
+      col_name: string, name to give to the feature column. Should match
+        the name that the column will parse out of the features dict.
+      col_fn: function used to create the feature column. For example,
+        sequence_numeric_column.
+      col_arg: second arg that the target feature column is expecting.
+      shape: the expected dense_shape of the feature after parsing into
+        a SparseTensor.
+      values: the expected values at index [0, 2, 6] of the feature
+        after parsing into a SparseTensor.
+    """
+    example = _make_sequence_example()
+    columns = [
+        fc.categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc.numeric_column('float_ctx'),
+        col_fn(col_name, col_arg)
+    ]
+    context, seq_features = parsing_ops.parse_single_sequence_example(
+        example.SerializeToString(),
+        context_features=fc.make_parse_example_spec_v2(columns[:2]),
+        sequence_features=fc.make_parse_example_spec_v2(columns[2:]))
+
+    with self.cached_session() as sess:
+      ctx_result, seq_result = sess.run([context, seq_features])
+      self.assertEqual(list(seq_result[col_name].dense_shape), shape)
+      self.assertEqual(
+          list(seq_result[col_name].values[[0, 2, 6]]), values)
+      self.assertEqual(list(ctx_result['int_ctx'].dense_shape), [1])
+      self.assertEqual(ctx_result['int_ctx'].values[0], 5)
+      self.assertEqual(list(ctx_result['float_ctx'].shape), [1])
+      self.assertAlmostEqual(ctx_result['float_ctx'][0], 123.6, places=1)
+
+
+_SEQ_EX_PROTO = """
+context {
+  feature {
+    key: "float_ctx"
+    value {
+      float_list {
+        value: 123.6
+      }
+    }
+  }
+  feature {
+    key: "int_ctx"
+    value {
+      int64_list {
+        value: 5
+      }
+    }
+  }
+}
+feature_lists {
+  feature_list {
+    key: "bytes_list"
+    value {
+      feature {
+        bytes_list {
+          value: "a"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "b"
+          value: "c"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "d"
+          value: "e"
+          value: "f"
+          value: "g"
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "float_list"
+    value {
+      feature {
+        float_list {
+          value: 1.0
+        }
+      }
+      feature {
+        float_list {
+          value: 3.0
+          value: 3.0
+          value: 3.0
+        }
+      }
+      feature {
+        float_list {
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "int_list"
+    value {
+      feature {
+        int64_list {
+          value: 2
+          value: 2
+        }
+      }
+      feature {
+        int64_list {
+          value: 4
+          value: 4
+          value: 4
+          value: 4
+        }
+      }
+      feature {
+        int64_list {
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def _make_sequence_example():
+  example = example_pb2.SequenceExample()
+  return text_format.Parse(_SEQ_EX_PROTO, example)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
index be012a87690c24c6d9b7808790393e1aa6d01211..a1feaddcc00d5fac86dca3138dfa1c6314bb6a8b 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
@@ -22,9 +22,7 @@ import os
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
-from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.feature_column.feature_column_v2_test import _TestStateManager
 from tensorflow.python.framework import dtypes
@@ -32,13 +30,15 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 
 
-class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -111,29 +111,27 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a,
         dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc_old._embedding_column(
+    embedding_column_b = fc.embedding_column(
         categorical_column_b,
         dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[embedding_column_b, embedding_column_a])
+    # Test that columns are reordered alphabetically.
+    sequence_input_layer = sfc.SequenceFeatures(
+        [embedding_column_b, embedding_column_a])
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b,})
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertCountEqual(
-        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
-         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
+        ('sequence_features/aaa_embedding/embedding_weights:0',
+         'sequence_features/bbb_embedding/embedding_weights:0'),
         tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
@@ -152,18 +150,17 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_embedding\. categorical_column must be of '
-        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[embedding_column_a])
+        r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures([embedding_column_a])
+      _, _ = sequence_input_layer({'aaa': sparse_input})
 
   def test_shared_embedding_column(self):
     vocabulary_size = 3
@@ -210,21 +207,18 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
     # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension,
         initializer=_get_initializer(embedding_dimension, embedding_values))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        feature_columns=shared_embedding_columns)
+    sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertCountEqual(
-        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
+        ('aaa_bbb_shared_embedding:0',),
         tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
@@ -248,23 +242,20 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old._categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_shared_embedding\. categorical_column must '
-        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={
-              'aaa': sparse_input_a,
-              'bbb': sparse_input_b
-          },
-          feature_columns=shared_embedding_columns)
+        r'be of type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+      _, _ = sequence_input_layer({'aaa': sparse_input_a,
+                                   'bbb': sparse_input_b})
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -319,17 +310,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc_old._indicator_column(categorical_column_b)
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[indicator_column_b, indicator_column_a])
+    indicator_column_b = fc.indicator_column(categorical_column_b)
+    # Test that columns are reordered alphabetically.
+    sequence_input_layer = sfc.SequenceFeatures(
+        [indicator_column_b, indicator_column_a])
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
@@ -346,17 +335,16 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In indicator_column: aaa_indicator\. categorical_column must be of '
-        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[indicator_column_a])
+        r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures([indicator_column_a])
+      _, _ = sequence_input_layer({'aaa': sparse_input})
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -375,7 +363,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            # feature 0, ids [[20, 3], [5]]
            # feature 1, ids [[3], [8]]
            'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
-           'values': (20, 3, 5., 3., 8.),
+           'values': (20., 3., 5., 3., 8.),
            'dense_shape': (2, 2, 2)},
        'expected_input_layer': [
            [[20.], [3.], [5.], [0.]],
@@ -386,11 +374,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
 
-    numeric_column = sfc_old.sequence_numeric_column('aaa')
+    numeric_column = sfc.sequence_numeric_column('aaa')
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
@@ -428,14 +415,13 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
       )
   def test_numeric_column_multi_dim(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
-    """Tests sequence_input_layer for multi-dimensional numeric_column."""
+    """Tests SequenceFeatures for multi-dimensional numeric_column."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
 
-    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
@@ -454,22 +440,20 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         indices=((0, 0), (1, 0)),
         values=(1., 10.),
         dense_shape=(2, 2))
-    numeric_column_a = sfc_old.sequence_numeric_column('aaa')
-    numeric_column_b = sfc_old.sequence_numeric_column('bbb')
+    numeric_column_a = sfc.sequence_numeric_column('aaa')
+    numeric_column_b = sfc.sequence_numeric_column('bbb')
 
-    _, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        feature_columns=[numeric_column_a, numeric_column_b])
+    sequence_input_layer = sfc.SequenceFeatures(
+        [numeric_column_a, numeric_column_b])
+    _, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
     with monitored_session.MonitoredSession() as sess:
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[Condition x == y did not hold element-wise:\] '
-          r'\[x \(sequence_input_layer/aaa/sequence_length:0\) = \] \[2 1\] '
-          r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
+          r'\[x \(sequence_features/aaa/sequence_length:0\) = \] \[2 1\] '
+          r'\[y \(sequence_features/bbb/sequence_length:0\) = \] \[1 1\]'):
         sess.run(sequence_length)
 
   @parameterized.named_parameters(
@@ -497,11 +481,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
-    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
-    input_layer, _ = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
@@ -534,13 +517,49 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    input_layer, _ = sfc.sequence_input_layer(
-        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    sequence_input_layer = sfc.SequenceFeatures([indicator_column])
+    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
+  def test_compute_output_shape(self):
+    price1 = sfc.sequence_numeric_column('price1', shape=2)
+    price2 = sfc.sequence_numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': sparse_tensor.SparseTensor(
+              indices=[[0, 0, 0], [0, 0, 1],
+                       [0, 1, 0], [0, 1, 1],
+                       [1, 0, 0], [1, 0, 1],
+                       [2, 0, 0], [2, 0, 1],
+                       [3, 0, 0], [3, 0, 1]],
+              values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
+              dense_shape=(4, 3, 2)),
+          'price2': sparse_tensor.SparseTensor(
+              indices=[[0, 0],
+                       [0, 1],
+                       [1, 0],
+                       [2, 0],
+                       [3, 0]],
+              values=[10., 11., 20., 30., 40.],
+              dense_shape=(4, 3))}
+      sequence_features = sfc.SequenceFeatures([price1, price2])
+      seq_input, seq_len = sequence_features(features)
+      self.assertEqual(
+          sequence_features.compute_output_shape((None, None)),
+          (None, None, 3))
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
+                           [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
+                           [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
+                           [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
+                          self.evaluate(seq_input))
+      self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
+
 
 class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
   """Tests the utility fn concatenate_context_input."""
@@ -605,8 +624,8 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
       sfc.concatenate_context_input(context_input, seq_input)
 
 
-class InputLayerTest(test.TestCase):
-  """Tests input_layer with sequence feature columns."""
+class DenseFeaturesTest(test.TestCase):
+  """Tests DenseFeatures with sequence feature columns."""
 
   def test_embedding_column(self):
     """Tests that error is raised for sequence embedding column."""
@@ -620,16 +639,15 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_embedding\. categorical_column must not be '
-        r'of type _SequenceCategoricalColumn\.'):
-      _ = fc_old.input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[embedding_column_a])
+        r'of type SequenceCategoricalColumn\.'):
+      input_layer = fc.DenseFeatures([embedding_column_a])
+      _ = input_layer({'aaa': sparse_input})
 
   def test_indicator_column(self):
     """Tests that error is raised for sequence indicator column."""
@@ -643,15 +661,14 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In indicator_column: aaa_indicator\. categorical_column must not be '
-        r'of type _SequenceCategoricalColumn\.'):
-      _ = fc_old.input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[indicator_column_a])
+        r'of type SequenceCategoricalColumn\.'):
+      input_layer = fc.DenseFeatures([indicator_column_a])
+      _ = input_layer({'aaa': sparse_input})
 
 
 def _assert_sparse_tensor_value(test_case, expected, actual):
@@ -946,7 +963,7 @@ class SequenceEmbeddingColumnTest(
         embedding_column, {'aaa': inputs})
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
+    self.assertCountEqual(
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index dad50a3a73085526f65bd87c3d8549ceb75b3af4..3f6dbe0cbdeeae5e2107755f80bcfe5f7fc310e4 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -50,6 +50,8 @@ tf_custom_op_py_library(
     visibility = [
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//tensorflow_estimator:__subpackages__",
+        "//tensorflow_model_optimization:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index e72e50585a3861d4527b66f89e1659d76c85960a..3784631dcbfbeb215b6c695e4b6f1bbd02fa708c 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -130,17 +130,21 @@ _allowed_symbols = ['nest']
 _nest_allowed_symbols = [
     'assert_same_structure',
     'is_sequence',
+    'is_sequence_or_composite',
     'flatten',
     'flatten_dict_items',
     'pack_sequence_as',
     'map_structure',
     'map_structure_with_paths',
+    'map_structure_with_tuple_paths',
     'assert_shallow_structure',
     'flatten_up_to',
     'map_structure_up_to',
+    'map_structure_with_tuple_paths_up_to',
     'get_traverse_shallow_structure',
     'yield_flat_paths',
     'flatten_with_joined_string_paths',
+    'flatten_with_tuple_paths',
 ]
 
 remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols)
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 57a5bfbf43c915775c6b0ef05baac19581213a09..f65f450eba49163c319af54ec2bd7f6b61e34c1e 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -171,6 +171,7 @@ cuda_py_test(
     main = "python/ops/fused_conv2d_bias_activation_benchmark.py",
     tags = [
         "manual",  # TODO(b/117128481): re-enable after fixing OSS build
+        "nogpu",
         "requires-gpu-sm70",
     ],
 )
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index c541c71f996c7a1b36cf28ae9a1783f8dca0a72c..b6b75ffa248d66cc4cb49339f193d486f05a6a4a 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include "tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index f89d7ed0f45f919b17398de5d9449d12c08dd2f2..db0868fb2c43464a811b3d6dfcd96480ba2463ee 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -1,12 +1,14 @@
-# Files for using TFGAN framework.
-package(default_visibility = ["//tensorflow:__subpackages__"])
+# Files for using TF-GAN framework.
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(default_visibility = [
+    "//tensorflow:__subpackages__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 py_library(
     name = "gan",
     srcs = [
@@ -104,7 +106,9 @@ py_library(
     deps = [
         ":gan_estimator",
         ":head",
+        ":latent_gan_estimator",
         ":stargan_estimator",
+        ":tpu_gan_estimator",
         "//tensorflow/python:util",
     ],
 )
@@ -128,6 +132,7 @@ py_library(
         ":clip_weights",
         ":conditioning_utils",
         ":random_tensor_pool",
+        ":spectral_normalization",
         ":virtual_batchnorm",
         "//tensorflow/python:util",
     ],
@@ -141,16 +146,15 @@ py_library(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -518,15 +522,19 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
@@ -562,28 +570,114 @@ py_test(
     deps = [
         ":namedtuples",
         ":stargan_estimator",
-        ":tuple_losses",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "tpu_gan_estimator",
+    srcs = [
+        "python/estimator/python/tpu_gan_estimator.py",
+        "python/estimator/python/tpu_gan_estimator_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gan_estimator",
+        ":namedtuples",
+        ":train",
+        "//tensorflow/contrib/tpu:tpu_estimator",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/contrib/training:training_py",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:util",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "tpu_gan_estimator_test",
+    srcs = ["python/estimator/python/tpu_gan_estimator_test.py"],
+    shard_count = 11,
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":namedtuples",
+        ":tpu_gan_estimator",
+        ":tuple_losses",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/tpu:tpu_estimator",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
     ],
 )
 
+py_library(
+    name = "latent_gan_estimator",
+    srcs = [
+        "python/estimator/python/latent_gan_estimator.py",
+        "python/estimator/python/latent_gan_estimator_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":train",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "latent_gan_estimator_test",
+    srcs = [
+        "python/estimator/python/latent_gan_estimator_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":latent_gan_estimator",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
 py_library(
     name = "sliced_wasserstein",
     srcs = [
@@ -618,3 +712,45 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
+
+py_library(
+    name = "spectral_normalization",
+    srcs = [
+        "python/features/python/spectral_normalization.py",
+        "python/features/python/spectral_normalization_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/keras:engine",
+    ],
+)
+
+py_test(
+    name = "spectral_normalization_test",
+    srcs = ["python/features/python/spectral_normalization_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":spectral_normalization",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/slim",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/keras:layers",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 9ab86329eaf0e6fd426aef1f552f4e27c2ad65de..4eac4e80cdacd779fdbedef19e4a654196f0caf1 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -1,14 +1,15 @@
 <!-- TODO(joelshor): Add images to the examples. -->
-# TensorFlow-GAN (TFGAN)
+<!-- TODO(joelshor): Add link to new location when b/122114187 is done. -->
+# TensorFlow-GAN (TF-GAN)
 
-TFGAN is a lightweight library for training and evaluating Generative
+TF-GAN is a lightweight library for training and evaluating Generative
 Adversarial Networks (GANs). This technique allows you to train a network
 (called the 'generator') to sample from a distribution, without having to
 explicitly model the distribution and without writing an explicit loss. For
 example, the generator could learn to draw samples from the distribution of
 natural images. For more details on this technique, see
 ['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by
-Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
+Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](http://https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
 introduction.
 
 #### Usage
@@ -17,27 +18,27 @@ import tensorflow as tf
 tfgan = tf.contrib.gan
 ```
 
-## Why TFGAN?
+## Why TF-GAN?
 
 * Easily train generator and discriminator networks with well-tested, flexible [library calls](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py). You can
-mix TFGAN, native TF, and other custom frameworks
+mix TF-GAN, native TF, and other custom frameworks
 * Use already implemented [GAN losses and penalties](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/losses_impl.py) (ex Wasserstein loss, gradient penalty, mutual information penalty, etc)
 * [Monitor and visualize](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/summaries_impl.py) GAN progress during training, and [evaluate](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py) them
 * Use already-implemented [tricks](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/) to stabilize and improve training
 * Develop based on examples of [common GAN setups](https://github.com/tensorflow/models/tree/master/research/gan/)
-* Use the TFGAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
-* Improvements in TFGAN infrastructure will automatically benefit your TFGAN project
+* Use the TF-GAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
+* Improvements in TF-GAN infrastructure will automatically benefit your TF-GAN project
 * Stay up-to-date with research as we add more algorithms
 
-## What are the TFGAN components?
+## What are the TF-GAN components?
 
-TFGAN is composed of several parts which were design to exist independently.
+TF-GAN is composed of several parts which were design to exist independently.
 These include the following main pieces (explained in detail below).
 
 *   [core](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py):
     provides the main infrastructure needed to train a GAN. Training occurs in
     four phases, and each phase can be completed by custom-code or by using a
-    TFGAN library call.
+    TF-GAN library call.
 
 *   [features](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/):
     Many common GAN operations and normalization techniques are implemented for
@@ -56,14 +57,14 @@ These include the following main pieces (explained in detail below).
     generative models.
 
 *   [examples](https://github.com/tensorflow/models/tree/master/research/gan/)
-    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TFGAN to make
-    GAN training easier, or use the more complicated examples to jumpstart your
+    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TF-GAN to make
+    GAN training easier, or use the more complicated examples to jump-start your
     own project. These include unconditional and conditional GANs, InfoGANs,
     adversarial losses on existing networks, and image-to-image translation.
 
 ## Training a GAN model
 
-Training in TFGAN typically consists of the following steps:
+Training in TF-GAN typically consists of the following steps:
 
 1. Specify the input to your networks.
 1. Set up your generator and discriminator using a `GANModel`.
@@ -71,12 +72,12 @@ Training in TFGAN typically consists of the following steps:
 1. Create your train ops using a `GANTrainOps`.
 1. Run your train ops.
 
-At each stage, you can either use TFGAN's convenience functions, or you can
+At each stage, you can either use TF-GAN's convenience functions, or you can
 perform the step manually for fine-grained control. We provide examples below.
 
 There are various types of GAN setups. For instance, you can train a generator
 to sample unconditionally from a learned distribution, or you can condition on
-extra information such as a class label. TFGAN is compatible with many setups,
+extra information such as a class label. TF-GAN is compatible with many setups,
 and we demonstrate a few below:
 
 ### Examples
@@ -254,9 +255,9 @@ with variable_scope.variable_scope(dis_scope, reuse=True):
   discriminator_real_outputs = discriminator_fn(images)
 generator_variables = variables_lib.get_trainable_variables(gen_scope)
 discriminator_variables = variables_lib.get_trainable_variables(dis_scope)
-# Depending on what TFGAN features you use, you don't always need to supply
+# Depending on what TF-GAN features you use, you don't always need to supply
 # every `GANModel` field. At a minimum, you need to include the discriminator
-# outputs and variables if you want to use TFGAN to construct losses.
+# outputs and variables if you want to use TF-GAN to construct losses.
 gan_model = tfgan.GANModel(
     generator_inputs,
     generated_data,
diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py
index f1946c7f925660eae3aaa650c437e03da1f33d6c..1e6000898f7b8a53ad3f6fa12deebd54bf3a57ff 100644
--- a/tensorflow/contrib/gan/__init__.py
+++ b/tensorflow/contrib/gan/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN is a lightweight library for training and evaluating GANs.
+"""TF-GAN is a lightweight library for training and evaluating GANs.
 
 In addition to providing the infrastructure for easily training and evaluating
 GANS, this library contains modules for a TFGAN-backed Estimator,
@@ -24,7 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# Collapse TFGAN into a tiered namespace.
+# Collapse TF-GAN into a tiered namespace.
 from tensorflow.contrib.gan.python import estimator
 from tensorflow.contrib.gan.python import eval  # pylint:disable=redefined-builtin
 from tensorflow.contrib.gan.python import features
diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py
index 99d38011ba677f03e198a431634fbb2ce349f912..430266555b723e6ca39dccffc1442dbef5d4a385 100644
--- a/tensorflow/contrib/gan/python/estimator/__init__.py
+++ b/tensorflow/contrib/gan/python/estimator/__init__.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN estimator module.
+"""TF-GAN estimator module.
 
 GANEstimator provides all the infrastructure support of a TensorFlow Estimator
-with the feature support of TFGAN.
+with the feature support of TF-GAN.
 """
 
 from __future__ import absolute_import
@@ -26,18 +26,25 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.gan.python.estimator.python import gan_estimator
 from tensorflow.contrib.gan.python.estimator.python import head
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator
 from tensorflow.contrib.gan.python.estimator.python import stargan_estimator
+from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator
 
 from tensorflow.contrib.gan.python.estimator.python.gan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.head import *
+from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.stargan_estimator import *
+from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator import *
 # pylint: enable=unused-import,wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
+_allowed_symbols = ([
     'gan_estimator',
     'stargan_estimator',
+    'tpu_gan_estimator',
+    'latent_gan_estimator',
     'head',
-] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__
+] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__ +
+                    tpu_gan_estimator.__all__ + latent_gan_estimator.__all__)
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index adb72228217892fffc10b0e2630edcd9d3e38a02..dd904611d1a3bb78de8316d5ed29ab0f800f29a9 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator."""
+"""A TF-GAN-backed GAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -56,10 +56,10 @@ _summary_type_map = {
 class GANEstimator(estimator.Estimator):
   """An estimator for Generative Adversarial Networks (GANs).
 
-  This Estimator is backed by TFGAN. The network functions follow the TFGAN API
-  except for one exception: if either `generator_fn` or `discriminator_fn` have
-  an argument called `mode`, then the tf.Estimator mode is passed in for that
-  argument. This helps with operations like batch normalization, which have
+  This Estimator is backed by TF-GAN. The network functions follow the TF-GAN
+  API except for one exception: if either `generator_fn` or `discriminator_fn`
+  have an argument called `mode`, then the tf.Estimator mode is passed in for
+  that argument. This helps with operations like batch normalization, which have
   different train and evaluation behavior.
 
   Example:
@@ -68,7 +68,7 @@ class GANEstimator(estimator.Estimator):
       import tensorflow as tf
       tfgan = tf.contrib.gan
 
-      # See TFGAN's `train.py` for a description of the generator and
+      # See TF-GAN's `train.py` for a description of the generator and
       # discriminator API.
       def generator_fn(generator_inputs):
         ...
@@ -123,13 +123,13 @@ class GANEstimator(estimator.Estimator):
         to continue training a previously saved model.
       generator_fn: A python function that takes a Tensor, Tensor list, or
         Tensor dictionary as inputs and returns the outputs of the GAN
-        generator. See `TFGAN` for more details and examples. Additionally, if
+        generator. See `TF-GAN` for more details and examples. Additionally, if
         it has an argument called `mode`, the Estimator's `mode` will be passed
         in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
         normalization.
       discriminator_fn: A python function that takes the output of
         `generator_fn` or real data in the GAN setup, and `generator_inputs`.
-        Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
+        Outputs a Tensor in the range [-inf, inf]. See `TF-GAN` for more details
         and examples.
       generator_loss_fn: The loss function on the generator. Takes a `GANModel`
         tuple.
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 5a3d29cf0b3cb1bbe03cb5ba4f327caf46432b76..5b9c54e43a16adf457d5ed0e7e73dcd168ab0d67 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's estimator.py."""
+"""Tests for TF-GAN's estimator.py."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 1a0ee6dfc498eb6dc8c97411589d9e35bc352062..cbe990b476c3b17ce61e0826b17d10976fea43c7 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator."""
+"""A TF-GAN-backed GAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py
index 8205bc889dc01c8680e2139393d65723280cfbd0..5b50234a0e33cd297b176f142b358338966b6758 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's head.py."""
+"""Tests for TF-GAN's head.py."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e164e24168bb0cc5e9a7cc772081781ea088bb1
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`tf.Learn` components for `Train Input Estimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = latent_gan_estimator_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5afc7731937ed1a82c8ebb5969b2687ffdd583b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py
@@ -0,0 +1,205 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements an estimator wrapper that allows training the input latent space.
+
+This file implements a latent gan estimator that wraps around a previously
+trained GAN. The latent gan estimator trains a single variable z, representing
+the hidden latent distribution that is the 'noise' input to the GAN. By training
+z, the inpainting estimator can move around the latent z space towards
+minimizing a specific loss function.
+
+The latent gan estimator has a few key differences from a normal estimator.
+
+First: the variables in the estimator should not be saved, as we are not
+updating the original GAN and are only adding a new z variable that is meant
+to be different for each run. In order to do distributed training using
+train_and_evaluate, the Tensorflow RunConfig is expected to save checkpoints
+by having either save_checkpoints_steps or save_checkpoints_secs saved.
+To avoid this conflict, we purposely set the save_checkpoints_steps value in
+the RunConfig to be one step more than the total number of steps that the
+inpainter estimator will run.
+
+Second: we need to specify warm start settings, as we are reloading the
+GAN model into a different graph (specifically, one with a new z variable).
+The warm start settings defined below reload all GAN variables and ignore the
+new z variable (and the optimizer).
+
+Usage:
+
+  def _generator(net, mode):
+    ...
+
+  def _discriminator(net, condition, mode):
+    ...
+
+  def _loss(gan_model, features, labels, add_summaries):
+    ...
+
+  def optimizer():
+    ...
+
+  params = {<required params>}
+  config = tf.estimator.RunConfig()
+  tmp_dir = path/to/output/storage
+
+  estimator = latent_gan_estimator.get_latent_gan_estimator(
+      _generator, _discriminator, _loss, optimizer, params, config, tmp_dir)
+
+  def input_fn():
+    ...
+
+  estimator.train(input_fn=input_fn)
+
+See latent_gan_estimator_test.py or tensorflow_models/gan/face_inpainting for
+further examples.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+from tensorflow.contrib.gan.python import train as tfgan_train
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+
+INPUT_NAME = 'new_var_z_input'  # The name for the new z space input variable.
+OPTIMIZER_NAME = 'latent_gan_optimizer'  # The name for the new optimizer vars.
+
+__all__ = [
+    'get_latent_gan_estimator',
+]
+
+
+def _get_latent_gan_model_fn(generator_fn, discriminator_fn, loss_fn,
+                             optimizer):
+  """Sets up a model function that wraps around a given GAN."""
+  def model_fn(features, labels, mode, params):
+    """Model function defining an inpainting estimator."""
+    batch_size = params['batch_size']
+    z_shape = [batch_size] + params['z_shape']
+    add_summaries = params['add_summaries']
+    input_clip = params['input_clip']
+
+    z = variable_scope.get_variable(
+        name=INPUT_NAME, initializer=random_ops.truncated_normal(z_shape),
+        constraint=lambda x: clip_ops.clip_by_value(x, -input_clip, input_clip))
+
+    generator = functools.partial(generator_fn, mode=mode)
+    discriminator = functools.partial(discriminator_fn, mode=mode)
+    gan_model = tfgan_train.gan_model(generator_fn=generator,
+                                      discriminator_fn=discriminator,
+                                      real_data=labels,
+                                      generator_inputs=z,
+                                      check_shapes=False)
+
+    loss = loss_fn(gan_model, features, labels, add_summaries)
+
+    # Use a variable scope to make sure that estimator variables dont cause
+    # save/load problems when restoring from ckpts.
+    with variable_scope.variable_scope(OPTIMIZER_NAME):
+      opt = optimizer(learning_rate=params['learning_rate'],
+                      **params['opt_kwargs'])
+      train_op = opt.minimize(
+          loss=loss, global_step=training_util.get_or_create_global_step(),
+          var_list=[z])
+
+    if add_summaries:
+      z_grads = gradients_impl.gradients(loss, z)
+      summary.scalar('z_loss/z_grads', clip_ops.global_norm(z_grads))
+      summary.scalar('z_loss/loss', loss)
+
+    return model_fn_lib.EstimatorSpec(mode=mode,
+                                      predictions=gan_model.generated_data,
+                                      loss=loss,
+                                      train_op=train_op)
+  return model_fn
+
+
+def get_latent_gan_estimator(generator_fn, discriminator_fn, loss_fn,
+                             optimizer, params, config, ckpt_dir,
+                             warmstart_options=True):
+  """Gets an estimator that passes gradients to the input.
+
+  This function takes in a generator and adds a trainable z variable that is
+  used as input to this generator_fn. The generator itself is treated as a black
+  box through which gradients can pass through without updating any weights. The
+  result is a trainable way to traverse the GAN latent space. The loss_fn is
+  used to actually train the z variable. The generator_fn and discriminator_fn
+  should be previously trained by the tfgan library (on reload, the variables
+  are expected to follow the tfgan format. It may be possible to use the
+  latent gan estimator with entirely custom GANs that do not use the tfgan
+  library as long as the appropriate variables are wired properly).
+
+  Args:
+    generator_fn: a function defining a Tensorflow graph for a GAN generator.
+      The weights defined in this graph should already be defined in the given
+      checkpoint location. Should have 'mode' as an argument.
+    discriminator_fn: a function defining a Tensorflow graph for a GAN
+      discriminator. Should have 'mode' as an argument.
+    loss_fn: a function defining a Tensorflow graph for a GAN loss. Takes in a
+      GANModel tuple, features, labels, and add_summaries as inputs.
+    optimizer: a tf.Optimizer or a function that returns a tf.Optimizer with no
+      inputs.
+   params: An object containing the following parameters:
+      - batch_size: an int indicating the size of the training batch.
+      - z_shape: the desired shape of the input z values (not counting batch).
+      - learning_rate: a scalar or function defining a learning rate applied to
+        optimizer.
+      - input_clip: the amount to clip the x training variable by.
+      - add_summaries: whether or not to add summaries.
+      - opt_kwargs: optimizer kwargs.
+    config: tf.RunConfig. Should point model to output dir and should indicate
+     whether to save checkpoints (to avoid saving checkpoints, set
+     save_checkpoints_steps to a number larger than the number of train steps).
+     The model_dir field in the RunConfig should point to a directory WITHOUT
+     any saved checkpoints.
+    ckpt_dir: the directory where the model checkpoints live. The checkpoint is
+     used to warm start the underlying GAN. This should NOT be the same as
+     config.model_dir.
+    warmstart_options: boolean, None, or a WarmStartSettings object. If set to
+      True, uses a default WarmStartSettings object. If set to False or None,
+      does not use warm start. If using a custom WarmStartSettings object, make
+      sure that new variables are properly accounted for when reloading the
+      underlying GAN. Defaults to True.
+  Returns:
+    An estimator spec defining a GAN input training estimator.
+  """
+  model_fn = _get_latent_gan_model_fn(generator_fn, discriminator_fn,
+                                      loss_fn, optimizer)
+
+  if isinstance(warmstart_options, estimator.WarmStartSettings):
+    ws = warmstart_options
+  elif warmstart_options:
+    # Default WarmStart loads all variable names except INPUT_NAME and
+    # OPTIMIZER_NAME.
+    var_regex = '^(?!.*(%s|%s).*)' % (INPUT_NAME, OPTIMIZER_NAME)
+    ws = estimator.WarmStartSettings(ckpt_to_initialize_from=ckpt_dir,
+                                     vars_to_warm_start=var_regex)
+  else:
+    ws = None
+
+  if 'opt_kwargs' not in params:
+    params['opt_kwargs'] = {}
+
+  return estimator.Estimator(model_fn=model_fn, config=config, params=params,
+                             warm_start_from=ws)
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac139e532e35f7aae6da0655103a7249fe3382d4
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for latent_gan_estimator.
+
+See g3.tp.tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import numpy as np
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator
+from tensorflow.python.estimator import run_config as run_config
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.training import training
+
+
+class TrainInputEstimatorTest(test.TestCase):
+
+  def test_get_input_training_estimator(self):
+    """Integration test to make sure the input_training_estimator works."""
+
+    # Create dummy test input tensors.
+    true_features = np.reshape(np.random.uniform(size=100), (10, 10))
+    true_labels = np.reshape(np.random.uniform(size=100), (5, 20))
+    expected_z_output = [[1, -1], [-1, 1]]
+
+    # Fill out required parameters randomly, includes optimizer kwargs.
+    params = {
+        'batch_size': 2,
+        'z_shape': [2],
+        'learning_rate': 1.0,
+        'input_clip': 1.0,
+        'add_summaries': False,
+        'opt_kwargs': {
+            'beta1': 0.1
+        }
+    }
+
+    input_z_shape = [params['batch_size']] + params['z_shape']
+
+    # Create dummy model functions that represent an underlying GANEstimator and
+    # the input training wrapper. Make sure that everything is wired up
+    # correctly in the internals of each dummy function.
+    def _generator(net, mode):
+      """The generator function will get the newly created z variable."""
+      del mode
+      self.assertSequenceEqual(net.shape, input_z_shape)
+      gen_dummy_var = variable_scope.get_variable(
+          name='generator_dummy_variable',
+          initializer=array_ops.ones(input_z_shape))
+      return net * gen_dummy_var
+
+    def _discriminator(net, condition, mode):
+      """The discriminator function will get either the z variable or labels."""
+      del condition, mode
+      try:
+        self.assertSequenceEqual(net.shape, true_labels.shape)
+      except AssertionError:
+        self.assertSequenceEqual(net.shape, input_z_shape)
+      return net
+
+    def _loss(gan_model, features, labels, _):
+      """Make sure that features and labels are passed in from input."""
+      self.assertTrue(np.array_equal(features, true_features))
+      self.assertTrue(np.array_equal(labels, true_labels))
+      return losses.absolute_difference(expected_z_output,
+                                        gan_model.generated_data)
+
+    optimizer = training.AdamOptimizer
+
+    # We are not loading checkpoints, so set the corresponding directory to a
+    # dummy directories.
+    tmp_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig(model_dir=tmp_dir,
+                                  save_summary_steps=None,
+                                  save_checkpoints_steps=1,
+                                  save_checkpoints_secs=None)
+
+    # Get the estimator. Disable warm start so that there is no attempted
+    # checkpoint reloading.
+    estimator = latent_gan_estimator.get_latent_gan_estimator(
+        _generator, _discriminator, _loss, optimizer, params, config, tmp_dir,
+        warmstart_options=None)
+
+    # Train for a few steps.
+    def dummy_input():
+      return true_features, true_labels
+    estimator.train(input_fn=dummy_input, steps=10)
+
+    # Make sure the generator variables did not change, but the z variables did
+    # change.
+    self.assertTrue(np.array_equal(
+        estimator.get_variable_value('Generator/generator_dummy_variable'),
+        np.ones(input_z_shape)))
+    self.assertTrue(np.array_equal(
+        estimator.get_variable_value('new_var_z_input'),
+        expected_z_output))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
index f60e16bc04662b33bc0bb22b5acc8c7fcc7a03ba..2a485e7d47ff10cf34c1b44f8dcc6b1f33c9a05f 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed StarGAN Estimator."""
+"""A TF-GAN-backed StarGAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
index 2ec7938c7c4051842c7e982b54c1213b6e841b79..c00ff4399748a77f88d9753df7592bf3859d754e 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's stargan_estimator.py."""
+"""Tests for TF-GAN's stargan_estimator.py."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -80,7 +80,7 @@ class StarGetGANModelTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(input_data, gan_model.input_data)
     self.assertIsNotNone(gan_model.generated_data)
     self.assertIsNotNone(gan_model.generated_data_domain_target)
-    self.assertEqual(1, len(gan_model.generator_variables))
+    self.assertLen(gan_model.generator_variables, 1)
     self.assertIsNotNone(gan_model.generator_scope)
     self.assertIsNotNone(gan_model.generator_fn)
     if mode == model_fn_lib.ModeKeys.PREDICT:
@@ -109,7 +109,7 @@ class StarGetGANModelTest(test.TestCase, parameterized.TestCase):
           gan_model.discriminator_input_data_domain_predication)
       self.assertIsNotNone(
           gan_model.discriminator_generated_data_domain_predication)
-      self.assertEqual(2, len(gan_model.discriminator_variables))  # 1 FC layer
+      self.assertLen(gan_model.discriminator_variables, 2)  # 1 FC layer
       self.assertIsNotNone(gan_model.discriminator_scope)
       self.assertIsNotNone(gan_model.discriminator_fn)
 
@@ -163,6 +163,7 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
+    super(GetEstimatorSpecTest, cls).setUpClass()
     cls._generator_optimizer = training.GradientDescentOptimizer(1.0)
     cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0)
 
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb381f7be3f9545ed918813ee55aede946f22d4
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`tf.Learn` components for `TPUGANEstimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = tpu_gan_estimator_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f2a22c78a304c7cc66ef069a235483e9279b3b2
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
@@ -0,0 +1,423 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A TF-GAN-backed GAN Estimator that works on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
+from tensorflow.contrib.gan.python import train as tfgan_train
+from tensorflow.contrib.gan.python.estimator.python import gan_estimator_impl as gan_estimator_lib
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
+from tensorflow.contrib.training.python.training import training
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops.losses import losses
+
+__all__ = [
+    'TPUGANEstimator',
+]
+
+
+class TPUGANEstimator(tpu_estimator.TPUEstimator):
+  """An estimator for Generative Adversarial Networks (GANs) on TPU.
+
+  This Estimator is backed by TFGAN. It is similar to `tfgan.GANEstimator`,
+  but works on TPU.
+
+  Example:
+
+  ```python
+      import tensorflow as tf
+      tfgan = tf.contrib.gan
+
+      # See TFGAN's `train.py` for a description of the generator and
+      # discriminator API.
+      def generator_fn(generator_inputs):
+        ...
+        return generated_data
+
+      def discriminator_fn(data, conditioning):
+        ...
+        return logits
+
+      # Create GAN estimator.
+      config = tpu_config.RunConfig(model_dir='/my/dir')
+      gan_estimator = tfgan.estimator.TPUGANEstimator(
+          generator_fn=generator_fn,
+          discriminator_fn=discriminator_fn,
+          generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+          discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
+          generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          train_batch_size=4,
+          config=config)
+
+      # Train estimator.
+      gan_estimator.train(train_input_fn, train_steps)
+
+      # Evaluate resulting estimator.
+      gan_estimator.evaluate(eval_input_fn, eval_steps)
+
+      # Generate samples from generator.
+      predictions = np.array([
+          x['generated_data'] for x in gan_estimator.predict(predict_input_fn)])
+  ```
+  """
+
+  def __init__(self,
+               # Arguments to construct the `model_fn`.
+               generator_fn=None,
+               discriminator_fn=None,
+               generator_loss_fn=None,
+               discriminator_loss_fn=None,
+               generator_optimizer=None,
+               discriminator_optimizer=None,
+               get_eval_metric_ops_fn=None,
+               add_summaries=None,
+               joint_train=False,
+               gan_train_steps=tfgan_tuples.GANTrainSteps(1, 1),
+               # TPUEstimator options.
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None,
+               eval_batch_size=None,
+               predict_batch_size=None,
+               batch_axis=None,
+               eval_on_tpu=True,
+               export_to_tpu=True,
+               warm_start_from=None):
+    """Initializes a TPUGANEstimator instance.
+
+    Args:
+      generator_fn: A python function that takes a Tensor, Tensor list, or
+        Tensor dictionary as inputs and returns the outputs of the GAN
+        generator. See `TFGAN` for more details and examples. Additionally, if
+        it has an argument called `mode`, the Estimator's `mode` will be passed
+        in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
+        normalization.
+      discriminator_fn: A python function that takes the output of
+        `generator_fn` or real data in the GAN setup, and `generator_inputs`.
+        Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
+        and examples.
+      generator_loss_fn: The loss function on the generator. Takes a `GANModel`
+        tuple.
+      discriminator_loss_fn: The loss function on the discriminator. Takes a
+        `GANModel` tuple.
+      generator_optimizer: The optimizer for generator updates, or a function
+        that takes no arguments and returns an optimizer. This function will
+        be called when the default graph is the `GANEstimator`'s graph, so
+        utilities like `tf.contrib.framework.get_or_create_global_step` will
+        work.
+      discriminator_optimizer: Same as `generator_optimizer`, but for the
+        discriminator updates.
+      get_eval_metric_ops_fn: A function that takes a list of arguments and
+        returns a dict of metric results keyed by name. The output of this
+        function is passed into `tf.estimator.EstimatorSpec` during evaluation.
+        The arguments must be:
+            * generator_inputs
+            * generated_data
+            * real_data
+            * discriminator_real_outputs
+            * discriminator_gen_outputs
+      add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
+        This is ignored for jobs that run on TPU, such as the train job if
+        `use_tpu` is `True` or the eval job if `eval_on_tpu` is `True`.
+      joint_train: A Python boolean. If `True`, jointly train the generator and
+        the discriminator. If `False`, sequentially train them. See `train.py`
+        in TFGAN for more details on the differences between the two GAN
+        training methods.
+      gan_train_steps: A `tfgan.GANTrainSteps` named tuple describing the ratio
+        of generator to discriminator steps. For now, only supports 1:1
+        training.
+      model_dir: Same as `TPUEstimator`: Directory to save model parameters,
+        graph and etc. This can also be used to load checkpoints from the
+        directory into a estimator to continue training a previously saved
+        model. If `None`, the model_dir in `config` will be used if set. If both
+        are set, they must be same. If both are `None`, a temporary directory
+        will be used.
+      config: Same as `TPUEstimator`: An `tpu_config.RunConfig` configuration
+        object. Cannot be `None`.
+      params: Same as `TPUEstimator`: An optional `dict` of hyper parameters
+        that will be passed into `input_fn` and `model_fn`.  Keys are names of
+        parameters, values are basic python types. There are reserved keys for
+        `TPUEstimator`, including 'batch_size'.
+      use_tpu: Same as `TPUEstimator`: A bool indicating whether TPU support is
+        enabled. Currently, TPU training and evaluation respect this bit, but
+        eval_on_tpu can override execution of eval. See below. Predict still
+        happens on CPU.
+      train_batch_size: Same as `TPUEstimator`: An int representing the global
+        training batch size. TPUEstimator transforms this global batch size to a
+        per-shard batch size, as params['batch_size'], when calling `input_fn`
+        and `model_fn`. Cannot be `None` if `use_tpu` is `True`. Must be
+        divisible by total number of replicas.
+      eval_batch_size: Same as `TPUEstimator`: An int representing evaluation
+        batch size. Must be divisible by total number of replicas.
+      predict_batch_size: Same as `TPUEstimator`: An int representing the
+        prediction batch size. Must be divisible by total number of replicas.
+      batch_axis: Same as `TPUEstimator`: A python tuple of int values
+        describing how each tensor produced by the Estimator `input_fn` should
+        be split across the TPU compute shards. For example, if your input_fn
+        produced (images, labels) where the images tensor is in `HWCN` format,
+        your shard dimensions would be [3, 0], where 3 corresponds to the `N`
+        dimension of your images Tensor, and 0 corresponds to the dimension
+        along which to split the labels to match up with the corresponding
+        images. If None is supplied, and per_host_input_for_training is True,
+        batches will be sharded based on the major dimension. If
+        tpu_config.per_host_input_for_training is False or `PER_HOST_V2`,
+        batch_axis is ignored.
+      eval_on_tpu: Same as `TPUEstimator`: If False, evaluation runs on CPU or
+        GPU. In this case, the model_fn must return `EstimatorSpec` when called
+        with `mode` as `EVAL`.
+      export_to_tpu: Same as `TPUEstimator`: If True, `export_savedmodel()`
+        exports a metagraph for serving on TPU besides the one on CPU.
+      warm_start_from: Same as `TPUEstimator`: Optional string filepath to a
+        checkpoint or SavedModel to warm-start from, or a
+        `tf.estimator.WarmStartSettings` object to fully configure
+        warm-starting.  If the string filepath is provided instead of a
+        `WarmStartSettings`, then all variables are warm-started, and it is
+        assumed that vocabularies and Tensor names are unchanged.
+
+    Raises:
+      ValueError: If loss functions aren't callable.
+      ValueError: If `gan_train_steps` isn't a `tfgan_tuples.GANTrainSteps`
+        tuple.
+      ValueError: If `gan_train_steps` isn't 1:1 training.
+    """
+    if not callable(generator_loss_fn):
+      raise ValueError('generator_loss_fn must be callable.')
+    if not callable(discriminator_loss_fn):
+      raise ValueError('discriminator_loss_fn must be callable.')
+    if not isinstance(gan_train_steps, tfgan_tuples.GANTrainSteps):
+      raise ValueError(
+          '`gan_train_steps` must be `tfgan_tuples.GANTrainSteps`. Instead, '
+          'was type: %s' % type(gan_train_steps))
+    if (gan_train_steps.generator_train_steps != 1 or
+        gan_train_steps.discriminator_train_steps != 1):
+      raise ValueError('Estimator currently only supports 1:1 training.')
+
+    if use_tpu:
+      generator_optimizer = _maybe_make_cross_shard_optimizer(
+          generator_optimizer)
+      discriminator_optimizer = _maybe_make_cross_shard_optimizer(
+          discriminator_optimizer)
+
+    def _model_fn(features, labels, mode, params):
+      """GANEstimator model function."""
+      del params  # unused
+      if mode not in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL,
+                      model_fn_lib.ModeKeys.PREDICT]:
+        raise ValueError('Mode not recognized: %s' % mode)
+      real_data = labels  # rename inputs for clarity
+      generator_inputs = features  # rename inputs for clarity
+
+      # Make GANModel, which encapsulates the GAN model architectures.
+      # TODO(joelshor): Switch TF-GAN over to TPU-compatible summaries, then
+      # remove `add_summaries` logic below.
+      is_on_tpu = _is_on_tpu(mode, use_tpu, eval_on_tpu)
+      gan_model = gan_estimator_lib._get_gan_model(  # pylint:disable=protected-access
+          mode, generator_fn, discriminator_fn, real_data, generator_inputs,
+          add_summaries=None if is_on_tpu else add_summaries)
+
+      # Make the TPUEstimatorSpec, which incorporates the GANModel, losses, eval
+      # metrics, and optimizers (if required).
+      estimator_spec = _get_estimator_spec(
+          mode, gan_model, generator_loss_fn, discriminator_loss_fn,
+          get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
+          joint_train, is_on_tpu, gan_train_steps)
+      assert isinstance(estimator_spec, tpu_estimator.TPUEstimatorSpec)
+      return estimator_spec
+
+    super(TPUGANEstimator, self).__init__(
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        use_tpu=use_tpu,
+        train_batch_size=train_batch_size,
+        eval_batch_size=eval_batch_size,
+        predict_batch_size=predict_batch_size,
+        batch_axis=batch_axis,
+        eval_on_tpu=eval_on_tpu,
+        export_to_tpu=export_to_tpu,
+        warm_start_from=warm_start_from)
+
+
+def _is_on_tpu(mode, use_tpu, eval_on_tpu):
+  if mode == model_fn_lib.ModeKeys.TRAIN:
+    return use_tpu
+  elif mode == model_fn_lib.ModeKeys.EVAL:
+    return eval_on_tpu
+  else:
+    return False
+
+
+def _get_estimator_spec(
+    mode, gan_model, generator_loss_fn, discriminator_loss_fn,
+    get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
+    joint_train, is_on_tpu, gan_train_steps):
+  """Get the TPUEstimatorSpec for the current mode."""
+  if mode == model_fn_lib.ModeKeys.PREDICT:
+    estimator_spec = tpu_estimator.TPUEstimatorSpec(
+        mode=mode, predictions={'generated_data': gan_model.generated_data})
+  elif mode == model_fn_lib.ModeKeys.EVAL:
+    gan_loss = tfgan_tuples.GANLoss(
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu),
+        discriminator_loss=discriminator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu))
+    # Eval losses for metrics must preserve batch dimension.
+    gan_loss_no_reduction = tfgan_tuples.GANLoss(
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=False, reduction=losses.Reduction.NONE),
+        discriminator_loss=discriminator_loss_fn(
+            gan_model, add_summaries=False, reduction=losses.Reduction.NONE))
+    estimator_spec = _get_eval_estimator_spec(
+        gan_model, gan_loss, gan_loss_no_reduction, get_eval_metric_ops_fn)
+  else:  # model_fn_lib.ModeKeys.TRAIN:
+    gan_loss = tfgan_tuples.GANLoss(
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu),
+        discriminator_loss=discriminator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu))
+
+    # Construct optimizers if arguments were callable. For TPUs, they must be
+    # `CrossShardOptimizer`.
+    g_callable = callable(generator_optimizer)
+    gopt = generator_optimizer() if g_callable  else generator_optimizer
+    d_callable = callable(discriminator_optimizer)
+    dopt = discriminator_optimizer() if d_callable else discriminator_optimizer
+
+    estimator_spec = _get_train_estimator_spec(
+        gan_model, gan_loss, gopt, dopt, joint_train, gan_train_steps)
+
+  return estimator_spec
+
+
+def _get_eval_estimator_spec(gan_model, gan_loss, gan_loss_no_reduction,
+                             get_eval_metric_ops_fn):
+  """Return an TPUEstimatorSpec for the eval case."""
+  # Make the metric function and tensor names.
+  if get_eval_metric_ops_fn is not None:
+    def metric_fn(
+        generator_inputs, generated_data, real_data, discriminator_real_outputs,
+        discriminator_gen_outputs, generator_loss, discriminator_loss):
+      """`metric_fn` used in TPUEstimator to calculate metrics."""
+      eval_metric_ops = {
+          'generator_loss': metrics_lib.mean(generator_loss),
+          'discriminator_loss': metrics_lib.mean(discriminator_loss),
+      }
+      custom_eval_metric_ops = get_eval_metric_ops_fn(
+          generator_inputs, generated_data, real_data,
+          discriminator_real_outputs, discriminator_gen_outputs)
+      if not isinstance(custom_eval_metric_ops, dict):
+        raise TypeError('`get_eval_metric_ops_fn` must return a dict, '
+                        'received: {}'.format(custom_eval_metric_ops))
+      eval_metric_ops.update(custom_eval_metric_ops)
+      return eval_metric_ops
+    tensors = {
+        'generator_loss': gan_loss_no_reduction.generator_loss,
+        'discriminator_loss': gan_loss_no_reduction.discriminator_loss,
+        'generator_inputs': gan_model.generator_inputs,
+        'generated_data': gan_model.generated_data,
+        'real_data': gan_model.real_data,
+        'discriminator_real_outputs': gan_model.discriminator_real_outputs,
+        'discriminator_gen_outputs': gan_model.discriminator_gen_outputs,
+    }
+  else:
+    def metric_fn(generator_loss, discriminator_loss):
+      return {
+          'generator_loss': metrics_lib.mean(generator_loss),
+          'discriminator_loss': metrics_lib.mean(discriminator_loss),
+      }
+    tensors = {
+        'generator_loss': gan_loss_no_reduction.generator_loss,
+        'discriminator_loss': gan_loss_no_reduction.discriminator_loss,
+    }
+
+  scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+  return tpu_estimator.TPUEstimatorSpec(
+      mode=model_fn_lib.ModeKeys.EVAL,
+      predictions=gan_model.generated_data,
+      loss=scalar_loss,
+      eval_metrics=(metric_fn, tensors))
+
+
+def _get_train_estimator_spec(
+    gan_model, gan_loss, generator_optimizer, discriminator_optimizer,
+    joint_train, gan_train_steps):
+  """Return a TPUEstimatorSpec for the train case."""
+  scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+
+  # Get generator and discriminator update ops. We split them so that update
+  # ops aren't accidentally run multiple times. For now, throw an error if
+  # there are update ops that aren't associated with either the generator or
+  # the discriminator. Might modify the `kwargs` dictionary.
+  gen_update_ops, dis_update_ops = tfgan_train._get_update_ops(  # pylint:disable=protected-access
+      {}, gan_model.generator_scope.name, gan_model.discriminator_scope.name)
+
+  def gen_train_op():
+    with ops.name_scope('generator_train'):
+      return training.create_train_op(
+          total_loss=gan_loss.generator_loss,
+          optimizer=generator_optimizer,
+          variables_to_train=gan_model.generator_variables,
+          update_ops=gen_update_ops)
+  def dis_train_op():
+    with ops.name_scope('discriminator_train'):
+      return training.create_train_op(
+          total_loss=gan_loss.discriminator_loss,
+          optimizer=discriminator_optimizer,
+          variables_to_train=gan_model.discriminator_variables,
+          update_ops=dis_update_ops)
+
+  # Either optimize the generator and discriminator sequentially or jointly.
+  tpu_train_op = _combine_train_ops(gen_train_op, dis_train_op, joint_train,
+                                    gan_train_steps)
+
+  return tpu_estimator.TPUEstimatorSpec(
+      loss=scalar_loss,
+      mode=model_fn_lib.ModeKeys.TRAIN,
+      train_op=tpu_train_op)
+
+
+# TODO(joelshor): Add support for multiple D / G steps.
+def _combine_train_ops(gen_train_op, dis_train_op, joint_train,
+                       gan_train_steps):
+  """Combine generator and discriminator train ops into a single op."""
+  del gan_train_steps
+  if joint_train:
+    tpu_train_op = control_flow_ops.group(gen_train_op(), dis_train_op(),
+                                          name='joint_train')
+  else:
+    with ops.control_dependencies([dis_train_op()]):
+      tpu_train_op = gen_train_op()
+
+  return tpu_train_op
+
+
+def _maybe_make_cross_shard_optimizer(opt):
+  if callable(opt):
+    if not isinstance(opt(), tpu_optimizer.CrossShardOptimizer):
+      return lambda: tpu_optimizer.CrossShardOptimizer(opt())
+  elif not isinstance(opt, tpu_optimizer.CrossShardOptimizer):
+    return tpu_optimizer.CrossShardOptimizer(opt)
+  return opt
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d9e6489bdd1d89cc49bfedc2eed784999c31d2b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
@@ -0,0 +1,319 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TF-GAN's TPU Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
+from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator_impl as estimator
+from tensorflow.contrib.gan.python.losses.python import tuple_losses as losses
+from tensorflow.contrib.tpu.python.tpu import tpu_config
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.estimator import WarmStartSettings
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework.errors_impl import NotFoundError
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import learning_rate_decay
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_bool('use_tpu', False, 'Whether to run test on TPU or not.')
+
+
+def generator_fn(noise, mode):
+  del mode
+  return layers.fully_connected(noise, tensor_shape.dimension_value(
+      noise.shape[1]))
+
+
+def discriminator_fn(data, unused_conditioning, mode):
+  del unused_conditioning, mode
+  return layers.fully_connected(data, 1)
+
+
+def get_dummy_gan_model():
+  # TODO(joelshor): Find a better way of creating a variable scope.
+  with variable_scope.variable_scope('generator') as gen_scope:
+    gen_var = variable_scope.get_variable('dummy_var', initializer=0.0)
+  with variable_scope.variable_scope('discriminator') as dis_scope:
+    dis_var = variable_scope.get_variable('dummy_var', initializer=0.0)
+  return tfgan_tuples.GANModel(
+      generator_inputs=None,
+      generated_data=array_ops.ones([3, 4]),
+      generator_variables=[gen_var],
+      generator_scope=gen_scope,
+      generator_fn=None,
+      real_data=array_ops.zeros([3, 4]),
+      discriminator_real_outputs=array_ops.ones([1, 2, 3]) * dis_var,
+      discriminator_gen_outputs=array_ops.ones([1, 2, 3]) * gen_var * dis_var,
+      discriminator_variables=[dis_var],
+      discriminator_scope=dis_scope,
+      discriminator_fn=None)
+
+
+def get_metrics(generator_inputs, generated_data, real_data,
+                discriminator_real_outputs, discriminator_gen_outputs):
+  del generator_inputs, discriminator_real_outputs, discriminator_gen_outputs
+  return {
+      'mse_custom_metric': metrics_lib.mean_squared_error(
+          real_data, generated_data)
+  }
+
+
+class GetTPUEstimatorSpecTest(test.TestCase, parameterized.TestCase):
+  """Tests that the EstimatorSpec is constructed appropriately."""
+
+  @classmethod
+  def setUpClass(cls):
+    super(GetTPUEstimatorSpecTest, cls).setUpClass()
+    cls._generator_optimizer = tpu_optimizer.CrossShardOptimizer(
+        training.GradientDescentOptimizer(1.0))
+    cls._discriminator_optimizer = tpu_optimizer.CrossShardOptimizer(
+        training.GradientDescentOptimizer(1.0))
+
+  @parameterized.named_parameters(
+      ('joint_train', model_fn_lib.ModeKeys.TRAIN, True),
+      ('train_sequential', model_fn_lib.ModeKeys.TRAIN, False),
+      ('eval', model_fn_lib.ModeKeys.EVAL, None),
+      ('predict', model_fn_lib.ModeKeys.PREDICT, None))
+  def test_get_estimator_spec(self, mode, joint_train):
+    with ops.Graph().as_default():
+      self._gan_model = get_dummy_gan_model()
+      spec = estimator._get_estimator_spec(
+          mode,
+          self._gan_model,
+          generator_loss_fn=losses.wasserstein_generator_loss,
+          discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+          get_eval_metric_ops_fn=get_metrics,
+          generator_optimizer=self._generator_optimizer,
+          discriminator_optimizer=self._discriminator_optimizer,
+          joint_train=joint_train,
+          is_on_tpu=FLAGS.use_tpu,
+          gan_train_steps=tfgan_tuples.GANTrainSteps(1, 1))
+
+    self.assertIsInstance(spec, tpu_estimator.TPUEstimatorSpec)
+    self.assertEqual(mode, spec.mode)
+    if mode == model_fn_lib.ModeKeys.PREDICT:
+      self.assertEqual({'generated_data': self._gan_model.generated_data},
+                       spec.predictions)
+    elif mode == model_fn_lib.ModeKeys.TRAIN:
+      self.assertShapeEqual(np.array(0), spec.loss)  # must be a scalar
+      self.assertIsNotNone(spec.train_op)
+      self.assertIsNotNone(spec.training_hooks)
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      self.assertEqual(self._gan_model.generated_data, spec.predictions)
+      self.assertShapeEqual(np.array(0), spec.loss)  # must be a scalar
+      self.assertIsNotNone(spec.eval_metrics)
+
+
+class TPUGANEstimatorIntegrationTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(TPUGANEstimatorIntegrationTest, self).setUp()
+    self._model_dir = tempfile.mkdtemp()
+    self._config = tpu_config.RunConfig(model_dir=self._model_dir)
+
+  def tearDown(self):
+    super(TPUGANEstimatorIntegrationTest, self).tearDown()
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, prediction_size,
+      lr_decay=False, joint_train=True):
+    def make_opt():
+      gstep = training_util.get_or_create_global_step()
+      lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9)
+      return training.GradientDescentOptimizer(lr)
+
+    gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
+    dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
+    est = estimator.TPUGANEstimator(
+        generator_fn=generator_fn,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=gopt,
+        discriminator_optimizer=dopt,
+        joint_train=joint_train,
+        get_eval_metric_ops_fn=get_metrics,
+        train_batch_size=4,
+        eval_batch_size=10,
+        predict_batch_size=8,
+        use_tpu=FLAGS.use_tpu,
+        config=self._config)
+
+    # Train.
+    num_steps_train = 10
+    est.train(train_input_fn, steps=num_steps_train)
+
+    # Evaluate.
+    num_steps_eval = 2
+    scores = est.evaluate(eval_input_fn, steps=num_steps_eval)
+    self.assertIn(ops.GraphKeys.GLOBAL_STEP, six.iterkeys(scores))
+    self.assertIn('loss', six.iterkeys(scores))
+    self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
+                     scores['loss'])
+    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+
+    # Predict.
+    predictions = np.array([x['generated_data'] for x in
+                            est.predict(predict_input_fn)])
+    self.assertAllEqual(prediction_size, predictions.shape)
+
+  @parameterized.named_parameters(
+      ('joint_train', True, False, False),
+      ('train_sequential', False, False, False),
+      ('lr_decay', False, True, False),
+      ('train_sequential_ds', False, False, True))
+  def test_numpy_input_fn(self, joint_train, lr_decay, return_ds):
+    """Tests complete flow with numpy_input_fn."""
+    input_dim = 4
+    def train_input_fn(params):
+      data = np.zeros([input_dim], dtype=np.float32)
+      ds = (dataset_ops.Dataset
+            .from_tensors((data, data))
+            .repeat()
+            .batch(params['batch_size'], drop_remainder=True))
+      if return_ds:
+        return ds
+      else:
+        x, y = ds.make_one_shot_iterator().get_next()
+        return x, y
+    def eval_input_fn(params):
+      data = np.zeros([input_dim], dtype=np.float32)
+      ds = (dataset_ops.Dataset
+            .from_tensors((data, data))
+            .repeat()
+            .batch(params['batch_size'], drop_remainder=True))
+      if return_ds:
+        return ds
+      else:
+        x, y = ds.make_one_shot_iterator().get_next()
+        return x, y
+    predict_size = 10
+    def predict_input_fn(params):
+      del params  # unused
+      data = np.zeros([input_dim], dtype=np.float32)
+      ds = (dataset_ops.Dataset
+            .from_tensors(data)
+            .repeat(predict_size)
+            .batch(1, drop_remainder=True))
+      return ds
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        prediction_size=[predict_size, input_dim],
+        lr_decay=lr_decay,
+        joint_train=joint_train)
+
+
+class TPUGANEstimatorWarmStartTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = self.get_temp_dir()
+    self._config = tpu_config.RunConfig(model_dir=self._model_dir)
+    self.new_variable_name = 'new_var'
+    self.new_variable_value = [1.0, 2.0, 3.0]
+
+  def tearDown(self):
+    writer_cache.FileWriterCache.clear()
+
+  def _test_warm_start(self, warm_start_from=None):
+    """Tests whether WarmStartSettings work as intended."""
+    def generator_with_new_variable(noise_dict, mode):
+      variable_scope.get_variable(name=self.new_variable_name,
+                                  initializer=self.new_variable_value,
+                                  trainable=True)
+      return generator_fn(noise_dict, mode)
+
+    est = estimator.TPUGANEstimator(
+        generator_fn=generator_fn,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=training.GradientDescentOptimizer(1.0),
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        train_batch_size=4,
+        use_tpu=FLAGS.use_tpu,
+        config=self._config)
+
+    def train_input_fn(params):
+      data = np.zeros([params['batch_size'], 4], dtype=np.float32)
+      return data, data
+
+    est.train(train_input_fn, steps=1)
+
+    est_warm = estimator.TPUGANEstimator(
+        generator_fn=generator_with_new_variable,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=training.GradientDescentOptimizer(1.0),
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        config=tpu_config.RunConfig(
+            model_dir=None if warm_start_from else self._model_dir),
+        train_batch_size=4,
+        use_tpu=FLAGS.use_tpu,
+        warm_start_from=warm_start_from)
+
+    est_warm.train(train_input_fn, steps=1)
+
+    return est_warm
+
+  def test_warm_start_error(self):
+    """Test if exception when reloading different estimators."""
+    with self.assertRaises(NotFoundError):
+      self._test_warm_start()
+
+  def test_warm_start_success(self):
+    """Test if GANEstimator allows explicit warm start variable assignment."""
+    # Regex matches all variable names in ckpt except for new_var.
+    var_regex = '^(?!.*%s.*)' % self.new_variable_name
+    warmstart = WarmStartSettings(ckpt_to_initialize_from=self._model_dir,
+                                  vars_to_warm_start=var_regex)
+    est_warm = self._test_warm_start(warm_start_from=warmstart)
+    full_variable_name = 'Generator/%s' % self.new_variable_name
+    self.assertIn(full_variable_name, est_warm.get_variable_names())
+    equal_vals = np.array_equal(est_warm.get_variable_value(full_variable_name),
+                                self.new_variable_value)
+    self.assertTrue(equal_vals)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py
index f86b8513053a45f9830411f7df2c32d1f36a97b2..92e9abf8a35de1999eb800e169f32220fe47f8cd 100644
--- a/tensorflow/contrib/gan/python/eval/__init__.py
+++ b/tensorflow/contrib/gan/python/eval/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN evaluation module.
+"""TF-GAN evaluation module.
 
 This module supports techniques such as Inception Score, Frechet Inception
 distance, and Sliced Wasserstein distance.
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
index 1c872626a957279132772ae27df7a66a2564e9a5..a52e899114b62cb29752f72aa59f142f4a428aa1 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN."""
+"""Model evaluation tools for TF-GAN."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index a71ee53311c1c057a5b41be0331bf56ce1a82f74..31f0d34ed68a6adc25cca102236079d0f66615cb 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN.
+"""Model evaluation tools for TF-GAN.
 
 These methods come from https://arxiv.org/abs/1606.03498,
 https://arxiv.org/abs/1706.08500, and https://arxiv.org/abs/1801.01401.
@@ -387,7 +387,7 @@ def classifier_score_from_logits(logits):
   # Use maximum precision for best results.
   logits_dtype = logits.dtype
   if logits_dtype != dtypes.float64:
-    logits = math_ops.to_double(logits)
+    logits = math_ops.cast(logits, dtypes.float64)
 
   p = nn_ops.softmax(logits)
   q = math_ops.reduce_mean(p, axis=0)
@@ -562,8 +562,8 @@ def mean_only_frechet_classifier_distance_from_activations(
 
   activations_dtype = real_activations.dtype
   if activations_dtype != dtypes.float64:
-    real_activations = math_ops.to_double(real_activations)
-    generated_activations = math_ops.to_double(generated_activations)
+    real_activations = math_ops.cast(real_activations, dtypes.float64)
+    generated_activations = math_ops.cast(generated_activations, dtypes.float64)
 
   # Compute means of activations.
   m = math_ops.reduce_mean(real_activations, 0)
@@ -623,8 +623,8 @@ def diagonal_only_frechet_classifier_distance_from_activations(
 
   activations_dtype = real_activations.dtype
   if activations_dtype != dtypes.float64:
-    real_activations = math_ops.to_double(real_activations)
-    generated_activations = math_ops.to_double(generated_activations)
+    real_activations = math_ops.cast(real_activations, dtypes.float64)
+    generated_activations = math_ops.cast(generated_activations, dtypes.float64)
 
   # Compute mean and covariance matrices of activations.
   m, var = nn_impl.moments(real_activations, axes=[0])
@@ -698,15 +698,16 @@ def frechet_classifier_distance_from_activations(real_activations,
 
   activations_dtype = real_activations.dtype
   if activations_dtype != dtypes.float64:
-    real_activations = math_ops.to_double(real_activations)
-    generated_activations = math_ops.to_double(generated_activations)
+    real_activations = math_ops.cast(real_activations, dtypes.float64)
+    generated_activations = math_ops.cast(generated_activations, dtypes.float64)
 
   # Compute mean and covariance matrices of activations.
   m = math_ops.reduce_mean(real_activations, 0)
   m_w = math_ops.reduce_mean(generated_activations, 0)
-  num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0])
-  num_examples_generated = math_ops.to_double(
-      array_ops.shape(generated_activations)[0])
+  num_examples_real = math_ops.cast(
+      array_ops.shape(real_activations)[0], dtypes.float64)
+  num_examples_generated = math_ops.cast(
+      array_ops.shape(generated_activations)[0], dtypes.float64)
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
   real_centered = real_activations - m
@@ -794,9 +795,9 @@ def kernel_classifier_distance(real_images,
       on a classifier.
     num_classifier_batches: Number of batches to split images in to in order to
       efficiently run them through the classifier network.
-    max_estimator_block_size: integer, default 1024. The distance estimator
-      splits samples into blocks for computational efficiency. Larger values are
-      more computationally expensive but decrease the variance of the distance
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
       estimate.
     dtype: if not None, coerce activations to this dtype before computations.
 
@@ -871,9 +872,9 @@ def kernel_classifier_distance_and_std(real_images,
       on a classifier.
     num_classifier_batches: Number of batches to split images in to in order to
       efficiently run them through the classifier network.
-    max_estimator_block_size: integer, default 1024. The distance estimator
-      splits samples into blocks for computational efficiency. Larger values are
-      more computationally expensive but decrease the variance of the distance
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
       estimate. Having a smaller block size also gives a better estimate of the
       standard error.
     dtype: if not None, coerce activations to this dtype before computations.
@@ -910,7 +911,7 @@ def kernel_classifier_distance_and_std(real_images,
   gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
 
   return kernel_classifier_distance_and_std_from_activations(
-      real_a, gen_a, max_block_size=max_block_size)
+      real_a, gen_a, max_block_size, dtype)
 
 
 kernel_inception_distance_and_std = functools.partial(
@@ -967,14 +968,14 @@ def kernel_classifier_distance_from_activations(real_activations,
       into blocks for computational efficiency. Larger values are more
       computationally expensive but decrease the variance of the distance
       estimate.
-    dtype: if not None, coerce activations to this dtype before computations.
+    dtype: If not None, coerce activations to this dtype before computations.
 
   Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
    as the output of the activations.
   """
   return kernel_classifier_distance_and_std_from_activations(
-      real_activations, generated_activations, max_block_size=max_block_size)[0]
+      real_activations, generated_activations, max_block_size, dtype)[0]
 
 
 def kernel_classifier_distance_and_std_from_activations(real_activations,
@@ -1029,7 +1030,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
       computationally expensive but decrease the variance of the distance
       estimate. Having a smaller block size also gives a better estimate of the
       standard error.
-    dtype: if not None, coerce activations to this dtype before computations.
+    dtype: If not None, coerce activations to this dtype before computations.
 
   Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
@@ -1080,7 +1081,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
   dim = math_ops.cast(real_activations.shape[1], dtype)
 
   def compute_kid_block(i):
-    'Compute the ith block of the KID estimate.'
+    """Computes the ith block of the KID estimate."""
     r_s = inds_r[i]
     r_e = inds_r[i + 1]
     r = real_activations[r_s:r_e]
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index dbff1d2a367e10adc607dafb4c571bb3607a3963..bd17571a0535a3c8e9dfee24a8da16eb2e72f165 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN classifier_metrics."""
+"""Tests for TF-GAN classifier_metrics."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -234,7 +234,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     else:
       logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
-    self.assertTrue(isinstance(logits, ops.Tensor))
+    self.assertIsInstance(logits, ops.Tensor)
     logits.shape.assert_is_compatible_with([batch_size, 1001])
 
     # Check that none of the model variables are trainable.
@@ -258,7 +258,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
           img, _get_dummy_graphdef(),
           output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
-    self.assertTrue(isinstance(pool, ops.Tensor))
+    self.assertIsInstance(pool, ops.Tensor)
     pool.shape.assert_is_compatible_with([batch_size, 2048])
 
     # Check that none of the model variables are trainable.
@@ -276,8 +276,8 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
             classifier_metrics.INCEPTION_FINAL_POOL
         ])
 
-    self.assertTrue(isinstance(logits, ops.Tensor))
-    self.assertTrue(isinstance(pool, ops.Tensor))
+    self.assertIsInstance(logits, ops.Tensor)
+    self.assertIsInstance(pool, ops.Tensor)
     logits.shape.assert_is_compatible_with([batch_size, 1001])
     pool.shape.assert_is_compatible_with([batch_size, 2048])
 
@@ -290,7 +290,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         classifier_metrics.inception_score,
         array_ops.zeros([6, 299, 299, 3]),
         num_batches=3)
-    self.assertTrue(isinstance(score, ops.Tensor))
+    self.assertIsInstance(score, ops.Tensor)
     score.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -302,7 +302,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     distance = _run_with_mock(
         classifier_metrics.frechet_inception_distance, img, img)
 
-    self.assertTrue(isinstance(distance, ops.Tensor))
+    self.assertIsInstance(distance, ops.Tensor)
     distance.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -314,7 +314,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     distance = _run_with_mock(classifier_metrics.kernel_inception_distance, img,
                               img)
 
-    self.assertTrue(isinstance(distance, ops.Tensor))
+    self.assertIsInstance(distance, ops.Tensor)
     distance.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
index 523968bed91f1021ae629bf52c405cf5c2d7b917..326fcb3cdbf2eda66207f134cd2926f09a216a99 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN."""
+"""Model evaluation tools for TF-GAN."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries.py b/tensorflow/contrib/gan/python/eval/python/summaries.py
index ecfdb39499b1e824e02415c0db1de3157e4f3216..1b202dfc97304ddc7ced42d65366aaf419439392 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common TFGAN summaries."""
+"""Common TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index f9995bb19d0d09eaf6fd96d039b0bba1d3a7055c..9f448d3a1602c503093214201bdc96fc9bee85b5 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common TFGAN summaries."""
+"""Common TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 54a6f8d4d9086ad7fc8db31032677628561e48e8..53fc7cb8ede698c2d8590c7fd3016a884cef9be9 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN summaries."""
+"""Tests for TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py
index 4816daf760143af9f1502873b123ffad8e5ec8ce..410c3a02052cd3a07a36a0ba332a80b3c2705d89 100644
--- a/tensorflow/contrib/gan/python/features/__init__.py
+++ b/tensorflow/contrib/gan/python/features/__init__.py
@@ -27,11 +27,13 @@ from __future__ import print_function
 from tensorflow.contrib.gan.python.features.python import clip_weights
 from tensorflow.contrib.gan.python.features.python import conditioning_utils
 from tensorflow.contrib.gan.python.features.python import random_tensor_pool
+from tensorflow.contrib.gan.python.features.python import spectral_normalization
 from tensorflow.contrib.gan.python.features.python import virtual_batchnorm
 
 from tensorflow.contrib.gan.python.features.python.clip_weights import *
 from tensorflow.contrib.gan.python.features.python.conditioning_utils import *
 from tensorflow.contrib.gan.python.features.python.random_tensor_pool import *
+from tensorflow.contrib.gan.python.features.python.spectral_normalization import *
 from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -40,5 +42,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = clip_weights.__all__
 _allowed_symbols += conditioning_utils.__all__
 _allowed_symbols += random_tensor_pool.__all__
+_allowed_symbols += spectral_normalization.__all__
 _allowed_symbols += virtual_batchnorm.__all__
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d3d0a218dec3588844333cd47e1f92489d8df9
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras-like layers and utilities that implement Spectral Normalization.
+
+Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato,
+et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT-
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.features.python.spectral_normalization_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = spectral_normalization_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cc653f0a7907f407e66add5537d1e0a5adb6d8b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
@@ -0,0 +1,315 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras-like layers and utilities that implement Spectral Normalization.
+
+Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato,
+et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT-
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import numbers
+import re
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import base_layer_utils as keras_base_layer_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+
+__all__ = [
+    'compute_spectral_norm', 'spectral_normalize', 'spectral_norm_regularizer',
+    'spectral_normalization_custom_getter', 'keras_spectral_normalization'
+]
+
+# tf.bfloat16 should work, but tf.matmul converts those to tf.float32 which then
+# can't directly be assigned back to the tf.bfloat16 variable.
+_OK_DTYPES_FOR_SPECTRAL_NORM = (dtypes.float16, dtypes.float32, dtypes.float64)
+_PERSISTED_U_VARIABLE_SUFFIX = 'spectral_norm_u'
+
+
+def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
+  """Estimates the largest singular value in the weight tensor.
+
+  Args:
+    w_tensor: The weight matrix whose spectral norm should be computed.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    name: An optional scope name.
+
+  Returns:
+    The largest singular value (the spectral norm) of w.
+  """
+  with variable_scope.variable_scope(name, 'spectral_norm'):
+    # The paper says to flatten convnet kernel weights from
+    # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
+    # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
+    # (KH * KW * C_in, C_out), and similarly for other layers that put output
+    # channels as last dimension.
+    # n.b. this means that w here is equivalent to w.T in the paper.
+    w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))
+
+    # Persisted approximation of first left singular vector of matrix `w`.
+    u_var = variable_scope.get_variable(
+        _PERSISTED_U_VARIABLE_SUFFIX,
+        shape=(w.shape[0], 1),
+        dtype=w.dtype,
+        initializer=init_ops.random_normal_initializer(),
+        trainable=False)
+    u = u_var
+
+    # Use power iteration method to approximate spectral norm.
+    for _ in range(power_iteration_rounds):
+      # `v` approximates the first right singular vector of matrix `w`.
+      v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u))
+      u = nn.l2_normalize(math_ops.matmul(w, v))
+
+    # Update persisted approximation.
+    with ops.control_dependencies([u_var.assign(u, name='update_u')]):
+      u = array_ops.identity(u)
+
+    u = array_ops.stop_gradient(u)
+    v = array_ops.stop_gradient(v)
+
+    # Largest singular value of `w`.
+    spectral_norm = math_ops.matmul(
+        math_ops.matmul(array_ops.transpose(u), w), v)
+    spectral_norm.shape.assert_is_fully_defined()
+    spectral_norm.shape.assert_is_compatible_with([1, 1])
+
+    return spectral_norm[0][0]
+
+
+def spectral_normalize(w, power_iteration_rounds=1, name=None):
+  """Normalizes a weight matrix by its spectral norm.
+
+  Args:
+    w: The weight matrix to be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    name: An optional scope name.
+
+  Returns:
+    A normalized weight matrix tensor.
+  """
+  with variable_scope.variable_scope(name, 'spectral_normalize'):
+    w_normalized = w / compute_spectral_norm(
+        w, power_iteration_rounds=power_iteration_rounds)
+    return array_ops.reshape(w_normalized, w.get_shape())
+
+
+def spectral_norm_regularizer(scale, power_iteration_rounds=1, scope=None):
+  """Returns a functions that can be used to apply spectral norm regularization.
+
+  Small spectral norms enforce a small Lipschitz constant, which is necessary
+  for Wasserstein GANs.
+
+  Args:
+    scale: A scalar multiplier. 0.0 disables the regularizer.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    scope: An optional scope name.
+
+  Returns:
+    A function with the signature `sn(weights)` that applies spectral norm
+    regularization.
+
+  Raises:
+    ValueError: If scale is negative or if scale is not a float.
+  """
+  if isinstance(scale, numbers.Integral):
+    raise ValueError('scale cannot be an integer: %s' % scale)
+  if isinstance(scale, numbers.Real):
+    if scale < 0.0:
+      raise ValueError(
+          'Setting a scale less than 0 on a regularizer: %g' % scale)
+    if scale == 0.0:
+      logging.info('Scale of 0 disables regularizer.')
+      return lambda _: None
+
+  def sn(weights, name=None):
+    """Applies spectral norm regularization to weights."""
+    with ops.name_scope(scope, 'SpectralNormRegularizer', [weights]) as name:
+      scale_t = ops.convert_to_tensor(
+          scale, dtype=weights.dtype.base_dtype, name='scale')
+      return math_ops.multiply(
+          scale_t,
+          compute_spectral_norm(
+              weights, power_iteration_rounds=power_iteration_rounds),
+          name=name)
+
+  return sn
+
+
+def _default_name_filter(name):
+  """A filter function to identify common names of weight variables.
+
+  Args:
+    name: The variable name.
+
+  Returns:
+    Whether `name` is a standard name for a weight/kernel variables used in the
+    Keras, tf.layers, tf.contrib.layers or tf.contrib.slim libraries.
+  """
+  match = re.match(r'(.*\/)?(depthwise_|pointwise_)?(weights|kernel)$', name)
+  return match is not None
+
+
+def spectral_normalization_custom_getter(name_filter=_default_name_filter,
+                                         power_iteration_rounds=1):
+  """Custom getter that performs Spectral Normalization on a weight tensor.
+
+  Specifically it divides the weight tensor by its largest singular value. This
+  is intended to stabilize GAN training, by making the discriminator satisfy a
+  local 1-Lipschitz constraint.
+
+  Based on [Spectral Normalization for Generative Adversarial Networks][sn-gan].
+
+  [sn-gan]: https://openreview.net/forum?id=B1QRgziT-
+
+  To reproduce an SN-GAN, apply this custom_getter to every weight tensor of
+  your discriminator. The last dimension of the weight tensor must be the number
+  of output channels.
+
+  Apply this to layers by supplying this as the `custom_getter` of a
+  `tf.variable_scope`. For example:
+
+    with tf.variable_scope('discriminator',
+                           custom_getter=spectral_norm_getter()):
+      net = discriminator_fn(net)
+
+  IMPORTANT: Keras does not respect the custom_getter supplied by the
+  VariableScope, so Keras users should use `keras_spectral_normalization`
+  instead of (or in addition to) this approach.
+
+  It is important to carefully select to which weights you want to apply
+  Spectral Normalization. In general you want to normalize the kernels of
+  convolution and dense layers, but you do not want to normalize biases. You
+  also want to avoid normalizing batch normalization (and similar) variables,
+  but in general such layers play poorly with Spectral Normalization, since the
+  gamma can cancel out the normalization in other layers. By default we supply a
+  filter that matches the kernel variable names of the dense and convolution
+  layers of the tf.layers, tf.contrib.layers, tf.keras and tf.contrib.slim
+  libraries. If you are using anything else you'll need a custom `name_filter`.
+
+  This custom getter internally creates a variable used to compute the spectral
+  norm by power iteration. It will update every time the variable is accessed,
+  which means the normalized discriminator weights may change slightly whilst
+  training the generator. Whilst unusual, this matches how the paper's authors
+  implement it, and in general additional rounds of power iteration can't hurt.
+
+  Args:
+    name_filter: Optionally, a method that takes a Variable name as input and
+      returns whether this Variable should be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform per step. A higher number yeilds a better approximation of the
+      true spectral norm.
+
+  Returns:
+    A custom getter function that applies Spectral Normalization to all
+    Variables whose names match `name_filter`.
+
+  Raises:
+    ValueError: If name_filter is not callable.
+  """
+  if not callable(name_filter):
+    raise ValueError('name_filter must be callable')
+
+  def _internal_getter(getter, name, *args, **kwargs):
+    """A custom getter function that applies Spectral Normalization.
+
+    Args:
+      getter: The true getter to call.
+      name: Name of new/existing variable, in the same format as
+        tf.get_variable.
+      *args: Other positional arguments, in the same format as tf.get_variable.
+      **kwargs: Keyword arguments, in the same format as tf.get_variable.
+
+    Returns:
+      The return value of `getter(name, *args, **kwargs)`, spectrally
+      normalized.
+
+    Raises:
+      ValueError: If used incorrectly, or if `dtype` is not supported.
+    """
+    if not name_filter(name):
+      return getter(name, *args, **kwargs)
+
+    if name.endswith(_PERSISTED_U_VARIABLE_SUFFIX):
+      raise ValueError(
+          'Cannot apply Spectral Normalization to internal variables created '
+          'for Spectral Normalization. Tried to normalized variable [%s]' %
+          name)
+
+    if kwargs['dtype'] not in _OK_DTYPES_FOR_SPECTRAL_NORM:
+      raise ValueError('Disallowed data type {}'.format(kwargs['dtype']))
+
+    # This layer's weight Variable/PartitionedVariable.
+    w_tensor = getter(name, *args, **kwargs)
+
+    if len(w_tensor.get_shape()) < 2:
+      raise ValueError(
+          'Spectral norm can only be applied to multi-dimensional tensors')
+
+    return spectral_normalize(
+        w_tensor,
+        power_iteration_rounds=power_iteration_rounds,
+        name=(name + '/spectral_normalize'))
+
+  return _internal_getter
+
+
+@contextlib.contextmanager
+def keras_spectral_normalization(name_filter=_default_name_filter,
+                                 power_iteration_rounds=1):
+  """A context manager that enables Spectral Normalization for Keras.
+
+  Keras doesn't respect the `custom_getter` in the VariableScope, so this is a
+  bit of a hack to make things work.
+
+  Usage:
+    with keras_spectral_normalization():
+      net = discriminator_fn(net)
+
+  Args:
+    name_filter: Optionally, a method that takes a Variable name as input and
+      returns whether this Variable should be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform per step. A higher number yeilds a better approximation of the
+      true spectral norm.
+
+  Yields:
+    A context manager that wraps the standard Keras variable creation method
+    with the `spectral_normalization_custom_getter`.
+  """
+  original_make_variable = keras_base_layer_utils.make_variable
+  sn_getter = spectral_normalization_custom_getter(
+      name_filter=name_filter, power_iteration_rounds=power_iteration_rounds)
+
+  def make_variable_wrapper(name, *args, **kwargs):
+    return sn_getter(original_make_variable, name, *args, **kwargs)
+
+  keras_base_layer_utils.make_variable = make_variable_wrapper
+
+  yield
+
+  keras_base_layer_utils.make_variable = original_make_variable
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ea21f70ec01950cfef5e4fa851c78b219d6062f
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py
@@ -0,0 +1,354 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for features.spectral_normalization."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import slim
+from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl as spectral_normalization
+from tensorflow.contrib.layers.python.layers import layers as contrib_layers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.layers import convolutional as keras_convolutional
+from tensorflow.python.keras.layers import core as keras_core
+from tensorflow.python.layers import convolutional as layers_convolutional
+from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class SpectralNormalizationTest(test.TestCase):
+
+  def testComputeSpectralNorm(self):
+    weights = variable_scope.get_variable(
+        'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
+    weights = math_ops.multiply(weights, 10.0)
+    s = linalg_ops.svd(
+        array_ops.reshape(weights, [-1, weights.shape[-1]]), compute_uv=False)
+    true_sn = s[..., 0]
+    estimated_sn = spectral_normalization.compute_spectral_norm(weights)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      np_true_sn = sess.run(true_sn)
+      for i in range(50):
+        est = sess.run(estimated_sn)
+        if i < 1:
+          np_est_1 = est
+        if i < 4:
+          np_est_5 = est
+        if i < 9:
+          np_est_10 = est
+        np_est_50 = est
+
+      # Check that the estimate improves with more iterations.
+      self.assertAlmostEqual(np_true_sn, np_est_50, 0)
+      self.assertGreater(
+          abs(np_true_sn - np_est_10), abs(np_true_sn - np_est_50))
+      self.assertGreater(
+          abs(np_true_sn - np_est_5), abs(np_true_sn - np_est_10))
+      self.assertGreater(abs(np_true_sn - np_est_1), abs(np_true_sn - np_est_5))
+
+  def testSpectralNormalize(self):
+    weights = variable_scope.get_variable(
+        'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
+    weights = math_ops.multiply(weights, 10.0)
+    normalized_weights = spectral_normalization.spectral_normalize(
+        weights, power_iteration_rounds=1)
+
+    unnormalized_sigma = linalg_ops.svd(
+        array_ops.reshape(weights, [-1, weights.shape[-1]]),
+        compute_uv=False)[..., 0]
+    normalized_sigma = linalg_ops.svd(
+        array_ops.reshape(normalized_weights, [-1, weights.shape[-1]]),
+        compute_uv=False)[..., 0]
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      s0 = sess.run(unnormalized_sigma)
+
+      for i in range(50):
+        sigma = sess.run(normalized_sigma)
+        if i < 1:
+          s1 = sigma
+        if i < 5:
+          s5 = sigma
+        if i < 10:
+          s10 = sigma
+        s50 = sigma
+
+      self.assertAlmostEqual(1., s50, 0)
+      self.assertGreater(abs(s10 - 1.), abs(s50 - 1.))
+      self.assertGreater(abs(s5 - 1.), abs(s10 - 1.))
+      self.assertGreater(abs(s1 - 1.), abs(s5 - 1.))
+      self.assertGreater(abs(s0 - 1.), abs(s1 - 1.))
+
+  def _testLayerHelper(self, build_layer_fn, w_shape, b_shape, is_keras=False):
+    x = array_ops.placeholder(dtypes.float32, shape=[2, 10, 10, 3])
+
+    w_initial = np.random.randn(*w_shape) * 10
+    w_initializer = init_ops.constant_initializer(w_initial)
+    b_initial = np.random.randn(*b_shape)
+    b_initializer = init_ops.constant_initializer(b_initial)
+
+    if is_keras:
+      context_manager = spectral_normalization.keras_spectral_normalization()
+    else:
+      getter = spectral_normalization.spectral_normalization_custom_getter()
+      context_manager = variable_scope.variable_scope('', custom_getter=getter)
+
+    with context_manager:
+      (net,
+       expected_normalized_vars, expected_not_normalized_vars) = build_layer_fn(
+           x, w_initializer, b_initializer)
+
+    x_data = np.random.rand(*x.shape)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      # Before running a forward pass we still expect the variables values to
+      # differ from the initial value because of the normalizer.
+      w_befores = []
+      for name, var in expected_normalized_vars.items():
+        w_before = sess.run(var)
+        w_befores.append(w_before)
+        self.assertFalse(
+            np.allclose(w_initial, w_before),
+            msg=('%s appears not to be normalized. Before: %s After: %s' %
+                 (name, w_initial, w_before)))
+
+      # Not true for the unnormalized variables.
+      for name, var in expected_not_normalized_vars.items():
+        b_before = sess.run(var)
+        self.assertTrue(
+            np.allclose(b_initial, b_before),
+            msg=('%s appears to be unexpectedly normalized. '
+                 'Before: %s After: %s' % (name, b_initial, b_before)))
+
+      # Run a bunch of forward passes.
+      for _ in range(1000):
+        _ = sess.run(net, feed_dict={x: x_data})
+
+      # We expect this to have improved the estimate of the spectral norm,
+      # which should have changed the variable values and brought them close
+      # to the true Spectral Normalized values.
+      _, s, _ = np.linalg.svd(w_initial.reshape([-1, 3]))
+      exactly_normalized = w_initial / s[0]
+      for w_before, (name, var) in zip(w_befores,
+                                       expected_normalized_vars.items()):
+        w_after = sess.run(var)
+        self.assertFalse(
+            np.allclose(w_before, w_after, rtol=1e-8, atol=1e-8),
+            msg=('%s did not improve over many iterations. '
+                 'Before: %s After: %s' % (name, w_before, w_after)))
+        self.assertAllClose(
+            exactly_normalized,
+            w_after,
+            rtol=1e-4,
+            atol=1e-4,
+            msg=('Estimate of spectral norm for %s was innacurate. '
+                 'Normalized matrices do not match.'
+                 'Estimate: %s Actual: %s' % (name, w_after,
+                                              exactly_normalized)))
+
+  def testConv2D_Layers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      layer = layers_convolutional.Conv2D(
+          filters=3,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'tf.layers.Conv2d.kernel': layer.kernel}
+      expected_not_normalized_vars = {'tf.layers.Conv2d.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_ContribLayers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['CONTRIB_LAYERS_CONV2D_WEIGHTS'],
+          'biases': ['CONTRIB_LAYERS_CONV2D_BIASES']
+      }
+      net = contrib_layers.conv2d(
+          x,
+          3,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'contrib.layers.conv2d.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {
+          'contrib.layers.conv2d.bias': bias_vars[0]
+      }
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_Slim(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['SLIM_CONV2D_WEIGHTS'],
+          'biases': ['SLIM_CONV2D_BIASES']
+      }
+      net = slim.conv2d(
+          x,
+          3,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('SLIM_CONV2D_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('SLIM_CONV2D_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {'slim.conv2d.weights': weight_vars[0]}
+      expected_not_normalized_vars = {'slim.conv2d.bias': bias_vars[0]}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_Keras(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      layer = keras_convolutional.Conv2D(
+          filters=3,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'keras.layers.Conv2d.kernel': layer.kernel}
+      expected_not_normalized_vars = {'keras.layers.Conv2d.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,), is_keras=True)
+
+  def testFC_Layers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      x = layers_core.Flatten()(x)
+      layer = layers_core.Dense(
+          units=3,
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'tf.layers.Dense.kernel': layer.kernel}
+      expected_not_normalized_vars = {'tf.layers.Dense.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_ContribLayers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['CONTRIB_LAYERS_FC_WEIGHTS'],
+          'biases': ['CONTRIB_LAYERS_FC_BIASES']
+      }
+      x = contrib_layers.flatten(x)
+      net = contrib_layers.fully_connected(
+          x,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('CONTRIB_LAYERS_FC_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('CONTRIB_LAYERS_FC_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'contrib.layers.fully_connected.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {
+          'contrib.layers.fully_connected.bias': bias_vars[0]
+      }
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_Slim(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['SLIM_FC_WEIGHTS'],
+          'biases': ['SLIM_FC_BIASES']
+      }
+      x = slim.flatten(x)
+      net = slim.fully_connected(
+          x,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('SLIM_FC_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('SLIM_FC_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'slim.fully_connected.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {'slim.fully_connected.bias': bias_vars[0]}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_Keras(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      x = keras_core.Flatten()(x)
+      layer = keras_core.Dense(
+          units=3,
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'keras.layers.Dense.kernel': layer.kernel}
+      expected_not_normalized_vars = {'keras.layers.Dense.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,), is_keras=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index a0a86c6337eefa756a209635faa70db686a36247..1f1ae2df4d6def618e86aced3296ac89c836eab7 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -28,7 +28,7 @@ wasserstein_gradient_penalty
 All losses must be able to accept 1D or 2D Tensors, so as to be compatible with
 patchGAN style losses (https://arxiv.org/abs/1611.07004).
 
-To make these losses usable in the TFGAN framework, please create a tuple
+To make these losses usable in the TF-GAN framework, please create a tuple
 version of the losses with `losses_utils.py`.
 """
 
@@ -38,6 +38,7 @@ from __future__ import print_function
 
 
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -69,6 +70,10 @@ __all__ = [
 ]
 
 
+def _to_float(tensor):
+  return math_ops.cast(tensor, dtypes.float32)
+
+
 # Wasserstein losses from `Wasserstein GAN` (https://arxiv.org/abs/1701.07875).
 def wasserstein_generator_loss(
     discriminator_gen_outputs,
@@ -98,7 +103,7 @@ def wasserstein_generator_loss(
   """
   with ops.name_scope(scope, 'generator_wasserstein_loss', (
       discriminator_gen_outputs, weights)) as scope:
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
 
     loss = - discriminator_gen_outputs
     loss = losses.compute_weighted_loss(
@@ -144,8 +149,8 @@ def wasserstein_discriminator_loss(
   with ops.name_scope(scope, 'discriminator_wasserstein_loss', (
       discriminator_real_outputs, discriminator_gen_outputs, real_weights,
       generated_weights)) as scope:
-    discriminator_real_outputs = math_ops.to_float(discriminator_real_outputs)
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_real_outputs = _to_float(discriminator_real_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     discriminator_real_outputs.shape.assert_is_compatible_with(
         discriminator_gen_outputs.shape)
 
@@ -320,7 +325,7 @@ def wasserstein_gradient_penalty(
     generated_data: Output of the generator.
     generator_inputs: Exact argument to pass to the generator, which is used
       as optional conditioning to the discriminator.
-    discriminator_fn: A discriminator function that conforms to TFGAN API.
+    discriminator_fn: A discriminator function that conforms to TF-GAN API.
     discriminator_scope: If not `None`, reuse discriminators from this scope.
     epsilon: A small positive number added for numerical stability when
       computing the gradient norm.
@@ -647,7 +652,7 @@ def least_squares_generator_loss(
   """
   with ops.name_scope(scope, 'lsq_generator_loss',
                       (discriminator_gen_outputs, real_label)) as scope:
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     loss = math_ops.squared_difference(
         discriminator_gen_outputs, real_label) / 2.0
     loss = losses.compute_weighted_loss(
@@ -702,8 +707,8 @@ def least_squares_discriminator_loss(
   """
   with ops.name_scope(scope, 'lsq_discriminator_loss',
                       (discriminator_gen_outputs, real_label)) as scope:
-    discriminator_real_outputs = math_ops.to_float(discriminator_real_outputs)
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_real_outputs = _to_float(discriminator_real_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     discriminator_real_outputs.shape.assert_is_compatible_with(
         discriminator_gen_outputs.shape)
 
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index 221c70c38bd432a6be7f6cda9c6700aa2255821f..76e57df7f646547037b3461ac44f7ee5b971406c 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN utilities for loss functions that accept GANModel namedtuples.
+"""TF-GAN utilities for loss functions that accept GANModel namedtuples.
 
 The losses and penalties in this file all correspond to losses in
 `losses_impl.py`. Losses in that file take individual arguments, whereas in this
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index 969b68449d9c82f9f9144a8657cd8932b38fd0f7..73dfee4fdeec87cf0bac5eb675fd02a64a9ad7f5 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Named tuples for TFGAN.
+"""Named tuples for TF-GAN.
 
-TFGAN training occurs in four steps, and each step communicates with the next
-step via one of these named tuples. At each step, you can either use a TFGAN
+TF-GAN training occurs in four steps, and each step communicates with the next
+step via one of these named tuples. At each step, you can either use a TF-GAN
 helper function in `train.py`, or you can manually construct a tuple.
 """
 
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 4c7bee41b33ce1fee46d374ca5fd1c0b603762f9..f36a5d346e0f27fbbc480e876380db51ed559c09 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""The TFGAN project provides a lightweight GAN training/testing framework.
+"""The TF-GAN project provides a lightweight GAN training/testing framework.
 
 This file contains the core helper functions to create and train a GAN model.
 See the README or examples in `tensorflow_models` for details on how to use.
 
-TFGAN training occurs in four steps:
+TF-GAN training occurs in four steps:
 1) Create a model
 2) Add a loss
 3) Create train ops
@@ -645,9 +645,10 @@ def gan_loss(
         type(model))
 
   # Optionally create pooled model.
-  pooled_model = (
-      _tensor_pool_adjusted_model(model, tensor_pool_fn)
-      if tensor_pool_fn else model)
+  if tensor_pool_fn:
+    pooled_model = _tensor_pool_adjusted_model(model, tensor_pool_fn)
+  else:
+    pooled_model = model
 
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
@@ -665,10 +666,11 @@ def gan_loss(
   if _use_aux_loss(mutual_information_penalty_weight):
     gen_info_loss = tfgan_losses.mutual_information_penalty(
         model, add_summaries=add_summaries)
-    dis_info_loss = (
-        gen_info_loss
-        if tensor_pool_fn is None else tfgan_losses.mutual_information_penalty(
-            pooled_model, add_summaries=add_summaries))
+    if tensor_pool_fn is None:
+      dis_info_loss = gen_info_loss
+    else:
+      dis_info_loss = tfgan_losses.mutual_information_penalty(
+          pooled_model, add_summaries=add_summaries)
     gen_loss += mutual_information_penalty_weight * gen_info_loss
     dis_loss += mutual_information_penalty_weight * dis_info_loss
   if _use_aux_loss(aux_cond_generator_weight):
@@ -929,7 +931,7 @@ def gan_train_ops(
     **kwargs):
   """Returns GAN train ops.
 
-  The highest-level call in TFGAN. It is composed of functions that can also
+  The highest-level call in TF-GAN. It is composed of functions that can also
   be called, should a user require more control over some part of the GAN
   training process.
 
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index 704be917b3680a1b5712f4f1dc5059b354db8610..bf8b66dcfa5e44a03107cdf1ef8b04e1dbff4a9c 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -17,11 +17,6 @@ filegroup(
     ]),
 )
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cuda_library",
-)
-
 # For platform specific build config
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
@@ -66,7 +61,6 @@ cc_library(
         ":gdr_memory_manager",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/distributed_runtime:graph_mgr",
@@ -100,15 +94,37 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gdr_collective_executor_mgr",
+    srcs = ["gdr_collective_executor_mgr.cc"],
+    hdrs = ["gdr_collective_executor_mgr.h"],
+    deps = [
+        ":gdr_memory_manager",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:cancellable_call",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:request_id",
+        "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr",
+        "//tensorflow/core/distributed_runtime:worker_cache",
+    ],
+)
+
 cc_library(
     name = "gdr_server_lib",
     srcs = ["gdr_server_lib.cc"],
     hdrs = ["gdr_server_lib.h"],
     linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
     deps = [
+        ":gdr_collective_executor_mgr",
         ":gdr_memory_manager",
         ":gdr_rendezvous_mgr",
         ":gdr_worker",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md
index 8242d93f129904828a11b61d48f2df8fb0f88bc3..711adc865f37fc84550e4b45d9f0c7fff421a0dc 100644
--- a/tensorflow/contrib/gdr/README.md
+++ b/tensorflow/contrib/gdr/README.md
@@ -114,7 +114,16 @@ Caveats
 
 In current implementation, only tensors that reside in host memory or in GPU memory such that the GPU is adjacent to an RDMA capable NIC will use direct RDMA as its transport. When RDMA is available but not GDR, a temporary tensor copy on host memory will be used as RDMA source/destination (and copied from/to the target device). When there is no RDMA device present, it can even fallback to the original gRPC runtime. While it is theoretically possible to mix GDR enabled TF with non-GDR deployments in the same job, make sure the environment is properly setup so the GDR mode is enabled whenever possible (i.e. do not fall back to gRPC when it is not absolutely necessary).
 
-In the original design (as in the reference), tensor buffers are only registered to NIC when we could determine that the tensor will be either a source of Send or a sink of Recv across physical machine boundary. However, to implement the precise allocations, we need to change all the devices to possibly return a NIC compatible allocator. As GDR is currently in contrib, we would like to avoid the unnecessary code disruption to the TF core, so we allocate all tensors from NIC-registered buffers using a BFC allocator. This behaviour is similar to the effect of enabling the extra GPU option `force_gpu_compatible`, which allocate all host tensors in GPU-registered buffers no matter they will be transferred from/to GPUs or not.
+In the original design (as in the reference), tensor buffers are only registered
+to NIC when we could determine that the tensor will be either a source of Send
+or a sink of Recv across physical machine boundary. However, to implement the
+precise allocations, we need to change all the devices to possibly return a NIC
+compatible allocator. As GDR is currently in contrib, we would like to avoid the
+unnecessary code disruption to the TF core, so we allocate all tensors from
+NIC-registered buffers using a BFC allocator. This behavior is similar to the
+effect of enabling the extra GPU option `force_gpu_compatible`, which allocate
+all host tensors in GPU-registered buffers no matter they will be transferred
+from/to GPUs or not.
 
 Reference
 ===
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b84710d26eb8a64bf2f86b9f920551a8a8dbb233
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
@@ -0,0 +1,160 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/gdr/gdr_collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/distributed_runtime/cancellable_call.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+class WorkerCacheInterface;
+
+namespace {
+
+class RecvBufCall : public CancellableCall {
+ public:
+  RecvBufCall(int64 step_id, const string& peer_device, const string& peer_task,
+              const string& key, Device* to_device,
+              DeviceContext* to_device_ctx,
+              const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+              const DeviceLocality& client_locality,
+              const DeviceLocality& server_locality,
+              CancellationManager* cancel_mgr, WorkerCacheInterface* wc)
+      : CancellableCall(cancel_mgr, peer_task, wc) {
+    req_.set_step_id(step_id);
+    req_.set_buf_rendezvous_key(key);
+    *req_.mutable_client_locality() = client_locality;
+    *req_.mutable_server_locality() = server_locality;
+    req_.set_num_bytes(to_tensor->TotalBytes());
+    req_.set_buf_ptr(reinterpret_cast<int64>(DMAHelper::base(to_tensor)));
+    req_.set_src_device(peer_device);
+    req_.set_dst_device(to_device->name());
+    req_.set_request_id(GetUniqueRequestId());
+  }
+
+  ~RecvBufCall() override {}
+
+  void IssueCall(const StatusCallback& done) override {
+    wi_->RecvBufAsync(&opts_, &req_, &resp_, done);
+  }
+
+  RecvBufRequest req_;
+  RecvBufResponse resp_;
+};
+
+class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
+ public:
+  CollectiveRemoteAccessDistributed(const DeviceMgr* dev_mgr,
+                                    DeviceResolverInterface* dev_resolver,
+                                    WorkerCacheInterface* worker_cache,
+                                    int64 step_id,
+                                    RemoteMemoryManager* remote_memory_manager)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        worker_cache_(worker_cache),
+        remote_memory_manager_(remote_memory_manager) {}
+
+  ~CollectiveRemoteAccessDistributed() override {}
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    int dev_to_dev_stream_index,
+                    const StatusCallback& done) override {
+    if (peer_is_local) {
+      CollectiveRemoteAccessLocal::RecvFromPeer(
+          peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+          to_alloc_attr, to_tensor, client_locality, dev_to_dev_stream_index,
+          done);
+      return;
+    }
+
+    // State that needs to be threaded through a couple of async calls
+    // in order to make this function completely non-blocking.
+    struct State {
+      DeviceLocality server_locality;
+      std::unique_ptr<RecvBufCall> call;
+    };
+    State* state = new State;
+
+    // Logic to be executed on the RecvBufAsync callback.
+    auto recv_buf_callback = [this, state, peer_task, to_device, to_alloc_attr,
+                              to_device_ctx, to_tensor, dev_to_dev_stream_index,
+                              done](const Status& s) {
+      if (s.ok()) {
+        remote_memory_manager_->TensorFromTransportOptions(
+            to_tensor, state->call->resp_.transport_options(), to_device,
+            to_device_ctx, to_alloc_attr.on_host(), done);
+      }
+      if (!s.ok() && errors::IsFailedPrecondition(s)) {
+        dev_resolver_->ClearTask(peer_task);
+      }
+
+      delete state;
+    };
+
+    // Logic to execute once we have the device locality for the server-side
+    // device.
+    auto dev_locality_callback = [this, state, peer_device, peer_task, key,
+                                  to_device, to_device_ctx, to_alloc_attr,
+                                  to_tensor, client_locality,
+                                  recv_buf_callback](const Status& s) {
+      if (!s.ok()) {
+        recv_buf_callback(s);
+      } else {
+        state->call.reset(new RecvBufCall(
+            step_id_, peer_device, peer_task, key, to_device, to_device_ctx,
+            to_alloc_attr, to_tensor, client_locality, state->server_locality,
+            &cancel_mgr_, worker_cache_));
+        state->call->Start(recv_buf_callback);
+      }
+    };
+
+    dev_resolver_->GetLocalityAsync(
+        peer_device, peer_task, &state->server_locality, dev_locality_callback);
+  }
+
+  void StartAbort(const Status& s) override {
+    CollectiveRemoteAccessLocal::StartAbort(s);
+    cancel_mgr_.StartCancel();
+  }
+
+ protected:
+  WorkerCacheInterface* worker_cache_;  // Not owned
+  CancellationManager cancel_mgr_;
+  RemoteMemoryManager* remote_memory_manager_;
+};
+
+}  // namespace
+
+CollectiveExecutor* GdrCollectiveExecutorMgr::Create(int64 step_id) {
+  CollectiveRemoteAccessDistributed* rma =
+      new CollectiveRemoteAccessDistributed(dev_mgr_, dev_resolver_.get(),
+                                            worker_cache_, step_id,
+                                            remote_memory_manager_);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_,
+                                    &gpu_ring_order_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..1417e51e82c31035f058e8e9b546e04fb0ad97b8
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class ConfigProto;
+class DeviceMgr;
+class WorkerCacheInterface;
+class StepSequenceRequest;
+class StepSequenceResponse;
+
+// An implementation of CollectiveExecutorMgr for a distributed environment
+// that uses WorkerInterface::RecvBufAsync to route data transfers over RDMA.
+class GdrCollectiveExecutorMgr : public RpcCollectiveExecutorMgr {
+ public:
+  GdrCollectiveExecutorMgr(
+      const ConfigProto& config, const DeviceMgr* dev_mgr,
+      std::unique_ptr<DeviceResolverDistributed> dev_resolver,
+      std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
+      WorkerCacheInterface* worker_cache, const string& task_name,
+      RemoteMemoryManager* remote_memory_manager)
+      : RpcCollectiveExecutorMgr(config, dev_mgr, std::move(dev_resolver),
+                                 std::move(param_resolver), worker_cache,
+                                 task_name),
+        remote_memory_manager_(remote_memory_manager) {}
+
+  ~GdrCollectiveExecutorMgr() override {}
+
+ protected:
+  virtual CollectiveExecutor* Create(int64 step_id) override;
+
+ private:
+  RemoteMemoryManager* remote_memory_manager_;  // Not owned.
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index ce1875151597f926aeb6392e7fc8307312da123f..7321e973191c4cc45f88735c6be7f2f67fe71c39 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -73,7 +73,10 @@ int TryToReadNumaNode(ibv_device* device) {
 
   std::ifstream ifs(filename.c_str());
   string content;
-  CHECK(std::getline(ifs, content));
+  const auto& ret = std::getline(ifs, content);
+  if (!ret) {
+    return port::kNUMANoAffinity;
+  }
 
   int32 value;
   if (strings::safe_strto32(content, &value)) {
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index dc0d5d548b80d36409778ef34e63171441f10142..c39cc0f9bcecc26aedfaf9707113210acf670244 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/contrib/gdr/gdr_server_lib.h"
 
 #include "grpc/support/alloc.h"
+#include "tensorflow/contrib/gdr/gdr_collective_executor_mgr.h"
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 #include "tensorflow/contrib/gdr/gdr_rendezvous_mgr.h"
 #include "tensorflow/contrib/gdr/gdr_worker.h"
-
-#include "grpc/support/alloc.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 
 namespace tensorflow {
 
@@ -57,10 +59,34 @@ Status GdrServer::Init() {
     return std::unique_ptr<GdrWorker>(
         new GdrWorker(env, config, remote_memory_manager_.get()));
   };
-
+  CollectiveMgrCreationFunction collective_mgr_func =
+      [this](const ConfigProto& config, const WorkerEnv* env,
+             WorkerCacheInterface* worker_cache) {
+        string unused;
+        string default_worker_name;
+        DeviceNameUtils::SplitDeviceName(
+            env->device_mgr->ListDevices()[0]->name(), &default_worker_name,
+            &unused);
+
+        std::unique_ptr<DeviceResolverDistributed> dev_resolver(
+            new DeviceResolverDistributed(env->device_mgr, worker_cache,
+                                          default_worker_name));
+        std::unique_ptr<CollectiveParamResolverDistributed> param_resolver(
+            new CollectiveParamResolverDistributed(
+                config, env->device_mgr, dev_resolver.get(), worker_cache,
+                default_worker_name));
+        return new GdrCollectiveExecutorMgr(
+            config, env->device_mgr, std::move(dev_resolver),
+            std::move(param_resolver), worker_cache, default_worker_name,
+            remote_memory_manager_.get());
+      };
   TF_RETURN_IF_ERROR(remote_memory_manager_->Init());
 
-  return GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func);
+  GrpcServerOptions opts;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  opts.collective_mgr_func = collective_mgr_func;
+  opts.worker_func = worker_func;
+  return GrpcServer::Init(opts);
 }
 
 Status GdrServer::Start() {
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 016e5ea27b397830c69b6e1761b5994ebcfa9c3d..1204b8ca501a8f99ea6abd6c047ab2d91350bae1 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/gdr/gdr_worker.h"
 
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
@@ -40,13 +42,13 @@ GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config,
                      RemoteMemoryManager* remote_memory_manager)
     : GrpcWorker(worker_env, config),
       remote_memory_manager_(remote_memory_manager),
-      recv_tensor_recent_request_ids_(100000) {}
+      recent_request_ids_(100000) {}
 
 void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
                                     const RecvTensorRequest* request,
                                     ::grpc::ByteBuffer* response,
                                     StatusCallback done) {
-  Status s = recv_tensor_recent_request_ids_.TrackUnique(
+  Status s = recent_request_ids_.TrackUnique(
       request->request_id(), "RecvTensor (GdrWorker)", *request);
   if (!s.ok()) {
     done(s);
@@ -145,4 +147,41 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GdrWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                             RecvBufResponse* response, StatusCallback done) {
+  // This is an RDMA enabled implementation augmenting grpc.
+  Status s = recent_request_ids_.TrackUnique(request->request_id(),
+                                             "RecvBuf (GdrWorker)", *request);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  CollectiveExecutor::Handle ce_handle(
+      env_->collective_executor_mgr->FindOrCreate(request->step_id()), true);
+  CollectiveRemoteAccess* rma = ce_handle.get()->remote_access();
+  rma->buf_rendezvous()->ConsumeBuf(
+      request->buf_rendezvous_key(),
+      [this, request, response, done](const Status& status,
+                                      BufRendezvous::Hook* hook) {
+        Status s = status;
+        if (s.ok()) {
+          if (!DMAHelper::CanUseDMA(hook->prod_value)) {
+            s = errors::Internal("Tensor value for key ",
+                                 request->buf_rendezvous_key(),
+                                 " is not of a type supported by RecvBuf");
+          }
+        }
+        if (s.ok()) {
+          remote_memory_manager_->TransportOptionsFromTensor(
+              response->mutable_transport_options(), *hook->prod_value,
+              hook->prod_dev, hook->prod_ctx, hook->prod_attr.on_host(),
+              [this, response, done, hook](const Status& s) {
+                response->set_send_start_micros(env_->env->NowMicros());
+                done(s);
+                BufRendezvous::DoneWithHook(hook);
+              });
+        }
+      });
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index 39f11e6bde5a1ca7ae91ead02279d22d70af027b..9a85cfd4263ad86f6579eedce95969c2829ff62c 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -38,9 +38,13 @@ class GdrWorker : public GrpcWorker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done) override;
 
+  virtual void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                            RecvBufResponse* response,
+                            StatusCallback done) override;
+
  private:
   RemoteMemoryManager* remote_memory_manager_;  // Not owned
-  RecentRequestIds recv_tensor_recent_request_ids_;
+  RecentRequestIds recent_request_ids_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 0081fb61770075a2c36e92f65e01126f657edeb4..d319aa7986d81cf9ac2d1dc2e15b053a0aa0c31b 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -16,9 +16,22 @@ tf_cc_binary(
     srcs = ["hvx_ops_support_checker_main.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:candidate_sampling_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:manip_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:string_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
+        "//tensorflow/core:user_ops_op_lib",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//tensorflow/core/kernels/hexagon:graph_transferer",
         "//tensorflow/tools/graph_transforms:file_utils",
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 5a8c650fb927be0c835aaceffc516c048195c7bf..c1f6cac4942436d32f9867d4b5557c6b9e376c69 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -30,7 +30,8 @@ system based on Apache Ignite.
 
 ## Features
 
-Ignite Dataset provides features that that you can use in a wide range of cases. The most important and interesting features are described below.
+Ignite Dataset provides features that you can use in a wide range of cases. The
+most important and interesting features are described below.
 
 ### Distributed In-Memory Datasource
 [Apache Ignite](https://ignite.apache.org/) is a distributed in-memory database, caching, and processing platform that provides fast data access. It allows you to avoid limitations of hard drive and store and operate with as much data as you need in distributed cluster. You can utilize
@@ -97,6 +98,7 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
+>>> tf.enable_eager_execution()
 >>>
 >>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
 >>>
@@ -116,7 +118,15 @@ Using this ability we can calculate gradients on the nodes the data is stored on
 
 Apache Ignite uses horizontal partitioning to store data in distributed cluster. When we create Apache Ignite cache (or table in terms of SQL), we can specify the number of partitions the data will be partitioned on. For example, if an Apache Ignite cluster consists of 10 machines and we create cache with 10 partitions, then every machine will maintain approximately one data partition.
 
-Ignite Dataset allows using these two aspects of distributed neural network training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a computation graph operation that can be performed on a remote worker. The remote worker can override Ignite Dataset parameters (such as `host`, `port` or `part`) by setting correstondent environment variables for worker process (such as `IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using this overriding approach, we can assign a specific partition to every worker so that one worker handles one partition and, at the same time, transparently work with single dataset.
+Ignite Dataset allows using these two aspects of distributed neural network
+training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a
+computation graph operation that can be performed on a remote worker. The remote
+worker can override Ignite Dataset parameters (such as `host`, `port` or `part`)
+by setting correspondent environment variables for worker process (such as
+`IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using
+this overriding approach, we can assign a specific partition to every worker so
+that one worker handles one partition and, at the same time, transparently work
+with single dataset.
 
 ```python
 >>> import tensorflow as tf
@@ -149,23 +159,31 @@ system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS
 delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in
 addition to its own APIs, IGFS implements Hadoop FileSystem API and can be
 transparently plugged into Hadoop or Spark deployments. This contrib package
-contains an integration between IGFS and TensorFlow. The integration is based
-on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys)
-from TensorFlow side and
+contains an integration between IGFS and TensorFlow. The integration is based on
+[custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) from
+TensorFlow side and
 [IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache
-Ignite side. It has numerous uses, for example: * Checkpoints of state can be
-saved to IGFS for reliability and fault-tolerance. * Training processes
-communicate with TensorBoard by writing event files to a directory, which
-TensorBoard watches. IGFS allows this communication to work even when
-TensorBoard runs in a different process or machine.
+Ignite side. It has numerous uses, for example:
+
+*   Checkpoints of state can be saved to IGFS for reliability and
+    fault-tolerance.
+*   Training processes communicate with TensorBoard by writing event files to a
+    directory, which TensorBoard watches. IGFS allows this communication to work
+    even when TensorBoard runs in a different process or machine.
 
 ### SSL Connection
 
-Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
+Apache Ignite allows to protect data transfer channels by
+[SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and
+authentication. Ignite Dataset supports both SSL connection with and without
+authentication. For more information, please refer to the
+[Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls)
+documentation.
 
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
+>>> tf.enable_eager_execution()
 >>>
 >>> dataset = IgniteDataset(cache_name="IMAGES",
                             certfile="client.pem",
@@ -186,7 +204,7 @@ Following examples will help you to easily start working with this module.
 
 The simplest way to try Ignite Dataset is to run a
 [Docker](https://www.docker.com/) container with Apache Ignite and loaded
-[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with
+[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interrupt with
 it using Ignite Dataset. Such container is available on Docker Hub:
 [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/).
 You need to start this container on your machine:
@@ -197,13 +215,13 @@ docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
 
 After that you will be able to work with it following way:
 
-![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
+![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist-2.png "Ignite Dataset Mnist")
 
 ### IGFS
 
 The simplest way to try IGFS with TensorFlow is to run
 [Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS
-and then interruct with it using TensorFlow
+and then interrupt with it using TensorFlow
 [tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container
 is available on Docker Hub:
 [dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/).
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 66e654ca636a5a051c6f9cd35bf9001dfbcbf7f4..3ffceef8070e0fc3b3cebae2522f89fe98ce4413 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -735,8 +735,6 @@ class IgniteDataset(dataset_ops.DatasetSource):
       cert_password: Password to be used if the private key is encrypted and a
         password is necessary.
     """
-    super(IgniteDataset, self).__init__()
-
     with IgniteClient(host, port, username, password, certfile, keyfile,
                       cert_password) as client:
       client.handshake()
@@ -760,6 +758,8 @@ class IgniteDataset(dataset_ops.DatasetSource):
         self.cache_type.to_output_types(), self.cache_type.to_output_shapes(),
         self.cache_type.to_output_classes())
 
+    super(IgniteDataset, self).__init__(self._as_variant_tensor())
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host, self.port,
                                           self.local, self.part, self.page_size,
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index ff5d4c458c859fd8e5e3ae65ee41a454d55d6538..89b74fbfdc38c9f42795d5c778889210baf6387f 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -19,9 +19,9 @@ from __future__ import print_function
 
 import os
 
+from tensorflow import compat
 from tensorflow.contrib.ignite import IgniteDataset
 from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -66,7 +66,7 @@ class IgniteDatasetTest(test.TestCase):
     self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"])
     self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"])
 
-    it = dataset_ops.make_one_shot_iterator(dataset)
+    it = compat.v1.data.make_one_shot_iterator(dataset)
     ne = it.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
old mode 100644
new mode 100755
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index b399e1b6c2ac47db205b5d8bbc81875ef5c08a31..5591c3b0cc8c8bf196bb4821c018cbf155cba4ce 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -52,7 +52,6 @@ class KafkaDataset(dataset_ops.DatasetSource):
       timeout: The timeout value for the Kafka Consumer to wait
                (in millisecond).
     """
-    super(KafkaDataset, self).__init__()
     self._topics = ops.convert_to_tensor(
         topics, dtype=dtypes.string, name="topics")
     self._servers = ops.convert_to_tensor(
@@ -63,6 +62,8 @@ class KafkaDataset(dataset_ops.DatasetSource):
     self._timeout = ops.convert_to_tensor(
         timeout, dtype=dtypes.int64, name="timeout")
 
+    super(KafkaDataset, self).__init__(self._as_variant_tensor())
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.kafka_dataset(self._topics, self._servers,
                                          self._group, self._eof, self._timeout)
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index 4ef0a66a52429233c6e6f70667a451466493629c..294a7d69a704b3c06ab9e30489af116929ab6c2a 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -34,7 +34,7 @@ def sparse_multiclass_hinge_loss(
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Adds Ops for computing the multiclass hinge loss.
+  r"""Adds Ops for computing the multiclass hinge loss.
 
   The implementation is based on the following paper:
   On the Algorithmic Implementation of Multiclass Kernel-based Vector Machines
diff --git a/tensorflow/contrib/kfac/README.md b/tensorflow/contrib/kfac/README.md
index 42b91d031375b8edb7e4f364ac91ffb74ef1f54b..19daffea6c7e4486499388314d0aaaa611e94218 100644
--- a/tensorflow/contrib/kfac/README.md
+++ b/tensorflow/contrib/kfac/README.md
@@ -1,3 +1,3 @@
 # K-FAC: Kronecker-Factored Approximate Curvature
 
-## KFAC moved to third_party/tensorflow_kfac.
+## KFAC moved to https://github.com/tensorflow/kfac.
diff --git a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
index 2b1d478a9b0fd12ca25c72da6872acccfd7285fc..9479afb180df7bb4a08d6aafa4fc3bf63489d9f3 100644
--- a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
+++ b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
@@ -71,7 +71,6 @@ class KinesisDataset(dataset_ops.DatasetSource):
       interval: The interval for the Kinesis Client to wait before
         it tries to get records again (in millisecond).
     """
-    super(KinesisDataset, self).__init__()
     self._stream = ops.convert_to_tensor(
         stream, dtype=dtypes.string, name="stream")
     self._shard = ops.convert_to_tensor(
@@ -80,6 +79,7 @@ class KinesisDataset(dataset_ops.DatasetSource):
         read_indefinitely, dtype=dtypes.bool, name="read_indefinitely")
     self._interval = ops.convert_to_tensor(
         interval, dtype=dtypes.int64, name="interval")
+    super(KinesisDataset, self).__init__(self._as_variant_tensor())
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.kinesis_dataset(
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 9ca6f8df5dbe3c236c4cd85095176ce69ad9deaa..69d5496f8aebb9b89c5d79f80a1a439f556093d7 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -81,6 +81,7 @@ tf_custom_op_py_library(
     visibility = [
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//tensorflow_model_optimization:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 7e6eafaa0d6f60cfc28a4c422abac0b6d5a991fb..00e41026d0038409ace178e6affd2c1cdc812122 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -1757,7 +1757,7 @@ class WeightedSumTest(test.TestCase):
       logits_core = fc_core.linear_model(features, [movies])
 
       with self.cached_session() as sess:
-        variables_lib.initialize_all_variables().run()
+        variables_lib.global_variables_initializer().run()
         lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index d791418c9d0f887058ceb535092fa8122da1aa75..1c0088186c030437454c0f764decab9e5a276adc 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1356,7 +1356,7 @@ class DropoutTest(test.TestCase):
     with self.cached_session():
       images = np.random.uniform(size=(5, height, width, 3))
       output = _layers.dropout(images)
-      self.assertEqual(output.op.name, 'Dropout/dropout_1/mul')
+      self.assertEqual(output.op.name, 'Dropout/dropout_1/mul_1')
       output.get_shape().assert_is_compatible_with(
           ops.convert_to_tensor(images).get_shape())
 
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index 11033a2e9cb646c2e7cd2f45de1f751d88c6921a..76b03ff514821d3459f84c5f46a64d1134e0d4de 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -186,7 +186,7 @@ def group_norm(inputs,
 
   Args:
     inputs: A Tensor with at least 2 dimensions one which is channels. All
-     shape dimensions must be fully defined.
+     shape dimensions except for batch must be fully defined.
     groups: Integer. Divide the channels into this number of groups over which
       normalization statistics are computed. This number must be commensurate
       with the number of channels in `inputs`.
@@ -249,13 +249,21 @@ def group_norm(inputs,
   """
   # TODO(shlens): Support partially defined shapes for the inputs.
   inputs = ops.convert_to_tensor(inputs)
-  original_shape = inputs.shape
 
   if inputs.shape.ndims is None:
     raise ValueError('Inputs %s has undefined rank.' % inputs.name)
   if channels_axis > (inputs.shape.ndims - 1):
     raise ValueError('Axis is out of bounds.')
 
+  # Use dynamic shape for not fully defined dimensions in the inputs.
+  dyanmic_shape = array_ops.shape(inputs)
+  input_shape_list = []
+  for i, dim in enumerate(inputs.shape):
+    if dim.value is None:
+      input_shape_list.append(dyanmic_shape[i])
+    else:
+      input_shape_list.append(dim)
+
   # Standardize the channels_axis to be positive and identify # of channels.
   if channels_axis < 0:
     channels_axis = inputs.shape.ndims + channels_axis
@@ -289,8 +297,8 @@ def group_norm(inputs,
   # Determine axes before channels. Some examples of common image formats:
   #  'NCHW': before = [N], after = [HW]
   #  'NHWC': before = [NHW], after = []
-  axes_before_channels = inputs.shape.as_list()[:channels_axis]
-  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+  axes_before_channels = input_shape_list[:channels_axis]
+  axes_after_channels = input_shape_list[channels_axis+1:]
 
   # Manually broadcast the parameters to conform to the number of groups.
   params_shape_broadcast = ([1] * len(axes_before_channels) +
@@ -369,7 +377,7 @@ def group_norm(inputs,
     outputs = inputs * gain + offset
 
     # Collapse the groups into the channel dimension.
-    outputs = array_ops.reshape(outputs, original_shape)
+    outputs = array_ops.reshape(outputs, input_shape_list)
 
     if activation_fn is not None:
       outputs = activation_fn(outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index c8d3c91b10dbe3b959e91182f9924b78352d370d..9a85084b239837ade87d8c778393ef8e885f5bdd 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -221,6 +221,15 @@ class GroupNormTest(test.TestCase):
       normalization.group_norm(inputs, channels_axis=-1,
                                reduction_axes=[-3, -2])
 
+  def testParamsShapeNotFullyDefinedBatchAxis(self):
+    height, width, groups = 3, 3, 4
+    inputs = array_ops.placeholder(dtypes.float32,
+                                   shape=(None, height, width, 2*groups))
+    output = normalization.group_norm(inputs, channels_axis=-1,
+                                      reduction_axes=[-3, -2], groups=groups)
+    self.assertListEqual([None, height, width, 2 * groups],
+                         output.shape.as_list())
+
   def testCreateOp(self):
     height, width, groups = 3, 3, 4
     images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 8a6b4f68a8b33d497ddb16614a7e3cdf32f2c422..5234869718b427d7e275b76ae12021a096241a56 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -399,7 +399,7 @@ def _mean_squared_loss(logits, target):
     target = array_ops.expand_dims(target, axis=1)
 
   logits.get_shape().assert_is_compatible_with(target.get_shape())
-  return math_ops.square(logits - math_ops.to_float(target))
+  return math_ops.squared_difference(logits, math_ops.to_float(target))
 
 
 def _log_loss_with_two_classes(logits, target):
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 14065fcee51c014a1af227504eaaca1fa39941e1..4749371248ee89a033912132986d7f76c85dbaa6 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -357,9 +357,9 @@ py_test(
 
 py_test(
     name = "dnn_linear_combined_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/learn/estimators/dnn_linear_combined_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["no_oss"],  # flaky b/70524820
     deps = [
diff --git a/tensorflow/contrib/learn/README.md b/tensorflow/contrib/learn/README.md
index b0bff915a993c9a01e2e6d9ef9f71c14d2f29a73..b2d3a6273abba7e3a893f30bbdd4f8b2662bd54a 100644
--- a/tensorflow/contrib/learn/README.md
+++ b/tensorflow/contrib/learn/README.md
@@ -111,18 +111,17 @@ Some arguments are renamed, please refer to documentation. In addition:
 
 Switch to `tf.estimator.train_and_evaluate`. Some differences:
 
-* Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
-  should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
-* Remove the `experiment_fn`. Instead, create the `Estimator`,
-  `train_spec` and `eval_spec`, then call `tf.estimator.train_and_evaluate`
-  directly.
-* Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement
-  for `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
-  replacement for `tf.contrib.learn.make_export_strategy`. If you want to export
-  only at the end of training  use `tf.estimator.FinalExporter`.
-* If the `TF_CONFIG` environment variable is constructed manually, please read
-  the `train_and_evaluate` documentation for the new requirementds (in
-  particular, the chief node and evaluator node).
+*   Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
+    should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
+*   Remove the `experiment_fn`. Instead, create the `Estimator`, `train_spec`
+    and `eval_spec`, then call `tf.estimator.train_and_evaluate` directly.
+*   Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement for
+    `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
+    replacement for `tf.contrib.learn.make_export_strategy`. If you want to
+    export only at the end of training use `tf.estimator.FinalExporter`.
+*   If the `TF_CONFIG` environment variable is constructed manually, please read
+    the `train_and_evaluate` documentation for the new requirements (in
+    particular, the chief node and evaluator node).
 
 ## Others Classes and Functions
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index c1b97d8b49613ea49d9813954da3b7a63d3ba04c..4bb14a6e63b159fa4d09c9ef20947d4b125de657 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -567,7 +567,8 @@ def _mean_squared_loss(labels, logits, weights=None):
     if len(logits.get_shape()) == 1:
       logits = array_ops.expand_dims(logits, axis=1)
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
-    loss = math_ops.square(logits - math_ops.to_float(labels), name=name)
+    loss = math_ops.squared_difference(
+        logits, math_ops.to_float(labels), name=name)
     return _compute_weighted_loss(loss, weights)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
index 5e90d1fa20535de3b5e25bc7ff8c3862cea5514c..318046733bf75a6d661d26f478118c8e944afe15 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
@@ -174,7 +174,7 @@ class GeneratorIoTest(test.TestCase):
       return np.arange(32, 36)
 
     with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
+      with self.assertRaisesRegexp(TypeError, r'x\(\) must be generator'):
         failing_input_fn = generator_io.generator_input_fn(
             generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
@@ -185,7 +185,7 @@ class GeneratorIoTest(test.TestCase):
       yield np.arange(32, 36)
 
     with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
+      with self.assertRaisesRegexp(TypeError, r'x\(\) must yield dict'):
         failing_input_fn = generator_io.generator_input_fn(
             generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index e7d091e18a8f186f89f5217442c24fb106c5cdab..af93e517f51ed33a8968982945ac1f65ec915ab1 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -36,10 +36,10 @@ def _create_parser(base_dir):
     # Modify the path object for RegEx match for Windows Paths
     if os.name == "nt":
       match = re.match(
-          "^" + compat.as_str_any(base_dir).replace("\\", "/") + "/(\\d+)$",
+          r"^" + compat.as_str_any(base_dir).replace("\\", "/") + r"/(\d+)$",
           compat.as_str_any(path.path).replace("\\", "/"))
     else:
-      match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
+      match = re.match(r"^" + compat.as_str_any(base_dir) + r"/(\d+)$",
                        compat.as_str_any(path.path))
     if not match:
       return None
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index 229a72a780d5ccce8263444ffeae7700f6ac8613..c2916b82a1cefc4615547e77fdd6f4dd48d2a600 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -28,7 +28,6 @@ from tensorflow.python.ops import lookup_ops
 # pylint: disable=unused-import
 from tensorflow.python.ops.lookup_ops import FastHashSpec
 from tensorflow.python.ops.lookup_ops import HasherSpec
-from tensorflow.python.ops.lookup_ops import HashTable
 from tensorflow.python.ops.lookup_ops import IdTableWithHashBuckets
 from tensorflow.python.ops.lookup_ops import index_table_from_file
 from tensorflow.python.ops.lookup_ops import index_to_string_table_from_file
@@ -288,6 +287,83 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   return table.lookup(tensor)
 
 
+class HashTable(InitializableLookupTableBase):
+  """A generic hash table implementation.
+
+  Example usage:
+
+  ```python
+  table = tf.HashTable(
+      tf.KeyValueTensorInitializer(keys, values), -1)
+  out = table.lookup(input_tensor)
+  table.init.run()
+  print(out.eval())
+  ```
+  """
+
+  def __init__(self, initializer, default_value, shared_name=None, name=None):
+    """Creates a non-initialized `HashTable` object.
+
+    Creates a table, the type of its keys and values are specified by the
+    initializer.
+    Before using the table you will have to initialize it. After initialization
+    the table will be immutable.
+
+    Args:
+      initializer: The table initializer to use. See `HashTable` kernel for
+        supported key and value types.
+      default_value: The value to use if a key is missing in the table.
+      shared_name: If non-empty, this table will be shared under the given name
+        across multiple sessions.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `HashTable` object.
+    """
+    self._initializer = initializer
+    self._default_value = default_value
+    self._shared_name = shared_name
+    self._name = name or "hash_table"
+    self._table_name = None
+    super(HashTable, self).__init__(default_value, initializer)
+    self._value_shape = self._default_value.get_shape()
+
+  def create_resource(self):
+    table_ref = gen_lookup_ops.hash_table_v2(
+        shared_name=self._shared_name,
+        key_dtype=self._initializer.key_dtype,
+        value_dtype=self._initializer.value_dtype,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensors containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_Export" % self.name,
+                        [self.resource_handle]) as name:
+      exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+          self.resource_handle, self._key_dtype, self._value_dtype, name=name)
+
+    exported_values.set_shape(exported_keys.get_shape().concatenate(
+        self._value_shape))
+    return exported_keys, exported_values
+
+
 class MutableHashTable(LookupInterface):
   """A generic mutable hash table implementation.
 
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 709a042bbcefb89125f7e4cd14a0d7ecd2b53281..5ebdd0b8b50063c99e6b747c594eb99c306b4efb 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -511,7 +511,7 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     predictions = math_ops.to_float(predictions)
     labels = math_ops.to_float(labels)
-    losses = math_ops.square(math_ops.subtract(predictions, labels))
+    losses = math_ops.squared_difference(predictions, labels)
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index de76acb51ffe985162a66c617b266f47c5216b19..f3b0e77740ff1d940fcd6d00b3482e90f6ebf952 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -105,7 +105,8 @@ def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
   # Get per pair distances
   distances = math_ops.sqrt(
       math_ops.reduce_sum(
-          math_ops.square(embeddings_anchor - embeddings_positive), 1))
+          math_ops.squared_difference(embeddings_anchor, embeddings_positive),
+          1))
 
   # Add contrastive loss for the siamese network.
   #   label here is {0,1} for neg, pos.
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index 87c73ec1ca610cac6d63468887bc350bada5910b..8330c45cc16ffa536107e25699379bb5d9e8993b 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -36,6 +36,7 @@ tensorflow/core/protobuf/queue_runner.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
 tensorflow/core/protobuf/saver.pb.cc
 tensorflow/core/protobuf/tensorflow_server.pb.cc
+tensorflow/core/protobuf/verifier_config.pb.cc
 tensorflow/core/util/event.pb.cc
 tensorflow/core/util/memmapped_file_system.pb.cc
 tensorflow/core/util/saved_tensor_slice.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index 4120ea52ec5255b1efce7a6ce6890fc79c1e4831..7257ac8feedfb8ed18c4d691cd85766e70a48ae8 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -37,6 +37,7 @@ tensorflow/core/protobuf/rewriter_config.pb.h
 tensorflow/core/protobuf/saver.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/protobuf/tensorflow_server.pb.h
+tensorflow/core/protobuf/verifier_config.pb.h
 tensorflow/core/util/event.pb.h
 tensorflow/core/util/memmapped_file_system.pb.h
 tensorflow/core/util/saved_tensor_slice.pb.h
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 655c7eefcb978d40c8bc16a23685e03ed71bfb63..2cd7d6d519a55423a96526b541845392d9ec6bc2 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -119,6 +119,7 @@ tensorflow/core/kernels/fake_quant_ops.cc
 tensorflow/core/kernels/fifo_queue.cc
 tensorflow/core/kernels/fifo_queue_op.cc
 tensorflow/core/kernels/fill_functor.cc
+tensorflow/core/kernels/fft_ops.cc
 tensorflow/core/kernels/function_ops.cc
 tensorflow/core/kernels/fused_batch_norm_op.cc
 tensorflow/core/kernels/gather_functor.cc
diff --git a/tensorflow/contrib/makefile/tf_pb_text_files.txt b/tensorflow/contrib/makefile/tf_pb_text_files.txt
index f94d70db9046cec43073ab1406762aea1f28c8e3..13e3b6422d1989b0d499d8d20901d919554c630e 100644
--- a/tensorflow/contrib/makefile/tf_pb_text_files.txt
+++ b/tensorflow/contrib/makefile/tf_pb_text_files.txt
@@ -29,5 +29,6 @@ tensorflow/core/protobuf/debug.pb_text.cc
 tensorflow/core/protobuf/rewriter_config.pb_text.cc
 tensorflow/core/protobuf/saver.pb_text.cc
 tensorflow/core/protobuf/tensor_bundle.pb_text.cc
+tensorflow/core/protobuf/verifier_config.pb_text.cc
 tensorflow/core/util/memmapped_file_system.pb_text.cc
 tensorflow/core/util/saved_tensor_slice.pb_text.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 2712e906d719e72dacb60e213205ad68895f905f..24d86d313b76343ed9450a33cf185d9c426696bb 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -43,6 +43,7 @@ tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/saver.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/protobuf/tensorflow_server.proto
+tensorflow/core/protobuf/verifier_config.proto
 tensorflow/core/util/event.proto
 tensorflow/core/util/memmapped_file_system.proto
 tensorflow/core/util/saved_tensor_slice.proto
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 7b432f8bd20989c6d95310bcaca88d44ce3e0d1f..ece246b7c28569a551f7733daf16ee1507f9c95d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1356,9 +1356,8 @@ def _compute_placement_auc(labels, predictions, weights, alpha,
           weights_0 * math_ops.square(1. - placement_values_0 - auc_0)) /
       (total_0 - 1. + _EPSILON))
   var_1 = (
-      math_ops.reduce_sum(
-          weights_1 * math_ops.square(placement_values_1 - auc_1)) /
-      (total_1 - 1. + _EPSILON))
+      math_ops.reduce_sum(weights_1 * math_ops.squared_difference(
+          placement_values_1, auc_1)) / (total_1 - 1. + _EPSILON))
   auc_std_err = math_ops.sqrt(
       (var_0 / (total_0 + _EPSILON)) + (var_1 / (total_1 + _EPSILON)))
 
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index ecac06354d2ce796f2a6021cdf2370d7c30ccab7..a7be92a35e0d62a61f7923ac61bb2c1267d039c6 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -52,7 +52,6 @@ tf_custom_op_library(
     deps = [
         ":mpi_defines",
         ":mpi_message_proto_cc",
-        "//tensorflow/stream_executor:stream_executor_headers_lib",
         "//third_party/mpi",
     ],
 )
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 0446e823d95f8ecbed6a0c34a83ade009e68448b..12320d9e456ae93cbf95639a0c9e0c7f414f3518 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -319,6 +319,9 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
index 3fb649ea82e79b3bc78a2da6d5c3e9a071adec6d..0b149ed17533adff3bd7cd8fd8ff94d171f72911 100644
--- a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Adam rewrite to use global step for computing beta1 & beta2 accumulation."""
 from __future__ import absolute_import
 from __future__ import division
@@ -38,10 +37,15 @@ class AdamGSOptimizer(optimizer.Optimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
-  def __init__(self, global_step=0, learning_rate=0.001,
-               beta1=0.9, beta2=0.999, epsilon=1e-8,
-               use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+  def __init__(self,
+               global_step=0,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name="Adam"):
+    r"""Construct a new Adam optimizer.
 
     Branched from tf.train.AdamOptimizer. The only difference is to pass
     global step for computing beta1 and beta2 accumulators, instead of having
@@ -83,23 +87,20 @@ class AdamGSOptimizer(optimizer.Optimizer):
     Args:
       global_step: tensorflow variable indicating the step.
       learning_rate: A Tensor or a floating point value.  The learning rate.
-      beta1: A float value or a constant float tensor.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value or a constant float tensor.
-        The exponential decay rate for the 2nd moment estimates.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
-    `epsilon` can each be a callable that takes no arguments and returns the
-    actual value to use. This can be useful for changing these values across
-    different invocations of optimizer functions.
-    @end_compatibility
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a
+        callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
     """
     super(AdamGSOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
@@ -115,9 +116,6 @@ class AdamGSOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Created in SparseApply if needed.
-    self._updated_lr = None
-
   def _get_beta_accumulators(self):
     return (math_ops.pow(self._beta1_t, self._global_step_on_worker),
             math_ops.pow(self._beta2_t, self._global_step_on_worker))
@@ -149,28 +147,34 @@ class AdamGSOptimizer(optimizer.Optimizer):
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
-        var, m, v,
+        var,
+        m,
+        v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
         math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
+        grad,
+        use_locking=self._use_locking).op
 
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
+        var.handle,
+        m.handle,
+        v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
         math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
+        grad,
+        use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
     beta1_power, beta2_power = self._get_beta_accumulators()
@@ -184,8 +188,7 @@ class AdamGSOptimizer(optimizer.Optimizer):
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
     m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
@@ -195,23 +198,26 @@ class AdamGSOptimizer(optimizer.Optimizer):
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
+        grad.values,
+        var,
+        grad.indices,
         lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking))
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
       return x.value()
 
   def _resource_apply_sparse(self, grad, var, indices):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add)
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 248ffb1f7eb5dc27112ddf9b8670344904065ed0..1b7800f324b908e3c88fe90d31a2a08cbbd5ccf2 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -36,7 +36,7 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
   def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+    r"""Construct a new Adam optimizer.
 
     Initialization:
 
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 72019b31540a943582ebb4699013d9dcfc10769f..0243927ce44aec626973744507e75b20a42253e9 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -48,7 +48,7 @@ from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
 
 
-class NonLayerCheckpointable(tracking.Checkpointable):
+class NonLayerCheckpointable(tracking.AutoCheckpointable):
 
   def __init__(self):
     super(NonLayerCheckpointable, self).__init__()
@@ -440,7 +440,7 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.var = util.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
@@ -463,7 +463,7 @@ class CheckpointingTests(test.TestCase):
                                    14.))
     slots_path = util.CheckpointableSaver(root).save(
         os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.Checkpointable()
+    new_root = tracking.AutoCheckpointable()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
     slot_status = util.CheckpointableSaver(
@@ -508,7 +508,7 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = tracking.AutoCheckpointable()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
@@ -526,7 +526,7 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = tracking.AutoCheckpointable()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 7fb23abc38d9dc101204ed83808aebe5a8ef1e78..1323ed014c9e51e273491694fa44a8e36cc723d0 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -843,8 +843,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       scale_loss_by_num_replicas = (
           distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
     if scale_loss_by_num_replicas:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= 1. / num_replicas
     return loss_value
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index b35c4fde1a2c704880e023a0c3ac1e0766493514..b67e68ea96a15f94e62050c92405eec4fe4be70f 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -202,8 +202,9 @@ py_test(
 
 py_test(
     name = "quantize_parameterized_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/quantize_parameterized_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     # TODO(b/118839526): Re-enable msan test.
     tags = [
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 9085d9fa719520ac84ef6f8e07d7fa335bef5605..5b8da92491fb747c5a37dcfe03bcb21b5b903560 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -110,7 +110,7 @@ See the documentation for `tf.contrib.quantize` and [TensorFlow Lite](../lite/).
 
 ## Quantized accuracy results
 
-The following are results of trainiing some popular CNN models (Mobilenet-v1,
+The following are results of training some popular CNN models (Mobilenet-v1,
 Mobilenet-v2, and Inception-v3) using this tool:
 
 <figure>
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
index 0e3c46f17d2e2a277418d39e31927db73a509670..92ae1021bc8f8fbf19ca7f7cbe208ecea18128e8 100644
--- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -27,7 +27,8 @@ from tensorflow.python.platform import tf_logging as logging
 _UNCHANGED_RF_LAYER_OPS = [
     "Add", "BiasAdd", "Cast", "Ceil", "ConcatV2", "Const", "Floor",
     "FusedBatchNorm", "Identity", "Log", "Mul", "Pow", "RealDiv", "Relu",
-    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2", "LRN"
+    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2", "LRN",
+    "GreaterEqual"
 ]
 
 # Different ways in which padding modes may be spelled.
@@ -276,11 +277,11 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
-        node, kernel_size_x, stride_x, input_resolution[1]
-        if input_resolution is not None else None)
+        node, kernel_size_x, stride_x,
+        input_resolution[1] if input_resolution is not None else None)
     total_padding_y, padding_y = _padding_size_conv_pool(
-        node, kernel_size_y, stride_y, input_resolution[0]
-        if input_resolution is not None else None)
+        node, kernel_size_y, stride_y,
+        input_resolution[0] if input_resolution is not None else None)
   elif node.op == "Pad":
     # Kernel and stride are simply 1 in this case.
     kernel_size_x = 1
@@ -294,11 +295,11 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
-        node, kernel_size_x, stride_x, input_resolution[1]
-        if input_resolution is not None else None)
+        node, kernel_size_x, stride_x,
+        input_resolution[1] if input_resolution is not None else None)
     total_padding_y, padding_y = _padding_size_conv_pool(
-        node, kernel_size_y, stride_y, input_resolution[0]
-        if input_resolution is not None else None)
+        node, kernel_size_y, stride_y,
+        input_resolution[0] if input_resolution is not None else None)
   elif node.op in _UNCHANGED_RF_LAYER_OPS:
     # These nodes do not modify the RF parameters.
     kernel_size_x = 1
@@ -320,7 +321,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
       total_padding_y = None
       padding_y = None
     else:
-      raise ValueError("Unknown layer for operation '%s': %s" % (node.name,
-                                                                 node.op))
+      raise ValueError(
+          "Unknown layer for operation '%s': %s" % (node.name, node.op))
   return (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
           padding_y, total_padding_x, total_padding_y)
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 44b232e0f2b26f16f0300e11cf2764e1157a0050..d65d80df8073ef70d591c4ae2af99132f1c318ef 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -227,7 +227,10 @@ tf_custom_op_library(
         "kernels/lstm_ops_gpu.cu.cc",
         "kernels/lstm_ops.h",
     ],
-    deps = ["//tensorflow/core/kernels:eigen_helpers"],
+    deps = [
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -249,7 +252,10 @@ tf_custom_op_library(
         "kernels/gru_ops_gpu.cu.cc",
         "kernels/gru_ops.h",
     ],
-    deps = ["//tensorflow/core/kernels:eigen_helpers"],
+    deps = [
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -346,6 +352,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//tensorflow/core/kernels:eigen_helpers",
         "//third_party/eigen3",
     ],
@@ -381,6 +388,13 @@ py_binary(
     name = "checkpoint_convert",
     srcs = ["python/tools/checkpoint_convert.py"],
     srcs_version = "PY2AND3",
+    deps = [":checkpoint_convert_lib"],
+)
+
+py_library(
+    name = "checkpoint_convert_lib",
+    srcs = ["python/tools/checkpoint_convert.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_ops",
@@ -399,7 +413,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-        ":checkpoint_convert",
+        ":checkpoint_convert_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:session",
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index d37210d4b81203287fb633adc309688a35d093bb..12f3182a6a8878aa27ee143fa6405903e3fc4ef3 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -21,6 +21,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_activations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 class OpKernelContext;
 namespace functor {
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index a0d013c618ea56077098b15b7eed5f9110239516..7bad4a60a149011d5b8d745f45359fd25473e54e 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -210,6 +210,35 @@ class RNNCellTest(test.TestCase):
         # Smoke test
         self.assertAllClose(res[0], [[0.509682, 0.509682]])
 
+  def testSRUCellKerasRNN(self):
+    """Tests that SRUCell works with keras RNN layer."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    seq_input = ops.convert_to_tensor(
+        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
+    rnn_layer = keras_layers.RNN(cell=cell)
+    rnn_outputs_keras = rnn_layer(seq_input)
+    with self.cached_session() as sess:
+      sess.run([variables_lib.global_variables_initializer()])
+      self.assertEqual(sess.run(rnn_outputs_keras).shape, (2, 10))
+
+  def testSRUCellBiasType(self):
+    """Tests that the bias' dtype is properly set."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.float32_ref)
+
+    cell = contrib_rnn_cell.SRUCell(10, dtype=dtypes.int32)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.int32_ref)
+
+    cell_input = ops.convert_to_tensor(
+        np.random.rand(2, 5), name="cell_input", dtype=dtypes.float16)
+    cell_state = ops.convert_to_tensor(
+        np.random.rand(2, 10), name="cell_state", dtype=dtypes.float16)
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell(cell_input, [cell_state])
+    self.assertEqual(cell._bias.dtype, dtypes.float16_ref)
+
   def testSRUCellWithDiffSize(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index aa1d7d2b01b4595bbb03ba8e867e93db759cbd52..d7ee7fb8faacb0876218a983d68f007e1905c11e 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -29,7 +29,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import initializers
+from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras import utils
 from tensorflow.python.ops import array_ops
@@ -763,6 +765,17 @@ class RNNCellTest(test.TestCase):
         self.assertEqual(new_h.shape[1], num_proj)
         self.assertAllClose(np.concatenate(res[1], axis=1), expected_state)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testNASCellKerasRNN(self):
+    """Tests that NASCell works with keras RNN layer."""
+    cell = contrib_rnn_cell.NASCell(10)
+    seq_input = ops.convert_to_tensor(
+        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
+    rnn_layer = keras_layers.RNN(cell=cell)
+    rnn_outputs = rnn_layer(seq_input)
+    self.evaluate([variables.global_variables_initializer()])
+    self.assertEqual(self.evaluate(rnn_outputs).shape, (2, 10))
+
   def testUGRNNCell(self):
     num_units = 2
     batch_size = 3
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 8a1c09f171e6108174671e3122d5ff4c0b236003..482e547a16be85804beec88a91fa03b053d09b27 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -1462,7 +1462,7 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     return new_h, new_state
 
 
-class NASCell(rnn_cell_impl.RNNCell):
+class NASCell(rnn_cell_impl.LayerRNNCell):
   """Neural Architecture Search (NAS) recurrent network cell.
 
   This implements the recurrent cell from the paper:
@@ -1475,23 +1475,28 @@ class NASCell(rnn_cell_impl.RNNCell):
   The class uses an optional projection layer.
   """
 
-  def __init__(self, num_units, num_proj=None, use_biases=False, reuse=None):
+  # NAS cell's architecture base.
+  _NAS_BASE = 8
+
+  def __init__(self, num_units, num_proj=None, use_bias=False, reuse=None,
+               **kwargs):
     """Initialize the parameters for a NAS cell.
 
     Args:
-      num_units: int, The number of units in the NAS cell
+      num_units: int, The number of units in the NAS cell.
       num_proj: (optional) int, The output dimensionality for the projection
         matrices.  If None, no projection is performed.
-      use_biases: (optional) bool, If True then use biases within the cell. This
+      use_bias: (optional) bool, If True then use biases within the cell. This
         is False by default.
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
+      **kwargs: Additional keyword arguments.
     """
-    super(NASCell, self).__init__(_reuse=reuse)
+    super(NASCell, self).__init__(_reuse=reuse, **kwargs)
     self._num_units = num_units
     self._num_proj = num_proj
-    self._use_biases = use_biases
+    self._use_bias = use_bias
     self._reuse = reuse
 
     if num_proj is not None:
@@ -1509,6 +1514,33 @@ class NASCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._output_size
 
+  def build(self, inputs_shape):
+    input_size = tensor_shape.dimension_value(
+        tensor_shape.TensorShape(inputs_shape).with_rank(2)[1])
+    if input_size is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    num_proj = self._num_units if self._num_proj is None else self._num_proj
+
+    # Variables for the NAS cell. `recurrent_kernel` is all matrices multiplying
+    # the hiddenstate and `kernel` is all matrices multiplying the inputs.
+    self.recurrent_kernel = self.add_variable(
+        "recurrent_kernel", [num_proj, self._NAS_BASE * self._num_units])
+    self.kernel = self.add_variable(
+        "kernel", [input_size, self._NAS_BASE * self._num_units])
+
+    if self._use_bias:
+      self.bias = self.add_variable("bias",
+                                    shape=[self._NAS_BASE * self._num_units],
+                                    initializer=init_ops.zeros_initializer)
+
+    # Projection layer if specified
+    if self._num_proj is not None:
+      self.projection_weights = self.add_variable(
+          "projection_weights", [self._num_units, self._num_proj])
+
+    self.built = True
+
   def call(self, inputs, state):
     """Run one step of NAS Cell.
 
@@ -1535,38 +1567,20 @@ class NASCell(rnn_cell_impl.RNNCell):
     tanh = math_ops.tanh
     relu = nn_ops.relu
 
-    num_proj = self._num_units if self._num_proj is None else self._num_proj
-
     (c_prev, m_prev) = state
 
-    dtype = inputs.dtype
-    input_size = inputs.get_shape().with_rank(2).dims[1]
-    if input_size.value is None:
-      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
-    # Variables for the NAS cell. W_m is all matrices multiplying the
-    # hiddenstate and W_inputs is all matrices multiplying the inputs.
-    concat_w_m = vs.get_variable("recurrent_kernel",
-                                 [num_proj, 8 * self._num_units], dtype)
-    concat_w_inputs = vs.get_variable(
-        "kernel", [input_size.value, 8 * self._num_units], dtype)
-
-    m_matrix = math_ops.matmul(m_prev, concat_w_m)
-    inputs_matrix = math_ops.matmul(inputs, concat_w_inputs)
-
-    if self._use_biases:
-      b = vs.get_variable(
-          "bias",
-          shape=[8 * self._num_units],
-          initializer=init_ops.zeros_initializer(),
-          dtype=dtype)
-      m_matrix = nn_ops.bias_add(m_matrix, b)
+    m_matrix = math_ops.matmul(m_prev, self.recurrent_kernel)
+    inputs_matrix = math_ops.matmul(inputs, self.kernel)
+
+    if self._use_bias:
+      m_matrix = nn_ops.bias_add(m_matrix, self.bias)
 
     # The NAS cell branches into 8 different splits for both the hiddenstate
     # and the input
     m_matrix_splits = array_ops.split(
-        axis=1, num_or_size_splits=8, value=m_matrix)
+        axis=1, num_or_size_splits=self._NAS_BASE, value=m_matrix)
     inputs_matrix_splits = array_ops.split(
-        axis=1, num_or_size_splits=8, value=inputs_matrix)
+        axis=1, num_or_size_splits=self._NAS_BASE, value=inputs_matrix)
 
     # First layer
     layer1_0 = sigmoid(inputs_matrix_splits[0] + m_matrix_splits[0])
@@ -1598,9 +1612,7 @@ class NASCell(rnn_cell_impl.RNNCell):
 
     # Projection layer if specified
     if self._num_proj is not None:
-      concat_w_proj = vs.get_variable("projection_weights",
-                                      [self._num_units, self._num_proj], dtype)
-      new_m = math_ops.matmul(new_m, concat_w_proj)
+      new_m = math_ops.matmul(new_m, self.projection_weights)
 
     new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_m)
     return new_m, new_state
@@ -2071,7 +2083,7 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
       conv_ndims: Convolution dimensionality (1, 2 or 3).
       input_shape: Shape of the input as int tuple, excluding the batch size.
       output_channels: int, number of output channels of the conv LSTM.
-      kernel_shape: Shape of kernel as in tuple (of size 1,2 or 3).
+      kernel_shape: Shape of kernel as an int tuple (of size 1, 2 or 3).
       use_bias: (bool) Use bias in convolutions.
       skip_connection: If set to `True`, concatenate the input to the
         output of the conv LSTM. Default: `False`.
@@ -2092,7 +2104,7 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     self._conv_ndims = conv_ndims
     self._input_shape = input_shape
     self._output_channels = output_channels
-    self._kernel_shape = kernel_shape
+    self._kernel_shape = list(kernel_shape)
     self._use_bias = use_bias
     self._forget_bias = forget_bias
     self._skip_connection = skip_connection
@@ -2172,7 +2184,7 @@ def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   Args:
     args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
     batch x n, Tensors.
-    filter_size: int tuple of filter height and width.
+    filter_size: int tuple of filter shape (of size 1, 2 or 3).
     num_features: int, number of features.
     bias: Whether to use biases in the convolution layer.
     bias_start: starting value to initialize the bias; 0 by default.
@@ -2744,10 +2756,12 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     name: (optional) String, the name of the layer. Layers with the same name
       will share weights, but to avoid mistakes we require reuse=True in such
       cases.
+    **kwargs: Additional keyword arguments.
   """
 
-  def __init__(self, num_units, activation=None, reuse=None, name=None):
-    super(SRUCell, self).__init__(_reuse=reuse, name=name)
+  def __init__(self, num_units, activation=None, reuse=None, name=None,
+               **kwargs):
+    super(SRUCell, self).__init__(_reuse=reuse, name=name, **kwargs)
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
 
@@ -2777,7 +2791,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     self._bias = self.add_variable(
         rnn_cell_impl._BIAS_VARIABLE_NAME,  # pylint: disable=protected-access
         shape=[2 * self._num_units],
-        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+        initializer=init_ops.zeros_initializer)
 
     self._built = True
 
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
index 3fc6bfbb4d03a39906d4441e48b2788423caa234..d8ab9eba7049e468b373a1641f92dc781aa22558 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -61,10 +61,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
     self._server = server
 
   def tearDown(self):
-    # TODO(ebrevdo): Figure out why this sometimes times out.
-    #    self._service.ExitLoop()
-    #    self._service_thread.join()
-    # self._server.stop()
+    self._server.stop(grace=None)
     super(RpcOpTest, self).tearDown()
 
 
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
index 0d615923e04915a8429252317025ac8e79f9bb4e..d6148715be91c78e6e5a99fc0f3caa905b5c1a7d 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -176,7 +176,9 @@ class RpcOpTestBase(object):
       expected_message_values = np.where(
           status_code_values == errors.INVALID_ARGUMENT,
           I_WARNED_YOU.encode('ascii'), b'')
-      self.assertAllEqual(expected_message_values, status_message_values)
+      for msg, expected in zip(status_message_values, expected_message_values):
+        self.assertTrue(expected in msg,
+                        '"%s" did not contain "%s"' % (msg, expected))
 
   def testVecHostPortRpc(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 269443b2c6508bb618d30f64487b1a6a84e8646f..f0242a3b40fd566ec0f477d462426d5f550d1620 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -84,35 +84,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:saver",
-        "//tensorflow/python:util",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/keras:engine",
-        "//tensorflow/python/saved_model",
-    ],
-)
-
-py_test(
-    name = "keras_saved_model_test",
-    size = "medium",
-    srcs = ["python/saved_model/keras_saved_model_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # TODO(b/119349471): Re-enable
-        "no_windows",
-    ],
-    deps = [
-        ":keras_saved_model",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:training",
-        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index 2a4b6eae367fe617e9a19d80f16eb3fda9ade1c0..0392ed9eee79391c60318faf68d8dfd6eb64a994 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -18,398 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import six
+from tensorflow.python.keras import saving
 
-from tensorflow.python.client import session
-from tensorflow.python.framework import ops
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import models as models_lib
-from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import sequential
-from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.metrics import Metric
-from tensorflow.python.keras.models import model_from_json
-from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import builder as saved_model_builder
-from tensorflow.python.saved_model import constants
-from tensorflow.python.saved_model import save as save_lib
-from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-from tensorflow.python.util import compat
-from tensorflow.python.util import nest
-from tensorflow_estimator.python.estimator import keras as estimator_keras_util
-from tensorflow_estimator.python.estimator import model_fn as model_fn_lib
-from tensorflow_estimator.python.estimator.export import export as export_helpers
 
-
-def save_keras_model(
-    model, saved_model_path, custom_objects=None, as_text=None,
-    input_signature=None, serving_only=False):
-  """Saves a `tf.keras.Model` into Tensorflow SavedModel format.
-
-  `save_model` generates new files/folders under the `saved_model_path` folder:
-  1) a checkpoint containing the model weights.
-  2) a saved_model.pb file containing the model's MetaGraphs. The prediction
-     graph is always exported. The evaluaton and training graphs are exported
-     if the following conditions are met:
-     - Evaluation: model loss is defined.
-     - Training: model is compiled with an optimizer defined under `tf.train`.
-       This is because `tf.keras.optimizers.Optimizer` instances cannot be
-       saved to checkpoints.
-  3) Model's json configuration, if model.get_config() has been implemented.
-     This file can be used to reload the model using
-     tf.keras.models.model_from_json(). Note that if any custom objects were
-     used, they should be passed to the `custom_object` argument when loading
-     the model.
-
-  Model limitations:
-  - Sequential and functional models can always be saved.
-  - Subclassed models can only be saved when `serving_only=True`. This is due to
-    the current implementation copying the model in order to export the training
-    and evaluation graphs. Because the topology of subclassed models cannot be
-    determined, the subclassed models cannot be cloned. Subclassed models will
-    be entirely exportable in the future.
-
-  Note that each mode is exported in separate graphs, so different modes do not
-  share variables. To use the train graph with evaluation or prediction graphs,
-  create a new checkpoint if variable values have been updated.
-
-  Example:
-
-  ```python
-  import tensorflow as tf
-
-  # Create a tf.keras model.
-  model = tf.keras.Sequential()
-  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
-  model.summary()
-
-  # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.contrib.saved_model.save_keras_model(
-        model, '/tmp/my_simple_tf_keras_saved_model')
-
-  # Load the saved keras model back.
-  model_prime = tf.contrib.saved_model.load_keras_model(saved_to_path)
-  model_prime.summary()
-  ```
-
-  Args:
-    model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag
-      `serving_only` must be set to True.
-    saved_model_path: a string specifying the path to the SavedModel directory.
-      The SavedModel will be saved to a timestamped folder created within this
-      directory.
-    custom_objects: Optional dictionary mapping string names to custom classes
-      or functions (e.g. custom loss functions).
-    as_text: whether to write the `SavedModel` proto in text format. Currently
-      unavailable in serving-only mode.
-    input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used
-      to specify the expected model inputs. `input_signature`'s nested structure
-      should match the expected nested structure of the inputs to the model. If
-      this is not set, this function will attempt to infer the input shapes and
-      dtypes from the model. Note that if the model is subclassed, the tensor
-      inputs to the call function should be nested in the first argument (this
-      is a general requirement for using subclassed models with Keras functions
-      .fit(), .predict(), etc.).
-    serving_only: Export only the outputs produced from calling the model in
-      predict mode. The losses, optimizer, and other training configurations are
-      not saved. If the SavedModel will only be used for serving (rather than
-      retraining), or if the model is subclassed, this can be set to True.
-
-  Returns:
-    String path to the SavedModel folder, a subdirectory of `saved_model_path`.
-
-  Raises:
-    NotImplementedError: If the model is a subclassed model, and serving_only is
-      False.
-    ValueError: If the input signature cannot be inferred from the model.
-  """
-  export_dir = export_helpers.get_timestamped_export_dir(saved_model_path)
-
-  if serving_only:
-    save_lib.save(
-        model, export_dir,
-        signatures=training_utils.trace_model_call(model, input_signature))
-  else:
-    _save_v1_format(model, export_dir, custom_objects, as_text, input_signature)
-
-  try:
-    _export_model_json(model, export_dir)
-  except NotImplementedError:
-    logging.warning('Skipped saving model JSON, subclassed model does not have '
-                    'get_config() defined.')
-
-  return export_dir
-
-
-def _export_model_json(model, saved_model_path):
-  """Saves model configuration as a json string under assets folder."""
-  model_json = model.to_json()
-  model_json_filepath = os.path.join(
-      saved_model_utils.get_or_create_assets_dir(saved_model_path),
-      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
-  file_io.write_string_to_file(model_json_filepath, model_json)
-
-
-def _export_model_variables(model, saved_model_path):
-  """Saves model weights in checkpoint format under variables folder."""
-  saved_model_utils.get_or_create_variables_dir(saved_model_path)
-  checkpoint_prefix = saved_model_utils.get_variables_path(saved_model_path)
-  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
-  return checkpoint_prefix
-
-
-def _save_v1_format(model, path, custom_objects, as_text, input_signature):
-  """Exports model to v1 SavedModel format."""
-  if not model._is_graph_network:
-    if isinstance(model, sequential.Sequential):
-      # If input shape is not directly set in the model, the exported model
-      # will infer the expected shapes of the input from the model.
-      if not model.built and input_signature is None:
-        raise ValueError(
-            'Sequential model\'s input shape is unknown. Please build the '
-            'model, or use the input_signature argument to specify the '
-            'model inputs.')
-    else:
-      raise NotImplementedError(
-          'Subclassed models can only be exported for serving. Please set '
-          'argument serving_only=True.')
-
-  builder = saved_model_builder._SavedModelBuilder(path)
-
-  # Manually save variables to export them in an object-based checkpoint. This
-  # skips the `builder.add_meta_graph_and_variables()` step, which saves a
-  # named-based checkpoint.
-  # TODO(b/113134168): Add fn to Builder to save with object-based saver.
-  # TODO(b/113178242): This should only export the model json structure. Only
-  # one save is needed once the weights can be copied from the model to clone.
-  checkpoint_path = _export_model_variables(model, path)
-
-  # Export each mode. Use ModeKeys enums defined for `Estimator` to ensure that
-  # Keras models and `Estimator`s are exported with the same format.
-  # Every time a mode is exported, the code checks to see if new variables have
-  # been created (e.g. optimizer slot variables). If that is the case, the
-  # checkpoint is re-saved to include the new variables.
-  export_args = {'builder': builder,
-                 'model': model,
-                 'custom_objects': custom_objects,
-                 'checkpoint_path': checkpoint_path,
-                 'input_signature': input_signature}
-
-  has_saved_vars = False
-  if model.optimizer:
-    # TODO(kathywu): Verify this works with v2 optimizer.
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
-      _export_mode(model_fn_lib.ModeKeys.TRAIN, has_saved_vars, **export_args)
-      has_saved_vars = True
-      _export_mode(model_fn_lib.ModeKeys.EVAL, has_saved_vars, **export_args)
-    else:
-      logging.warning(
-          'Model was compiled with an optimizer, but the optimizer is not from '
-          '`tf.train` (e.g. `tf.train.AdagradOptimizer`). Only the serving '
-          'graph was exported. The train and evaluate graphs were not added to '
-          'the SavedModel.')
-  _export_mode(model_fn_lib.ModeKeys.PREDICT, has_saved_vars, **export_args)
-
-  builder.save(as_text)
-
-
-def _get_var_list(model):
-  """Returns list of all checkpointed saveable objects in the model."""
-  return checkpointable_utils.named_saveables(model)
-
-
-def create_placeholder(spec):
-  return K.placeholder(shape=spec.shape, dtype=spec.dtype, name=spec.name)
-
-
-def _export_mode(
-    mode, has_saved_vars, builder, model, custom_objects, checkpoint_path,
-    input_signature):
-  """Exports a model, and optionally saves new vars from the clone model.
-
-  Args:
-    mode: A `tf.estimator.ModeKeys` string.
-    has_saved_vars: A `boolean` indicating whether the SavedModel has already
-      exported variables.
-    builder: A `SavedModelBuilder` object.
-    model: A `tf.keras.Model` object.
-    custom_objects: A dictionary mapping string names to custom classes
-      or functions.
-    checkpoint_path: String path to checkpoint.
-    input_signature: Nested TensorSpec containing the expected inputs. Can be
-      `None`, in which case the signature will be inferred from the model.
-
-  Raises:
-    ValueError: If the train/eval mode is being exported, but the model does
-      not have an optimizer.
-  """
-  compile_clone = (mode != model_fn_lib.ModeKeys.PREDICT)
-  if compile_clone and not model.optimizer:
-    raise ValueError(
-        'Model does not have an optimizer. Cannot export mode %s' % mode)
-
-  model_graph = ops.get_default_graph()
-  with ops.Graph().as_default() as g:
-
-    K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
-
-    if input_signature is None:
-      input_tensors = None
-    else:
-      input_tensors = nest.map_structure(create_placeholder, input_signature)
-
-    # Clone the model into blank graph. This will create placeholders for inputs
-    # and targets.
-    clone = models_lib.clone_and_build_model(
-        model, input_tensors=input_tensors, custom_objects=custom_objects,
-        compile_clone=compile_clone)
-
-    # Make sure that iterations variable is added to the global step collection,
-    # to ensure that, when the SavedModel graph is loaded, the iterations
-    # variable is returned by `tf.train.get_global_step()`. This is required for
-    # compatibility with the SavedModelEstimator.
-    if compile_clone:
-      g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
-
-    # Extract update and train ops from train/test/predict functions.
-    train_op = None
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      clone._make_train_function()
-      train_op = clone.train_function.updates_op
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      clone._make_test_function()
-    else:
-      clone._make_predict_function()
-    g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(clone.state_updates)
-
-    clone_var_list = checkpointable_utils.named_saveables(clone)
-
-    with session.Session().as_default():
-      if has_saved_vars:
-        # Confirm all variables in the clone have an entry in the checkpoint.
-        status = clone.load_weights(checkpoint_path)
-        status.assert_existing_objects_matched()
-      else:
-        # Confirm that variables between the clone and model match up exactly,
-        # not counting optimizer objects. Optimizer objects are ignored because
-        # if the model has not trained, the slot variables will not have been
-        # created yet.
-        # TODO(b/113179535): Replace with checkpointable equivalence.
-        _assert_same_non_optimizer_objects(model, model_graph, clone, g)
-
-        # TODO(b/113178242): Use value transfer for checkpointable objects.
-        clone.load_weights(checkpoint_path)
-
-        # Add graph and variables to SavedModel.
-        # TODO(b/113134168): Switch to add_meta_graph_and_variables.
-        clone.save_weights(checkpoint_path, save_format='tf', overwrite=True)
-        builder._has_saved_variables = True
-
-    # Add graph to the SavedModel builder.
-    builder.add_meta_graph(
-        model_fn_lib.EXPORT_TAG_MAP[mode],
-        signature_def_map=_create_signature_def_map(clone, mode),
-        saver=saver_lib.Saver(clone_var_list),
-        init_op=variables.local_variables_initializer(),
-        train_op=train_op)
-    return None
-
-
-def _create_signature_def_map(model, mode):
-  """Creates a SignatureDef map from a Keras model."""
-  inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)}
-  if model.optimizer:
-    targets_dict = {x.name.split(':')[0]: x
-                    for x in model.targets if x is not None}
-    inputs_dict.update(targets_dict)
-  outputs_dict = {name: x
-                  for name, x in zip(model.output_names, model.outputs)}
-  metrics = estimator_keras_util._convert_keras_metrics_to_estimator(model)
-
-  # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
-  # are by default not added to any collections. We are doing this here, so
-  # that metric variables get initialized.
-  local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
-  vars_to_add = set()
-  if metrics is not None:
-    for key, value in six.iteritems(metrics):
-      if isinstance(value, Metric):
-        vars_to_add.update(value.variables)
-        # Convert Metric instances to (value_tensor, update_op) tuple.
-        metrics[key] = (value.result(), value.updates[0])
-  # Remove variables that are in the local variables collection already.
-  vars_to_add = vars_to_add.difference(local_vars)
-  for v in vars_to_add:
-    ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
-
-  export_outputs = model_fn_lib.export_outputs_for_mode(
-      mode,
-      predictions=outputs_dict,
-      loss=model.total_loss if model.optimizer else None,
-      metrics=metrics)
-  return export_helpers.build_all_signature_defs(
-      inputs_dict,
-      export_outputs=export_outputs,
-      serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
-
-
-def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
-  """Asserts model and clone contain the same checkpointable objects."""
-
-  # TODO(fchollet, kathywu): make sure this works in eager mode.
-  return True
-
-
-def load_keras_model(saved_model_path):
-  """Loads a keras.Model from SavedModel.
-
-  load_model reinstantiates model state by:
-  1) loading model topology from json (this will eventually come
-     from metagraph).
-  2) loading model weights from checkpoint.
-
-  Example:
-
-  ```python
-  import tensorflow as tf
-
-  # Create a tf.keras model.
-  model = tf.keras.Sequential()
-  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
-  model.summary()
-
-  # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.contrib.saved_model.save_keras_model(
-        model, '/tmp/my_simple_tf_keras_saved_model')
-
-  # Load the saved keras model back.
-  model_prime = tf.contrib.saved_model.load_keras_model(saved_to_path)
-  model_prime.summary()
-  ```
-
-  Args:
-    saved_model_path: a string specifying the path to an existing SavedModel.
-
-  Returns:
-    a keras.Model instance.
-  """
-  # restore model topology from json string
-  model_json_filepath = os.path.join(
-      compat.as_bytes(saved_model_path),
-      compat.as_bytes(constants.ASSETS_DIRECTORY),
-      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
-  model_json = file_io.read_file_to_string(model_json_filepath)
-  model = model_from_json(model_json)
-
-  # restore model weights
-  checkpoint_prefix = os.path.join(
-      compat.as_text(saved_model_path),
-      compat.as_text(constants.VARIABLES_DIRECTORY),
-      compat.as_text(constants.VARIABLES_FILENAME))
-  model.load_weights(checkpoint_prefix)
-  return model
+# TODO(kathywu): Remove all contrib callers, switch to tf.keras.
+save_keras_model = saving.export
+load_keras_model = saving.load_from_saved_model
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
deleted file mode 100644
index fbf8138493362d4a3c8a75e1ee1bb2fbe8096499..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
+++ /dev/null
@@ -1,538 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=protected-access
-"""Tests for saving/loading function for keras Model."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
-from tensorflow.python import keras
-from tensorflow.python.client import session
-from tensorflow.python.eager import context
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import training
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import training as training_module
-
-
-class TestModelSavingandLoading(test.TestCase):
-
-  def _save_model_dir(self, dirname='saved_model'):
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-    return os.path.join(temp_dir, dirname)
-
-  def test_saving_sequential_model(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.RepeatVector(3))
-      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
-      model.compile(
-          loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(lr=0.0001),
-          metrics=[keras.metrics.categorical_accuracy],
-          sample_weight_mode='temporal')
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3, 3))
-      model.train_on_batch(x, y)
-
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-
-      loaded_model = keras_saved_model.load_keras_model(output_path)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_saving_sequential_model_without_compile(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.RepeatVector(3))
-      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
-
-      x = np.random.random((1, 3))
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
-
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-  def test_saving_functional_model(self):
-    with self.cached_session():
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      output = keras.layers.Dense(3)(x)
-
-      model = keras.models.Model(inputs, output)
-      model.compile(
-          loss=keras.losses.MSE,
-          optimizer=keras.optimizers.RMSprop(lr=0.0001),
-          metrics=[keras.metrics.categorical_accuracy])
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
-
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_saving_functional_model_without_compile(self):
-    with self.cached_session():
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      output = keras.layers.Dense(3)(x)
-
-      model = keras.models.Model(inputs, output)
-
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
-
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_saving_with_tf_optimizer(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
-      loaded_model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test that new updates are the same with both models
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-
-      ref_loss = model.train_on_batch(x, y)
-      loss = loaded_model.train_on_batch(x, y)
-      self.assertAllClose(ref_loss, loss, atol=1e-05)
-
-      ref_y = model.predict(x)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test saving/loading again
-      temp_saved_model2 = self._save_model_dir('saved_model_2')
-      output_path2 = keras_saved_model.save_keras_model(
-          loaded_model, temp_saved_model2)
-      loaded_model = keras_saved_model.load_keras_model(output_path2)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-  def test_saving_subclassed_model_raise_error(self):
-    # For now, saving subclassed model should raise an error. It should be
-    # avoided later with loading from SavedModel.pb.
-
-    class SubclassedModel(training.Model):
-
-      def __init__(self):
-        super(SubclassedModel, self).__init__()
-        self.layer1 = keras.layers.Dense(3)
-        self.layer2 = keras.layers.Dense(1)
-
-      def call(self, inp):
-        return self.layer2(self.layer1(inp))
-
-    model = SubclassedModel()
-
-    temp_saved_model = self._save_model_dir()
-    with self.assertRaises(NotImplementedError):
-      keras_saved_model.save_keras_model(model, temp_saved_model)
-
-
-class LayerWithLearningPhase(keras.engine.base_layer.Layer):
-
-  def call(self, x):
-    phase = keras.backend.learning_phase()
-    output = tf_utils.smart_cond(
-        phase, lambda: x * 0, lambda: array_ops.identity(x))
-    if not context.executing_eagerly():
-      output._uses_learning_phase = True  # pylint: disable=protected-access
-    return output
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
-
-def functional_model(uses_learning_phase=True):
-  inputs = keras.layers.Input(shape=(3,))
-  x = keras.layers.Dense(2)(inputs)
-  x = keras.layers.Dense(3)(x)
-  if uses_learning_phase:
-    x = LayerWithLearningPhase()(x)
-  return keras.models.Model(inputs, x)
-
-
-def sequential_model(uses_learning_phase=True):
-  model = keras.models.Sequential()
-  model.add(keras.layers.Dense(2, input_shape=(3,)))
-  model.add(keras.layers.Dense(3))
-  if uses_learning_phase:
-    model.add(LayerWithLearningPhase())
-  return model
-
-
-def sequential_model_without_input_shape(uses_learning_phase=True):
-  model = keras.models.Sequential()
-  model.add(keras.layers.Dense(2))
-  model.add(keras.layers.Dense(3))
-  if uses_learning_phase:
-    model.add(LayerWithLearningPhase())
-  return model
-
-
-class Subclassed(keras.models.Model):
-
-  def __init__(self):
-    super(Subclassed, self).__init__()
-    self.dense1 = keras.layers.Dense(2)
-    self.dense2 = keras.layers.Dense(3)
-
-  def call(self, inputs):
-    x = self.dense1(inputs)
-    x = self.dense2(x)
-    return x
-
-
-def subclassed_model():
-  return Subclassed()
-
-
-def load_model(sess, path, mode):
-  tags = model_fn_lib.EXPORT_TAG_MAP[mode]
-  if mode == model_fn_lib.ModeKeys.PREDICT:
-    sig_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-  else:
-    sig_def_key = mode
-
-  meta_graph_def = loader_impl.load(sess, tags, path)
-  inputs = {
-      k: sess.graph.get_tensor_by_name(v.name)
-      for k, v in meta_graph_def.signature_def[sig_def_key].inputs.items()}
-  outputs = {
-      k: sess.graph.get_tensor_by_name(v.name)
-      for k, v in meta_graph_def.signature_def[sig_def_key].outputs.items()}
-  return inputs, outputs, meta_graph_def
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
-
-  def _save_model_dir(self, dirname='saved_model'):
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-    return os.path.join(temp_dir, dirname)
-
-  @parameterized.parameters(
-      {
-          'model_builder': functional_model,
-          'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
-          'train_before_export': True},
-      {
-          'model_builder': functional_model,
-          'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
-          'train_before_export': False},
-      {
-          'model_builder': functional_model,
-          'uses_learning_phase': False,
-          'optimizer': None,
-          'train_before_export': False},
-      {
-          'model_builder': sequential_model,
-          'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
-          'train_before_export': True},
-      {
-          'model_builder': sequential_model,
-          'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
-          'train_before_export': False},
-      {
-          'model_builder': sequential_model,
-          'uses_learning_phase': False,
-          'optimizer': None,
-          'train_before_export': False},
-      {
-          'model_builder': sequential_model_without_input_shape,
-          'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
-          'train_before_export': False})
-  def testSaveAndLoadSavedModelExport(
-      self, model_builder, uses_learning_phase, optimizer, train_before_export):
-    saved_model_path = self._save_model_dir()
-    with self.session(graph=ops.Graph()):
-      np.random.seed(130)
-      input_arr = np.random.random((1, 3))
-      target_arr = np.random.random((1, 3))
-
-      model = model_builder(uses_learning_phase)
-      if optimizer is not None:
-        model.compile(
-            loss='mse',
-            optimizer=optimizer,
-            metrics=['mae'])
-        if train_before_export:
-          model.train_on_batch(input_arr, target_arr)
-
-        ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
-
-      ref_predict = model.predict(input_arr)
-
-      # Export SavedModel
-      output_path = keras_saved_model.save_keras_model(model, saved_model_path)
-
-    input_name = model.input_names[0]
-    output_name = model.output_names[0]
-    target_name = output_name + '_target'
-
-    # Load predict graph, and test predictions
-    with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
-                                      model_fn_lib.ModeKeys.PREDICT)
-
-      predictions = sess.run(outputs[output_name],
-                             {inputs[input_name]: input_arr})
-      self.assertAllClose(ref_predict, predictions, atol=1e-05)
-
-    if optimizer:
-      # Load eval graph, and test predictions, loss and metric values
-      with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, _ = load_model(sess, output_path,
-                                        model_fn_lib.ModeKeys.EVAL)
-
-        # First obtain the loss and predictions, and run the metric update op by
-        # feeding in the inputs and targets.
-        loss, predictions, _ = sess.run(
-            (outputs['loss'], outputs['predictions/' + output_name],
-             outputs['metrics/mean_absolute_error/update_op']), {
-                 inputs[input_name]: input_arr,
-                 inputs[target_name]: target_arr
-             })
-
-        # The metric value should be run after the update op, to ensure that it
-        # reflects the correct value.
-        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
-
-        self.assertEqual(int(train_before_export),
-                         sess.run(training_module.get_global_step()))
-        self.assertAllClose(ref_loss, loss, atol=1e-05)
-        self.assertAllClose(ref_mae, metric_value, atol=1e-05)
-        self.assertAllClose(ref_predict, predictions, atol=1e-05)
-
-      # Load train graph, and check for the train op, and prediction values
-      with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, meta_graph_def = load_model(
-            sess, output_path, model_fn_lib.ModeKeys.TRAIN)
-        self.assertEqual(int(train_before_export),
-                         sess.run(training_module.get_global_step()))
-        self.assertIn('loss', outputs)
-        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
-        self.assertIn('metrics/mean_absolute_error/value', outputs)
-        self.assertIn('predictions/' + output_name, outputs)
-
-        # Train for a step
-        train_op = loader_impl.get_train_op(meta_graph_def)
-        train_outputs, _ = sess.run(
-            [outputs, train_op], {inputs[input_name]: input_arr,
-                                  inputs[target_name]: target_arr})
-        self.assertEqual(int(train_before_export) + 1,
-                         sess.run(training_module.get_global_step()))
-
-        if uses_learning_phase:
-          self.assertAllClose(
-              [[0, 0, 0]], train_outputs['predictions/' + output_name],
-              atol=1e-05)
-        else:
-          self.assertNotAllClose(
-              [[0, 0, 0]], train_outputs['predictions/' + output_name],
-              atol=1e-05)
-
-  def testSaveAndLoadSavedModelWithCustomObject(self):
-    saved_model_path = self._save_model_dir()
-    with session.Session(graph=ops.Graph()) as sess:
-      def relu6(x):
-        return keras.backend.relu(x, max_value=6)
-      inputs = keras.layers.Input(shape=(1,))
-      outputs = keras.layers.Activation(relu6)(inputs)
-      model = keras.models.Model(inputs, outputs)
-      output_path = keras_saved_model.save_keras_model(
-          model, saved_model_path, custom_objects={'relu6': relu6})
-    with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
-                                      model_fn_lib.ModeKeys.PREDICT)
-      input_name = model.input_names[0]
-      output_name = model.output_names[0]
-      predictions = sess.run(
-          outputs[output_name], {inputs[input_name]: [[7], [-3], [4]]})
-      self.assertAllEqual([[6], [0], [4]], predictions)
-
-  def testAssertModelCloneSameObjectsIgnoreOptimizer(self):
-    input_arr = np.random.random((1, 3))
-    target_arr = np.random.random((1, 3))
-
-    model_graph = ops.Graph()
-    clone_graph = ops.Graph()
-
-    # Create two models with the same layers but different optimizers.
-    with session.Session(graph=model_graph):
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      x = keras.layers.Dense(3)(x)
-      model = keras.models.Model(inputs, x)
-
-      model.compile(loss='mse', optimizer=training_module.AdadeltaOptimizer())
-      model.train_on_batch(input_arr, target_arr)
-
-    with session.Session(graph=clone_graph):
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      x = keras.layers.Dense(3)(x)
-      clone = keras.models.Model(inputs, x)
-      clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
-      clone.train_on_batch(input_arr, target_arr)
-
-    keras_saved_model._assert_same_non_optimizer_objects(
-        model, model_graph, clone, clone_graph)
-
-  def testAssertModelCloneSameObjectsThrowError(self):
-    input_arr = np.random.random((1, 3))
-    target_arr = np.random.random((1, 3))
-
-    model_graph = ops.Graph()
-    clone_graph = ops.Graph()
-
-    # Create two models with the same layers but different optimizers.
-    with session.Session(graph=model_graph):
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      x = keras.layers.Dense(3)(x)
-      model = keras.models.Model(inputs, x)
-
-      model.compile(loss='mse', optimizer=training_module.AdadeltaOptimizer())
-      model.train_on_batch(input_arr, target_arr)
-
-    with session.Session(graph=clone_graph):
-      inputs = keras.layers.Input(shape=(3,))
-      x = keras.layers.Dense(2)(inputs)
-      x = keras.layers.Dense(4)(x)
-      x = keras.layers.Dense(3)(x)
-      clone = keras.models.Model(inputs, x)
-      clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
-      clone.train_on_batch(input_arr, target_arr)
-
-  def testSaveSequentialModelWithoutInputShapes(self):
-    model = sequential_model_without_input_shape(True)
-    # A Sequential model that hasn't been built should raise an error.
-    with self.assertRaisesRegexp(ValueError, 'Please build the model'):
-      keras_saved_model.save_keras_model(model, '')
-
-    saved_model_path = self._save_model_dir()
-    output_path = keras_saved_model.save_keras_model(
-        model, saved_model_path,
-        input_signature=tensor_spec.TensorSpec(shape=(10, 11, 12, 13, 14),
-                                               dtype=dtypes.float32,
-                                               name='spec_input'))
-
-    with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
-                                      model_fn_lib.ModeKeys.PREDICT)
-      self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
-      self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
-      self.assertEqual(3, outputs[next(iter(outputs.keys()))].shape[-1])
-
-  @test_util.run_v2_only
-  @parameterized.parameters(
-      {
-          'model_builder': sequential_model_without_input_shape,
-          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
-                                                     dtype=dtypes.float32)]},
-      {
-          'model_builder': subclassed_model,
-          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
-                                                     dtype=dtypes.float32)]})
-  def testServingOnly(self, model_builder, input_signature):
-    saved_model_path = self._save_model_dir()
-    input_arr = np.random.random((5, 3)).astype(np.float32)
-    model = model_builder()
-    ref_predict = model.predict(input_arr)
-
-    output_path = keras_saved_model.save_keras_model(
-        model, saved_model_path, serving_only=True,
-        input_signature=input_signature)
-
-    # Load predict graph, and test predictions
-    with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
-                                      model_fn_lib.ModeKeys.PREDICT)
-      predictions = sess.run(outputs[next(iter(outputs.keys()))],
-                             {inputs[next(iter(inputs.keys()))]: input_arr})
-      self.assertAllClose(ref_predict, predictions, atol=1e-05)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 18b56cd21942e28cb0dc3210df0bb04d55c1e16f..7d5ba90ded215a59dbded751efd497f142a95e61 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -33,7 +33,6 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":beam_search_ops",
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/contrib/util:util_py",
@@ -59,7 +58,6 @@ tf_custom_op_py_library(
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -215,3 +213,18 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
 )
+
+cuda_py_test(
+    name = "attention_wrapper_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/attention_wrapper_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ff04e1780c4c44df14d6e87c5afdbf533ca5c90
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
@@ -0,0 +1,94 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.ops.attention_wrapper."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(AttentionMechanismTest, self).setUp()
+    self.batch = 10
+    self.timestep = 5
+    self.memory_size = 6
+    self.units = 8
+
+    self.memory = ops.convert_to_tensor(
+        np.random.random((self.batch, self.timestep, self.memory_size)),
+        dtype=np.float32)
+    self.query = ops.convert_to_tensor(
+        np.random.random((self.batch, self.units)), dtype=np.float32)
+    self.state = ops.convert_to_tensor(
+        np.random.random((self.batch, self.timestep)), dtype=np.float32)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_attention_shape_inference(self, attention_cls):
+    attention = attention_cls(self.units)
+    attention_score = attention([self.query, self.state, self.memory])
+    self.assertLen(attention_score, 2)
+    self.assertEqual(attention_score[0].shape, (self.batch, self.timestep))
+    self.assertEqual(attention_score[1].shape, (self.batch, self.timestep))
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_get_config(self, attention_cls):
+    attention = attention_cls(self.units)
+    config = attention.get_config()
+
+    attention_from_config = attention_cls.from_config(config)
+    config_from_clone = attention_from_config.get_config()
+
+    self.assertDictEqual(config, config_from_clone)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_layer_output(self, attention_cls):
+    attention = attention_cls(self.units)
+
+    score = attention([self.query, self.state, self.memory])
+    self.evaluate(variables.variables_initializer(attention.variables))
+
+    score_val = self.evaluate(score)
+    self.assertLen(score_val, 2)
+    self.assertEqual(score_val[0].shape, (self.batch, self.timestep))
+    self.assertEqual(score_val[1].shape, (self.batch, self.timestep))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index b7f9f3fb090356a1c8d2bfb5044712ff93e267ce..abcf71c61b6e6df9462bf06323b8b11d5cc0d9a8 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -34,8 +34,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.platform import test
 # pylint: enable=g-import-not-at-top
 
@@ -517,7 +515,7 @@ class BasicDecoderTest(test.TestCase):
         vocabulary_size)
 
     # The sample function samples categorically from the logits.
-    sample_fn = lambda x: categorical.Categorical(logits=x).sample()
+    sample_fn = lambda x: helper_py.categorical_sample(logits=x)
     # The next inputs are a one-hot encoding of the sampled labels.
     next_inputs_fn = (
         lambda x: array_ops.one_hot(x, vocabulary_size, dtype=dtypes.float32))
@@ -599,7 +597,7 @@ class BasicDecoderTest(test.TestCase):
 
     # The sample function samples independent bernoullis from the logits.
     sample_fn = (
-        lambda x: bernoulli.Bernoulli(logits=x, dtype=dtypes.bool).sample())
+        lambda x: helper_py.bernoulli_sample(logits=x, dtype=dtypes.bool))
     # The next inputs are a one-hot encoding of the sampled labels.
     next_inputs_fn = math_ops.to_float
     end_fn = lambda sample_ids: sample_ids[:, end_token]
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
index 5aa32b532ffcf5772f6ace26662f5e5471cf6923..41b2a53ca5b178be9b04446c81d832575e5ed75b 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -14,80 +14,254 @@
 # ==============================================================================
 
 """Tests for contrib.seq2seq.python.seq2seq.loss_ops."""
-# pylint: disable=unused-import,g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-# pylint: enable=unused-import
 
 import numpy as np
 
 from tensorflow.contrib.seq2seq.python.ops import loss
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class LossTest(test.TestCase):
 
+  def setUp(self):
+    self.batch_size = 2
+    self.sequence_length = 3
+    self.number_of_classes = 5
+    logits = [
+        constant_op.constant(i + 0.5, shape=[self.batch_size,
+                                             self.number_of_classes])
+        for i in range(self.sequence_length)
+    ]
+    self.logits = array_ops.stack(logits, axis=1)
+    targets = [
+        constant_op.constant(i, dtypes.int32, shape=[self.batch_size])
+        for i in range(self.sequence_length)
+    ]
+    self.targets = array_ops.stack(targets, axis=1)
+    weights = [
+        constant_op.constant(1.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    self.weights = array_ops.stack(weights, axis=1)
+    # expected_loss = sparse_softmax_cross_entropy_with_logits(targets, logits)
+    # where targets = [0, 1, 2], and logits = [[0.5] * 5, [1.5] * 5, [2.5] * 5]
+    self.expected_loss = 1.60944
+
   def testSequenceLoss(self):
-    with self.session(use_gpu=True) as sess:
-      with variable_scope.variable_scope(
-          'root', initializer=init_ops.constant_initializer(0.5)):
-        batch_size = 2
-        sequence_length = 3
-        number_of_classes = 5
-        logits = [
-            constant_op.constant(
-                i + 0.5, shape=[batch_size, number_of_classes])
-            for i in range(sequence_length)
-        ]
-        logits = array_ops.stack(logits, axis=1)
-        targets = [
-            constant_op.constant(
-                i, dtypes.int32, shape=[batch_size])
-            for i in range(sequence_length)
-        ]
-        targets = array_ops.stack(targets, axis=1)
-        weights = [
-            constant_op.constant(
-                1.0, shape=[batch_size]) for i in range(sequence_length)
-        ]
-        weights = array_ops.stack(weights, axis=1)
-
-        average_loss_per_example = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=True,
-            average_across_batch=True)
-        res = sess.run(average_loss_per_example)
-        self.assertAllClose(1.60944, res)
-
-        average_loss_per_sequence = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=False,
-            average_across_batch=True)
-        res = sess.run(average_loss_per_sequence)
-        compare_per_sequence = np.ones((sequence_length)) * 1.60944
-        self.assertAllClose(compare_per_sequence, res)
-
-        average_loss_per_batch = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=True,
-            average_across_batch=False)
-        res = sess.run(average_loss_per_batch)
-        compare_per_batch = np.ones((batch_size)) * 1.60944
-        self.assertAllClose(compare_per_batch, res)
-
-        total_loss = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=False,
-            average_across_batch=False)
-        res = sess.run(total_loss)
-        compare_total = np.ones((batch_size, sequence_length)) * 1.60944
-        self.assertAllClose(compare_total, res)
+    with self.test_session(use_gpu=True):
+      average_loss_per_example = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=True,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      average_loss_per_sequence = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=False,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      average_loss_per_batch = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=True,
+          average_across_batch=False)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      total_loss = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=False,
+          average_across_batch=False)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testSequenceLossClass(self):
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=True,
+                                   average_across_batch=True,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=True,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=True,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testSumReduction(self):
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=True)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=True)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testWeightedSumReduction(self):
+    weights = [
+        constant_op.constant(1.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    # Make the last element in the sequence to have zero weights.
+    weights[-1] = constant_op.constant(0.0, shape=[self.batch_size])
+    self.weights = array_ops.stack(weights, axis=1)
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=True)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=True)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      # The last element in every sequence are zeros, which will be filtered.
+      compare_per_sequence[-1] = 0.
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      # The last element in every sequence are zeros, which will be filtered.
+      compare_total[:, -1] = 0
+      self.assertAllClose(compare_total, res)
+
+  def testZeroWeights(self):
+    weights = [
+        constant_op.constant(0.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    weights = array_ops.stack(weights, axis=1)
+    with self.test_session(use_gpu=True):
+      average_loss_per_example = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=True,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(0.0, res)
+
+      average_loss_per_sequence = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=False,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.zeros((self.sequence_length))
+      self.assertAllClose(compare_per_sequence, res)
+
+      average_loss_per_batch = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=True,
+          average_across_batch=False)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.zeros((self.batch_size))
+      self.assertAllClose(compare_per_batch, res)
+
+      total_loss = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=False,
+          average_across_batch=False)
+      res = self.evaluate(total_loss)
+      compare_total = np.zeros((self.batch_size, self.sequence_length))
+      self.assertAllClose(compare_total, res)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 60ec3efffe771a3a6d6f36ed4b51a34ef9509612..ae3e7f1b5d8c9f06b5defbaee9cad3810e58abd4 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.framework.python.framework import tensor_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import layers
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
@@ -72,77 +73,6 @@ class AttentionMechanism(object):
     raise NotImplementedError
 
 
-def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
-  """Convert to tensor and possibly mask `memory`.
-
-  Args:
-    memory: `Tensor`, shaped `[batch_size, max_time, ...]`.
-    memory_sequence_length: `int32` `Tensor`, shaped `[batch_size]`.
-    check_inner_dims_defined: Python boolean.  If `True`, the `memory`
-      argument's shape is checked to ensure all but the two outermost
-      dimensions are fully defined.
-
-  Returns:
-    A (possibly masked), checked, new `memory`.
-
-  Raises:
-    ValueError: If `check_inner_dims_defined` is `True` and not
-      `memory.shape[2:].is_fully_defined()`.
-  """
-  memory = nest.map_structure(
-      lambda m: ops.convert_to_tensor(m, name="memory"), memory)
-  if memory_sequence_length is not None:
-    memory_sequence_length = ops.convert_to_tensor(
-        memory_sequence_length, name="memory_sequence_length")
-  if check_inner_dims_defined:
-    def _check_dims(m):
-      if not m.get_shape()[2:].is_fully_defined():
-        raise ValueError("Expected memory %s to have fully defined inner dims, "
-                         "but saw shape: %s" % (m.name, m.get_shape()))
-    nest.map_structure(_check_dims, memory)
-  if memory_sequence_length is None:
-    seq_len_mask = None
-  else:
-    seq_len_mask = array_ops.sequence_mask(
-        memory_sequence_length,
-        maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
-        dtype=nest.flatten(memory)[0].dtype)
-    seq_len_batch_size = (
-        tensor_shape.dimension_value(memory_sequence_length.shape[0])
-        or array_ops.shape(memory_sequence_length)[0])
-  def _maybe_mask(m, seq_len_mask):
-    rank = m.get_shape().ndims
-    rank = rank if rank is not None else array_ops.rank(m)
-    extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
-    m_batch_size = tensor_shape.dimension_value(
-        m.shape[0]) or array_ops.shape(m)[0]
-    if memory_sequence_length is not None:
-      message = ("memory_sequence_length and memory tensor batch sizes do not "
-                 "match.")
-      with ops.control_dependencies([
-          check_ops.assert_equal(
-              seq_len_batch_size, m_batch_size, message=message)]):
-        seq_len_mask = array_ops.reshape(
-            seq_len_mask,
-            array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
-        return m * seq_len_mask
-    else:
-      return m
-  return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
-
-
-def _maybe_mask_score(score, memory_sequence_length, score_mask_value):
-  if memory_sequence_length is None:
-    return score
-  message = ("All values in memory_sequence_length must greater than zero.")
-  with ops.control_dependencies(
-      [check_ops.assert_positive(memory_sequence_length, message=message)]):
-    score_mask = array_ops.sequence_mask(
-        memory_sequence_length, maxlen=array_ops.shape(score)[1])
-    score_mask_values = score_mask_value * array_ops.ones_like(score)
-    return array_ops.where(score_mask, score, score_mask_values)
-
-
 class _BaseAttentionMechanism(AttentionMechanism):
   """A base AttentionMechanism class providing common functionality.
 
@@ -205,12 +135,14 @@ class _BaseAttentionMechanism(AttentionMechanism):
           self._memory_layer.dtype).as_numpy_dtype(-np.inf)
     self._probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
         probability_fn(
-            _maybe_mask_score(score, memory_sequence_length, score_mask_value),
+            _maybe_mask_score(score,
+                              memory_sequence_length=memory_sequence_length,
+                              score_mask_value=score_mask_value),
             prev))
     with ops.name_scope(
         name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self._values = _prepare_memory(
-          memory, memory_sequence_length,
+          memory, memory_sequence_length=memory_sequence_length,
           check_inner_dims_defined=check_inner_dims_defined)
       self._keys = (
           self.memory_layer(self._values) if self.memory_layer  # pylint: disable=not-callable
@@ -286,6 +218,207 @@ class _BaseAttentionMechanism(AttentionMechanism):
     return self.initial_alignments(batch_size, dtype)
 
 
+class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
+  """A base AttentionMechanism class providing common functionality.
+
+  Common functionality includes:
+    1. Storing the query and memory layers.
+    2. Preprocessing and storing the memory.
+
+  Note that this layer only support Keras functional API since it takes multiple
+  input tensors, which is not available in sequential model.
+  """
+
+  def __init__(self,
+               probability_fn,
+               query_layer=None,
+               memory_layer=None,
+               **kwargs):
+    """Construct base AttentionMechanism class.
+
+    Args:
+      probability_fn: A `callable`. Converts the score and previous alignments
+        to probabilities. Its signature should be:
+        `probabilities = probability_fn(score, state)`.
+      query_layer:  (optional): Instance of `tf.keras.Layer`.  The layer's depth
+        must match the depth of `memory_layer`.  If `query_layer` is not
+        provided, the shape of `query` must match that of `memory_layer`.
+      memory_layer: (optional): Instance of `tf.keras.Layer`. The layer's
+        depth must match the depth of `query_layer`.
+        If `memory_layer` is not provided, the shape of `memory` must match
+        that of `query_layer`.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    if (query_layer is not None
+        and not isinstance(query_layer, layers.Layer)):
+      raise TypeError(
+          "query_layer is not a Layer: %s" % type(query_layer).__name__)
+    if (memory_layer is not None
+        and not isinstance(memory_layer, layers.Layer)):
+      raise TypeError(
+          "memory_layer is not a Layer: %s" % type(memory_layer).__name__)
+    self.query_layer = query_layer
+    self.memory_layer = memory_layer
+    if self.memory_layer is not None and "dtype" not in kwargs:
+      kwargs["dtype"] = self.memory_layer.dtype
+    super(_BaseAttentionMechanismV2, self).__init__(**kwargs)
+    if not callable(probability_fn):
+      raise TypeError("probability_fn must be callable, saw type: %s" %
+                      type(probability_fn).__name__)
+    self.probability_fn = probability_fn
+
+    self.keys = None
+    self.values = None
+    self.batch_size = None
+    self._memory_initialized = False
+    self._check_inner_dims_defined = True
+
+  def build(self, input_shape):
+    # The layer suppose to take 3 inputs, [query, state, memory].
+    query_input_shape, _, memory_input_shape = input_shape
+    if self.query_layer is not None:
+      self.query_layer.build(query_input_shape)
+    if self.memory_layer is not None:
+      self.memory_layer.build(memory_input_shape)
+    # dtype of the layer is known at this moment, create the score_mask_value if
+    # needed.
+    self.score_mask_value = dtypes.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
+    self.built = True
+
+  def _setup_memory(self, memory, memory_mask=None):
+    """Pre-process the memory before actually query the memory.
+
+    This should only be called once at the first invocation of call().
+
+    Args:
+      memory: The memory to query; usually the output of an RNN encoder. This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_mask: The boolean tensor with shape `[batch_size, max_time]`. For
+        any value equal to False, the corresponding value in memory should be
+        ignored.
+    """
+    if self._memory_initialized:
+      raise ValueError("The memory for the attention has already been setup.")
+    with ops.name_scope(
+        self.name, "BaseAttentionMechanismInit", nest.flatten(memory)):
+      self.values = _prepare_memory(
+          memory, memory_mask=memory_mask,
+          check_inner_dims_defined=self._check_inner_dims_defined)
+      if self.memory_layer is not None:
+        self.keys = self.memory_layer(self.values)
+      else:
+        self.keys = self.values
+      self.batch_size = (
+          tensor_shape.dimension_value(self.keys.shape[0]) or
+          array_ops.shape(self.keys)[0])
+      self._alignments_size = (tensor_shape.dimension_value(self.keys.shape[1])
+                               or array_ops.shape(self.keys)[1])
+      if memory_mask is not None:
+        self.probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
+            self.probability_fn(_maybe_mask_score(
+                score, self.score_mask_value, memory_mask=memory_mask), prev))
+    self._memory_initialized = True
+
+  def call(self, inputs, mask=None, **kwargs):
+    """Base method to calculate the attention score.
+
+    Args:
+      inputs: a list of tensor that contains `query`, `state`, and `memory`.
+        `query` is the tensor of dtype matching `memory` and shape
+        `[batch_size, query_depth]`.
+        `state` is the tensor of dtype matching `memory` and shape
+        `[batch_size, alignments_size]`. (`alignments_size` is memory's
+        `max_time`).
+        `memory` is the memory to query; usually the output of an RNN encoder.
+        This tensor should be shaped `[batch_size, max_time, feature]`.
+      mask: optional bool tensor with shape `[batch, max_time]` for the mask of
+        memory. If it is not None, the corresponding item of the memory should
+        be filtered out during calculation.
+      **kwargs: Dict, other keyword arguments for the call method.
+    """
+    query, state, memory, memory_mask = self._process_inputs(inputs, mask)
+    if not self._memory_initialized:
+      self._setup_memory(memory, memory_mask=memory_mask)
+    return self.calculate_attention(query, state)
+
+  def calculate_attention(self, query, state):
+    raise NotImplementedError(
+        "calculate_attention need to be implemented by subclasses.")
+
+  def get_config(self):
+    config = {}
+    # Since the probability_fn is likely to be a wrapped function, the child
+    # class should preserve the original function and how its wrapped.
+
+    if self.query_layer is not None:
+      config["query_layer"] = {
+          "class_name": self.query_layer.__class__.__name__,
+          "config": self.query_layer.get_config(),
+      }
+    if self.memory_layer is not None:
+      config["memory_layer"] = {
+          "class_name": self.memory_layer.__class__.__name__,
+          "config": self.memory_layer.get_config(),
+      }
+    base_config = super(_BaseAttentionMechanismV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def _process_inputs(self, inputs, mask):
+    if len(inputs) != 3:
+      raise ValueError(
+          "Expect to have 3 inputs for attention, got %d" % len(inputs))
+    query, state, memory = inputs
+    return query, state, memory, mask
+
+  def _process_probability_fn(self, func_name):
+    """Helper method to retrieve the probably function by string input."""
+    valid_probability_fns = {
+        "softmax": nn_ops.softmax,
+        "hardmax": hardmax,
+    }
+    if func_name not in valid_probability_fns.keys():
+      raise ValueError("Invalid probability function: %s, options are %s" %
+                       (func_name, valid_probability_fns.keys()))
+    return valid_probability_fns[func_name]
+
+  @classmethod
+  def deserialize_inner_layer_from_config(cls, config, custom_objects):
+    """Helper method that reconstruct the query and memory from the config.
+
+    In the get_config() method, the query and memory layer configs are
+    serialized into dict for persistence, this method perform the reverse action
+    to reconstruct the layer from the config.
+
+    Args:
+      config: dict, the configs that will be used to reconstruct the object.
+      custom_objects: dict mapping class names (or function names) of custom
+        (non-Keras) objects to class/functions.
+    Returns:
+      config: dict, the config with layer instance created, which is ready to be
+        used as init parameters.
+    """
+    # Reconstruct the query and memory layer for parent class.
+    from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    # Instead of updating the input, create a copy and use that.
+    config = config.copy()
+    query_layer_config = config.pop("query_layer", None)
+    if query_layer_config:
+      query_layer = deserialize_layer(query_layer_config,
+                                      custom_objects=custom_objects)
+      config["query_layer"] = query_layer
+    memory_layer_config = config.pop("memory_layer", None)
+    if memory_layer_config:
+      memory_layer = deserialize_layer(memory_layer_config,
+                                       custom_objects=custom_objects)
+      config["memory_layer"] = memory_layer
+    return config
+
+  @property
+  def alignments_size(self):
+    return self._alignments_size
+
+
 def _luong_score(query, keys, scale):
   """Implements Luong-style (multiplicative) scoring function.
 
@@ -304,7 +437,7 @@ def _luong_score(query, keys, scale):
   Args:
     query: Tensor, shape `[batch_size, num_units]` to compare to keys.
     keys: Processed memory, shape `[batch_size, max_time, num_units]`.
-    scale: Whether to apply a scale to the score function.
+    scale: the optional tensor to scale the attention score.
 
   Returns:
     A `[batch_size, max_time]` tensor of unnormalized score values.
@@ -320,7 +453,6 @@ def _luong_score(query, keys, scale):
         "Query (%s) has units: %s.  Keys (%s) have units: %s.  "
         "Perhaps you need to set num_units to the keys' dimension (%s)?"
         % (query, depth, keys, key_units, key_units))
-  dtype = query.dtype
 
   # Reshape from [batch_size, depth] to [batch_size, 1, depth]
   # for matmul.
@@ -338,12 +470,8 @@ def _luong_score(query, keys, scale):
   score = math_ops.matmul(query, keys, transpose_b=True)
   score = array_ops.squeeze(score, [1])
 
-  if scale:
-    # Scalar used in weight scaling
-    g = variable_scope.get_variable(
-        "attention_g", dtype=dtype,
-        initializer=init_ops.ones_initializer, shape=())
-    score = g * score
+  if scale is not None:
+    score = scale * score
   return score
 
 
@@ -354,8 +482,8 @@ class LuongAttention(_BaseAttentionMechanism):
   as described in:
 
   Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
-  "Effective Approaches to Attention-based Neural Machine Translation."
-  EMNLP 2015.  https://arxiv.org/abs/1508.04025
+  [Effective Approaches to Attention-based Neural Machine Translation.
+  EMNLP 2015.](https://arxiv.org/abs/1508.04025)
 
   The second is the scaled form inspired partly by the normalized form of
   Bahdanau attention.
@@ -429,13 +557,125 @@ class LuongAttention(_BaseAttentionMechanism):
         `max_time`).
     """
     with variable_scope.variable_scope(None, "luong_attention", [query]):
-      score = _luong_score(query, self._keys, self._scale)
+      attention_g = None
+      if self._scale:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.ones_initializer, shape=())
+      score = _luong_score(query, self._keys, attention_g)
     alignments = self._probability_fn(score, state)
     next_state = alignments
     return alignments, next_state
 
 
-def _bahdanau_score(processed_query, keys, normalize):
+class LuongAttentionV2(_BaseAttentionMechanismV2):
+  """Implements Luong-style (multiplicative) attention scoring.
+
+  This attention has two forms.  The first is standard Luong attention,
+  as described in:
+
+  Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
+  [Effective Approaches to Attention-based Neural Machine Translation.
+  EMNLP 2015.](https://arxiv.org/abs/1508.04025)
+
+  The second is the scaled form inspired partly by the normalized form of
+  Bahdanau attention.
+
+  To enable the second form, construct the object with parameter
+  `scale=True`.
+  """
+
+  def __init__(self,
+               units,
+               scale=False,
+               probability_fn="softmax",
+               dtype=None,
+               name="LuongAttention",
+               **kwargs):
+    """Construct the AttentionMechanism mechanism.
+
+    Args:
+      units: The depth of the attention mechanism.
+      scale: Python boolean. Whether to scale the energy term.
+      probability_fn: (optional) string, the name of function to convert the
+        attention score to probabilities. The default is `softmax` which is
+        `tf.nn.softmax`. Other options is `hardmax`, which is hardmax() within
+        this module. Any other value will result intovalidation error. Default
+        to use `softmax`.
+      dtype: The data type for the memory layer of the attention mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    # For LuongAttention, we only transform the memory layer; thus
+    # num_units **must** match expected the query depth.
+    self.probability_fn_name = probability_fn
+    probability_fn = self._process_probability_fn(self.probability_fn_name)
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
+    if dtype is None:
+      dtype = dtypes.float32
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    super(LuongAttentionV2, self).__init__(
+        query_layer=None,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+    self.units = units
+    self.scale = scale
+
+  def build(self, input_shape):
+    super(LuongAttentionV2, self).build(input_shape)
+    if self.scale:
+      self.scale_weight = self.add_weight(
+          "attention_g", initializer=init_ops.ones_initializer, shape=())
+    else:
+      self.scale_weight = None
+    self.built = True
+
+  def calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+      next_state: Same as the alignments.
+    """
+    score = _luong_score(query, self.keys, self.scale_weight)
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "scale": self.scale,
+        "probability_fn": self.probability_fn_name,
+    }
+    base_config = super(LuongAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
+def _bahdanau_score(processed_query, keys, attention_v,
+                    attention_g=None, attention_b=None):
   """Implements Bahdanau-style (additive) scoring function.
 
   This attention has two forms.  The first is Bhandanau attention,
@@ -453,41 +693,28 @@ def _bahdanau_score(processed_query, keys, normalize):
    Training of Deep Neural Networks."
   https://arxiv.org/abs/1602.07868
 
-  To enable the second form, set `normalize=True`.
+  To enable the second form, set please pass in attention_g and attention_b.
 
   Args:
     processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys.
     keys: Processed memory, shape `[batch_size, max_time, num_units]`.
-    normalize: Whether to normalize the score function.
+    attention_v: Tensor, shape `[num_units]`.
+    attention_g: Optional scalar tensor for normalization.
+    attention_b: Optional tensor with shape `[num_units]` for normalization.
 
   Returns:
     A `[batch_size, max_time]` tensor of unnormalized score values.
   """
-  dtype = processed_query.dtype
-  # Get the number of hidden units from the trailing dimension of keys
-  num_units = tensor_shape.dimension_value(
-      keys.shape[2]) or array_ops.shape(keys)[2]
   # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
   processed_query = array_ops.expand_dims(processed_query, 1)
-  v = variable_scope.get_variable(
-      "attention_v", [num_units], dtype=dtype)
-  if normalize:
-    # Scalar used in weight normalization
-    g = variable_scope.get_variable(
-        "attention_g", dtype=dtype,
-        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
-        shape=())
-    # Bias added prior to the nonlinearity
-    b = variable_scope.get_variable(
-        "attention_b", [num_units], dtype=dtype,
-        initializer=init_ops.zeros_initializer())
-    # normed_v = g * v / ||v||
-    normed_v = g * v * math_ops.rsqrt(
-        math_ops.reduce_sum(math_ops.square(v)))
+  if attention_g is not None and attention_b is not None:
+    normed_v = attention_g * attention_v * math_ops.rsqrt(
+        math_ops.reduce_sum(math_ops.square(attention_v)))
     return math_ops.reduce_sum(
-        normed_v * math_ops.tanh(keys + processed_query + b), [2])
+        normed_v * math_ops.tanh(keys + processed_query + attention_b), [2])
   else:
-    return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
+    return math_ops.reduce_sum(
+        attention_v * math_ops.tanh(keys + processed_query), [2])
 
 
 class BahdanauAttention(_BaseAttentionMechanism):
@@ -578,12 +805,152 @@ class BahdanauAttention(_BaseAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
-      score = _bahdanau_score(processed_query, self._keys, self._normalize)
+      attention_v = variable_scope.get_variable(
+          "attention_v", [self._num_units], dtype=query.dtype)
+      if not self._normalize:
+        attention_g = None
+        attention_b = None
+      else:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.constant_initializer(
+                math.sqrt((1. / self._num_units))),
+            shape=())
+        attention_b = variable_scope.get_variable(
+            "attention_b", [self._num_units], dtype=query.dtype,
+            initializer=init_ops.zeros_initializer())
+
+      score = _bahdanau_score(processed_query, self._keys, attention_v,
+                              attention_g=attention_g, attention_b=attention_b)
     alignments = self._probability_fn(score, state)
     next_state = alignments
     return alignments, next_state
 
 
+class BahdanauAttentionV2(_BaseAttentionMechanismV2):
+  """Implements Bahdanau-style (additive) attention.
+
+  This attention has two forms.  The first is Bahdanau attention,
+  as described in:
+
+  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
+  "Neural Machine Translation by Jointly Learning to Align and Translate."
+  ICLR 2015. https://arxiv.org/abs/1409.0473
+
+  The second is the normalized form.  This form is inspired by the
+  weight normalization article:
+
+  Tim Salimans, Diederik P. Kingma.
+  "Weight Normalization: A Simple Reparameterization to Accelerate
+   Training of Deep Neural Networks."
+  https://arxiv.org/abs/1602.07868
+
+  To enable the second form, construct the object with parameter
+  `normalize=True`.
+  """
+
+  def __init__(self,
+               units,
+               normalize=False,
+               probability_fn="softmax",
+               dtype=None,
+               name="BahdanauAttention",
+               **kwargs):
+    """Construct the Attention mechanism.
+
+    Args:
+      units: The depth of the query mechanism.
+      normalize: Python boolean.  Whether to normalize the energy term.
+      probability_fn: (optional) string, the name of function to convert the
+        attention score to probabilities. The default is `softmax` which is
+        `tf.nn.softmax`. Other options is `hardmax`, which is hardmax() within
+        this module. Any other value will result into validation error. Default
+        to use `softmax`.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    self.probability_fn_name = probability_fn
+    probability_fn = self._process_probability_fn(self.probability_fn_name)
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
+    if dtype is None:
+      dtype = dtypes.float32
+    query_layer = kwargs.pop("query_layer", None)
+    if not query_layer:
+      query_layer = layers.Dense(
+          units, name="query_layer", use_bias=False, dtype=dtype)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    super(BahdanauAttentionV2, self).__init__(
+        query_layer=query_layer,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+    self.units = units
+    self.normalize = normalize
+
+  def build(self, input_shape):
+    super(BahdanauAttentionV2, self).build(input_shape)
+    self.attention_v = self.add_weight(
+        "attention_v", [self.units], dtype=self.dtype)
+    if self.normalize:
+      self.attention_g = self.add_weight(
+          "attention_g", initializer=init_ops.constant_initializer(
+              math.sqrt((1. / self.units))), shape=())
+      self.attention_b = self.add_weight(
+          "attention_b", shape=[self.units],
+          initializer=init_ops.zeros_initializer())
+    else:
+      self.attention_g = None
+      self.attention_b = None
+    self.built = True
+
+  def calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+      next_state: same as alignments.
+    """
+    processed_query = self.query_layer(query) if self.query_layer else query
+    score = _bahdanau_score(processed_query, self.keys, self.attention_v,
+                            attention_g=self.attention_g,
+                            attention_b=self.attention_b)
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "normalize": self.normalize,
+        "probability_fn": self.probability_fn_name,
+    }
+    base_config = super(BahdanauAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
 def safe_cumprod(x, *args, **kwargs):
   """Computes cumprod of x in logspace using cumsum to avoid underflow.
 
@@ -766,6 +1133,34 @@ class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
         dtype=dtype)
 
 
+class _BaseMonotonicAttentionMechanismV2(_BaseAttentionMechanismV2):
+  """Base attention mechanism for monotonic attention.
+
+  Simply overrides the initial_alignments function to provide a dirac
+  distribution, which is needed in order for the monotonic attention
+  distributions to have the correct behavior.
+  """
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the monotonic attentions.
+
+    Initializes to dirac distributions, i.e. [1, 0, 0, ...memory length..., 0]
+    for all entries in the batch.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return array_ops.one_hot(
+        array_ops.zeros((batch_size,), dtype=dtypes.int32), max_time,
+        dtype=dtype)
+
+
 class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Bahadanau-style energy function.
 
@@ -860,7 +1255,22 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
     with variable_scope.variable_scope(
         None, "bahdanau_monotonic_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
-      score = _bahdanau_score(processed_query, self._keys, self._normalize)
+      attention_v = variable_scope.get_variable(
+          "attention_v", [self._num_units], dtype=query.dtype)
+      if not self._normalize:
+        attention_g = None
+        attention_b = None
+      else:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.constant_initializer(
+                math.sqrt((1. / self._num_units))),
+            shape=())
+        attention_b = variable_scope.get_variable(
+            "attention_b", [self._num_units], dtype=query.dtype,
+            initializer=init_ops.zeros_initializer())
+      score = _bahdanau_score(processed_query, self._keys, attention_v,
+                              attention_g=attention_g, attention_b=attention_b)
       score_bias = variable_scope.get_variable(
           "attention_score_bias", dtype=processed_query.dtype,
           initializer=self._score_bias_init)
@@ -870,6 +1280,146 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
     return alignments, next_state
 
 
+class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
+  """Monotonic attention mechanism with Bahadanau-style energy function.
+
+  This type of attention enforces a monotonic constraint on the attention
+  distributions; that is once the model attends to a given point in the memory
+  it can't attend to any prior points at subsequence output timesteps.  It
+  achieves this by using the _monotonic_probability_fn instead of softmax to
+  construct its attention distributions.  Since the attention scores are passed
+  through a sigmoid, a learnable scalar bias parameter is applied after the
+  score function and before the sigmoid.  Otherwise, it is equivalent to
+  BahdanauAttention.  This approach is proposed in
+
+  Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.  https://arxiv.org/abs/1704.00784
+  """
+
+  def __init__(self,
+               units,
+               normalize=False,
+               sigmoid_noise=0.,
+               sigmoid_noise_seed=None,
+               score_bias_init=0.,
+               mode="parallel",
+               dtype=None,
+               name="BahdanauMonotonicAttention",
+               **kwargs):
+    """Construct the Attention mechanism.
+
+    Args:
+      units: The depth of the query mechanism.
+      normalize: Python boolean. Whether to normalize the energy term.
+      sigmoid_noise: Standard deviation of pre-sigmoid noise. See the docstring
+        for `_monotonic_probability_fn` for more information.
+      sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise.
+      score_bias_init: Initial value for score bias scalar. It's recommended to
+        initialize this to a negative value when the length of the memory is
+        large.
+      mode: How to compute the attention distribution. Must be one of
+        'recursive', 'parallel', or 'hard'. See the docstring for
+        `tf.contrib.seq2seq.monotonic_attention` for more information.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    # Set up the monotonic probability fn with supplied parameters
+    if dtype is None:
+      dtype = dtypes.float32
+    wrapped_probability_fn = functools.partial(
+        _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
+        seed=sigmoid_noise_seed)
+    query_layer = kwargs.pop("query_layer", None)
+    if not query_layer:
+      query_layer = layers.Dense(
+          units, name="query_layer", use_bias=False, dtype=dtype)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    super(BahdanauMonotonicAttentionV2, self).__init__(
+        query_layer=query_layer,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+    self.units = units
+    self.normalize = normalize
+    self.sigmoid_noise = sigmoid_noise
+    self.sigmoid_noise_seed = sigmoid_noise_seed
+    self.score_bias_init = score_bias_init
+    self.mode = mode
+
+  def build(self, input_shape):
+    super(BahdanauMonotonicAttentionV2, self).build(input_shape)
+    self.attention_v = self.add_weight(
+        "attention_v", [self.units], dtype=self.dtype)
+    self.attention_score_bias = self.add_weight(
+        "attention_score_bias", shape=(), dtype=self.dtype,
+        initializer=init_ops.constant_initializer(
+            self.score_bias_init, dtype=self.dtype))
+    if not self.normalize:
+      self.attention_g = None
+      self.attention_b = None
+    else:
+      self.attention_g = self.add_weight(
+          "attention_g", dtype=self.dtype,
+          initializer=init_ops.constant_initializer(
+              math.sqrt((1. / self.units))),
+          shape=())
+      self.attention_b = self.add_weight(
+          "attention_b", [self.units], dtype=self.dtype,
+          initializer=init_ops.zeros_initializer())
+    self.built = True
+
+  def calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+    """
+    processed_query = self.query_layer(query) if self.query_layer else query
+    score = _bahdanau_score(processed_query, self.keys, self.attention_v,
+                            attention_g=self.attention_g,
+                            attention_b=self.attention_b)
+    score += self.attention_score_bias
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "normalize": self.normalize,
+        "sigmoid_noise": self.sigmoid_noise,
+        "sigmoid_noise_seed": self.sigmoid_noise_seed,
+        "score_bias_init": self.score_bias_init,
+        "mode": self.mode,
+    }
+    base_config = super(BahdanauMonotonicAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
 class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Luong-style energy function.
 
@@ -960,7 +1510,12 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "luong_monotonic_attention",
                                        [query]):
-      score = _luong_score(query, self._keys, self._scale)
+      attention_g = None
+      if self._scale:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.ones_initializer, shape=())
+      score = _luong_score(query, self._keys, attention_g)
       score_bias = variable_scope.get_variable(
           "attention_score_bias", dtype=query.dtype,
           initializer=self._score_bias_init)
@@ -970,6 +1525,129 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
     return alignments, next_state
 
 
+class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
+  """Monotonic attention mechanism with Luong-style energy function.
+
+  This type of attention enforces a monotonic constraint on the attention
+  distributions; that is once the model attends to a given point in the memory
+  it can't attend to any prior points at subsequence output timesteps.  It
+  achieves this by using the _monotonic_probability_fn instead of softmax to
+  construct its attention distributions.  Otherwise, it is equivalent to
+  LuongAttention.  This approach is proposed in
+
+  [Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.](https://arxiv.org/abs/1704.00784)
+  """
+
+  def __init__(self,
+               units,
+               scale=False,
+               sigmoid_noise=0.,
+               sigmoid_noise_seed=None,
+               score_bias_init=0.,
+               mode="parallel",
+               dtype=None,
+               name="LuongMonotonicAttention",
+               **kwargs):
+    """Construct the Attention mechanism.
+
+    Args:
+      units: The depth of the query mechanism.
+      scale: Python boolean.  Whether to scale the energy term.
+      sigmoid_noise: Standard deviation of pre-sigmoid noise.  See the docstring
+        for `_monotonic_probability_fn` for more information.
+      sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise.
+      score_bias_init: Initial value for score bias scalar.  It's recommended to
+        initialize this to a negative value when the length of the memory is
+        large.
+      mode: How to compute the attention distribution.  Must be one of
+        'recursive', 'parallel', or 'hard'.  See the docstring for
+        `tf.contrib.seq2seq.monotonic_attention` for more information.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    # Set up the monotonic probability fn with supplied parameters
+    if dtype is None:
+      dtype = dtypes.float32
+    wrapped_probability_fn = functools.partial(
+        _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
+        seed=sigmoid_noise_seed)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    super(LuongMonotonicAttentionV2, self).__init__(
+        query_layer=None,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+    self.units = units
+    self.scale = scale
+    self.sigmoid_noise = sigmoid_noise
+    self.sigmoid_noise_seed = sigmoid_noise_seed
+    self.score_bias_init = score_bias_init
+    self.mode = mode
+
+  def build(self, input_shape):
+    super(LuongMonotonicAttentionV2, self).build(input_shape)
+    if self.scale:
+      self.attention_g = self.add_weight(
+          "attention_g", initializer=init_ops.ones_initializer, shape=())
+    else:
+      self.attention_g = None
+    self.attention_score_bias = self.add_weight(
+        "attention_score_bias", shape=(),
+        initializer=init_ops.constant_initializer(
+            self.score_bias_init, dtype=self.dtype))
+    self.built = True
+
+  def calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+      next_state: Same as alignments
+    """
+    score = _luong_score(query, self.keys, self.attention_g)
+    score += self.attention_score_bias
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "scale": self.scale,
+        "sigmoid_noise": self.sigmoid_noise,
+        "sigmoid_noise_seed": self.sigmoid_noise_seed,
+        "score_bias_init": self.score_bias_init,
+        "mode": self.mode,
+    }
+    base_config = super(LuongMonotonicAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
 class AttentionWrapperState(
     collections.namedtuple("AttentionWrapperState",
                            ("cell_state", "attention", "time", "alignments",
@@ -1026,6 +1704,97 @@ class AttentionWrapperState(
         super(AttentionWrapperState, self)._replace(**kwargs))
 
 
+def _prepare_memory(memory, memory_sequence_length=None, memory_mask=None,
+                    check_inner_dims_defined=True):
+  """Convert to tensor and possibly mask `memory`.
+
+  Args:
+    memory: `Tensor`, shaped `[batch_size, max_time, ...]`.
+    memory_sequence_length: `int32` `Tensor`, shaped `[batch_size]`.
+    memory_mask: `boolean` tensor with shape [batch_size, max_time]. The memory
+      should be skipped when the corresponding mask is False.
+    check_inner_dims_defined: Python boolean.  If `True`, the `memory`
+      argument's shape is checked to ensure all but the two outermost
+      dimensions are fully defined.
+
+  Returns:
+    A (possibly masked), checked, new `memory`.
+
+  Raises:
+    ValueError: If `check_inner_dims_defined` is `True` and not
+      `memory.shape[2:].is_fully_defined()`.
+  """
+  memory = nest.map_structure(
+      lambda m: ops.convert_to_tensor(m, name="memory"), memory)
+  if memory_sequence_length is not None and memory_mask is not None:
+    raise ValueError("memory_sequence_length and memory_mask can't be provided "
+                     "at same time.")
+  if memory_sequence_length is not None:
+    memory_sequence_length = ops.convert_to_tensor(
+        memory_sequence_length, name="memory_sequence_length")
+  if check_inner_dims_defined:
+    def _check_dims(m):
+      if not m.get_shape()[2:].is_fully_defined():
+        raise ValueError("Expected memory %s to have fully defined inner dims, "
+                         "but saw shape: %s" % (m.name, m.get_shape()))
+    nest.map_structure(_check_dims, memory)
+  if memory_sequence_length is None and memory_mask is None:
+    seq_len_mask = None
+    seq_len_batch_size = None
+  elif memory_sequence_length is not None:
+    seq_len_mask = array_ops.sequence_mask(
+        memory_sequence_length,
+        maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
+        dtype=nest.flatten(memory)[0].dtype)
+    seq_len_batch_size = (
+        tensor_shape.dimension_value(memory_sequence_length.shape[0])
+        or array_ops.shape(memory_sequence_length)[0])
+  else:
+    # For memory_mask is not None
+    seq_len_mask = memory_mask
+    seq_len_batch_size = (
+        tensor_shape.dimension_value(memory_mask.shape[0])
+        or array_ops.shape(memory_mask)[0])
+  def _maybe_mask(m, seq_len_mask):
+    """Mask the memory based on the memory mask."""
+    rank = m.get_shape().ndims
+    rank = rank if rank is not None else array_ops.rank(m)
+    extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
+    m_batch_size = tensor_shape.dimension_value(
+        m.shape[0]) or array_ops.shape(m)[0]
+    if seq_len_batch_size is not None:
+      message = ("memory_sequence_length and memory tensor batch sizes do not "
+                 "match.")
+      with ops.control_dependencies([
+          check_ops.assert_equal(
+              seq_len_batch_size, m_batch_size, message=message)]):
+        seq_len_mask = array_ops.reshape(
+            seq_len_mask,
+            array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
+        return m * seq_len_mask
+    else:
+      return m
+  return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
+
+
+def _maybe_mask_score(score, memory_sequence_length=None, memory_mask=None,
+                      score_mask_value=None):
+  """Mask the attention score based on the masks."""
+  if memory_sequence_length is None and memory_mask is None:
+    return score
+  if memory_sequence_length is not None and memory_mask is not None:
+    raise ValueError("memory_sequence_length and memory_mask can't be provided "
+                     "at same time.")
+  if memory_sequence_length is not None:
+    message = "All values in memory_sequence_length must greater than zero."
+    with ops.control_dependencies(
+        [check_ops.assert_positive(memory_sequence_length, message=message)]):
+      memory_mask = array_ops.sequence_mask(
+          memory_sequence_length, maxlen=array_ops.shape(score)[1])
+  score_mask_values = score_mask_value * array_ops.ones_like(score)
+  return array_ops.where(memory_mask, score, score_mask_values)
+
+
 def hardmax(logits, name=None):
   """Returns batched one-hot vectors.
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index 3245cc5e72154289ea3ba000b9a30586a7ad03a9..033c2eb0801d5a51ee937f5e960faa91a6f1ae54 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -32,9 +32,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -51,6 +50,68 @@ __all__ = [
 _transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
 
 
+# The following sample functions (_call_sampler, bernoulli_sample,
+# categorical_sample) mimic TensorFlow Probability distribution semantics.
+
+
+def _call_sampler(sample_n_fn, sample_shape, name=None):
+  """Reshapes vector of samples."""
+  with ops.name_scope(name, "call_sampler", values=[sample_shape]):
+    sample_shape = ops.convert_to_tensor(
+        sample_shape, dtype=dtypes.int32, name="sample_shape")
+    # Ensure sample_shape is a vector (vs just a scalar).
+    pad = math_ops.cast(math_ops.equal(array_ops.rank(sample_shape), 0),
+                        dtypes.int32)
+    sample_shape = array_ops.reshape(
+        sample_shape,
+        array_ops.pad(array_ops.shape(sample_shape),
+                      paddings=[[pad, 0]],
+                      constant_values=1))
+    samples = sample_n_fn(math_ops.reduce_prod(sample_shape))
+    batch_event_shape = array_ops.shape(samples)[1:]
+    final_shape = array_ops.concat([sample_shape, batch_event_shape], 0)
+    return array_ops.reshape(samples, final_shape)
+
+
+def bernoulli_sample(probs=None, logits=None, dtype=dtypes.int32,
+                     sample_shape=(), seed=None):
+  """Samples from Bernoulli distribution."""
+  if probs is None:
+    probs = math_ops.sigmoid(logits, name="probs")
+  else:
+    probs = ops.convert_to_tensor(probs, name="probs")
+  batch_shape_tensor = array_ops.shape(probs)
+  def _sample_n(n):
+    """Sample vector of Bernoullis."""
+    new_shape = array_ops.concat([[n], batch_shape_tensor], 0)
+    uniform = random_ops.random_uniform(
+        new_shape, seed=seed, dtype=probs.dtype)
+    return math_ops.cast(math_ops.less(uniform, probs), dtype)
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def categorical_sample(logits, dtype=dtypes.int32,
+                       sample_shape=(), seed=None):
+  """Samples from categorical distribution."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  event_size = array_ops.shape(logits)[-1]
+  batch_shape_tensor = array_ops.shape(logits)[:-1]
+  def _sample_n(n):
+    """Sample vector of categoricals."""
+    if logits.shape.ndims == 2:
+      logits_2d = logits
+    else:
+      logits_2d = array_ops.reshape(logits, [-1, event_size])
+    sample_dtype = dtypes.int64 if logits.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
+    draws = array_ops.reshape(
+        array_ops.transpose(draws),
+        array_ops.concat([[n], batch_shape_tensor], 0))
+    return math_ops.cast(draws, dtype)
+  return _call_sampler(_sample_n, sample_shape)
+
+
 def _unstack_ta(inp):
   return tensor_array_ops.TensorArray(
       dtype=inp.dtype, size=array_ops.shape(inp)[0],
@@ -307,14 +368,14 @@ class ScheduledEmbeddingTrainingHelper(TrainingHelper):
     with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
                         [time, outputs, state]):
       # Return -1s where we did not sample, and sample_ids elsewhere
-      select_sampler = bernoulli.Bernoulli(
-          probs=self._sampling_probability, dtype=dtypes.bool)
-      select_sample = select_sampler.sample(
-          sample_shape=self.batch_size, seed=self._scheduling_seed)
-      sample_id_sampler = categorical.Categorical(logits=outputs)
+      select_sample = bernoulli_sample(
+          probs=self._sampling_probability,
+          dtype=dtypes.bool,
+          sample_shape=self.batch_size,
+          seed=self._scheduling_seed)
       return array_ops.where(
           select_sample,
-          sample_id_sampler.sample(seed=self._seed),
+          categorical_sample(logits=outputs, seed=self._seed),
           gen_array_ops.fill([self.batch_size], -1))
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
@@ -425,8 +486,10 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
   def sample(self, time, outputs, state, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperSample",
                         [time, outputs, state]):
-      sampler = bernoulli.Bernoulli(probs=self._sampling_probability)
-      return sampler.sample(sample_shape=self.batch_size, seed=self._seed)
+      return bernoulli_sample(
+          probs=self._sampling_probability,
+          sample_shape=self.batch_size,
+          seed=self._seed)
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperNextInputs",
@@ -610,8 +673,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper):
     else:
       logits = outputs / self._softmax_temperature
 
-    sample_id_sampler = categorical.Categorical(logits=logits)
-    sample_ids = sample_id_sampler.sample(seed=self._seed)
+    sample_ids = categorical_sample(logits=logits, seed=self._seed)
 
     return sample_ids
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index 39a6d2f58b140706a94d83273d3327edd1891368..0fbfd6187030f14ac105a18b3e09b7a42d4de32a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -20,11 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.losses import Loss
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 
-__all__ = ["sequence_loss"]
+__all__ = ["sequence_loss", "SequenceLoss"]
 
 
 def sequence_loss(logits,
@@ -32,16 +33,26 @@ def sequence_loss(logits,
                   weights,
                   average_across_timesteps=True,
                   average_across_batch=True,
+                  sum_over_timesteps=False,
+                  sum_over_batch=False,
                   softmax_loss_function=None,
                   name=None):
   """Weighted cross-entropy loss for a sequence of logits.
 
-  Depending on the values of `average_across_timesteps` and
-  `average_across_batch`, the return Tensor will have rank 0, 1, or 2 as these
-  arguments reduce the cross-entropy at each target, which has shape
-  `[batch_size, sequence_length]`, over their respective dimensions. For
-  example, if `average_across_timesteps` is `True` and `average_across_batch`
-  is `False`, then the return Tensor will have shape `[batch_size]`.
+  Depending on the values of `average_across_timesteps` / `sum_over_timesteps`
+  and `average_across_batch` / `sum_over_batch`, the return Tensor will have
+  rank 0, 1, or 2 as these arguments reduce the cross-entropy at each target,
+  which has shape `[batch_size, sequence_length]`, over their respective
+  dimensions. For example, if `average_across_timesteps` is `True` and
+  `average_across_batch` is `False`, then the return Tensor will have shape
+  `[batch_size]`.
+
+  Note that `average_across_timesteps` and `sum_over_timesteps` cannot be True
+  at same time. Same for `average_across_batch` and `sum_over_batch`.
+
+  The recommended loss reduction in tf 2.0 has been changed to sum_over, instead
+  of weighted average. User are recommend to use `sum_over_timesteps` and
+  `sum_over_batch` for reduction.
 
   Args:
     logits: A Tensor of shape
@@ -58,6 +69,12 @@ def sequence_loss(logits,
       dimension and divide the cost by the total label weight across timesteps.
     average_across_batch: If set, sum the cost across the batch dimension and
       divide the returned cost by the batch size.
+    sum_over_timesteps: If set, sum the cost across the sequence dimension and
+      divide the size of the sequence. Note that any element with 0 weights will
+      be excluded from size calculation.
+    sum_over_batch: if set, sum the cost across the batch dimension and divide
+      the total cost by the batch size. Not that any element with 0 weights will
+      be excluded from size calculation.
     softmax_loss_function: Function (labels, logits) -> loss-batch
       to be used instead of the standard softmax (the default if this is None).
       **Note that to avoid confusion, it is required for the function to accept
@@ -78,11 +95,15 @@ def sequence_loss(logits,
     raise ValueError("Logits must be a "
                      "[batch_size x sequence_length x logits] tensor")
   if len(targets.get_shape()) != 2:
-    raise ValueError("Targets must be a [batch_size x sequence_length] "
-                     "tensor")
+    raise ValueError("Targets must be a [batch_size x sequence_length] tensor")
   if len(weights.get_shape()) != 2:
-    raise ValueError("Weights must be a [batch_size x sequence_length] "
-                     "tensor")
+    raise ValueError("Weights must be a [batch_size x sequence_length] tensor")
+  if average_across_timesteps and sum_over_timesteps:
+    raise ValueError("average_across_timesteps and sum_over_timesteps cannot "
+                     "be set to True at same time.")
+  if average_across_batch and sum_over_batch:
+    raise ValueError("average_across_batch and sum_over_batch cannot be set "
+                     "to True at same time.")
   with ops.name_scope(name, "sequence_loss", [logits, targets, weights]):
     num_classes = array_ops.shape(logits)[2]
     logits_flat = array_ops.reshape(logits, [-1, num_classes])
@@ -96,20 +117,56 @@ def sequence_loss(logits,
     if average_across_timesteps and average_across_batch:
       crossent = math_ops.reduce_sum(crossent)
       total_size = math_ops.reduce_sum(weights)
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
+      crossent = math_ops.div_no_nan(crossent, total_size)
+    elif sum_over_timesteps and sum_over_batch:
+      crossent = math_ops.reduce_sum(crossent)
+      total_count = math_ops.cast(math_ops.count_nonzero(weights),
+                                  crossent.dtype)
+      crossent = math_ops.div_no_nan(crossent, total_count)
     else:
-      batch_size = array_ops.shape(logits)[0]
-      sequence_length = array_ops.shape(logits)[1]
-      crossent = array_ops.reshape(crossent, [batch_size, sequence_length])
-    if average_across_timesteps and not average_across_batch:
-      crossent = math_ops.reduce_sum(crossent, axis=[1])
-      total_size = math_ops.reduce_sum(weights, axis=[1])
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
-    if not average_across_timesteps and average_across_batch:
-      crossent = math_ops.reduce_sum(crossent, axis=[0])
-      total_size = math_ops.reduce_sum(weights, axis=[0])
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
+      crossent = array_ops.reshape(crossent, array_ops.shape(logits)[0:2])
+      if average_across_timesteps or average_across_batch:
+        reduce_axis = [0] if average_across_batch else [1]
+        crossent = math_ops.reduce_sum(crossent, axis=reduce_axis)
+        total_size = math_ops.reduce_sum(weights, axis=reduce_axis)
+        crossent = math_ops.div_no_nan(crossent, total_size)
+      elif sum_over_timesteps or sum_over_batch:
+        reduce_axis = [0] if sum_over_batch else [1]
+        crossent = math_ops.reduce_sum(crossent, axis=reduce_axis)
+        total_count = math_ops.cast(
+            math_ops.count_nonzero(weights, axis=reduce_axis),
+            dtype=crossent.dtype)
+        crossent = math_ops.div_no_nan(crossent, total_count)
     return crossent
+
+
+class SequenceLoss(Loss):
+  """Weighted cross-entropy loss for a sequence of logits."""
+
+  def __init__(self,
+               average_across_timesteps=False,
+               average_across_batch=False,
+               sum_over_timesteps=True,
+               sum_over_batch=True,
+               softmax_loss_function=None,
+               name=None):
+    super(SequenceLoss, self).__init__(name=name)
+    self.average_across_timesteps = average_across_timesteps
+    self.average_across_batch = average_across_batch
+    self.sum_over_timesteps = sum_over_timesteps
+    self.sum_over_batch = sum_over_batch
+    self.softmax_loss_function = softmax_loss_function
+
+  def __call__(self, y_true, y_pred, sample_weight=None):
+    """Override the parent __call__ to have a customized reduce behavior."""
+    return sequence_loss(y_pred, y_true, sample_weight,
+                         average_across_timesteps=self.average_across_timesteps,
+                         average_across_batch=self.average_across_batch,
+                         sum_over_timesteps=self.sum_over_timesteps,
+                         sum_over_batch=self.sum_over_batch,
+                         softmax_loss_function=self.softmax_loss_function,
+                         name=self.name)
+
+  def call(self, y_true, y_pred):
+    # Skip this method since the __call__ contains real implementation.
+    pass
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index 08983337fccc138d40eb959cecc5bf9e47cf6cac..f3efd292cf5acba4319c8a5545a7f70fae4b5ce1 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -304,10 +304,10 @@ class Exporter(object):
       def parser(path):
         if os.name == "nt":
           match = re.match(
-              "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$",
+              r"^" + export_dir_base.replace("\\", "/") + r"/(\d{8})$",
               path.path.replace("\\", "/"))
         else:
-          match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path)
+          match = re.match(r"^" + export_dir_base + r"/(\d{8})$", path.path)
         if not match:
           return None
         return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/session_bundle/gc_test.py b/tensorflow/contrib/session_bundle/gc_test.py
index 8faf3ef3d4cd7ee0096265283070e25d06782254..02725bb1cbb4ef9ace29dcc58f6d23fb241d96b2 100644
--- a/tensorflow/contrib/session_bundle/gc_test.py
+++ b/tensorflow/contrib/session_bundle/gc_test.py
@@ -104,7 +104,7 @@ class GcTest(test_util.TensorFlowTestCase):
 
     # create a simple parser that pulls the export_version from the directory.
     def parser(path):
-      match = re.match("^" + base_dir + "/(\\d+)$", path.path)
+      match = re.match(r"^" + base_dir + r"/(\d+)$", path.path)
       if not match:
         return None
       return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index d7ba754f701d4b433e35ad8396eae7ee6132b97f..ed4eca1a60a6f0ccf629d8aa7906c02092e25ba0 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -49,6 +49,9 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
 
 cuda_py_tests(
@@ -64,4 +67,7 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index d8236a0a6fa6d0d0e383e454eb0146bb10b6f49d..0d87cea9fbaa8fe28b55ec996414a568d39efee3 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -50,9 +50,10 @@ def _accuracy(predictions, targets, weights=None):
 def _r2(probabilities, targets, weights=None):
   targets = math_ops.to_float(targets)
   y_mean = math_ops.reduce_mean(targets, 0)
-  squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0)
+  squares_total = math_ops.reduce_sum(
+      math_ops.squared_difference(targets, y_mean), 0)
   squares_residuals = math_ops.reduce_sum(
-      math_ops.square(targets - probabilities), 0)
+      math_ops.squared_difference(targets, probabilities), 0)
   score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
   return metrics.mean(score, weights=weights)
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index e04eb60f9b27cfd8b6b4e1502594d4d310ae55cc..774da472f1543f938d1b607ebdef008f7b540211 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -18,10 +18,10 @@
 #include <limits>
 
 #include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/lib/strings/strcat.h"
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index d3edb43733761a906c6e5bf8b65f76e3e1ae56fc..3100a5a0e5da1103b61bd089cd433721686b9e72 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -32,7 +32,7 @@ class DecisionTreeResource : public ResourceBase {
   // Constructor.
   explicit DecisionTreeResource(const TensorForestParams& params);
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("DecisionTree[size=",
                            decision_tree_->decision_tree().nodes_size(), "]");
   }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index eea0be27caf0a022ba7acaacd359c75a2df4eedb..44f2b3f473b9eced06bd800b9cf0a5a0825ec3eb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -40,7 +40,7 @@ class FertileStatsResource : public ResourceBase {
     model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
   }
 
-  string DebugString() override { return "FertileStats"; }
+  string DebugString() const override { return "FertileStats"; }
 
   void ExtractFromProto(const FertileStats& stats);
 
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 784acce444a8d0c066f1b7ae6c1b5d7d65405549..67461450f8ae53739f619622de8751b654dbc082 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,18 +11,12 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
-    "tf_custom_op_library",
     "tf_custom_op_library_additional_deps",
-    "tf_gen_op_libs",
-    "tf_gen_op_wrapper_py",
 )
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "@local_config_tensorrt//:build_defs.bzl",
@@ -33,127 +27,17 @@ exports_files(glob([
     "test/testdata/*",
 ]))
 
-tf_cuda_cc_test(
-    name = "tensorrt_test_cc",
-    size = "small",
-    srcs = ["tensorrt_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        "//tensorflow/core:gpu_init",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_custom_op_library(
-    name = "python/ops/_trt_engine_op.so",
-    srcs = [
-        "ops/trt_engine_op.cc",
-    ],
-    deps = [
-        ":trt_shape_function",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
 tf_cuda_library(
     name = "trt_shape_function",
     srcs = ["shape_fn/trt_shfn.cc"],
     hdrs = ["shape_fn/trt_shfn.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":trt_logging",
-        ":trt_plugins",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]) + tf_custom_op_library_additional_deps(),
-)
-
-cc_library(
-    name = "trt_engine_op_kernel",
-    srcs = [
-        "kernels/trt_engine_op.cc",
-    ],
-    hdrs = [
-        "kernels/trt_engine_op.h",
-    ],
-    copts = tf_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        ":test_utils",
-        ":trt_allocator",
-        ":trt_conversion",
-        ":trt_logging",
-        ":trt_plugins",
-        ":trt_resources",
-        ":utils",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:stream_executor_headers_lib",
-        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/compiler/tf2tensorrt:trt_logging",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
     ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
+        "@local_config_tensorrt//:tensorrt",
     ]) + tf_custom_op_library_additional_deps(),
-    # TODO(laigd): fix this by merging header file in cc file.
-    alwayslink = 1,  # buildozer: disable=alwayslink-with-hdrs
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "trt_engine_op",
-    ],
-)
-
-tf_cuda_library(
-    name = "trt_logging",
-    srcs = ["log/trt_logger.cc"],
-    hdrs = ["log/trt_logger.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_gen_op_wrapper_py(
-    name = "trt_engine_op",
-    deps = [
-        ":trt_engine_op_op_lib",
-        ":trt_logging",
-        ":trt_shape_function",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "trt_engine_op_loader",
-    srcs = ["python/ops/trt_engine_op.py"],
-    dso = [
-        ":python/ops/_trt_engine_op.so",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-    kernels = [
-        ":trt_engine_op_kernel",
-        ":trt_engine_op_op_lib",
-        ":trt_shape_function",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:resources",
-    ],
 )
 
 py_library(
@@ -175,8 +59,8 @@ py_library(
     name = "trt_ops_py",
     srcs_version = "PY2AND3",
     deps = [
-        ":trt_engine_op",
-        ":trt_engine_op_loader",
+        "//tensorflow/compiler/tf2tensorrt:trt_ops",
+        "//tensorflow/compiler/tf2tensorrt:trt_ops_loader",
     ],
 )
 
@@ -205,247 +89,13 @@ tf_py_wrap_cc(
         "//tensorflow/python:platform/base.i",
     ],
     deps = [
-        ":test_utils",
-        ":trt_conversion",
-        ":trt_engine_op_kernel",
+        "//tensorflow/compiler/tf2tensorrt:test_utils",
+        "//tensorflow/compiler/tf2tensorrt:trt_conversion",
+        "//tensorflow/compiler/tf2tensorrt:trt_op_kernels",
         "//third_party/python_runtime:headers",
     ],
 )
 
-tf_cuda_library(
-    name = "trt_resources",
-    srcs = [
-        "resources/trt_int8_calibrator.cc",
-        "resources/trt_resource_manager.cc",
-    ],
-    hdrs = [
-        "resources/trt_int8_calibrator.h",
-        "resources/trt_resource_manager.h",
-        "resources/trt_resources.h",
-    ],
-    deps = [
-        ":trt_allocator",
-        ":trt_logging",
-        ":utils",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_cuda_library(
-    name = "trt_allocator",
-    srcs = ["resources/trt_allocator.cc"],
-    hdrs = ["resources/trt_allocator.h"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_cc_test(
-    name = "trt_allocator_test",
-    size = "small",
-    srcs = ["resources/trt_allocator_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_allocator",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-# Library for the node-level conversion portion of TensorRT operation creation
-tf_cuda_library(
-    name = "trt_conversion",
-    srcs = [
-        "convert/convert_graph.cc",
-        "convert/convert_nodes.cc",
-        "convert/trt_optimization_pass.cc",
-    ],
-    hdrs = [
-        "convert/convert_graph.h",
-        "convert/convert_nodes.h",
-        "convert/trt_optimization_pass.h",
-    ],
-    deps = [
-        ":segment",
-        ":test_utils",
-        ":trt_allocator",
-        ":trt_plugins",
-        ":trt_logging",
-        ":trt_resources",
-        ":utils",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:gpu_runtime",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:devices",
-        "//tensorflow/core/grappler/clusters:virtual_cluster",
-        "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/optimizers:meta_optimizer",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]) + tf_custom_op_library_additional_deps(),
-)
-
-tf_cuda_cc_test(
-    name = "convert_graph_test",
-    size = "medium",
-    srcs = ["convert/convert_graph_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_conversion",
-        "@com_google_googletest//:gtest",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "convert_nodes_test",
-    size = "medium",
-    srcs = ["convert/convert_nodes_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_logging",
-        ":trt_conversion",
-        ":trt_plugins",
-        "@com_google_googletest//:gtest",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-# Library for the segmenting portion of TensorRT operation creation
-cc_library(
-    name = "segment",
-    srcs = ["segment/segment.cc"],
-    hdrs = [
-        "segment/segment.h",
-        "segment/union_find.h",
-    ],
-    deps = [
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:protos_all_cc",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-)
-
-tf_cc_test(
-    name = "segment_test",
-    size = "small",
-    srcs = ["segment/segment_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":segment",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
-# Library for the plugin factory
-tf_cuda_library(
-    name = "trt_plugins",
-    srcs = [
-        "plugin/trt_plugin.cc",
-        "plugin/trt_plugin_factory.cc",
-        "plugin/trt_plugin_utils.cc",
-    ],
-    hdrs = [
-        "plugin/trt_plugin.h",
-        "plugin/trt_plugin_factory.h",
-        "plugin/trt_plugin_utils.h",
-    ],
-    deps = [
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "trt_plugin_factory_test",
-    size = "small",
-    srcs = ["plugin/trt_plugin_factory_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_plugins",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
 py_library(
     name = "tf_trt_integration_test_base",
     srcs = ["test/tf_trt_integration_test_base.py"],
@@ -491,6 +141,11 @@ cuda_py_tests(
         "test/binary_tensor_weight_broadcast_test.py",
         "test/concatenation_test.py",
         "test/const_broadcast_test.py",
+        "test/conv2d_test.py",
+        "test/dynamic_input_shapes_test.py",
+        "test/identity_output_test.py",
+        "test/int32_test.py",
+        "test/lru_cache_test.py",
         "test/manual_test.py",
         "test/memory_alignment_test.py",
         "test/multi_connection_neighbor_engine_test.py",
@@ -498,6 +153,8 @@ cuda_py_tests(
         "test/quantization_test.py",
         "test/rank_two_test.py",
         "test/reshape_transpose_test.py",
+        "test/topk_test.py",
+        "test/unary_test.py",
         "test/vgg_block_nchw_test.py",
         "test/vgg_block_test.py",
     ],
@@ -513,25 +170,6 @@ cuda_py_tests(
     ],
 )
 
-cuda_py_tests(
-    name = "tf_trt_integration_test_no_oss",
-    srcs = [
-        "test/unary_test.py",
-    ],
-    additional_deps = [
-        ":tf_trt_integration_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_oss",  # TODO(b/117274186): re-enable in OSS after crash fixed
-        "no_pip",  # TODO(b/117274186): re-enable in OSS after crash fixed
-        "no_windows",
-        "nomac",
-    ],
-)
-
 cuda_py_test(
     name = "quantization_mnist_test",
     srcs = ["test/quantization_mnist_test.py"],
@@ -556,22 +194,20 @@ cuda_py_test(
     ],
 )
 
-cc_library(
-    name = "utils",
-    srcs = ["convert/utils.cc"],
-    hdrs = ["convert/utils.h"],
-    copts = tf_copts(),
-    deps = [
-        "//tensorflow/core:lib",
-    ],
+# The following rules forward the libraries that were moved in order to not
+# break other internal targets.
+
+alias(
+    name = "trt_conversion",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_conversion",
 )
 
-cc_library(
-    name = "test_utils",
-    srcs = ["test/utils.cc"],
-    hdrs = ["test/utils.h"],
-    deps = [
-        "//tensorflow/core:lib",
-        "@com_googlesource_code_re2//:re2",
-    ],
+alias(
+    name = "trt_op_kernels",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_op_kernels",
+)
+
+alias(
+    name = "trt_engine_op_op_lib",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_engine_op_op_lib",
 )
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
index 69058c5826822c519a69d50860c06b8ab3ec6578..0a2cf105baf5efb62d0c535c1f2d081973ec0ea3 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -45,10 +45,10 @@ tf_custom_op_library(
         "inc_op_kernel.cu.cc",
     ],
     deps = [
-        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
         "//tensorflow/core:framework_lite",
     ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
+        "@local_config_tensorrt//:tensorrt",
     ]),
 )
 
@@ -64,10 +64,10 @@ tf_kernel_library(
         "inc_op_kernel.cu.cc",
     ],
     deps = [
-        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
         "//tensorflow/core:stream_executor_headers_lib",
     ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
+        "@local_config_tensorrt//:tensorrt",
     ]) + tf_custom_op_library_additional_deps(),
 )
 
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
index 8d4c893af56689185da72398919e2241d451594b..7c9075142a02546ddd580e861ac87cb86badd739 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h"
 
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
index 189e9c939b9ffd4450f7ba95fe1abdbbc049b430..fb048d7b19da0f010ed918b147013b20d37ed0dd 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <cassert>
 #include <cstring>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
index 7cdfe2b1a612be2eec473d806d0eb44b611ca68a..75490aecfbe84810520c82597d127a36d36de3ee 100644
--- a/tensorflow/contrib/tensorrt/python/__init__.py
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
 from tensorflow.contrib.tensorrt.python.trt_convert import add_test_value
 from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph
 from tensorflow.contrib.tensorrt.python.trt_convert import clear_test_values
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 203b2697babe32b45523109708cbf062dceee33b..49d72232aa0cfba3f5bf533de04f4d50e65275fd 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -45,12 +45,19 @@ from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import saver
 
-if _six.PY2:
-  _to_bytes = lambda s: s
-  _to_string = lambda s: s
-else:
-  _to_bytes = lambda s: s.encode("utf-8", errors="surrogateescape")
-  _to_string = lambda s: s.decode("utf-8")
+
+def _to_bytes(s):
+  """Encode s if it is a sequence of chars."""
+  if isinstance(s, _six.text_type):
+    return s.encode("utf-8", errors="surrogateescape")
+  return s
+
+
+def _to_string(s):
+  """Decode s if it is a sequence of bytes."""
+  if isinstance(s, _six.binary_type):
+    return s.decode("utf-8")
+  return s
 
 
 class TrtPrecisionMode(object):
@@ -70,7 +77,7 @@ def get_tensorrt_rewriter_config(rewriter_config=None,
                                  minimum_segment_size=3,
                                  is_dynamic_op=False,
                                  maximum_cached_engines=1,
-                                 cached_engine_batch_sizes=None,
+                                 cached_engine_batches=None,
                                  use_calibration=True):
   """Returns a RewriterConfig proto for TRT transformation.
 
@@ -90,9 +97,9 @@ def get_tensorrt_rewriter_config(rewriter_config=None,
       If the number of cached engines is already at max but none of them can
       serve the input, the TRTEngineOp will fall back to run the TF function
       based on which the TRTEngineOp is created.
-    cached_engine_batch_sizes: a list of batch sizes used to create cached
+    cached_engine_batches: a list of batch sizes used to create cached
       engines, only used when is_dynamic_op is True. The length of the list
-      should be smaller than maximum_cached_engines, and the dynamic TRT op will
+      should be <= maximum_cached_engines, and the dynamic TRT op will
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
@@ -143,14 +150,14 @@ def get_tensorrt_rewriter_config(rewriter_config=None,
       "max_workspace_size_bytes"].i = max_workspace_size_bytes
   optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
   optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
-  if cached_engine_batch_sizes:
-    if not isinstance(cached_engine_batch_sizes, list):
-      raise TypeError("cached_engine_batch_sizes should be a list.")
-    if len(cached_engine_batch_sizes) > maximum_cached_engines:
-      raise ValueError("cached_engine_batch_sizes should not contain more than "
+  if cached_engine_batches:
+    if not isinstance(cached_engine_batches, list):
+      raise TypeError("cached_engine_batches should be a list.")
+    if len(cached_engine_batches) > maximum_cached_engines:
+      raise ValueError("cached_engine_batches should not contain more than "
                        "maximum_cached_engines items.")
     optimizer.parameter_map["cached_engine_batches"].list.i.extend(
-        cached_engine_batch_sizes)
+        cached_engine_batches)
   optimizer.parameter_map["use_calibration"].b = use_calibration
   return rewriter_config_with_trt
 
@@ -163,7 +170,7 @@ def create_inference_graph(input_graph_def,
                            minimum_segment_size=3,
                            is_dynamic_op=False,
                            maximum_cached_engines=1,
-                           cached_engine_batch_sizes=None,
+                           cached_engine_batches=None,
                            use_calibration=True,
                            input_saved_model_dir=None,
                            input_saved_model_tags=None,
@@ -190,9 +197,9 @@ def create_inference_graph(input_graph_def,
       If the number of cached engines is already at max but none of them can
       serve the input, the TRTEngineOp will fall back to run the TF function
       based on which the TRTEngineOp is created.
-    cached_engine_batch_sizes: a list of batch sizes used to create cached
+    cached_engine_batches: a list of batch sizes used to create cached
       engines, only used when is_dynamic_op is True. The length of the list
-      should be smaller than maximum_cached_engines, and the dynamic TRT op will
+      should be <= maximum_cached_engines, and the dynamic TRT op will
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
@@ -354,7 +361,7 @@ def create_inference_graph(input_graph_def,
   rewriter_config_with_trt = get_tensorrt_rewriter_config(
       rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
       minimum_segment_size, is_dynamic_op, maximum_cached_engines,
-      cached_engine_batch_sizes, use_calibration)
+      cached_engine_batches, use_calibration)
   session_config_with_trt.graph_options.rewrite_options.CopyFrom(
       rewriter_config_with_trt)
 
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
index a7b2d2ea50543ba85c5a13dd6ca320e794ca47f1..abd822c7b71b4d7cca59482bdb51a922a28d480c 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.tensorrt.python import trt_convert
 # pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
 # pylint: enable=unused-import
+from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
@@ -57,7 +57,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         minimum_segment_size=10,
         is_dynamic_op=True,
         maximum_cached_engines=2,
-        cached_engine_batch_sizes=[1, 128])
+        cached_engine_batches=[1, 128])
     self.assertEqual(["constfold", "layout", "constfold"],
                      rewriter_cfg.optimizers)
     self.assertEqual(rewriter_config_pb2.RewriterConfig.ONE,
@@ -84,8 +84,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         trt_optimizer.parameter_map["precision_mode"].s)
     self.assertEqual(2, trt_optimizer.parameter_map["maximum_cached_engines"].i)
     self.assertEqual(
-        [1, 128],
-        trt_optimizer.parameter_map["cached_engine_batches"].list.i)
+        [1, 128], trt_optimizer.parameter_map["cached_engine_batches"].list.i)
 
   def _GetConfigProto(self):
     """Get ConfigProto for session creation."""
@@ -239,8 +238,8 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         # Run with batch size 2, a new engine is created and cached.
         self._TestRun(sess, 2, True)
         # Run with batch size 3, since the number of cached engines has reached
-        # the max, it should fall back to TF function.
-        self._TestRun(sess, 3, False)
+        # the max, it should evict an old engine and create a new one.
+        self._TestRun(sess, 3, True)
 
     # Test the output SavedModel
     with ops.Graph().as_default():
@@ -251,8 +250,8 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         # Run with batch size 2, a new engine is created and cached.
         self._TestRun(sess, 2, True)
         # Run with batch size 3, since the number of cached engines has reached
-        # the max, it should fall back to TF function.
-        self._TestRun(sess, 3, False)
+        # the max, it should evict an old engine and create a new one.
+        self._TestRun(sess, 3, True)
 
   def testCreateInferenceGraph_StaticOp(self):
     if not trt_convert.is_tensorrt_enabled():
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
deleted file mode 100644
index aac9e5c7bd725fc10bcaa04536ebc7be071b4d4c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
-
-#include <list>
-#include <sstream>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-
-#include "tensorrt/include/NvInfer.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-class TRTCalibrationResource : public tensorflow::ResourceBase {
- public:
-  ~TRTCalibrationResource() {
-    LOG(INFO) << "Destroying Calibration Resource " << std::endl
-              << DebugString();
-    builder_.reset();
-    engine_.reset();
-    // We need to manually destroy the builder and engine before the allocator
-    // is destroyed.
-    allocator_.reset();
-  }
-
-  string DebugString() override {
-    std::stringstream oss;
-    using std::dec;
-    using std::endl;
-    using std::hex;
-    oss << " Calibrator = " << hex << calibrator_.get() << dec << endl
-        << " Builder    = " << hex << builder_.get() << dec << endl
-        << " Engine     = " << hex << engine_.get() << dec << endl
-        << " Logger     = " << hex << &logger_ << dec << endl
-        << " Allocator  = " << hex << allocator_.get() << dec << endl
-        << " Thread     = " << hex << thr_.get() << dec << endl;
-    return oss.str();
-  }
-
-  std::unique_ptr<TRTInt8Calibrator> calibrator_;
-  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
-  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
-  std::unique_ptr<TRTBaseAllocator> allocator_;
-  tensorflow::tensorrt::Logger logger_;
-  // TODO(sami): Use threadpool threads!
-  std::unique_ptr<std::thread> thr_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif
-#endif
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index f30dba59ad55317d7ad7730e4dc66c9aba4e6a6b..5c60d6b589ed6a16276226726d989e949bcbf9d7 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -14,14 +14,14 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <string>
 #include <vector>
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorrt/include/NvInfer.h"
 
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
index ff317e43e1e6ff1c0b869ae8dc6d1fda8f0ce126..17e0b6f4d2c4bbaf56ef143b78c543c9e130b458 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -68,9 +68,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(100, 6, 6, 6)])
+        expected_output_dims=[[[100, 6, 6, 6]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -125,9 +125,9 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(100, 12, 12, 6)])
+        expected_output_dims=[[[100, 12, 12, 6]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -183,9 +183,9 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -253,9 +253,9 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -286,9 +286,9 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -320,9 +320,9 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -369,9 +369,9 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
index f42308ecb7c8f8a107e78008abd3f470ddc85975..46e3407d9669085a9737bacbeec1a20765ef88cc 100644
--- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
@@ -71,9 +71,9 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name, w1_name, w2_name],
-        input_dims=[input_dims, w1_dims, w2_dims],
+        input_dims=[[input_dims, w1_dims, w2_dims]],
         output_names=[output_name],
-        expected_output_dims=[(12, 5, 8, 7)])
+        expected_output_dims=[[[12, 5, 8, 7]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -86,28 +86,6 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to run."""
     return ["TRTEngineOp_1"]
 
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    # TODO(aaroey): Trt library will fail like:
-    #
-    # ../builder/cudnnBuilder2.cpp:685:
-    # virtual std::vector<nvinfer1::query::Ports<
-    #     nvinfer1::query::TensorRequirements>>
-    # nvinfer1::builder::Node::getSupportedFormats(
-    #     const nvinfer1::query::Ports<nvinfer1::query::AbstractTensor>&,
-    #     const nvinfer1::cudnn::HardwareContext&,
-    #     nvinfer1::builder::Format::Type,
-    #     const nvinfer1::builder::FormatTypeHack&) const:
-    # Assertion `sf' failed.
-    #
-    # To reproduce, run:
-    # bazel test -c opt --copt=-mavx \
-    #   --test_arg=BatchMatMulTest.testTfTrt_ToolConversion_INT8_DynamicEngine \
-    #   tensorflow/contrib/tensorrt:batch_matmul_test
-    #
-    # Investigate and fix it.
-    return not trt_test.IsQuantizationMode(run_params.precision_mode)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
index 053b38ff1c0578c58f39dd6dc0630d1401a105af..ca23629efacba1df27ffb466d24b189d6074a917 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -111,9 +111,9 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(4, 6680)])
+        expected_output_dims=[[[4, 6680]]])
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
@@ -130,12 +130,6 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to build."""
     return ["TRTEngineOp_0"]
 
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
-    # mode, which is a bug. Re-enable this when trt library is fixed.
-    return not trt_test.IsQuantizationMode(run_params.precision_mode)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
index 169835956c046dd675e967daa05fd81405662e38..846fd009db07b151e1eca794e9a8a38ff834a465 100644
--- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -63,9 +63,9 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 23040)])
+        expected_output_dims=[[[5, 23040]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py
index c3576f81d97afe7e0e42cd10413971911e97774c..5d8742ae356c091ba831bbd48741dee34cd39d08 100644
--- a/tensorflow/contrib/tensorrt/test/concatenation_test.py
+++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py
@@ -73,9 +73,9 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 126)])
+        expected_output_dims=[[[2, 126]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
index c1c883312d867b60b88ac14318041f9750ca41e6..9137d0072f66321d8420b7caac6acc329541123f 100644
--- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
@@ -58,9 +58,9 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 12, 12, 1)])
+        expected_output_dims=[[[5, 12, 12, 1]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/conv2d_test.py b/tensorflow/contrib/tensorrt/test/conv2d_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7993b4620931736cd872bfffb4ebe177555fcd2
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/conv2d_test.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.platform import test
+
+
+def conv2d_layer(inputs,
+                 filters,
+                 kernel_size,
+                 strides=(1, 1),
+                 padding="valid",
+                 data_format="channels_last",
+                 dilation_rate=(1, 1),
+                 name=None):
+  dtype = inputs.dtype
+  c_axis = -1 if data_format == "channels_last" else 1
+  nchan = inputs.shape[c_axis]
+  weights_shape = (kernel_size[0], kernel_size[1], nchan, filters)
+  weights = constant_op.constant(np.random.randn(*weights_shape), dtype=dtype)
+  padding = padding.upper()
+  if data_format == "channels_last":
+    strides = [1] + list(strides) + [1]
+    dilations = [1] + list(dilation_rate) + [1]
+    data_format = "NHWC"
+  else:
+    strides = [1, 1] + list(strides)
+    dilations = [1, 1] + list(dilation_rate)
+    data_format = "NCHW"
+  return gen_nn_ops.conv2d(
+      inputs,
+      weights,
+      strides=strides,
+      padding=padding,
+      dilations=dilations,
+      data_format=data_format)
+
+
+def div_round_up(n, d):
+  return (n - 1) // d + 1
+
+
+def build_graph(input_dims,
+                dtype,
+                num_filters,
+                data_format,
+                kernel_sizes,
+                dilation_rates,
+                padding="same"):
+  g = ops.Graph()
+  with g.as_default():
+    inp = array_ops.placeholder(
+        dtype=dtype, shape=[None] + input_dims[1:], name="input")
+    with g.device("/GPU:0"):
+      results = []
+      for kernel_size in kernel_sizes:
+        for dilation_rate in dilation_rates:
+          result = conv2d_layer(inp, num_filters, kernel_size, (1, 1), padding,
+                                data_format, dilation_rate)
+          results.append(result)
+      output = sum(results)
+      output = array_ops.identity(output, name="output")
+  return g
+
+
+class Conv2DNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of Conv2D (data_format=NCHW) in TF-TRT conversion."""
+    np.random.seed(1234)
+    input_dims = [13, 3, 7, 11]
+    g = build_graph(
+        input_dims=input_dims,
+        dtype=dtypes.float32,
+        num_filters=5,
+        data_format="channels_first",
+        kernel_sizes=[(3, 3), (3, 2)],
+        dilation_rates=[(1, 1), (2, 3)])
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=[[input_dims]],
+        output_names=["output"],
+        expected_output_dims=[[[13, 5, 7, 11]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DNHWCTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of Conv2D (data_format=NCHW) in TF-TRT conversion."""
+    np.random.seed(1234)
+    input_dims = [13, 7, 11, 3]
+    g = build_graph(
+        input_dims=input_dims,
+        dtype=dtypes.float32,
+        num_filters=5,
+        data_format="channels_last",
+        kernel_sizes=[(3, 3), (3, 2)],
+        dilation_rates=[(1, 1), (2, 3)])
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=[[input_dims]],
+        output_names=["output"],
+        expected_output_dims=[[[13, 7, 11, 5]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DStridedNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of strided Conv2D (data_format=NCHW) in TF-TRT
+
+    conversion.
+    """
+    np.random.seed(1234)
+    dtype = dtypes.float32
+    input_name = "input"
+    n, c, h, w = 13, 3, 7, 11
+    num_filters = 5
+    input_dims = [n, c, h, w]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        output = inp
+        output = conv2d_layer(
+            output,
+            num_filters, (3, 2),
+            strides=(2, 2),
+            padding="same",
+            data_format="channels_first")
+        h = div_round_up(h, 2)
+        w = div_round_up(w, 2)
+        output = conv2d_layer(
+            output,
+            num_filters, (3, 3),
+            strides=(2, 2),
+            dilation_rate=(2, 3),
+            padding="same",
+            data_format="channels_first")
+        h = div_round_up(h, 2)
+        w = div_round_up(w, 2)
+        output = array_ops.identity(output, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=[output_name],
+        expected_output_dims=[[[n, num_filters, h, w]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/dynamic_input_shapes_test.py b/tensorflow/contrib/tensorrt/test/dynamic_input_shapes_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc28cd6087997359e81ffaa6dc8bd958109cc565
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/dynamic_input_shapes_test.py
@@ -0,0 +1,107 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TRT INT8 conversion without calibration on Mnist model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class DynamicInputShapesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    # TODO(laigd): we should test the following cases:
+    # - batch size is not changed, other dims are changing
+    # - batch size is decreasing, other dims are identical
+    # - batch size is decreasing, other dims are changing
+    # - batch size is increasing, other dims are identical
+    # - batch size is increasing, other dims are changing
+    input_dims = [[[1, 5, 5, 1]], [[10, 5, 5, 1]], [[3, 5, 5, 1]],
+                  [[1, 5, 5, 1]], [[1, 3, 1, 1]], [[2, 9, 9, 1]],
+                  [[1, 224, 224, 1]], [[1, 128, 224, 1]]]
+    expected_output_dims = input_dims
+
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          shape=(None, None, None, 1), dtype=dtypes.float32, name="input")
+      conv_filter1 = constant_op.constant(
+          np.ones([3, 3, 1, 8]), name="weights1", dtype=dtypes.float32)
+      bias1 = constant_op.constant(np.random.randn(8), dtype=dtypes.float32)
+      x = nn.conv2d(
+          input=x,
+          filter=conv_filter1,
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          name="conv")
+      x = nn.bias_add(x, bias1)
+      x = nn.relu(x)
+      conv_filter2 = constant_op.constant(
+          np.ones([3, 3, 8, 1]), name="weights2", dtype=dtypes.float32)
+      bias2 = constant_op.constant(np.random.randn(1), dtype=dtypes.float32)
+      x = nn.conv2d(
+          input=x,
+          filter=conv_filter2,
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          name="conv")
+      x = nn.bias_add(x, bias2)
+      x = array_ops.identity(x, name="output")
+
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=input_dims,
+        output_names=["output"],
+        expected_output_dims=expected_output_dims)
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    conversion_params = super(DynamicInputShapesTest,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        maximum_cached_engines=10,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+
+  def ExpectedEnginesToBuild(self, run_params):
+    return ["TRTEngineOp_0"]
+
+  def ShouldRunTest(self, run_params):
+    return (run_params.dynamic_engine and
+            not trt_test.IsQuantizationMode(run_params.precision_mode))
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-03 if run_params.precision_mode == "FP32" else 1.e-01
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-03 if run_params.precision_mode == "FP32" else 1.e-01
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/identity_output_test.py b/tensorflow/contrib/tensorrt/test/identity_output_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b568eeda945d997a832b7f71a5bfd8c42e127e65
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/identity_output_test.py
@@ -0,0 +1,74 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This test checks a situation where the same tensor is considered as an output
+
+multiple times because it has been duplicated by 2+ indentity ops. Previously,
+the tensor would be renamed multiple times, overwriting the output binding name
+which resulted in a runtime error when the binding would not be found.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class IdentityTest(trt_test.TfTrtIntegrationTestBase):
+
+  def _ConstOp(self, shape):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtypes.float32)
+
+  def GetParams(self):
+    """Testing engine with the same tensor repeated as output via identity."""
+    input_name = 'input'
+    input_dims = [100, 32]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          dtype=dtypes.float32, shape=input_dims, name=input_name)
+
+      b = self._ConstOp((32, 4))
+      x1 = math_ops.matmul(x, b)
+      b = self._ConstOp((1, 4))
+      x1 = x1 + b
+
+      out1 = array_ops.identity(x1, name='output1')
+      out2 = array_ops.identity(x1, name='output2')
+      iden1 = array_ops.identity(x1)
+      out3 = array_ops.identity(iden1, name='output3')
+
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=['output1', 'output2', 'output3'],
+        expected_output_dims=[[[100, 4]] * 3])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ['TRTEngineOp_0']
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/int32_test.py b/tensorflow/contrib/tensorrt/test/int32_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cf538703880b130322a7dd504094c7a298e6522
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/int32_test.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test conversion of graphs involving INT32 tensors and operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class ExcludeUnsupportedInt32Test(trt_test.TfTrtIntegrationTestBase):
+
+  def _ConstOp(self, shape, dtype):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtype)
+
+  def GetParams(self):
+    """Test exclusion of ops which are not supported in INT32 mode by TF-TRT"""
+    input_name = 'input'
+    output_name = 'output'
+    input_dims = [100, 4]
+    dtype = dtypes.int32
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      b = self._ConstOp((4, 10), dtype)
+      x = math_ops.matmul(x, b)
+      b = self._ConstOp((10,), dtype)
+      x = nn.bias_add(x, b)
+      x = array_ops.identity(x, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=[output_name],
+        expected_output_dims=[[[100, 10]]])
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    conversion_params = super(ExcludeUnsupportedInt32Test,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        max_batch_size=100,
+        maximum_cached_engines=1,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return []
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
+    # mode, which is a bug. Re-enable this when trt library is fixed.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/lru_cache_test.py b/tensorflow/contrib/tensorrt/test/lru_cache_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7702413e6cee667796b7fbf4121c6e0d9118d35c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/lru_cache_test.py
@@ -0,0 +1,78 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test LRUCache by running different input batch sizes on same network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class LRUCacheTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [[[1, 10, 10, 2]], [[2, 10, 10, 2]], [[4, 10, 10, 2]],
+                  [[2, 10, 10, 2]]]
+    expected_output_dims = [[[1, 10, 10, 1]], [[2, 10, 10, 1]], [[4, 10, 10,
+                                                                  1]],
+                            [[2, 10, 10, 1]]]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          dtype=dtype, shape=[None, 10, 10, 2], name=input_name)
+      conv_filter = constant_op.constant(
+          np.random.randn(3, 3, 2, 1), dtype=dtypes.float32)
+      x = nn.conv2d(
+          input=x,
+          filter=conv_filter,
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          name="conv")
+      bias = constant_op.constant(
+          np.random.randn(1, 10, 10, 1), dtype=dtypes.float32)
+      x = math_ops.add(x, bias)
+      x = nn.relu(x)
+      x = array_ops.identity(x, name="output")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=input_dims,
+        output_names=[output_name],
+        expected_output_dims=expected_output_dims)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+  def ShouldRunTest(self, run_params):
+    return (run_params.dynamic_engine and
+            not trt_test.IsQuantizationMode(run_params.precision_mode))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/manual_test.py b/tensorflow/contrib/tensorrt/test/manual_test.py
index 1187c759b4b5483cbf5afe136401abe86d6ef989..aad7b9f30728cbb3f4ec5fa730c5dbe46fe9fc3f 100644
--- a/tensorflow/contrib/tensorrt/test/manual_test.py
+++ b/tensorflow/contrib/tensorrt/test/manual_test.py
@@ -67,9 +67,9 @@ class ManualTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=gdef,
         input_names=params_map['input_names'],
-        input_dims=params_map['input_dims'],
+        input_dims=[params_map['input_dims']],
         output_names=params_map['output_names'],
-        expected_output_dims=params_map['expected_output_dims'])
+        expected_output_dims=[params_map['expected_output_dims']])
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
index 104bac43a0b1166dcddee9920991582f33e93316..cc64329bbd53eaaebf7929e48bbfa8d8beeeadff 100644
--- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
+++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
@@ -62,9 +62,9 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 15, 15, 10)])
+        expected_output_dims=[[[2, 15, 15, 10]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
index 293f93d8a78bc8ab06002d6fc01cb8d6a0738698..a14bb0396ece74c8de73008d2007bce5c763b0ed 100644
--- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -75,9 +75,9 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 4, 5, 4)])
+        expected_output_dims=[[[2, 4, 5, 4]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
index 3e1e4b088ba200db2184dd64092cbc642a17cb3a..06a86bbb8df4c11a471475c040b74099a6fe2361 100644
--- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
@@ -59,9 +59,9 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 4, 5, 4)])
+        expected_output_dims=[[[2, 4, 5, 4]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
index e7d6ec4ad395d38a06f97020f2f363009f2286c7..d68211a7ee344f3d07d01e308ee60246a61816f6 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
+++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.python import trt_convert
 # pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
 # pylint: enable=unused-import
+from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import data
 from tensorflow.python import keras
@@ -144,7 +144,10 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
           outputs=[OUTPUT_NODE_NAME],
           max_batch_size=max_batch_size,
           precision_mode='INT8',
-          max_workspace_size_bytes=4096 << 19,
+          # There is a 2GB GPU memory limit for each test, so we set
+          # max_workspace_size_bytes to 256MB to leave enough room for TF
+          # runtime to allocate GPU memory.
+          max_workspace_size_bytes=1 << 28,
           minimum_segment_size=2,
           use_calibration=False,
       )
@@ -271,7 +274,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
         num_epochs=None,
         model_dir=model_dir)['accuracy']
     logging.info('accuracy_tf_native: %f', accuracy_tf_native)
-    self.assertAllClose(accuracy_tf_native, 0.9662)
+    self.assertAllClose(0.9662, accuracy_tf_native, rtol=1e-3, atol=1e-3)
 
     if trt_convert.get_linked_tensorrt_version()[0] < 5:
       return
@@ -283,7 +286,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
         num_epochs=None,
         model_dir=model_dir)['accuracy']
     logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
-    self.assertAllClose(accuracy_tf_trt, 0.9677)
+    self.assertAllClose(0.9675, accuracy_tf_trt, rtol=1e-3, atol=1e-3)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/contrib/tensorrt/test/quantization_test.py
index e425a3674635650d7292ab072178e98932e6b824..ce1b25ebf3c52ac5710dea654134925bb5b6ceca 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_test.py
+++ b/tensorflow/contrib/tensorrt/test/quantization_test.py
@@ -60,9 +60,9 @@ def _GetParams(add_quantization_nodes, dtype=dtypes.float32):
   return trt_test.TfTrtIntegrationTestParams(
       gdef=g.as_graph_def(),
       input_names=[input_name],
-      input_dims=[input_dims],
+      input_dims=[[input_dims]],
       output_names=[output_name],
-      expected_output_dims=[(8, 1)])
+      expected_output_dims=[[[8, 1]]])
 
 
 class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/contrib/tensorrt/test/rank_two_test.py
index 563232fc12675d9e1b32b7ab461591af57beadb9..97159bb008068efbbcdb0fc6844890a42a08f46c 100644
--- a/tensorflow/contrib/tensorrt/test/rank_two_test.py
+++ b/tensorflow/contrib/tensorrt/test/rank_two_test.py
@@ -63,9 +63,9 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=input_names,
-        input_dims=input_dims,
+        input_dims=[input_dims],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims[1])])
+        expected_output_dims=[[input_dims[1]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -80,12 +80,6 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
         ],
     }
 
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
-    # mode, which is a bug. Re-enable this when trt library is fixed.
-    return not trt_test.IsQuantizationMode(run_params.precision_mode)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
index 207944468ab0b038abfe01f0096d7dc220d064ed..7fb2cbde07c4987d925e9abc915ede52119ec6df 100644
--- a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
+++ b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
@@ -72,9 +72,9 @@ class ReshapeTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -129,9 +129,9 @@ class TransposeTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(24, 100, 2, 24)])
+        expected_output_dims=[[[24, 100, 2, 24]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index d26f26008635733c6c364a98b72b88c1e552f5fe..090aa8bdb0487973e186631af3b4edac48096a5f 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -191,7 +191,7 @@ def user(multi_engine,
       minimum_segment_size=2,  # minimum number of nodes in an engine
       is_dynamic_op=False,
       maximum_cached_engines=1,
-      cached_engine_batch_sizes=[])
+      cached_engine_batches=[])
   o1 = run_graph(orig_graph, dummy_input)
   o2 = run_graph(trt_graph, dummy_input)
   o3 = run_graph(trt_graph, dummy_input)
@@ -206,7 +206,7 @@ def user(multi_engine,
       minimum_segment_size=2,  # minimum number of nodes in an engine
       is_dynamic_op=False,
       maximum_cached_engines=1,
-      cached_engine_batch_sizes=[])
+      cached_engine_batches=[])
   int8_calib_gdef = trt.create_inference_graph(
       input_graph_def=orig_graph,
       outputs=["output"],
@@ -216,7 +216,7 @@ def user(multi_engine,
       minimum_segment_size=2,  # minimum number of nodes in an engine
       is_dynamic_op=False,
       maximum_cached_engines=1,
-      cached_engine_batch_sizes=[])
+      cached_engine_batches=[])
   o4 = run_graph(fp16_graph, dummy_input)
   _ = run_calibration(int8_calib_gdef, dummy_input)
   int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
index 495a9391a1e818a6078988161c9bf72f6143737f..9a00cdb11a0f98d9b9be0d8e88a79cf45a8a7e3a 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -25,10 +25,10 @@ import warnings
 import numpy as np
 import six
 
-from tensorflow.contrib.tensorrt.python import trt_convert
 # pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
 # pylint: enable=unused-import
+from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import dtypes
@@ -39,9 +39,19 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
-TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
-    "gdef", "input_names", "input_dims", "output_names", "expected_output_dims"
-])
+TfTrtIntegrationTestParams = namedtuple(
+    "TfTrtIntegrationTestParams",
+    [
+        "gdef",
+        # A list of names of the input placeholder nodes.
+        "input_names",
+        # A list of list of output shapes of the input placeholder nodes.
+        "input_dims",
+        # A list of names of the output identity nodes.
+        "output_names",
+        # A list of list of expected output shapes of the output identity nodes.
+        "expected_output_dims"
+    ])
 
 RunParams = namedtuple("RunParams", [
     "use_optimizer", "precision_mode", "dynamic_engine", "test_name",
@@ -51,7 +61,7 @@ RunParams = namedtuple("RunParams", [
 ConversionParams = namedtuple("ConversionParams", [
     "max_batch_size", "max_workspace_size_bytes", "precision_mode",
     "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines",
-    "cached_engine_batch_sizes", "rewriter_config", "use_calibration"
+    "cached_engine_batches", "rewriter_config", "use_calibration"
 ])
 
 PRECISION_MODES = ["FP32", "FP16", "INT8"]
@@ -159,16 +169,24 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
+    batch_list = []
+    for dims_list in self._GetParamsCached().input_dims:
+      assert dims_list
+      # Each list of shapes should have same batch size.
+      input_batches = [dims[0] for dims in dims_list]
+      assert max(input_batches) == min(input_batches)
+      batch_list.append(input_batches[0])
     return ConversionParams(
-        max_batch_size=max([
-            dims[0] for dims in self._GetParamsCached().input_dims if len(dims)
-        ]),
+        # We use the minimum of all the batch sizes, so when multiple different
+        # input shapes are provided it'll always create new engines in the
+        # cache, and we can therefore test the cache behavior.
+        max_batch_size=min(batch_list),
         max_workspace_size_bytes=1 << 25,
         precision_mode=run_params.precision_mode,
         minimum_segment_size=2,
         is_dynamic_op=run_params.dynamic_engine,
         maximum_cached_engines=1,
-        cached_engine_batch_sizes=None,
+        cached_engine_batches=None,
         rewriter_config=None,
         use_calibration=run_params.use_calibration)
 
@@ -239,8 +257,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
+    conversion_params = self.GetConversionParams(run_params)
     if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
-      conversion_params = self.GetConversionParams(run_params)
       rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
           conversion_params.rewriter_config, conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
@@ -248,12 +266,15 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
           conversion_params.minimum_segment_size,
           conversion_params.is_dynamic_op,
           conversion_params.maximum_cached_engines,
-          conversion_params.cached_engine_batch_sizes,
+          conversion_params.cached_engine_batches,
           conversion_params.use_calibration)
 
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
+      if conversion_params.rewriter_config is not None:
+        graph_options.rewrite_options.CopyFrom(
+            conversion_params.rewriter_config)
 
     config = config_pb2.ConfigProto(
         gpu_options=self._GetGPUOptions(), graph_options=graph_options)
@@ -280,13 +301,16 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _RunGraph(self,
                 run_params,
                 gdef,
-                input_data,
+                inputs_data,
                 config,
                 graph_state,
                 num_runs=2):
     """Run given graphdef multiple times."""
     params = self._GetParamsCached()
-    assert len(params.input_names) == len(input_data)
+    for current_input_data in inputs_data:
+      assert len(params.input_names) == len(current_input_data)
+
+    vals = []
     g = ops.Graph()
     with g.as_default():
       io_ops = importer.import_graph_def(
@@ -294,43 +318,48 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
           return_elements=params.input_names + params.output_names,
           name="")
       inputs = [op.outputs[0] for op in io_ops[:len(params.input_names)]]
-      assert len(inputs) == len(input_data)
+      for current_input_data in inputs_data:
+        assert len(inputs) == len(current_input_data)
       outputs = [op.outputs[0] for op in io_ops[len(params.input_names):]]
-    with self.test_session(
-        graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
-      val = None
-      # Defaults to 2 runs to verify result across multiple runs is same.
-      for _ in range(num_runs):
-        self._PrepareRun(graph_state)
-        new_val = sess.run(
-            outputs, {inputs[i]: input_data[i] for i in range(len(inputs))})
-        output_len = len(params.expected_output_dims)
-        self.assertEqual(output_len, len(new_val))
-        for i in range(output_len):
-          self.assertEqual(params.expected_output_dims[i], new_val[i].shape)
-        if val is not None:
-          self.assertAllClose(val, new_val, atol=1.e-06, rtol=1.e-06)
-        val = new_val
-        self.VerifyRun(run_params, graph_state)
-    return val
+      with self.test_session(
+          graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
+        # Run for each input(s) shape
+        for shape_index in range(len(inputs_data)):
+          val = None
+          # Defaults to 2 runs to verify result across multiple runs is same.
+          for _ in range(num_runs):
+            self._PrepareRun(graph_state)
+            new_val = sess.run(outputs, {
+                inputs[i]: inputs_data[shape_index][i]
+                for i in range(len(inputs))
+            })
+            output_len = len(params.expected_output_dims[shape_index])
+            self.assertEqual(output_len, len(new_val))
+            for i in range(output_len):
+              self.assertEqual(
+                  list(params.expected_output_dims[shape_index][i]),
+                  list(new_val[i].shape))
+            if val is not None:
+              self.assertAllClose(val, new_val, atol=1.e-06, rtol=1.e-06)
+            val = new_val
+            self.VerifyRun(run_params, graph_state)
+          vals.append(val)
+    return vals
 
   # Use real data that is representative of the inference dataset
   # for calibration. For this test script it is random data.
-  def _RunCalibration(self, run_params, gdef, input_data, config):
+  def _RunCalibration(self, run_params, gdef, inputs_data, config):
     """Run calibration on given graph."""
     return self._RunGraph(
-        run_params, gdef, input_data, config, GraphState.CALIBRATE, num_runs=5)
+        run_params, gdef, inputs_data, config, GraphState.CALIBRATE, num_runs=5)
 
-  def _GetTrtGraphDef(self, run_params, gdef):
+  def _GetTrtGraphDef(self, run_params, graph_state, gdef):
     """Return trt converted graphdef."""
     params = self._GetParamsCached()
     conversion_params = self.GetConversionParams(run_params)
     logging.info(conversion_params)
 
-    config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions())
-    if conversion_params.rewriter_config is not None:
-      config_for_trt.graph_options.rewrite_options.CopyFrom(
-          conversion_params.rewriter_config)
+    config_for_trt = self._GetConfigProto(run_params, graph_state)
     return trt_convert.create_inference_graph(
         input_graph_def=gdef,
         outputs=params.input_names + params.output_names,
@@ -340,7 +369,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         minimum_segment_size=conversion_params.minimum_segment_size,
         is_dynamic_op=conversion_params.is_dynamic_op,
         maximum_cached_engines=conversion_params.maximum_cached_engines,
-        cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes,
+        cached_engine_batches=conversion_params.cached_engine_batches,
         use_calibration=conversion_params.use_calibration,
         session_config=config_for_trt)
 
@@ -474,26 +503,31 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
             dtypes.as_dtype(node.attr["dtype"].type).as_numpy_dtype())
     assert len(params.input_names) == len(input_dtypes)
 
-    input_data = []
-    for i in range(len(params.input_names)):
-      dtype = input_dtypes[params.input_names[i]]
-      # Multiply the input by some constant to avoid all zeros input for integer
-      # types.
-      scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
-      dims = params.input_dims[i]
-      # TODO(laigd): add debug options. E.g. we can set the input data to be
-      # continuous natural numbers:
-      # seq = np.arange(np.prod(dims))
-      # seq.resize(dims)
-      # input_data.append(scale * seq.astype(dtype))
-      input_data.append((scale * np.random.random_sample(dims)).astype(dtype))
+    inputs_data = []
+    for inp in params.input_dims:
+      current_input_data = []
+      for i in range(len(params.input_names)):
+        dtype = input_dtypes[params.input_names[i]]
+        # Multiply the input by some constant to avoid all zeros input for
+        # integer types.
+        scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
+        dims = inp[i]
+        # TODO(laigd): add debug options. E.g. we can set the input data to be
+        # continuous natural numbers:
+        # seq = np.arange(np.prod(dims))
+        # seq.resize(dims)
+        # input_data.append(scale * seq.astype(dtype))
+        current_input_data.append(
+            (scale * np.random.random_sample(dims)).astype(dtype))
+      inputs_data.append(current_input_data)
+
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
     # Get reference result without running trt.
     config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
     logging.info("Running original graph w/o trt, config:\n%s",
                  str(config_no_trt))
-    ref_result = self._RunGraph(run_params, input_gdef, input_data,
+    ref_result = self._RunGraph(run_params, input_gdef, inputs_data,
                                 config_no_trt, GraphState.ORIGINAL)
 
     # Run calibration if necessary.
@@ -503,12 +537,13 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
       calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
       logging.info("Running calibration graph, config:\n%s", str(calib_config))
       if run_params.use_optimizer:
-        result = self._RunCalibration(run_params, input_gdef, input_data,
+        result = self._RunCalibration(run_params, input_gdef, inputs_data,
                                       calib_config)
       else:
-        calib_gdef = self._GetTrtGraphDef(run_params, input_gdef)
+        calib_gdef = self._GetTrtGraphDef(run_params, GraphState.CALIBRATE,
+                                          input_gdef)
         self._VerifyGraphDef(run_params, calib_gdef, GraphState.CALIBRATE)
-        result = self._RunCalibration(run_params, calib_gdef, input_data,
+        result = self._RunCalibration(run_params, calib_gdef, inputs_data,
                                       calib_config)
       infer_gdef = trt_convert.calib_graph_to_infer_graph(
           calib_gdef, run_params.dynamic_engine)
@@ -527,10 +562,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     logging.info("Running final inference graph, config:\n%s",
                  str(infer_config))
     if not run_params.use_optimizer:
-      infer_gdef = self._GetTrtGraphDef(run_params, infer_gdef)
+      infer_gdef = self._GetTrtGraphDef(run_params, GraphState.INFERENCE,
+                                        infer_gdef)
       self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
 
-    result = self._RunGraph(run_params, infer_gdef, input_data, infer_config,
+    result = self._RunGraph(run_params, infer_gdef, inputs_data, infer_config,
                             GraphState.INFERENCE)
     self.assertAllClose(
         ref_result,
diff --git a/tensorflow/contrib/tensorrt/test/topk_test.py b/tensorflow/contrib/tensorrt/test/topk_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..633a51982b9a6acf1926033628793c1edbd2d118
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/topk_test.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class TopKTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing Top-K in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 100]
+    k = 5
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      k_tensor = constant_op.constant(k, dtype=dtypes.int32, name="Const")
+      values, indices = nn_ops.top_k(x, k_tensor, name="TopK")
+      values = array_ops.identity(values, name="output_values")
+      indices = array_ops.identity(indices, name="output_indices")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=["output_values", "output_indices"],
+        expected_output_dims=[[[100, k], [100, k]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {"TRTEngineOp_0": ["Const", "TopK"]}
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
index b6e5e32db1236684a06c2d44298b9a3d39667152..497ea2848aae42a61db4f8f5a5c973525d5892d9 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -100,9 +100,9 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name, input2_name],
-        input_dims=[input_dims, input2_dims],
+        input_dims=[[input_dims, input2_dims]],
         output_names=[output_name],
-        expected_output_dims=[(12, 5, 8, 12)])
+        expected_output_dims=[[[12, 5, 8, 12]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
index b29626d2c28b4def716aef9e2703b669b5e46374..b5fed73d2d75271e2c5c533670923d42f233e80b 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
@@ -70,9 +70,9 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 6, 2, 2)])
+        expected_output_dims=[[[5, 6, 2, 2]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
index 9b0b189626050f678c71e9abbf7eb5296440d879..307128f1a89c46d63e851b6a7cd2d6abe7e39ff8 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
@@ -61,9 +61,9 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 2, 2, 6)])
+        expected_output_dims=[[[5, 2, 2, 6]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 6ea15fb8eff13663625420288a37ba002d57fa47..c12895c730047898f366bf651c798c1f1c5b93f7 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -99,9 +99,9 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/util/stat_summarizer.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
 %}
 
 %ignoreall
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 57797214d1684550aa7ad2664b71d22b504f70ed..e10be88ece8ebba9635af955b3c3410f29e5503c 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -105,6 +105,7 @@ py_binary(
     data = ["data/multivariate_periods.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
+    visibility = ["//visibility:public"],
     deps = select({
         ":empty_condition": [],
         "//conditions:default": [],
@@ -113,6 +114,7 @@ py_binary(
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/timeseries/python/timeseries:estimators",
         "//tensorflow/contrib/timeseries/python/timeseries:model",
+        "//tensorflow/contrib/timeseries/python/timeseries:state_management",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/examples/predict_test.py b/tensorflow/contrib/timeseries/examples/predict_test.py
index 678fd71cd8b94ee0be46e10a9a673de55bd44215..b353f85cb5df0cf961d1900b241e4fa1a84a24b4 100644
--- a/tensorflow/contrib/timeseries/examples/predict_test.py
+++ b/tensorflow/contrib/timeseries/examples/predict_test.py
@@ -43,10 +43,6 @@ class PeriodTrendExampleTest(test.TestCase):
     self.assertAllEqual([700], mean.shape)
     self.assertAllEqual([700], upper_limit.shape)
     self.assertAllEqual([700], lower_limit.shape)
-    # Check that variance hasn't blown up too much. This is a relatively good
-    # indication that training was successful.
-    self.assertLess(upper_limit[-1] - lower_limit[-1],
-                    1.5 * (upper_limit[0] - lower_limit[0]))
 
   def test_ar(self):
     (times, observed, all_times, mean,
@@ -55,7 +51,6 @@ class PeriodTrendExampleTest(test.TestCase):
     self.assertAllEqual(all_times.shape, mean.shape)
     self.assertAllEqual(all_times.shape, upper_limit.shape)
     self.assertAllEqual(all_times.shape, lower_limit.shape)
-    self.assertLess((upper_limit - lower_limit).mean(), 4.)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 4b90b596b28efec83aa349782c4874d79b6817c7..2a22295197dc225cefbedf2736adeea5491a9fc2 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -281,6 +281,7 @@ py_library(
         "input_pipeline.py",
     ],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":feature_keys",
         ":model_utils",
@@ -361,9 +362,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
+        ":math_utils",
         ":model",
         ":model_utils",
-        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index bcadf4094e1e79fff1685515f2bde0b88f717cac..3626701d24163ef52564b42d8a630bd9c5a788eb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -18,9 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.rnn.python.ops import lstm_ops
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import model_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
@@ -462,11 +461,12 @@ class ARModel(model.TimeSeriesModel):
     if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
       covariance = prediction_ops["covariance"]
       sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
-      normal = distributions.Normal(loc=targets, scale=sigma)
-      loss_op = -math_ops.reduce_sum(normal.log_prob(prediction))
+      loss_op = -math_ops.reduce_sum(
+          math_utils.normal_log_prob(targets, sigma, prediction))
     else:
       assert self.loss == ARModel.SQUARED_LOSS, self.loss
-      loss_op = math_ops.reduce_sum(math_ops.square(prediction - targets))
+      loss_op = math_ops.reduce_sum(
+          math_ops.squared_difference(prediction, targets))
     loss_op /= math_ops.cast(
         math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
     return loss_op
@@ -965,16 +965,11 @@ class AnomalyMixtureARModel(ARModel):
       anomaly_variance = prediction_ops["anomaly_params"]
       anomaly_sigma = math_ops.sqrt(
           gen_math_ops.maximum(anomaly_variance, 1e-5))
-      normal = distributions.Normal(loc=targets, scale=anomaly_sigma)
-      log_prob = normal.log_prob(prediction)
+      log_prob = math_utils.normal_log_prob(targets, anomaly_sigma, prediction)
     else:
       assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
       anomaly_scale = prediction_ops["anomaly_params"]
-      cauchy = distributions.StudentT(
-          df=array_ops.ones([], dtype=anomaly_scale.dtype),
-          loc=targets,
-          scale=anomaly_scale)
-      log_prob = cauchy.log_prob(prediction)
+      log_prob = math_utils.cauchy_log_prob(targets, anomaly_scale, prediction)
     return log_prob
 
   def loss_op(self, targets, prediction_ops):
@@ -983,8 +978,7 @@ class AnomalyMixtureARModel(ARModel):
     covariance = prediction_ops["covariance"]
     # Normal data log probability.
     sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
-    normal1 = distributions.Normal(loc=targets, scale=sigma)
-    log_prob1 = normal1.log_prob(prediction)
+    log_prob1 = math_utils.normal_log_prob(targets, sigma, prediction)
     log_prob1 += math_ops.log(1 - self._anomaly_prior_probability)
     # Anomaly log probability.
     log_prob2 = self._anomaly_log_prob(targets, prediction_ops)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index aab330643862c1ccf073d2a0e34e1c475b1ec15f..b7375e5055e29efea3f23c3b9b9f3af59f45495b 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 
+import numpy as np
+
 from tensorflow.contrib import lookup
 from tensorflow.contrib.layers.python.layers import layers
 
@@ -43,6 +45,32 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
 
+def normal_log_prob(loc, scale, x):
+  """Computes the Normal log pdf."""
+  z = (x - loc) / scale
+  return -0.5 * (math_ops.square(z)
+                 + np.log(2. * np.pi) + math_ops.log(scale))
+
+
+def cauchy_log_prob(loc, scale, x):
+  """Computes the Cauchy log pdf."""
+  z = (x - loc) / scale
+  return (-np.log(np.pi) - math_ops.log(scale) -
+          math_ops.log1p(math_ops.square(z)))
+
+
+def mvn_tril_log_prob(loc, scale_tril, x):
+  """Computes the MVN log pdf under tril scale. Doesn't handle batches."""
+  x0 = x - loc
+  z = linalg_ops.matrix_triangular_solve(
+      scale_tril, x0[..., array_ops.newaxis])[..., 0]
+  log_det_cov = 2. * math_ops.reduce_sum(math_ops.log(
+      array_ops.matrix_diag_part(scale_tril)), axis=-1)
+  d = math_ops.cast(array_ops.shape(scale_tril)[-1], log_det_cov.dtype)
+  return -0.5 * (math_ops.reduce_sum(math_ops.square(z), axis=-1)
+                 + d * np.log(2. * np.pi) + log_det_cov)
+
+
 def clip_covariance(
     covariance_matrix, maximum_variance_ratio, minimum_variance):
   """Enforce constraints on a covariance matrix to improve numerical stability.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 125750e7639ad40c481472a93353e6fb7055be96..cf5e749042afd83f927a3d22edfd3a9538ab2ffd 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -78,7 +78,6 @@ py_library(
     srcs = ["kalman_filter.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -235,7 +234,6 @@ py_library(
     srcs = ["filtering_postprocessor.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
index e9e2ac0aaf4c4d6c41f5007662f261af3de9bbd1..3fa2fbd9f77cb887c30fde264815728ca345f45a 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
@@ -22,8 +22,6 @@ import abc
 
 import six
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
@@ -91,10 +89,10 @@ def cauchy_alternative_to_gaussian(current_times, current_values, outputs):
   """
   del current_times  # unused
   cauchy_scale = math_utils.entropy_matched_cauchy_scale(outputs["covariance"])
-  individual_log_pdfs = distributions.StudentT(
-      df=array_ops.ones([], dtype=current_values.dtype),
+  individual_log_pdfs = math_utils.cauchy_log_prob(
       loc=outputs["mean"],
-      scale=cauchy_scale).log_prob(current_values)
+      scale=cauchy_scale,
+      x=current_values)
   return math_ops.reduce_sum(individual_log_pdfs, axis=1)
 
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
index a614386121e000961bf8b32625a28e1251654320..c0ec797bc5b7c41ca996c807840ce38311201f87 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
@@ -137,9 +135,10 @@ class KalmanFilter(object):
     with ops.control_dependencies([non_negative_assert]):
       observation_covariance_cholesky = linalg_ops.cholesky(
           symmetrized_observation_covariance)
-    log_prediction_prob = distributions.MultivariateNormalTriL(
-        predicted_observation, observation_covariance_cholesky).log_prob(
-            observation)
+    log_prediction_prob = math_utils.mvn_tril_log_prob(
+        loc=predicted_observation,
+        scale_tril=observation_covariance_cholesky,
+        x=observation)
     (posterior_state,
      posterior_state_var) = self.posterior_from_prior_state(
          prior_state=estimated_state,
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index ec8a273ea89f0b94db7b602494ea76207be8c1a2..c1a36fecc25545c6611ea09190dd89a8e1d82afe 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -61,6 +61,7 @@ py_library(
 py_library(
     name = "tpu_estimator",
     srcs = [
+        "python/tpu/_tpu_estimator_embedding.py",
         "python/tpu/error_handling.py",
         "python/tpu/tpu_config.py",
         "python/tpu/tpu_context.py",
@@ -70,12 +71,17 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":async_checkpoint",
+        ":feature_column",
+        ":functional",
+        ":tpu_embedding",
         ":tpu_lib",
+        ":tpu_ordinal_selector_py",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:function",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
@@ -155,6 +161,25 @@ tf_gen_op_wrapper_py(
     ],
 )
 
+tf_custom_op_library(
+    name = "python/ops/_tpu_ordinal_selector.so",
+    srcs = ["ops/tpu_ordinal_selector_op.cc"],
+)
+
+tf_custom_op_py_library(
+    name = "tpu_ordinal_selector_py",
+    srcs = ["ops/gen_tpu_ordinal_selector_op.py"],
+    dso = [":python/ops/_tpu_ordinal_selector.so"],
+    kernels = [
+        ":tpu_ordinal_selector_op_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tpu_ordinal_selector_op",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "tpu_ordinal_selector_op",
     deps = [
@@ -242,7 +267,6 @@ py_library(
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
         "//third_party/cloud_tpu/models/keras_colab:__subpackages__",
-        "//third_party/cloud_tpu/models/mnist_keras:__subpackages__",
         "//third_party/cloud_tpu/models/resnet50_keras:__subpackages__",
     ],
     deps = [
@@ -298,6 +322,7 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/compiler:xla",
         "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
+        "//tensorflow/contrib/tpu/proto:dynamic_padding_proto_py",
         "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
         "//tensorflow/contrib/tpu/proto:topology_proto_py",
         "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
@@ -337,13 +362,15 @@ py_library(
 
 tf_py_test(
     name = "datasets_test",
+    size = "medium",
     srcs = ["python/tpu/datasets_test.py"],
     additional_deps = [
         "//tensorflow/python:client_testlib",
         ":datasets",
     ],
-    flaky = 1,  # TODO(b/117363808): fails 1/1000 OSS runs
     grpc_enabled = True,
+    shard_count = 4,
+    tags = ["no_oss"],
 )
 
 tf_py_test(
@@ -430,7 +457,8 @@ py_library(
     srcs = ["python/tpu/tpu_embedding.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/tpu:tpu_ops",
+        ":tpu_lib",
+        ":tpu_ops",
         "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index 285e11d92de7a684ed87974414ec73c274cc7aa5..d4180d1a20bc59f3fbb37b2dbc67790ded9d2d90 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -31,6 +31,7 @@ REGISTER_OP("TPUReplicateMetadata")
     // Deprecated. Use num_cores_per_replica instead.
     .Attr("computation_shape: list(int) = []")
     .Attr("host_compute_core: list(string) = []")
+    .Attr("padding_map: list(string) = []")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -105,6 +106,7 @@ REGISTER_OP("TPUReplicate")
     .Attr("NumVariables: int >= 0")
     .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
+    .Attr("padding_map: list(string) = []")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
index 0ef29bdf734467aa9dee5c157bc8d8a7e0a85f13..676aed0b7b651494eda80ff2d7c7c31097529590 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -37,18 +37,18 @@ namespace tensorflow {
 //    pieces of the TF Graph.
 // 1. Pass this TPUEmbeddingConfiguration to tpu.initialize_system() as the
 //    tpu_embedding_config parameter.
-// 2. Use the TPUEmbeddingLoad Op to initialize the embedding tables in TPU
+// 2. Use the LoadTPUEmbedding Ops to initialize the embedding tables in TPU
 //    memories, sharded across the memories attached to each Host.
-// 3. Use TPUEmbeddingEnqueueSparseBatch to provide the TPU with embedding
+// 3. Use EnqueueTPUEmbeddingSparseBatch to provide the TPU with embedding
 //    indices and aggregation weights.
-// 4. TPUEmbeddingReceiveActivations returns a list of Tensors, containing the
+// 4. RecvTPUEmbeddingActivations returns a list of Tensors, containing the
 //    activations from each table specified in the configuration.
 // 5. TPUEmbeddingActivations, when used with appropriate Python libraries,
 //    enables the automatic differentiation of models that use embeddings.
-// 6. TPUEmbeddingSendGradients takes a list of Tensors (of the same shapes
+// 6. SendTPUEmbeddingGradients takes a list of Tensors (of the same shapes
 //    as those returned by TPUEmbeddingReceiveActivations) containing gradients
 //    to use in updating the embedding tables.
-// 7. Before saving a checkpoint, use the TPUEmbeddingRetrieve Op to update
+// 7. Before saving a checkpoint, use the RetrieveTPUEmbedding Ops to update
 //    the Graph's embedding table Variables from the updated tables in the
 //    TPU memories.
 //
@@ -455,20 +455,21 @@ REGISTER_OP("SendTPUEmbeddingGradients")
       return Status::OK();
     })
     .Doc(R"doc(
-An op that performs gradient updates of embedding tables.
-
-The TensorList argument has the same length and shapes as the return value of
-TPUEmbeddingReceiveActivations, but contains gradients of the model's loss
-with respect to the embedding activations. The embedding tables are updated
-from these gradients via the optimizer specified in the configuration given
-to tpu.initialize_system.
+An op that performs gradient updates of embedding tables using the specified
+learning rates.
 
 inputs: A TensorList of gradients with which to update embedding tables.
-    It contains one tensor per embedding table in the model.
-learning_rates: A list of float32 scalars, one for each embedding table,
-    containing the learning rates for each table when dynamic learning rate is
-    enabled through the OptimizationParameters in TPUEmbeddingConfiguration.
-    When the learning rate is constant, the list should be empty.
+    This argument has the same length and shapes as the return value of
+    RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+    with respect to the embedding activations. The embedding tables are updated
+    from these gradients via the optimizer specified in the TPU embedding
+    configuration given to tpu.initialize_system.
+learning_rates: A TensorList of float32 scalars, one for each dynamic learning
+    rate tag: see the comments in
+    //third_party/tensorflow/contrib/tpu/proto/optimization_parameters.proto.
+    Multiple tables can share the same dynamic learning rate tag as specified
+    in the configuration. If the learning rates for all tables are constant,
+    this list should be empty.
 config: Serialized TPUEmbeddingConfiguration proto.
 )doc");
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index f27ae38e0434991da7475e631be1c6cb4a463118..807cf26fe983b4ebe17695d6f4f90ecfc0e0cbf5 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -33,7 +33,7 @@ setup(
     long_description='Tools for capture TPU profile',
     url='https://www.tensorflow.org/tfrc/',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     packages=['cloud_tpu_profiler'],
     package_data={
         'cloud_tpu_profiler': ['data/*'],
diff --git a/tensorflow/contrib/tpu/profiler/trace_events.proto b/tensorflow/contrib/tpu/profiler/trace_events.proto
index cb2b9162677a0ebe8240a98671b1cabc1cee0c9f..96c4784c691d8f34cf8715cdc0ed9886412f5f90 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events.proto
+++ b/tensorflow/contrib/tpu/profiler/trace_events.proto
@@ -56,4 +56,7 @@ message TraceEvent {
   // The duration of the event in picoseconds if applicable.
   // Events without duration are called instant events.
   uint64 duration_ps = 10;
+
+  // Extra arguments that will be displayed in trace view.
+  map<string, string> args = 11;
 }
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
index c20cab844cfaf083be2702a29ac2a152c7b72c2a..ea98ee25c89e1b7bef39276bae5c98bf382dbd7f 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -49,6 +49,15 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library(
+    name = "dynamic_padding_proto",
+    srcs = [
+        "dynamic_padding.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library_py(
     name = "compilation_result_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/proto/dynamic_padding.proto b/tensorflow/contrib/tpu/proto/dynamic_padding.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c9ebf181169a583d774ef77ca0b8c243ce733615
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/dynamic_padding.proto
@@ -0,0 +1,19 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.tpu;
+
+// A mapping between the dynamic shape dimension of an input and the arg that
+// represents the real shape.
+message PaddingMap {
+  // Input arg index with dynamic shapes.
+  int32 arg_index = 1;
+
+  // The dynamic shape dimension index.
+  int32 shape_index = 2;
+
+  // The arg index that dynamic dimension maps to, which represents the value
+  // of the real shape.
+  int32 padding_arg_index = 3;
+}
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index aae1ab1d37a166303883e3a07a7a01efe2feab51..bc50c613f3d2a09f9e51353fab4938055549a4cd 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -9,9 +9,38 @@ message ClippingLimits {
   google.protobuf.FloatValue upper = 2;  // +inf if not set
 }
 
-// Get the learning rate from the parameters of the SendTPUEmbeddingGradients
-// op.
+// Dynamic learning rate specification in the TPUEmbeddingConfiguration. The
+// actual learning rates are provided as a scalar input list to the
+// SendTPUEmbeddingGradients Op indexed by their tag specified through the
+// following proto.
 message DynamicLearningRate {
+  // For tables where learning rates are dynamically computed and communicated
+  // to the TPU embedding program, a tag must be specified for the learning
+  // rate.
+  //
+  // The tag must be a non-negative  integer. The total number of unique tags
+  // must be less than or equal to the number of tables in the TPU embedding
+  // configuration (a table does not specify any tag if it uses a constant
+  // learning rate, and specifies exactly one tag if it uses dynamic learning
+  // rates).
+  //
+  // All tags in the range [0, number_of_unique_tags) must be present in the TPU
+  // embedding configuration, i.e. a tag cannot be skipped if a different tag
+  // numerically greater than it is used in the configuration.
+  //
+  // If multiple tables specify the same tag, they *MUST* have
+  // the same dynamic learning rate, for example, their dynamic learning rate
+  // could be computed by the same TensorFlow sub-graph. The partitioning of the
+  // embedding layer would be more optimal if the number_of_unique_tags is as
+  // *LOW* as possible, i.e., if many tables share the same tag.
+  //
+  // The learning_rate input of the SendTPUEmbeddingGradients op is used to
+  // communicate dynamic learning rates to the TPU embedding program.
+  // The learning_rate input is a list of scalars where the size of the list is
+  // equal to the number of unique tags. The learning rate associated with a
+  // particular tag is specified by populating its corresponding index in the
+  // list of learning_rate scalars.
+  int32 tag = 1;
 }
 
 // Source of learning rate to use.
@@ -186,7 +215,8 @@ message OptimizationParameters {
 }
 
 // Specification of an optimization algorithm's state variables (both the main
-// value vector and any extra accumulators, etc.).
+// value vector and any extra accumulators, etc.). This proto is only used
+// internally by the TPU software and is not exposed directly to the TF model.
 message StateVariableSpecification {
   // Parameter name for the state variable.
   string name = 1;
@@ -194,6 +224,20 @@ message StateVariableSpecification {
   // A normal state variable that should be saved and restored in checkpoints
   // and used as an input or output to non-debug TensorFlow ops.
   message UserDefined {
+    // For padding embedding rows, this field specifies the initial value to be
+    // used. Separate initial values need to be specified for the embeddings and
+    // any extra accumulators. The initial values should be specified so as to
+    // maintain two invariants during model training:
+    // (1) The embedding vector multiplied by zero returns a vector containing
+    //     all zeros. To maintain this invariant, the embedding values should
+    //     never be NaNs or +-infinity.
+    // (2) Repeatedly applying the optimizer using a gradient vector of all
+    //     zeros does not cause the embeddings or slot variables to become NaNs
+    //     or +-infinity.
+    // The padding row is looked up when no embedding IDs are present for a
+    // feature. The semantics of embedding lookup dictate that the output must
+    // be zero under this scenario.
+    double padding_initial_value = 1;
   }
 
   // A state variable that should be filled with a constant and normally hidden
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 6a6eba282a12d68cc3cd4e46a46a1b4190fb737b..9260e7b8a800c3bf160923af95867d44342000a3 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -217,13 +217,19 @@ if platform.system() != "Windows":
 
     Args:
       inputs: A TensorList of gradients with which to update embedding tables.
-        Contains one tensor per embedding table in the model.
+          This argument has the same length and shapes as the return value of
+          RecvTPUEmbeddingActivations, but contains gradients of the model's
+          loss with respect to the embedding activations. The embedding tables
+          are updated from these gradients via the optimizers specified in the
+          TPU embedding configuration given to tpu.initialize_system.
       config: Serialized TPUEmbeddingConfiguration proto.
-      learning_rates: A TensorList of float32 scalars, one for each embedding
-        table, containing the learning rates for each table when dynamic
-        learning rate is enabled through the OptimizationParameters in
-        TPUEmbeddingConfiguration. When the learning rate is constant, the list
-        should be empty (optional).
+      learning_rates: A TensorList of float32 scalars, one for each dynamic
+          learning rate tag: see the comments in
+          //third_party/tensorflow/contrib/tpu/proto/
+                                               optimization_parameters.proto.
+          Multiple tables can share the same dynamic learning rate tag as
+          specified in the configuration. If the learning rates for all tables
+          are constant, this list should be empty.
       name: A name for the operation (optional).
 
     Returns:
@@ -337,9 +343,8 @@ if platform.system() != "Windows":
     Args:
       sample_indices: A list of rank 1 Tensors specifying the training example
         to which the corresponding embedding_indices and aggregation_weights
-        values
-        belong. It corresponds to sp_ids.indices[:,0] in
-          embedding_lookup_sparse().
+        values belong. It corresponds to sp_ids.indices[:,0] in
+        embedding_lookup_sparse().
       embedding_indices: A list of rank 1 Tensors, indices into the embedding
         tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
       aggregation_weights: A list of rank 1 Tensors containing per training
diff --git a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ce96e5bcdbe5777f68eb969be46423b5b3410cb
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
@@ -0,0 +1,273 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Tooling for support TPU embedding in TPUEstimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.contrib.tpu.python.tpu import feature_column as tpu_fc
+from tensorflow.contrib.tpu.python.tpu import tpu_embedding
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.feature_column import feature_column as core_fc
+from tensorflow.python.feature_column import feature_column_lib as core_fc_lib
+
+# pylint: disable=protected-access
+_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn,
+                                 tpu_fc._TPUSharedEmbeddingColumn)
+_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn,
+                             core_fc_lib.EmbeddingColumn,
+                             core_fc._SharedEmbeddingColumn)
+_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn)
+
+# pylint: enable=protected-access
+
+
+def get_tpu_embedding_config_from_feature_columns(feature_columns):
+  """Create configs for TPUEmbedding from a list of feature columns.
+
+  This function will place one embedding tensor per table and the return is
+  intended to be used as input to TPUEmbedding.
+
+  Args:
+    feature_columns: a list of supported feature columns.
+
+  Returns:
+    A pair of dicts, the first maps tables to their config, the second maps
+    features to tables.
+  """
+
+  allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn)  # pylint: disable=protected-access
+
+  for column in feature_columns:
+    if not isinstance(column, allowed):
+      raise TypeError(
+          'Unsupported feature column {}. Supported types are {}.'.format(
+              type(column), allowed))
+
+  table_to_config = {}
+  feature_to_table = {}
+  for column in feature_columns:
+    feature_name = column.get_feature_key_name()
+    table_name = 'tbl_{}'.format(column.get_embedding_var_name())
+    if feature_name in feature_to_table:
+      raise ValueError(
+          'Feature column {} is used with multiple embeddings and this is '
+          'not supported.'.format(feature_name))
+    feature_to_table[feature_name] = table_name
+    vocabulary_size, dimension = column.get_embedding_table_size()
+    table_to_config[table_name] = tpu_embedding.TableConfig(
+        vocabulary_size=vocabulary_size,
+        dimension=dimension,
+        initializer=column.get_initializer(),
+        combiner=column.get_combiner())
+
+  return table_to_config, feature_to_table
+
+
+def _get_tpu_embedding_optimization_parameters(embedding_config_spec):
+  """Get tpu_embedding._OptimizationParameters from EmbeddingConfigSpec."""
+  if embedding_config_spec.optimizer_type == 'adagrad':
+    return tpu_embedding.AdagradParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.adagrad_initial_accumulator,
+        embedding_config_spec.use_gradient_accumulation)
+  elif embedding_config_spec.optimizer_type == 'sgd':
+    return tpu_embedding.StochasticGradientDescentParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.use_gradient_accumulattion)
+  elif embedding_config_spec.optimizer_type == 'adam':
+    return tpu_embedding.AdamParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.adam_parameters.beta1,
+        embedding_config_spec.adam_parameters.beta2,
+        embedding_config_spec.adam_parameters.epsilon,
+        use_gradient_accumulation=embedding_config_spec
+        .use_gradient_accumulation)
+  else:
+    raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
+
+
+AdamParameters = collections.namedtuple('AdamParameters',
+                                        ['beta1', 'beta2', 'epsilon'])
+
+
+# TODO(shizhiw): Improve the API to support more optimizer parameters in API.
+class EmbeddingConfigSpec(
+    collections.namedtuple('EmbeddingConfigSpec', [
+        'feature_columns', 'learning_rate', 'optimizer_type',
+        'adagrad_initial_accumulator', 'clipping_limit',
+        'use_gradient_accumulation', 'adam_parameters'
+    ])):
+  """Class to keep track of embedding config specification."""
+
+  def __new__(cls,
+              feature_columns,
+              learning_rate,
+              optimizer_type='adagrad',
+              adagrad_initial_accumulator=None,
+              clipping_limit=None,
+              use_gradient_accumulation=False,
+              adam_parameters=None):
+    """Creates an EmbeddingConfigSpec instance.
+
+    Args:
+      feature_columns: All `FeatureColumn`s used by model.
+      learning_rate: embedding optimizer learning rate.
+      optimizer_type: (String) Name of the optimizer for embedding gradients
+        updates. Must be either 'adagrad' ( `tf.train.AdagradOptimizer`, default
+        value), 'sgd' (`tf.train.GradientDescentOptimizer`), or 'adam'
+        (`tf.contrib.opt.LazyAdamOptimizer`) for lazy Adam. This optimizer will
+        be applied to all embedding variables specified by `feature_columns`.
+      adagrad_initial_accumulator: Initial accumulator for Adagrad. Used when
+        optimizer_type is 'adagrad'. Default is `0.1`.
+      clipping_limit: (Optional) Clipping limit (absolute value).
+      use_gradient_accumulation: (Experimental) Whether to accumulate the
+        gradients across TPU embedding mini-batches. Gradient accumulation does
+        not affect SGD and therefore this is applicable only for Adagrad.
+      adam_parameters: AdamParameters. Used when optimizer_type is 'adam'.
+        Default is 0.9 for beta1, 0.999 for beta2 and 1e-8 for epsilon.
+
+    Returns:
+      An EmbeddingConfigSpec instance.
+
+    Raises:
+      ValueError: If the feature_columns are not specified.
+      TypeError: If the feature columns are not of ths correct type (one of
+        _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR
+        _EMBEDDING_COLUMN_CLASSES).
+      ValueError: If use_gradient_accumulation is True for SGD.
+      ValueError: If `optimizer_type` is not one of "adagrad" or "sgd" or
+        "adam".
+    """
+    if not feature_columns:
+      raise ValueError('`feature_columns` cannot be `None` or empty.')
+
+    # It is unknown at this moment, whether the TPUEstimator is running in CPU
+    # or TPU mode. So allow non-TPU embedding columns also.
+    supported_classes = tuple(
+        list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) +
+        list(_EMBEDDING_COLUMN_CLASSES))
+
+    for column in feature_columns:
+      if not isinstance(column, supported_classes):
+        raise TypeError(
+            'All feature columns must be supported types in {}. Got {}'.format(
+                supported_classes, type(column)))
+
+    if optimizer_type == 'adagrad':
+      if adagrad_initial_accumulator is None:
+        adagrad_initial_accumulator = 0.1
+      if adagrad_initial_accumulator <= 0:
+        raise ValueError('Adagrad initial_accumulator must be positive')
+    elif optimizer_type == 'sgd':
+      if use_gradient_accumulation:
+        raise ValueError('Gradient accumulation makes sense for Adagrad only.')
+    elif optimizer_type == 'adam':
+      if adam_parameters is None:
+        adam_parameters = AdamParameters(0.9, 0.999, 1e-8)
+      if adam_parameters.beta1 < 0. or adam_parameters.beta1 >= 1.:
+        raise ValueError('beta1 must be between 0. and 1; got {}.'.format(
+            adam_parameters.beta1))
+      if adam_parameters.beta2 < 0. or adam_parameters.beta2 >= 1.:
+        raise ValueError('beta2 must be between 0. and 1; got {}.'.format(
+            adam_parameters.beta2))
+      if adam_parameters.epsilon <= 0.:
+        raise ValueError('epsilon must be positive; got {}.'.format(
+            adam_parameters.epsilon))
+    else:
+      raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
+
+    return super(EmbeddingConfigSpec, cls).__new__(
+        cls,
+        feature_columns=feature_columns,
+        learning_rate=learning_rate,
+        optimizer_type=optimizer_type,
+        adagrad_initial_accumulator=adagrad_initial_accumulator,
+        clipping_limit=clipping_limit,
+        use_gradient_accumulation=use_gradient_accumulation,
+        adam_parameters=adam_parameters)
+
+
+class EmbeddingConfig(object):
+  """This is the internal immutable object for embedding config.
+
+  `_EmbeddingConfig` is responsible to _translate_ user provided
+  `EmbeddingConfigSpec` to internal data structures, mostly constructor
+  arguments of `TPUEmbedding`.
+  """
+
+  def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size,
+               num_hosts, num_cores, master):
+    self._embedding_config_spec = embedding_config_spec
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._num_hosts = num_hosts
+    self._num_cores = num_cores
+    self._master = master
+
+    self._table_to_config_dict, self._feature_to_table_dict = (
+        get_tpu_embedding_config_from_feature_columns(
+            embedding_config_spec.feature_columns))
+    self._optimization_parameters = _get_tpu_embedding_optimization_parameters(
+        self._embedding_config_spec)
+    self._mode_to_tpu_embedding_dict = {}
+
+  def has_embedding_tables(self):
+    return bool(self._table_to_config_dict)
+
+  def _create_tpu_embedding(self, mode):
+    """Create tpu_embedding.TPUEmbedding based on mode."""
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      batch_size = self._train_batch_size
+    else:
+      batch_size = self._eval_batch_size
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      tpu_embedding_mode = tpu_embedding.TRAINING
+    elif (mode == model_fn_lib.ModeKeys.EVAL or
+          mode == model_fn_lib.ModeKeys.PREDICT):
+      tpu_embedding_mode = tpu_embedding.INFERENCE
+    else:
+      raise ValueError('Mode {} is not supported.'.format(mode))
+
+    tpu_embedding_ = tpu_embedding.TPUEmbedding(
+        self._table_to_config_dict,
+        self._feature_to_table_dict,
+        batch_size,
+        tpu_embedding_mode,
+        self._master,
+        self._optimization_parameters,
+    )
+    return tpu_embedding_
+
+  def get_tpu_embedding(self, mode):
+    if mode not in self._mode_to_tpu_embedding_dict:
+      self._mode_to_tpu_embedding_dict[mode] = (
+          self._create_tpu_embedding(mode))
+    return self._mode_to_tpu_embedding_dict[mode]
+
+
+def split_inputs(ctx, features, labels):
+  """Splits the dense and sparse tensors inside the features and labels."""
+  sparse_features = collections.OrderedDict()
+  if ctx.embedding_config:
+    tpu_embedding_ = ctx.embedding_config.tpu_embedding
+    for feature_key in tpu_embedding_.feature_to_table_dict:
+      sparse_features[feature_key] = features.pop(feature_key)
+
+  return features, labels, sparse_features
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index 8d6245390fc3fa005c92d01bc9b64ddb47583582..bc0cd41d210ac6f8de1b20ebf744ee1e1dd04137 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -142,15 +142,12 @@ def StreamingFilesDataset(files,
       source_dataset = source_dataset.shuffle(
           buffer_size=filename_shuffle_buffer_size)
 
-    # NOTE: We perform the `repeat` on the source dataset, because the output
-    # dataset does not currently have enough information to recreate an iterator
-    # over the source dataset when it reaches the end.
-    source_dataset = source_dataset.repeat(num_epochs)
-
     source_dataset = source_dataset.apply(
         interleave_ops.parallel_interleave(
             reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
 
+    source_dataset = source_dataset.repeat(num_epochs)
+
     if batch_transfer_size:
       source_dataset = source_dataset.batch(batch_transfer_size)
 
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index 6906501ecf90c8e577aa0becf2dba818deb19df4..3313dc749c2c7606101b2dc96614df2d052dfed1 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -25,6 +25,9 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.tpu.python.tpu.topology import Topology
 
 
+SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0]]]
+
+
 def _compute_task_and_cores_to_replicas(core_assignment, topology):
   """Computes a nested dict which maps task and logical core to replicas."""
   task_and_cores_to_replicas = {}
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column.py b/tensorflow/contrib/tpu/python/tpu/feature_column.py
index d5d00d628d407bf3bb5312bd54f6ccd13dc37db4..8edf131bc24fd003806263570b63ee8514c49896 100644
--- a/tensorflow/contrib/tpu/python/tpu/feature_column.py
+++ b/tensorflow/contrib/tpu/python/tpu/feature_column.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
 import math
 
 from tensorflow.contrib.tpu.python.tpu import tpu
@@ -279,11 +278,10 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if tpu.under_tpu_inference_context():
-      # TODO(shizhiw, b/112012627, b/112336539): Replace _outside_all_rewrites()
-      # with outside compilation.
-      with _outside_all_rewrites():
+      def host_computation():
         return fc._EmbeddingColumn._get_dense_tensor(
             self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
 
     if _is_running_on_cpu():
       return fc._EmbeddingColumn._get_dense_tensor(
@@ -300,13 +298,6 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
     return tensor
 
 
-@contextlib.contextmanager
-def _outside_all_rewrites():
-  """'Break out' of a tpu.rewrite() (or shard(), etc.)."""
-  with ops.control_dependencies(None):
-    yield
-
-
 class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
                                 fc._SharedEmbeddingColumn):
   """Core Shared Embedding Column."""
@@ -385,11 +376,10 @@ class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
 
   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
     if tpu.under_tpu_inference_context():
-      # TODO(shizhiw, b/112012627, b/112336539): Replace _outside_all_rewrites()
-      # with outside compilation.
-      with _outside_all_rewrites():
+      def host_computation():
         return fc._SharedEmbeddingColumn._get_dense_tensor(
             self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
 
     if _is_running_on_cpu():
       return fc._SharedEmbeddingColumn._get_dense_tensor(
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 3e463823c820a3ef8628324f77e1a9caf8d385d5..f5735cecc38b7033f21fc4d4105cfead233379fa 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -185,7 +185,8 @@ def all_worker_devices(session):
   """Return a list of devices for each worker in the system."""
   devices = session.list_devices()
   return [
-      device.name for device in devices
+      device.name
+      for device in devices
       if ':CPU:' in device.name and 'coordinator' not in device.name
   ]
 
@@ -255,12 +256,14 @@ class WatchdogManager(threading.Thread):
     self._worker_manager.configure(
         event_pb2.WorkerHeartbeatRequest(
             watchdog_config=event_pb2.WatchdogConfig(
-                timeout_ms=self.shutdown_timeout * 1000,)))
+                timeout_ms=self.shutdown_timeout * 1000,),
+            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
 
   def configure_and_run(self):
-    logging.info('Enabling watchdog timer with %d second timeout '
-                 'and %d second ping interval.',
-                 self.shutdown_timeout, self.ping_interval)
+    logging.info(
+        'Enabling watchdog timer with %d second timeout '
+        'and %d second ping interval.', self.shutdown_timeout,
+        self.ping_interval)
     self._reset_manager()
     self._running = True
     self.start()
@@ -269,7 +272,8 @@ class WatchdogManager(threading.Thread):
     logging.info('Stopping worker watchdog.')
     self._worker_manager.configure(
         event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,)))
+            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
+            shutdown_mode=event_pb2.NOT_CONFIGURED))
     self._running = False
     self.join()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
index a1494e3660bc09e3af45e81097151a35990810fb..bf492e78a15acc92017663a286e8c8f0b2045339 100644
--- a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -47,6 +47,8 @@ _TRACE_MODE_PART_TENSOR_SIZE = 3
 _TRACE_MODE_FULL_TENSOR = 'full-tensor'
 _TRACE_MODE_NORM = 'norm'
 _TRACE_MODE_MAX_ABS = 'max-abs'
+_SUBMODE_BRIEF = 'brief'
+_SUBMODE_DETAILED = 'detailed'
 _REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
 _REASON_UNSAFE_OP = 'not-traced-unsafe-op'
 _REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
@@ -57,6 +59,7 @@ _REASON_SCALAR_GET_TRACED = 'traced-scalar'
 _REASON_TENSOR_GET_TRACED = 'traced-tensor'
 _REASON_USER_INCLUDED = 'traced-user-included'
 _REASON_USER_EXCLUDED = 'not-traced-user-excluded'
+_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
 _REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
 _MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
 _MARKER_SECTION_END = '!!!!!!! section-end:'
@@ -68,6 +71,7 @@ _SECTION_NAME_GRAPH = 'graph'
 _FIELD_NAME_VERSION = 'version:'
 _FIELD_NAME_DEVICE = 'device:'
 _FIELD_NAME_TRACE_MODE = 'trace-mode:'
+_FIELD_NAME_SUBMODE = 'submode:'
 _FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
 _FIELD_NAME_NUM_OPS = 'number-of-ops:'
 _FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
@@ -76,8 +80,10 @@ _FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
 _FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
 _FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
 _FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
+_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
 _FLAG_NAME_ENABLE = 'enable'
 _FLAG_NAME_TRACE_MODE = 'trace_mode'
+_FLAG_NAME_SUBMODE = 'submode'
 _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
 _FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
 _FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
@@ -94,7 +100,7 @@ _TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
 _TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
 
 
-def tensor_checkpoint(tensor, checkpoint_name):
+def tensor_tracepoint(tensor, checkpoint_name):
   """Adds a checkpoint with the given checkpoint name for the given tensor.
 
   The tensor will be added to the list of tensors that will be traced by the
@@ -115,10 +121,10 @@ def tensor_checkpoint(tensor, checkpoint_name):
   return tensor
 
 
-def keras_layer_checkpoint(layer, checkpoint_name):
+def keras_layer_tracepoint(layer, checkpoint_name):
   """An interface for adding the tensor outputs of a keras layer.
 
-  Encapsulates tensor_checkpoint.
+  Encapsulates tensor_tracepoint.
 
   Args:
      layer: A keras layer.
@@ -132,12 +138,12 @@ def keras_layer_checkpoint(layer, checkpoint_name):
   try:
     outputs = layer.output
     if tensor_util.is_tensor(outputs):
-      tensor_checkpoint(outputs, '%s' % (checkpoint_name))
+      tensor_tracepoint(outputs, '%s' % (checkpoint_name))
     else:
       idx = 0
       for output_tensor in outputs:
         if tensor_util.is_tensor(outputs):
-          tensor_checkpoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
+          tensor_tracepoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
         idx += 1
   except AttributeError:
     pass
@@ -165,21 +171,39 @@ class TensorTracer(object):
 
   @staticmethod
   def _match_next_flag(flags, pos):
-    """Returns the match for the next TensorTracer flag."""
+    """Returns the match for the next TensorTracer flag.
+
+    Args:
+       flags: a string that contains the flags.
+       pos: where in flags to start the search.
+
+    Returns:
+       A pair where the first element is the regular-expression
+       match found and the second element indicates if the match
+       has a value.
+    """
 
     match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
     if match:
-      return match
+      return match, True
     match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
     if match:
-      return match
+      return match, True
     match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
-    return match
+    if match:
+      return match, True
+    match = _FLAG_NO_EQUAL_PAT.match(flags, pos)
+    if match:
+      # The flag is found but is not given a value.
+      return match, False
+    # The flag is not found.
+    return None, False
 
   @staticmethod
   def validate_flag_names():
     """Validates if the TensorTrace flags passed are valid."""
     valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
+                        _FLAG_NAME_SUBMODE,
                         _FLAG_NAME_EXCLUDED_OPNAMES,
                         _FLAG_NAME_EXCLUDED_OPTYPES,
                         _FLAG_NAME_INCLUDED_OPNAMES,
@@ -193,7 +217,7 @@ class TensorTracer(object):
       return
     pos = 0
     while True:
-      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      match, _ = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
       if not match:
         break
       flag_name = match.group(1)
@@ -216,11 +240,15 @@ class TensorTracer(object):
     result += 'Individual flag value:\n'
     pos = 0
     while True:
-      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
       if not match:
         break
       flag_name = match.group(1)
-      flag_value = match.group(2)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
       result += '  %s: %s\n'%(flag_name, flag_value)
       pos = match.end()
     result += '\n'
@@ -228,30 +256,45 @@ class TensorTracer(object):
 
   @staticmethod
   def get_flag_value(wanted_flag_name):
-    """Returns the value of a TensorTracer flags."""
+    """Returns the value of a TensorTracer flags.
+
+    Args:
+      wanted_flag_name: the name the the flag we are looking for.
+
+    Returns:
+      A pair where the first element indicates if the flag is
+      found and the second element is the value of the flag.
+
+    Raises:
+      RuntimeError: If supposedly deadcode is reached.
+    """
 
     tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
     if not tensor_tracer_flags:
-      return ''
+      return False, None
     pos = 0
     while True:
-      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
       if not match:
-        return ''
+        return False, None
       flag_name = match.group(1)
-      flag_value = match.group(2)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
       if flag_name == wanted_flag_name:
-        return flag_value
+        return True, flag_value
       pos = match.end()
-    return ''
+    raise RuntimeError('Should not reach here.')
 
   @staticmethod
   def flag_value_to_re_list(flag_name):
     """Converts list of strings to compiled RE."""
 
     re_list = []
-    flag_value = TensorTracer.get_flag_value(flag_name)
-    if not flag_value:
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found or not flag_value:
       return re_list
     list_of_values = flag_value.split()
     for v in list_of_values:
@@ -260,32 +303,41 @@ class TensorTracer(object):
     return re_list
 
   @staticmethod
-  def is_enabled():
-    """Returns True if TensorTracer is enabled."""
+  def _is_flag_on(flag_name):
+    """Returns True if the given flag is on."""
 
-    flag_value = TensorTracer.get_flag_value(_FLAG_NAME_ENABLE)
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found:
+      return False
+    if flag_value is None:
+      return True
+    # Depends on the flag value.
     flag_value = flag_value.lower()
     enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
     return enabled
 
+  @staticmethod
+  def is_enabled():
+    """Returns True if TensorTracer is enabled."""
+
+    return TensorTracer._is_flag_on(_FLAG_NAME_ENABLE)
+
   @staticmethod
   def use_test_undeclared_outputs_dir():
-    """Decides the output directory of the trace file.
+    """Decides the output directory of the report and trace files.
 
     Args:
        None.
 
     Returns:
-       True if the output trace file should be written to the
+       True if the output files should be written to the
        test-undeclared-outputs-directory defined via an
        env variable.
     """
 
-    flag_value = TensorTracer.get_flag_value(
+    return TensorTracer._is_flag_on(
         _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
-    flag_value = flag_value.lower()
-    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
-    return enabled
+
 
   @staticmethod
   def check_device_type(device_type):
@@ -306,6 +358,18 @@ class TensorTracer(object):
                        'Valid trace modes are: %s'%(trace_mode,
                                                     valid_trace_modes))
 
+  @staticmethod
+  def check_submode(submode):
+    """Checks if the given submode is valid."""
+
+    if not submode:
+      return
+    valid_submodes = [_SUBMODE_DETAILED, _SUBMODE_BRIEF]
+    if submode not in valid_submodes:
+      raise ValueError('Invalid submode "%s" given to the Tensor_Tracer.'
+                       'Valid submodes are: %s'%(submode,
+                                                 valid_submodes))
+
   @staticmethod
   def unsafe_op(op):
     """Returns True if this op is not safe to be traced."""
@@ -314,8 +378,7 @@ class TensorTracer(object):
       return True
     # Reasons for not including following op types:
     #    Assign: cause incorrect result with CPU tracing.
-    #    others: compilation problems.
-    if op.type in ['Assign', 'Pack', 'Shape', 'Reshape', 'ArgMin', 'ArgMax']:
+    if op.type in ['Assign']:
       return True
     return False
 
@@ -350,10 +413,12 @@ class TensorTracer(object):
   def less_interesting_op(op):
     """Returns True if the given Op is not an interesting one to be traced."""
 
-    include_less_interesting = TensorTracer.get_flag_value(
+    found, _ = TensorTracer.get_flag_value(
         _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
-    if include_less_interesting:
+    if found:
+      # users force to include all ops.
       return False
+    # Following ops are highly unlikey to cause bugs.
     return op.type in ['Const', 'Identity', 'Cast', 'Shape']
 
   @staticmethod
@@ -404,7 +469,7 @@ class TensorTracer(object):
                 temporarily_marked_ops, sorted_ops)
       # pylint: disable=protected-access
       for ctrl_output_op in op._control_outputs:
-      # pylint: enable=protected-access
+        # pylint: enable=protected-access
         visit(ctrl_output_op, cycle, permanently_marked_ops,
               temporarily_marked_ops, sorted_ops)
       temporarily_marked_ops.remove(op)
@@ -460,10 +525,14 @@ class TensorTracer(object):
     self._version = 'use-outside-compilation'
     self._device_type = None
     TensorTracer.validate_flag_names()
-    self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
-    if not self._trace_mode:
+    found, self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
+    if not found or not self._trace_mode:
       self._trace_mode = _TRACE_MODE_NAN_INF
     TensorTracer.check_trace_mode(self._trace_mode)
+    found, self._submode = TensorTracer.get_flag_value(_FLAG_NAME_SUBMODE)
+    if not found or not self._submode:
+      self._submode = _SUBMODE_DETAILED
+    TensorTracer.check_submode(self._submode)
     self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
     self._instrument_records = {}
     self._set_trace_file_path()
@@ -499,8 +568,10 @@ class TensorTracer(object):
   def _set_trace_file_path(self):
     """Sets the path of the output trace file."""
 
-    self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE)
-    if self._trace_file_path and TensorTracer.use_test_undeclared_outputs_dir():
+    found, self._trace_file_path = TensorTracer.get_flag_value(
+        _FLAG_NAME_TRACE_FILE)
+    if found and self._trace_file_path \
+       and TensorTracer.use_test_undeclared_outputs_dir():
       if os.path.isabs(self._trace_file_path):
         raise ValueError('If use_test_undeclared_outputs_dir is set,'
                          'trace_file_path cannot be an absolute path (%s)'
@@ -512,7 +583,17 @@ class TensorTracer(object):
   def _set_report_file(self):
     """Sets the path of the output report file."""
 
-    self._report_file_path = TensorTracer.get_flag_value(_FLAG_NAME_REPORT_FILE)
+    found, self._report_file_path = TensorTracer.get_flag_value(
+        _FLAG_NAME_REPORT_FILE)
+    if found and self._report_file_path \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      if os.path.isabs(self._report_file_path):
+        raise ValueError('If use_test_undeclared_outputs_dir is set,'
+                         'report_file_path cannot be an absolute path (%s)'
+                         %self._report_file_path)
+      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+      self._report_file_path = os.path.join(outputs_dir,
+                                            self._report_file_path)
     if not self._report_file_path:
       self._report_file = None
       return
@@ -528,8 +609,8 @@ class TensorTracer(object):
   def _set_op_range(self):
     """Sets the index range of the Ops that we will consider tracing."""
 
-    op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
-    if not op_range:
+    found, op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
+    if not found or not op_range:
       self._op_range = (-1, -1)  # this means including all ops.
       return
     match = _OP_RANGE_PAT.match(op_range)
@@ -595,6 +676,7 @@ class TensorTracer(object):
     self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
     self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
     self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE, self._submode))
     self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
 
@@ -606,7 +688,7 @@ class TensorTracer(object):
       self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
 
-  def _write_op_list_section(self, op_list, tensorname_idx_map):
+  def _write_op_list_section(self, op_list):
     """Writes the Op-list section of the report."""
 
     self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
@@ -615,10 +697,10 @@ class TensorTracer(object):
       op = op_list[i]
       line = '%d "%s" %s'%(i, op.name, op.type)
       for out_tensor in op.outputs:
-        if out_tensor.name not in tensorname_idx_map:
+        if out_tensor.name not in self._tensorname_idx_map:
           raise ValueError(
               'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
-        line += ' %d'%tensorname_idx_map[out_tensor.name]
+        line += ' %d'%self._tensorname_idx_map[out_tensor.name]
       line += '\n'
       self._write_report(line)
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
@@ -653,12 +735,64 @@ class TensorTracer(object):
       self._write_report('%d "%s"\n'%(i, l[i].name))
     self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
 
-  def _make_tensor_trace_fun(self, op_name, output_idx):
+  def _preprocess_traced_tensor(self, tensor):
+    """Computes NAN/Norm/Max on TPUs before sending to CPU.
+
+    Args:
+      tensor: The tensor to be traced.
+    Returns:
+      A tensor that should be input to the trace_function.
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        output_tensor = math_ops.reduce_any(
+            gen_math_ops.logical_or(
+                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
+      else:
+        output_tensor = constant_op.constant(False)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_norm(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = linalg_ops.norm(tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_max_abs(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
+      zero = constant_op.constant(0, dtypes.float32)
+      output_tensor = gen_math_ops.maximum(zero, output_tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf(tensor)
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_NORM:
+      return _show_norm(tensor)
+    if self._trace_mode == _TRACE_MODE_MAX_ABS:
+      return _show_max_abs(tensor)
+    raise RuntimeError(
+        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
+
+  def _make_tensor_trace_fun(self, tensor_name):
     """Makes the tensor tracing function called by outside compilation.
 
     Args:
-      op_name: the name of the Op that outputs the tensor to be traced.
-      output_idx: which output of the Op it is (0 means the first output).
+      tensor_name: name of the tensor being traced.
 
     Returns:
       A function to be passed as the first argument to outside compilation.
@@ -667,84 +801,72 @@ class TensorTracer(object):
       RuntimeError: If the trace mode is invalid.
     """
 
-    def _print_tensor(op_name, output_idx, num_elements, tensor, output_tensor):
+    def _print_tensor(tensor_name, num_elements, tensor, output_tensor):
       """Prints a tensor value to a file.
 
       Args:
-        op_name: the name of the Op that outputs the tensor to be printed.
-        output_idx: which output of the Op it is (0 means the first output).
+        tensor_name: name of the tensor being traced.
         num_elements: number of elements to print (-1 means print all).
         tensor: the tensor needs to be returned.
         output_tensor: the tensor needs to be printed.
 
       Returns:
         The same tensor passed via the "tensor" argument.
+
+      Raises:
+        ValueError: If tensor_name is not already in
+                    self._tensorname_idx_map.
       """
-      msg = '"%s:%d" '%(op_name, output_idx)
+
+      if self._submode == _SUBMODE_BRIEF:
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        msg = '%d'%self._tensorname_idx_map[tensor_name]
+      else:
+        msg = '"%s"'%tensor_name
+
       if self._trace_file_path:
         output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
       else:
         output_stream = sys.stderr
       print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
-                                      ' @', self._replica_id,
+                                      '@', self._replica_id,
                                       '\n', output_tensor, '\n',
                                       summarize=num_elements,
                                       output_stream=output_stream)
       with ops.control_dependencies([print_op]):
         return array_ops.identity(tensor).op
 
-    def _detect_nan_inf(tensor):
-      """Trace function for detecting any NaN/Inf in the tensor."""
-
-      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
-          dtypes.float16):
-        # Since host can't handle bf16, always convert tensor to f32.
-        tensor = math_ops.cast(tensor, dtypes.float32)
-        output_tensor = math_ops.reduce_any(
-            gen_math_ops.logical_or(gen_math_ops.is_nan(tensor),
-                                    gen_math_ops.is_inf(tensor)))
-      else:
-        output_tensor = constant_op.constant(0)
-      return _print_tensor(op_name, output_idx, -1, tensor, output_tensor)
-
-    def _show_norm(tensor):
-      tensor = math_ops.cast(tensor, dtypes.float64)
-      output_tensor = linalg_ops.norm(tensor)
-      return _print_tensor(op_name, output_idx, -1, tensor, output_tensor)
-
-    def _show_max_abs(tensor):
-      output_tensor = math_ops.cast(math_ops.reduce_max(math_ops.abs(tensor)),
-                                    dtypes.float64)
-      zero = constant_op.constant(0, dtypes.float64)
-      output_tensor = gen_math_ops.maximum(zero, output_tensor)
-      return _print_tensor(op_name, output_idx, -1, tensor, output_tensor)
 
     def _show_part_tensor(tensor):
       """Trace function for printing part of the tensor."""
 
-      return _print_tensor(op_name, output_idx, self._part_tensor_size,
+      return _print_tensor(tensor_name, self._part_tensor_size,
                            tensor, tensor)
 
     def _show_full_tensor(tensor):
       """Trace function for printing the entire tensor."""
 
-      return _print_tensor(op_name, output_idx, -1, tensor, tensor)
+      return _print_tensor(tensor_name, -1, tensor, tensor)
 
-    if self._trace_mode == _TRACE_MODE_NAN_INF:
-      return _detect_nan_inf
     if self._trace_mode == _TRACE_MODE_PART_TENSOR:
       return _show_part_tensor
-    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
+    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
+    # performed within TPUs and only their results are transferred to CPU.
+    # Simply, print the full tensor for these trace modes.
+    if self._trace_mode in [
+        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
+        _TRACE_MODE_MAX_ABS
+    ]:
       return _show_full_tensor
-    if self._trace_mode == _TRACE_MODE_NORM:
-      return _show_norm
-    if self._trace_mode == _TRACE_MODE_MAX_ABS:
-      return _show_max_abs
 
     raise RuntimeError('Tensor trace fun for %s is not yet implemented'
                        %self._trace_mode)
 
-  def _skip_op(self, op_id, op, user_included, user_excluded):
+  def _skip_op(self, op_id, op, user_included, user_excluded,
+               in_exec_path=True):
     """Returns True if we should not trace Op."""
 
     if user_included:
@@ -755,6 +877,10 @@ class TensorTracer(object):
       self._instrument_records[op.name] = TensorTracer.reason(
           op_id, _REASON_USER_EXCLUDED)
       return True
+    if not in_exec_path:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_NOT_EXECUTED)
+      return True
     if not self._inside_op_range(op_id):
       self._instrument_records[op.name] = TensorTracer.reason(
           op_id, _REASON_OUTSIDE_OP_RANGE)
@@ -797,9 +923,18 @@ class TensorTracer(object):
           op_id, _REASON_USER_EXCLUDED)
       return True
     if not out_tensor.get_shape().is_fully_defined():
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_DYNAMIC_SHAPE)
-      return True
+      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
+      # to a scalar before the outside compilation call.
+      if self._trace_mode in [
+          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
+      ]:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_TENSOR_GET_TRACED)
+        return False
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_DYNAMIC_SHAPE)
+        return True
     rank = len(out_tensor.shape)
     if rank < 1:
       # scalar
@@ -817,14 +952,48 @@ class TensorTracer(object):
           op_id, _REASON_TENSOR_GET_TRACED)
       return False
 
+  def _filter_execution_path_operations(self, operations, fetches):
+    """Returns the set of ops in the execution path to compute given fetches."""
+    # If no fetch provided, then return all operations.
+    if fetches is None:
+      return set(operations)
+    # Convert to list, if a single element is provided.
+    if not isinstance(fetches, (list, tuple)):
+      fetches = [fetches]
+    # If a tensor is given as fetch, convert it to op.
+    op_fetches = []
+    for fetch in fetches:
+      if isinstance(fetch, ops.Operation):
+        op_fetches.append(fetch)
+      elif isinstance(fetch, ops.Tensor):
+        op_fetches.append(fetch.op)
+      else:
+        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
+                           %fetch)
+
+    execution_path_operations = set(op_fetches)
+    traverse_stack = list(op_fetches)
+    while True:
+      if not traverse_stack:
+        break
+      head_op = traverse_stack.pop()
+      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
+      input_ops.extend(head_op.control_inputs)
+
+      for input_op in input_ops:
+        if input_op not in execution_path_operations:
+          execution_path_operations.add(input_op)
+          traverse_stack.append(input_op)
+    return execution_path_operations
+
   def _pre_tracing(self, graph):
     """Work needs to be done prior to TPU or CPU tracing."""
 
     operations = graph.get_operations()
-    (opname_idx_map, tensor_list, tensorname_idx_map) = (
+    (opname_idx_map, tensor_list, self._tensorname_idx_map) = (
         TensorTracer._make_op_and_tensor_maps(operations))
     self._write_config_section()
-    self._write_op_list_section(operations, tensorname_idx_map)
+    self._write_op_list_section(operations)
     self._write_tensor_list_section(tensor_list, opname_idx_map)
     # Does the topological sort before adding any nodes to the graph.
     (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
@@ -858,13 +1027,15 @@ class TensorTracer(object):
                                   _TENSOR_TRACER_CHECKPOINT))
     return checkpoint_operations
 
-  def trace_tpu(self, graph, result_tensor, num_replicas=None):
+  def trace_tpu(self, graph, result_tensor, num_replicas=None, fetches=None):
     """Traces the tensors generated by TPU Ops in a TF graph.
 
     Args:
       graph: the graph of Ops executed on the TPU.
       result_tensor: a result tensor of evaluating the graph.
       num_replicas: number of replicas used on the TPU.
+      fetches: the list of fetches given to session.run, used to determine the
+      ops in execution path. If None, the whole graph will be traced.
 
     Returns:
       A tuple (result_tensor_copy, tracing_ops), where:
@@ -876,11 +1047,27 @@ class TensorTracer(object):
                      graph is evaluated.
     """
 
+    def _cast_unsupported_dtypes(tensor):
+      """Casts tensor to a supported type."""
+
+      if tensor.dtype.__eq__(dtypes.int64):
+        # outside-compilation doesn't support int64 input yet.
+        return math_ops.cast(tensor, dtypes.int32)
+      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
+          dtypes.float16):
+        # Since host can't handle bf16, convert tensor to f32.
+        return math_ops.cast(tensor, dtypes.float32)
+      return tensor
+
     self._device_type = _DEVICE_TYPE_TPU
     TensorTracer.check_device_type(self._device_type)
     result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
                                                        result_tensor)
     (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph)
+    # Filter out the operations that won't be executed.
+    # if fetches=None, then ops_in_exec_path = set(operations)
+    ops_in_exec_path = self._filter_execution_path_operations(operations,
+                                                              fetches)
     tracing_ops = []
     checkpoint_operations = self._get_checkpoints(graph)
 
@@ -889,16 +1076,23 @@ class TensorTracer(object):
         continue
       user_included = self._is_user_included_op(op)
       user_excluded = self._is_user_excluded_op(op)
-      if self._skip_op(op_id, op, user_included, user_excluded):
+      in_exec_path = op in ops_in_exec_path
+      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
         continue
       for i in range(len(op.outputs)):
         out_tensor = op.outputs[i]
         if self._skip_tensor(op_id, out_tensor, user_included,
                              user_excluded):
           continue
+        # Create the list of consumers before calling _preprocess_traced_tensor.
+        # Otherwise, adding control input below, will introduce a cycle in the
+        # graph.
         consumers = out_tensor.consumers()
+        tensor_name = out_tensor.name
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
+        processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
         trace_op = tpu.outside_compilation(
-            self._make_tensor_trace_fun(op.name, i), out_tensor)
+            self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
         if consumers:
           for consumer_op in consumers:
             # pylint: disable=protected-access
@@ -944,8 +1138,9 @@ class TensorTracer(object):
         if self._skip_tensor(op_id, out_tensor, user_included,
                              user_excluded):
           continue
-        trace_fun = self._make_tensor_trace_fun(op.name, i)
-        trace_call = (trace_fun, [out_tensor])
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
+        trace_fun = self._make_tensor_trace_fun(out_tensor.name)
+        trace_call = (trace_fun, [processed_out_tensor])
         trace_call_key = 'tensor_tracing_cpu-%s:%d'%(op.name, i)
         tracing_calls[trace_call_key] = trace_call
     self._post_tracing(succeed, sorted_or_cycle)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 9266d81cf5fc035790062f0e307a5da0b01a9fc1..ebbccea02c70f06ac3e1231a359f2df4ebad3142 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -19,23 +19,29 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.compiler import xla
 from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.tpu.proto import dynamic_padding_pb2 as dynamic_padding
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.compat import compat as api_compat
 from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import nest
 
 
 # Operations that indicate some error in the users graph, e.g. a placeholder
@@ -480,14 +486,19 @@ def replicate(computation,
               inputs=None,
               infeed_queue=None,
               device_assignment=None,
-              name=None):
+              name=None,
+              maximum_shapes=None):
   """Builds a graph operator that runs a replicated TPU computation.
 
   Args:
     computation: A Python function that builds the computation to replicate.
     inputs: A list of lists of input tensors or `None` (equivalent to
       `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs.
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to computation.
     device_assignment: If not `None`, a `DeviceAssignment` describing the
@@ -497,15 +508,125 @@ def replicate(computation,
       only one core, and there is either only one replica, or the number of
       replicas is equal to the number of cores in the TPU system.
     name: (Deprecated) Does nothing.
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
   Returns:
-    A list of lists of output tensors, indexed by `[replica_num][output_num]`.
+    A list of outputs, indexed by `[replica_num]` each output can be a nested
+    structure same as what computation() returns with a few exceptions.
+
+    Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+
   Raises:
     ValueError: If all replicas do not have equal numbers of input tensors.
     ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
   """
-  return split_compile_and_replicate(computation, inputs, infeed_queue,
-                                     device_assignment, name)[1]
+  return split_compile_and_replicate(
+      computation,
+      inputs,
+      infeed_queue,
+      device_assignment,
+      name,
+      maximum_shapes=maximum_shapes)[1]
+
+
+def _pad_all_input(inputs, padded_shapes):
+  """Pad all input tensors given padded_shapes.
+
+  The real shape tensors will be concatenated with the padded original inputs.
+
+  Args:
+    inputs: The original inputs.
+    padded_shapes: A list of padded shapes for each input.
+
+  Returns:
+    The padded inputs and a PaddingMap list which maps the padded input
+    dimension to the real shape argument index.
+  """
+  input_shape_tensors = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    for idx, input_tensor in enumerate(inputs_per_core):
+      if core_idx == 0:
+        input_shape_tensors.append([])
+      input_shape_tensors[idx].append(array_ops.shape(input_tensor))
+
+  maximum_shapes = []
+  for shapes_per_input in input_shape_tensors:
+    maximum_shapes.append(
+        math_ops.reduce_max(array_ops.stack(shapes_per_input), axis=0))
+
+  padded_inputs = []
+  real_shapes = []
+  padding_maps = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    padded_inputs.append([])
+    real_shapes.append([])
+    real_shape_idx = len(inputs_per_core) - 1
+    for idx, input_tensor in enumerate(inputs_per_core):
+      input_shape_tensor = input_shape_tensors[idx][core_idx]
+      input_shape = input_tensor.get_shape()
+      padded_shape = padded_shapes[idx]
+
+      # The static shape of inputs should be compatible with the given padded
+      # shapes.
+      input_shape.assert_is_compatible_with(padded_shape)
+
+      if input_shape.is_fully_defined():
+        # Do nothing if the shape of the whole tensor is already static.
+        padded_inputs[core_idx].append(input_tensor)
+      else:
+        # Only pad the non static shape dimension.
+        for i, s in enumerate(input_shape):
+          if s.value is None:
+            if core_idx == 0:
+              real_shape_idx += 1
+              padding_map = dynamic_padding.PaddingMap()
+              padding_map.arg_index = idx
+              padding_map.shape_index = i
+              padding_map.padding_arg_index = real_shape_idx
+              padding_maps.append(padding_map)
+            real_shapes[core_idx].append(
+                math_ops.cast(input_shape_tensor[i], dtypes.uint32))
+
+        paddings = []
+        for i, s in enumerate(padded_shape):
+          if input_shape[i].value:
+            # Don't pad if input shape is already static.
+            padding = [0, 0]
+          else:
+            if s.value:
+              # Pad to the given maximum value.
+              padding = [0, s.value - input_shape_tensor[i]]
+            else:
+              # If maximum value is not given, then pad to the maximum dimension
+              # among all the cores.
+              padding = [0, maximum_shapes[idx][i] - input_shape_tensor[i]]
+          paddings.append(padding)
+
+        padded_input = array_ops.pad(input_tensor, paddings)
+        padded_inputs[core_idx].append(padded_input)
+
+  num_replicas = len(padded_inputs)
+  for i in range(num_replicas):
+    padded_inputs[i].extend(real_shapes[i])
+
+  return padded_inputs, padding_maps
 
 
 def split_compile_and_replicate(computation,
@@ -513,7 +634,8 @@ def split_compile_and_replicate(computation,
                                 infeed_queue=None,
                                 device_assignment=None,
                                 name=None,
-                                use_tpu=True):
+                                use_tpu=True,
+                                maximum_shapes=None):
   """Builds graph operators that runs compilation and replicated computation.
 
   This is a lower level interface than replicate that returns a separate compile
@@ -526,7 +648,11 @@ def split_compile_and_replicate(computation,
     computation: A Python function that builds the computation to replicate.
     inputs: A list of lists of input tensors or `None` (equivalent to
       `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs.
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to computation.
     device_assignment: If not `None`, a `DeviceAssignment` describing the
@@ -539,6 +665,15 @@ def split_compile_and_replicate(computation,
     use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
       backends. Currently, only supports a default placement (computation is
       placed on GPU if one is available, and on CPU if not).
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+
   Returns:
     A list of lists with the first list corresponding to the compile op and the
     second a list of output tensors, indexed by `[replica_num][output_num]`.
@@ -546,6 +681,10 @@ def split_compile_and_replicate(computation,
     ValueError: If all replicas do not have equal numbers of input tensors.
     ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
   """
   del name
   inputs = [[]] if inputs is None else inputs
@@ -580,24 +719,32 @@ def split_compile_and_replicate(computation,
   if num_replicas == 0:
     return []
 
+  # Checks all replicas have the same structure.
+  for i in xrange(1, num_replicas):
+    nest.assert_same_structure(inputs[0], inputs[i])
+
+  # Flatten inputs.
+  flat_inputs = [
+      nest.flatten(per_replica_input) for per_replica_input in inputs
+  ]
   # Converts inputs to Tensors.
-  inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in inputs]
+  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
 
   # Verifies that all replicas have matching numbers and types of inputs
-  input_types = [x.dtype for x in inputs[0]]
-  input_arity = len(input_types)
+  flat_input_types = [x.dtype for x in flat_inputs[0]]
+  input_arity = len(inputs[0])
+  flat_input_arity = len(flat_input_types)
   for i in range(num_replicas):
     if len(inputs[i]) != input_arity:
       raise ValueError("Replicas must have the same number of inputs. "
                        "Replica 0 had {} inputs, replica {} had {} "
                        "inputs.".format(input_arity, i, len(inputs[i])))
 
-    types = [x.dtype for x in inputs[i]]
-    if types != input_types:
-      raise ValueError(
-          "Replicas must have matching input types. Replica 0 had "
-          "input types {}, replica {} had input types {}".format(
-              input_types, i, types))
+    types = [x.dtype for x in flat_inputs[i]]
+    if types != flat_input_types:
+      raise ValueError("Replicas must have matching input types. Replica 0 had "
+                       "input types {}, replica {} had input types {}".format(
+                           flat_input_types, i, types))
 
   arg_error = xla.check_function_argument_count(
       computation, input_arity, infeed_queue)
@@ -616,13 +763,34 @@ def split_compile_and_replicate(computation,
                for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
                                              arg_error))
 
+  if maximum_shapes:
+    if infeed_queue:
+      raise ValueError(
+          "Dynamic input shapes are not supported with infeed queues")
+
+    # Make sure maximum_shapes has the same structure as inputs.
+    nest.assert_same_structure(inputs[0], maximum_shapes, check_types=False)
+
+    # Flatten padded shapes.
+    flat_maximum_shapes = nest.flatten(maximum_shapes)
+    flat_maximum_shapes = [
+        tensor_shape.TensorShape(s) for s in flat_maximum_shapes
+    ]
+
+    flat_inputs, padding_maps = _pad_all_input(flat_inputs, flat_maximum_shapes)
+
+    serialized_padding_maps = []
+    for padding_map in padding_maps:
+      serialized_padding_maps.append(padding_map.SerializeToString())
+    metadata_kwargs["padding_map"] = serialized_padding_maps
+
   graph = ops.get_default_graph()
 
   # Fan-in: Builds a TPUReplicatedInput node for each input.
-  computation_inputs = []
-  for i in range(0, input_arity):
-    replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
-    computation_inputs.append(
+  flat_replicated_inputs = []
+  for i in range(0, len(flat_inputs[0])):
+    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
+    flat_replicated_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
   cluster_name = graph.unique_name("cluster")
@@ -642,15 +810,27 @@ def split_compile_and_replicate(computation,
       # computation. This is to avoid orphaned TPUReplicatedInput nodes.
       # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
       # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
-      computation_inputs = [
+      flat_replicated_inputs = [
           array_ops.identity(x, name="replicated_input_{}".format(i))
-          for i, x in enumerate(computation_inputs)
+          for i, x in enumerate(flat_replicated_inputs)
       ]
-      for i in computation_inputs:
+      for i in flat_replicated_inputs:
         # pylint: disable=protected-access
-        i.op._set_attr("_tpu_input_identity", attr_value_pb2.AttrValue(b=True))
+        # Add an attribute to the identity node so that they could be removed in
+        # encapsulate TPU computation pass if unused. However we don't remove
+        # inputs when dynamic padding is enabled.
+        # TODO(rxsang): Use other ways except argument index in padding_map so
+        # outside compilation can work with dynamic padding correctly.
+        if maximum_shapes is None:
+          i.op._set_attr("_tpu_input_identity",
+                         attr_value_pb2.AttrValue(b=True))
         # pylint: enable=protected-access
 
+      # Unflatten the computation inputs to match original input structure.
+      computation_inputs = nest.pack_sequence_as(
+          structure=inputs[0],
+          flat_sequence=flat_replicated_inputs[:flat_input_arity])
+
       # If there is an infeed queue, adds the dequeued values to the
       # computation's inputs.
       if infeed_queue is not None:
@@ -691,51 +871,12 @@ def split_compile_and_replicate(computation,
       vscope.set_use_resource(saved_use_resource)
       vscope.set_custom_getter(saved_custom_getter)
 
-    # If the computation returns `None`, make it an empty tuple.
-    if outputs is None:
-      outputs = tuple()
-    # If the computation only returned one value, makes it a tuple.
-    if not isinstance(outputs, (list, tuple)):
-      outputs = (outputs,)
-
-    # Append `no_op` here so that fetching any return value of this function
-    # will trigger TPUExecute node.
-    outputs += (control_flow_ops.no_op(),)
-    try:
-      with ops.device(core(0)):
-        outputs = [
-            o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-            for o in outputs
-        ]
-    except Exception as e:
-      raise ValueError(
-          "TPU function return values must all either be Operations or "
-          "convertible to Tensors. Got '%s'" % str(e))
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          "TPU functions must return zero-or more Tensor values followed by "
-          "zero or more Operations.")
-    output_arity = len(output_tensors)
+    outputs_is_flat = xla.is_flat(outputs)
+    if outputs_is_flat:
+      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
+    else:
+      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
 
-    # Wraps outputs in Identity ops. Otherwise a replicated input copied
-    # straight to an output would bypass the replicate(). This would be bad
-    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-    # be rewritten away, leading to a runtime error.
-    # TODO(phawkins): extend the rewrite to elide these nodes instead.
-    new_output_tensors = []
-    for t in output_tensors:
-      with ops.device(t.device if t.device else core(0)):
-        o = array_ops.identity(t)
-        # pylint: disable=protected-access
-        o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
-        # pylint: enable=protected-access
-        new_output_tensors.append(o)
-    output_tensors = new_output_tensors
     context.ExitResult(output_tensors)
   finally:
     context.report_unsupported_operations()
@@ -747,11 +888,6 @@ def split_compile_and_replicate(computation,
     attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
     metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
 
-  # Fan-out: Builds a TPUReplicatedOutput node for each output.
-  outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
-                                           name="output{}".format(i))
-             for i in xrange(output_arity)]
-
   with ops.control_dependencies([metadata]):
     if use_tpu:
       compile_status = tpu_ops.tpu_compilation_result()
@@ -761,28 +897,146 @@ def split_compile_and_replicate(computation,
     else:
       compile_status = control_flow_ops.no_op(name="compilation_status")
 
-  with ops.control_dependencies(output_operations):
-    if output_arity == 0:
-      # Returns a list of NoOps dependent on the replication Op, indexed by
-      # [replica_num].
-      return [
-          compile_status, [
-              control_flow_ops.no_op(name="shard_%d" % i)
-              for i in range(num_replicas)
-          ]
-      ]
-    else:
-      # Wraps the outputs in identity operators so the names of any possible
-      # `fetch` nodes are preserved by the replication rewrite.
-      return [
-          compile_status, [[
-              array_ops.identity(
-                  outputs[out][replica],
-                  name="output_%d_shard_%d" % (out, replica))
-              for out in xrange(output_arity)
-          ]
-                           for replica in xrange(num_replicas)]
+  if not output_tensors:
+    # Returns a list of NoOps dependent on the replication Op, indexed by
+    # [replica_num].
+    return [
+        compile_status,
+        [
+            control_flow_ops.group(control_deps, name="shard_%d" % i)
+            for i in range(num_replicas)
+        ]
+    ]
+
+  # Fan-out: Builds a TPUReplicatedOutput node for each output.
+  replicated_outputs = [[] for i in xrange(num_replicas)]
+  for i, t in enumerate(output_tensors):
+    # Fan-out: Builds a TPUReplicatedOutput node for each output.
+    ys = tpu_ops.tpu_replicated_output(
+        t, num_replicas, name="output{}".format(i))
+
+    # Wraps the outputs in identity operators so the names of any possible
+    # `fetch` nodes are preserved by the replication rewrite.
+    with ops.control_dependencies(control_deps):
+      for replica in xrange(num_replicas):
+        replicated_outputs[replica].append(
+            array_ops.identity(
+                ys[replica], name="output_%d_shard_%d" % (i, replica)))
+
+  if not outputs_is_flat:
+    replicated_outputs = [
+        nest.pack_sequence_as(outputs, replica_outs)
+        for replica_outs in replicated_outputs
+    ]
+
+  return [compile_status, replicated_outputs]
+
+
+def _postprocess_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors and Operations extracted from outputs.
+  """
+  # Following code segment is to preserve legacy behavior. Previously we only
+  # supported flat outputs and thus for consistency it was nice to convert even
+  # single element into a tuple. But now that we support arbitrary output
+  # structure, this is no longer necessary.
+  # TODO(b/121383831): Migrate all legacy use cases and delete this special
+  # case.
+  # If the computation returns `None`, make it an empty tuple.
+  if outputs is None:
+    outputs = tuple()
+  # If the computation only returned one value, makes it a tuple.
+  if not isinstance(outputs, collections.Sequence):
+    outputs = (outputs,)
+
+  # Append `no_op` here so that fetching any return value of this function
+  # will trigger TPUExecute node.
+  outputs += (control_flow_ops.no_op(),)
+  try:
+    with ops.device(core(0)):
+      outputs = [
+          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+          for o in outputs
       ]
+  except Exception as e:
+    raise ValueError(
+        "TPU function return values must all either be Operations or "
+        "convertible to Tensors. Got '%s'" % str(e))
+
+  # Separates the returned Operations and Tensors.
+  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
+
+  if outputs != output_tensors + output_operations:
+    raise ValueError(
+        "TPU functions must return zero-or more Tensor values followed by "
+        "zero or more Operations.")
+
+  # Wraps outputs in Identity ops. Otherwise a replicated input copied
+  # straight to an output would bypass the replicate(). This would be bad
+  # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+  # be rewritten away, leading to a runtime error.
+  # TODO(phawkins): extend the rewrite to elide these nodes instead.
+  new_output_tensors = []
+  for t in output_tensors:
+    with ops.device(t.device if t.device else core(0)):
+      o = array_ops.identity(t)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      new_output_tensors.append(o)
+  return new_output_tensors, output_operations
+
+
+def _postprocess_non_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors extracted from outputs and an empty list because Operations are not
+    allowed in non-flat outputs..
+  """
+
+  # Flatten output items.
+  flat_outputs = nest.flatten(outputs)
+
+  # Convert all non-Operation outputs to Tensors.
+  for i, o in enumerate(flat_outputs):
+    if isinstance(o, ops.Operation):
+      raise ValueError(
+          "tpu.rewrite does not support Operation as return value in non-flat "
+          "output structure. You can set returned Operations as control "
+          "dependencies of returned Tensors so Operations are triggered when "
+          'Tensors are evaluated. Operation found: "%s"' % o.name)
+
+    try:
+      o = ops.convert_to_tensor(o)
+    except Exception as e:
+      raise ValueError(
+          "TPU function return values must all either be Operations or "
+          'convertible to Tensors. Got error: "%s"' % str(e))
+
+    # Wraps outputs in Identity ops. Otherwise a replicated input copied
+    # straight to an output would bypass the replicate(). This would be bad
+    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+    # be rewritten away, leading to a runtime error.
+    # TODO(phawkins): extend the rewrite to elide these nodes instead.
+    with ops.device(core(0)):
+      o = array_ops.identity(o)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      flat_outputs[i] = array_ops.identity(o)
+
+  # All flat_outputs are Tensors, and no Operations.
+  return flat_outputs, []
 
 
 def split_compile_and_shard(computation,
@@ -809,9 +1063,6 @@ def split_compile_and_shard(computation,
     return x + 3
   ... = shard(computation, ...)
 
-  TODO(phawkins): consider adding support for broadcasting Tensors passed
-  as inputs.
-
   If `outputs_from_all_shards` is true, the outputs from all shards of
   `computation` are concatenated back together along their `output_shards_axes`.
   Otherwise, each output is taken from an arbitrary shard.
@@ -853,6 +1104,8 @@ def split_compile_and_shard(computation,
     ValueError: If len(input_shard_axes) != len(inputs)
     ValueError: If len(output_shard_axes) != len(outputs from `computation`)
   """
+  # TODO(phawkins): consider adding support for broadcasting Tensors passed as
+  # inputs.
 
   if num_shards <= 0:
     raise ValueError("num_shards must be a positive integer.")
@@ -1092,6 +1345,11 @@ def rewrite(computation,
       All `Operation`s constructed during `computation` will be executed when
       evaluating any of the returned output tensors, not just the ones returned.
     inputs: A list of input tensors or `None` (equivalent to an empty list).
+      Each input can be a nested structure containing values that are
+      convertible to tensors. Note that passing an N-dimension list of
+      compatible values will result in a N-dimention list of scalar tensors
+      rather than a single Rank-N tensors. If you need different behavior,
+      convert part of inputs to tensors with `tf.convert_to_tensor`.
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
     device_assignment: if not `None`, a `DeviceAssignment` describing the
@@ -1100,11 +1358,15 @@ def rewrite(computation,
       case the core attached to task 0, TPU device 0 is used.
     name: (Deprecated) Does nothing.
   Returns:
-    A list of output tensors.
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
   """
-  if inputs is not None and not isinstance(inputs, (list, tuple)):
-    raise TypeError("tpu.rewrite() inputs must be a list or tuple")
-
   # TODO(b/36647078) remove disable when pylint bug is fixed.
   # pylint: disable=indexing-exception
   return replicate(
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 672462447944b777375331d49727c4d5366cf295..ed1e0f0401a96c34e6ff9323685857b64e10bd14 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from contextlib import contextmanager
 import copy
 
+from tensorflow.contrib.tpu.python.tpu import _tpu_estimator_embedding
 from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
 from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
@@ -192,8 +193,14 @@ class _InternalTPUContext(object):
   ```
   """
 
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu, eval_on_tpu=True):
+  def __init__(self,
+               config,
+               train_batch_size,
+               eval_batch_size,
+               predict_batch_size,
+               use_tpu,
+               eval_on_tpu=True,
+               embedding_config_spec=None):
     self._config = config
     self._train_batch_size = train_batch_size
     self._eval_batch_size = eval_batch_size
@@ -208,7 +215,7 @@ class _InternalTPUContext(object):
         use_tpu and config.tpu_config.num_cores_per_replica)
     self._mode = None
     num_cores_per_replica = config.tpu_config.num_cores_per_replica
-    if num_cores_per_replica:
+    if self._model_parallelism_enabled:
       self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
           num_cores_per_replica]
     else:
@@ -216,6 +223,8 @@ class _InternalTPUContext(object):
     self._lazy_tpu_system_metadata_dict = {}  # key by master address
     self._lazy_device_assignment_dict = {}  # key by master address
     self._lazy_validation_dict = {}  # key by ModeKeys
+    self._embedding_config_spec = embedding_config_spec
+    self._lazy_embedding_config_dict = {}  # key by master address
 
   def _assert_mode(self):
     if self._mode is None:
@@ -293,6 +302,30 @@ class _InternalTPUContext(object):
     self._lazy_device_assignment_dict[master] = device_assignment
     return device_assignment
 
+  @property
+  def embedding_config(self):
+    """Returns the embedding config based on current mode."""
+    master = self._get_master_address()
+    if master in self._lazy_embedding_config_dict:
+      embedding_config = self._lazy_embedding_config_dict[master]
+    else:
+      embedding_config = None
+      if self._use_tpu and self._embedding_config_spec:
+        embedding_config = _tpu_estimator_embedding.EmbeddingConfig(
+            self._embedding_config_spec, self._train_batch_size,
+            self._eval_batch_size, self.num_hosts, self.num_cores, master)
+        if not embedding_config.has_embedding_tables():
+          embedding_config = None
+      self._lazy_embedding_config_dict[master] = embedding_config
+
+    if embedding_config is not None:
+      mode = self._assert_mode()
+      # Dynamically attach tpu_embedding based on mode. With
+      # this, we could keep embedding_config immutable but call site always
+      # accesses the unified API '.tpu_embedding'.
+      embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode)
+    return embedding_config
+
   @property
   def model_parallelism_enabled(self):
     return self._model_parallelism_enabled
@@ -710,11 +743,15 @@ class _OneCoreTPUContext(_InternalTPUContext):
 
 
 def _get_tpu_context(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu, eval_on_tpu):
+                     predict_batch_size, use_tpu, eval_on_tpu,
+                     embedding_config_spec):
   """Returns an instance of `_InternalTPUContext`."""
 
   if (config.tpu_config.num_shards == 1 and
       config.tpu_config.num_cores_per_replica is None):
+    if embedding_config_spec is not None:
+      raise ValueError('Setting TPUConfig.num_shards==1 is unsupported '
+                       'when embedding_config_spec is not None.')
     logging.warning(
         'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
         'Please fix as soon as possible (leaving num_shards as None.)')
@@ -722,4 +759,5 @@ def _get_tpu_context(config, train_batch_size, eval_batch_size,
                               predict_batch_size, use_tpu)
 
   return _InternalTPUContext(config, train_batch_size, eval_batch_size,
-                             predict_batch_size, use_tpu, eval_on_tpu)
+                             predict_batch_size, use_tpu, eval_on_tpu,
+                             embedding_config_spec)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
index ccba8a46c7cad0337119672e02314684f4451479..04e7397162624dfc1f6203dd267c1c1b90163dd4 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.tpu.ops import gen_tpu_ops
 from tensorflow.contrib.tpu.proto import tpu_embedding_configuration_pb2 as elc
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -43,19 +44,6 @@ from tensorflow.python.ops import variables
 TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
 INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
 
-# TODO(shizhiw): A better interface is to make `num_hosts` and
-# `num_cores_per_host` optional parameters for `TPUEmbedding`
-# constructor. Usually they can be automatically detected, but
-# user can also specify them for debugging (b/112112496).
-# Auto-detection can be done with `tpu_system_metadata.py`.
-_MASTER_JOB = 'tpu_worker'
-_HOST_PATTERN = '/job:tpu_worker/task:{}/device:CPU:0'
-_NUM_CORES_PER_HOST = 8
-
-_TEST_MASTER_JOB = None
-_TEST_HOST = '/replica:0/task:0/device:CPU:0'
-_TEST_NUM_CORES_PER_HOST = 2
-
 
 class TableConfig(
     collections.namedtuple(
@@ -301,10 +289,9 @@ class TPUEmbedding(object):
                table_to_config_dict,
                feature_to_table_dict,
                batch_size,
-               num_hosts,
                mode,
-               optimization_parameters=None,
-               tpu_embedding_test=False):
+               master,
+               optimization_parameters=None):
     """API for using TPU for embedding lookups.
 
     Args:
@@ -315,12 +302,11 @@ class TPUEmbedding(object):
         to string of table name. Feature refers to ids to lookup in embedding
         table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
       batch_size: An `int` representing the global batch size.
-      num_hosts: An `int` representing the number of TPU hosts.
       mode: `TRAINING` or `INFERENCE`.
+      master: A `string` representing the TensorFlow master to use.
       optimization_parameters: `AdagradParameters`, `AdamParameters`,
         `Stochasticgradientdescentparameters`. Must be set in training and must
         be `None` in inference.
-      tpu_embedding_test: A `bool`. Only used for testing.
 
     Raises:
       ValueError: if any input is invalid.
@@ -337,15 +323,17 @@ class TPUEmbedding(object):
 
     self._batch_size = batch_size
 
-    if tpu_embedding_test:
-      self._num_hosts = 1
-      self._hosts = [_TEST_HOST]
-      self._num_cores_per_host = _TEST_NUM_CORES_PER_HOST
-    else:
-      self._num_hosts = num_hosts
-      self._hosts = [_HOST_PATTERN.format(i) for i in range(self._num_hosts)]
-      self._num_cores_per_host = _NUM_CORES_PER_HOST
-    self._num_cores = self._num_cores_per_host * self._num_hosts
+    self._master = master
+    self._tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
+    if self._tpu_system_metadata.num_cores == 0:
+      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
+                       'TPUs.'.format(self._master))
+    self._num_hosts = self._tpu_system_metadata.num_hosts
+    self._hosts = [device.name for device in self._tpu_system_metadata.devices
+                   if 'device:CPU:' in device.name]
+    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
+    self._num_cores = self._tpu_system_metadata.num_cores
 
     _validate_batch_size(self._batch_size, self._num_cores)
     self._batch_size_per_core = self._batch_size // self._num_cores
@@ -389,7 +377,7 @@ class TPUEmbedding(object):
     Returns:
       A list of device names for CPU hosts.
     """
-    return self._hosts
+    return copy.copy(self._hosts)
 
   # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
   # to be consistent with `tpu_embedding_configuration.proto`.
@@ -452,6 +440,10 @@ class TPUEmbedding(object):
   def table_to_table_variables_dict(self):
     return copy.copy(self._table_to_table_variables_dict)
 
+  @property
+  def feature_to_table_dict(self):
+    return copy.copy(self._feature_to_table_dict)
+
   def get_slot_names(self):
     """Return a list of the names of slots created by `TPUEmbedding`."""
     return self._optimizer_handler.get_slot_names()
@@ -1077,34 +1069,3 @@ def _create_partitioned_variables(name,
       initializer=initializer,
       collections=collections,
       trainable=False))
-
-
-@ops.RegisterGradient('TPUEmbeddingActivations')
-def _embedding_activations_grad(activations_op, grad_wrt_activations):
-  """Saves the gradient of embedding activations ops in a graph collection."""
-  g = ops.get_default_graph()
-  table_id = activations_op.get_attr('table_id')
-  lookup_id = activations_op.get_attr('lookup_id')
-  table_gradients = g.get_collection_ref(
-      'tpu_embedding_gradients_table_%d' % table_id)
-
-  if not table_gradients:
-    raise RuntimeError(
-        'Gradients for TPUEmbedding have been generated in non-training mode. '
-        'This is not expected. Consider putting your Optimizer.minimize code '
-        'behind the training mode condition check. For Estimator, you can '
-        'do \n\n'
-        '    if mode == tf.estimator.ModeKeys.TRAIN:\n'
-        '        train_op = opt.minimize(loss)\n'
-        '\n')
-
-  table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
-  return [
-      # RegisterGradient requires that value be returned for all inputs. Since
-      # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
-      # we will return zeros(shape=[1]). The actual gradient w.r.t. the
-      # embedding activations (grad_wrt_activations) has the same shape as the
-      # activations returned by  embedding_activations.
-      array_ops.zeros(arg.shape, dtype=dtypes.float32)
-      for arg in activations_op.inputs
-  ]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 87a970f0523363426b0da5b12838b797d7f8bebb..0620598ea00316d112245fa17bf5e56b1a015af4 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -31,17 +31,22 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.tpu.ops import gen_tpu_ordinal_selector_op
 from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
-from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import _tpu_estimator_embedding
 from tensorflow.contrib.tpu.python.tpu import error_handling
+from tensorflow.contrib.tpu.python.tpu import functional as tpu_functional
 from tensorflow.contrib.tpu.python.tpu import session_support
+from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.contrib.tpu.python.tpu._tpu_estimator_embedding import AdamParameters  # pylint: disable=unused-import
+from tensorflow.contrib.tpu.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec  # pylint: disable=unused-import
 from tensorflow.contrib.training.python.training import hparam
 from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
@@ -55,6 +60,7 @@ from tensorflow.python.estimator.export import export_output as export_output_li
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -90,6 +96,7 @@ _ONE_GIGABYTE = 1024 * 1024 * 1024
 _TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
 _TPU_TRAIN_OP = '_tpu_train_op'
 _REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
+_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
 
 # Ideally _USE_TPU_KEY should be reserved as well. However there are already
 # models that make use of this key, thus it can not be reserved now to prevent
@@ -120,6 +127,16 @@ def _is_iterable(obj):
     return False
 
 
+class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
+
+  def AddOp(self, op):
+    if op.type in [
+        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
+        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
+    ]:
+      raise ValueError('Use tf.contrib.summary inside of host_calls.')
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -427,13 +444,20 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
                run_infeed_loop_on_coordinator=True,
                rendezvous=None,
                master=None,
-               session_config=None):
+               session_config=None,
+               tpu_init_ops=None):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
     self._rendezvous = rendezvous
     self._master = master
     self._session_config = session_config
+    self._init_ops = list(tpu_init_ops or [])
+    if ctx.embedding_config is None:
+      self._embedding_layer_config = None
+    else:
+      self._embedding_layer_config = (
+          ctx.embedding_config.tpu_embedding.config_proto)
     self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
     self._initial_infeed_sleep_secs = (
         ctx.config.tpu_config.initial_infeed_sleep_secs)
@@ -446,7 +470,6 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
     self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_ops = []
     if self._should_initialize_tpu:
       self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
     else:
@@ -506,7 +529,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
       with ops.Graph().as_default():
         with tf_session.Session(
             self._master, config=self._session_config) as sess:
-          sess.run(tpu.initialize_system(job=self._master_job))
+          sess.run(
+              tpu.initialize_system(
+                  job=self._master_job,
+                  embedding_config=self._embedding_layer_config))
       logging.info('Initialized TPU in %d seconds', time.time() - start)
 
     session.run(self._init_ops,
@@ -848,6 +874,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
     """Generates the per_host enqueue ops."""
     control_deps = []
     per_host_sharded_inputs = []
+    sparse_features_list = []
     num_replicas_per_host = ctx.num_of_replicas_per_host
     cached_signals = None
     with ops.device(device):
@@ -866,6 +893,10 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
           else:
             cached_signals = signals
 
+        features, labels, sparse_features = (
+            _tpu_estimator_embedding.split_inputs(ctx, features, labels))
+        sparse_features_list.append(sparse_features)
+
         inputs_structure_recorder.validate_and_record_structure(
             features, labels)
         flattened_inputs = (
@@ -894,6 +925,11 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
             tpu_ordinal_function=tpu_ordinal_function_impl)
       captured_infeed_queue.capture(infeed_queue)
 
+    if ctx.embedding_config:
+      per_host_enqueue_ops.extend(
+          ctx.embedding_config.tpu_embedding.generate_enqueue_ops(
+              sparse_features_list))
+
     if signals is None:
       return per_host_enqueue_ops
     else:
@@ -1303,6 +1339,44 @@ class _InputPipeline(object):
         logging.warn(err_msg)
 
 
+def call_computation(computation,
+                     experimental_exported_model_uses_all_cores=True):
+  """Call computation.
+
+  computation uses a single-core for TPU inference. If
+  `experimental_exported_model_uses_all_cores` is `True`, this function will
+  round-robin
+  computation among all TPU cores visible to the host; otherwise, it will use
+  a single core.
+
+  Args:
+    computation: A Python function that takes no inputs and builds computation
+      graph. If `computation` returns m outputs, this function will return a
+      list of m Tensors.
+    experimental_exported_model_uses_all_cores: Whether to round-robin among all
+      cores visible to the host, or to use a single core.
+
+  Returns:
+    A list of output tensors.
+  """
+  if experimental_exported_model_uses_all_cores:
+    # Using `TPUPartitionedCall` makes it possible to target a different
+    # TPU core with every `Session.run()` call. Note that the entire inference
+    # graph executes on a single core, and that invocations of this graph
+    # will round-robin among the cores attached to a host.
+    @function.Defun()
+    def tpu_subgraph():
+      return computation()
+
+    return tpu_functional.TPUPartitionedCall(
+        args=tpu_subgraph.captured_inputs,
+        device_ordinal=gen_tpu_ordinal_selector_op.tpu_ordinal_selector(),
+        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
+        f=tpu_subgraph)
+  else:
+    return computation()
+
+
 class _ModelFnWrapper(object):
   """A `model_fn` wrapper.
 
@@ -1322,6 +1396,12 @@ class _ModelFnWrapper(object):
   def call_without_tpu(self, features, labels, is_export_mode):
     return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
 
+  def _add_embedding_features(self, features):
+    if self._ctx.embedding_config:
+      tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+      embedding_activations = tpu_embedding_.get_activations()
+      features.update(embedding_activations)
+
   def convert_to_single_tpu_train_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single train step on TPU.
 
@@ -1354,6 +1434,7 @@ class _ModelFnWrapper(object):
       del loss  # unused; required in function signature.
       inputs = dequeue_fn()
       features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features)
 
       estimator_spec = self._verify_estimator_spec(
           self._call_model_fn(features, labels))
@@ -1370,11 +1451,19 @@ class _ModelFnWrapper(object):
       if tensor_tracer.TensorTracer.is_enabled():
         tt = tensor_tracer.TensorTracer()
         loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
-                                         self._ctx.num_replicas)
+                                         self._ctx.num_replicas,
+                                         fetches=[loss, train_op])
+
+      if self._ctx.embedding_config is None:
+        apply_sparse_grads = []
+      else:
+        tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+        apply_sparse_grads = [tpu_embedding_.generate_send_gradients_op()]
 
       # We must run train_op to update the variables prior to running the
       # outfeed.
-      with ops.control_dependencies([train_op]+tracing_ops):
+      with ops.control_dependencies([train_op] + tracing_ops +
+                                    apply_sparse_grads):
         host_call_outfeed_ops = []
         if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
             and estimator_spec.host_call is not None):
@@ -1420,6 +1509,7 @@ class _ModelFnWrapper(object):
       """Evaluation step function for use inside a while loop."""
       inputs = dequeue_fn()
       features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features)
 
       tpu_estimator_spec = self._call_model_fn(features, labels)
       if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
@@ -1759,6 +1849,10 @@ class _OutfeedHostCall(object):
           dequeue_ops[j].append(item)
 
     # Deconstruct dequeue ops.
+    flat_dequeue_ops = []
+    for l in dequeue_ops:
+      flat_dequeue_ops.extend(l)
+
     dequeue_ops_by_name = {}
     pos = 0
     for name in self._names:
@@ -1766,6 +1860,14 @@ class _OutfeedHostCall(object):
                                               len(self._tensors[name])]
       pos += len(self._tensors[name])
 
+    def _call_host_fn(fn, *args, **kw):
+      context = CatchInvalidHostcallFunctions()
+      context.Enter()
+      result = fn(*args, **kw)
+      context.Exit()
+      context.ExitResult(result)
+      return result
+
     # It is assumed evaluation always happens on single host TPU system. So,
     # place all ops on tpu host if possible.
     #
@@ -1799,7 +1901,7 @@ class _OutfeedHostCall(object):
           # The user-provided eval_metrics[1] is a dict.
           dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
           try:
-            ret[name] = self._host_fns[name](**dequeue_ops)
+            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
           except TypeError as e:
             logging.warning(
                 'Exception while calling %s: %s. It is likely the tensors '
@@ -1807,8 +1909,10 @@ class _OutfeedHostCall(object):
                 'function\'s arguments', name, e, name)
             raise
         else:
-          ret[name] = self._host_fns[name](*dequeue_ops)
+          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
 
+    # force all dequeue operations to be run if not consumed by the host calls
+    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
     return ret
 
 
@@ -2100,7 +2204,11 @@ class TPUEstimator(estimator_lib.Estimator):
                batch_axis=None,
                eval_on_tpu=True,
                export_to_tpu=True,
-               warm_start_from=None):
+               export_to_cpu=True,
+               warm_start_from=None,
+               experimental_exported_model_uses_all_cores=False,
+               experimental_export_device_assignment=False,
+               experimental_embedding_config_spec=None):
     """Constructs an `TPUEstimator` instance.
 
     Args:
@@ -2143,12 +2251,29 @@ class TPUEstimator(estimator_lib.Estimator):
       eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
         model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
       export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on TPU besides the one on CPU.
+        serving on TPU. Note that unsupported export modes such as EVAL will be
+        ignored. For those modes, only a CPU model will be exported.
+        Currently, export_to_tpu only supports PREDICT.
+      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on CPU.
       warm_start_from: Optional string filepath to a checkpoint or SavedModel to
         warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
         configure warm-starting.  If the string filepath is provided instead of
         a `WarmStartSettings`, then all variables are warm-started, and it is
         assumed that vocabularies and Tensor names are unchanged.
+      experimental_exported_model_uses_all_cores: Whether to round-robin among
+        all cores visible to the host which is serving the saved model, or to
+        use a single core. This is a temporary flag to enable using all TPU
+        cores for inference with TPUPartitionedCall(). Once outside compilation
+        is supported in TPUPartitionedCall(), this flag will be enabled by
+        default.
+      experimental_export_device_assignment: Whether to include the device
+        assignment in the exported model. Doing so is useful in case of model
+        parallel inference but will tie the exported model to the TPU topology
+        used to export the model.
+      experimental_embedding_config_spec: Optional EmbeddingConfigSpec instance
+        to support using TPU embedding. IT IS STILL WORK IN PROGRESS, SO PLEASE
+        DO NOT USE.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -2210,9 +2335,19 @@ class TPUEstimator(estimator_lib.Estimator):
     # pylint: disable=protected-access
     self._ctx = tpu_context._get_tpu_context(
         self._config, train_batch_size, eval_batch_size, predict_batch_size,
-        use_tpu, eval_on_tpu)
+        use_tpu, eval_on_tpu, experimental_embedding_config_spec)
 
+    self._export_to_cpu = export_to_cpu
     self._export_to_tpu = export_to_tpu
+    self._experimental_exported_model_uses_all_cores = (
+        experimental_exported_model_uses_all_cores)
+    self._experimental_export_device_assignment = (
+        experimental_export_device_assignment)
+    if (experimental_exported_model_uses_all_cores and
+        experimental_export_device_assignment):
+      raise ValueError('experimental_exported_model_uses_all_cores and '
+                       'experimental_export_device_assignment is not supported '
+                       'at the same time.')
 
     self._is_input_fn_invoked = None
     self._rendezvous = {}
@@ -2226,35 +2361,43 @@ class TPUEstimator(estimator_lib.Estimator):
                                export_tags=None,
                                check_variables=True):
     if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
-      raise NotImplementedError(
-          'TPUEstimator only handles mode PREDICT for exporting '
-          'when `export_to_tpu` is `True`; '
-          'got {}.'.format(mode))
-
-    (super(TPUEstimator, self)._add_meta_graph_for_mode(
-        builder,
-        input_receiver_fn_map,
-        checkpoint_path,
-        save_variables,
-        mode=mode,
-        export_tags=export_tags,
-        check_variables=check_variables))
+      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
+                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
+                      'for TPU.'.format(mode))
+
+    if not self._export_to_cpu and not self._export_to_tpu:
+      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
+
+    if self._export_to_cpu:
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
 
-    if self._export_to_tpu:
+    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
       input_receiver_fn_map = {
           _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
       }
       export_tags = [tag_constants.SERVING, tag_constants.TPU]
       mode = _REWRITE_FOR_INFERENCE_MODE
+
       # See b/110052256 for why `check_variables` is `False`.
+      if not self._export_to_cpu:
+        check_variables = save_variables = True
+      else:
+        check_variables = save_variables = False
       (super(TPUEstimator, self)._add_meta_graph_for_mode(
           builder,
           input_receiver_fn_map,
           checkpoint_path,
-          save_variables=False,
+          save_variables=save_variables,
           mode=mode,
           export_tags=export_tags,
-          check_variables=False))
+          check_variables=check_variables))
 
   def _call_model_fn(self, features, labels, mode, config):
     if mode == _REWRITE_FOR_INFERENCE_MODE:
@@ -2269,6 +2412,88 @@ class TPUEstimator(estimator_lib.Estimator):
       raise ValueError('mode must be {}; '
                        'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
 
+    computation, capture = self._build_computation_for_inference(
+        features, labels, mode, config)
+    tensors = call_computation(
+        computation,
+        experimental_exported_model_uses_all_cores=self
+        ._experimental_exported_model_uses_all_cores)
+    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
+        capture.get())
+    predictions_list = tensors[:len(predictions_dict)]
+    export_outputs_list_without_none = tensors[len(predictions_dict):]
+
+    # Reinsert `None`s which we've taken out in
+    # `_build_computation_for_inference()`.
+    export_outputs_list = []
+    while none_indices or export_outputs_list_without_none:
+      if none_indices and none_indices[0] == len(export_outputs_list):
+        export_outputs_list.append(None)
+        none_indices.pop(0)
+      else:
+        export_outputs_list.append(export_outputs_list_without_none.pop(0))
+
+    # Reconstruct `export_outputs` with updated tensors.
+    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
+                                                    export_outputs_list)
+    export_outputs = estimator_spec.export_outputs
+    new_export_outputs = collections.OrderedDict(
+        (k, _clone_export_output_with_tensors(export_outputs[k], v))
+        for k, v in six.iteritems(new_export_outputs_dict))
+    # Reconstruct `predictions` with updated tensors.
+    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
+    if (len(new_predictions) == 1 and
+        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
+      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
+
+    return estimator_spec._replace(
+        export_outputs=new_export_outputs, predictions=new_predictions)
+
+  def _build_computation_for_inference(self, features, labels, mode, config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Computation to be passed to `TPUPartitionedCall()`."""
+      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
+          features, labels, mode, config)
+
+      if self._experimental_export_device_assignment:
+        # Export the device assignment as part of the model. This is useful for
+        # model parallel usecases where the model relies on the mapping between
+        # logical and physical devices.
+        with self._ctx.with_mode(mode) as ctx:
+          device_assignment = ctx.device_assignment
+      else:
+        device_assignment = None
+      tensors_on_cpu = tpu.rewrite_for_inference(
+          tpu_computation, device_assignment=device_assignment)
+      (estimator_spec, export_outputs_dict, export_outputs_list,
+       predictions_dict) = (
+           tpu_capture.get())
+      predictions_list = tensors_on_cpu[:len(predictions_dict)]
+      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
+
+      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
+      # with their CPU counterpart returned from `rewrite_for_inference()`.
+      # `function.Defun()` does not like `None`s in return values, so we leave
+      # `None`s out but record their positions for later reconstruction.
+      export_outputs_list_without_none = []
+      none_indices = []
+      for i, t in enumerate(export_outputs_list):
+        if t is None:
+          none_indices.append(i)
+        else:
+          export_outputs_list_without_none.append(
+              export_outputs_tpu_on_cpu_list.pop(0))
+
+      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
+                       none_indices))
+      return predictions_list + export_outputs_list_without_none
+
+    return computation, capture
+
+  def _build_tpu_computation_for_inference(self, features, labels, mode,
+                                           config):
     capture = _CapturedObject()
 
     def computation():
@@ -2289,38 +2514,30 @@ class TPUEstimator(estimator_lib.Estimator):
 
       # We pick the TPU tensors out from `export_output` and later return them
       # from `computation` for rewriting.
-      tensors_dict = collections.OrderedDict(
+      export_outputs_dict = collections.OrderedDict(
           (k, _export_output_to_tensors(v))
           for k, v in six.iteritems(estimator_spec.export_outputs))
-      tensors = nest.flatten(tensors_dict)
-      tpu_tensors = [t for t in tensors if t is not None]
-
-      # We cannot return anything other than `tpu_tensors` here so we capture
-      # the rest for later use.
-      capture.capture((estimator_spec, tensors_dict, tensors))
-      return tpu_tensors
-
-    tpu_tensors_on_cpu = tpu.rewrite_for_inference(computation)
-    estimator_spec, tensors_dict, tensors = capture.get()
-
-    # Reconstruct `tensors`, but with `tpu_tensors` replaced with
-    # `tpu_tensors_on_cpu`.
-    new_tensors = []
-    for t in tensors:
-      if t is None:
-        new_tensors.append(None)
+      export_outputs_list = nest.flatten(export_outputs_dict)
+      export_outputs_tpu_list = [
+          t for t in export_outputs_list if t is not None
+      ]
+
+      if isinstance(estimator_spec.predictions, dict):
+        predictions_dict = collections.OrderedDict(
+            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
       else:
-        new_tensors.append(tpu_tensors_on_cpu.pop(0))
+        predictions_dict = {
+            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
+        }
+      predictions_list = nest.flatten(predictions_dict)
 
-    # Reconstruct `tensors_dict`.
-    new_tensors_dict = nest.pack_sequence_as(tensors_dict, new_tensors)
-    # Reconstruct `export_outputs`.
-    export_outputs = estimator_spec.export_outputs
-    new_export_outputs = collections.OrderedDict(
-        (k, _clone_export_output_with_tensors(export_outputs[k], v))
-        for k, v in six.iteritems(new_tensors_dict))
+      # We cannot return everything we want through the return values, so
+      # capture the rest here for later use.
+      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
+                       predictions_dict))
+      return predictions_list + export_outputs_tpu_list
 
-    return estimator_spec._replace(export_outputs=new_export_outputs)
+    return computation, capture
 
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
@@ -2538,7 +2755,11 @@ class TPUEstimator(estimator_lib.Estimator):
         if self._log_every_n_steps is not None:
           examples_hook = ExamplesPerSecondHook(
               ctx.global_batch_size,
-              output_dir=self.model_dir,
+              # pylint:disable=g-long-ternary
+              output_dir=(self.model_dir
+                          if not config or config.save_summary_steps
+                          else None),
+              # pylint:enable=g-long-ternary
               every_n_steps=self._log_every_n_steps)
 
         if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
@@ -2555,6 +2776,10 @@ class TPUEstimator(estimator_lib.Estimator):
         assert callable(features), '`input_fn` is not callable.'
         input_fn = features
 
+        tpu_init_ops = []
+        if ctx.embedding_config:
+          tpu_init_ops.extend(ctx.embedding_config.tpu_embedding.init_ops)
+
         input_holders = _InputPipeline(input_fn, batch_axis, ctx)
         enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
@@ -2608,7 +2833,7 @@ class TPUEstimator(estimator_lib.Estimator):
                   rendezvous=self._rendezvous[mode],
                   master=self._config.master,
                   session_config=self._session_config,
-              ),
+                  tpu_init_ops=tpu_init_ops),
               InstallSignalHandlerHook()
           ])
           if self._log_every_n_steps is not None:
@@ -2645,6 +2870,10 @@ class TPUEstimator(estimator_lib.Estimator):
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops(ctx)
 
+          if ctx.embedding_config:
+            update_ops.extend(
+                ctx.embedding_config.tpu_embedding.retrieve_parameters_ops)
+
           # Validate the TPU training graph to catch basic errors
           _validate_tpu_training_graph()
 
@@ -2714,7 +2943,8 @@ class TPUEstimator(estimator_lib.Estimator):
                   rendezvous=self._rendezvous[mode],
                   master=self._config.evaluation_master,
                   session_config=self._session_config,
-              )] + input_hooks
+                  tpu_init_ops=tpu_init_ops)
+          ] + input_hooks
 
           if eval_hooks:
             hooks.extend(eval_hooks)
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
index 76cb5531cd0bc3a375d1434c31fa14a9d7f42476..d98e0b7a5ed52c00a8cf2b1a1bbc53f1b1cd28c7 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
+++ b/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
@@ -134,12 +134,16 @@ Status GetGradientAccumulationSupport(OptimizationAlgorithm alg,
   }
 }
 namespace {
-// Make a normal state variable specification.
+// Make a normal state variable specification. Please refer to
+// //third_party/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+// (StateVariableSpecification message) for instructions on how to set the
+// padding_initial_value field.
 StateVariableSpecification MakeStandardStateVariableSpecification(
-    const string& name) {
+    const string& name, double padding_initial_value) {
   StateVariableSpecification result;
   result.set_name(name);
-  result.mutable_user_defined();
+  result.mutable_user_defined()->set_padding_initial_value(
+      padding_initial_value);
   return result;
 }
 }  // namespace
@@ -149,14 +153,14 @@ Status GetOptimizationAlgorithmStateVariables(
     std::vector<StateVariableSpecification>* state_variables) {
   // The first parameter set is always the weights themselves.
   state_variables->push_back(
-      MakeStandardStateVariableSpecification("parameters"));
+      MakeStandardStateVariableSpecification("parameters", 0.0));
   // The order of the returned parameters needs to match the offsets used by
   // the algorithm implementations in test_util.cc and
   // address_handler_program_creator.cc.
   switch (alg) {
     case OptimizationAlgorithm::kAdagrad: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       break;
     }
     case OptimizationAlgorithm::kStochasticGradientDescent: {
@@ -165,53 +169,58 @@ Status GetOptimizationAlgorithmStateVariables(
     }
     case OptimizationAlgorithm::kFtrl: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("linears"));
+          MakeStandardStateVariableSpecification("linears", 0.0));
       break;
     }
     case OptimizationAlgorithm::kAdam: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("momenta"));
+          MakeStandardStateVariableSpecification("momenta", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("velocities"));
+          MakeStandardStateVariableSpecification("velocities", 0.0));
       break;
     }
     case OptimizationAlgorithm::kMomentum: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("momenta"));
+          MakeStandardStateVariableSpecification("momenta", 0.0));
       break;
     }
     case OptimizationAlgorithm::kRmsProp: {
-      state_variables->push_back(MakeStandardStateVariableSpecification("ms"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mom"));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("ms", 1.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mom", 0.0));
       break;
     }
     case OptimizationAlgorithm::kCenteredRmsProp: {
-      state_variables->push_back(MakeStandardStateVariableSpecification("ms"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mom"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mg"));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("ms", 1.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mom", 0.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mg", 0.0));
       break;
     }
     case OptimizationAlgorithm::kMdlAdagradLight: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("weights"));
+          MakeStandardStateVariableSpecification("weights", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("benefits"));
+          MakeStandardStateVariableSpecification("benefits", 0.0));
       break;
     }
     case OptimizationAlgorithm::kAdadelta: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("updates"));
+          MakeStandardStateVariableSpecification("updates", 0.0));
       break;
     }
     case OptimizationAlgorithm::kProximalAdagrad: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       break;
     }
     case OptimizationAlgorithm::PARAMETERS_NOT_SET: {
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index bcc177601b95172b05d327247bd370c2f8b65d59..27f0d9b2e38c433d4fb4573285ecb8c9946112e8 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -499,6 +499,7 @@ class HParams(object):
       value: New value of the hyperparameter.
 
     Raises:
+      KeyError: If the hyperparameter doesn't exist.
       ValueError: If there is a type mismatch.
     """
     param_type, is_list = self._hparam_types[name]
@@ -517,6 +518,8 @@ class HParams(object):
   def del_hparam(self, name):
     """Removes the hyperparameter with key 'name'.
 
+    Does nothing if it isn't present.
+
     Args:
       name: Name of the hyperparameter.
     """
@@ -525,19 +528,20 @@ class HParams(object):
       del self._hparam_types[name]
 
   def parse(self, values):
-    """Override hyperparameter values, parsing new values from a string.
+    """Override existing hyperparameter values, parsing new values from a string.
 
     See parse_values for more detail on the allowed format for values.
 
     Args:
-      values: String.  Comma separated list of `name=value` pairs where
-        'value' must follow the syntax described above.
+      values: String.  Comma separated list of `name=value` pairs where 'value'
+        must follow the syntax described above.
 
     Returns:
       The `HParams` instance.
 
     Raises:
-      ValueError: If `values` cannot be parsed.
+      ValueError: If `values` cannot be parsed or a hyperparameter in `values`
+      doesn't exist.
     """
     type_map = dict()
     for name, t in self._hparam_types.items():
@@ -548,7 +552,7 @@ class HParams(object):
     return self.override_from_dict(values_map)
 
   def override_from_dict(self, values_dict):
-    """Override hyperparameter values, parsing new values from a dictionary.
+    """Override existing hyperparameter values, parsing new values from a dictionary.
 
     Args:
       values_dict: Dictionary of name:value pairs.
@@ -557,6 +561,7 @@ class HParams(object):
       The `HParams` instance.
 
     Raises:
+      KeyError: If a hyperparameter in `values_dict` doesn't exist.
       ValueError: If `values_dict` cannot be parsed.
     """
     for name, value in values_dict.items():
@@ -596,7 +601,7 @@ class HParams(object):
         sort_keys=sort_keys)
 
   def parse_json(self, values_json):
-    """Override hyperparameter values, parsing new values from a json object.
+    """Override existing hyperparameter values, parsing new values from a json object.
 
     Args:
       values_json: String containing a json object of name:value pairs.
@@ -605,6 +610,7 @@ class HParams(object):
       The `HParams` instance.
 
     Raises:
+      KeyError: If a hyperparameter in `values_json` doesn't exist.
       ValueError: If `values_json` cannot be parsed.
     """
     values_map = json.loads(values_json)
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index c272a2ac144068cfb7355c2647eebf5bd0ce9d50..fc6e38ab4a5243cb7502f4ca42db03cbfd342a40 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -419,7 +419,7 @@ def create_train_op(total_loss,
     update_ops = set(update_ops)
   if not global_update_ops.issubset(update_ops):
     logging.warning('update_ops in create_train_op does not contain all the '
-                    ' update_ops in GraphKeys.UPDATE_OPS')
+                    'update_ops in GraphKeys.UPDATE_OPS')
 
   # Make sure update_ops are computed before total_loss.
   if update_ops:
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index d9ccda8e89a4c9a1b3f3d24915b9ad3fb4d9be5f..07dbd5ca8d65ec8232d33c016a7369c68a4c9e1f 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -16,9 +16,12 @@ cc_library(
     srcs = ["convert_graphdef_memmapped_format_lib.cc"],
     hdrs = ["convert_graphdef_memmapped_format_lib.h"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/kernels:immutable_constant_op",
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 8bf1480d33b2d2117fb5c7ddf046262cfeb8a8ab..a932974270f5dc00ba61b1f6e57ee7b00778039c 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -70,6 +70,9 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+# Export the BUILD file so automated tooling can check licenses
+exports_files(["BUILD"])
+
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
@@ -178,7 +181,6 @@ COMMON_PROTO_SRCS = [
     "framework/function.proto",
     "framework/graph.proto",
     "framework/graph_transfer_info.proto",
-    "framework/iterator.proto",
     "framework/kernel_def.proto",
     "framework/log_memory.proto",
     "framework/node_def.proto",
@@ -203,6 +205,7 @@ COMMON_PROTO_SRCS = [
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
     "protobuf/saver.proto",
+    "protobuf/verifier_config.proto",
     "util/event.proto",
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
@@ -445,7 +448,8 @@ cc_library(
 )
 
 cc_library(
-    name = "logger_interface",
+    name = "logger",
+    srcs = ["platform/logger.cc"],
     hdrs = ["platform/logger.h"],
     copts = tf_copts(),
     visibility = ["//visibility:public"],
@@ -455,23 +459,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "default_logger",
-    srcs = ["platform/default/logger.cc"],
-    hdrs = ["platform/logger.h"],
-    deps = [
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:logger_interface",
-    ],
-)
-
-cc_library(
-    name = "logger",
-    hdrs = ["platform/logger.h"],
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/core/platform/default/build_config:logger"],
-)
-
 filegroup(
     name = "platform_env_hdrs",
     srcs = [
@@ -520,6 +507,7 @@ cc_library(
         ":platform_port",
         ":platform_protobuf",
         "//tensorflow/core/platform/default/build_config:env",
+        "//tensorflow/core/platform/default/build_config:port",
     ],
 )
 
@@ -1033,6 +1021,7 @@ cc_library(
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
+        "//tensorflow/core/util/proto:proto_utils",
     ],
 )
 
@@ -1090,6 +1079,7 @@ tf_gen_op_libs(
         "tensor_forest_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
+        "clustering_ops",
         "collective_ops",
         "control_flow_ops",
         "ctc_ops",
@@ -1115,6 +1105,7 @@ tf_gen_op_libs(
         "parsing_ops",
         "random_grad",
         "random_ops",
+        "stateful_random_ops",
         "remote_fused_graph_ops",
         "rpc_ops",
         "scoped_allocator_ops",
@@ -1244,6 +1235,7 @@ cc_library(
         ":tensor_forest_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
+        ":clustering_ops_op_lib",
         ":collective_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
@@ -1269,6 +1261,7 @@ cc_library(
         ":parsing_ops_op_lib",
         ":ragged_ops",
         ":random_ops_op_lib",
+        ":stateful_random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
         ":rpc_ops_op_lib",
@@ -1387,7 +1380,7 @@ cc_library(
 
 # This includes implementations of all kernels built into TensorFlow.
 cc_library(
-    name = "all_kernels_statically_linked",
+    name = "all_kernels_impl",
     visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/core/kernels:array",
@@ -1398,12 +1391,12 @@ cc_library(
         "//tensorflow/core/kernels:tensor_forest_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
+        "//tensorflow/core/kernels:clustering_ops",
         "//tensorflow/core/kernels:collective_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
-        "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:decode_proto_op",
         "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
@@ -1414,18 +1407,20 @@ cc_library(
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
         "//tensorflow/core/kernels:linalg",
-        "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
         "//tensorflow/core/kernels:manip",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
+        "//tensorflow/core/kernels:mutex_ops",
         "//tensorflow/core/kernels:nn",
         "//tensorflow/core/kernels:parameterized_truncated_normal_op",
         "//tensorflow/core/kernels:parsing",
         "//tensorflow/core/kernels:partitioned_function_ops",
+        "//tensorflow/core/kernels:pooling_ops",
         "//tensorflow/core/kernels:ragged_ops",
         "//tensorflow/core/kernels:random_ops",
+        "//tensorflow/core/kernels:stateful_random_ops",
         "//tensorflow/core/kernels:random_poisson_op",
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
@@ -1477,8 +1472,13 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_dynamic_kernels(
         [],
-        otherwise = [":all_kernels_statically_linked"],
-    ),
+        otherwise = [":all_kernels_impl"],
+    ) + [
+        # TODO(gunan): Work on the API between these and rest of TF and make
+        # these also dynamically loading.
+        "//tensorflow/core/kernels:dataset_ops",  # Depends on grappler
+        "//tensorflow/core/kernels:list_kernels",  # Depends on variant_op_registry.h
+    ],
 )
 
 tf_cuda_library(
@@ -1763,6 +1763,7 @@ cc_library(
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
@@ -1963,6 +1964,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "rocm",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core/platform/default/build_config:rocm",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Clif-related proto libraries.
 
@@ -2022,6 +2031,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "framework/step_stats_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "framework/step_stats.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "framework/types_pyclif",
     proto_lib = ":protos_all_cc",
@@ -2199,6 +2215,7 @@ cc_library(
         ],
     }),
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/meta:type_traits",
         "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "@com_google_absl//absl/base:core_headers",
@@ -2213,7 +2230,6 @@ cc_library(
             "lib/**/*.cc",
             "platform/*.cc",
             "platform/profile_utils/**/*.cc",
-        ] + [
             "framework/resource_handle.cc",
             "util/env_var.cc",
         ],
@@ -2353,7 +2369,12 @@ cc_library(
 
 cc_library(
     name = "tflite_portable_logging",
-    srcs = [],
+    srcs = [
+    ] + if_ios([
+        "platform/default/logging.cc",
+        "platform/env_time.cc",
+        "platform/posix/env_time.cc",
+    ]),
     hdrs = [
         "lib/bfloat16/bfloat16.h",
         "platform/default/integral_types.h",
@@ -2362,7 +2383,7 @@ cc_library(
         "platform/macros.h",
         "platform/platform.h",
         "platform/types.h",
-    ] + if_windows(["platform/windows/integral_types.h"]),
+    ] + if_windows(["platform/windows/integral_types.h"]) + if_ios(["platform/env_time.h"]),
     copts = tf_copts(),
     linkopts = ["-ldl"],
     deps = [
@@ -2772,6 +2793,7 @@ cc_library(
 # in this library.
 GRAPH_HDRS = [
     "graph/algorithm.h",
+    "graph/collective_order.h",
     "graph/colors.h",
     "graph/control_flow.h",
     "graph/costmodel.h",
@@ -2798,6 +2820,7 @@ tf_cuda_library(
     name = "graph",
     srcs = [
         "graph/algorithm.cc",
+        "graph/collective_order.cc",
         "graph/colors.cc",
         "graph/control_flow.cc",
         "graph/costmodel.cc",
@@ -2815,6 +2838,9 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2829,12 +2855,16 @@ CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
     "framework/versions.h",
     "common_runtime/process_function_library_runtime.h",
     "common_runtime/function.h",
+    "common_runtime/scoped_allocator.h",
+    "common_runtime/scoped_allocator_mgr.h",
 ]
 
 tf_cuda_library(
     name = "core_cpu_base",
     srcs = [
         "common_runtime/eval_const_tensor.cc",
+        "common_runtime/scoped_allocator.cc",
+        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/shape_refiner.cc",
         "common_runtime/shape_refiner.h",
         "framework/versions.h",
@@ -2894,6 +2924,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
+    "common_runtime/partitioning_utils.h",
     "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
@@ -2901,8 +2932,6 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
     "common_runtime/ring_reducer.h",
-    "common_runtime/scoped_allocator.h",
-    "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
     "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
@@ -2950,6 +2979,7 @@ tf_cuda_library(
         "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
         "common_runtime/parallel_concat_optimizer.cc",
+        "common_runtime/partitioning_utils.cc",
         "common_runtime/placer.cc",
         "common_runtime/pool_allocator.cc",
         "common_runtime/process_function_library_runtime.cc",
@@ -2959,8 +2989,6 @@ tf_cuda_library(
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
         "common_runtime/ring_reducer.cc",
-        "common_runtime/scoped_allocator.cc",
-        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/session.cc",
         "common_runtime/session_factory.cc",
         "common_runtime/session_options.cc",
@@ -2988,8 +3016,9 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "//third_party/eigen3",
-        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:functions",
     ] + mkl_deps(),
     alwayslink = 1,
 )
@@ -3017,6 +3046,7 @@ tf_cuda_library(
         ":framework",
         ":graph",
         ":lib",
+        ":metrics",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
@@ -3504,6 +3534,29 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "platform_fake_python_env_test",
+    size = "small",
+    srcs = ["platform/fake_python_env_test.cc"],
+    args = [
+        "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py",
+    ],
+    tags = [
+        "local",
+        "no_windows",
+        "nogpu",
+        "nomac",
+        "notap",
+    ],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "platform_abi_test",
     size = "small",
@@ -3677,7 +3730,6 @@ tf_cc_tests(
     srcs = [
         "common_runtime/buf_rendezvous_test.cc",
         "common_runtime/collective_executor_mgr_test.cc",
-        "common_runtime/collective_param_resolver_local_test.cc",
         "common_runtime/collective_rma_local_test.cc",
         "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
@@ -3793,6 +3845,7 @@ tf_cc_tests(
     name = "higher_level_tests_needing_kernels",
     size = "small",
     srcs = [
+        "common_runtime/collective_param_resolver_local_test.cc",
         "graph/graph_constructor_test.cc",
     ],
     linkopts = select({
@@ -3842,6 +3895,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_tests(
+    name = "collective_order_test",
+    size = "small",
+    srcs = [
+        "graph/collective_order_test.cc",
+    ],
+    deps = [
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":test",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "ring_reducer_test",
     size = "medium",
@@ -4191,7 +4265,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "common_runtime_process_function_library_runtime_test",
     size = "small",
     srcs = ["common_runtime/process_function_library_runtime_test.cc"],
@@ -4200,6 +4274,7 @@ tf_cc_test(
         ":core_cpu",
         ":core_cpu_internal",
         ":framework",
+        ":framework_internal",
         ":lib",
         ":test",
         ":test_main",
@@ -4208,6 +4283,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:resource_variable_ops",
     ],
 )
 
@@ -4249,6 +4325,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_partitioning_utils_test",
+    size = "small",
+    srcs = ["common_runtime/partitioning_utils_test.cc"],
+    deps = [
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":lib",
+        ":ops",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:identity_op",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "common_runtime_direct_session_test",
     size = "small",
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
index 070d6adb978e4a62e7209f299dba08515aa21e83..d0794de4ba4a174838547865e4f1692cff503052 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
@@ -33,6 +33,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
index ff2d9d71db646a27a88763f79bb6beb6b5ede44b..c8af9ff976688a0db78d26a495543cc3c052944a 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -41,6 +41,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
index 2de38b4263a380b5d0aec45270b9b67347c7021d..8aaae4aab6fd006931ce9f3ef1633a2c1e7c613b 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
@@ -40,6 +40,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalChooseFastestDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalChooseFastestDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7aa7a59bb67ade421ec12a9ec45326106d57ffc0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalChooseFastestDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalChooseFastestDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalTakeWhileDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalTakeWhileDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..699e0c2e39a78265a7cd5a149193d6454d7ef78a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalTakeWhileDataset.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "ExperimentalTakeWhileDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `predicate`.
+END
+  }
+  attr {
+    name: "predicate"
+    description: <<END
+A function returning a scalar boolean.
+END
+  }
+  summary: "Creates a dataset that stops iteration when predicate` is false."
+  description: <<END
+The `predicate` function must return a scalar boolean and accept the
+following arguments:
+
+* One tensor for each component of an element of `input_dataset`.
+* One tensor for each value in `other_arguments`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_KMC2ChainInitialization.pbtxt b/tensorflow/core/api_def/base_api/api_def_KMC2ChainInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6ff4b9e2d70afcb6836921a498dba69a834baec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_KMC2ChainInitialization.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "KMC2ChainInitialization"
+  visibility: HIDDEN
+  in_arg {
+    name: "distances"
+    description: <<END
+Vector with squared distances to the closest previously sampled cluster center
+for each candidate point.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+Scalar. Seed for initializing the random number generator.
+END
+  }
+  out_arg {
+    name: "index"
+    description: <<END
+Scalar with the index of the sampled point.
+END
+  }
+  summary: "Returns the index of a data point that should be added to the seed set."
+  description: <<END
+Entries in distances are assumed to be squared distances of candidate points to
+the already sampled centers in the seed set. The op constructs one Markov chain
+of the k-MC^2 algorithm and returns the index of one candidate point to be added
+as an additional cluster center.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_KmeansPlusPlusInitialization.pbtxt b/tensorflow/core/api_def/base_api/api_def_KmeansPlusPlusInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..461820486b94808346618bf0dbc756164032a044
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_KmeansPlusPlusInitialization.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "KmeansPlusPlusInitialization"
+  visibility: HIDDEN
+  in_arg {
+    name: "points"
+    description: <<END
+Matrix of shape (n, d). Rows are assumed to be input points.
+END
+  }
+  in_arg {
+    name: "num_to_sample"
+    description: <<END
+Scalar. The number of rows to sample. This value must not be larger than n.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+Scalar. Seed for initializing the random number generator.
+END
+  }
+  in_arg {
+    name: "num_retries_per_sample"
+    description: <<END
+Scalar. For each row that is sampled, this parameter
+specifies the number of additional points to draw from the current
+distribution before selecting the best. If a negative value is specified, a
+heuristic is used to sample O(log(num_to_sample)) additional points.
+END
+  }
+  out_arg {
+    name: "samples"
+    description: <<END
+Matrix of shape (num_to_sample, d). The sampled rows.
+END
+  }
+  summary: "Selects num_to_sample rows of input using the KMeans++ criterion."
+  description: <<END
+Rows of points are assumed to be input points. One row is selected at random.
+Subsequent rows are sampled with probability proportional to the squared L2
+distance from the nearest row selected thus far till num_to_sample rows have
+been sampled.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NearestNeighbors.pbtxt b/tensorflow/core/api_def/base_api/api_def_NearestNeighbors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bdf68fb9f0e42a0ac31334f122cf66357cc2579
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NearestNeighbors.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "NearestNeighbors"
+  visibility: HIDDEN
+  in_arg {
+    name: "points"
+    description: <<END
+Matrix of shape (n, d). Rows are assumed to be input points.
+END
+  }
+  in_arg {
+    name: "centers"
+    description: <<END
+Matrix of shape (m, d). Rows are assumed to be centers.
+END
+  }
+  in_arg {
+    name: "k"
+    description: <<END
+Number of nearest centers to return for each point. If k is larger than m, then
+only m centers are returned.
+END
+  }
+  out_arg {
+    name: "nearest_center_indices"
+    description: <<END
+Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+closest to the corresponding point, ordered by increasing distance.
+END
+  }
+  out_arg {
+    name: "nearest_center_distances"
+    description: <<END
+Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+corresponding center in nearest_center_indices.
+END
+  }
+  summary: "Selects the k nearest centers for each point."
+  description: <<END
+Rows of points are assumed to be input points. Rows of centers are assumed to be
+the list of candidate centers. For each point, the k centers that have least L2
+distance to it are computed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NextAfter.pbtxt b/tensorflow/core/api_def/base_api/api_def_NextAfter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5dfeef735aad6631d14e7b8211fd3b60e13d8791
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NextAfter.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "NextAfter"
+  summary: "Returns the next representable value of `x1` in the direction of `x2`, element-wise."
+  description: <<END
+This operation returns the same result as the C++ std::nextafter function.
+
+It can also return a subnormal number.
+
+@compatibility(cpp)
+Equivalent to C++ std::nextafter function.
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17ff15378c90f709ec6a2428a9c6408f23eeabe8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3ab3eba2c0bf06bf8a41eabc0020582c3ada8ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b00c2b7f650260d7d2150935ddfab1d65fac335
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f309f648cafb307569bdabe496ca44c8c200c585
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBias"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6b73eaae3613238d17900a4f15a7ad6839d92a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..101f72708af5cc92155b0641a14fc89889fa7488
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..697e26841539603ce2f6d26a082378881ce214a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cf52d6c897f9dc4e1e4988259b1c74043203727
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e91a2b8dc063c60cb2d8cd104bac864d063eee3b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSumAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ec528bf4a64bca8531d6daa90af2b13cebcec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
index 70ad5219267fcc84368f072a6f5a122b6cc11a89..2cc1a55676c354c9470287ccb89e39489ab18c02 100644
--- a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
@@ -10,7 +10,7 @@ op {
   }
   in_arg {
     name: "rewrite"
-    description: "The rewrite to be applied to the matched expresion."
+    description: "The rewrite to be applied to the matched expression."
   }
   out_arg {
     name: "output"
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
index d9c4d5a4a4008c439ece7fde52a2913f6a50956d..b0458207e6eb8b18a21e1f67b84e691fb5601e9a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -28,10 +28,8 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Adds sparse `updates` to individual values or slices within a given"
+  summary: "Applies sparse addition to individual values or slices in a Variable."
   description: <<END
-variable according to `indices`.
-
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
@@ -44,24 +42,24 @@ dimension of `ref`.
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
 ```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 ```
 
-For example, say we want to update 4 scattered elements to a rank-1 tensor to
-8 elements. In Python, that update would look like this:
+For example, say we want to add 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that addition would look like this:
 
 ```python
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-    indices = tf.constant([[4], [3], [1] ,[7]])
-    updates = tf.constant([9, 10, 11, 12])
-    update = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(update)
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+add = tf.scatter_nd_add(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(add)
 ```
 
 The resulting update to ref would look like this:
 
-    [1, 12, 3, 14, 14, 6, 7, 20]
+    [1, 13, 3, 14, 14, 6, 7, 20]
 
 See `tf.scatter_nd` for more details about how to make updates to
 slices.
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f12f4b5f34767e54bdd9c4ede9cb2c495eda723f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "ResourceScatterNdSub"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse subtraction to individual values or slices in a Variable."
+  description: <<END
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
+
+For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+with 8 elements. In Python, that subtraction would look like this:
+
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+sub = tf.scatter_nd_sub(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(sub)
+```
+
+The resulting update to ref would look like this:
+
+    [1, -9, 3, -6, -4, 6, 7, -4]
+
+See `tf.scatter_nd` for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
index 5929425bc80f218627a7977a7b4e869715f7963b..b8fbcbbed29de68088db9ee12ae86cde5c7d6aa8 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
@@ -35,14 +35,12 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Applies sparse addition between `updates` and individual values or slices"
+  summary: "Applies sparse addition to individual values or slices in a Variable."
   description: <<END
-within a given variable according to `indices`.
-
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
-It must be shape `\\([d_0, ..., d_{Q-2}, K]\\)` where `0 < K <= P`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
 The innermost dimension of `indices` (with length `K`) corresponds to
 indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
@@ -50,17 +48,21 @@ dimension of `ref`.
 
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
-$$[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].$$
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
 
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
+For example, say we want to add 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that addition would look like this:
 
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    add = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(add)
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+add = tf.scatter_nd_add(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(add)
+```
 
 The resulting update to ref would look like this:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
index 67346f051e75b68bc98b0e9026849f1c0f512939..b557addb7ce872edb76199a071907c59c8454abb 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
@@ -35,14 +35,14 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Applies sparse subtraction between `updates` and individual values or slices"
+  summary: "Applies sparse subtraction to individual values or slices in a Variable."
   description: <<END
 within a given variable according to `indices`.
 
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
-It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
 The innermost dimension of `indices` (with length `K`) corresponds to
 indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
@@ -50,17 +50,21 @@ dimension of `ref`.
 
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
-$$[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].$$
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
 
 For example, say we want to subtract 4 scattered elements from a rank-1 tensor
 with 8 elements. In Python, that subtraction would look like this:
 
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    sub = tf.scatter_nd_sub(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(sub)
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+sub = tf.scatter_nd_sub(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(sub)
+```
 
 The resulting update to ref would look like this:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d963c55278d5b5638497d74677e6329a3aa615e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormal.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "StatefulStandardNormal"
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random normal values.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
index e382bcec814ecd2944bdb5ba5bffbc6d980479e4..8bb88f491abb4f4142724509690b336578aec791 100644
--- a/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
@@ -14,7 +14,7 @@ op {
   }
   attr {
     name: "rewrite"
-    description: "The rewrite to be applied to the matched expresion."
+    description: "The rewrite to be applied to the matched expression."
   }
   attr {
     name: "replace_global"
diff --git a/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt b/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
index 0fc89576ad29939837da7c55e393a0baeca90e5e..c5177612ef45573d2244eaaefceae8d0dbfbf2d5 100644
--- a/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
@@ -4,7 +4,7 @@ op {
   description: <<END
 The values of `value` are assigned to the positions in the variable
 `ref` that are selected by the slice parameters. The slice parameters
-`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+`begin`, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 
 NOTE this op currently does not support broadcasting and so `value`'s
 shape must be exactly the shape produced by the slice of `ref`.
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b34f8cec7e1c62142d280ad43e11c14afef30e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "TensorListResize"
+  summary: "Resizes the list."
+  description: <<END
+
+input_handle: the input list
+size: size of the output list
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f520900fc0ce06d3fd6bb9bff4e164260ba71f0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorListScatterV2"
+  summary: "Creates a TensorList by indexing into a Tensor."
+  description: <<END
+Each member of the TensorList corresponds to one row of the input tensor,
+specified by the given index (see `tf.gather`).
+
+tensor: The input tensor.
+indices: The indices used to index into the list.
+element_shape: The shape of the elements in the list (can be less specified than
+  the shape of the tensor).
+num_elements: The size of the output list. Must be large enough to accommodate
+  the largest index in indices. If -1, the list is just large enough to include
+  the largest index in indices.
+output_handle: The TensorList.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 7a60e4387ad0078d51eba026fcd2d9454a50e4ec..ed4a2bd5588eecb19d9d5effb386b2fe5c0c4409 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -3,7 +3,8 @@ op {
   in_arg {
     name: "segment_ids"
     description: <<END
-A tensor whose shape is a prefix of `data.shape`.END
+A tensor whose shape is a prefix of `data.shape`.
+END
   }
   out_arg {
     name: "output"
diff --git a/tensorflow/core/api_def/java_api/api_def_Abort.pbtxt b/tensorflow/core/api_def/java_api/api_def_Abort.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58448c2d17b6f4d323e5b4c041bcbdf559c98a5a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Abort.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Abort"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Abs.pbtxt b/tensorflow/core/api_def/java_api/api_def_Abs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ece45cf73f336792c532040370cc44f8709e397a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Abs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Abs"
+  endpoint {
+    name: "math.Abs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulateNV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulateNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c7a080c1159a1b4885c6d8e14cddf3ca4d07ae1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulateNV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulateNV2"
+  endpoint {
+    name: "math.AccumulateN"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49b7acad7d829838c3ba40cbdb97f1bafc96306d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorApplyGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorApplyGradient"
+  endpoint {
+    name: "train.AccumulatorApplyGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorNumAccumulated.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorNumAccumulated.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c42e819bf4d5e7bf80d42f96d13961c844f0eb8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorNumAccumulated.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorNumAccumulated"
+  endpoint {
+    name: "train.AccumulatorNumAccumulated"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorSetGlobalStep.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorSetGlobalStep.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca85302cdb4bbb1833eda5f63ce15a925ba5ee3e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorSetGlobalStep.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorSetGlobalStep"
+  endpoint {
+    name: "train.AccumulatorSetGlobalStep"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4883802c637e0d9298b4807b9ef25c2e32f2476e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorTakeGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorTakeGradient"
+  endpoint {
+    name: "train.AccumulatorTakeGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/java_api/api_def_Acos.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..847986b429de1f041e28819c33d6a1894f91f229
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Acos.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Acos"
+  endpoint {
+    name: "math.Acos"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Acosh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Acosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76d8f5fad05aae0372fae02d03c4f1da9af7343d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Acosh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Acosh"
+  endpoint {
+    name: "math.Acosh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Add.pbtxt b/tensorflow/core/api_def/java_api/api_def_Add.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f78ccc9ea6ec7e2ca5960d384dab1ae0b85cb47
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Add.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Add"
+  endpoint {
+    name: "math.Add"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddManySparseToTensorsMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddManySparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e009ba19d34f742823d00e4740260a35ef0e7b95
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddManySparseToTensorsMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AddManySparseToTensorsMap"
+  endpoint {
+    name: "sparse.AddManySparseToTensorsMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddN.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20d469ae731ec7cde431988ff198474c67c9d694
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddN.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AddN"
+  endpoint {
+    name: "math.AddN"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddSparseToTensorsMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddSparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0bb20186de38f681d659c744f686738adb5e76cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddSparseToTensorsMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AddSparseToTensorsMap"
+  endpoint {
+    name: "sparse.AddSparseToTensorsMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a070c6a51939639b7820572d8d464c79a7cd1ccb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AddV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustContrast.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustContrast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daad141027a7d6c36a0624c7ce3b92a7cb409b6c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustContrast.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustContrast"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustContrastv2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustContrastv2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81f565c1d594754a3889abb0debee81ab8bf746d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustContrastv2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AdjustContrastv2"
+  endpoint {
+    name: "image.AdjustContrast"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustHue.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustHue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0847cad4031f281f65ea19dfddccdbf1f25bc5e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustHue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AdjustHue"
+  endpoint {
+    name: "image.AdjustHue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustSaturation.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustSaturation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d685636eb12426b4755b67d55fd5f986b7a285e4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustSaturation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AdjustSaturation"
+  endpoint {
+    name: "image.AdjustSaturation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_All.pbtxt b/tensorflow/core/api_def/java_api/api_def_All.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6459c56b71f359bad5a2fda9e605eb25471e5a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_All.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "All"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AllCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_AllCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..607c208a460b923df35da8f542402380c8cdebae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AllCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AllCandidateSampler"
+  endpoint {
+    name: "random.AllCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Angle.pbtxt b/tensorflow/core/api_def/java_api/api_def_Angle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a92ccf357dbd1be80b946ea6683e48f30de5f918
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Angle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Angle"
+  endpoint {
+    name: "math.Angle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AnonymousIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_AnonymousIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..894f85ae88e7961db328d842a358879df71dd9dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AnonymousIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AnonymousIterator"
+  endpoint {
+    name: "data.AnonymousIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Any.pbtxt b/tensorflow/core/api_def/java_api/api_def_Any.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20b36eda3f8c4cb231b39a2a88f45f756ab42326
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Any.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Any"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583f164e06c17f1f0192a2a30d22665f05d0f2df
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  endpoint {
+    name: "train.ApplyAdaMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e672a8ef03bcec665878fd2c927cff7458b70af6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdadelta"
+  endpoint {
+    name: "train.ApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..980c57c5fecc0d93655efd781efcadfa2163061c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdagrad"
+  endpoint {
+    name: "train.ApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..815df985ef98d18fd45bce603416aea4e1c90387
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdagradDA"
+  endpoint {
+    name: "train.ApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56461b1d3d582d728976a3685ab3d42d4fa90caa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdam.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdam"
+  endpoint {
+    name: "train.ApplyAdam"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAddSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b54ff6eca44a4103e08bef4f69f86e5283949863
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAddSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAddSign"
+  endpoint {
+    name: "train.ApplyAddSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b831bca43675334edc0e7a0cc2565d3e1019f9b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyCenteredRMSProp"
+  endpoint {
+    name: "train.ApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..176de19a9a7f8ac71bbb8038aa20dc26b19b9452
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da0fc8fcbf794ed17e4c04291719b67721669da6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyFtrlV2"
+  endpoint {
+    name: "train.ApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fa569ed329d73b5179fd0d00c2d21035299820e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyGradientDescent"
+  endpoint {
+    name: "train.ApplyGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96c21199f0902a97846e86362c64b49491fdea57
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyMomentum"
+  endpoint {
+    name: "train.ApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyPowerSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5c22347556d0cccf335ac9d5f217b5c459e5afc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyPowerSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyPowerSign"
+  endpoint {
+    name: "train.ApplyPowerSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a52d8c3591c13a8d9843856f4845cf4b762183fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyProximalAdagrad"
+  endpoint {
+    name: "train.ApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74ea29cf8882436e9d27a3ddcc1b43ff7a87b460
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyProximalGradientDescent"
+  endpoint {
+    name: "train.ApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90171ccc759c1cef4cccc2c5ee44bfd7571d0145
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyRMSProp"
+  endpoint {
+    name: "train.ApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApproximateEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApproximateEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..029dc6d29305049af5c818d05f5a4b13e53443ea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApproximateEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApproximateEqual"
+  endpoint {
+    name: "math.ApproximateEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ArgMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ArgMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9effd49c4a68f79de7473308490c576775ae2fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ArgMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ArgMax"
+  endpoint {
+    name: "math.ArgMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ArgMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ArgMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ff04c0d1ab01ab2757fd18dff22755681f0a96d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ArgMin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ArgMin"
+  endpoint {
+    name: "math.ArgMin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/java_api/api_def_AsString.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8c875ea8141d52d29bd7ef467f97d01b201187c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AsString.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AsString"
+  endpoint {
+    name: "dtypes.AsString"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Asin.pbtxt b/tensorflow/core/api_def/java_api/api_def_Asin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ffc8e3e570eeee82c69503e9f56f5ff2c9ebc19
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Asin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Asin"
+  endpoint {
+    name: "math.Asin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Asinh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Asinh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e3b30dd51254efd628e41615a9d08dc100f284f4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Asinh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Asinh"
+  endpoint {
+    name: "math.Asinh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
index b1f868897d5b88ac76eb8f85ace99c4ce3c3e037..a9e107b4780ab2405db65cf29a369495051b4c64 100644
--- a/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
+++ b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
@@ -1,4 +1,6 @@
 op {
-  graph_op_name: "Assert" #TODO(karllessard) escape that reserved name
-  visibility: HIDDEN
+  graph_op_name: "Assert"
+  endpoint {
+    name: "AssertThat"
+  }
 }
diff --git a/tensorflow/core/api_def/java_api/api_def_Assign.pbtxt b/tensorflow/core/api_def/java_api/api_def_Assign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15d778f61e86ada53f5be1c7e2fc29c78f37333b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Assign.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Assign"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4118b64afd98192523d372aed99b8717d3ca9fb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignAddVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05fecb191bf75f92f23a384f864d3d8c33d43489
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignAddVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignAddVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aaf9246a6ac2d809afec616842c11fdaa48c37e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignSubVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e8791aed2d9f7a72ae18e343c307cb46dd52694
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignSubVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignSubVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..980e6968269e1ac35193920575f0619a4fba4a16
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Atan.pbtxt b/tensorflow/core/api_def/java_api/api_def_Atan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e51aee9abc4d4b966dc59af4004b89618b9b09e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Atan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Atan"
+  endpoint {
+    name: "math.Atan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Atan2.pbtxt b/tensorflow/core/api_def/java_api/api_def_Atan2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..302b05f9dce7c8383253f06d0f5f60191e110d54
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Atan2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Atan2"
+  endpoint {
+    name: "math.Atan2"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Atanh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Atanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9c4a4115443525152aaef949a30106b6a3cbeb8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Atanh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Atanh"
+  endpoint {
+    name: "math.Atanh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AudioSpectrogram.pbtxt b/tensorflow/core/api_def/java_api/api_def_AudioSpectrogram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd8f3a5e3353d1acabf0e264c4de09416af49ec0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AudioSpectrogram.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AudioSpectrogram"
+  endpoint {
+    name: "audio.AudioSpectrogram"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AudioSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_AudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13d30de29dc78642b421087040000dc97b8c7963
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AudioSummary"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AudioSummaryV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AudioSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4eda8b09ab44f933ef8ae650cfc39aaeece8d7b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AudioSummaryV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AudioSummaryV2"
+  endpoint {
+    name: "summary.AudioSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10d87802f0d85379c5789b897bd08dab1d5ec1a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool"
+  endpoint {
+    name: "nn.AvgPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ae2794f48b9b1174cf8de0f3d18259a2ab0d3a3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPool3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool3D"
+  endpoint {
+    name: "nn.AvgPool3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPool3DGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09aba78ca209abe86700e6afa5181e7222e1e580
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPool3DGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool3DGrad"
+  endpoint {
+    name: "nn.AvgPool3dGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc8fec89b992c961a5f78208801d5a7a1e754d53
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPoolGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPoolGrad"
+  endpoint {
+    name: "nn.AvgPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Barrier.pbtxt b/tensorflow/core/api_def/java_api/api_def_Barrier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e282ca7b390c9c2334224dc8049e828582de370
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Barrier.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Barrier"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0307318763b8450b7a0f42b0df90bae64162e394
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierClose.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierClose"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierIncompleteSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb11b18e951e75e476fddd2c7f876c69013bef5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierIncompleteSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierIncompleteSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierInsertMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierInsertMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..32e29f00158ae147399dd9d71a5f0a5d1fa95d52
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierInsertMany.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierInsertMany"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierReadySize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierReadySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ed50b257994ed0466eb5f26612d02f306ddd8ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierReadySize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierReadySize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierTakeMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierTakeMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21f08878c6d76a4426da0448cc55e44283d25305
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierTakeMany.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierTakeMany"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Batch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Batch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c21faf72d5c0850d9761f8c98ee9ee892e9c293
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Batch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Batch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchCholesky.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchCholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15048109fd64c3f2ef66341f96f87fe7cbe3717a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchCholesky.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchCholesky"
+  endpoint {
+    name: "linalg.BatchCholesky"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchCholeskyGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchCholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb0e2c6bc83c3ff93e9a635fae0e3e23b7333a23
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchCholeskyGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchCholeskyGrad"
+  endpoint {
+    name: "linalg.BatchCholeskyGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0bb7298ba90625fcb6a9b5227277db9b86e21bf6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd81b0b1cf44c20c0e8c3d51deb77e450e8a5b96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchDatasetV2"
+  endpoint {
+    name: "data.BatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dda7c1fb61ac2c6336582b99c2b4ebc23cc808b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchFFT"
+  endpoint {
+    name: "signal.BatchFft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e11860138a23888b5b51634bf0e6082570d15fc9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchFFT2D"
+  endpoint {
+    name: "signal.BatchFft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3be0b516d0e73acada03f1be1dd0816def291c1a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchFFT3D"
+  endpoint {
+    name: "signal.BatchFft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFunction.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFunction.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8789dc6acb6355a0079dd85d36a0da9e1c675a94
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFunction.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchFunction"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchIFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchIFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de37ada148acde00333b377288876df6d38994c1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchIFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchIFFT"
+  endpoint {
+    name: "signal.BatchIfft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchIFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchIFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae7fb4cb0ae2dbd476617a350be79f8107af4f8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchIFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchIFFT2D"
+  endpoint {
+    name: "signal.BatchIfft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchIFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchIFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ecb52714b53419447922b5aa97cb18f3c413b56
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchIFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchIFFT3D"
+  endpoint {
+    name: "signal.BatchIfft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95aa6446157deca2318b9e0ae417b18748b01f31
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatMul"
+  endpoint {
+    name: "linalg.BatchMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixBandPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de989c6d527e45322fd7cd668a67afaffb32e9c7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixBandPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixBandPart"
+  endpoint {
+    name: "linalg.BatchMatrixBandPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixDeterminant.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a45fe25d10c0dbc205d4e5d1424c3a6c5ae9d166
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDeterminant.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixDeterminant"
+  endpoint {
+    name: "linalg.BatchMatrixDeterminant"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d85d76f02f04114d5ef8a12bad6136d550b4eb95
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixDiag"
+  endpoint {
+    name: "linalg.BatchMatrixDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiagPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4b5350b11eedcbd8b47ec7977bd275f633671561
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiagPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixDiagPart"
+  endpoint {
+    name: "linalg.BatchMatrixDiagPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixInverse.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f40ea50d4bcb878ce04609460bada01c17ccad2c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixInverse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixInverse"
+  endpoint {
+    name: "linalg.BatchMatrixInverse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixSetDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac4cd6889b63a562643d5d1bbd4d9b0686d224ff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSetDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixSetDiag"
+  endpoint {
+    name: "linalg.BatchMatrixSetDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97435acb4e49cc1c2cf10e969dd9ab052da5f61a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixSolve"
+  endpoint {
+    name: "linalg.BatchMatrixSolve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolveLs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aee0b4add3577ee97e5a4eac802e6fda47153585
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolveLs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixSolveLs"
+  endpoint {
+    name: "linalg.BatchMatrixSolveLs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..554eff15747871acdb5248b1488004e5705d1fb9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixTriangularSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixTriangularSolve"
+  endpoint {
+    name: "linalg.BatchMatrixTriangularSolve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8285ac284d8427cc7334747891e799e3ebc441b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalization"
+  endpoint {
+    name: "nn.BatchNormWithGlobalNormalization"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b18bf52accb2ef990ba96719d8fa97643fff4ea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalizationGrad"
+  endpoint {
+    name: "nn.BatchNormWithGlobalNormalizationGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEig.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..517030fd692d4c8641615338eb4e376cbaaa86a4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEig.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchSelfAdjointEig"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c973443902563cce4adda3fdc6d526d6fa740e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEigV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchSelfAdjointEigV2"
+  endpoint {
+    name: "linalg.BatchSelfAdjointEig"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchSvd.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchSvd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8696359df8d5b130979681e190aaab89c230243e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchSvd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchSvd"
+  endpoint {
+    name: "linalg.BatchSvd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchToSpace.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..affbc519e514e39a86736121c56947fcf9075353
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchToSpace.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchToSpace"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchToSpaceND.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchToSpaceND.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c7d2fbdb9fed77d3c9484b2a8442e7a16179641
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchToSpaceND.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchToSpaceND"
+  endpoint {
+    name: "BatchToSpaceNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/java_api/api_def_BesselI0e.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84eb3b5e71d6e67ce36e9ed0103468442a974fe1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BesselI0e.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BesselI0e"
+  endpoint {
+    name: "math.BesselI0e"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/java_api/api_def_BesselI1e.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43f9113b0bbe53a076719226b659f5598bb1c919
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BesselI1e.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BesselI1e"
+  endpoint {
+    name: "math.BesselI1e"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Betainc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Betainc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e9956d9ec72df62cc5db845c8f15753d2e1bc7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Betainc.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Betainc"
+  endpoint {
+    name: "math.Betainc"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BiasAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_BiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb3be23bd9a14b376c2e127137a694afbf95bd32
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BiasAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BiasAdd"
+  endpoint {
+    name: "nn.BiasAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BiasAddGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_BiasAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e040bf6df807f7395381572dee931ec188ea724
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BiasAddGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BiasAddGrad"
+  endpoint {
+    name: "nn.BiasAddGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BiasAddV1.pbtxt b/tensorflow/core/api_def/java_api/api_def_BiasAddV1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..651c434e6459a9e1bbc2bb399572a3752bdb9569
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BiasAddV1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BiasAddV1"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BigQueryReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_BigQueryReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b6e11687a2d73e706ebaa33c3c122bb43796f97
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BigQueryReader.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BigQueryReader"
+  endpoint {
+    name: "io.BigQueryReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Bincount.pbtxt b/tensorflow/core/api_def/java_api/api_def_Bincount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b894fd6ec5e5266bfdafd4866e4099479f0aecea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Bincount.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Bincount"
+  endpoint {
+    name: "math.Bincount"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Bitcast.pbtxt b/tensorflow/core/api_def/java_api/api_def_Bitcast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d2db26851d02076c17f802a89d04e257f407f68
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Bitcast.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Bitcast"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BitwiseAnd.pbtxt b/tensorflow/core/api_def/java_api/api_def_BitwiseAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db5fada2461e313e40a755b0974cc061a960e1c7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BitwiseAnd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseAnd"
+  endpoint {
+    name: "bitwise.BitwiseAnd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BitwiseOr.pbtxt b/tensorflow/core/api_def/java_api/api_def_BitwiseOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f9d1bc2fe4dcf6d9a7836e18f62edeb02795547
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BitwiseOr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseOr"
+  endpoint {
+    name: "bitwise.BitwiseOr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BitwiseXor.pbtxt b/tensorflow/core/api_def/java_api/api_def_BitwiseXor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28f405b8adac55d336985aa74f1dc44dbe2e2d46
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BitwiseXor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseXor"
+  endpoint {
+    name: "bitwise.BitwiseXor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesBucketize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..acec845ca4708c0a0e7d90d5ce380dab3f074eb0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesBucketize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesBucketize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa22216ed5b92a30ff7d64f1924d128b45cf5111
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCalculateBestGainsPerFeature"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCenterBias.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCenterBias.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b65775a126ff9d29635cd066214a6ef48c4b604
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCenterBias.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCenterBias"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..194251d4337bc3df80c33d8ad3fa2281df74c110
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCreateEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d9c8c9229c61e75f6bc8d6fdc08fa2617077f48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCreateQuantileStreamResource"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ed17ca30f495d8de16c62861f7365b79ca01040
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesDeserializeEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a44c86614a30ff8f2686191cadb8d386f3c493a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesEnsembleResourceHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d6f276911617edc905d37b699087912e96a2179
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesExampleDebugOutputs"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a591013fb71b03a9c543443376833fd4ce1e278e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesGetEnsembleStates"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e147765a7b84fd03819aa1d6623d0bbaf6c5bfc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesMakeQuantileSummaries"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeStatsSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbee8bb47ca54e1e4b4a11abfd061d5feb688533
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesMakeStatsSummary"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e25d43a18fb382d56a5485439bca40587e337bad
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesPredict.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesPredict"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9fe96d3cd1b8dca091617fd9eb958ea9fcdfdab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceAddSummaries"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86e80902417f877ff8ad5622519f06a60a9ea820
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceDeserialize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c98375bb24119c30d6a4c33e74d274c4a72e01ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceFlush"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e0421be40aabb4a17ec2df719a6917968c5dfd40
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b7f5e8aa65d8d913a3702d47948d25a33f29d5d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesSerializeEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db5807344bd6ff0556a6d8a335cd432b223ef075
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesSerializeEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesTrainingPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b722233953b6e6b11daf38818bc44b030960b6da
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesTrainingPredict"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb642dd430e3ffa97910a41335c459ea1378a441
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesUpdateEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BroadcastArgs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BroadcastArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..484742a2d02739a4129961768fd7221d1976a05d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BroadcastArgs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BroadcastArgs"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BroadcastGradientArgs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BroadcastGradientArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50f631b2a694ee353551f1c345872da56c8d4ed3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BroadcastGradientArgs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BroadcastGradientArgs"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..127458816ce278404877c255a581618c6e236fac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BroadcastTo"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Bucketize.pbtxt b/tensorflow/core/api_def/java_api/api_def_Bucketize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a99712fd6cebd5a4b3d53f65903524d01821aa0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Bucketize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Bucketize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_BytesProducedStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd7f24d961415c3329ba8f564edfcde49e02077d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BytesProducedStatsDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BytesProducedStatsDataset"
+  endpoint {
+    name: "data.BytesProducedStatsDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CTCBeamSearchDecoder.pbtxt b/tensorflow/core/api_def/java_api/api_def_CTCBeamSearchDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39739f03a315996d455af77c9743e71c7707e48f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CTCBeamSearchDecoder.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CTCBeamSearchDecoder"
+  endpoint {
+    name: "nn.CtcBeamSearchDecoder"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CTCGreedyDecoder.pbtxt b/tensorflow/core/api_def/java_api/api_def_CTCGreedyDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..009742f097389146c8d9d432860bfcbbe5151a39
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CTCGreedyDecoder.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CTCGreedyDecoder"
+  endpoint {
+    name: "nn.CtcGreedyDecoder"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CTCLoss.pbtxt b/tensorflow/core/api_def/java_api/api_def_CTCLoss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbeefa4017181ed291b0de2777f7c8fcee3af1fe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CTCLoss.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CTCLoss"
+  endpoint {
+    name: "nn.CtcLoss"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_CacheDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11c26c1dfc58eff917bfbc41c32a42c1ad39a9de
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CacheDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CacheDataset"
+  endpoint {
+    name: "data.CacheDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cast.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea9f812e2a1b25c14022588dcf1dbeca0a05d5ee
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cast.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cast"
+  endpoint {
+    name: "dtypes.Cast"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Ceil.pbtxt b/tensorflow/core/api_def/java_api/api_def_Ceil.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1a75f27d9ae6494d9fb38d7295d97a416b5a731
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Ceil.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Ceil"
+  endpoint {
+    name: "math.Ceil"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CheckNumerics.pbtxt b/tensorflow/core/api_def/java_api/api_def_CheckNumerics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..731e9030a039aed7d4c899aca24ccec5635e0fcc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CheckNumerics.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CheckNumerics"
+  endpoint {
+    name: "math.CheckNumerics"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cholesky.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a60c4e3663e28128303435f845db9f319f1dd6b7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cholesky.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cholesky"
+  endpoint {
+    name: "linalg.Cholesky"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CholeskyGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_CholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2601d41554206fb268b00add8493d2184dee5ffa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CholeskyGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CholeskyGrad"
+  endpoint {
+    name: "linalg.CholeskyGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4e055c117c140e9e027983917b31014a6892690
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ClipByValue"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CloseSummaryWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_CloseSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d5fbe557db0b3583db341692279ab262715900de
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CloseSummaryWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CloseSummaryWriter"
+  endpoint {
+    name: "summary.CloseSummaryWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CollectiveBcastRecv.pbtxt b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastRecv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ada333e4466d26cb892e979e5b7eac141ac922f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastRecv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CollectiveBcastRecv"
+  endpoint {
+    name: "collective.BroadcastRecv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CollectiveBcastSend.pbtxt b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastSend.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18b4bef345e4f8d1667860eae6b6612643076376
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastSend.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CollectiveBcastSend"
+  endpoint {
+    name: "collective.BroadcastSend"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CollectiveReduce.pbtxt b/tensorflow/core/api_def/java_api/api_def_CollectiveReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6226cc05ec3eef71864af69372273011d2d4c14c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CollectiveReduce.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CollectiveReduce"
+  endpoint {
+    name: "collective.AllReduce"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CompareAndBitpack.pbtxt b/tensorflow/core/api_def/java_api/api_def_CompareAndBitpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d744fbbc90fcc631ab626fd7ab9fedcb795cb88b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CompareAndBitpack.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CompareAndBitpack"
+  endpoint {
+    name: "math.CompareAndBitpack"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Complex.pbtxt b/tensorflow/core/api_def/java_api/api_def_Complex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4889360a96af146a97ef22add49c1d8167e07697
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Complex.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Complex"
+  endpoint {
+    name: "dtypes.Complex"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ComplexAbs.pbtxt b/tensorflow/core/api_def/java_api/api_def_ComplexAbs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42a6a3c6a1c56f00f89d3bfdab13806f4acb5031
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ComplexAbs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ComplexAbs"
+  endpoint {
+    name: "math.ComplexAbs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ComputeAccidentalHits.pbtxt b/tensorflow/core/api_def/java_api/api_def_ComputeAccidentalHits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca9e590fbce09a0e7a64229077320e1507f8fa84
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ComputeAccidentalHits.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ComputeAccidentalHits"
+  endpoint {
+    name: "nn.ComputeAccidentalHits"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Concat.pbtxt b/tensorflow/core/api_def/java_api/api_def_Concat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2fc7eef887c053fa3e7c0a2a1d5065332022018
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Concat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Concat"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConcatOffset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConcatOffset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8e23cf5593b274732fd9461ceecdbdaaad8476f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConcatOffset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ConcatOffset"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConcatV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7035796981f4ce98c27488e3f5aef49dad4ed8cd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConcatV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConcatV2"
+  endpoint {
+    name: "Concat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConcatenateDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec8253e9b5bf1a69d1c9fbc15cc32a688b749ba6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConcatenateDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConcatenateDataset"
+  endpoint {
+    name: "data.ConcatenateDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConditionalAccumulator.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08431982daa9e259676c26fcda8311912dfba423
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConditionalAccumulator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConditionalAccumulator"
+  endpoint {
+    name: "train.ConditionalAccumulator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conj.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conj.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7de199b55fa9dd93f1c7741a7c2fcba555b8b406
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conj.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conj"
+  endpoint {
+    name: "math.Conj"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConjugateTranspose.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConjugateTranspose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42173088ae0e45c959ffc2ae92f03dba1f1caae3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConjugateTranspose.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConjugateTranspose"
+  endpoint {
+    name: "linalg.ConjugateTranspose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Const.pbtxt b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
index 2dbdca34e0072e4b92f9f9ae7f721c1485d75285..a73f1e6c3ad9193587bd3e48c536edd79dd9448b 100644
--- a/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
+++ b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
@@ -1,4 +1,4 @@
 op {
-  graph_op_name: "Const" #TODO(karllessard) escape that reserved name
-  visibility: HIDDEN
+  graph_op_name: "Const"
+  visibility: SKIP
 }
diff --git a/tensorflow/core/api_def/java_api/api_def_ConsumeMutexLock.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConsumeMutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e0d136bc2f5b70fbf7557a8aa2bc37678e8240a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConsumeMutexLock.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ConsumeMutexLock"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ControlTrigger.pbtxt b/tensorflow/core/api_def/java_api/api_def_ControlTrigger.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4517b4373f3f736eca06e3e1b6f015be141af29b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ControlTrigger.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ControlTrigger"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21d1398e0980311593564c142ff094786f7a2b05
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2D"
+  endpoint {
+    name: "nn.Conv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30eb55c6f2872a63963d202f8f7d13bbb892d7e4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2DBackpropFilter"
+  endpoint {
+    name: "nn.Conv2dBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c98646c137bf97bafb0ff82c9416374effd2c21
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2DBackpropInput"
+  endpoint {
+    name: "nn.Conv2dBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ee1befcff19c373b34ce171db21fc8d60ae04dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3D"
+  endpoint {
+    name: "nn.Conv3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e5f6c99d50d275804eb4971c0fcc1b730afbf3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropFilter"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0643cc14a9a362472cdd3f634b0d5debef825e89
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3DBackpropFilterV2"
+  endpoint {
+    name: "nn.Conv3dBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbb2c9f136b6577ad5f17773b81e0fb87b266bb3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropInput"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInputV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInputV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33c8f5a3ce0577f7dd2f92188af1c38b1ac6e4c4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInputV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3DBackpropInputV2"
+  endpoint {
+    name: "nn.Conv3dBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cos.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cos.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db1f62806e255fcb750eecd2a88844b2d530162c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cos.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cos"
+  endpoint {
+    name: "math.Cos"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cosh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4b5e752bf5d5149d32b119ae7b5debbc805d162
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cosh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cosh"
+  endpoint {
+    name: "math.Cosh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CountUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_CountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb9f328ce0cf96eb582577b599d2b7197866c913
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CountUpTo.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "CountUpTo"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CreateSummaryDbWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_CreateSummaryDbWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..299f881dd44cd7fe92d3e24b99581e74d1001bbe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CreateSummaryDbWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CreateSummaryDbWriter"
+  endpoint {
+    name: "summary.CreateSummaryDbWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CreateSummaryFileWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_CreateSummaryFileWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26c7941ce57670ab0c6cc30ef2bc958edf95b391
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CreateSummaryFileWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CreateSummaryFileWriter"
+  endpoint {
+    name: "summary.CreateSummaryFileWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/java_api/api_def_CropAndResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbf9aa8f99639083cecd895accd85ee90aa2297c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CropAndResize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResize"
+  endpoint {
+    name: "image.CropAndResize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradBoxes.pbtxt b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44354bdfa03fee68e594f2d1265a61c81c074510
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradBoxes.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResizeGradBoxes"
+  endpoint {
+    name: "image.CropAndResizeGradBoxes"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradImage.pbtxt b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradImage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0618db9a8d715ddf854f5f6e13b11f2376a07bc1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradImage.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResizeGradImage"
+  endpoint {
+    name: "image.CropAndResizeGradImage"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cross.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c027884250e09948595d8bdef720f2534f91da54
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cross.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cross"
+  endpoint {
+    name: "linalg.Cross"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e88d20713f0ae44678f5bddf6e05fefb8cda3f2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNN"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c9fc5f029a0e76f85ac57f8b143d2a2e9ddb731
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackprop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNBackprop"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackpropV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackpropV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2e7ebc27d69eb5ef5a9bf79a2730d242899f226
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackpropV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNBackpropV2"
+  endpoint {
+    name: "nn.CudnnRnnBackprop"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNCanonicalToParams.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNCanonicalToParams.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d351fa860fc8c99099f241beb756ba4362d2124
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNCanonicalToParams.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNCanonicalToParams"
+  endpoint {
+    name: "nn.CudnnRnnCanonicalToParams"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3f1193fe6068e7443df5d88293dde0fdd6375ea6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsSize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNParamsSize"
+  endpoint {
+    name: "nn.CudnnRnnParamsSize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsToCanonical.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsToCanonical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2e4c6201e1b511637d71a612ba5e807215b2321
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsToCanonical.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNParamsToCanonical"
+  endpoint {
+    name: "nn.CudnnRnnParamsToCanonical"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6dd5f42fc9a33e83b7746799f5944350e344653
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNV2"
+  endpoint {
+    name: "nn.CudnnRnn"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cumprod.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cumprod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cb7862413daf26daac361d9ee6540f612bad19b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cumprod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cumprod"
+  endpoint {
+    name: "math.Cumprod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cumsum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cumsum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7d90765326c89a3661317056d06329fab35940d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cumsum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cumsum"
+  endpoint {
+    name: "math.Cumsum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DataFormatDimMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_DataFormatDimMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36ea17793fde8ab968cd871ff02c32b310f5f912
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DataFormatDimMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DataFormatDimMap"
+  endpoint {
+    name: "nn.DataFormatDimMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/java_api/api_def_DataFormatVecPermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6b7e2dc7672de636e61d8c2f5874be2337deba4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DataFormatVecPermute.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DataFormatVecPermute"
+  endpoint {
+    name: "nn.DataFormatVecPermute"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DatasetToGraph.pbtxt b/tensorflow/core/api_def/java_api/api_def_DatasetToGraph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e7d48961db295e6e2ef3d6ab403e61697e52ed8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DatasetToGraph.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DatasetToGraph"
+  endpoint {
+    name: "data.DatasetToGraph"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/java_api/api_def_DatasetToSingleElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ac42e0e9369b886b7889e08af861d4c6e967a43
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DatasetToSingleElement.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DatasetToSingleElement"
+  endpoint {
+    name: "data.DatasetToSingleElement"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/java_api/api_def_DatasetToTFRecord.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d388570630ae1f993df4577b263d8f16fcbc3f0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DatasetToTFRecord.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DatasetToTFRecord"
+  endpoint {
+    name: "data.DatasetToTfRecord"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DebugGradientIdentity.pbtxt b/tensorflow/core/api_def/java_api/api_def_DebugGradientIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d50c5c8687a2cb3f550a04654fc6f0d7ec86a89
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DebugGradientIdentity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DebugGradientIdentity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DebugGradientRefIdentity.pbtxt b/tensorflow/core/api_def/java_api/api_def_DebugGradientRefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e14e5fffd6e3683eec6eca65f587b5f0ab0016b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DebugGradientRefIdentity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DebugGradientRefIdentity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeAndCropJpeg.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeAndCropJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c07bb7a1bdf4de0860b001ba246ec231fafb1edc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeAndCropJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeAndCropJpeg"
+  endpoint {
+    name: "image.DecodeAndCropJpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeBase64.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeBase64.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49c93453f7b9ea52e122ece339f2845e36570bb1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeBase64.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeBase64"
+  endpoint {
+    name: "io.DecodeBase64"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeBmp.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeBmp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..049cfa153d190f1c63e800f7da4f38a417f4bde8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeBmp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeBmp"
+  endpoint {
+    name: "image.DecodeBmp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeCSV.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeCSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d60107adab5f3ef845556ccd752bc10dd8f48be
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeCSV.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeCSV"
+  endpoint {
+    name: "io.DecodeCsv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeCompressed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..91327a92ecb0c8e69441344e2b19986441f4a29e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeCompressed.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeCompressed"
+  endpoint {
+    name: "io.DecodeCompressed"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeGif.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..355643ff77cb4d9b75f6f17cd3ef13ab6ef45a66
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeGif.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeGif"
+  endpoint {
+    name: "image.DecodeGif"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeJSONExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeJSONExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ecba5ab0534cc2e80fa51b4f9904b0df4ae0d7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeJSONExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeJSONExample"
+  endpoint {
+    name: "io.DecodeJsonExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0ebf2e315f160e10b5d66adac9ad472308040d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeJpeg"
+  endpoint {
+    name: "image.DecodeJpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d94537dc92891066f56e8a2f50fd924f8d251927
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodePng.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodePng"
+  endpoint {
+    name: "image.DecodePng"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeProtoV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ba118cb0e67dd2ab8b763286110647b19d9ded8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeProtoV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeProtoV2"
+  endpoint {
+    name: "DecodeProto"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeRaw.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeRaw.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73067173edd90183457312494f681883836a6d5a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeRaw.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeRaw"
+  endpoint {
+    name: "io.DecodeRaw"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeWav.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b249cc6e95b74ea835dceb8bd46910355fbee38
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeWav"
+  endpoint {
+    name: "audio.DecodeWav"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88a87c9291887c5614f4f88cb941c253c9420689
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DeepCopy"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeleteSessionTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeleteSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1865b461de785cef8d53d2fb143419c86bb3981a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeleteSessionTensor.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DeleteSessionTensor"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DenseToDenseSetOperation.pbtxt b/tensorflow/core/api_def/java_api/api_def_DenseToDenseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f85def92ee3b7d7bb2fa3fa1650e1ad7d4ed49e6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DenseToDenseSetOperation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DenseToDenseSetOperation"
+  endpoint {
+    name: "sparse.DenseToDenseSetOperation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_DenseToSparseBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76f6ba0b8ac2180d8d19c388df0a1969d8ec2168
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DenseToSparseBatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DenseToSparseBatchDataset"
+  endpoint {
+    name: "data.DenseToSparseBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DenseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/java_api/api_def_DenseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11fbef8ff1fdcefa68b8cb9242efe8ec69507bed
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DenseToSparseSetOperation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DenseToSparseSetOperation"
+  endpoint {
+    name: "sparse.DenseToSparseSetOperation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthToSpace.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d2cbd2b904a98661ccd2b8c16f764f8107e822f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthToSpace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthToSpace"
+  endpoint {
+    name: "nn.DepthToSpace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNative.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1aaa480fefd8815630ba5707ddae43de72e7e776
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNative"
+  endpoint {
+    name: "nn.DepthwiseConv2dNative"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a62d8cf632d72b58277c15bb0e393a3901fbac4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
+  endpoint {
+    name: "nn.DepthwiseConv2dNativeBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9106dd2f8fe103043969947740b8539364032cba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropInput"
+  endpoint {
+    name: "nn.DepthwiseConv2dNativeBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dequantize.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ee4daa2f7e746e9a24f0d60208c33bf39b7073a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dequantize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dequantize"
+  endpoint {
+    name: "quantization.Dequantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeserializeIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeserializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdd03f5dc672f40a238cf7dcc72840592f8838c8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeserializeIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DeserializeIterator"
+  endpoint {
+    name: "data.DeserializeIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeserializeManySparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeserializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..826d49f54655aa2472c7a34a6a40ae2ec54bd32e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeserializeManySparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DeserializeManySparse"
+  endpoint {
+    name: "io.DeserializeManySparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeserializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6f24bb6257d6922398a325997e94143188443aa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeserializeSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DeserializeSparse"
+  endpoint {
+    name: "sparse.DeserializeSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DestroyResourceOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_DestroyResourceOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..733e5e5029c85bcf8b6ed1f7b73849876f1c3db8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DestroyResourceOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DestroyResourceOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DestroyTemporaryVariable.pbtxt b/tensorflow/core/api_def/java_api/api_def_DestroyTemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd416eb68fb46513aa79e32957c943b64a154924
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DestroyTemporaryVariable.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DestroyTemporaryVariable"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Diag.pbtxt b/tensorflow/core/api_def/java_api/api_def_Diag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..374b3c97e17002f6c77759c847be5a0cb3835ec8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Diag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Diag"
+  endpoint {
+    name: "linalg.TensorDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DiagPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_DiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70db2357d0612181119564e775f63ac03ce35df5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DiagPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DiagPart"
+  endpoint {
+    name: "linalg.TensorDiagPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Digamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Digamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68dc74c64ead3b739ce19b5b5a6c9fbc7253c85c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Digamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Digamma"
+  endpoint {
+    name: "math.Digamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dilation2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..914ea29812ce67c38d92a86d2d9f1ee8f6dc2255
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dilation2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2D"
+  endpoint {
+    name: "nn.Dilation2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db3c68e088ef20312d3fc96b7cb3f064c343f1e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2DBackpropFilter"
+  endpoint {
+    name: "nn.Dilation2dBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c935144f7af3e944608945faf085169492450f69
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2DBackpropInput"
+  endpoint {
+    name: "nn.Dilation2dBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Div.pbtxt b/tensorflow/core/api_def/java_api/api_def_Div.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2abba7f05f35f7ca834ed224df8f7462f7d62ca8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Div.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Div"
+  endpoint {
+    name: "math.Div"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DivNoNan.pbtxt b/tensorflow/core/api_def/java_api/api_def_DivNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c124044604b779de8bbab012c1953c8ff98edfad
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DivNoNan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DivNoNan"
+  endpoint {
+    name: "math.DivNoNan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DrawBoundingBoxes.pbtxt b/tensorflow/core/api_def/java_api/api_def_DrawBoundingBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e7954e2b7ffe576e81e7a93aad7bb082d2a94fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DrawBoundingBoxes.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DrawBoundingBoxes"
+  endpoint {
+    name: "image.DrawBoundingBoxes"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DynamicPartition.pbtxt b/tensorflow/core/api_def/java_api/api_def_DynamicPartition.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc585676e486989591e774e0e8237cfc57166998
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DynamicPartition.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DynamicPartition"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DynamicStitch.pbtxt b/tensorflow/core/api_def/java_api/api_def_DynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac1fef4b6afd3905383f14e080e072f537eedd78
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DynamicStitch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DynamicStitch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/java_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e097041d731610447e2f67115373d004bb982f0e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EditDistance.pbtxt b/tensorflow/core/api_def/java_api/api_def_EditDistance.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca65c2c6e5821d79e60b3b6c6305de6b5c3ff4bb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EditDistance.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EditDistance"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Elu.pbtxt b/tensorflow/core/api_def/java_api/api_def_Elu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bfe8d972cf69cab5d3ce847f9507c0ee9c8b5072
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Elu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Elu"
+  endpoint {
+    name: "nn.Elu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_EluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3757357c000b902ee793c2da072fbac8e4c28c4c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EluGrad"
+  endpoint {
+    name: "nn.EluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/java_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6522f51d9dcc34a529f70efbae3da15df1132c96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Empty.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Empty"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EmptyTensorList.pbtxt b/tensorflow/core/api_def/java_api/api_def_EmptyTensorList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef3f533964cd10318ec8ff2e97c2e64a6aa146b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EmptyTensorList.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EmptyTensorList"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeBase64.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeBase64.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66f19def9aec58a9ce6221564da6c209eb118ea2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeBase64.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeBase64"
+  endpoint {
+    name: "io.EncodeBase64"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeJpeg.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e151665f87203f6d56cc2c03225827ed128fdc1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeJpeg"
+  endpoint {
+    name: "image.EncodeJpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodePng.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a8d713c865b825e9c896e56964e300ce82deda6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodePng.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodePng"
+  endpoint {
+    name: "image.EncodePng"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeProto.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeProto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac6a04b4bc2958c9bb7628949928b258d1e23059
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeProto.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EncodeProto"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeWav.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3b22fde666b83bbde15d5f54c131660c171a61d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeWav"
+  endpoint {
+    name: "audio.EncodeWav"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_EnqueueInQueueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26051ab446f9a5f8405de5fae67992ee1c993167
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EnqueueInQueueDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EnqueueInQueueDataset"
+  endpoint {
+    name: "data.EnqueueInQueueDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EnsureShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_EnsureShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6238947598f2640e0f1b6a1a88d7700fd62b9cbe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EnsureShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EnsureShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Enter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Enter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffc10c91beb9e9181c7543f94266dd15b9ee14cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Enter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Enter"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Equal.pbtxt b/tensorflow/core/api_def/java_api/api_def_Equal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2256c24337b6bc7d4e50ba1368a484fa87b4776
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Equal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Equal"
+  endpoint {
+    name: "math.Equal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/java_api/api_def_Erf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9efcc3983c411a43910807d059582bb35e9f16e3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Erf.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Erf"
+  endpoint {
+    name: "math.Erf"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Erfc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Erfc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0f4db61ff44ba4d88717d8daa7e1c4665323943
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Erfc.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Erfc"
+  endpoint {
+    name: "math.Erfc"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Exit.pbtxt b/tensorflow/core/api_def/java_api/api_def_Exit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6215cd22299cc41a0bb6f9c1bb0e4239e9f67efe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Exit.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Exit"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/java_api/api_def_Exp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2790c8306f0277bb7613528557c8598afc5dbf6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Exp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Exp"
+  endpoint {
+    name: "math.Exp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExpandDims.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExpandDims.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66902ccb5b0e152b2504469d94b305fb0dd8a64f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExpandDims.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ExpandDims"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalAssertNextDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalAssertNextDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cec4c229e4a2a17aecf54717e5541edc7edf3b91
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalAssertNextDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalAssertNextDataset"
+  endpoint {
+    name: "data.ExperimentalAssertNextDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalCSVDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalCSVDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51fdd9f0b045360ed717b602361670ea9c908f5e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalCSVDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalCSVDataset"
+  endpoint {
+    name:  "data.ExperimentalCsvDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77fe42fd94db16cc2d0fb414543a2872c0527aa3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalDirectedInterleaveDataset"
+  endpoint {
+    name: "data.ExperimentalDirectedInterleaveDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResource.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..320f4affb54d00d4994726e9f538e5eed919b632
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResource"
+  endpoint {
+    name: "data.ExperimentalFunctionBufferingResource"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3dbe7600a5bf3322ba5895b8d4f94ee63d4b27a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResourceGetNext"
+  endpoint {
+    name: "data.ExperimentalFunctionBufferingResourceGetNext"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6909e8678a3bb4d592a15dc022868963548e0c46
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResourceReset"
+  endpoint {
+    name: "data.ExperimentalFunctionBufferingResourceReset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c49c6de2177a96ef0e366cf788e9b10506dedb36
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIdentityIndexedDataset"
+  endpoint {
+    name: "data.ExperimentalIdentityIndexedDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7750a43de2806f86fd44bff9b2a4c43a373e0b3e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIgnoreErrorsDataset"
+  endpoint {
+    name: "data.ExperimentalIgnoreErrorsDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetGet.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96a3befe8bb49bcef4d90cabf6402185d059b5f0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIndexedDatasetGet"
+  endpoint {
+    name: "data.ExperimentalIndexedDatasetGet"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..731309d3291fb3c5107a0bb603bd01a108d333b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIndexedDatasetMaterialize"
+  endpoint {
+    name: "data.ExperimentalIndexedDatasetMaterialize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIteratorGetDevice.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIteratorGetDevice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..838d579ef742d4da801a1adb8509a33091820ad5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIteratorGetDevice.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIteratorGetDevice"
+  endpoint {
+    name: "data.ExperimentalIteratorGetDevice"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalLMDBDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalLMDBDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a092b1cf396772dcbd309a1365fbfe08ae8dfb1b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalLMDBDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalLMDBDataset"
+  endpoint {
+    name: "data.ExperimentalLmdbDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalMapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bea6dffd9c16305796d09602b1b7ca12f5374969
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalMapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalMapDataset"
+  endpoint {
+    name: "data.ExperimentalMapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06632e9041d8827e95063f025b83fa47252534cd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalMaterializedIndexDatasetHandle"
+  endpoint {
+    name: "data.ExperimentalMaterializedIndexDatasetHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalNonSerializableDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalNonSerializableDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b26cf129cf0ca37d69f27e4b7f51c3b76f254cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalNonSerializableDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalNonSerializableDataset"
+  endpoint {
+    name: "data.ExperimentalNonSerializableDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2c0d92f896a8191e428f76ee88d49b0e5ef2a86
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalNumaMapAndBatchDataset"
+  endpoint {
+    name: "data.ExperimentalNumaMapAndBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalSleepDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalSleepDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e95c55ac2628f0eaa858b81844e57f76226d9f4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalSleepDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalSleepDataset"
+  endpoint {
+    name: "data.ExperimentalSleepDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e4aef1c68324b4c12c2c76c5a8947c567bff134
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalThreadPoolDataset"
+  endpoint {
+    name: "data.ExperimentalThreadPoolDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73cf6767b3691ccfdc5fdb1c95d3d3edfe82fb14
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalThreadPoolHandle"
+  endpoint {
+    name: "data.ExperimentalThreadPoolHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalUniqueDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalUniqueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d834193ddbdc90b3a695489c1e5df06f1c3fc99
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalUniqueDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalUniqueDataset"
+  endpoint {
+    name: "data.ExperimentalUniqueDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Expm1.pbtxt b/tensorflow/core/api_def/java_api/api_def_Expm1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71a8fcf02250b4886d5f37b88eeb969ae8b96cf1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Expm1.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Expm1"
+  endpoint {
+    name: "math.Expm1"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractGlimpse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3591f93f71f27e465d65c8bb8d521ed350781786
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractGlimpse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractGlimpse"
+  endpoint {
+    name: "image.ExtractGlimpse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractImagePatches.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractImagePatches.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cdcfd752855f13714a4ebb9b80eed9bec65165a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractImagePatches.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractImagePatches"
+  endpoint {
+    name: "image.ExtractImagePatches"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractJpegShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractJpegShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c95fcc9cef4f657a89fd8c531d970e4587cc6205
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractJpegShape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractJpegShape"
+  endpoint {
+    name: "image.ExtractJpegShape"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractVolumePatches.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractVolumePatches.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f61c8321097957f62f8872dfd84880de3da4019
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractVolumePatches.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ExtractVolumePatches"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_FFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9655903086119a4cea7adb97cea89793b34109f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FFT"
+  endpoint {
+    name: "signal.Fft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..474103076b96682fba824bc633d77ec4588c0ea9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FFT2D"
+  endpoint {
+    name: "signal.Fft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e1606b8f9df51cdd04707483fd2ec59fd049855
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FFT3D"
+  endpoint {
+    name: "signal.Fft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FIFOQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_FIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5b2f73c55d5a3ac3ec7193ba7dd1da147ffc96d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FIFOQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FIFOQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FIFOQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0861a6e8dda0abe83925f8163babab778d71e28
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FIFOQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FIFOQueueV2"
+  endpoint {
+    name: "io.FifoQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Fact.pbtxt b/tensorflow/core/api_def/java_api/api_def_Fact.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..436664e554b2829dbe257b819842f9dc70d1eb0d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Fact.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Fact"
+  endpoint {
+    name: "math.Fact"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeParam.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeParam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac8f751442c2f5864b51812688c514cd36509368
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeParam.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "FakeParam"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgs.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..809d231a55ca4be0a563fed29ab0493608f271b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxArgs"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxArgs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50d0f51a1409436c9f4ca7c7519c8df16b482792
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxArgsGradient"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxArgsGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVars.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVars.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b86258aab2bfb40c4dbc8e1bb3d5960773a767f2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVars.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVars"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVars"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c1343423c18fe3eebf2eafbfaea73217b262f66
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsGradient"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVarsGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..afe45a290d30f204ff132d165aa46fca3f55e747
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsPerChannel"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVarsPerChannel"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9dd62fdffd0fbcb2fdfc6fc7348bb206cdcaef33
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVarsPerChannelGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8960966f0842cbc586abdf37975a162fd9a47915
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQueue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQueue"
+  endpoint {
+    name: "io.FakeQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Fill.pbtxt b/tensorflow/core/api_def/java_api/api_def_Fill.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3997328ed3100f5ffe6c22b9f481fb5421304353
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Fill.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Fill"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FilterByLastComponentDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FilterByLastComponentDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b7111f48fa921a7ec0f91f668f1ba607d4666ff6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FilterByLastComponentDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FilterByLastComponentDataset"
+  endpoint {
+    name: "data.FilterByLastComponentDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FilterDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..930fff419151a687fc5520435da4502c98ef272a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FilterDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FilterDataset"
+  endpoint {
+    name: "data.FilterDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4f23d94c03c5f9d5c6578c1a7fd8b32cd9434e8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8012bbe1684a9f48b9c2829c080ad16b7697848
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FixedLengthRecordDatasetV2"
+  endpoint {
+    name: "data.FixedLengthRecordDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f76cd494561027929a7011dffc2552bf3c53047f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f897c21365b024fc7e698691627dc8bb2968674e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FixedLengthRecordReaderV2"
+  endpoint {
+    name: "io.FixedLengthRecordReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb9c68d4dbaac22e1ac55d495712a854fae40db5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FixedUnigramCandidateSampler"
+  endpoint {
+    name: "nn.FixedUnigramCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FlatMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6e96cb4e00e534cea88ca52379d4ba361e84dc1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FlatMapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FlatMapDataset"
+  endpoint {
+    name: "data.FlatMapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Floor.pbtxt b/tensorflow/core/api_def/java_api/api_def_Floor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2b80f97e0598345138dafa9f8fd7e6986c0a6d6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Floor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Floor"
+  endpoint {
+    name: "math.Floor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_FloorDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..054d85f55c9b4c5dc13bf63ce1e5f5efec82bd5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FloorDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FloorDiv"
+  endpoint {
+    name: "math.FloorDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/java_api/api_def_FloorMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff2216a9357fe72429fa95046cdf81e147229a62
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FloorMod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FloorMod"
+  endpoint {
+    name: "math.FloorMod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FlushSummaryWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_FlushSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..feaa3a6dc22dc4486189c5d030d81cdeb76d30a9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FlushSummaryWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FlushSummaryWriter"
+  endpoint {
+    name: "summary.FlushSummaryWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_For.pbtxt b/tensorflow/core/api_def/java_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30363d1e9637d4c15146cf91b190e95f34aa773f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_For.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "For"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc2e6ca54b802a5ddec908853ccec47d6725b52b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalAvgPool"
+  endpoint {
+    name: "nn.FractionalAvgPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalAvgPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e11d5e3950feaaa58f54f626334a8a9cee98e19
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPoolGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalAvgPoolGrad"
+  endpoint {
+    name: "nn.FractionalAvgPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..061b358ec27ab86c844e1669e73a935fe1d7170e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalMaxPool"
+  endpoint {
+    name: "nn.FractionalMaxPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalMaxPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c70e6d721e5bed4a62ed170d9ced09a061210f5c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPoolGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalMaxPoolGrad"
+  endpoint {
+    name: "nn.FractionalMaxPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNorm.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d6166fe816f02456a31a45bf1e24dd1dc120cbc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNorm"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e1d066d8dc70891256d25bc36a32aa18a2fd958
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNormGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f333c91f4ffbd25d7928a842d14333ef10c35bd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGradV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedBatchNormGradV2"
+  endpoint {
+    name: "nn.FusedBatchNormGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNormV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a4e76c94989ad22bd571a3b82b21bb97be49c8f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedBatchNormV2"
+  endpoint {
+    name: "nn.FusedBatchNorm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedPadConv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7ee10e0c58f7e454a17a2f0f047e6be0f49327e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedPadConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedPadConv2D"
+  endpoint {
+    name: "nn.FusedPadConv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedResizeAndPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6948fc1b87d2d6b250520e67053a329407268e09
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedResizeAndPadConv2D"
+  endpoint {
+    name: "nn.FusedResizeAndPadConv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Gather.pbtxt b/tensorflow/core/api_def/java_api/api_def_Gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c4ccda48bf15552b05c8d6895576d3cf74dfc6d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Gather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Gather"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/java_api/api_def_GatherNd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..257c0316ea0ae7e9ae007684e2074a33605f60e3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GatherNd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GatherNd"
+  endpoint {
+    name: "GatherNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_GatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0927e77a9688e6ae338a6643bff19e20333ab13c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GatherV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GatherV2"
+  endpoint {
+    name: "Gather"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GcsConfigureBlockCache.pbtxt b/tensorflow/core/api_def/java_api/api_def_GcsConfigureBlockCache.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ba3044d4c349e5397d96033f37f817395b6d553
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GcsConfigureBlockCache.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GcsConfigureBlockCache"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GcsConfigureCredentials.pbtxt b/tensorflow/core/api_def/java_api/api_def_GcsConfigureCredentials.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..98bd555fb87506a57afca2dc86e6157adb534683
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GcsConfigureCredentials.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GcsConfigureCredentials"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GenerateBigQueryReaderPartitions.pbtxt b/tensorflow/core/api_def/java_api/api_def_GenerateBigQueryReaderPartitions.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..956f40762d7499f4dbfbb083ae1f28b3190ff968
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GenerateBigQueryReaderPartitions.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GenerateBigQueryReaderPartitions"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GenerateVocabRemapping.pbtxt b/tensorflow/core/api_def/java_api/api_def_GenerateVocabRemapping.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9aac3b17f3cf91ac26b19ccd82147a0dd11e9141
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GenerateVocabRemapping.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GenerateVocabRemapping"
+  endpoint {
+    name: "train.GenerateVocabRemapping"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_GeneratorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1719005e99077e857295eb72e681875eeb50dd3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GeneratorDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GeneratorDataset"
+  endpoint {
+    name: "data.GeneratorDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GetSessionHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_GetSessionHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ee6fe18a2a768bfb451d16630ce613c0cd31fbf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GetSessionHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GetSessionHandle"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GetSessionHandleV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_GetSessionHandleV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba89942d77f11166cab0406a8a309feb9a43e881
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GetSessionHandleV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GetSessionHandleV2"
+  endpoint {
+    name: "GetSessionHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GetSessionTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_GetSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b6e627cdc9ffbc72d2ef390c6a3c7d61d45d9b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GetSessionTensor.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GetSessionTensor"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Greater.pbtxt b/tensorflow/core/api_def/java_api/api_def_Greater.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..594f9276be1292f2499f2338213ed8a222af486d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Greater.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Greater"
+  endpoint {
+    name: "math.Greater"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GreaterEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_GreaterEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17ea8696b0dc9e84f1cef1ac9555385e7e2848dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GreaterEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GreaterEqual"
+  endpoint {
+    name: "math.GreaterEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GroupByReducerDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_GroupByReducerDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bd2c8f531b705524bf227d6ed141f03adf66423
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GroupByReducerDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GroupByReducerDataset"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_GroupByWindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e4c4cd4ff25cc7eaee31b017c4c95b725fee489
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GroupByWindowDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GroupByWindowDataset"
+  endpoint {
+    name: "data.GroupByWindowDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GuaranteeConst.pbtxt b/tensorflow/core/api_def/java_api/api_def_GuaranteeConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cac25787dad3e7be49496e74e0f6361523525d2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GuaranteeConst.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GuaranteeConst"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HSVToRGB.pbtxt b/tensorflow/core/api_def/java_api/api_def_HSVToRGB.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95b042d5d688e62d6a4fcb9f8250adb2b68d35ac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HSVToRGB.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HSVToRGB"
+  endpoint {
+    name: "image.HsvToRgb"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HashTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_HashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d48c2224f63207b9ab392659b0392ee2e850ab39
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "HashTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HashTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_HashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38cc5818d3bad14272a532db3568439667472286
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HashTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HashTableV2"
+  endpoint {
+    name: "HashTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HistogramFixedWidth.pbtxt b/tensorflow/core/api_def/java_api/api_def_HistogramFixedWidth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f64d9ae1d2387db0e88b0d28d762c294dd00d7a9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HistogramFixedWidth.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "HistogramFixedWidth"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HistogramSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_HistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97f28335bb96b3c98fd3f556cfbc8b77ef6763cc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HistogramSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HistogramSummary"
+  endpoint {
+    name: "summary.HistogramSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HostConst.pbtxt b/tensorflow/core/api_def/java_api/api_def_HostConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba589e73e718d784d98afe1f04d0eacd15cf5fdc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HostConst.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "HostConst"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_IFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a15ebec7f67c4aea28391a2f3af25c7f26352cc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IFFT"
+  endpoint {
+    name: "signal.Ifft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35d696ee739951927961aa903fd92c5af4306bff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IFFT2D"
+  endpoint {
+    name: "signal.Ifft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76a3164e6aaff6e9d7a18792debd32e3eba0a223
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IFFT3D"
+  endpoint {
+    name: "signal.Ifft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IRFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_IRFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a68b01524ecd5fbc1a439c559edc67b5c843e96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IRFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IRFFT"
+  endpoint {
+    name: "signal.Irfft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IRFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IRFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..239ec445d020736ed0fa642b646331d5493a0a87
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IRFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IRFFT2D"
+  endpoint {
+    name: "signal.Irfft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IRFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IRFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87969436b7f2b5d6eb156781006121f0b7653ada
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IRFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IRFFT3D"
+  endpoint {
+    name: "signal.Irfft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Identity.pbtxt b/tensorflow/core/api_def/java_api/api_def_Identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6df3c6cfec45d6fffa9484722099e582529ba8c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Identity.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Identity"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IdentityN.pbtxt b/tensorflow/core/api_def/java_api/api_def_IdentityN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..827df10c65a190ab37d8445d11d5cfc8b7873593
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IdentityN.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IdentityN"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IdentityReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_IdentityReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42fe85a5675796eceb9213bc986676f659d36bc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IdentityReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IdentityReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IdentityReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_IdentityReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8081ac26b3d4bebf525c9afd5a734763c3007720
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IdentityReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IdentityReaderV2"
+  endpoint {
+    name: "io.IdentityReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_If.pbtxt b/tensorflow/core/api_def/java_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3bc33ac2cee060877f5a10d97537d77ca60949e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_If.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "If"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Igamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbdd8b984c46fa7df6f41c3e7e98a8382c194cb3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Igamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Igamma"
+  endpoint {
+    name: "math.Igamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IgammaGradA.pbtxt b/tensorflow/core/api_def/java_api/api_def_IgammaGradA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0659c80c39fb085a0ca1629c958a7d66f19acd59
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IgammaGradA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IgammaGradA"
+  endpoint {
+    name: "math.IgammaGradA"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Igammac.pbtxt b/tensorflow/core/api_def/java_api/api_def_Igammac.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94f6085e1a97c3ec7b4e4a17c06ebb1c3b1c4fd2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Igammac.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Igammac"
+  endpoint {
+    name: "math.Igammac"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Imag.pbtxt b/tensorflow/core/api_def/java_api/api_def_Imag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4227c7078fcc3fac47998d51917bb09f45a6eb3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Imag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Imag"
+  endpoint {
+    name: "math.Imag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_ImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1871e6b65503461e123153977f66c1b9f574e125
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ImageSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ImageSummary"
+  endpoint {
+    name: "summary.ImageSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ImmutableConst.pbtxt b/tensorflow/core/api_def/java_api/api_def_ImmutableConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd0384dc45203e458ca1179615b5f3c3e1ee5a86
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ImmutableConst.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ImmutableConst"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ImportEvent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ImportEvent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2d8d2eba35a341a2ab12d4cc7af9dd3b4d956a2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ImportEvent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ImportEvent"
+  endpoint {
+    name: "summary.ImportEvent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InTopK.pbtxt b/tensorflow/core/api_def/java_api/api_def_InTopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf90fd0f814824528815e844918b0c46de11adca
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InTopK.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InTopK"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InTopKV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_InTopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..400ee714624943669a216be70bdb6b09a4743cba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InTopKV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InTopKV2"
+  endpoint {
+    name: "nn.InTopK"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30e3d66bfe9c5285165808ee74de2e21abac7dd0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFile.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..786e22cd474647bf203cfdb58d4e2ef027f37ee9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFile.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTableFromTextFile"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFileV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFileV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a4f70220184eb5eb36d14c0066a4e7fdf837abc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFileV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InitializeTableFromTextFileV2"
+  endpoint {
+    name: "InitializeTableFromTextFile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7a9a813d070b6bda559a70bb31f4e4096000661
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InitializeTableV2"
+  endpoint {
+    name: "InitializeTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d157ab7f83f6d7efaa1a996cfa5ed42f16d5fde
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "InplaceAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2ed54964304de7f813ba7e14c250eec5a53bb77
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "InplaceSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..91041b43abd3e8d06e31fe2d5f3b9f1ecd96aaea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "InplaceUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_InterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25e40ac2dbe604f7f6165da35c1d674e07fdbb2c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InterleaveDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InterleaveDataset"
+  endpoint {
+    name: "data.InterleaveDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Inv.pbtxt b/tensorflow/core/api_def/java_api/api_def_Inv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49f3e6c0429b85d7b03e34b9c408a95d0a112151
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Inv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Inv"
+  endpoint {
+    name: "linalg.Inv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InvGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_InvGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3bfa78e99ce6c734d6acb9e606666737322c477
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InvGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InvGrad"
+  endpoint {
+    name: "nn.InvGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Invert.pbtxt b/tensorflow/core/api_def/java_api/api_def_Invert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9898bfa003d040cb23c00655a8fe41241261d702
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Invert.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Invert"
+  endpoint {
+    name: "bitwise.Invert"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InvertPermutation.pbtxt b/tensorflow/core/api_def/java_api/api_def_InvertPermutation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ee103f554e47d090f133c6cb72edc67a4c430f5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InvertPermutation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InvertPermutation"
+  endpoint {
+    name: "math.InvertPermutation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35f17b79243e0f651bd24f0da7675e84fe632935
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IsBoostedTreesEnsembleInitialized"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesQuantileStreamResourceInitialized.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesQuantileStreamResourceInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6cd2dcc45133637a8462f8176e02159d1968371
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesQuantileStreamResourceInitialized.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IsBoostedTreesQuantileStreamResourceInitialized"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsFinite.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsFinite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fce589039916324a4493cf4000ff8685087b214d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsFinite.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IsFinite"
+  endpoint {
+    name: "math.IsFinite"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsInf.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsInf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..823c1d72812c30e14b6b080dc47366f3c341a85c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsInf.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IsInf"
+  endpoint {
+    name: "math.IsInf"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsNan.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58805bf99f60467680d1fac62a3cdc78bb1dd746
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsNan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IsNan"
+  endpoint {
+    name: "math.IsNan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsVariableInitialized.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsVariableInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bf51da2da97ea46cd8465d4d8f86b4851ebc64a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsVariableInitialized.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IsVariableInitialized"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Iterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_Iterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f4d9967c3cc1650b65a9e75624fa7fc21358910
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Iterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Iterator"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a4e443cde0a3ed5239aaa73b31288f1c593d0fd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorFromStringHandle"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandleV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandleV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86745a3a56470c32f5a59b7323b549d6959b0008
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandleV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorFromStringHandleV2"
+  endpoint {
+    name: "data.IteratorFromStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorGetNext.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorGetNext.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2248ff9f5f5efffe2fee50d109bfe3404b8f7cc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorGetNext.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorGetNext"
+  endpoint {
+    name: "data.IteratorGetNext"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorGetNextAsOptional.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextAsOptional.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae02a0e0171a9eee8aa925c6e9f22c427bfda2bd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextAsOptional.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorGetNextAsOptional"
+  endpoint {
+    name: "data.IteratorGetNextAsOptional"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorGetNextSync.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextSync.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4aa7c07a7741752fd90f5ff592f54cd5c8b21b3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextSync.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorGetNextSync"
+  endpoint {
+    name: "data.IteratorGetNextSync"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorToStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorToStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7413ec846e7e6a44c35c34ee8ed35f418946bfc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorToStringHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorToStringHandle"
+  endpoint {
+    name: "data.IteratorToStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87720d441b368abf1d5532b59019c7860c5739a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorV2"
+  endpoint {
+    name: "data.Iterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_L2Loss.pbtxt b/tensorflow/core/api_def/java_api/api_def_L2Loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c348e0f0e0b5524f9f98310d9be7392579a7a5fd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_L2Loss.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "L2Loss"
+  endpoint {
+    name: "nn.L2Loss"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LMDBReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_LMDBReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6cd7bdbb7ee578434d69bd943fc0f6d5c7b486a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LMDBReader.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LMDBReader"
+  endpoint {
+    name: "io.LmdbReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LRN.pbtxt b/tensorflow/core/api_def/java_api/api_def_LRN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d16fea31843a5cb785988cf72fd86a60247479e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LRN.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LRN"
+  endpoint {
+    name: "nn.LocalResponseNormalization"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LRNGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_LRNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a50e738d785b2ca8c5b032221aa9259e4bb521a4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LRNGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LRNGrad"
+  endpoint {
+    name: "nn.LocalResponseNormalizationGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_LatencyStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf0bf2a5ed712c8bf36af21fb8a59dab9eec82ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LatencyStatsDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LatencyStatsDataset"
+  endpoint {
+    name: "data.LatencyStatsDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LeakyRelu.pbtxt b/tensorflow/core/api_def/java_api/api_def_LeakyRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31a4f01167bda19909928b34e32b11746aadca61
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LeakyRelu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeakyRelu"
+  endpoint {
+    name: "nn.LeakyRelu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LeakyReluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_LeakyReluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9899c64c13e12c5184b09f0935b0ee360d41edc9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LeakyReluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeakyReluGrad"
+  endpoint {
+    name: "data.LeakyReluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LearnedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_LearnedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f193da1be112a169f632611c64328cbf3d0dadc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LearnedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LearnedUnigramCandidateSampler"
+  endpoint {
+    name: "nn.LearnedUnigramCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LeftShift.pbtxt b/tensorflow/core/api_def/java_api/api_def_LeftShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44a8727e40f4415c3ee197c64f4ea8a93c46a621
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LeftShift.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeftShift"
+  endpoint {
+    name: "bitwise.LeftShift"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Less.pbtxt b/tensorflow/core/api_def/java_api/api_def_Less.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..577d2556b81c37282cea21b342b9ea557f531590
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Less.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Less"
+  endpoint {
+    name: "math.Less"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LessEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_LessEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cad35c62263042f6683c5f8437c84b345462e53
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LessEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LessEqual"
+  endpoint {
+    name: "math.LessEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Lgamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Lgamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb7bc9660c01044fb8ff0282a50a04c79b257536
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Lgamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Lgamma"
+  endpoint {
+    name: "math.Lgamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LinSpace.pbtxt b/tensorflow/core/api_def/java_api/api_def_LinSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..599c310021a0aa1a511b818949a5816574dce0d9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LinSpace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LinSpace"
+  endpoint {
+    name: "LinSpace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ListDiff.pbtxt b/tensorflow/core/api_def/java_api/api_def_ListDiff.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa94c958f17463d2c616306acbe000acd465fafc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ListDiff.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ListDiff"
+  endpoint {
+    name: "SetDiff1d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LoadAndRemapMatrix.pbtxt b/tensorflow/core/api_def/java_api/api_def_LoadAndRemapMatrix.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54ee68fde44b6c8954927532d5953ae49ef08e1e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LoadAndRemapMatrix.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LoadAndRemapMatrix"
+  endpoint {
+    name: "linalg.LoadAndRemapMatrix"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Log.pbtxt b/tensorflow/core/api_def/java_api/api_def_Log.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d11c26c71ab1c783a09b76fc89dc0f47903880b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Log.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Log"
+  endpoint {
+    name: "math.Log"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/java_api/api_def_Log1p.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cc1d6e6c82254efac0d941fb97538f501d76daf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Log1p.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Log1p"
+  endpoint {
+    name: "math.Log1p"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogMatrixDeterminant.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e52d9ecedce8a98977bfaa2035f22e18c3171b8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogMatrixDeterminant.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogMatrixDeterminant"
+  endpoint {
+    name: "linalg.LogMatrixDeterminant"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogSoftmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19518a71ea474e4182cbda83b6ff54bcf1b92618
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogSoftmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogSoftmax"
+  endpoint {
+    name: "nn.LogSoftmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogUniformCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogUniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdcf01c20fde6dab975a7299f494212b40a6cc6c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogUniformCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogUniformCandidateSampler"
+  endpoint {
+    name: "random.LogUniformCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogicalAnd.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogicalAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12921dd932c3e89f70f43724bb503424ffd4d672
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogicalAnd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogicalAnd"
+  endpoint {
+    name: "math.LogicalAnd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogicalNot.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogicalNot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e0960958ed00291bec72e81a127608659df9ebd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogicalNot.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogicalNot"
+  endpoint {
+    name: "math.LogicalNot"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogicalOr.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogicalOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c834e464146a3dcaf23afca7da925a7347d6117
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogicalOr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogicalOr"
+  endpoint {
+    name: "math.LogicalOr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableExport.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableExport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..29885222a491a4c8499ab6a9b18fd7a3e7d28415
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableExport.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableExport"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableExportV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableExportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d780f2a21d7f60a9004b62def0e5be5b21354e5d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableExportV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableExportV2"
+  endpoint {
+    name: "LookupTableExport"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableFind.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableFind.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23f7facaa24f796ca65864771701af7eb2a69d76
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableFind.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableFind"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableFindV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableFindV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2247547b62b358e4f94d324feba0a15706bfd0fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableFindV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableFindV2"
+  endpoint {
+    name: "LookupTableFind"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableImport.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableImport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f87ea9c0736ce28c1d28ca9dbc9cd7eebce32e48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableImport.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableImport"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableImportV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableImportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a39cffa12d1a11129870b7110e64a1c9b22ab2dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableImportV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableImportV2"
+  endpoint {
+    name: "LookupTableImport"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableInsert.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableInsert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a45b3f52a5b87c47255caf9c1e94a64520734a0e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableInsert.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableInsert"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableInsertV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableInsertV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..037b743b6be20ac5313218549d2f7fe100f1f40d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableInsertV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableInsertV2"
+  endpoint {
+    name: "LookupTableInsert"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableRemoveV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableRemoveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61f6d8db36a1c50659bb1a905832caff878cafa4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableRemoveV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableRemoveV2"
+  endpoint {
+    name: "LookupTableRemove"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..391dc5dfadf027c40e15c523bb91873daae187e8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableSize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableSizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad646e25a6b4ce9f529f2aab60b79b767a284fba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableSizeV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableSizeV2"
+  endpoint {
+    name: "LookupTableSize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LoopCond.pbtxt b/tensorflow/core/api_def/java_api/api_def_LoopCond.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..492f78f62ee1ece2ce7dfecfbca63db711d9b847
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LoopCond.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "LoopCond"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LowerBound.pbtxt b/tensorflow/core/api_def/java_api/api_def_LowerBound.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31f1d3038ca1a41ed8e57af2233ee95af29ca67e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LowerBound.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "LowerBound"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MakeIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_MakeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9dfa761370d0e817785fa714788cf88d98721a4a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MakeIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MakeIterator"
+  endpoint {
+    name: "data.MakeIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb96bf63d8f0d15bb47f92a7f8e1ea055ed8208f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapAndBatchDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapAndBatchDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b29c21888fae479bc7ced724a711bd724e71241f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MapAndBatchDatasetV2"
+  endpoint {
+    name: "data.MapAndBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapClear.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ed9bf8a5d8641c8ca136feaf788fceeb185c5d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapClear.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapClear"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fdec9eb857411cc007769bb4e8e28eaffda9f60f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MapDataset"
+  endpoint {
+    name: "data.MapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapDefun.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapDefun.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43b1dc722c031d7303b3e8f640c40c617fd88ab5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapDefun.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapDefun"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapIncompleteSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..659993e42b0d707b7eccca92fe7bee2b5b6865ed
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapIncompleteSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapIncompleteSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapPeek.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb1bd158f0dec1fd2955a28aea210a73c1d26ad2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapPeek.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapPeek"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4da151152c90e0175ede0f74cd130812f88f6232
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapStage.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d9f66cfc4884c7e86e97d82c3f017c59e7b189a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapStage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapStage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapUnstage.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb118f0fcb9787ce3277643c056cae525e770462
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapUnstage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapUnstage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapUnstageNoKey.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1004e96482a6de52b4fec020c9fd620e7b43534c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapUnstageNoKey.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapUnstageNoKey"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe4b8405b9c62c757dcee1fe1b4c7579d1a33458
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatMul"
+  endpoint {
+    name: "linalg.MatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatchingFiles.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb7b096895760ee466675450aca627a7b42cdbd7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatchingFiles.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatchingFiles"
+  endpoint {
+    name: "io.MatchingFiles"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatchingFilesDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatchingFilesDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..749257c37b5b9a88058464547091aba401d5490a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatchingFilesDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatchingFilesDataset"
+  endpoint {
+    name: "data.MatchingFilesDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixBandPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaf426c00eff9ff469ff72240229bde9da946d5b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixBandPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixBandPart"
+  endpoint {
+    name: "linalg.BandPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixDeterminant.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b56d2dad3a8a1509fd1b859b754974e9aab1c4d4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixDeterminant.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixDeterminant"
+  endpoint {
+    name: "linalg.Det"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..839cd82b8910cf7d30a73e6a85b1e8b60bc20d0a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixDiag"
+  endpoint {
+    name: "linalg.Diag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixDiagPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..008f75c1e99b199fbde137ea809ed9987c211d09
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixDiagPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixDiagPart"
+  endpoint {
+    name: "linalg.DiagPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixExponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb232dab983c8ab1c9911ec1c2861a359106e5d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixExponential.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixExponential"
+  endpoint {
+    name: "linalg.MatrixExponential"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixInverse.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68721fc78db0cb4a0979e8e1208e539d1ec53e16
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixInverse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixInverse"
+  endpoint {
+    name: "linalg.Inv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixLogarithm.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixLogarithm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04137ffae7980d597783494a011c881227a68be2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixLogarithm.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixLogarithm"
+  endpoint {
+    name: "linalg.MatrixLogarithm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSetDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61001fa38cf2fb324bb5ad6ad5b28fa4ed189513
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSetDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSetDiag"
+  endpoint {
+    name: "linalg.SetDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02c21448bba33e86593e36a8cd9f1c190235ee89
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSolve"
+  endpoint {
+    name: "linalg.Solve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSolveLs.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cee578ec287b45025561d5957bdf2ce1ad12ad4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSolveLs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSolveLs"
+  endpoint {
+    name: "linalg.MatrixSolveLs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSquareRoot.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSquareRoot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..14c7624fe37a9846d804c2006789e2beeb006c79
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSquareRoot.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSquareRoot"
+  endpoint {
+    name: "linalg.Sqrtm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f61e99efe411f7f21ed902ed6ce3edc54b8fc48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixTriangularSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixTriangularSolve"
+  endpoint {
+    name: "linalg.TriangularSolve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Max.pbtxt b/tensorflow/core/api_def/java_api/api_def_Max.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03868720edf1e010b7d4c5d70ae39822b55274dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Max.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Max"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ebc9e6a6f970dac1870800afe56257e9a754ded
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPool"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17aeb6a8c9313a9b9a954952f58c5413446f3c19
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3D"
+  endpoint {
+    name: "nn.MaxPool3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool3DGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca7a7be835b5761d7079c17c69a8fe2f7aef9f2b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3DGrad"
+  endpoint {
+    name: "nn.MaxPool3dGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool3DGradGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c70aa3fe30a26a250a72c09d050bb8b764fa19e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGradGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3DGradGrad"
+  endpoint {
+    name: "nn.MaxPool3dGradGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ad85fa08e6f35bb69fb48b1de44e9314bec60fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3375ebc77d5b8ebb7c6c61eb89653600d6dd47b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGradGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ca8a7b02217d3561bde7dbfae737067ae442d96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradGradV2"
+  endpoint {
+    name: "nn.MaxPoolGradGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradWithArgmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d43cf7447cc3f50495d3b0c3dde4c3c436f1a19d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradGradWithArgmax"
+  endpoint {
+    name: "nn.MaxPoolGradGradWithArgmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..556dd0be502c343a23ba522d77989aa6384d6979
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradV2"
+  endpoint {
+    name: "nn.MaxPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradWithArgmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c10701f555eee78bfbad8ae67937693d764047dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradWithArgmax"
+  endpoint {
+    name: "nn.MaxPoolGradWithArgmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..846349435533953c6d0a3be3a4bbc4c0b9631bf9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolV2"
+  endpoint {
+    name: "nn.MaxPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43630534cb70e52425cb4d188b889a7ed2984c4d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolWithArgmax"
+  endpoint {
+    name: "nn.MaxPoolWithArgmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Maximum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Maximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1df9c6053050c3defd97f39c8ea02c134d2cc1a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Maximum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Maximum"
+  endpoint {
+    name: "math.Maximum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mean.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bdcdc3d74e58ccd5aa754440be5c7e241c448d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mean.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mean"
+  endpoint {
+    name: "math.Mean"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Merge.pbtxt b/tensorflow/core/api_def/java_api/api_def_Merge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..954d5085adf1ef31e37889fdacedd10e41f28b36
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Merge.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Merge"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MergeSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_MergeSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f52c7c0996b8fa5b1d75e97cc4f7ae78df08b561
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MergeSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MergeSummary"
+  endpoint {
+    name: "summary.MergeSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MergeV2Checkpoints.pbtxt b/tensorflow/core/api_def/java_api/api_def_MergeV2Checkpoints.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8899c8c4ed768cd74d87ee89fd1c00f344163919
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MergeV2Checkpoints.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MergeV2Checkpoints"
+  endpoint {
+    name: "train.MergeV2Checkpoints"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mfcc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mfcc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cb04e73ff400129bc0cd02568c7dbd365c026fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mfcc.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mfcc"
+  endpoint {
+    name: "audio.Mfcc"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Min.pbtxt b/tensorflow/core/api_def/java_api/api_def_Min.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72894c1ffd44d179583a9b87d04a8d5f7ee807c3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Min.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Min"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Minimum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Minimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..69f76a982995cee304e9d877b3d9e126c0c553a4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Minimum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Minimum"
+  endpoint {
+    name: "math.Minimum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MirrorPad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MirrorPad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1cb766f8fdca3fdcd60376e020a6dcf22ff04b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MirrorPad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MirrorPad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MirrorPadGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MirrorPadGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ddd8ab3ba1d8f94b1e175781e4070f1893ac7db8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MirrorPadGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MirrorPadGrad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mod.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76fbbe97a89cf80b756198be85f8072eeda7835d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mod"
+  endpoint {
+    name: "math.Mod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ModelDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ModelDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..143c7afd720c64e581e36bec25af9f2c3cb62378
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ModelDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ModelDataset"
+  endpoint {
+    name: "data.ModelDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mul.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..605e110931e21d73e738190aa70207989e334bb5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mul"
+  endpoint {
+    name: "math.Mul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81eabf9bdcfac310fc15eb652585a930a025246d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIterator"
+  endpoint {
+    name: "data.MultiDeviceIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorFromStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorFromStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4006f72d5204cad2b502f3e87579ad31bd623100
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorFromStringHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorFromStringHandle"
+  endpoint {
+    name: "data.MultiDeviceIteratorFromStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorGetNextFromShard.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorGetNextFromShard.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e6fc1508249b937a1b454e11aa75c2073999f1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorGetNextFromShard.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorGetNextFromShard"
+  endpoint {
+    name: "data.MultiDeviceIteratorGetNextFromShard"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorInit.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorInit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1663bc5c22659d3087eb7048677de01d1a4fce8f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorInit.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorInit"
+  endpoint {
+    name: "data.MultiDeviceIteratorInit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorToStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorToStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff061da390b0bef3c53514aa8abcbfb7a954ad04
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorToStringHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorToStringHandle"
+  endpoint {
+    name: "data.MultiDeviceIteratorToStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Multinomial.pbtxt b/tensorflow/core/api_def/java_api/api_def_Multinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd98bb10b4b25591684f6a7423827ee171bf1855
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Multinomial.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Multinomial"
+  endpoint {
+    name: "random.Multinomial"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7494815d8d2ca6fcbd2020d6b180851e9844e78
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableDenseHashTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1f7f26848b5304afebe5ff87e77af5ec91350dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutableDenseHashTableV2"
+  endpoint {
+    name: "MutableDenseHashTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c446ff8b27e76beb50a7cb4eb6249297f74ac342
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensors.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76df883d7d4fea91ef58a07513be9b0ece097b65
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTableOfTensors"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensorsV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensorsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6d7451267bf09d2f07a2fbf365a4c9578a745ce
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensorsV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutableHashTableOfTensorsV2"
+  endpoint {
+    name: "MutableHashTableOfTensors"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45d619d6747742866c6d70bdc3044d6d69f8717e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutableHashTableV2"
+  endpoint {
+    name: "MutableHashTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutexLock.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75c7be5286ab638ad8ea1075879a03bbe24a5447
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutexLock.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MutexLock"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutexV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutexV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f89cd106432d0a7323cee0c0b4d32f62d5996b0a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutexV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutexV2"
+  endpoint {
+    name: "Mutex"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NcclAllReduce.pbtxt b/tensorflow/core/api_def/java_api/api_def_NcclAllReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7133d4a4a8455f84dad8b2305f8ab560042bc15
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NcclAllReduce.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NcclAllReduce"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NcclBroadcast.pbtxt b/tensorflow/core/api_def/java_api/api_def_NcclBroadcast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9b1a345c933deb321e4a8eb0e919b5f8166e7cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NcclBroadcast.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NcclBroadcast"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NcclReduce.pbtxt b/tensorflow/core/api_def/java_api/api_def_NcclReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18dc89017782ea1264229f5efc650044adf6bdfe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NcclReduce.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NcclReduce"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/java_api/api_def_Neg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7e9ede2a568f4a33c8a10611c5215270e01ea41
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Neg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Neg"
+  endpoint {
+    name: "math.Neg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NegTrain.pbtxt b/tensorflow/core/api_def/java_api/api_def_NegTrain.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb62186362a511b39ab39b42b8e56ea6993a6f71
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NegTrain.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NegTrain"
+  endpoint {
+    name: "train.NegTrain"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NextIteration.pbtxt b/tensorflow/core/api_def/java_api/api_def_NextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc63f6ada14200917958aff04dabcb13486d1572
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NextIteration.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NextIteration"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NoOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_NoOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..337fb5da146b30bed15fc3f7082e5f91acfa7dd8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NoOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NoOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppression.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppression.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49ac0de4ce7635b9b750fada815f33c24cc5c86d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppression.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppression"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fabf5c62157c7ae9148d2f43382e4d13af2be2d3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppressionV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0aefcb55098ccc2226447fc080bfa3fdd56f9faa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppressionV3"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV4.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV4.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71de7f4a6a2a36f9b8a3bb7c29a28c70fcd790b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV4.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NonMaxSuppressionV4"
+  endpoint {
+    name: "image.NonMaxSuppression"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionWithOverlaps.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionWithOverlaps.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06fa52920d2b93e0df8b8032d9b89d7b59e78f17
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionWithOverlaps.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NonMaxSuppressionWithOverlaps"
+  endpoint {
+    name: "image.NonMaxSuppressionWithOverlaps"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NotEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_NotEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d99af40b5d326a92180194dc8a6d01cf578b5a8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NotEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NotEqual"
+  endpoint {
+    name: "math.NotEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NthElement.pbtxt b/tensorflow/core/api_def/java_api/api_def_NthElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..57097e634aadf233de10770ea14faa0a2ec2e9dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NthElement.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NthElement"
+  endpoint {
+    name: "nn.NthElement"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OneHot.pbtxt b/tensorflow/core/api_def/java_api/api_def_OneHot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66872d5eb8ead8571136a57f9ba2e5cb7e04c9e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OneHot.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OneHot"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OneShotIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_OneShotIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39af8cefde4f88d5a2973f93f3b861374af94de0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OneShotIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OneShotIterator"
+  endpoint {
+    name: "data.OneShotIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OnesLike.pbtxt b/tensorflow/core/api_def/java_api/api_def_OnesLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97abe0814a9b988999e965cc5c5d61c62a4a4763
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OnesLike.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OnesLike"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptimizeDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptimizeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7ddf97d1ab755efe9b3bdb05552f8ec39f86d1f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptimizeDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptimizeDataset"
+  endpoint {
+    name: "data.OptimizeDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalFromValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalFromValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d251fd5d9482e6d6e93cad59e9c2b5be3f01f32e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalFromValue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalFromValue"
+  endpoint {
+    name: "data.OptionalFromValue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalGetValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalGetValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fcdb5ac6943fe0a44b838e2adbb8f95e01a2f5d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalGetValue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalGetValue"
+  endpoint {
+    name: "data.OptionalGetValue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalHasValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalHasValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ffa15b564c547dbe391232d9eff5375b1b40242
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalHasValue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalHasValue"
+  endpoint {
+    name: "data.OptionalHasValue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalNone.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalNone.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cec29a42ae2e8cb1765369bb1c586ec73e5735c9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalNone.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalNone"
+  endpoint {
+    name: "data.OptionalNone"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapClear.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e36b2aa3e4f46c0e725b6b17817869900de5cd33
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapClear.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapClear"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapIncompleteSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c609e9e50a2e4e88700788cd81b0a9a649df1e42
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapIncompleteSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapIncompleteSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapPeek.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06fc2182773658f2b8bcc2bf32550ef32faa3e64
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapPeek.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapPeek"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7beef3f376ba6d7917bb753394984c5b65ae8108
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapStage.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b579d21a0e38923edd9dcb306f085f3be12828a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapStage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapStage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstage.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3d6862fbe0ae91e9f3e7089a6f65a52f9570832
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapUnstage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstageNoKey.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d275c85d942a1dd6175c1b7fcddf2644200772e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstageNoKey.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapUnstageNoKey"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Pack.pbtxt b/tensorflow/core/api_def/java_api/api_def_Pack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9e9897d77cde18efc7a9f6831a71e0819280198
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Pack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Pack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/java_api/api_def_Pad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f83f451552d924b37b214e58849a2da38772957f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Pad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Pad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PadV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffc121645600ef2f159b3356407e540e2565a1c3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PadV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PadV2"
+  endpoint {
+    name: "Pad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cec77427cfe6314eb223bf28898853a4667856e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddedBatchDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddedBatchDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22dfe84f0cadd59a83cae827deb4ba9c33fbe19d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PaddedBatchDatasetV2"
+  endpoint {
+    name: "data.PaddedBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03db4bf185d78bbafd9040aa9f6d39dcdf1e81d3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddingFIFOQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..605025be7910993c691c22c2eb9cd681145b9e58
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PaddingFIFOQueueV2"
+  endpoint {
+    name: "io.PaddingFifoQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b17c7d256926f975afb2b21bb83f4298248bfeb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelConcat.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ParallelConcat"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelDynamicStitch.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelDynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79a55b763f356478d6cb35adda051ea9ab9d8b5a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelDynamicStitch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ParallelDynamicStitch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a985d24fa74062c2262961abacd7e60654617b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParallelInterleaveDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c74149a6c3497ec75f62c0c227d09558b543493
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParallelInterleaveDatasetV2"
+  endpoint {
+    name: "ParallelInterleaveDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5091bb9cec21eff4ee4b168bf11e3001abe4fe9f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelMapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParallelMapDataset"
+  endpoint {
+    name: "data.ParallelMapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParameterizedTruncatedNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParameterizedTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26ca2fc86fa2fbc2aaefac6b0be210e98dd3947e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParameterizedTruncatedNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParameterizedTruncatedNormal"
+  endpoint {
+    name: "random.ParameterizedTruncatedNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e7384f2e337ff2d4d597cb1ee4229e3ef7a01ac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseExample"
+  endpoint {
+    name: "io.ParseExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseExampleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4309645093d8dd55e92a39da2963e7c7f7c14041
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseExampleDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseExampleDataset"
+  endpoint {
+    name: "data.ParseExampleDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseSequenceExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseSequenceExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09ee715ac715f5cad68141e337c575f040019db5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseSequenceExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseSequenceExample"
+  endpoint {
+    name: "io.ParseSequenceExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseSingleExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseSingleExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7559957b35e1169edc0a399448656c355a41e007
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseSingleExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseSingleExample"
+  endpoint {
+    name: "io.ParseSingleExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseSingleSequenceExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseSingleSequenceExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00eb325b2a437e955e9fcb9c18f241726892e1c0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseSingleSequenceExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseSingleSequenceExample"
+  endpoint {
+    name: "io.ParseSingleSequenceExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a78cdc7f5c44df056fb587cdd263062fd333f227
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseTensor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseTensor"
+  endpoint {
+    name: "io.ParseTensor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/java_api/api_def_PartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ac10b602803a38abb00dd3766ca409cdab51c27
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PartitionedCall.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PartitionedCall"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Placeholder.pbtxt b/tensorflow/core/api_def/java_api/api_def_Placeholder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e6daa2ae42f1bab3bfc00f0630b1727c77352d4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Placeholder.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Placeholder"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PlaceholderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PlaceholderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..419bdf10f794b051b16ef45e5762cac4a8bff087
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PlaceholderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PlaceholderV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PlaceholderWithDefault.pbtxt b/tensorflow/core/api_def/java_api/api_def_PlaceholderWithDefault.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59067a9c688782a998fda06206b6ec460b882afd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PlaceholderWithDefault.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PlaceholderWithDefault"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Polygamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Polygamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..746b3375a0f05c5c31b4298d534e8d5177586d90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Polygamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Polygamma"
+  endpoint {
+    name: "math.Polygamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PopulationCount.pbtxt b/tensorflow/core/api_def/java_api/api_def_PopulationCount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6aacdf4d1218bc5d05c8332beff51a34a34f0377
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PopulationCount.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PopulationCount"
+  endpoint {
+    name: "math.PopulationCount"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Pow.pbtxt b/tensorflow/core/api_def/java_api/api_def_Pow.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7eaaed6952d221c074466024e7f5e49ea47c7ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Pow.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Pow"
+  endpoint {
+    name: "math.Pow"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_PrefetchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..beaad84d15344f0bb26421c4757f4e508e1f0c3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PrefetchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PrefetchDataset"
+  endpoint {
+    name: "data.PrefetchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c9d509b16346776a9e085edab06f0587d73d0ae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
+  endpoint {
+    name: "data.PrependFromQueueAndPaddedBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PreventGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_PreventGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4731f21af4c8a56dde6c8c9e573d75fad1effffe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PreventGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PreventGradient"
+  endpoint {
+    name: "train.PreventGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Print.pbtxt b/tensorflow/core/api_def/java_api/api_def_Print.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21a085a1c2c5d4568f5609e9efc080efaefe7c59
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Print.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Print"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PrintV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PrintV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1e4d74b1e3b8a324abbf04d4f9ca37b814f20bd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PrintV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PrintV2"
+  endpoint {
+    name: "Print"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PriorityQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_PriorityQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a9909d122f8b3b60d605d2ce458c57b907b4fdc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PriorityQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PriorityQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PriorityQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PriorityQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f6a6f2906b00e538215af03249594faea9141c2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PriorityQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PriorityQueueV2"
+  endpoint {
+    name: "io.PriorityQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Prod.pbtxt b/tensorflow/core/api_def/java_api/api_def_Prod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a98972a871d4ed3be1554f184b33b236b6bca67
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Prod.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Prod"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PyFunc.pbtxt b/tensorflow/core/api_def/java_api/api_def_PyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f7c47ca040c2b2a9392523be7f955b5310725
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PyFunc"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PyFuncStateless.pbtxt b/tensorflow/core/api_def/java_api/api_def_PyFuncStateless.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..684ef58d1b258c4448df5095d3dfcc4050d5f583
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PyFuncStateless.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PyFuncStateless"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Qr.pbtxt b/tensorflow/core/api_def/java_api/api_def_Qr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0d31e79778dce19d70db6d1824b00451e3e280b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Qr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Qr"
+  endpoint {
+    name: "linalg.Qr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantize.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6ba0ce8b8a239c5c9d27a3a8fe853b616e940f6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678a77113cf483c90240a1c5802849bddbf4b219
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c9e52e770a1f48d965b4775174e1f4f471f02017
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV3"
+  endpoint {
+    name: "quantization.QuantizeAndDequantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeDownAndShrinkRange.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeDownAndShrinkRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7119f53cb29a9c23771b5e0c288dc1f0a515cac2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeDownAndShrinkRange.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizeDownAndShrinkRange"
+  endpoint {
+    name: "quantization.QuantizeDownAndShrinkRange"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25c9c3bdce467e06b92929dfc2476f5704443442
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizeV2"
+  endpoint {
+    name: "quantization.Quantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a2bfa36ed7d7bb0bbf9a5d46dc637e6746b99b8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedAdd"
+  endpoint {
+    name: "math.QuantizedAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f16fb046dd975fe8480ab6d415a101e28256d28
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedAvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedAvgPool"
+  endpoint {
+    name: "nn.QuantizedAvgPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2101ce5d69071c1d6e8f0667b8492d86fcc2aa70
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedBatchNormWithGlobalNormalization"
+  endpoint {
+    name: "nn.QuantizedBatchNormWithGlobalNormalization"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedBiasAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedBiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c18fb1c5741abf83da47e75aff393380e7917ec7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedBiasAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedBiasAdd"
+  endpoint {
+    name: "nn.QuantizedBiasAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb5d0c0fda46090ae282265b501147c1ceb32fbc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedConcat.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "QuantizedConcat"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9909157603eb329af85d3b574eca44adb4f8d979
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedConv2D"
+  endpoint {
+    name: "nn.QuantizedConv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedInstanceNorm.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedInstanceNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbd2e7fc5ec85c5f343450df78575dbff8e8b988
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedInstanceNorm.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedInstanceNorm"
+  endpoint {
+    name: "nn.QuantizedInstanceNorm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7962cbade6f9b4a4b59795b1312c43587e4fb2c1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMatMul"
+  endpoint {
+    name: "linalg.QuantizedMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..57e900494e994bcead9de2c302741bfdf2f1e7e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedMaxPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMaxPool"
+  endpoint {
+    name: "nn.QuantizedMaxPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be23ef706e206b45f4d75173fb3aa952617de8fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMul"
+  endpoint {
+    name: "math.QuantizedMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedRelu.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b9a11640ba6ac6b3483e256694ea37aeac8153e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedRelu"
+  endpoint {
+    name: "nn.QuantizedRelu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedRelu6.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a60e2112ee4609596e6c2adb652314d9159bbf4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu6.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedRelu6"
+  endpoint {
+    name: "nn.QuantizedRelu6"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedReluX.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc47d322b270118b9e8497826525ad8f7e9f13d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedReluX.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedReluX"
+  endpoint {
+    name: "nn.QuantizedReluX"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedReshape.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4557853d94f550776c5d4e257d3872ca7f5e9e0e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedReshape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "QuantizedReshape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81dca490944cb07f90d00e76baf40559776eff7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedResizeBilinear"
+  endpoint {
+    name: "image.QuantizedResizeBilinear"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a6bada741da9c3f0634320be8b6a122c6d9d490
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueClose"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueCloseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce779650e5bf3f58bc4bd4320013ad2c1c6eaf88
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueCloseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueCloseV2"
+  endpoint {
+    name: "io.QueueClose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeue.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45c811a6b44f63a6aa5abe19e97d3df6623629d9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e088ef2587c071dc07706c1f8d38e5b63dc9b27
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueMany"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueManyV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10fe198ff261c6638710b43f92060a577009e7f6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueManyV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueDequeueManyV2"
+  endpoint {
+    name: "io.QueueDequeueMany"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b96e568c4114f2044c00b982227e7b2438cf09d2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueUpTo"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpToV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fadea0926b6b7f80733775246669a7d7e0f36014
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpToV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueDequeueUpToV2"
+  endpoint {
+    name: "io.QueueDequeueUpTo"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ba03afbfebddb0bde4399ae36319848683a978d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueDequeueV2"
+  endpoint {
+    name: "io.QueueDequeue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2945c46d6eb5a9cbe0d468cac90e62c83aa22395
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueueMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..442ddcbc038ab14aa6f72bb49414c77bd63bdc00
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueueMany"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueueManyV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be3fed47896d4a56f77b23395d94145d4fdbf463
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueManyV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueEnqueueManyV2"
+  endpoint {
+    name: "io.QueueEnqueueMany"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71a2211e1e72cd2df836fdf2fe91ad88e0aed7c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueEnqueueV2"
+  endpoint {
+    name: "io.QueueEnqueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueIsClosed.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueIsClosed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b51b208b76ddca86a31a5e30def3f972b244024
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueIsClosed.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueIsClosed"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueIsClosedV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueIsClosedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..148d313a6d317479a8bf70d0ee82a55b8cb7a57a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueIsClosedV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueIsClosedV2"
+  endpoint {
+    name: "io.QueueIsClosed"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bd7244d6814624d15c7d631f825e806bbbc6a2d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueSize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueSizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e93e07a2b32e9cc96620c3d3c68f5446068a69e7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueSizeV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueSizeV2"
+  endpoint {
+    name: "io.QueueSize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_RFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9576600e756b00f1a7f8d01ad89955bc91e7d726
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RFFT"
+  endpoint {
+    name: "signal.Rfft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_RFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41d638b26a8a3f6824dcf00429cc7de533362e1f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RFFT2D"
+  endpoint {
+    name: "signal.Rfft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_RFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a762d22e5cdf8d0ec910bc23ca54f39ba07d06e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RFFT3D"
+  endpoint {
+    name: "signal.Rfft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RGBToHSV.pbtxt b/tensorflow/core/api_def/java_api/api_def_RGBToHSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b35891ae2cc53ca905c8fbf99f02c6f0a7bb49a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RGBToHSV.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RGBToHSV"
+  endpoint {
+    name: "image.RgbToHsv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RaggedGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_RaggedGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f060daeb6571631f2bb029676b3aa5b3a28be6d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RaggedGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RaggedGather"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RaggedRange.pbtxt b/tensorflow/core/api_def/java_api/api_def_RaggedRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1a5bab0acee54ec89a67b5e63edce5a6b080d3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RaggedRange.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RaggedRange"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RaggedTensorToSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_RaggedTensorToSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f049f47b46c45d9a44e85f57f9821e69c1ea869e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RaggedTensorToSparse.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RaggedTensorToSparse"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomCrop.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomCrop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3b8a3cecda8720f9fae41fd4fd7501c0c8c0414
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomCrop.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomCrop"
+  endpoint {
+    name: "image.RandomCrop"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43921e6eafe6d49d5dd9f28e7d6d7f3e4c206527
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomDataset"
+  endpoint {
+    name: "data.RandomDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomGamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomGamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..927f2c56937f08a440cd94b09ca90b553df9182a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomGamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomGamma"
+  endpoint {
+    name: "random.RandomGamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomGammaGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomGammaGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9257495c9bd15239036c6a3971f60dd7fa2c6466
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomGammaGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomGammaGrad"
+  endpoint {
+    name: "random.RandomGammaGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomPoisson.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomPoisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1ea79502415c8096097ffc16e99979ac0b3383a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomPoisson.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomPoisson"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomPoissonV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomPoissonV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5efe01bf4011c628a1b54e95de3a3bd83f529ff2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomPoissonV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomPoissonV2"
+  endpoint {
+    name: "random.RandomPoisson"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomShuffle.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomShuffle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6dcd12fd375d7db664b4070efaed4a8525e3a4bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomShuffle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomShuffle"
+  endpoint {
+    name: "random.RandomShuffle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9660121a073124e7dd600d4d70c60195fcd4b1cb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomShuffleQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..779363303cac64b4037d3f1fb0de883760424247
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomShuffleQueueV2"
+  endpoint {
+    name: "io.RandomShuffleQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomStandardNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..413fc87bdf06dcbe59dbbbb8c549512d16afe885
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomStandardNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomStandardNormal"
+  endpoint {
+    name: "random.RandomStandardNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomUniform.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a93df83df65462a717b7aa347e69c2036d4efb6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomUniform.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomUniform"
+  endpoint {
+    name: "random.RandomUniform"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomUniformInt.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1383f406a384905e9b0862e6f07345149da9eeb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomUniformInt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomUniformInt"
+  endpoint {
+    name: "random.RandomUniformInt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Range.pbtxt b/tensorflow/core/api_def/java_api/api_def_Range.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24f3787a8e38f2deb446724cd35ca7acfa57a424
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Range.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Range"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_RangeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6180261b053e1949778e7fd327d1e5db53c19b0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RangeDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RangeDataset"
+  endpoint {
+    name: "data.RangeDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rank.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rank.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..baa84aab10d65e5cbc58d2d2ab0ece807c7ef8ff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rank.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Rank"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReadFile.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReadFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f74250d42f76b10d18d7fa39fb3f43099e7eb137
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReadFile.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReadFile"
+  endpoint {
+    name: "io.ReadFile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReadVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReadVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..018886d5b825586a687024f513065cf4b24456bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReadVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ReadVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProduced.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProduced.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b087d11182e77847ff8adac8241aa1f67ae2a3c1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProduced.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumRecordsProduced"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProducedV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProducedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54a30abe18728a01409fa5ec1786a2b75ea37ef9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProducedV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderNumRecordsProducedV2"
+  endpoint {
+    name: "io.ReaderNumRecordsProduced"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e30e97fd08cf750a9d332adaf276c4be508993fe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompleted"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0904ba19e5398ba60054ca9c31646e718a8daabb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompletedV2"
+  endpoint {
+    name: "io.ReaderNumWorkUnitsCompleted"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderRead.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f98d88bda84ebff75f84989b1983dbccec23a22
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderRead.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRead"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReadUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d418b00b2736061599852356aa99e9acb7da6c3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReadUpTo"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReadUpToV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..777d09fa2c6c5cc9a6d2c8210dde6bfcb6777120
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpToV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderReadUpToV2"
+  endpoint {
+    name: "io.ReaderReadUpTo"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReadV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5d45bd1db9a05597e531aa87a3e68797af9e1ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReadV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderReadV2"
+  endpoint {
+    name: "io.ReaderRead"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6041caabd1590d077abaff3bf169314a2d7c558
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderResetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderResetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..265a3442f5345e2cfdcae269e8d10ab0f336d6e4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderResetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderResetV2"
+  endpoint {
+    name: "io.ReaderReset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderRestoreState.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0aa0ec595d600d338e03d6520bf4824ee25199ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreState.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRestoreState"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderRestoreStateV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4728ce7796bed9d8591bfb104235abb3fa87d105
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreStateV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderRestoreStateV2"
+  endpoint {
+    name: "io.ReaderRestoreState"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderSerializeState.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e23e285fb13e692191cea3b6d6b9272b765337d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeState.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderSerializeState"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderSerializeStateV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa396095b176239ffbae67be76d644c0cb421183
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeStateV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderSerializeStateV2"
+  endpoint {
+    name: "io.ReaderSerializeState"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Real.pbtxt b/tensorflow/core/api_def/java_api/api_def_Real.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3aaea928dec5b68cb501cfa48882abf8fd720b6b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Real.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Real"
+  endpoint {
+    name: "math.Real"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_RealDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..415bd29da04f1244c521530a17488c9623048ef3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RealDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RealDiv"
+  endpoint {
+    name: "math.RealDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Reciprocal.pbtxt b/tensorflow/core/api_def/java_api/api_def_Reciprocal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c0d787c24ebf717fb1b7ba227fa28cbaf05f115
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Reciprocal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Reciprocal"
+  endpoint {
+    name: "math.Reciprocal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReciprocalGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReciprocalGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68879669b5f76a3b8751aa8b7d690f64dc5ead5b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReciprocalGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReciprocalGrad"
+  endpoint {
+    name: "math.ReciprocalGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RecordInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_RecordInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4807c68dee88b427a8ad77caec06c2755ccd790
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RecordInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RecordInput"
+  endpoint {
+    name: "random.RecordInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReduceDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReduceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b16c5dbb96c7f5005025b47b4cf39be6a66b76d6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReduceDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ReduceDataset"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReduceJoin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReduceJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79193222018b7f09dc0db09b718762f79f13033d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReduceJoin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReduceJoin"
+  endpoint {
+    name: "strings.ReduceJoin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefEnter.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefEnter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cd2281bc6a1f738a990546044050970c08219be
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefEnter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefEnter"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefExit.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefExit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67e8d39c9af68802cbcd99b58e4b4a99034734dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefExit.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefExit"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefIdentity.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..53483bd1bb79064dda0a95717c41b06ab99b3852
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefIdentity.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefIdentity"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefMerge.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefMerge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ea3145841b3496d13d88327e49033f9c2ad49c8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefMerge.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefMerge"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefNextIteration.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefNextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d008204b7ad34421c90cbed831ea3349502e14b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefNextIteration.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefNextIteration"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefSelect.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefSelect.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7cda2d5b3020400dcffd2b78906af16df45b063
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefSelect.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefSelect"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefSwitch.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefSwitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78261d8b7e60dbcd57213bcee9ad01fbf9cd8e2a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefSwitch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefSwitch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_RegexFullMatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f88e24eac6c94f58c2bd7a431ea022ac2c2e1d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RegexFullMatch.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RegexFullMatch"
+  endpoint {
+    name: "strings.RegexFullMatch"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/java_api/api_def_RegexReplace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01c9e93cab7b8253b518302853a2ab2cba6f748c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RegexReplace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RegexReplace"
+  endpoint {
+    name: "strings.RegexReplace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Relu.pbtxt b/tensorflow/core/api_def/java_api/api_def_Relu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39d7fec4526622788a9ef85bb0a23d5a1b97646a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Relu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu"
+  endpoint {
+    name: "nn.Relu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Relu6.pbtxt b/tensorflow/core/api_def/java_api/api_def_Relu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fcc012b5033a1311bc57bd82a8a33beb0f38c9a6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Relu6.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu6"
+  endpoint {
+    name: "nn.Relu6"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Relu6Grad.pbtxt b/tensorflow/core/api_def/java_api/api_def_Relu6Grad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33e959cc7b36a8fc03f57a3c82ab0f1734696e43
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Relu6Grad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu6Grad"
+  endpoint {
+    name: "nn.Relu6Grad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec4a8b5f972d1075dde62c74ad9ae987c4e8984c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReluGrad"
+  endpoint {
+    name: "nn.ReluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RemoteCall.pbtxt b/tensorflow/core/api_def/java_api/api_def_RemoteCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0ddb01784792f3c215cbdf60fd51748f16b5916
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RemoteCall.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RemoteCall"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RemoteFusedGraphExecute.pbtxt b/tensorflow/core/api_def/java_api/api_def_RemoteFusedGraphExecute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b73e633ef2c45c8e9e11f2a46b6d5fd5f33fd3e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RemoteFusedGraphExecute.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RemoteFusedGraphExecute"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_RepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..871824f8867b1e1b7d777711b4a27b1df27f81da
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RepeatDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RepeatDataset"
+  endpoint {
+    name: "data.RepeatDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/java_api/api_def_RequantizationRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d365e8992558ce804133114c2fbcf59db30d4379
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RequantizationRange.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RequantizationRange"
+  endpoint {
+    name: "quantization.RequantizationRange"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/java_api/api_def_Requantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d397cde4db932987e3a2a8880673c2f7e2561d1a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Requantize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Requantize"
+  endpoint {
+    name: "quantization.Requantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Reshape.pbtxt b/tensorflow/core/api_def/java_api/api_def_Reshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4bf3a409d1afcd5c2b763037a5b1b05073499701
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Reshape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Reshape"
+  endpoint {
+    name: "Reshape"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeArea.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5358c18d4b423749e63bc77bcfe206005988a9d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeArea.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeArea"
+  endpoint {
+    name: "image.ResizeArea"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBicubic.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d0942e26624981a4e1966765fee11bb390c0813
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBicubic.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBicubic"
+  endpoint {
+    name: "image.ResizeBicubic"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBicubicGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBicubicGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12e61dc8238e78f6f4e55f2cdd224488212b7a52
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBicubicGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBicubicGrad"
+  endpoint {
+    name: "image.ResizeBicubicGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad123744a920406562327e368b2acacdd3ab719a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBilinear.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBilinear"
+  endpoint {
+    name: "image.ResizeBilinear"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBilinearGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBilinearGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04f3e9f19efb782f33eac84cb4aa747588217991
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBilinearGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBilinearGrad"
+  endpoint {
+    name: "image.ResizeBilinearGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighbor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86ad39a51738067d9e4c5fc5fec98c070f1cc504
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeNearestNeighbor"
+  endpoint {
+    name: "image.ResizeNearestNeighbor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighborGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighborGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70eeb906fab4e3bcb3884841d2dd777016958973
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighborGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeNearestNeighborGrad"
+  endpoint {
+    name: "image.ResizeNearestNeighborGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff57bd5849a9b291fee68fb658f4df651dbdff13
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  endpoint {
+    name: "train.ResourceApplyAdaMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4369f0eade908f5e7c45f9089167f5433357a51
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdadelta"
+  endpoint {
+    name: "train.ResourceApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bf7c20ee7b17be285e5f876c90f57eb854ecef0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdagrad"
+  endpoint {
+    name: "train.ResourceApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..afddaaff5737e5e3cdd0dd660a9d6db3fbd0e64f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdagradDA"
+  endpoint {
+    name: "train.ResourceApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..390bd999c4555ec12401c8c49309a6400c281e5d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdam.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdam"
+  endpoint {
+    name: "train.ResourceApplyAdam"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAddSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf944477be351677541625b38e9fafe62eb0030e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAddSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAddSign"
+  endpoint {
+    name: "train.ResourceApplyAddSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85c97b430a83aeda97859392d5064320e3828b4d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyCenteredRMSProp"
+  endpoint {
+    name: "train.ResourceApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61bec5bb10983da12d8acb36d3d6e7afcaf2416e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8209fd607e172f72b5a39ad52bca683fd13eb56d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyFtrlV2"
+  endpoint {
+    name: "train.ResourceApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a54fed14d1733fdaab577c1cf04ab59eb50b35b2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyGradientDescent"
+  endpoint {
+    name: "train.ResourceApplyGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..debb0a8131eb9542ca95ce4fc01f96298b2ae3ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyMomentum"
+  endpoint {
+    name: "train.ResourceApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyPowerSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96df22c81f0cd66054372545cbcccf2b8a8d7d7e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyPowerSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyPowerSign"
+  endpoint {
+    name: "train.ResourceApplyPowerSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..809b8b3af3baacfeb28ec282f33b9aa64920676a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyProximalAdagrad"
+  endpoint {
+    name: "train.ResourceApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c9ff5a499d5dba071f1a3cea8e2266602e1fc88c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyProximalGradientDescent"
+  endpoint {
+    name: "train.ResourceApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa3adf759e0d9e1fe9d0cb0d9f18a77722b197d0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyRMSProp"
+  endpoint {
+    name: "train.ResourceApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceCountUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceCountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..439c1f17557de26121c00074122f224cfd1bdd1f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceCountUpTo.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceCountUpTo"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79c6e8abbcb14e0075854d6f1584f7d8ff4e0759
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceGather"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4184e33bf00d1a5c8825e406da311eb371436ba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e21c24a588968daef1d519d039230e6b02b7617
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterDiv.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterDiv"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d25b14272d63e750d163906bbfff7b5244446f87
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMax.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterMax"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6243cc1ae3e54e4d77a7615c6260206a95242fbe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMin.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterMin"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..393e5556c0beb1e10e412813bf571100d3770efd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMul.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterMul"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fd38f7be87241c77c213babd43ea91f0743d4e9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterNdAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..732de5f1cc2de7389075f192fe94ba9f7a8e1406
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterNdUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77081dda4d59a9bee17a69493dc85800f81bcdc3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c2cc0ec210c944220558ffb4c1a5bdcc255a446
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6e3ae2219c7a13f1ce07a0c0a68c9827dd50f5e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdadelta"
+  endpoint {
+    name: "train.ResourceSparseApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5be4d6199b55f8a75b8506d6491dcff2136f6143
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagrad"
+  endpoint {
+    name: "train.ResourceSparseApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0547687d6408a1ab7caf1471415552ce269affc7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagradDA"
+  endpoint {
+    name: "train.ResourceSparseApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..632b0ab4c20c36f2f61a1d771de799d7d8ae5f92
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyCenteredRMSProp"
+  endpoint {
+    name: "train.ResourceSparseApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e6fed94691a2490311c04e0d0d5a6bc0ab0d786
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd126d78ab6ff277299b8a763a11e4d37c2e3904
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrlV2"
+  endpoint {
+    name: "train.ResourceSparseApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e00039e01411cc473efb03bd2bbaebb8bf1ee14
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyMomentum"
+  endpoint {
+    name: "train.ResourceSparseApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04fe8504e5dc019e467024a8e0a5146bb8b6cd46
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalAdagrad"
+  endpoint {
+    name: "train.ResourceSparseApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27df43c9c505c3ff25e6b1fcc08aee1efb71867d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalGradientDescent"
+  endpoint {
+    name: "train.ResourceSparseApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec8910a88a725bb7a6b5327e9ebd44122d272ff4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyRMSProp"
+  endpoint {
+    name: "train.ResourceSparseApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceStridedSliceAssign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceStridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83805389b98dec01432cb3aa184ec5c5db0ccec4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceStridedSliceAssign.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceStridedSliceAssign"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Restore.pbtxt b/tensorflow/core/api_def/java_api/api_def_Restore.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e5b021b084b66f94e400544433455feb28d95e6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Restore.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Restore"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RestoreSlice.pbtxt b/tensorflow/core/api_def/java_api/api_def_RestoreSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0728f5908b2cb9164df8e03a8218542278ac3261
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RestoreSlice.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RestoreSlice"
+  endpoint {
+    name: "train.RestoreSlice"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RestoreV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_RestoreV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909968873f25553704dfac60cd6be8dd8ab8f5dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RestoreV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RestoreV2"
+  endpoint {
+    name: "train.Restore"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Reverse.pbtxt b/tensorflow/core/api_def/java_api/api_def_Reverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a199d2fcde0776e90f2760ecfe89e26f733d6c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Reverse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Reverse"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReverseSequence.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReverseSequence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87638c0dcc99a3c1f4bc86caf00c61d551a64b50
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReverseSequence.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ReverseSequence"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReverseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReverseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71efbe1892e71f26cf8bf1f8f52a86d45851588f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReverseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReverseV2"
+  endpoint {
+    name: "Reverse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RightShift.pbtxt b/tensorflow/core/api_def/java_api/api_def_RightShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68fab3e8cf87574a13436df6a96400e32168ff5c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RightShift.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RightShift"
+  endpoint {
+    name: "bitwise.RightShift"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rint.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48fbcc7c346fe14302800cc3fe7fe78b325d9819
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rint.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Rint"
+  endpoint {
+    name: "math.Rint"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/java_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50f7915a65a91d5e02085c794d19d84976e54c4a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Roll.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Roll"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Round.pbtxt b/tensorflow/core/api_def/java_api/api_def_Round.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd612a33d63407af961de2f564feaaebc61f6d3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Round.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Round"
+  endpoint {
+    name: "math.Round"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rpc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d1e2b90e6a7b08f546b67a2767bfa14e9b0e534
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rpc.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Rpc"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rsqrt.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rsqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06b1b81ecd47413d456d1b3a38ef8f6ae5045821
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rsqrt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Rsqrt"
+  endpoint {
+    name: "math.Rsqrt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RsqrtGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_RsqrtGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88073b6f25459375d01446352617f5528a1cce40
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RsqrtGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RsqrtGrad"
+  endpoint {
+    name: "math.RsqrtGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBox.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dffd53b05918d96d574b1265d6eb45963bdb8f8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBox.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBox"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..655731413788b10d93f35f49153c3db77df75ee3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBoxV2"
+  endpoint {
+    name: "image.SampleDistortedBoundingBox"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Save.pbtxt b/tensorflow/core/api_def/java_api/api_def_Save.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36d44001d5b081a2216df2f6912b238243551fed
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Save.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Save"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SaveSlices.pbtxt b/tensorflow/core/api_def/java_api/api_def_SaveSlices.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b33603568009a406e0628d5a6f2ba5592383d2d4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SaveSlices.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SaveSlices"
+  endpoint {
+    name: "train.SaveSlices"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SaveV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SaveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..644d1824aa15831139e2c245ae1ff517aca74f82
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SaveV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SaveV2"
+  endpoint {
+    name: "train.Save"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScalarSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c339ce0a7a55bd2d2b284260ffb3d6ef56e06046
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScalarSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ScalarSummary"
+  endpoint {
+    name: "summary.ScalarSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScanDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..89b63c53f70d3092acd4da749f17d49ab793532b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScanDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ScanDataset"
+  endpoint {
+    name: "data.ScanDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41c63dc0a4064cd9ad370e94ebd1c1a3b18ab43c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5754249eafcf294fb55b13e462db0310d2de3284
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterDiv.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterDiv"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa6375cbd76f1c569d1ab5d9e0c1f6dcd8c7d200
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterMax.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterMax"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea007120c36c942a69bc2689da67487d5a5f6367
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterMin.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterMin"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1d91258e4bdd247077b7278ff2ab7160c777406
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterMul.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterMul"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef01b2fcacb9bed05eb38de1fa93013a1408cd3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ScatterNd"
+  endpoint {
+    name: "ScatterNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bea152a9da5a1ce7959751c26c8c26b25f430ce4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdNonAliasingAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdNonAliasingAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dd756bfc0c3631a3bbb53d9ab041b61326dcd2f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdNonAliasingAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdNonAliasingAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..384e79d64ef908b1df0dd3117097d5b9181d64ce
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..92fce7f0ac3b91f77841ae9d53b99d5bfc2956e6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5baaa4f6045eeb9bd89a4ec66e7fcaff6ad8abc6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83ac128ed60ed25b5f601950007ea581c19a1277
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaFprint.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaFprint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce179918cd07b727bfe7aab129edc82b8de9fc3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaFprint.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaFprint"
+  endpoint {
+    name: "train.SdcaFprint"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaOptimizer.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fab6393f602d6bf8831a8579ba101f917eb7daf5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizer.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SdcaOptimizer"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaOptimizerV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizerV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b72ee64e501aa0c550cf31082217b58d76aebc82
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizerV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaOptimizerV2"
+  endpoint {
+    name: "train.SdcaOptimizer"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaShrinkL1.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaShrinkL1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83993bcf1490cdd74aa278fe8548d5967e64c84c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaShrinkL1.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaShrinkL1"
+  endpoint {
+    name: "train.SdcaShrinkL1"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ac26c9e9e3964eeda25bd56de2d4e3bd2b634f3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentMax"
+  endpoint {
+    name: "math.SegmentMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..982db87bf09f7fd5dc6066f8019cb3af21fde183
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentMean.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentMean"
+  endpoint {
+    name: "math.SegmentMean"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a403b6c63d073e60250f0220aea982cb2596205
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentMin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentMin"
+  endpoint {
+    name: "math.SegmentMin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentProd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bf280edc4310fbf158be33e579b1a525e6152b4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentProd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentProd"
+  endpoint {
+    name: "math.SegmentProd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dcbc352253da3fa65ad488fa1c5db1db7dae98e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentSum"
+  endpoint {
+    name: "math.SegmentSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Select.pbtxt b/tensorflow/core/api_def/java_api/api_def_Select.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eeff79284c2ea2d4aeb2519b316d935922bf3e06
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Select.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Select"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SelfAdjointEig.pbtxt b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc25ae9de2513a136d7889953dbbd9239ff3393e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEig.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SelfAdjointEig"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c79f08ac32360f616e547285858fe7548b221774
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEigV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SelfAdjointEigV2"
+  endpoint {
+    name: "linalg.SelfAdjointEig"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/java_api/api_def_Selu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7002d5be1221d6d34ed7f1cace1f6d672a8974fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Selu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Selu"
+  endpoint {
+    name: "nn.Selu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SeluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SeluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b34e2f223934da532d6099a452506245304d5879
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SeluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SeluGrad"
+  endpoint {
+    name: "nn.SeluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37789c753b4762e699a7059211db51d0ea352668
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeIterator"
+  endpoint {
+    name: "data.SerializeIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeManySparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..567a8e4b7f0f39a5cc3b52f170ec5504364b20e0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeManySparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeManySparse"
+  endpoint {
+    name: "io.SerializeManySparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad86c7095c366b68ac99098a90c8e3fe95cd13ae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeSparse"
+  endpoint {
+    name: "io.SerializeSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..169120a0635b812d1f585d0d1b7fa33873817ce8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeTensor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeTensor"
+  endpoint {
+    name: "io.SerializeTensor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SetSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_SetSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c000e9c8aacdb7c9b76bc4cc0b7160759bf73fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SetSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SetSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f57abe5a667bf4cffee537a3948ef69e2577fdde
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+  endpoint {
+    name: "data.SetStatsAggregatorDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Shape.pbtxt b/tensorflow/core/api_def/java_api/api_def_Shape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5074000b53745c40ab39a046e0ea05a47f148991
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Shape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Shape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShapeN.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShapeN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4bfb91118a30cefaf847c12452cdea89ff3be2b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShapeN.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ShapeN"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShardedFilename.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShardedFilename.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8efd0afb8b3ce3b0b19929926be53e880d2282d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShardedFilename.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShardedFilename"
+  endpoint {
+    name: "io.ShardedFilename"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShardedFilespec.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShardedFilespec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e31cac1040c390ba30c9af34eef6ec28436a91d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShardedFilespec.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShardedFilespec"
+  endpoint {
+    name: "io.ShardedFilespec"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShuffleAndRepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..543d5a109c7fc602b5511aadcef81a107ccadc64
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShuffleAndRepeatDataset"
+  endpoint {
+    name: "data.ShuffleAndRepeatDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShuffleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36f4979c969a0863f4083c8c268383dd6de62602
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShuffleDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShuffleDataset"
+  endpoint {
+    name: "data.ShuffleDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sigmoid.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sigmoid.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6e16a41104972d93738f09940ff8370032f6afe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sigmoid.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sigmoid"
+  endpoint {
+    name: "math.Sigmoid"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SigmoidGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SigmoidGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb141abe3ba6e89b112dba88d0244cca3fbb46f0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SigmoidGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SigmoidGrad"
+  endpoint {
+    name: "math.SigmoidGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..435fb9e825d16f4e3eabbe4faef499b4b008e1bb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sign"
+  endpoint {
+    name: "math.Sign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sin.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fc95755bac13de35ca248bd38f7a6e2c79e2e02
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sin"
+  endpoint {
+    name: "math.Sin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sinh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sinh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f343685e80bf1cf683991f67a13a7c4f91dad831
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sinh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sinh"
+  endpoint {
+    name: "math.Sinh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Size.pbtxt b/tensorflow/core/api_def/java_api/api_def_Size.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a41cddd8ac7aadfc2b11599305fa0b921e5ca1a6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Size.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Size"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SkipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39bce67a3e445e6b656b8f0979d8533a8d9cf53b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SkipDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SkipDataset"
+  endpoint {
+    name: "data.SkipDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Skipgram.pbtxt b/tensorflow/core/api_def/java_api/api_def_Skipgram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d095c7b61b9c772cd2ac09c8333b15077f4ef78e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Skipgram.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Skipgram"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Slice.pbtxt b/tensorflow/core/api_def/java_api/api_def_Slice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..adfe6fa4fd90657eb1d06ff285b07d0d81651c82
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Slice.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Slice"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc284c2833a6cc502d12155e0ce9ca09fef120cb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SlideDataset"
+  endpoint {
+    name: "data.SlideDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/java_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e49c1a5431b7ea49037bc97a9f2190ea425c013
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Snapshot"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Softmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_Softmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb27a04d217da850d99923ce4fa3a8d04f20c25a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Softmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softmax"
+  endpoint {
+    name: "nn.Softmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/java_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e064562c0f25e4f63353a36d08206b25eb0d4d08
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SoftmaxCrossEntropyWithLogits"
+  endpoint {
+    name: "nn.SoftmaxCrossEntropyWithLogits"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Softplus.pbtxt b/tensorflow/core/api_def/java_api/api_def_Softplus.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..671656c28736a0376a89cf1ed5c1b29edd646fc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Softplus.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softplus"
+  endpoint {
+    name: "math.Softplus"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SoftplusGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SoftplusGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1b074a9b7a2e6bee8dcbae3f5eed3610a753e80
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SoftplusGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SoftplusGrad"
+  endpoint {
+    name: "math.SoftplusGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Softsign.pbtxt b/tensorflow/core/api_def/java_api/api_def_Softsign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23aa1e3c58bce96423ed388b059b1bd66c8135ec
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Softsign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softsign"
+  endpoint {
+    name: "nn.Softsign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SoftsignGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SoftsignGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73faf74511fbe36a045c653c01e4ee0e5b811186
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SoftsignGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SoftsignGrad"
+  endpoint {
+    name: "nn.SoftsignGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SpaceToBatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_SpaceToBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fe7232e544111b861ec9b36afc7cd369ca35903
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SpaceToBatch.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SpaceToBatch"
+  endpoint {
+    name: "nn.SpaceToBatch"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SpaceToBatchND.pbtxt b/tensorflow/core/api_def/java_api/api_def_SpaceToBatchND.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6612b48286c065fd933687b2e9c09162aad4f231
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SpaceToBatchND.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SpaceToBatchND"
+  endpoint {
+    name: "SpaceToBatchNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SpaceToDepth.pbtxt b/tensorflow/core/api_def/java_api/api_def_SpaceToDepth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb421c75db0d3eb0d6d0a9168cc8b772f6c1588a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SpaceToDepth.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SpaceToDepth"
+  endpoint {
+    name: "nn.SpaceToDepth"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdb0b14b6711badf9f0a304dc0411eadc1f59611
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorApplyGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAccumulatorApplyGradient"
+  endpoint {
+    name: "sparse.SparseAccumulatorApplyGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8fc1e70959691bf7b521da47173cf75d9592521b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorTakeGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAccumulatorTakeGradient"
+  endpoint {
+    name: "sparse.SparseAccumulatorTakeGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0761f2ed1623643bba0e5557f566dd96534f7962
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAdd"
+  endpoint {
+    name: "sparse.SparseAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAddGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6529c46a17edec341a694f74d41c6000d732cf40
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAddGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAddGrad"
+  endpoint {
+    name: "sparse.SparseAddGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7122f210a432b31ee54b603f12450de181472288
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyAdadelta"
+  endpoint {
+    name: "train.SparseApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..184a8cfb2fb98053189b611ed7688e30ba9f4f3d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyAdagrad"
+  endpoint {
+    name: "train.SparseApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..950dc00dd3578b8c9128f3f4f1e208bdee339713
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyAdagradDA"
+  endpoint {
+    name: "train.SparseApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..090536f5eb9fc447e22ef4f8971446cb0ca99b7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyCenteredRMSProp"
+  endpoint {
+    name: "train.SparseApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e961fb7f6b1922911954abd39033626feeace80f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43b9833451a18e99b3c864453e1e777b0e7ee48f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyFtrlV2"
+  endpoint {
+    name: "train.SparseApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7e79c5e7e43cbf594bfe94405f0239ad39343db
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyMomentum"
+  endpoint {
+    name: "train.SparseApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ac6cf771cb608ba6f96b67374f439dc7cfd9e7b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyProximalAdagrad"
+  endpoint {
+    name: "train.SparseApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..926ed2c1d4d4c7dcbc8d8c4f51b33ec5a410f389
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyProximalGradientDescent"
+  endpoint {
+    name: "train.SparseApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e39f4ffa58dc1fafc6eddc30479275a1a6f4e52
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyRMSProp"
+  endpoint {
+    name: "train.SparseApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ceb600a42c45fed650d09d82570d92bec7a0e93
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseConcat.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseConcat"
+  endpoint {
+    name: "sparse.SparseConcat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseConditionalAccumulator.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dc2c1ea8a00778af4a440a33015b7cb7c4cce08
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseConditionalAccumulator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseConditionalAccumulator"
+  endpoint {
+    name: "sparse.SparseConditionalAccumulator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseCross.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseCross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..130f333d35bfc0a4ba4d430217e1682d0e2794ec
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseCross.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseCross"
+  endpoint {
+    name: "sparse.SparseCross"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..013b7eede948c672379e163a58b805614262df87
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseDenseCwiseAdd"
+  endpoint {
+    name: "sparse.SparseDenseCwiseAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cf56d7b41897a85cd7a0a2f94c3b753b8335f48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseDenseCwiseDiv"
+  endpoint {
+    name: "sparse.SparseDenseCwiseDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37c50f232767cb6a3c9dbe3bb7953da3a63a64e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseDenseCwiseMul"
+  endpoint {
+    name: "sparse.SparseDenseCwiseMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRows.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRows.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3adddbd34bffeb005f9c69b75c0a48b8e25413d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRows.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseFillEmptyRows"
+  endpoint {
+    name: "sparse.SparseFillEmptyRows"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRowsGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRowsGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..708069d028dcfa31183bf2c45cda5d7dc1762aa5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRowsGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseFillEmptyRowsGrad"
+  endpoint {
+    name: "sparse.SparseFillEmptyRowsGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..445d53b02364de43e5191cd6e2753214aa0bbb5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseMatMul"
+  endpoint {
+    name: "sparse.SparseMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7467b9b474415da3ae4c6aaaa924984b0a066a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceMax"
+  endpoint {
+    name: "sparse.SparseReduceMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceMaxSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceMaxSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..987e98467eda2adb3ff4729acfcf35f0136abbd4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceMaxSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceMaxSparse"
+  endpoint {
+    name: "sparse.SparseReduceMaxSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..739fb5fb952c23970ab9c40ae20062682fffba34
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceSum"
+  endpoint {
+    name: "sparse.SparseReduceSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceSumSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceSumSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64d6d45f1a2bf009fce79e62641d87f9f60122f1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceSumSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceSumSparse"
+  endpoint {
+    name: "sparse.SparseReduceSumSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReorder.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReorder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..202066e76f502c8b94d2cdfd55dcc25ab6f8a6d1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReorder.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReorder"
+  endpoint {
+    name: "sparse.SparseReorder"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReshape.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a393a6105f751df49a231182f9f86b515502e1e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReshape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReshape"
+  endpoint {
+    name: "sparse.SparseReshape"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9187dbed670b534721163f4387f4ff1d671f2b74
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMean.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentMean"
+  endpoint {
+    name: "sparse.SparseSegmentMean"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b7d5bbcf0c7fa9c3771ab03a6e5b0d549a4362f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentMeanGrad"
+  endpoint {
+    name: "sparse.SparseSegmentMeanGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb3ac2256d121319355b798390c129c29c1b6144
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentMeanWithNumSegments"
+  endpoint {
+    name: "sparse.SparseSegmentMeanWithNumSegments"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2cecff503ff5534c1367f9c67d9598f7df7d92c2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSqrtN"
+  endpoint {
+    name: "sparse.SparseSegmentSqrtN"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6973eb773a1fdfe1f2aa33c659ed4b8997fe4fe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNGrad"
+  endpoint {
+    name: "sparse.SparseSegmentSqrtNGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78aed85f0a96f9ae941bd32afec406b3e0f17b58
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNWithNumSegments"
+  endpoint {
+    name: "sparse.SparseSegmentSqrtNWithNumSegments"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f18f35166700173e20a5d4af3326d395a589f680
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSum"
+  endpoint {
+    name: "sparse.SparseSegmentSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e384857cc76b0f79cdbca3714a6de88def13355
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSumWithNumSegments"
+  endpoint {
+    name: "sparse.SparseSegmentSumWithNumSegments"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSlice.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d794df9252472b618e92e3763b70e01364e5281
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSlice.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSlice"
+  endpoint {
+    name: "sparse.SparseSlice"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSliceGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSliceGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..979326c0fc78effddb67aa9a726b8dd174fb6fdb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSliceGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSliceGrad"
+  endpoint {
+    name: "sparse.SparseSliceGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSoftmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56c96640cb447f7b2956acb581c618275f1fb025
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSoftmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSoftmax"
+  endpoint {
+    name: "sparse.SparseSoftmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7627d5f6074cc919e5b325179412bc38d1bd2159
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSoftmaxCrossEntropyWithLogits"
+  endpoint {
+    name: "nn.SparseSoftmaxCrossEntropyWithLogits"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSparseMaximum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSparseMaximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..314309621389a7cd4004c9ec37144d331c6728c9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSparseMaximum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSparseMaximum"
+  endpoint {
+    name: "sparse.SparseSparseMaximum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSparseMinimum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSparseMinimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc04bb4fed527b8d6543ea489396fb514f3a28ff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSparseMinimum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSparseMinimum"
+  endpoint {
+    name: "sparse.SparseSparseMinimum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSplit.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0afc95199aa602dc1103fa02d1a0e586b78b08e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSplit.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSplit"
+  endpoint {
+    name: "sparse.SparseSplit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc5c882f8342d7f0fc1b12539c89c44631da2b6b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseTensorDenseAdd"
+  endpoint {
+    name: "sparse.SparseTensorDenseAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd1f1e09e948aa94aac5650e3ebd847455449e99
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseTensorDenseMatMul"
+  endpoint {
+    name: "sparse.SparseTensorDenseMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseTensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb0d1d7a949617eda6bf23be393d31f6bcc6e343
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseTensorSliceDataset"
+  endpoint {
+    name: "data.SparseTensorSliceDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseToDense.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseToDense.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68df155e4619dd45f1681130db20c4e5a8cc0874
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseToDense.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseToDense"
+  endpoint {
+    name: "sparse.SparseToDense"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb04366feadde65034765f5b47458b17def23cd2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseToSparseSetOperation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseToSparseSetOperation"
+  endpoint {
+    name: "sparse.SparseToSparseSetOperation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Split.pbtxt b/tensorflow/core/api_def/java_api/api_def_Split.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffb7b52e091a35a0f8109d99c497208aa9774f24
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Split.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Split"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SplitV.pbtxt b/tensorflow/core/api_def/java_api/api_def_SplitV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94f4a08d70df26359e2aa32806ecb1f8f933be3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SplitV.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SplitV"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SqlDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8764e81af254e977d660b36a51a28e434979d83a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SqlDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SqlDataset"
+  endpoint {
+    name: "data.SqlDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daa2a1ee86a6ec1a6de09886bb54b55f55b6dec5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sqrt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sqrt"
+  endpoint {
+    name: "math.Sqrt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SqrtGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SqrtGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8eca1b89b31b4fac524edbe17f777fdd85824fee
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SqrtGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SqrtGrad"
+  endpoint {
+    name: "math.SqrtGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Square.pbtxt b/tensorflow/core/api_def/java_api/api_def_Square.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..561e10e020b9264176156be36f05c7b48deb0d7a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Square.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Square"
+  endpoint {
+    name: "math.Square"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SquaredDifference.pbtxt b/tensorflow/core/api_def/java_api/api_def_SquaredDifference.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..752dbcce7b81485ef6a6b5ed79e86a91999cdbaf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SquaredDifference.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SquaredDifference"
+  endpoint {
+    name: "math.SquaredDifference"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Squeeze.pbtxt b/tensorflow/core/api_def/java_api/api_def_Squeeze.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..264c38460364d80e035b9c0347af67196d8ca00d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Squeeze.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Squeeze"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Stack.pbtxt b/tensorflow/core/api_def/java_api/api_def_Stack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8370beee630ace617651a4dafb56fc9e68998280
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Stack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Stack"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac3c410c1cc90739d4b657c6f6ffe66ac2e5a115
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackClose"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackCloseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28aff9e191371f024b0c8e97336c3d14487dcd0f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackCloseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackCloseV2"
+  endpoint {
+    name: "StackClose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPop.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8658ecbad4fb889968d19a02a52ed18d32fa6c0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPop"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPopV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPopV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2ecf4e5a8782c0a39895020887232a6027bef41
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPopV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackPopV2"
+  endpoint {
+    name: "StackPop"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPush.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPush.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d08fa27b2104f150227281ca3a76fbc2ce9ff001
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPush.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPush"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPushV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPushV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..519fd6c6b28b2b199a9cfc4bbfc9e937a24c7ba7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPushV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackPushV2"
+  endpoint {
+    name: "StackPush"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..725e469a031e6077b0c907b574fdca9607bc57b1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackV2"
+  endpoint {
+    name: "Stack"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Stage.pbtxt b/tensorflow/core/api_def/java_api/api_def_Stage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87e0c7d9811f01aef0e35973512d53ea320c11d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Stage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Stage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StageClear.pbtxt b/tensorflow/core/api_def/java_api/api_def_StageClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26890a55b39827dad13ffdf701cf78c62c0a8f90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StageClear.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StageClear"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StagePeek.pbtxt b/tensorflow/core/api_def/java_api/api_def_StagePeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c3ed3dc91c51d9a5c16d3ac3780310b4c9cdc8c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StagePeek.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StagePeek"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StageSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_StageSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8188c3e0b30e20d734f8273b6b8cfb9c52e30df
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StageSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StageSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatefulPartitionedCall.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatefulPartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e6decf19adcc07a3b6d26bfef92af0909206432
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatefulPartitionedCall.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StatefulPartitionedCall"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessIf.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessIf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37c7b9a9629fb8353c1d6c8b58d35d44b73f717c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessIf.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StatelessIf"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessMultinomial.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessMultinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cfbbfb2c256278b2af37c2083fea371343097fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessMultinomial.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessMultinomial"
+  endpoint {
+    name: "random.StatelessMultinomial"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessRandomNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessRandomNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e648cfa691ae0aeda8ed7d5d3a6692b15c40f6e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessRandomNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessRandomNormal"
+  endpoint {
+    name: "random.StatelessRandomNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniform.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f8f328b41f3fda2e6a1394a15a62a35b112db69
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniform.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessRandomUniform"
+  endpoint {
+    name: "random.StatelessRandomUniform"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniformInt.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fec4a7cc622f76209b6da7c6170889b5cf4615d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniformInt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessRandomUniformInt"
+  endpoint {
+    name: "random.StatelessRandomUniformInt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessTruncatedNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c05071dd737de6cdfc5e8f818250fdbad9169540
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessTruncatedNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessTruncatedNormal"
+  endpoint {
+    name: "random.StatelessTruncatedNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessWhile.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessWhile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a4252c90ae791719eac3903ad7313e3dd472ac6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessWhile.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StatelessWhile"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StaticRegexFullMatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_StaticRegexFullMatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66b841ad74b26af5ec51653df35e68729d3aba31
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StaticRegexFullMatch.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StaticRegexFullMatch"
+  endpoint {
+    name: "strings.StaticRegexFullMatch"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StaticRegexReplace.pbtxt b/tensorflow/core/api_def/java_api/api_def_StaticRegexReplace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..305bc8a3caf84c9f92636c04290ac61dd98c3799
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StaticRegexReplace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StaticRegexReplace"
+  endpoint {
+    name: "strings.StaticRegexReplace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..301dc982e3266d0fc2fb6f97ed90e3276aff4f7b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatsAggregatorHandle"
+  endpoint {
+    name: "data.StatsAggregatorHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f83c8f5a4f66c639045813897d19b2b5b52a2457
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatsAggregatorSummary"
+  endpoint {
+    name: "summary.StatsAggregatorSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StopGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_StopGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad8473e8161e97652889e7bc6749b6837a5b2419
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StopGradient.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StopGradient"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StridedSlice.pbtxt b/tensorflow/core/api_def/java_api/api_def_StridedSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0c301dab134e0cde6389fff5bbaee96649ea0bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StridedSlice.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StridedSlice"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StridedSliceAssign.pbtxt b/tensorflow/core/api_def/java_api/api_def_StridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6850dc2d1b2c8120b70fdbbbca2ca3cc9eb5a423
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StridedSliceAssign.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StridedSliceAssign"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StridedSliceGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_StridedSliceGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b03204bcc4d2e7e84e98bd54105dcfa465a883a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StridedSliceGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StridedSliceGrad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringFormat.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringFormat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdd03139966e6fc662093cc55989dfb83a250aa2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringFormat.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringFormat"
+  endpoint {
+    name: "strings.StringFormat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringJoin.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b43ff157cd529481c2c3d634e0445492412bc477
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringJoin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringJoin"
+  endpoint {
+    name: "strings.Join"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringLength.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringLength.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8eb48cc3c720ee057647443b58ed79c38996d09
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringLength.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringLength"
+  endpoint {
+    name: "strings.StringLength"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringSplit.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0e6d1851df16d31df70bedd52f8a2d0861637e85
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StringSplit"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringSplitV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18c71d6bd7c90de2c93f185afd00fe1685f74709
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringSplitV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringSplitV2"
+  endpoint {
+    name: "strings.StringSplit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringStrip.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringStrip.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01691211ff6e2b4df5117bf0eb388865d92abc36
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringStrip.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringStrip"
+  endpoint {
+    name: "strings.Strip"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToHashBucket.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToHashBucket.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..765517578d0ff7d3212055b91e08d90afee92671
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToHashBucket.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToHashBucket"
+  endpoint {
+    name: "strings.ToHashBucket"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToHashBucketFast.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketFast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de08bc2d36ff4d7fe15ccbb8dc55389a5835261a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketFast.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToHashBucketFast"
+  endpoint {
+    name: "strings.ToHashBucketFast"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToHashBucketStrong.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketStrong.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15b9138238a64a8594c027e21a38cc9731750a37
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketStrong.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToHashBucketStrong"
+  endpoint {
+    name: "strings.ToHashBucketStrong"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToNumber.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToNumber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..196f694da5cf2993dd1b420f86aad5b9a26c3251
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToNumber.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToNumber"
+  endpoint {
+    name: "strings.ToNumber"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sub.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e79c6a0036bb17bb9e38f5edf0baa5cfb8c1f7d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sub.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sub"
+  endpoint {
+    name: "math.Sub"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Substr.pbtxt b/tensorflow/core/api_def/java_api/api_def_Substr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78f34550a5b98608ef09e2d18c769d078c5feb96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Substr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Substr"
+  endpoint {
+    name: "strings.Substr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3999fa6ed12982a23fa37e9af1709f1c80a66e37
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sum.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Sum"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SummaryWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_SummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8338c0fa1889e8cdcb82bbe4fa1c165485b38215
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SummaryWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SummaryWriter"
+  endpoint {
+    name: "summary.SummaryWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Svd.pbtxt b/tensorflow/core/api_def/java_api/api_def_Svd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..889d41628187892ed86bd394e46dda21e567cbdd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Svd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Svd"
+  endpoint {
+    name: "linalg.Svd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
index 0d3362a91e151093292ba6a30fd1554b6f3fba11..edd9255452dd97c0c7107d98063e13a382430da7 100644
--- a/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
+++ b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
@@ -1,4 +1,6 @@
 op {
-  graph_op_name: "Switch" #TODO(karllessard) escape that reserved name
-  visibility: HIDDEN
+  graph_op_name: "Switch"
+  endpoint {
+    name: "SwitchCond"
+  }
 }
diff --git a/tensorflow/core/api_def/java_api/api_def_SymbolicGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_SymbolicGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c6e68ae7408d6c043ed38cac4f1400c71b048ae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SymbolicGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SymbolicGradient"
+  endpoint {
+    name: "train.SymbolicGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TFRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1d42edd63d757a3d84b7160223192314631ca63
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TFRecordDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TFRecordDataset"
+  endpoint {
+    name: "data.TfRecordDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TFRecordReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_TFRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ffbeba0ec979524d588f0f92632b145fd01fa13
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TFRecordReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TFRecordReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TFRecordReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TFRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d252e4942bab615ee3b80cfb4e03c947de52b0d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TFRecordReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TFRecordReaderV2"
+  endpoint {
+    name: "io.TfRecordReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TakeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b0c0544fbc29ff63bd364206817ce1584aea6ac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TakeDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TakeDataset"
+  endpoint {
+    name: "data.TakeDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TakeManySparseFromTensorsMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_TakeManySparseFromTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..620e9fb0120eb5e06672022fa1a18661ddb28f9f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TakeManySparseFromTensorsMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TakeManySparseFromTensorsMap"
+  endpoint {
+    name: "sparse.TakeManySparseFromTensorsMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Tan.pbtxt b/tensorflow/core/api_def/java_api/api_def_Tan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..560ca546b765d6aeef71da8a7aed031ff3dc59d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Tan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Tan"
+  endpoint {
+    name: "math.Tan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Tanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90e441808f9de9d18e98cd6fcdf2012204f3b61c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Tanh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Tanh"
+  endpoint {
+    name: "math.Tanh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TanhGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_TanhGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0837d04874dc57246813993d4a9722a64c9a4e0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TanhGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TanhGrad"
+  endpoint {
+    name: "math.TanhGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TemporaryVariable.pbtxt b/tensorflow/core/api_def/java_api/api_def_TemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ce760f9c94f66f4f4bdfeac0ceaab9f3f04d3bb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TemporaryVariable.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TemporaryVariable"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArray.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArray.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e315486af2ab109b011349ffa9d44b7b5cfde945
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArray.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArray"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..951ace8005b8b17cf1b02c15834049b3c2226566
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayClose"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fd2d02592ee647ea9b07c7d3ce8f59df14e72ce
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayCloseV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a065779790d682b4e5e7db020e8f1110a3a6d28
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayCloseV3"
+  endpoint {
+    name: "TensorArrayClose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5071110939f85c0887814462ed1c39e27d0b132
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcat"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e92cebf2a794a4b6a72be7bf8180b0a224c4579d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcatV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34d09c901a07d3133723d0cdc8e7206adaddb2a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayConcatV3"
+  endpoint {
+    name: "TensorArrayConcat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95866b9778e731b47c1dd101130261fe31e091e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGather"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f75b50c6670fcbc9db7b773e804b37b7be35f470
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGatherV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b792ee988267c1f28c484cacba7600e3cea4a8a3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayGatherV3"
+  endpoint {
+    name: "TensorArrayGather"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..beb9b5ca12d754b1d4d064d36b05c8d8f623f9cc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41d25a491015bcef366bb0b90039909601724de0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGradV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c9adebd044c921ce437df32ae42649c7aadc2db
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayGradV3"
+  endpoint {
+    name: "TensorArrayGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGradWithShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradWithShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6804fb697bcfea87fbeb8b78914db19603c32cb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradWithShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayGradWithShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayPack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayPack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..030950b06fcbf2fd5e0c2eed99e154640e0adbec
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayPack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayPack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayRead.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72704746a5c94077640156107d48f9f1bf30b79d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayRead.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayRead"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43cd0a2b7867455348183b5c375643e44cd5da11
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayReadV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6d38d64df94575e015c96501ad2c44c0dc6bce3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayReadV3"
+  endpoint {
+    name: "TensorArrayRead"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayScatter.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76092a45ed2d9b76b68e964d15372a1d45974cb4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatter"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7dba0fab4cd0b7337da7cd136dfbebd1b2d4ac2c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatterV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..179c9611f5854171ef5427c2e2db67ea18e6ae5e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayScatterV3"
+  endpoint {
+    name: "TensorArrayScatter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb2be098c67c53ce43acfa6bc11f05b0babfa037
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e8e44cfe2969e81a6f130578475db23d83f47f7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySizeV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2df9a2d3f135038d4f61eeaae61681b19adb730f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArraySizeV3"
+  endpoint {
+    name: "TensorArraySize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySplit.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..105031eb983d2f681c6c7dd12b557e4a509f805e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplit"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef5d88832aab642546c5df9523e33a4643a2dc03
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplitV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..721af074d0d538a95e5dbb95a3071b9e8b2f3ddc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArraySplitV3"
+  endpoint {
+    name: "TensorArraySplit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayUnpack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayUnpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9011de23ea12eee3de3f3ba83ff86907c6c967b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayUnpack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayUnpack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43a441a07101408b31362e302dedadb8d5585ffd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b87617a1f1712ffe9bbabe6aed81e4e5a0abbee
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayV3"
+  endpoint {
+    name: "TensorArray"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayWrite.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayWrite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2462dae80d96a1c90d85ca48a7dbbe528c3e996a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayWrite.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWrite"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9f670ae18162956a19edf6708072ca02c716adb8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWriteV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7321057b2f0da7a789ac86ee4567464477576df0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayWriteV3"
+  endpoint {
+    name: "TensorArrayWrite"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ed0ead6e7ab7187a67c152d94834cdb0cc0ccfac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorDataset"
+  endpoint {
+    name: "data.TensorDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestCreateTreeVariable.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestCreateTreeVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2410a0dd7dca78eaff1096365392beb44393d3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestCreateTreeVariable.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestCreateTreeVariable"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeDeserialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..724bdb282d0252652865c6625dd6ddbce964c918
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeDeserialize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeDeserialize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeIsInitializedOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e93af8508f3b8a6ca3ffe505a567ec2f6bc548b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeIsInitializedOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreePredict.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreePredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73770fa2913ec24662b0d8a43f57c5d1d99d91df
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreePredict.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreePredict"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeResourceHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2ef0ee5dd34187ce814beb23edceb47d6dca988
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeResourceHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSerialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSerialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d10f9e632b649036fdcf34ae35014070df42ac63
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSerialize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeSerialize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d81f1ea8c01d3a9a359f7eb16e2a8d61e7255e9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListConcatLists.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListConcatLists.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3fa6265e1082369a9c42c3286b44da800496de6b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListConcatLists.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListConcatLists"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListElementShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListElementShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..396a0cfa8fe7142defa30d046c834773bf5118d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListElementShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListElementShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListFromTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListFromTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3179feddd6042ae483e11f73500ef8088ac3555e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListFromTensor.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListFromTensor"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..580d34b68f5fbc5d2c75ba492589b73146d5f261
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGather"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListGetItem.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListGetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c47208fa0525ccc7f91711bade66b0c86b914a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListGetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGetItem"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListLength.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListLength.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ec689d71c821a0648c10f94b6699a07f709baca
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListLength.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListLength"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListPopBack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListPopBack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d0d6ed55b34ef3a3016b6ec085f0987ff1cc562
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListPopBack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPopBack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListPushBack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListPushBack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..285351cf4f308e9d330ae2cc6aff034ec9911d85
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListPushBack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListPushBackBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f33d4926018f6ebad79c7e2e69fca9a1966eb5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListPushBackBatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBackBatch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListReserve.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListReserve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..533660068879237f7bde3d5f8cc51c6163c11c51
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListReserve.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListReserve"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListScatter.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListScatter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3a56f12928141b2541cf009b603982ca864870e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListScatter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListScatter"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListSetItem.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListSetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..002e2a9bd37c2e6a2b41ba43237278bc42119bf7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListSetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListSetItem"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListStack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListStack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b03c86e574c85a65b7b91bb73ae349d9783125
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListStack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListStack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3729a025e66e30c558b283d1ba596d812bbea044
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorSliceDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorSliceDataset"
+  endpoint {
+    name: "data.TensorSliceDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..165478d3a0f764dfdd46c451b653952d53be3c9d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorSummary"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorSummaryV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c285ada0128fca3f97d3c14f60ca15906d9cb4eb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorSummaryV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorSummaryV2"
+  endpoint {
+    name: "summary.TensorSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TextLineDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3c75d5703b52ad0656f84cbef8ec11a0010198b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TextLineDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TextLineDataset"
+  endpoint {
+    name: "data.TextLineDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TextLineReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_TextLineReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f86b15cf86b34b3aed2121aa040ead096ae48102
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TextLineReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TextLineReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TextLineReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TextLineReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee57dd84082dee03df452967437261eb9dbfaea6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TextLineReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TextLineReaderV2"
+  endpoint {
+    name: "io.TextLineReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ef4a834781fa308678561e06fab079b0c8e76bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ThreadUnsafeUnigramCandidateSampler"
+  endpoint {
+    name: "random.ThreadUnsafeUnigramCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Tile.pbtxt b/tensorflow/core/api_def/java_api/api_def_Tile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e4e63a2228d126561142d678c0454fed22dad1b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Tile.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Tile"
+  endpoint {
+    name: "Tile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TileGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_TileGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cd975c02f6b74f95b01f3fae4f94bfec0a72490
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TileGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TileGrad"
+  endpoint {
+    name: "train.TileGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Timestamp.pbtxt b/tensorflow/core/api_def/java_api/api_def_Timestamp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ebc664ae15f45b937760beffbd2de1570c6ad44
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Timestamp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Timestamp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TopK.pbtxt b/tensorflow/core/api_def/java_api/api_def_TopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb090aa6f141d84f8b85513ae55f95da9827813e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TopK.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TopK"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TopKV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b0dcf7c2a2658c16fe6a1a1c22bd2ad4fab1190
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TopKV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TopKV2"
+  endpoint {
+    name: "nn.TopK"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Transpose.pbtxt b/tensorflow/core/api_def/java_api/api_def_Transpose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad7900c00b21cc2c1921899f39e562b7096d0832
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Transpose.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Transpose"
+  endpoint {
+    name: "linalg.Transpose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_TruncateDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fbd6823401778512d1aec18e24b9870daf3bd90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TruncateDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TruncateDiv"
+  endpoint {
+    name: "math.TruncateDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/java_api/api_def_TruncateMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d1ae9a14fafc8556828dc29484bdbc269e9ac56
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TruncateMod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TruncateMod"
+  endpoint {
+    name: "math.TruncateMod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TruncatedNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_TruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2dd52c955841971bedae10fc0301affd783969a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TruncatedNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TruncatedNormal"
+  endpoint {
+    name: "random.TruncatedNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TryRpc.pbtxt b/tensorflow/core/api_def/java_api/api_def_TryRpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ca476086a8a0a135d9c02388e3eead5e4f7f5d0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TryRpc.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TryRpc"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unbatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unbatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..513b05593a1bbca7bef198a6d36efdd8e986eb30
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unbatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Unbatch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24907c804b0c80d8d3038a8eddb1fd412b9e3ab5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  endpoint {
+    name: "data.UnbatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnbatchGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnbatchGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce612f84f6f5f66e0e3a8523d57c13cb0d9e7a90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnbatchGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "UnbatchGrad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnicodeDecodeWithOffsets.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnicodeDecodeWithOffsets.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..16cc033140c37d00fd4057d68fb07711903fa790
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnicodeDecodeWithOffsets.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeDecodeWithOffsets"
+  endpoint {
+    name: "strings.UnicodeDecodeWithOffsets"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnicodeScript.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a873151d5fc612e67ae2d0ae1d95c85ce7c774d2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnicodeScript.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeScript"
+  endpoint {
+    name: "strings.UnicodeScript"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnicodeTranscode.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnicodeTranscode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..131cc6169c7771653f31e830e5947d02d8874d1a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnicodeTranscode.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeTranscode"
+  endpoint {
+    name: "strings.UnicodeTranscode"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniformCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7a56c2a6ebd96f7a2d321748bc402a9e007c6da
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniformCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UniformCandidateSampler"
+  endpoint {
+    name: "random.UniformCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unique.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unique.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cc8ec0feddaf2fc89e57121cf23e5c58f0861f5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unique.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Unique"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fe22cb1020a9378a2d591f0dd5257a80014f7f9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniqueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UniqueV2"
+  endpoint {
+    name: "Unique"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithCounts.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithCounts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0248fab17e6594c357b18a4b0d12273b94181d0f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithCounts.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UniqueWithCounts"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb157451143c0a795704755f02850afafa765175
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UniqueWithCountsV2"
+  endpoint {
+    name: "UniqueWithCounts"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unpack.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3ad63e8f8027ec67d5827ee7bac88a19b316187
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unpack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Unpack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a08dc8f1e70acfc7bf7760c648087ce022f8835
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "UnravelIndex"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58d8740d56eabd28212dd3059eec59822869d03
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentMax"
+  endpoint {
+    name: "math.UnsortedSegmentMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f17ab4624e736489b6804d0c1123b3436bd542c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentMin"
+  endpoint {
+    name: "math.UnsortedSegmentMin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentProd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c594941bcc0bdc12d0eafe35d676bb7c7c99dfe7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentProd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentProd"
+  endpoint {
+    name: "math.UnsortedSegmentProd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e10cf8a6c2c076314ee749ba7d307921d411b994
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentSum"
+  endpoint {
+    name: "math.UnsortedSegmentSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unstage.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31a97cf84db28567856c72d53e4c7f54124504dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unstage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Unstage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UpperBound.pbtxt b/tensorflow/core/api_def/java_api/api_def_UpperBound.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..229a6ddfc365d0b89845478741c48c6cc67348b1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UpperBound.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "UpperBound"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VarHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_VarHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30bdace0e6e9e796233cf8056147ca3884b2b4af
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VarHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "VarHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VarIsInitializedOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_VarIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3992019bc753352ad573a2eb0061fa1583c5133
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VarIsInitializedOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "VarIsInitializedOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Variable.pbtxt b/tensorflow/core/api_def/java_api/api_def_Variable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0978e61451b6dd1f2fdcd3f5f8625f3e6ccee777
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Variable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Variable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VariableShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_VariableShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38c63b5b7015c09bf2046ae0cf670732c6dd84f4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VariableShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "VariableShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VariableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_VariableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c566dd1e79ffb289c2127a077232a952f54f7038
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VariableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "VariableV2"
+  endpoint {
+    name: "Variable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Where.pbtxt b/tensorflow/core/api_def/java_api/api_def_Where.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f130181a6e3afe451f68509d4f8c01155d93f77c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Where.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Where"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_While.pbtxt b/tensorflow/core/api_def/java_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d0f3b07633aa8c97428f09dd27af93b2a89855e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_While.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "While"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WholeFileReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_WholeFileReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa839ed38019de504c3c92dd1795cf109de9d0c6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WholeFileReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WholeFileReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WholeFileReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_WholeFileReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e031d705fb2dd266da7dd436b5bc68811cdce2b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WholeFileReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WholeFileReaderV2"
+  endpoint {
+    name: "io.WholeFileReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WindowDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_WindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..69f12c55e1d1bdcfaf6752778408432d9db20c90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WindowDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WindowDataset"
+  endpoint {
+    name: "data.WindowDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteAudioSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteAudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fcd0df85c2c4bc7f5061bb6d2f4ca5b74ff0e4c7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteAudioSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteAudioSummary"
+  endpoint {
+    name: "summary.WriteAudioSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteFile.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2d6a5bace44397b51f0fa67dc55d1ded73febc8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteFile.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteFile"
+  endpoint {
+    name: "io.WriteFile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteGraphSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteGraphSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e461bbba333a73cf99c9004dcc31e5fdb343422
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteGraphSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteGraphSummary"
+  endpoint {
+    name: "summary.WriteGraphSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteHistogramSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteHistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c991079032593a7c8811283bda4ee8e318786831
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteHistogramSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteHistogramSummary"
+  endpoint {
+    name: "summary.WriteHistogramSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteImageSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08bf0adb2f26e8d2d308b5753bb2fcd0637328f3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteImageSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteImageSummary"
+  endpoint {
+    name: "summary.WriteImageSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteScalarSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bc7c16a0c7d19937c0acb4bfde7d89ad79628d6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteScalarSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteScalarSummary"
+  endpoint {
+    name: "summary.WriteScalarSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1141cb6dbb16b984057aab3053b9bca770cabbad
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteSummary"
+  endpoint {
+    name: "summary.WriteSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Xdivy.pbtxt b/tensorflow/core/api_def/java_api/api_def_Xdivy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13a94b8a56eed2b3c132a0baa169ab00732105b6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Xdivy.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Xdivy"
+  endpoint {
+    name: "math.Xdivy"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Xlogy.pbtxt b/tensorflow/core/api_def/java_api/api_def_Xlogy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52f457d6458dec89f5a1aa8e2b5ec978f3bafcfa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Xlogy.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Xlogy"
+  endpoint {
+    name: "math.Xlogy"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ZerosLike.pbtxt b/tensorflow/core/api_def/java_api/api_def_ZerosLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef17aba9b4431e04f6e78fc9d6099db4ac3eb7a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ZerosLike.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ZerosLike"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Zeta.pbtxt b/tensorflow/core/api_def/java_api/api_def_Zeta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b602bbc7e1f5e877d64b2636b6e49b8c226735f6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Zeta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Zeta"
+  endpoint {
+    name: "math.Zeta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ZipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e32362bef2b9c2d042dc097a7c321d0261ce787a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ZipDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ZipDataset"
+  endpoint {
+    name: "data.ZipDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
index d51defc376ff9a0961ed5bd43b848ea3f6df288d..bc8cc309f552e93e1dd6ff1fb0d74f8fda0cd1f7 100644
--- a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
@@ -2,6 +2,10 @@ op {
   graph_op_name: "AsString"
   endpoint {
     name: "dtypes.as_string"
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "strings.as_string"
   }
   endpoint {
     name: "as_string"
diff --git a/tensorflow/core/api_def/python_api/api_def_AudioSpectrogram.pbtxt b/tensorflow/core/api_def/python_api/api_def_AudioSpectrogram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38e4b38f3f8a5c386a0c4b56d5469ab0c5dd0a7d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AudioSpectrogram.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AudioSpectrogram"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6fd4691f74f51e14ea43a26fdac9d3e87fa1140
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DecodeWav"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodeWav.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..edcf2bded125cba51053d5b401d03f21b8649595
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EncodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeWav"
+  endpoint {
+    name: "audio.encode_wav"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_KMC2ChainInitialization.pbtxt b/tensorflow/core/api_def/python_api/api_def_KMC2ChainInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e8ec113f81ee7b8049b19201da41ae9206c63cb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_KMC2ChainInitialization.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "KMC2ChainInitialization"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_KmeansPlusPlusInitialization.pbtxt b/tensorflow/core/api_def/python_api/api_def_KmeansPlusPlusInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83aa3a3a69f4120f14bc0dc72d368953132b7eee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_KmeansPlusPlusInitialization.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "KmeansPlusPlusInitialization"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Mfcc.pbtxt b/tensorflow/core/api_def/python_api/api_def_Mfcc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21a0c8b8b2448ec38b6819712ef0980f47afdd84
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Mfcc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Mfcc"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NearestNeighbors.pbtxt b/tensorflow/core/api_def/python_api/api_def_NearestNeighbors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad05bb34862362530d90a1df67a1b46376b107ab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NearestNeighbors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NearestNeighbors"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NextAfter.pbtxt b/tensorflow/core/api_def/python_api/api_def_NextAfter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9188a0d6bd8c41ea766ff91bca9a2df97145bb13
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NextAfter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NextAfter"
+  endpoint {
+    name: "math.nextafter"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ff1d01db6bf5279c99c9305c1eec97ed8b6e84f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterNdSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb3c3dbdb110c7a04a44b7c201bc3b432565139c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatefulStandardNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d689d4f2b16a9e18064fe9c8be09650a3e4a641
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListResize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a92a529d361cd3684d4306cb82bb3648e2b7e9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatterV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index 92e56df18105218fc8a5112a880b6c999f1a2649..03c0e9ce2b6e7c508f902f45e3c2d3284e081028 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -296,4 +296,37 @@ Status BaseCollectiveExecutor::CreateCollective(
   return status;
 }
 
+bool BaseCollectiveExecutor::CheckDependencies(
+    const CollectiveParams& col_params) {
+  for (int32 instance : col_params.instance.impl_details.dependencies) {
+    auto find_iter = launched_.find(instance);
+    if (find_iter == launched_.end() || find_iter->second != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void BaseCollectiveExecutor::WaitForDependencies(
+    const CollectiveParams& col_params) {
+  mutex_lock l(launch_mu_);
+  while (!CheckDependencies(col_params)) {
+    launch_cv_.wait(l);
+  }
+}
+
+void BaseCollectiveExecutor::Launched(const CollectiveParams& col_params) {
+  mutex_lock l(launch_mu_);
+  if (launched_.find(col_params.instance.instance_key) == launched_.end()) {
+    const string& task_name =
+        col_params.instance.task_names[col_params.default_rank];
+    const int32 num_devices =
+        col_params.instance.num_devices_per_task.at(task_name);
+    launched_[col_params.instance.instance_key] = num_devices;
+  }
+  if (--launched_[col_params.instance.instance_key] == 0) {
+    launch_cv_.notify_all();
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index 09826a8814511cb46c907b983f240fe17df70e3d..b711aa6d5004947466d25e5e6e7d1be9216a2afb 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -135,15 +135,33 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
                                client_locality, done);
   }
 
+  // If we need to enforce an ordering on any portion of collective
+  // implementation, and the ordering is encoded via attribute on the collective
+  // op, this function will block until all dependencies for this collective
+  // have completed.
+  void WaitForDependencies(const CollectiveParams& col_params) override;
+  // Record that this collective has completed the portion of the implementation
+  // that needs to be ordered wrt other collectives, to unblock any of its
+  // dependent ops.
+  void Launched(const CollectiveParams& col_params) override;
+
  protected:
   const int64 step_id_;
   const DeviceMgr* dev_mgr_;  // Not owned.
   std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
   const string* gpu_ring_order_;  // Not owned.
+  mutex launch_mu_;
+  condition_variable launch_cv_;
+  // collective instance key -> number of local devices for which NCCL ops have
+  // been launched.
+  std::unordered_map<int32, int32> launched_ GUARDED_BY(launch_mu_);
 
  private:
   Status CreateCollective(const CollectiveParams& col_params,
                           CollectiveImplementationInterface** col_impl);
+  // Check if all ops on which this collective depends on have launched.
+  bool CheckDependencies(const CollectiveParams& col_params)
+      EXCLUSIVE_LOCKS_REQUIRED(launch_mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/build_graph_options.cc b/tensorflow/core/common_runtime/build_graph_options.cc
index 00f7a8e6452f9cc234c5868437d40ccc99dbaf87..b095fcfa3bb1fea6fccafa3015734e71582a6829 100644
--- a/tensorflow/core/common_runtime/build_graph_options.cc
+++ b/tensorflow/core/common_runtime/build_graph_options.cc
@@ -35,6 +35,19 @@ string BuildGraphOptions::DebugString() const {
   if (collective_graph_key != kNoCollectiveGraphKey) {
     strings::StrAppend(&rv, "\ncollective_graph_key: ", collective_graph_key);
   }
+  string collective_order_str;
+  switch (collective_order) {
+    case GraphCollectiveOrder::kNone:
+      collective_order_str = "none";
+      break;
+    case GraphCollectiveOrder::kEdges:
+      collective_order_str = "edges";
+      break;
+    case GraphCollectiveOrder::kAttrs:
+      collective_order_str = "attrs";
+      break;
+  }
+  strings::StrAppend(&rv, "\ncollective_order: ", collective_order_str);
   return rv;
 }
 
diff --git a/tensorflow/core/common_runtime/build_graph_options.h b/tensorflow/core/common_runtime/build_graph_options.h
index 3d0f242ea5177fd5a99a925f998ec5252a313327..24b71cc741df325617b0c129b4b592c28fcc57cd 100644
--- a/tensorflow/core/common_runtime/build_graph_options.h
+++ b/tensorflow/core/common_runtime/build_graph_options.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/graph/collective_order.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
@@ -34,6 +35,11 @@ struct BuildGraphOptions {
   static const int64 kNoCollectiveGraphKey = 0;
   int64 collective_graph_key = kNoCollectiveGraphKey;
 
+  // If not `kNone`, order all CollectiveReduce operations statically and
+  // deterministically.  If `kEdges`, encode dependencies as explicit control
+  // edges, if `kAttrs` encode as attribute on collective op.
+  GraphCollectiveOrder collective_order = GraphCollectiveOrder::kNone;
+
   string DebugString() const;
 };
 
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index a8e3f4c881afc9c37ce4b5196c32ec591be5506d..8907f6d56a6d098b690b8e79cdc12a5da013d7d1 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -39,7 +40,10 @@ void CollectiveParamResolverLocal::InstanceRec::WaitForOutMu(mutex_lock& lock) {
 CollectiveParamResolverLocal::CollectiveParamResolverLocal(
     const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
     const string& task_name)
-    : dev_mgr_(dev_mgr), dev_resolver_(dev_resolver), task_name_(task_name) {}
+    : nccl_(false),  // (b/111897089): turn on NCCL collectives.
+      dev_mgr_(dev_mgr),
+      dev_resolver_(dev_resolver),
+      task_name_(task_name) {}
 
 void CollectiveParamResolverLocal::CompleteGroupAsync(
     const CompleteGroupRequest* request, CompleteGroupResponse* response,
@@ -316,29 +320,28 @@ GlobalDeviceMap EstablishGlobalRank(
 // cp->same_num_devices_per_task.  Requires cp->instance.task_names
 // be sorted.
 void SetDevPerTask(CollectiveParams* cp) {
-  cp->instance.same_num_devices_per_task = false;
-  if (cp->instance.task_names.empty()) return;
-  int dev_per_task = -1;
-  int count = 0;
+  cp->instance.num_devices_per_task.clear();
   const string* last_task_name = &cp->instance.task_names[0];
+  int count = 0;
   for (const string& task_name : cp->instance.task_names) {
-    if (task_name != *last_task_name) {
-      CHECK_GT(count, 0);
-      if (dev_per_task < 0) {
-        dev_per_task = count;
-      } else {
-        CHECK_GT(dev_per_task, 0);
-        if (count != dev_per_task) return;
-      }
+    if (task_name == *last_task_name) {
+      ++count;
+    } else {
+      cp->instance.num_devices_per_task[*last_task_name] = count;
       count = 1;
       last_task_name = &task_name;
-    } else {
-      ++count;
     }
   }
-  CHECK_GT(count, 0);
-  if ((dev_per_task > 0) && (count != dev_per_task)) {
-    return;
+  cp->instance.num_devices_per_task[*last_task_name] = count;
+
+  cp->instance.same_num_devices_per_task = false;
+  int dev_per_task = -1;
+  for (const auto& task_dev : cp->instance.num_devices_per_task) {
+    if (dev_per_task == -1) {
+      dev_per_task = task_dev.second;
+    } else if (dev_per_task != task_dev.second) {
+      return;
+    }
   }
   cp->instance.same_num_devices_per_task = true;
   CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
@@ -398,7 +401,6 @@ void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
 void CollectiveParamResolverLocal::InitInstanceSharedParams(
     const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
     const StatusCallback& done) {
-  VLOG(1) << "InitInstanceSharedParams " << ir;
   ir->shared.instance = cp->instance;
   {
     mutex_lock gl(gr->mu);
@@ -412,8 +414,8 @@ void CollectiveParamResolverLocal::InitInstanceSharedParams(
   }
   ir->shared.default_rank = -1;
 
-  // Sort devce_names lexicographcally, keeping task_names in
-  // corresponding order.
+  // Sort device_names lexicographically, keeping task_names in corresponding
+  // order.  Also set number of devices per task.
   SortDevicesAndTasks(&ir->shared);
 
   // Get Locality data for all devices.
@@ -605,6 +607,25 @@ void CollectiveParamResolverLocal::CompleteInstanceAsync(
                        "intended only for non-distributed deployment."));
 }
 
+// TODO(b/111897089): we need a better way to pick the collective
+// implementation.  The ideal way would depend upon the topology and link
+// strength before picking a particular implementation.
+void CollectiveParamResolverLocal::AssignCollectiveType(CollectiveParams* cp) {
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    cp->instance.impl_details.collective_name = "HierarchicalTreeBroadcast";
+  } else if (cp->instance.type == REDUCTION_COLLECTIVE) {
+    if (nccl_) {
+      cp->instance.impl_details.collective_name = "NcclReduce";
+    } else {
+      cp->instance.impl_details.collective_name = "RingReduce";
+    }
+  } else {
+    cp->instance.impl_details.collective_name = "undef";
+  }
+  VLOG(1) << "AssignCollectiveType "
+          << cp->instance.impl_details.collective_name;
+}
+
 void CollectiveParamResolverLocal::CompleteInstanceLocal(
     const string& device, const GroupRec* gr, CollectiveParams* cp,
     bool is_source, const StatusCallback& done) {
@@ -641,48 +662,57 @@ void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
     // custom operator= does a deep copy.
     cp->instance = ir->shared.instance;
   }
-  // Populate the fields common across task, also default_rank.
+  // Populate the fields common across task.
+  AssignCollectiveType(cp);
   SetDefaultRank(device, cp);
   CompleteTaskIsLocal(task_name_, cp);
-  // TODO(b/113171733): we need a better way to pick the collective
-  // implementation.  The ideal way would depend upon the topology and link
-  // strength before picking a particular implementation.
-  cp->instance.impl_details.collective_name =
-      (cp->instance.type == BROADCAST_COLLECTIVE) ? "HierarchicalTreeBroadcast"
-                                                  : "RingReduce";
+
   CollectiveImplementationInterface* col_impl;
-  Status lookup_status = CollectiveRegistry::LookupParamResolverInstance(
+  Status status = CollectiveRegistry::LookupParamResolverInstance(
       cp->instance.impl_details.collective_name, &col_impl);
-  if (!lookup_status.ok()) {
-    done(lookup_status);
+  if (status.ok()) {
+    status = col_impl->InitializeInstanceBeforeGroupDiscovery(cp);
+  }
+  if (!status.ok()) {
+    done(status);
     return;
   }
-  // If broadcast, may need to wait for source discovery.
-  if (cp->instance.type == BROADCAST_COLLECTIVE) {
-    CompleteInstanceSource(ir, cp, is_source,
-                           [col_impl, ir, device, cp, done](InstanceRec* irec) {
-                             CHECK_EQ(ir, irec);
-                             Status s;
-                             {
-                               mutex_lock l(irec->out_mu);
-                               irec->WaitForOutMu(l);
-                               s = irec->status;
-                               cp->source_rank = irec->source_rank;
-                             }
-                             if (s.ok()) {
-                               s = col_impl->InitializeCollectiveParams(cp);
-                             }
-                             done(s);
-                           });
+
+  //  We may need to wait for the group if:
+  //  * this is a broadcast, for source discovery;
+  //  * we are using NCCL with more than 1 worker, for the communicator key from
+  //    rank 0.
+  bool broadcast = cp->instance.type == BROADCAST_COLLECTIVE;
+  bool nccl = cp->instance.type == REDUCTION_COLLECTIVE &&
+              cp->instance.impl_details.collective_name == "NcclReduce" &&
+              cp->group.num_tasks > 1;
+  if (broadcast || nccl) {
+    WaitForGroup(ir, cp, is_source, broadcast, nccl,
+                 [col_impl, ir, device, cp, done](InstanceRec* irec) {
+                   Status s;
+                   if (ir != irec) {
+                     s = errors::Internal("Expected ir ", ir, " and irec ",
+                                          irec, " to be equal");
+                   } else {
+                     mutex_lock l(irec->out_mu);
+                     irec->WaitForOutMu(l);
+                     s = irec->status;
+                     cp->source_rank = irec->source_rank;
+                     cp->instance.communicator_key = irec->communicator_key;
+                   }
+                   if (s.ok()) {
+                     s = col_impl->InitializeCollectiveParams(cp);
+                   }
+                   done(s);
+                 });
   } else {
     done(col_impl->InitializeCollectiveParams(cp));
   }
 }
 
-void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
-                                                          CollectiveParams* cp,
-                                                          bool is_source,
-                                                          const IRConsumer& f) {
+void CollectiveParamResolverLocal::WaitForGroup(
+    InstanceRec* ir, CollectiveParams* cp, bool is_source, bool init_source,
+    bool init_nccl, const IRConsumer& f) {
   std::vector<IRConsumer> ready_waiters;
   {
     mutex_lock l(ir->out_mu);
@@ -692,7 +722,8 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
     if (!ir->known[cp->default_rank]) {
       ir->known[cp->default_rank] = true;
       ++ir->known_count;
-      if (is_source) {
+      if (init_source && is_source) {
+        // Initialize source rank.
         if (ir->source_rank >= 0) {
           ir->status = errors::Internal("Instance ", cp->instance.instance_key,
                                         " already has source ", ir->source_rank,
@@ -702,13 +733,26 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
           ir->source_rank = cp->default_rank;
         }
       }
+      if (init_nccl && cp->default_rank == 0) {
+        // Initialize communicator key.
+        if (!ir->communicator_key.empty()) {
+          ir->status =
+              errors::Internal("Instance ", cp->instance.instance_key,
+                               " already has communicator_key ",
+                               str_util::CEscape(ir->communicator_key),
+                               ", received second claim from device ",
+                               cp->instance.device_names[cp->default_rank]);
+        } else {
+          ir->communicator_key = cp->instance.communicator_key;
+        }
+      }
     }
     if (ir->known_count < ir->shared.group.group_size) {
       ir->known_waiters.push_back(f);
       return;
     }
     CHECK_EQ(ir->known_count, ir->shared.group.group_size);
-    if (ir->source_rank < 0) {
+    if (init_source && ir->source_rank < 0) {
       // NOTE(ayushd): changing the error message below would also require
       // updating CompleteParamsBroadcastForgotSend test in
       // CollectiveParamResolverLocalTest.
@@ -718,6 +762,13 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
                            "could mean that there were group_size=",
                            ir->known_count, " BcastRecvs but no BcastSend.");
     }
+    if (init_nccl && ir->communicator_key.empty()) {
+      ir->status = errors::Internal(
+          "Instance ", cp->instance.instance_key, " device ",
+          cp->instance.device_names[cp->default_rank],
+          " did not find rank 0 for setting communicator key.  This is an "
+          "internal error in collective param resolution");
+    }
     if (!ir->known_waiters.empty()) {
       ready_waiters = std::move(ir->known_waiters);
     }
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index 365bddc787a7ba3d97f2df29b4ebd2a3c7118ef7..fd408e4ef31bd7bb8ff5f1e28a0b49e4283bc18c 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -130,8 +130,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
     Status status GUARDED_BY(out_mu);
 
     // These fields are used to count the instances that have called
-    // in and become known while resolving broadcast source identity.
+    // in and become known while resolving broadcast source identity and
+    // communicator key.
     int source_rank GUARDED_BY(out_mu);
+    string communicator_key GUARDED_BY(out_mu);
     int known_count GUARDED_BY(out_mu);
     std::vector<bool> known GUARDED_BY(out_mu);
     std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu);
@@ -197,10 +199,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
                                            const StatusCallback& done)
       LOCKS_EXCLUDED(ir->out_mu);
 
-  // Complete source data for a broadcast instance.
+  // Complete source data and/or nccl communicator key.
   // Precondition: *cp has complete group data and default_rank.
-  void CompleteInstanceSource(InstanceRec* ir, CollectiveParams* cp,
-                              bool is_source, const IRConsumer& f)
+  void WaitForGroup(InstanceRec* ir, CollectiveParams* cp, bool is_source,
+                    bool init_source, bool init_nccl, const IRConsumer& f)
       LOCKS_EXCLUDED(ir->out_mu);
 
   // If cp.device_names contains only devices local to this process
@@ -216,10 +218,15 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // current ordering of cp->instance.device_names.
   void SetDefaultRank(const string& device, CollectiveParams* cp);
 
+  // Sets cp->instance.type based on collective op type, and attempts to assign
+  // best implementation.
+  void AssignCollectiveType(CollectiveParams* cp);
+
   // Helper to grab status under lock, invoke callback out of lock.
   void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
       LOCKS_EXCLUDED(irec->out_mu);
 
+  const bool nccl_;
   const DeviceMgr* dev_mgr_;
   DeviceResolverInterface* dev_resolver_;  // Not owned.
   string task_name_;
diff --git a/tensorflow/core/common_runtime/collective_util.cc b/tensorflow/core/common_runtime/collective_util.cc
index 195521a0784fd43f7bcd1b98065c7fcb641d52b4..bee4a13d1826f894b6d81539d7439a37ed1a8cfa 100644
--- a/tensorflow/core/common_runtime/collective_util.cc
+++ b/tensorflow/core/common_runtime/collective_util.cc
@@ -79,5 +79,36 @@ string SubdivPermDebugString(const CollectiveParams& col_params) {
   return buf;
 }
 
+SubContext::SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+                       OpKernel* op, Tensor* output, Tensor* input)
+    : sub_params_(*params),
+      sub_inputs_({output, input}),
+      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
+      sub_input_dc_(
+          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
+  sub_params_.op_kernel = op;
+  sub_params_.inputs = &sub_inputs_;
+  sub_params_.input_alloc_attrs = &sub_input_attr_;
+  sub_params_.input_device_contexts = &sub_input_dc_;
+  sub_params_.eigen_gpu_device = nullptr;
+  sub_params_.ensure_eigen_gpu_device();
+  sub_params_.forward_from_array = &forward_from_;
+  sub_ctx_.reset(new OpKernelContext(&sub_params_, 1));
+}
+
+Status ComputeBinOp(OpKernelContext* op_ctx, OpKernelContext::Params* params,
+                    Device* device, OpKernel* op, Tensor* output,
+                    Tensor* input) {
+  // Prepare an OpKernelContext that is identical to that of the original Op
+  // (i.e. the collective), except for the input output sizes and identities and
+  // the Op itself.
+  // TODO(ayushd, tucker): Is it possible to cache and reuse these objects?
+  // They're mostly identical inside one device execution.
+  std::unique_ptr<SubContext> sub_ctx(
+      new SubContext(op_ctx, params, op, output, input));
+  device->Compute(op, sub_ctx->sub_ctx_.get());
+  return sub_ctx->sub_ctx_->status();
+}
+
 }  // namespace collective_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_util.h b/tensorflow/core/common_runtime/collective_util.h
index ebb5731becadec3b88bea86641887c31b63ae3a5..01fb8b8c81cd2f4dc390c2b6467d7c54c7753bf0 100644
--- a/tensorflow/core/common_runtime/collective_util.h
+++ b/tensorflow/core/common_runtime/collective_util.h
@@ -32,6 +32,27 @@ Status InitializeDeviceAndLocality(const DeviceMgr* dev_mgr,
                                    DeviceLocality* device_locality);
 string SubdivPermDebugString(const CollectiveParams& col_params);
 
+// Used for executing a sub-operation, e.g. a merge_op instance, with
+// an OpKernelContext based on the one passed into this Op.
+class SubContext {
+ public:
+  OpKernelContext::Params sub_params_;
+  gtl::InlinedVector<TensorValue, 4> sub_inputs_;
+  gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
+  gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
+  // Used only for Binary and Unary Ops for which we require
+  // the calculation to be in-place on the first input.
+  int forward_from_ = 0;
+  std::unique_ptr<OpKernelContext> sub_ctx_;
+  SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+             OpKernel* op, Tensor* output, Tensor* input);
+  ~SubContext() = default;
+};
+
+Status ComputeBinOp(OpKernelContext* op_ctx, OpKernelContext::Params* params,
+                    Device* device, OpKernel* op, Tensor* output,
+                    Tensor* input);
+
 }  // namespace collective_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/data/BUILD b/tensorflow/core/common_runtime/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..124862dbb73422e7645fe460576ac35c83f018aa
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/BUILD
@@ -0,0 +1,35 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
+
+cc_library(
+    name = "standalone",
+    srcs = ["standalone.cc"],
+    hdrs = ["standalone.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cc_test(
+    name = "standalone_test",
+    srcs = ["standalone_test.cc"],
+    deps = [
+        ":standalone",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + tf_protos_all(),
+)
diff --git a/tensorflow/core/common_runtime/data/standalone.cc b/tensorflow/core/common_runtime/data/standalone.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05bff566f538970fa857a8a38888cd074a06c2f
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/data/standalone.h"
+
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+
+Status Iterator::GetNext(std::vector<Tensor>* outputs, bool* end_of_input) {
+  return iterator_->GetNext(ctx_.get(), outputs, end_of_input);
+}
+
+Iterator::Iterator(IteratorBase* iterator, IteratorContext* ctx)
+    : iterator_(iterator), ctx_(ctx) {}
+
+Status Dataset::FromGraph(Params params, const GraphDef& graph_def,
+                          const string& fetch_node,
+                          std::unique_ptr<Dataset>* result) {
+  Graph graph(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+
+  // Instantiate enough of the TensorFlow runtime to run `graph` on a single CPU
+  // device.
+  std::unique_ptr<DeviceMgr> device_mgr = MakeUnique<DeviceMgr>(
+      DeviceFactory::NewDevice("CPU", params.session_options, ""));
+  Device* device = device_mgr->ListDevices()[0];
+  // Clone the `FunctionLibraryDefinition` to extend its lifetime extends beyond
+  // the lifetime of `graph`.
+  std::unique_ptr<FunctionLibraryDefinition> flib_def =
+      MakeUnique<FunctionLibraryDefinition>(graph.flib_def());
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr =
+      MakeUnique<ProcessFunctionLibraryRuntime>(
+          device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION,
+          flib_def.get(), OptimizerOptions{}, nullptr /* parent */);
+
+  // Run graph up to `output_node` and extract the `DatasetBase` stored in the
+  // DT_VARIANT output tensor.
+  data::DatasetBase* dataset;
+  {
+    std::vector<Tensor> outputs;
+    GraphRunner graph_runner(device);
+    TF_RETURN_IF_ERROR(graph_runner.Run(&graph, pflr->GetFLR("/device:CPU:0"),
+                                        {}, {fetch_node}, &outputs));
+    TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
+    // NOTE(mrry): The dataset is currently owned by `outputs[0]`, so acquire an
+    // additional reference.
+    dataset->Ref();
+  }
+
+  std::unique_ptr<thread::ThreadPool> pool(
+      NewThreadPoolFromSessionOptions(params.session_options));
+  *result =
+      WrapUnique(new Dataset(dataset, device_mgr.release(), pflr.release(),
+                             flib_def.release(), pool.release()));
+  return Status::OK();
+}  // static
+
+Status Dataset::MakeIterator(std::unique_ptr<Iterator>* result) {
+  // Create an `IteratorContext`, which bundles together the necessary runtime
+  // support to create and get elements from an iterator.
+  std::unique_ptr<IteratorContext> ctx;
+  {
+    // NOTE(mrry): In the current API, an `IteratorContext` is always initially
+    // created from an `OpKernelContext*`, so we need to create a fake
+    // `OpKernelContext` with the appropriate subset of parameters.
+    OpKernelContext::Params op_params;
+    op_params.function_library = pflr_->GetFLR("/device:CPU:0");
+    op_params.device = device_mgr_->ListDevices()[0];
+    op_params.runner = &runner_;
+    OpKernelContext op_ctx(&op_params, 0);
+    IteratorContext::Params params(&op_ctx);
+    params.function_handle_cache = function_handle_cache_.get();
+    ctx = MakeUnique<IteratorContext>(std::move(params));
+  }
+
+  // Create the iterator from the dataset.
+  std::unique_ptr<IteratorBase> iterator;
+  TF_RETURN_IF_ERROR(dataset_->MakeIterator(ctx.get(), "iterator", &iterator));
+
+  *result = WrapUnique(new Iterator(iterator.release(), ctx.release()));
+
+  return Status::OK();
+}
+
+Dataset::Dataset(DatasetBase* dataset, DeviceMgr* device_mgr,
+                 ProcessFunctionLibraryRuntime* pflr,
+                 FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool)
+    : dataset_(dataset),
+      device_mgr_(device_mgr),
+      flib_def_(flib_def),
+      pflr_(pflr),
+      pool_(pool) {
+  runner_ = [this](std::function<void()> c) { pool_->Schedule(std::move(c)); };
+  function_handle_cache_ =
+      MakeUnique<FunctionHandleCache>(pflr_->GetFLR("/device:CPU:0"));
+}
+
+Dataset::~Dataset() { dataset_->Unref(); }
+
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/data/standalone.h b/tensorflow/core/common_runtime/data/standalone.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecea5ba21d0e807b72808c31336916b5f12cb854
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone.h
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
+
+#include <memory>
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+
+// The purpose of the API in this file is to facilitate standalone execution of
+// a tf.data input pipeline graph.
+//
+// The API exposes two abstractions -- a `Dataset` and an `Iterator` -- which
+// encapsulate TensorFlow runtime.
+//
+// The `Dataset` abstraction represents an input pipeline as a collection
+// of data sources and a logical plan of transformations that operate over the
+// data.
+//
+// The `Iterator` abstraction represents an execution of an input pipeline that
+// can be used to enumerate its elements.
+//
+// Example usage:
+//
+//   // Create a `Dataset` by running the `graph_def` graph and fetching the
+//   // output of the `fetch_node` node.
+//   tensorflow::data:standalone::Dataset::Params params;
+//   std::unique_ptr<tensorflow::data::standalone::Dataset> dataset;
+//   Status s = tensorflow::data::standalone::Dataset::FromGraph(
+//      params, graph_def, fetch_node, &dataset);
+//   if (!s.ok()) { /* error handling */ }
+//
+//   std::unique_ptr<tensorflow::data::standalone::Iterator> iterator;
+//   s = dataset->MakeIterator(&iterator);
+//   if (!s.ok()) { /* error handling */ }
+//
+//   bool end_of_input = false;
+//   while (!end_of_input) {
+//     std::vector<tensorflow::Tensor> outputs;
+//     s = iterator->GetNext(&outputs, &end_of_input);
+//     if (!s.ok()) { /* error handling */ }
+//     if (!end_of_input) { /* output handling */ }
+//   }
+
+class Dataset;
+
+// Represents an execution of an input pipeline that can be used to enumerate
+// its elements.
+class Iterator {
+ public:
+  // Returns the next element of the input pipeline (if there is one) and an
+  // indication of whether the end of the input pipeline has been reached.
+  Status GetNext(std::vector<Tensor>* outputs, bool* end_of_input);
+
+ private:
+  friend class Dataset;
+
+  Iterator(IteratorBase* iterator, IteratorContext* ctx);
+
+  std::unique_ptr<IteratorBase> iterator_;
+  std::unique_ptr<IteratorContext> ctx_;
+};
+
+// Represents an input pipeline as a collection of data sources and a logical
+// plan of transformations that operate over the data.
+class Dataset {
+ public:
+  // Parameters for `Dataset` creation (e.g. TensorFlow runtime configuration).
+  struct Params {
+    SessionOptions session_options;
+  };
+
+  // Creates a new `Dataset` instance by running the TensorFlow graph `graph`
+  // and fetching the output of the `fetch_node` node.
+  static Status FromGraph(Params params, const GraphDef& graph_def,
+                          const string& fetch_node,
+                          std::unique_ptr<Dataset>* result);
+
+  ~Dataset();
+
+  // Creates an iterator for this dataset.
+  Status MakeIterator(std::unique_ptr<Iterator>* result);
+
+ private:
+  Dataset(DatasetBase* dataset, DeviceMgr* device_mgr,
+          ProcessFunctionLibraryRuntime* pflr,
+          FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool);
+
+  DatasetBase* dataset_;  // owned
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  std::unique_ptr<thread::ThreadPool> pool_;
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  std::function<void(std::function<void()>)> runner_;
+};
+
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
diff --git a/tensorflow/core/common_runtime/data/standalone_test.cc b/tensorflow/core/common_runtime/data/standalone_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7a7a9b6195c247d94ed137f4bce18cee9851b4
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone_test.cc
@@ -0,0 +1,188 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/data/standalone.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+namespace {
+
+constexpr const char* const kGraphProto = R"proto(
+  node {
+    name: "Const/_0"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 0
+        }
+      }
+    }
+  }
+  node {
+    name: "Const/_1"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 10
+        }
+      }
+    }
+  }
+  node {
+    name: "Const/_2"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 1
+        }
+      }
+    }
+  }
+  node {
+    name: "RangeDataset/_3"
+    op: "RangeDataset"
+    input: "Const/_0"
+    input: "Const/_1"
+    input: "Const/_2"
+    attr {
+      key: "output_shapes"
+      value { list { shape { unknown_rank: true } } }
+    }
+    attr {
+      key: "output_types"
+      value { list { type: DT_INT64 } }
+    }
+  }
+  node {
+    name: "MapDataset/_4"
+    op: "MapDataset"
+    input: "RangeDataset/_3"
+    attr {
+      key: "Targuments"
+      value { list {} }
+    }
+    attr {
+      key: "f"
+      value { func { name: "Dataset_map_<lambda>_10" } }
+    }
+    attr {
+      key: "output_shapes"
+      value { list { shape {} } }
+    }
+    attr {
+      key: "output_types"
+      value { list { type: DT_INT64 } }
+    }
+    attr {
+      key: "preserve_cardinality"
+      value { b: false }
+    }
+    attr {
+      key: "use_inter_op_parallelism"
+      value { b: true }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "Dataset_map_<lambda>_10"
+        input_arg { name: "arg0" type: DT_INT64 }
+        output_arg { name: "mul" type: DT_INT64 }
+        description: "Wrapper for passing nested structures to and from tf.data functions."
+      }
+      node_def {
+        name: "mul_0"
+        op: "Mul"
+        input: "arg0"
+        input: "arg0"
+        attr {
+          key: "T"
+          value { type: DT_INT64 }
+        }
+      }
+      ret { key: "mul" value: "mul_0:z:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 }
+)proto";
+
+TEST(Scalar, Standalone) {
+  GraphDef graph_def;
+  protobuf::TextFormat::ParseFromString(kGraphProto, &graph_def);
+  struct TestCase {
+    string fetch_node;
+    std::vector<int64> expected_outputs;
+  };
+  auto test_cases = {
+      TestCase{"RangeDataset/_3", {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}},
+      TestCase{"MapDataset/_4", {0, 1, 4, 9, 16, 25, 36, 49, 64, 81}},
+  };
+  for (auto test_case : test_cases) {
+    std::unique_ptr<Dataset> dataset;
+    auto s = Dataset::FromGraph({}, graph_def, test_case.fetch_node, &dataset);
+    TF_EXPECT_OK(s);
+    std::unique_ptr<Iterator> iterator;
+    s = dataset->MakeIterator(&iterator);
+    TF_EXPECT_OK(s);
+    bool end_of_input = false;
+    for (int num_outputs = 0; !end_of_input; ++num_outputs) {
+      std::vector<tensorflow::Tensor> outputs;
+      s = iterator->GetNext(&outputs, &end_of_input);
+      TF_EXPECT_OK(s);
+      if (!end_of_input) {
+        EXPECT_EQ(outputs[0].scalar<int64>()(),
+                  test_case.expected_outputs[num_outputs]);
+      } else {
+        EXPECT_EQ(test_case.expected_outputs.size(), num_outputs);
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 8dfbb21eda641ff9f70c58f1f4bf150ba4cceef3..5a0ef28ff22a9bf67cb4355b6d5373e957eb8df0 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -44,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -122,9 +123,21 @@ class Device : public DeviceBase {
   // version.
   virtual void Sync(const DoneCallback& done);
 
-  // Override this to return true for devices that require a Sync() call before
-  // session completion.
-  virtual bool RequiresSyncOnCompletion() const { return false; }
+  // On session completion, the executor may call Device::Sync() depending on
+  // flag settings. Override this to return false for devices that don't allow
+  // such calls. Instead, these devices must use other mechanisms (such as
+  // num_deferred_ops) to ensure the device has finished processing necessary
+  // work at session completion.
+  //
+  // Devices that override this function must also implement CurrentStatus.
+  virtual bool AllowsSyncOnCompletion() const { return true; }
+
+  // This is used in conjunction with AllowsSyncOnCompletion to allow the
+  // executor to get execution result status at session completion.
+  virtual Status CurrentStatus() {
+    return errors::Unimplemented(
+        "CurrentStatus is not supported on this device.");
+  }
 
   // Optionally modify the device's GraphDef before execution.
   //
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0434ca47b68f28ff65cb3d5e165bc5545ebe96f0..80b62f273ce785c700f93ed68af1af9429276f79 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -58,6 +58,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -302,10 +303,8 @@ DirectSession::DirectSession(const SessionOptions& options,
   if (!status.ok()) {
     LOG(ERROR) << status.error_message();
   }
-  // NOTE(mrry): We do not need to use a unique string for the session
-  // handle, because DirectSession owns its devices. This may change
-  // in future versions.
-  session_handle_ = "direct";
+  session_handle_ =
+      strings::StrCat("direct", strings::FpToString(random::New64()));
   int devices_added = 0;
   if (options.config.log_device_placement()) {
     const string mapping_str = device_mgr_->DeviceMappingString();
@@ -370,6 +369,7 @@ Status DirectSession::MaybeInitializeExecutionState(
   GraphExecutionStateOptions options;
   options.device_set = &device_set_;
   options.session_options = &options_;
+  options.session_handle = session_handle_;
   // TODO(mrry,suharshs): We explicitly copy `graph` so that
   // `MakeForBaseGraph()` can take ownership of its
   // contents. Previously this happened implicitly in calls to the
@@ -532,6 +532,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   CancellationManager step_cancellation_manager;
   args.cancellation_manager = &step_cancellation_manager;
   args.session_state = &session_state_;
+  args.session_handle = session_handle_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
   args.sync_on_finish = sync_on_finish_;
@@ -718,7 +719,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
-  UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
+  metrics::UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
 
   return Status::OK();
 }
@@ -887,6 +888,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
     SchedClosure(pool, std::move(c));
   };
   args.session_state = &session_state_;
+  args.session_handle = session_handle_;
   args.tensor_store = &run_state->tensor_store;
   args.step_container = &run_state->step_container;
   if (LogMemory::IsEnabled()) {
@@ -1189,6 +1191,10 @@ Status DirectSession::CreateExecutors(
   options.use_function_convention = !run_state_args->is_partial_run;
   options.collective_graph_key =
       callable_options.run_options().experimental().collective_graph_key();
+  if (options_.config.experimental()
+          .collective_deterministic_sequential_execution()) {
+    options.collective_order = GraphCollectiveOrder::kEdges;
+  }
 
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
   std::unique_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
@@ -1464,6 +1470,7 @@ Status DirectSession::CreateGraphs(
     prune_options.device_set = &device_set_;
     prune_options.session_options = &options_;
     prune_options.stateful_placements = stateful_placements_;
+    prune_options.session_handle = session_handle_;
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForPrunedGraph(
         execution_state_->original_graph_def().library(), prune_options,
         execution_state_->original_graph_def(), subgraph_options,
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 6754e9cfb71700090049107cf4dd122175527ffe..bcac34154407eb461a80fd3d638ee51a88f3d7fa 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -317,6 +317,7 @@ class DirectSession : public Session {
   std::vector<Device*> devices_;  // not owned
   DeviceSet device_set_;
 
+  // Unique session identifier.
   string session_handle_;
   mutex graph_state_lock_;
   bool graph_created_ GUARDED_BY(graph_state_lock_) = false;
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 77e3246df045785df5908c263edbf668762acc38..1df55d9c882b91158b7daa6e147a759de3d87dd3 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -106,6 +106,7 @@ tf_cuda_library(
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "@com_google_absl//absl/strings",
             "//tensorflow/core:core_cpu_lib",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
@@ -154,6 +155,7 @@ tf_cuda_library(
     deps = [
         ":attr_builder",
         "@farmhash_archive//:farmhash",
+        "@com_google_absl//absl/strings",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
@@ -165,6 +167,7 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core/grappler/optimizers:meta_optimizer",
         ],
     }),
 )
@@ -179,12 +182,20 @@ tf_cc_test(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels:constant_op",
+        "//tensorflow/core/kernels:matmul_op",
         "@com_google_absl//absl/memory",
     ],
 )
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index a750f8cbba4de4abd33d6ec395b6b0a5fb76cc67..77be4c951e6a85e71a3d19f5cf43099027c80696 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -42,7 +42,7 @@ const uint32 kIsList = 1U << 31;
 AttrTypeMap* DefaultFunctionAttrTypeMap() {
   AttrTypeMap* map = new AttrTypeMap();
   (*map)["executor_type"] = TF_ATTR_STRING;
-  (*map)["config"] = TF_ATTR_STRING;
+  (*map)["config_proto"] = TF_ATTR_STRING;
   return map;
 }
 
@@ -125,6 +125,7 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
     value_field.push_back(std::make_pair(string(attr_name), value));         \
+    cached_cache_key_ = absl::nullopt;                                       \
     return *this;                                                            \
   }
 
@@ -231,7 +232,17 @@ inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) {
 
 }  // namespace
 
-tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) const {
+tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) {
+  if (!cached_cache_key_ || device != device_for_cached_cache_key_) {
+    cached_cache_key_ = BuildCacheKeyForDevice(device);
+    device_for_cached_cache_key_ = device;
+  }
+
+  return *cached_cache_key_;
+}
+
+tensorflow::Fprint128 AttrBuilder::BuildCacheKeyForDevice(
+    const string& device) const {
   tensorflow::Fprint128 f = tensorflow::Fingerprint128(op_name_);
   f = tensorflow::FingerprintCat128(f, tensorflow::Fingerprint128(device));
   if (node_def_ != nullptr) {
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 5e0172dfd328dbd4f16abdce879be1d1338e692c..aa64b5f59bd0cb54b1872c0328c10ebb1de622b6 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
@@ -74,7 +75,7 @@ Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
 // AttrBuilder a;
 // a.NumInputs(2);
 // a.Set("T", TF_FLOAT);
-// uint64 cache_key = a.CacheKey("cpu:0");
+// tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0");
 // const NodeDef& n = a.BuildNodeDef();
 //
 // Note that all calls to Set and NumInputs should happen before calling
@@ -100,10 +101,11 @@ class AttrBuilder {
   AttrBuilder& Set(StringPiece attr_name, T&& value) {
     MayBeInitializeNodeDef();
     SetInAttrValueMap(node_def_->mutable_attr(), string(attr_name), value);
+    cached_cache_key_ = absl::nullopt;
     return *this;
   }
 
-  tensorflow::Fprint128 CacheKey(const string& device) const;
+  tensorflow::Fprint128 CacheKey(const string& device);
 
   void FillAttrValueMap(AttrValueMap* m) const { FillAttrValueMap(m, true); }
   const NodeDef& BuildNodeDef();
@@ -112,6 +114,8 @@ class AttrBuilder {
   template <class T>
   using AttrVec = tensorflow::gtl::InlinedVector<std::pair<string, T>, 2>;
 
+  tensorflow::Fprint128 BuildCacheKeyForDevice(const string& device) const;
+
   void MayBeInitializeNodeDef();
   // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
   // well as any default attr-value pairs from the associated op_def, if there
@@ -148,6 +152,9 @@ class AttrBuilder {
   int num_inputs_;
   std::unique_ptr<NodeDef> node_def_;
   bool node_def_finalized_;
+
+  absl::optional<tensorflow::Fprint128> cached_cache_key_;
+  string device_for_cached_cache_key_;
 };  // namespace tensorflow
 
 template <>
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 220cc6f5ce0bff32cfdc8d4e837c6900c773728e..31c998a670a0a6613bbaca437d8d3e4f9f976443 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -41,8 +41,10 @@ TEST(AttrTypeMap, Lookup) {
   Status s = AttrTypeMapForOp("SomeFunctionName", &m, &is_function);
   EXPECT_TRUE(s.ok());
   EXPECT_TRUE(is_function);
+  ASSERT_NE(m->end(), m->find("executor_type"));
   EXPECT_EQ(TF_ATTR_STRING, m->find("executor_type")->second);
-  EXPECT_EQ(TF_ATTR_STRING, m->find("config")->second);
+  ASSERT_NE(m->end(), m->find("config_proto"));
+  EXPECT_EQ(TF_ATTR_STRING, m->find("config_proto")->second);
 
   is_function = true;
   s = AttrTypeMapForOp("MatMul", &m, &is_function);
@@ -67,5 +69,18 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_NE(is_list, 0);
 }
 
+TEST(AttrTypeMap, CacheKey) {
+  AttrBuilder a("op_name");
+  a.NumInputs(2);
+  a.Set("T", TF_FLOAT);
+  tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0");
+
+  ASSERT_FALSE(cache_key == a.CacheKey("cpu:1"));
+  ASSERT_TRUE(cache_key == a.CacheKey("cpu:0"));
+
+  a.Set("x", 1.0);
+  ASSERT_FALSE(cache_key == a.CacheKey("cpu:0"));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 2212bda53449c1944a75318725eec0faf46438f1..4d8712f4dc670b79877c1281090ac89e19b9ba0b 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -57,8 +57,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
       rendezvous_(rendezvous),
       thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
-          device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_, {},
-          thread_pool_.get())),
+          device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+          opts.config.graph_options().optimizer_options(), thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
       num_active_steps_(0),
       async_default_(async),
@@ -207,6 +207,14 @@ EagerContext::~EagerContext() {
   executor_.WaitForAllPendingNodes().IgnoreError();
   ClearCaches();
   rendezvous_->Unref();
+
+  for (auto& thread : child_threads_) {
+    thread.reset();
+  }
+}
+
+void EagerContext::AddChildThread(std::unique_ptr<Thread> thread) {
+  child_threads_.push_back(std::move(thread));
 }
 
 bool EagerContext::FindFunctionByName(const string& name) {
@@ -234,6 +242,29 @@ Status EagerContext::FindDeviceByName(const string& name, Device** result) {
   return Status::OK();
 }
 
+void EagerContext::ClearRunMetadata() {
+  if (metadata_listener_ != nullptr) {
+    metadata_listener_->BeforeClearRunMetadata();
+  }
+  run_metadata_.Clear();
+}
+
+Status EagerContext::RegisterRunMetadataListener(
+    RunMetadataListener* listener) {
+  mutex_lock l(metadata_mu_);
+  if (metadata_listener_ != nullptr) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "Cannot run two eager profiler at the same time");
+  }
+  metadata_listener_ = listener;
+  return Status::OK();
+}
+
+void EagerContext::ClearRunMetadataListener() {
+  mutex_lock l(metadata_mu_);
+  metadata_listener_ = nullptr;
+}
+
 void EagerContext::StartStep() {
   mutex_lock ml(metadata_mu_);
   num_active_steps_++;
@@ -317,10 +348,15 @@ void EagerContext::AddKernelToCache(Fprint128 cache_key,
   gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
 }
 
+bool EagerContext::ShouldStoreMetadata() {
+  mutex_lock ml(metadata_mu_);
+  return should_store_metadata_.load() || metadata_listener_ != nullptr;
+}
+
 void EagerContext::SetShouldStoreMetadata(bool value) {
+  mutex_lock ml(metadata_mu_);
   should_store_metadata_.store(value);
-  if (!value) {
-    mutex_lock ml(metadata_mu_);
+  if (!value || metadata_listener_ != nullptr) {
     run_metadata_.Clear();
   }
 }
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 5ff6b3ffbdd9ed7a6aa2e56e1ddb8648f9265ef0..1f24109a76793f125bcfd06d699632efd69bd5a2 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -29,6 +29,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/example/example.pb.h"
+#include "tensorflow/core/platform/env.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
@@ -66,6 +68,12 @@ enum ContextDevicePlacementPolicy {
   DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 };
 
+class RunMetadataListener {
+ public:
+  virtual ~RunMetadataListener() {}
+  virtual void BeforeClearRunMetadata() = 0;
+};
+
 class EagerContext {
  public:
   // TODO: remove this constructor once we migrate all callers to the next one.
@@ -86,6 +94,8 @@ class EagerContext {
     return pflr_->GetFLR(d->name());
   }
 
+  ProcessFunctionLibraryRuntime* pflr() const { return pflr_.get(); }
+
   // True if running in asynchronous mode.
   bool Async() const;
 
@@ -131,7 +141,7 @@ class EagerContext {
 
   Status FindDeviceByName(const string& name, Device** result);
 
-  Device* HostCPU() { return devices_[0]; }
+  Device* HostCPU() const { return devices_[0]; }
 
   GraphCollector* GetGraphCollector() { return &graph_collector_; }
 
@@ -145,10 +155,10 @@ class EagerContext {
 
   void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel);
 
-  bool LogDevicePlacement() { return log_device_placement_; }
-  bool LogMemory() { return log_memory_; }
+  bool LogDevicePlacement() const { return log_device_placement_; }
+  bool LogMemory() const { return log_memory_; }
 
-  Rendezvous* GetRendezvous() { return rendezvous_; }
+  Rendezvous* GetRendezvous() const { return rendezvous_; }
   CollectiveExecutorMgrInterface* collective_executor_mgr() {
     return (collective_executor_mgr_ != nullptr)
                ? collective_executor_mgr_.get()
@@ -164,7 +174,7 @@ class EagerContext {
     return (local_device_manager_ != nullptr) ? local_device_manager_.get()
                                               : local_unowned_device_manager_;
   }
-  const tensorflow::DeviceMgr* remote_device_mgr() {
+  const tensorflow::DeviceMgr* remote_device_mgr() const {
     return remote_device_manager_.get();
   }
 
@@ -172,10 +182,15 @@ class EagerContext {
   void ReleaseDeviceMgr() { local_device_manager_.release(); }
 
   // TODO(apassos) clean up RunMetadata storage.
-  mutex* MetadataMu() { return &metadata_mu_; }
-  bool ShouldStoreMetadata() { return should_store_metadata_.load(); }
+  mutex* MetadataMu() LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; }
+  bool ShouldStoreMetadata() LOCKS_EXCLUDED(metadata_mu_);
   void SetShouldStoreMetadata(bool value);
   RunMetadata* RunMetadataProto() { return &run_metadata_; }
+  void ClearRunMetadata() EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_);
+
+  Status RegisterRunMetadataListener(RunMetadataListener* listener)
+      LOCKS_EXCLUDED(metadata_mu_);
+  void ClearRunMetadataListener() LOCKS_EXCLUDED(metadata_mu_);
 
   void StartStep();
   void EndStep();
@@ -224,6 +239,9 @@ class EagerContext {
 
   tensorflow::Env* TFEnv() const { return env_; }
 
+  // All child threads will be reset() when destructing EagerContext.
+  void AddChildThread(std::unique_ptr<Thread> thread);
+
  private:
   void InitDeviceMapAndAsync();
   Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef);
@@ -269,6 +287,7 @@ class EagerContext {
   std::atomic<bool> should_store_metadata_{false};
   mutex metadata_mu_;
   RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
+  RunMetadataListener* metadata_listener_ GUARDED_BY(metadata_mu_) = nullptr;
   GraphCollector graph_collector_;
   const bool log_device_placement_;
   // EagerExecutor for async execution.
@@ -319,6 +338,7 @@ class EagerContext {
 
   bool use_send_tensor_rpc_;
   const bool pin_small_ops_to_cpu_;
+  std::vector<std::unique_ptr<tensorflow::Thread>> child_threads_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
index 5bc3bb689e076467672af85d28bb340b56e7ee79..a807e7f68d3cffe0c71393acb537c6b3a732fde6 100644
--- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -30,7 +30,7 @@ class CopyToDeviceNode : public EagerNode {
         src_(src),
         dstd_(dstd),
         ctx_(ctx),
-        dst_(new TensorHandle(id, dstd_, dstd_, src->dtype, ctx)) {
+        dst_(new TensorHandle(id, dstd_, dstd_, nullptr, src->dtype, ctx)) {
     src_->Ref();
     dst_->Ref();
   }
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
index 381b05ada8594fde1aa917053acd0371167f66ed..b10320ca30bd4423bc755722dafb85908d922f8e 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.cc
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -30,4 +30,28 @@ void EagerOperation::AddInput(tensorflow::TensorHandle* h) {
   inputs_.push_back(h);
   attrs_.NumInputs(static_cast<int>(inputs_.size()));
 }
+
+void EagerOperation::ConsumeInput(tensorflow::TensorHandle* h) {
+  inputs_.push_back(h);
+  attrs_.NumInputs(static_cast<int>(inputs_.size()));
+}
+
+string EagerOperation::DebugString() const {
+  string out;
+  VLOG(1) << "EagerOperation::DebugString() over " << this;
+
+  strings::StrAppend(&out, "Name: ", name_, "\n");
+  strings::StrAppend(
+      &out, "Device: ", Device() ? Device()->DebugString() : "[]", "\n");
+  for (const auto& input : inputs_) {
+    VLOG(1) << "Input ptr: " << input;
+    strings::StrAppend(&out, "Input: ", input->DebugString(), "\n");
+  }
+
+  NodeDef ndef;
+  Attrs().FillAttrValueMap(ndef.mutable_attr());
+  strings::StrAppend(&out, "Attrs: ", ndef.DebugString(), "\n");
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index 935ca7f9aa766a69582b4c94fec6c508e3f5a369..23a2d1bf986d8cd2b1670432e48ff3c6b3a1ee1c 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -53,6 +53,7 @@ class EagerOperation {
     return &inputs_;
   }
   void AddInput(tensorflow::TensorHandle* h);
+  void ConsumeInput(tensorflow::TensorHandle* h);
 
   const tensorflow::string& Name() const { return name_; }
   const tensorflow::AttrTypeMap* AttrTypes() const { return attr_types_; }
@@ -63,6 +64,8 @@ class EagerOperation {
 
   void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
 
+  string DebugString() const;
+
  private:
   tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
   const tensorflow::string name_;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 783baa96c92f224e45404e5f6586011599f02292..9df1511bc75f38caa5dd0fb60c2b1782ae8b69c3 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
@@ -81,18 +84,20 @@ int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
 // tensor handle.
 //
 // The passed in *handle will be Unreffed if it is replaced.
-Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
-                                      const Device* expected_device,
+//
+// `op_device` is passed in explicitly because `op->device()` might be unset
+// and we might have selected some specific device to run this op on.
+Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
+                                      const Device* op_device, int i,
+                                      const Device* expected_input_device,
                                       RunMetadata* run_metadata,
                                       TensorHandle** handle) {
   EagerContext* ctx = op->EagerContext();
   Device* handle_device = (*handle)->device();
   const Device* actual_device =
       handle_device == nullptr ? ctx->HostCPU() : handle_device;
-  const Device* op_device =
-      op->Device() == nullptr ? ctx->HostCPU() : op->Device();
 
-  if (expected_device != actual_device) {
+  if (expected_input_device != actual_device) {
     switch (ctx->GetDevicePlacementPolicy()) {
       case DEVICE_PLACEMENT_SILENT_FOR_INT32:
         // TODO(xpan): See if we could bubble python related error up
@@ -108,7 +113,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
             "Tensors on conflicting devices:"
             " cannot compute ",
             op->Name(), " as input #", i, " was expected to be on ",
-            expected_device->name(), " but is actually on ",
+            expected_input_device->name(), " but is actually on ",
             actual_device->name(), " (operation running on ", op_device->name(),
             ")",
             " Tensors can be copied explicitly using .gpu() or .cpu() "
@@ -119,9 +124,10 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
             " may slow down your model");
       case DEVICE_PLACEMENT_WARN:
         LOG(WARNING) << "before computing " << op->Name() << " input #" << i
-                     << " was expected to be on " << expected_device->name()
-                     << " but is actually on " << actual_device->name()
-                     << " (operation running on " << op_device->name()
+                     << " was expected to be on "
+                     << expected_input_device->name() << " but is actually on "
+                     << actual_device->name() << " (operation running on "
+                     << op_device->name()
                      << "). This triggers a copy which can be a performance "
                         "bottleneck.";
         break;
@@ -133,7 +139,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
     auto pre_time_nanos = Env::Default()->NowNanos();
     TensorHandle* result_handle = nullptr;
     Status status = EagerCopyToDevice(
-        *handle, ctx, expected_device->name().c_str(), &result_handle);
+        *handle, ctx, expected_input_device->name().c_str(), &result_handle);
     if (run_metadata != nullptr) {
       auto* step_stats = run_metadata->mutable_step_stats();
       MaybeInitializeStepStats(step_stats, ctx);
@@ -155,10 +161,10 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
     }
     if (!status.ok()) {
       if (result_handle != nullptr) result_handle->Unref();
-      return errors::Internal("Failed copying input tensor from ",
-                              actual_device->name(), " to ",
-                              expected_device->name(), " in order to run ",
-                              op->Name(), ": ", status.error_message());
+      return errors::Internal(
+          "Failed copying input tensor from ", actual_device->name(), " to ",
+          expected_input_device->name(), " in order to run ", op->Name(), ": ",
+          status.error_message());
     }
 
     (*handle)->Unref();
@@ -168,19 +174,18 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
 }
 
 Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
-                                     EagerOperation* op, const OpKernel* kernel,
+                                     EagerOperation* op,
+                                     const KernelAndDevice* kernel,
                                      RunMetadata* run_metadata) {
-  Device* host_device = ctx->HostCPU();
-  const MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->Inputs().size()) {
-    return errors::InvalidArgument("expected ", memtypes.size(),
+  if (kernel->num_inputs() != op->Inputs().size()) {
+    return errors::InvalidArgument("expected ", kernel->num_inputs(),
                                    " inputs, got ", op->Inputs().size());
   }
   for (int i = 0; i < op->Inputs().size(); ++i) {
-    const Device* expected_device =
-        memtypes[i] == HOST_MEMORY ? host_device : op_device;
+    const Device* expected_device = kernel->InputDevice(i);
     TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-        op, i, expected_device, run_metadata, &((*op->MutableInputs())[i])));
+        op, op_device, i, expected_device, run_metadata,
+        &((*op->MutableInputs())[i])));
     tensorflow::TensorHandle* handle = op->Inputs()[i];
     if (handle->dtype != kernel->input_type(i)) {
       return errors::InvalidArgument(
@@ -246,6 +251,89 @@ bool OnSameTask(EagerContext* ctx, Device* first, Device* second) {
          first->parsed_name().task == second->parsed_name().task;
 }
 
+inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
+                                               const tensorflow::Fprint128& b) {
+  return {tensorflow::FingerprintCat64(a.low64, b.low64),
+          tensorflow::FingerprintCat64(a.high64, b.high64)};
+}
+
+Status FindDeviceFromName(const EagerContext* ctx, const char* device_name,
+                          Device** device) {
+  *device = ctx->HostCPU();
+  if (device_name == nullptr || strlen(device_name) == 0) {
+    return Status::OK();
+  }
+
+  auto status = ctx->local_device_mgr()->LookupDevice(device_name, device);
+  if (status.ok()) {
+    return status;
+  }
+
+  if (ctx->remote_device_mgr() != nullptr) {
+    return ctx->remote_device_mgr()->LookupDevice(device_name, device);
+  }
+
+  return status;
+}
+
+bool IsMultiDevice(const FunctionDef* fdef, const string& op_device) {
+  if (fdef == nullptr) {
+    // Primitive op.
+    return false;
+  }
+
+  // Run all functions as multi-device.
+  return true;
+
+  // We can eliminate some overhead by running simple functions using regular
+  // CallOp kernel. However, it is tricky to figure out which functions should
+  // be run using CallOp. Also, currently CallOp runs neither optimization
+  // passes (needed for TPU/XLA) nor grappler.
+  // Here are some cases where a function should be run in multi-device mode:
+  //  - Function takes at least two resources on different devices.
+  //  - Function takes a resource on deviceA and a body op explicitly placed
+  //  on deviceB.
+  //  - Function has a colocation constraint.
+  //  - Function has an explicit device annotation (which might not be using
+  //    full canonical device name) different from op_device. Note that false
+  //    positives are ok.
+  //  - Function has a node or a (node) attribute that can potentially make
+  //    the function multi-device after a rewrite pass (e.g. various XLA/TPU
+  //    special nodes and attributes)
+}
+
+Status AddInputDevicesToCacheKey(const EagerContext* ctx,
+                                 const EagerOperation* op,
+                                 std::vector<Device*>* input_dev_ptrs,
+                                 Fprint128* cache_key) {
+  input_dev_ptrs->reserve(op->Inputs().size());
+  Device* cpu_device = ctx->HostCPU();
+  for (TensorHandle* tensor_handle : op->Inputs()) {
+    string device_name;
+    if (tensor_handle->dtype == DT_RESOURCE) {
+      // Use the resource's actual device because it is the device that will
+      // influence partitioning the multi-device function.
+      const Tensor* tensor;
+      TF_RETURN_IF_ERROR(tensor_handle->Tensor(&tensor));
+      const ResourceHandle& handle = tensor->flat<ResourceHandle>()(0);
+      device_name = handle.device();
+
+      Device* input_device;
+      TF_RETURN_IF_ERROR(
+          FindDeviceFromName(ctx, device_name.c_str(), &input_device));
+      input_dev_ptrs->push_back(input_device);
+    } else if (MTypeFromDType(tensor_handle->dtype) == HOST_MEMORY) {
+      input_dev_ptrs->push_back(cpu_device);
+    } else {
+      Device* device = tensor_handle->device();
+      device_name = device != nullptr ? device->name() : cpu_device->name();
+      input_dev_ptrs->push_back(device == nullptr ? cpu_device : device);
+    }
+    *cache_key = FingerprintCat128(*cache_key, Fingerprint128(device_name));
+  }
+  return Status::OK();
+}
+
 Status EagerLocalExecute(EagerOperation* op,
                          gtl::InlinedVector<TensorHandle*, 2>* retvals,
                          int* num_retvals) {
@@ -254,18 +342,34 @@ Status EagerLocalExecute(EagerOperation* op,
   if (!status.ok()) return status;
   Device* device = op->Device();
 
-  Fprint128 cache_key = op->MutableAttrs()->CacheKey(
-      device == nullptr ? "unspecified" : device->name());
+  const string& maybe_unspecified_device_name =
+      device == nullptr ? "unspecified" : device->name();
+  Fprint128 cache_key =
+      op->MutableAttrs()->CacheKey(maybe_unspecified_device_name);
+
+  bool is_multi_device_function = IsMultiDevice(
+      ctx->FindFunctionDef(op->Name()), maybe_unspecified_device_name);
+
+  std::vector<Device*> input_dev_ptrs;
+  if (is_multi_device_function) {
+    TF_RETURN_IF_ERROR(
+        AddInputDevicesToCacheKey(ctx, op, &input_dev_ptrs, &cache_key));
+  }
+
   KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
   if (kernel == nullptr) {
+    VLOG(2) << "Creating new kernel for " << op->Name() << " on device "
+            << maybe_unspecified_device_name;
     // If we are running a function on explicitly requested TPU,
     // compile it with XLA.
     // Note that it is not ideal, but currently ok, to set this
     // attribute after computing the kernel cache key above.
+    bool compile_with_xla = false;
     if (op->is_function() && device != nullptr &&
         (device->device_type() == "TPU" || device->device_type() == "XLA_GPU" ||
          device->device_type() == "XLA_CPU")) {
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
+      compile_with_xla = true;
     }
 
     const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
@@ -273,7 +377,6 @@ Status EagerLocalExecute(EagerOperation* op,
       status = SelectDevice(ndef, ctx, &device);
       if (!status.ok()) return status;
     }
-    CHECK(device != nullptr);
     if (ctx->LogDevicePlacement()) {
       LOG(INFO) << "Executing op " << ndef.op() << " in device "
                 << device->name();
@@ -285,9 +388,35 @@ Status EagerLocalExecute(EagerOperation* op,
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
-    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory(),
-                                 ctx->GetCollectiveExecutorHandle());
-    status = KernelAndDevice::Init(ndef, flr, ctx->runner(), kernel);
+    GraphCollector* graph_collector = nullptr;
+    if (ctx->ShouldStoreMetadata()) {
+      graph_collector = ctx->GetGraphCollector();
+    }
+    // Treat the function as multi_device only when we are not compiling
+    // it wholly with XLA. When compiling wholly with XLA, flr->CreateKernel
+    // will create an XlaLaunchOp kernel to compile and run the function.
+    if (is_multi_device_function && !compile_with_xla) {
+      // Multi-device functions don't use the rendezvous from eager context.
+      // If we use that rendezvous, multiple concurrent calls to the same
+      // function will likely result in collisions. However, this also means
+      // that we don't support legitimate sending/receiving across function
+      // boundary.
+      VLOG(2) << "Running " << ndef.op() << " using multi-device function. "
+              << "compile_with_xla=" << compile_with_xla
+              << ". Full node_def=" << ndef.DebugString();
+      kernel = new KernelAndDeviceFunc(
+          flr, ctx->pflr(), std::move(input_dev_ptrs), ctx->runner(),
+          ctx->GetCollectiveExecutorHandle(), ctx->HostCPU());
+    } else {
+      VLOG(2) << "Running " << ndef.op() << " using op kernel. "
+              << "compile_with_xla=" << compile_with_xla
+              << ". Full node_def=" << ndef.DebugString();
+      kernel = new KernelAndDeviceOp(
+          ctx->GetRendezvous(), ctx->LogMemory(), flr, ctx->runner(),
+          ctx->GetCollectiveExecutorHandle(), ctx->HostCPU());
+    }
+
+    status = kernel->Init(ndef, graph_collector);
     if (!status.ok()) {
       delete kernel;
       return status;
@@ -309,7 +438,7 @@ Status EagerLocalExecute(EagerOperation* op,
     device = kernel->device();
   }
   status = ValidateInputTypeAndPlacement(
-      ctx, device, op, kernel->kernel(),
+      ctx, device, op, kernel,
       ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
   if (!status.ok()) return status;
   std::unique_ptr<NodeExecStats> maybe_stats;
@@ -333,12 +462,15 @@ Status EagerLocalExecute(EagerOperation* op,
   if (ctx->Async()) {
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
-    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
+    // TODO(agarwal): Consider executing "cheap" kernels inline for
+    // performance.
     tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
-      (*retvals)[i] = new TensorHandle(id, /* d= */ kernel->OutputDevice(i),
-                                       /* op_device= */ kernel->device(),
-                                       output_dtypes[i], ctx);
+      (*retvals)[i] = new TensorHandle(
+          id, /* d= */ kernel->OutputDevice(i),
+          /* op_device= */ kernel->device(),
+          /* resource_device= */ kernel->OutputResourceDevice(i),
+          output_dtypes[i], ctx);
     }
     EagerNode* node = new ExecuteNode(
         id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
@@ -347,9 +479,9 @@ Status EagerLocalExecute(EagerOperation* op,
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
-    status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
-                          maybe_stats.get(), maybe_step_stats, graph_collector,
-                          retvals->data(), *num_retvals);
+    status = EagerKernelExecute(ctx, op->Device(), op->Inputs(), kernel,
+                                maybe_stats.get(), maybe_step_stats,
+                                graph_collector, retvals->data(), *num_retvals);
   }
 
   return status;
@@ -361,9 +493,9 @@ std::function<void()> GetRemoteTensorDestructor(
     uint64 op_id, int output_num) {
   return [ctx, eager_client, context_id, op_id, output_num]() {
     if (!ctx->HasActiveRemoteContext(context_id)) {
-      // This means that this tensor was pointing to a remote device, which has
-      // been changed out from under us. Simply return since there is nothing we
-      // can do.
+      // This means that this tensor was pointing to a remote device, which
+      // has been changed out from under us. Simply return since there is
+      // nothing we can do.
       return tensorflow::Status::OK();
     }
 
@@ -423,10 +555,10 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
 
   Device* tensor_handle_device = h->device();
 
-  // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence copy
-  // it to the CPU before copying it out.
-  // TODO(nareshmodi): this is currently slow, but can be fixed by making tensor
-  // handles aware of more than one device.
+  // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence
+  // copy it to the CPU before copying it out.
+  // TODO(nareshmodi): this is currently slow, but can be fixed by making
+  // tensor handles aware of more than one device.
   TensorHandle* actual_handle;
   if (tensor_handle_device != nullptr &&
       tensor_handle_device->device_type() != "CPU") {
@@ -458,7 +590,8 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
 
   *result = new TensorHandle(id, /*output_num=*/0, /*remote_shape_node_id=*/0,
                              tensor->dtype(), std::move(destructor),
-                             recv_device, recv_device, ctx);
+                             /*d=*/recv_device, /*op_device=*/recv_device,
+                             /*resource_device=*/nullptr, ctx);
   (*result)->SetRemoteShape(MakeUnique<TensorShape>(tensor->shape()));
 
   actual_handle->Unref();
@@ -497,7 +630,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
       // TODO(b/110044833): It's possible the same tensor gets copied to the
       // remote device repeatedly.
       TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-          op, i, op->Device(), /* run_metadata= */ nullptr,
+          op, op->Device(), i, op->Device(), /* run_metadata= */ nullptr,
           &(*op->MutableInputs())[i]));
     }
 
@@ -537,14 +670,25 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
 
   const tensorflow::uint64 id = remote_op->id();
   for (int i = 0; i < *num_retvals; i++) {
-    // TODO(nareshmodi): Change the callback to instead add the decref to a list
-    // of pending decrefs that we can send as a batch with the next execute.
+    // TODO(nareshmodi): Change the callback to instead add the decref to a
+    // list of pending decrefs that we can send as a batch with the next
+    // execute.
     std::function<void()> destructor =
         GetRemoteTensorDestructor(ctx, eager_client, context_id, id, i);
 
-    retvals[i] = new TensorHandle(remote_op->id(), i, remote_node_id,
-                                  output_dtypes[i], std::move(destructor),
-                                  op_device, op_device, op->EagerContext());
+    // The device_ and resource_device_ or this TensorHandle are not correct.
+    // It is pretty hard to make it correct because for multi-device functions,
+    // we don't know the output device until the function is instantiated.
+    // Luckily, we don't need to know the correct remote device here. We just
+    // need to know that it is remote. If we need to copy this tensor to this
+    // process, the remote end will know the correct device of this handle.
+    retvals[i] = new TensorHandle(
+        remote_op->id(), i, remote_node_id, output_dtypes[i],
+        std::move(destructor),
+        /*d=*/op_device, /*op_device=*/op_device,
+        /*resource_device=*/output_dtypes[i] == DT_RESOURCE ? op_device
+                                                            : nullptr,
+        op->EagerContext());
   }
 
   if (is_async) {
@@ -619,43 +763,56 @@ bool IsPinnableOp(const string& op_type) {
 // "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
 Status MaybeUpdateOpDevice(EagerOperation* op) {
   EagerContext* ctx = op->EagerContext();
-  bool device_set_for_resource_variable = false;
   bool all_inputs_eligible_for_cpu_pinning =
       ctx->PinSmallOpsToCPU() && IsPinnableOp(op->Name());
-
+  Device* op_device = op->Device() == nullptr ? ctx->HostCPU() : op->Device();
   for (int i = 0; i < op->Inputs().size(); ++i) {
-    Device* input_op_device = op->Inputs()[i]->op_device();
-    VLOG(2) << "for op " << op->Name() << " input " << i << " "
-            << DataTypeString(op->Inputs()[i]->dtype) << " "
-            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
-    if (op->Inputs()[i]->dtype == DT_RESOURCE &&
-        (input_op_device != op->Device() || input_op_device == nullptr)) {
-      Device* d = input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
-      VLOG(1) << "Changing device of operation " << op->Name() << " to "
-              << d->name() << " because input #" << i
-              << " is a resource in this device.";
-      op->SetDevice(d);
-
-      device_set_for_resource_variable = true;
+    TensorHandle* tensor_handle = op->Inputs()[i];
+    if (tensor_handle->dtype == DT_RESOURCE) {
+      Device* resource_device = tensor_handle->resource_device();
+      VLOG(2) << "for op " << op->Name() << " input " << i << " "
+              << DataTypeString(tensor_handle->dtype)
+              << " input device = " << resource_device->name()
+              << ", op device = " << op_device->name();
+      // We check for `op->Device() == nullptr` because it can be later
+      // interpreted as unspecified device and a different device can
+      // be selected based on device priority. If any input to an op
+      // is a resource we must pin it to prevent different device selection.
+      // TODO(iga): null device can mean "unspecified" or "CPU". Clean this up.
+      if (resource_device != op_device || op->Device() == nullptr) {
+        VLOG(1) << (resource_device != op_device ? "Changing " : "Setting ")
+                << "device of operation " << op->Name() << " to "
+                << resource_device->name() << " because input #" << i
+                << " is a resource in this device.";
+        op->SetDevice(resource_device);
+      }
       all_inputs_eligible_for_cpu_pinning = false;
+      // No point in looking at other inputs. If there are other resources,
+      // they must have the same device and we already declared the op to be
+      // ineligible for CPU pinning.
+      break;
     } else if (all_inputs_eligible_for_cpu_pinning) {
-      TensorHandle* handle = op->Inputs()[i];
+      Device* input_device = tensor_handle->device();
+      input_device = input_device == nullptr ? ctx->HostCPU() : input_device;
+      VLOG(2) << "for op " << op->Name() << " input " << i << " "
+              << DataTypeString(tensor_handle->dtype)
+              << " input device = " << input_device->name()
+              << ", op device = " << op_device->name();
 
       // Input is on CPU.
-      if (input_op_device != nullptr && input_op_device != ctx->HostCPU()) {
+      if (input_device != ctx->HostCPU()) {
         all_inputs_eligible_for_cpu_pinning = false;
         continue;
       }
 
-      if (handle->dtype != DataType::DT_INT32 &&
-          handle->dtype != DataType::DT_INT64) {
+      if (tensor_handle->dtype != DataType::DT_INT32 &&
+          tensor_handle->dtype != DataType::DT_INT64) {
         all_inputs_eligible_for_cpu_pinning = false;
         continue;
       }
 
       int64 num_elements;
-      TF_RETURN_IF_ERROR(handle->NumElements(&num_elements));
+      TF_RETURN_IF_ERROR(tensor_handle->NumElements(&num_elements));
       if (num_elements > 64) {
         all_inputs_eligible_for_cpu_pinning = false;
       }
@@ -697,12 +854,12 @@ Status EagerExecute(EagerOperation* op,
   return EagerRemoteExecute(op, retvals->data(), num_retvals);
 }
 
-Status EagerExecute(EagerContext* ctx, Device* device,
-                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
-                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
-                    StepStats* maybe_step_stats,
-                    GraphCollector* graph_collector, TensorHandle** retvals,
-                    int num_retvals) {
+Status EagerKernelExecute(EagerContext* ctx, Device* device,
+                          const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                          KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                          StepStats* maybe_step_stats,
+                          GraphCollector* graph_collector,
+                          TensorHandle** retvals, int num_retvals) {
   if (device == nullptr) {
     // TODO(apassos) debug how the assignment below might return a different
     // device from the one requested above.
@@ -710,24 +867,37 @@ Status EagerExecute(EagerContext* ctx, Device* device,
   }
 
   std::vector<Tensor> outputs(1);
-  const MemoryTypeVector* output_memory_types = nullptr;
-  output_memory_types = &kernel->kernel()->output_memory_types();
-  std::vector<Tensor> inputs(op_inputs.size());
+
+  // If there are multiple references to a TensorHandle in 'op_inputs' we must
+  // increment the reference count of the corresponding Tensor or risk it being
+  // overwritten during kernel execution. The reference count is incremented
+  // below when we insert a copy of the Tensor into protected_tensors, and will
+  // be decremented once execution is complete.
+  std::vector<tensorflow::Tensor> protected_tensors;
+  for (int i = 0; i < op_inputs.size(); ++i) {
+    if (!op_inputs[i]->RefCountIsOne()) {
+      const Tensor* input_tensor = nullptr;
+      TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
+      protected_tensors.push_back(*input_tensor);
+    }
+  }
+
+  gtl::InlinedVector<TensorValue, 4> input_vector(op_inputs.size());
   for (int i = 0; i < op_inputs.size(); ++i) {
-    const Tensor* input_tensor = nullptr;
-    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
-    inputs[i] = *input_tensor;
+    TF_RETURN_IF_ERROR(op_inputs[i]->TensorValue(&input_vector[i]));
   }
+
   //  TODO(apassos) figure out how to record stats for ops which are a part of
   //  functions.
   // TODO(agarwal): change Run to take vector of handles ?
   ScopedStepContainer* container = ctx->StepContainer();
   if (container == nullptr) {
-    TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats,
+    TF_RETURN_IF_ERROR(kernel->Run(input_vector, &outputs, maybe_stats,
                                    maybe_step_stats, graph_collector));
   } else {
-    TF_RETURN_IF_ERROR(kernel->Run(container, &inputs, &outputs, maybe_stats,
-                                   maybe_step_stats, graph_collector));
+    TF_RETURN_IF_ERROR(kernel->Run(container, input_vector, &outputs,
+                                   maybe_stats, maybe_step_stats,
+                                   graph_collector));
   }
   if (maybe_stats != nullptr) {
     int64 nanos = Env::Default()->NowNanos();
@@ -737,8 +907,8 @@ Status EagerExecute(EagerContext* ctx, Device* device,
     maybe_stats->set_all_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
                                         maybe_stats->all_start_micros());
     maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
-    mutex_lock ml(*ctx->MetadataMu());
     if (ctx->ShouldStoreMetadata()) {
+      mutex_lock ml(*ctx->MetadataMu());
       {
         GraphCollector* collector = ctx->GetGraphCollector();
         mutex_lock mll(collector->mu);
@@ -775,8 +945,8 @@ Status EagerExecute(EagerContext* ctx, Device* device,
                            /* op_device= */ device, ctx);
     } else {
       // In the async case, the retval is not a nullptr, and its device is
-      // already set since all TensorHandles always have their device set during
-      // construction.
+      // already set since all TensorHandles always have their device set
+      // during construction.
       DCHECK_EQ(device, retvals[i]->op_device());
       DCHECK_EQ(kernel->OutputDevice(i), retvals[i]->device());
 
@@ -807,25 +977,6 @@ Status LocalEagerCopyToDevice(TensorHandle* h, EagerContext* ctx, Device* dstd,
   }
 }
 
-Status FindDeviceFromName(EagerContext* ctx, const char* device_name,
-                          Device** device) {
-  *device = ctx->HostCPU();
-  if (device_name == nullptr || strlen(device_name) == 0) {
-    return Status::OK();
-  }
-
-  auto status = ctx->local_device_mgr()->LookupDevice(device_name, device);
-  if (status.ok()) {
-    return status;
-  }
-
-  if (ctx->remote_device_mgr() != nullptr) {
-    return ctx->remote_device_mgr()->LookupDevice(device_name, device);
-  }
-
-  return status;
-}
-
 Status ExecuteSend(EagerContext* ctx, tensorflow::Device* device,
                    TensorHandle* h, StringPiece wire_id,
                    const string& recv_device) {
@@ -887,8 +1038,8 @@ Status ExecuteRecv(EagerContext* ctx, tensorflow::Device* device,
   return Status::OK();
 }
 
-// This gets a unique wire ID. We add a random identifier so that if the worker
-// has other clients that it is servicing, we don't have any collision.
+// This gets a unique wire ID. We add a random identifier so that if the
+// worker has other clients that it is servicing, we don't have any collision.
 string GetUniqueWireID() {
   static tensorflow::uint64 random_seed = random::New64();
   static tensorflow::mutex wireid_mutex(tensorflow::LINKER_INITIALIZED);
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 6143a52d4b9c83444eb98567decf26dbfca58504..80d381cecdd421fe9c580cd0a10fbc6db5953080 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -43,12 +43,12 @@ Status EagerExecute(
 
 // Low-level utility to execute the kernel specified by kernel on device device,
 // with the inputs op_inputs, in the context ctx.
-Status EagerExecute(EagerContext* ctx, Device* device,
-                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
-                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
-                    StepStats* maybe_step_stats,
-                    GraphCollector* graph_collector, TensorHandle** retvals,
-                    int num_retvals);
+Status EagerKernelExecute(EagerContext* ctx, Device* device,
+                          const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                          KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                          StepStats* maybe_step_stats,
+                          GraphCollector* graph_collector,
+                          TensorHandle** retvals, int num_retvals);
 
 // Low-level utility to copy a tensor handle from one device to another.
 Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
index a99d509dd60c4ad50b67ef237423570d7b595234..4459e3221b9f2387867e1efed4324322619e4388 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.h
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -64,7 +64,7 @@ class ExecuteNode : public EagerNode {
   }
 
   tensorflow::Status Run() override {
-    const Status status = EagerExecute(
+    const Status status = EagerKernelExecute(
         ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
         maybe_step_stats_, graph_collector_, retvals_.begin(), retvals_.size());
     if (status.ok()) {
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 317e9a16074b37ef6ecaf1d7f8c1a2daa412f75e..09f60a7e918d7b95fc6846912b6f01ec8910d4d2 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -15,49 +15,123 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#ifndef __ANDROID__
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#endif
 
 namespace tensorflow {
 
-// static
-Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flr,
-                             std::function<void(std::function<void()>)>* runner,
-                             KernelAndDevice* out) {
+KernelAndDeviceFunc::~KernelAndDeviceFunc() {
+  if (handle_ != kInvalidHandle) {
+    Status status = pflr_->ReleaseHandle(handle_);
+    if (!status.ok()) {
+      LOG(INFO) << "Ignoring error status when releasing multi-device function "
+                   "handle "
+                << status.ToString();
+    }
+  }
+}
+
+Status KernelAndDeviceOp::Init(const NodeDef& ndef,
+                               GraphCollector* graph_collector) {
   OpKernel* k = nullptr;
-  TF_RETURN_IF_ERROR(flr->CreateKernel(ndef, &k));
-  out->device_ = flr->device();
-  out->kernel_.reset(k);
-  out->flr_ = flr;
-  out->runner_ = runner;
-  out->default_runner_ = [](std::function<void()> f) { f(); };
-
-  // Update output_dtypes_.
+  TF_RETURN_IF_ERROR(flr_->CreateKernel(ndef, &k));
+  kernel_.reset(k);
+  return Status::OK();
+}
+
+Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
+                                 GraphCollector* graph_collector) {
   const OpDef* op_def = nullptr;
   const FunctionDef* function_def =
-      flr->GetFunctionLibraryDefinition()->Find(ndef.op());
+      flr_->GetFunctionLibraryDefinition()->Find(ndef.op());
   if (function_def != nullptr) {
     op_def = &(function_def->signature());
   } else {
     TF_RETURN_IF_ERROR(OpDefForOp(ndef.op().c_str(), &op_def));
   }
-  return OutputTypesForNode(ndef, *op_def, &out->output_dtypes_);
+  TF_RETURN_IF_ERROR(
+      InOutTypesForNode(ndef, *op_def, &input_dtypes_, &output_dtypes_));
+
+  FunctionLibraryRuntime::InstantiateOptions options;
+  options.target = device_->name();
+  options.is_multi_device_function = true;
+  for (const Device* device : input_devices_) {
+    options.input_devices.push_back(device->name());
+  }
+
+  const auto& it = ndef.attr().find("executor_type");
+  if (it != ndef.attr().end()) {
+    options.executor_type = it->second.s();
+  }
+#ifndef __ANDROID__
+  // Android tf library does not include grappler.
+  const auto& config_it = ndef.attr().find("config_proto");
+  if (it != ndef.attr().end()) {
+    ConfigProto config_proto;
+    if (!config_proto.ParseFromString(config_it->second.s())) {
+      return errors::InvalidArgument(
+          "Failed to parse config_proto attribute as tensorflow::ConfigProto "
+          "proto.");
+    }
+    // We are going to execute the graph via function library runtime, and
+    // because function execution semantics is slightly different from the
+    // regular tensorlow graph, we need to make sure that Grappler respects it
+    // when doing it's optimization passes (e.g. do not prune stateful and
+    // dataset ops).
+    grappler::GrapplerItem::OptimizationOptions optimization_options;
+    optimization_options.is_function_instantiation = true;
+
+    // Keras graphs expected to be executed with regular graph execution
+    // semantics (it's allowed to prune stateful and dataset ops).
+    if (absl::StrContains(function_def->signature().name(), "keras_graph")) {
+      optimization_options.is_function_instantiation = false;
+    }
+
+    // Wrapped function expects execution semantics to be the same as
+    // `session.run`, so we should prune unreachable stateful and dataset ops.
+    if (absl::StrContains(function_def->signature().name(),
+                          "wrapped_function")) {
+      optimization_options.is_function_instantiation = false;
+    }
+
+    options.optimize_graph_fn = std::bind(
+        grappler::OptimizeGraph, std::placeholders::_1, std::placeholders::_2,
+        std::placeholders::_3, std::placeholders::_4, config_proto,
+        optimization_options, std::placeholders::_5);
+  }
+#endif
+  options.graph_collector = graph_collector;
+
+  TF_RETURN_IF_ERROR(
+      pflr_->Instantiate(ndef.op(), AttrSlice(ndef), options, &handle_));
+  return pflr_->GetOutputDevices(handle_, &output_devices_);
+  return Status::OK();
 }
 
-Status KernelAndDevice::Run(std::vector<Tensor>* inputs,
+Status KernelAndDevice::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
                             std::vector<Tensor>* outputs, NodeExecStats* stats,
                             StepStats* step_stats,
                             GraphCollector* graph_collector) {
@@ -68,16 +142,39 @@ Status KernelAndDevice::Run(std::vector<Tensor>* inputs,
                    graph_collector);
 }
 
-Status KernelAndDevice::Run(ScopedStepContainer* step_container,
-                            std::vector<Tensor>* inputs,
-                            std::vector<Tensor>* outputs, NodeExecStats* stats,
-                            StepStats* step_stats,
-                            GraphCollector* graph_collector) {
-  gtl::InlinedVector<TensorValue, 4> input_vector;
-  for (Tensor& t : *inputs) {
-    input_vector.push_back(TensorValue(&t));
+namespace {
+void UpdateStats(OpKernelContext* context,
+                 StepStatsCollector* step_stats_collector,
+                 NodeExecStats* stats) {
+  for (const auto& allocator_pair : context->ConsumeWrappedAllocators()) {
+    AllocatorMemoryUsed* memory = stats->add_memory();
+    memory->set_allocator_name(allocator_pair.first->Name());
+    auto sizes = allocator_pair.second->GetSizes();
+    memory->set_total_bytes(std::get<0>(sizes));
+    memory->set_peak_bytes(std::get<1>(sizes));
+    memory->set_live_bytes(std::get<2>(sizes));
+
+    AllocatorStats allocator_stats;
+    allocator_pair.first->GetStats(&allocator_stats);
+    memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
+    allocator_pair.second->GetRecordsAndUnRef();
+  }
+  auto* ms = stats->mutable_memory_stats();
+  ms->set_temp_memory_size(context->temp_memory_allocated());
+  for (const auto& alloc_id : context->persistent_alloc_ids()) {
+    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
   }
 
+  ms->set_persistent_memory_size(context->persistent_memory_allocated());
+  step_stats_collector->Finalize();
+}
+}  // anonymous namespace
+
+Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
+                              const gtl::InlinedVector<TensorValue, 4>& inputs,
+                              std::vector<Tensor>* outputs,
+                              NodeExecStats* stats, StepStats* step_stats,
+                              GraphCollector* graph_collector) {
   std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
   for (size_t i = 0; i < out_attrs.size(); ++i) {
     out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
@@ -85,7 +182,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   }
 
   gtl::InlinedVector<DeviceContext*, 4> input_device_contexts;
-  for (int i = 0; i < inputs->size(); i++) {
+  for (int i = 0; i < inputs.size(); i++) {
     DeviceContext* device_context = nullptr;
     if (device_->tensorflow_gpu_device_info() != nullptr) {
       device_context = device_->tensorflow_gpu_device_info()->default_context;
@@ -96,7 +193,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
-  params.inputs = &input_vector;
+  params.inputs = &inputs;
   params.op_kernel = kernel_.get();
   params.resource_manager = device_->resource_manager();
   params.output_attr_array = gtl::vector_as_array(&out_attrs);
@@ -112,11 +209,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     params.stats_collector = step_stats_collector.get();
     params.graph_collector = graph_collector;
   }
-  if (runner_ == nullptr) {
-    params.runner = &default_runner_;
-  } else {
-    params.runner = runner_;
-  }
+  params.runner = runner_ != nullptr ? runner_ : &default_runner_;
 
   params.step_container = step_container;
   params.collective_executor =
@@ -134,7 +227,16 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     device_->ComputeAsync(async, &context, [&done]() { done.Notify(); });
     done.WaitForNotification();
   } else {
-    device_->Compute(kernel_.get(), &context);
+    const string& op_name = kernel_->name();
+    // If tracing if off, the overheads of ScopedAnnotation and ScopedActivity
+    // are negligible.
+    if (device_->TraceUsingAnnotations()) {
+      tracing::ScopedAnnotation activity(op_name, kernel_->type_string());
+      device_->Compute(kernel_.get(), &context);
+    } else {
+      tracing::ScopedActivity activity(op_name, kernel_->type_string());
+      device_->Compute(kernel_.get(), &context);
+    }
   }
   if (!context.status().ok()) return context.status();
 
@@ -143,37 +245,111 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     outputs->push_back(Tensor(*context.mutable_output(i)));
   }
   if (stats != nullptr) {
-    for (const auto& allocator_pair : context.ConsumeWrappedAllocators()) {
-      AllocatorMemoryUsed* memory = stats->add_memory();
-      memory->set_allocator_name(allocator_pair.first->Name());
-      auto sizes = allocator_pair.second->GetSizes();
-      memory->set_total_bytes(std::get<0>(sizes));
-      memory->set_peak_bytes(std::get<1>(sizes));
-      memory->set_live_bytes(std::get<2>(sizes));
-
-      AllocatorStats allocator_stats;
-      allocator_pair.first->GetStats(&allocator_stats);
-      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
-      allocator_pair.second->GetRecordsAndUnRef();
-    }
-    auto* ms = stats->mutable_memory_stats();
-    ms->set_temp_memory_size(context.temp_memory_allocated());
-    for (const auto& alloc_id : context.persistent_alloc_ids()) {
-      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
-    }
+    UpdateStats(&context, step_stats_collector.get(), stats);
+  }
+  return Status::OK();
+}
 
-    ms->set_persistent_memory_size(context.persistent_memory_allocated());
+Status KernelAndDeviceFunc::Run(
+    ScopedStepContainer* step_container,
+    const gtl::InlinedVector<TensorValue, 4>& inputs,
+    std::vector<Tensor>* outputs, NodeExecStats* stats, StepStats* step_stats,
+    GraphCollector* graph_collector) {
+  FunctionLibraryRuntime::Options opts;
+  // We don't pass rendezvous from eager context because we can get tensor
+  // name collisions in send/recv ops when running multiple instances
+  // of the same multi-device function concurrently. Instead, we ask the
+  // function library runtime to create a new for this call. We could have
+  // created one here but it requires more state to be kept in
+  // KernelAndDeviceFunc.
+  opts.rendezvous = nullptr;
+  opts.create_rendezvous = true;
+  opts.cancellation_manager = &cm_;
+  // eager runtime does not yet support collective ops.
+  opts.collective_executor = nullptr;
+  opts.allow_dead_tensors = true;
+  opts.step_container = step_container;
+  opts.collective_executor =
+      collective_executor_ ? collective_executor_->get() : nullptr;
+
+  std::unique_ptr<StepStatsCollector> step_stats_collector;
+  if (stats != nullptr) {
+    step_stats_collector.reset(new StepStatsCollector(step_stats));
+  }
+  opts.stats_collector = step_stats_collector.get();
+  opts.runner = (runner_ == nullptr) ? &default_runner_ : runner_;
+
+  Notification done;
+  Status status;
+  outputs->clear();
+  std::vector<Tensor> input_vector;
+  input_vector.reserve(inputs.size());
+  for (const TensorValue& tensor_value : inputs) {
+    input_vector.push_back(*tensor_value.tensor);
+  }
+
+  flr_->Run(opts, handle_, input_vector, outputs,
+            [&status, &done](const Status& s) {
+              status = s;
+              done.Notify();
+            });
+  done.WaitForNotification();
+
+  if (step_stats_collector != nullptr) {
     step_stats_collector->Finalize();
   }
-  return Status::OK();
+  return status;
+}
+
+tensorflow::Device* KernelAndDeviceOp::OutputDevice(int idx) const {
+  if (kernel_->output_memory_types()[idx] == HOST_MEMORY) {
+    return nullptr;
+  }
+  return device_;
 }
 
-tensorflow::Device* KernelAndDevice::OutputDevice(int idx) const {
-  if (device_ != nullptr &&
-      kernel_->output_memory_types()[idx] == HOST_MEMORY) {
+tensorflow::Device* KernelAndDeviceFunc::OutputDevice(int idx) const {
+  if (output_dtypes_[idx] == DT_RESOURCE) {
     return nullptr;
   }
+  return output_devices_[idx];
+}
+
+tensorflow::Device* KernelAndDeviceOp::OutputResourceDevice(int idx) const {
+  if (kernel_->output_type(idx) == DT_RESOURCE) {
+    return device_;
+  }
+  return nullptr;
+}
+
+tensorflow::Device* KernelAndDeviceFunc::OutputResourceDevice(int idx) const {
+  if (output_dtypes_[idx] == DT_RESOURCE) {
+    return output_devices_[idx];
+  }
+  return nullptr;
+}
+
+DataType KernelAndDeviceOp::input_type(int i) const {
+  return kernel_->input_type(i);
+}
+
+DataType KernelAndDeviceFunc::input_type(int i) const {
+  return input_dtypes_[i];
+}
+
+Device* KernelAndDeviceOp::InputDevice(int i) const {
+  if (kernel_->input_memory_types()[i] == HOST_MEMORY) {
+    return host_cpu_device_;
+  }
   return device_;
 }
 
+Device* KernelAndDeviceFunc::InputDevice(int i) const {
+  if (input_dtypes_[i] == DT_RESOURCE) {
+    return host_cpu_device_;
+  } else {
+    return input_devices_[i];
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index ee430b7fc70e1f4e5256e9dd28f4240ce57de86a..c4ea99f53e14287c4852bcb1becb3c352fa9746b 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -38,8 +38,13 @@ namespace tensorflow {
 // include the proto header
 class NodeExecStats;
 class StepStats;
+class ProcessFunctionLibraryRuntime;
+class FunctionLibraryRuntime;
 
-// KernelAndDevice encapsulates an instantiated kernel and the device it is on.
+// KernelAndDevice encapsulates the logic needed to run a computation eagerly.
+// The computation can be a single instantiated kernel (implemented by
+// KernelAndDeviceOp below) or a multi-device function (implemented by
+// KernelAndDeviceFunc below).
 //
 // Also see:
 // https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -47,59 +52,171 @@ class StepStats;
 // https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h
 class KernelAndDevice {
  public:
-  // Populates 'out' with a kernel appropriate for 'ndef'.
+  // Populates this with a kernel appropriate for 'ndef'.
   //
   // The provided FunctionLibraryRuntime MUST outlive all calls to
   // Run() on the returned KernelAndDevice.
-  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flr,
-                     std::function<void(std::function<void()>)>* runner,
-                     KernelAndDevice* out);
-
-  KernelAndDevice(tensorflow::Rendezvous* rendez, bool log_memory)
-      : KernelAndDevice(rendez, log_memory, nullptr) {}
+  virtual Status Init(const NodeDef& ndef, GraphCollector* graph_collector) = 0;
 
+  // Non-multi-device functions are run using regular CallOp and look like
+  // primitive operations from KernelAndDevice perspective.
   KernelAndDevice(
-      tensorflow::Rendezvous* rendez, bool log_memory,
-      std::unique_ptr<CollectiveExecutor::Handle> collective_executor)
-      : device_(nullptr),
-        flr_(nullptr),
-        rendez_(rendez),
-        log_memory_(log_memory),
+      FunctionLibraryRuntime* flr,
+      std::function<void(std::function<void()>)>* runner,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
+      Device* host_cpu_device)
+      : device_(flr->device()),
+        host_cpu_device_(host_cpu_device),
+        flr_(flr),
+        runner_(runner),
+        default_runner_([](std::function<void()> f) { f(); }),
         collective_executor_(std::move(collective_executor)) {}
 
-  // TODO(ashankar): Handle list-valued inputs.
-  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
-             NodeExecStats* stats, StepStats* step_stats,
-             GraphCollector* graph_collector);
+  virtual ~KernelAndDevice() {}
 
-  Status Run(ScopedStepContainer* step_container, std::vector<Tensor>* inputs,
+  // TODO(ashankar): Handle list-valued inputs.
+  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
              std::vector<Tensor>* outputs, NodeExecStats* stats,
              StepStats* step_stats, GraphCollector* graph_collector);
 
-  Device* OutputDevice(int idx) const;
-
-  const OpKernel* kernel() const { return kernel_.get(); }
-
+  virtual Status Run(ScopedStepContainer* step_container,
+                     const gtl::InlinedVector<TensorValue, 4>& inputs,
+                     std::vector<Tensor>* outputs, NodeExecStats* stats,
+                     StepStats* step_stats,
+                     GraphCollector* graph_collector) = 0;
+
+  virtual Device* InputDevice(int i) const = 0;
+  virtual Device* OutputDevice(int idx) const = 0;
+  // If idx'th output is a resource, returns the device backing the resource.
+  // Else, returns nullptr.
+  virtual Device* OutputResourceDevice(int idx) const = 0;
+
+  // Returns nullptr for functions.
+  virtual const OpKernel* kernel() const = 0;
+
+  // Returns the device on which this kernel will run. In the case of
+  // multi-device functions, this is the default device that is passed to the
+  // placer but actual computation can happen on a different set of devices.
+  // Also, outputs can be produced on devices different from what this method
+  // returns.
   Device* device() const { return device_; }
 
-  const DataTypeVector& output_dtypes() { return output_dtypes_; }
+  virtual const DataTypeVector& output_dtypes() const = 0;
 
- private:
+  virtual DataType input_type(int i) const = 0;
+  virtual int num_inputs() const = 0;
+  virtual int num_outputs() const = 0;
+
+ protected:
   // TODO(apassos) Consider a shared cancellation manager. Note that this
   // cancellation manager is not useful to actually cancel anything, and is
   // provided here only for the few kernels which can't handle one being
   // missing.
   CancellationManager cm_;
+  Device* const device_;           // non-null
+  Device* const host_cpu_device_;  // non-null
+  FunctionLibraryRuntime* const flr_;
+  std::function<void(std::function<void()>)>* const runner_;
+  std::function<void(std::function<void()>)> default_runner_;
+  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
+};
+
+// Represents an op kernel and the device it will be run on.
+class KernelAndDeviceOp final : public KernelAndDevice {
+ public:
+  KernelAndDeviceOp(
+      tensorflow::Rendezvous* rendez, bool log_memory,
+      FunctionLibraryRuntime* flr,
+      std::function<void(std::function<void()>)>* runner,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
+      Device* host_cpu_device)
+      : KernelAndDevice(flr, runner, std::move(collective_executor),
+                        host_cpu_device),
+        rendez_(rendez),
+        log_memory_(log_memory) {}
+
+  Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
+
+  using KernelAndDevice::Run;
+
+  Status Run(ScopedStepContainer* step_container,
+             const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
+
+  const OpKernel* kernel() const override { return kernel_.get(); }
+
+  Device* InputDevice(int i) const override;
+  Device* OutputDevice(int idx) const override;
+  Device* OutputResourceDevice(int idx) const override;
+
+  DataType input_type(int i) const override;
+  const DataTypeVector& output_dtypes() const {
+    return kernel_->output_types();
+  }
+  int num_inputs() const override { return kernel_->num_inputs(); }
+  int num_outputs() const override { return kernel_->num_outputs(); }
+
+ private:
   std::unique_ptr<OpKernel> kernel_;
-  Device* device_;
-  FunctionLibraryRuntime* flr_;
+  Rendezvous* const rendez_;
   checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
-  Rendezvous* rendez_;
-  DataTypeVector output_dtypes_;
-  std::function<void(std::function<void()>)>* runner_;
-  std::function<void(std::function<void()>)> default_runner_;
   const bool log_memory_;
-  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
+};
+
+// Represents a multi-device function. Functions can also be run using
+// various function-calling kernels including CallOp and PartitionedCallOp.
+// In such cases, KernelAndDeviceOp is used.
+class KernelAndDeviceFunc final : public KernelAndDevice {
+ public:
+  KernelAndDeviceFunc(
+      FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr,
+      std::vector<Device*> input_devices,
+      std::function<void(std::function<void()>)>* runner,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
+      Device* host_cpu_device)
+      : KernelAndDevice(flr, runner, std::move(collective_executor),
+                        host_cpu_device),
+        pflr_(pflr),
+        handle_(kInvalidHandle),
+        input_devices_(std::move(input_devices)) {}
+
+  virtual ~KernelAndDeviceFunc();
+
+  Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
+
+  using KernelAndDevice::Run;
+
+  Status Run(ScopedStepContainer* step_container,
+             const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
+
+  const OpKernel* kernel() const override { return nullptr; }
+
+  Device* InputDevice(int i) const override;
+  Device* OutputDevice(int idx) const override;
+  Device* OutputResourceDevice(int idx) const override;
+
+  DataType input_type(int i) const override;
+  const DataTypeVector& output_dtypes() const override {
+    return output_dtypes_;
+  }
+  int num_inputs() const override { return input_dtypes_.size(); }
+  int num_outputs() const override { return output_dtypes_.size(); }
+
+ private:
+  ProcessFunctionLibraryRuntime* const pflr_;
+  FunctionLibraryRuntime::Handle handle_;
+  // CPU devices are null. Resource handles' devices are actual backing
+  // devices.
+  std::vector<Device*> output_devices_;
+  // CPU devices are not null. Resource handles' devices are actual backing
+  // devices.
+  std::vector<Device*> input_devices_;
+
+  DataTypeVector input_dtypes_;
+  DataTypeVector output_dtypes_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
index 3ffed3ce321e79d021c302acf444f93cc9ccce53..703f3eb9b750f031ff0f69b3395a32c1d9414168 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -27,10 +27,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
@@ -41,20 +44,27 @@ class TestEnv {
     std::vector<std::unique_ptr<Device>> devices;
     devices.push_back(
         DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+    cpu_device_ = devices.back().get();
     device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
-    flib_runtime_ = NewFunctionLibraryRuntime(
-        device_mgr_.get(), Env::Default(), device_mgr_->ListDevices()[0],
-        TF_GRAPH_DEF_VERSION, &flib_def_, nullptr, {}, nullptr);
-  }
+    OptimizerOptions opts;
+    pflr_ = tensorflow::MakeUnique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, &flib_def_,
+        opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
 
-  FunctionLibraryRuntime* function_library_runtime() const {
-    return flib_runtime_.get();
+    flr_ = pflr_->GetFLR("/job:a/replica:0/task:0/device:CPU:0");
+    CHECK(flr_ != nullptr);
   }
 
+  FunctionLibraryRuntime* function_library_runtime() const { return flr_; }
+  ProcessFunctionLibraryRuntime* pflr() const { return pflr_.get(); }
+  Device* cpu_device() { return cpu_device_; }
+
  private:
   FunctionLibraryDefinition flib_def_;
   std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+  FunctionLibraryRuntime* flr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  Device* cpu_device_;
 };
 
 void BM_CreateGraph(int iters) {
@@ -106,11 +116,11 @@ void BM_KernelAndDeviceInit(int iters) {
                    .NumInputs(2)
                    .BuildNodeDef());
   TestEnv env;
-  KernelAndDevice k(nullptr, false);
+  KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr,
+                      nullptr, env.cpu_device());
   tensorflow::testing::StartTiming();
   for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(KernelAndDevice::Init(ndef, env.function_library_runtime(),
-                                      nullptr, &k));
+    TF_CHECK_OK(k.Init(ndef, nullptr));
   }
 }
 BENCHMARK(BM_KernelAndDeviceInit);
@@ -118,9 +128,9 @@ BENCHMARK(BM_KernelAndDeviceInit);
 void BM_KernelAndDeviceRun(int iters) {
   tensorflow::testing::StopTiming();
   Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
-  std::vector<Tensor> inputs;
-  inputs.push_back(t);
-  inputs.push_back(t);
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  inputs.push_back(TensorValue(&t));
+  inputs.push_back(TensorValue(&t));
   std::vector<Tensor> outputs;
   NodeDef ndef(AttrBuilder("MatMul")
                    .Set("T", DT_FLOAT)
@@ -129,12 +139,12 @@ void BM_KernelAndDeviceRun(int iters) {
                    .NumInputs(inputs.size())
                    .BuildNodeDef());
   TestEnv env;
-  KernelAndDevice kernel(nullptr, false);
-  TF_CHECK_OK(KernelAndDevice::Init(ndef, env.function_library_runtime(),
-                                    nullptr, &kernel));
+  KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr,
+                      nullptr, env.cpu_device());
+  TF_CHECK_OK(k.Init(ndef, nullptr));
   tensorflow::testing::StartTiming();
   for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr, nullptr, nullptr));
+    TF_CHECK_OK(k.Run(inputs, &outputs, nullptr, nullptr, nullptr));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index 0acd1609361453a0901e346f3b9d76e6e3a7b872..e44a97b2655fee02b77c965dcc8d3aa04dbcd091 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -22,16 +22,17 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
-#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -44,6 +45,74 @@ limitations under the License.
 
 namespace tensorflow {
 
+TensorHandle::TensorHandle(const class Tensor& t, Device* d, Device* op_device,
+                           EagerContext* ctx)
+    : dtype(t.dtype()),
+      node_id_(0),
+      tensor_(t),
+      device_(d),
+      op_device_(op_device),
+      resource_device_(GetResourceDevice(t, ctx)),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(ctx),
+      is_ready_(true) {}
+
+TensorHandle::TensorHandle(uint64 node_id, Device* d, Device* op_device,
+                           Device* resource_device, DataType dtype,
+                           EagerContext* ctx)
+    : dtype(dtype),
+      node_id_(node_id),
+      tensor_(dtype),
+      device_(d),
+      op_device_(op_device),
+      resource_device_(resource_device),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(ctx),
+      is_ready_(ctx == nullptr) {
+  DCHECK_GT(node_id_, 0);
+  DCHECK(dtype == DT_RESOURCE ? resource_device_ != nullptr
+                              : resource_device_ == nullptr);
+}
+
+TensorHandle::TensorHandle(int64 op_id, int32 output_num,
+                           uint64 remote_shape_node_id, DataType dtype,
+                           std::function<void()> call_on_destroy, Device* d,
+                           Device* op_device, Device* resource_device,
+                           EagerContext* ctx)
+    : dtype(dtype),
+      node_id_(0),
+      device_(d),
+      op_device_(op_device),
+      resource_device_(resource_device),
+      remote_op_id_(op_id),
+      remote_output_num_(output_num),
+      remote_shape_node_id_(remote_shape_node_id),
+      call_on_destroy_(std::move(call_on_destroy)),
+      ctx_(ctx),
+      is_ready_(true) {
+  DCHECK(IsRemote()) << "Op ID and output num should be >= 0. Op ID: " << op_id
+                     << ", Output num: " << output_num;
+  DCHECK(dtype == DT_RESOURCE ? resource_device_ != nullptr
+                              : resource_device_ == nullptr);
+}
+
+TensorHandle::TensorHandle(OutputGraphNode symbolic_tensor, DataType dtype)
+    : dtype(dtype),
+      node_id_(0),
+      device_(nullptr),
+      op_device_(nullptr),
+      resource_device_(nullptr),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(nullptr),
+      is_ready_(true),
+      symbolic_tensor(new OutputGraphNode(symbolic_tensor)) {}
+
 bool TensorHandle::IsReady() {
   if (node_id_ == 0) return true;
   mutex_lock l(ctx_mutex_);
@@ -79,6 +148,13 @@ Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
   return Status::OK();
 }
 
+Status TensorHandle::TensorValue(tensorflow::TensorValue* t) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *t = tensorflow::TensorValue(&tensor_);
+  return Status::OK();
+}
+
 Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
                                      tensorflow::Device** device,
                                      tensorflow::Device** op_device) {
@@ -239,4 +315,31 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   return status;
 }
 
+Device* GetResourceDevice(const Tensor& t, EagerContext* ctx) {
+  if (t.dtype() != DT_RESOURCE) {
+    return nullptr;
+  }
+  const ResourceHandle& resource_handle = t.flat<ResourceHandle>()(0);
+  const auto& map = *ctx->device_map();
+  auto it = map.find(resource_handle.device());
+  DCHECK(it != map.end());
+  return it->second;
+}
+
+string TensorHandle::DebugString() const {
+  VLOG(1) << "Calling TensorHandle::DebugString() on " << this;
+
+  if (symbolic_tensor) {
+    return absl::Substitute("TF_Output($0, $1)", symbolic_tensor->oper,
+                            symbolic_tensor->index);
+  }
+
+  string out;
+  strings::StrAppend(&out, "Device: ", device_ ? device_->DebugString() : "[]");
+  // Consider supporting non-CPU tensors (when device_ is non-NULL) if needed.
+  strings::StrAppend(&out, ", Tensor: ", device_ ? "?" : tensor_.DebugString(),
+                     "\n");
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index 0fdd31ab5fcfe99c92074fc69d831d17f46d607e..e4ccb11dba9aee4bd6bea1c4909e37a13957021d 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
-#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -42,59 +41,37 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+struct TF_Operation;
+
 namespace tensorflow {
 
+// This struct is isomorphic to TF_Output, but we cannot use the latter here due
+// to layering concerns (TF_Output is defined at the C API layer).
+struct OutputGraphNode {
+  TF_Operation* oper;
+  int index;  // The index of the output within oper.
+};
+
 // Associates a Tensor and a Device, used in the eager runtime. Internal version
 // of the TFE_TensorHandle struct and the python EagerTensor class
 // (unrelated to python TensorHandle).
 class TensorHandle : public core::RefCounted {
  public:
-  TensorHandle(const Tensor& t, Device* d, Device* op_device, EagerContext* ctx)
-      : dtype(t.dtype()),
-        node_id_(0),
-        tensor_(t),
-        device_(d),
-        op_device_(op_device),
-        remote_op_id_(-1),
-        remote_output_num_(-1),
-        remote_shape_node_id_(-1),
-        ctx_(ctx),
-        is_ready_(true) {}
-
-  TensorHandle(uint64 node_id, Device* d, Device* op_device, DataType dtype,
-               EagerContext* ctx)
-      : dtype(dtype),
-        node_id_(node_id),
-        tensor_(dtype),
-        device_(d),
-        op_device_(op_device),
-        remote_op_id_(-1),
-        remote_output_num_(-1),
-        remote_shape_node_id_(-1),
-        ctx_(ctx),
-        is_ready_(ctx == nullptr) {
-    DCHECK_GT(node_id_, 0);
-  }
+  TensorHandle(const Tensor& t, Device* d, Device* op_device,
+               EagerContext* ctx);
+  TensorHandle(uint64 node_id, Device* d, Device* op_device,
+               Device* resource_device, DataType dtype, EagerContext* ctx);
 
   // Remote tensor handle constructor.
   TensorHandle(int64 op_id, int32 output_num, uint64 remote_shape_node_id,
                DataType dtype, std::function<void()> call_on_destroy, Device* d,
-               Device* op_device, EagerContext* ctx)
-      : dtype(dtype),
-        node_id_(0),
-        device_(d),
-        op_device_(op_device),
-        remote_op_id_(op_id),
-        remote_output_num_(output_num),
-        remote_shape_node_id_(remote_shape_node_id),
-        call_on_destroy_(std::move(call_on_destroy)),
-        ctx_(ctx),
-        is_ready_(true) {
-    DCHECK(IsRemote()) << "Op ID and output num should be >= 0. Op ID: "
-                       << op_id << ", Output num: " << output_num;
-  }
+               Device* op_device, Device* resource_device, EagerContext* ctx);
+
+  // Symbolic tensor constructor.
+  TensorHandle(OutputGraphNode symbolic_tensor, DataType dtype);
 
   ~TensorHandle() override {
+    VLOG(1) << "Deleting internal TensorHandle " << this;
     if (call_on_destroy_) {
       call_on_destroy_();
     }
@@ -102,9 +79,11 @@ class TensorHandle : public core::RefCounted {
 
   Status Tensor(const tensorflow::Tensor** t);
 
-  tensorflow::Device* device() const { return device_; }
+  Status TensorValue(tensorflow::TensorValue* t);
 
+  tensorflow::Device* device() const { return device_; }
   tensorflow::Device* op_device() const { return op_device_; }
+  tensorflow::Device* resource_device() const { return resource_device_; }
 
   Status TensorAndDevice(const tensorflow::Tensor** tensor,
                          tensorflow::Device** device,
@@ -146,6 +125,12 @@ class TensorHandle : public core::RefCounted {
            (ctx_ == nullptr || ctx_->HostCPU() == device_);
   }
 
+  bool IsRemote();
+
+  OutputGraphNode* getSymbolicTensor() const { return symbolic_tensor.get(); }
+
+  string DebugString() const;
+
  private:
   // If the contents of the Tensor pointed to by this handle is yet to be
   // computed by a EagerNode, this function will block till that compuatation is
@@ -155,8 +140,6 @@ class TensorHandle : public core::RefCounted {
 
   bool IsReady();
 
-  bool IsRemote();
-
   // Id for the EagerNode that will compute the value pointed to by this handle.
   // If the value is 0, the handle is already ready, but not vice-versa.
   const uint64 node_id_;
@@ -177,6 +160,10 @@ class TensorHandle : public core::RefCounted {
   // device_ for constant tensors.
   tensorflow::Device* const op_device_;
 
+  // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device
+  // backing the resource. Else resource_device_ is nullptr.
+  tensorflow::Device* const resource_device_;
+
   // IDs required when this class is representing a remote tensor handle.
   const int64 remote_op_id_;
   const int32 remote_output_num_;
@@ -195,8 +182,17 @@ class TensorHandle : public core::RefCounted {
   // `ctx` object is not owned and should outlive this handle.
   EagerContext* ctx_ GUARDED_BY(ctx_mutex_);
   bool is_ready_ GUARDED_BY(ctx_mutex_);
+
+  // When non-NULL, this tensor handle instance represents a symbolic tensor
+  // (corresponding to a graph node), whose concrete value is to be produced by
+  // executing that graph node.
+  std::unique_ptr<OutputGraphNode> symbolic_tensor;
 };
 
+// If tensor's dtype is DT_RESOURCE, returns the device backing the resource.
+// Else, returns nullptr.
+Device* GetResourceDevice(const Tensor& t, EagerContext* ctx);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
diff --git a/tensorflow/core/common_runtime/eval_const_tensor.cc b/tensorflow/core/common_runtime/eval_const_tensor.cc
index 87749da7afed9f67c469cbcd63e685c2c534a4bb..fb51e2dec3ac63f64cd70bececa5734bb5afc8a4 100644
--- a/tensorflow/core/common_runtime/eval_const_tensor.cc
+++ b/tensorflow/core/common_runtime/eval_const_tensor.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 6b3284b84a0d2741f315c3f91db35eebc68f9e98..d068bbf1e4a15245ca1c8ef9f91d24722f682be1 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -58,6 +58,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
@@ -131,6 +132,16 @@ struct EdgeInfo {
   int input_slot;
 };
 
+// Time the execution of kernels (in CPU cycles).  Used to dynamically identify
+// inexpensive kernels which can be dispatched inline.
+struct KernelTimer {
+  uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle();
+
+  uint64 ElapsedCycles() {
+    return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles;
+  }
+};
+
 struct NodeItem {
   NodeItem() {}
 
@@ -140,7 +151,6 @@ struct NodeItem {
   // The kernel for this node.
   OpKernel* kernel = nullptr;
 
-  bool kernel_is_expensive : 1;  // True iff kernel->IsExpensive()
   bool kernel_is_async : 1;      // True iff kernel->AsAsync() != nullptr
   bool is_merge : 1;             // True iff IsMerge(node)
   bool is_enter : 1;             // True iff IsEnter(node)
@@ -625,7 +635,6 @@ Status ExecutorImpl::Initialize() {
       return s;
     }
     CHECK(item->kernel);
-    item->kernel_is_expensive = item->kernel->IsExpensive();
     item->kernel_is_async = (item->kernel->AsAsync() != nullptr);
     item->is_merge = IsMerge(n);
     item->is_enter = IsEnter(n);
@@ -1235,6 +1244,7 @@ class ExecutorState {
   Rendezvous* rendezvous_;
   CollectiveExecutor* collective_executor_ = nullptr;
   SessionState* session_state_;
+  string session_handle_;
   TensorStore* tensor_store_;
   // Step-local container.
   ScopedStepContainer* step_container_;
@@ -1266,6 +1276,11 @@ class ExecutorState {
 
   std::atomic_int_fast32_t num_outstanding_ops_;
 
+  // Available via OpKernelContext to every OpKernel invocation.
+  mutex num_deferred_ops_mu_;
+  condition_variable num_deferred_ops_cv_;
+  int64 num_deferred_ops_ GUARDED_BY(num_deferred_ops_mu_) = 0;
+
   mutex mu_;
   Status status_ GUARDED_BY(mu_);
 
@@ -1362,6 +1377,7 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       rendezvous_(args.rendezvous),
       collective_executor_(args.collective_executor),
       session_state_(args.session_state),
+      session_handle_(args.session_handle),
       tensor_store_(args.tensor_store),
       step_container_(args.step_container),
       stats_collector_(args.stats_collector),
@@ -1580,7 +1596,8 @@ bool MightTrace(const NodeItem& item,
     if (using_annotations) {
       return trace_collector->IsEnabledForAnnotations();
     } else {
-      return trace_collector->IsEnabledForActivities(item.kernel_is_expensive);
+      return trace_collector->IsEnabledForActivities(
+          item.kernel->IsExpensive());
     }
   }
   return false;
@@ -1606,6 +1623,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   params.rendezvous = rendezvous_;
   params.collective_executor = collective_executor_;
   params.session_state = session_state_;
+  params.session_handle = session_handle_;
   params.tensor_store = tensor_store_;
   params.cancellation_manager = cancellation_manager_;
   params.call_frame = call_frame_;
@@ -1618,6 +1636,15 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   params.input_alloc_attrs = &input_alloc_attrs;
   params.runner = &runner_;
   params.stats_collector = stats_collector_;
+  params.inc_num_deferred_ops_function = [this]() {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops_++;
+  };
+  params.dec_num_deferred_ops_function = [this]() {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops_--;
+    num_deferred_ops_cv_.notify_all();
+  };
 
   Status s;
   NodeExecStatsInterface* stats = nullptr;
@@ -1780,12 +1807,18 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
                 op_name,
                 strings::StrCat(op_kernel->type_string(), "#id=", step_id_,
                                 "#"),
-                item.kernel_is_expensive);
+                item.kernel->IsExpensive());
             device->Compute(op_kernel, &ctx);
           }
         } else {
           // In the common case, avoid creating any tracing objects.
-          device->Compute(op_kernel, &ctx);
+          if (op_kernel->IsExpensive()) {
+            KernelTimer timer;
+            device->Compute(op_kernel, &ctx);
+            op_kernel->UpdateCostEstimate(timer.ElapsedCycles());
+          } else {
+            device->Compute(op_kernel, &ctx);
+          }
         }
 
         nodestats::SetOpEnd(stats);
@@ -1883,7 +1916,7 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
       inp->tensor = entry->val.get();
     } else {
       {
-        mutex_lock ml(*entry->ref_mu);
+        tf_shared_lock ml(*entry->ref_mu);
         if (!entry->ref->IsInitialized() && !IsInitializationOp(item.node)) {
           return AttachDef(errors::FailedPrecondition(
                                "Attempting to use uninitialized value ",
@@ -1899,7 +1932,7 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
         // tensor but is given a ref to a tensor.  Need to deref it
         // under the mutex.
         {
-          mutex_lock l(*(entry->ref_mu));
+          tf_shared_lock l(*(entry->ref_mu));
           DCHECK(!entry->val_field_is_set);
           entry->val.Init(*entry->ref);
           entry->val_field_is_set = true;
@@ -1988,7 +2021,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
       // Sanity check of output tensor types.
       DataType dtype;
       if (val.is_ref()) {
-        mutex_lock ml(*val.mutex_if_ref);
+        tf_shared_lock ml(*val.mutex_if_ref);
         dtype = MakeRefType(val->dtype());
       } else {
         dtype = val->dtype();
@@ -2005,7 +2038,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
             Tensor to_log;
             {
               // Dereference the tensor under the lock.
-              mutex_lock l(*out->ref_mu);
+              tf_shared_lock l(*out->ref_mu);
               to_log = *out->ref;
             }
             LogMemory::RecordTensorOutput(ctx->op_kernel().name(),
@@ -2218,6 +2251,7 @@ void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready,
   if (stats_collector_) {
     scheduled_nsec = nodestats::NowInNsec();
   }
+
   if (inline_ready == nullptr) {
     // Schedule to run all the ready ops in thread pool.
     for (auto& tagged_node : ready) {
@@ -2225,11 +2259,12 @@ void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready,
     }
     return;
   }
+
   const GraphView& gview = impl_->gview_;
   const TaggedNode* curr_expensive_node = nullptr;
   for (auto& tagged_node : ready) {
     const NodeItem& item = *gview.node(tagged_node.node->id());
-    if (tagged_node.is_dead || !item.kernel_is_expensive) {
+    if (tagged_node.is_dead || !item.kernel->IsExpensive()) {
       // Inline this inexpensive node.
       inline_ready->push_back(tagged_node);
     } else {
@@ -2395,7 +2430,45 @@ void ExecutorState::Finish() {
   CHECK(done_cb != nullptr);
   Device* device = impl_->params_.device;
 
-  if ((sync_on_finish_ && status.ok()) || device->RequiresSyncOnCompletion()) {
+  // There are several potential race conditions below. To name a few:
+  // 1. Even if the device's status is OK at the precise moment when
+  // num_deferred_ops_ reaches 0, it could go bad before device->CurrentStatus()
+  // is called below, caused by work enqueued onto the same device by other
+  // concurrent ExecutorState objects.
+  // 2. Some implementations of Device::CurrentStatus, such as
+  // XlaDevice::CurrentStatus, may be inherently racy because it releases the
+  // device mutex after a stream pointer is acquired and before the stream is
+  // queried for status.
+  // 3. It's the same for some implementations of Device::Sync, such as
+  // XlaDevice::Sync.
+  //
+  // However, these race conditions are acceptable because a stream (and
+  // therefore an XlaDevice) can only go from OK to not-OK, never the opposite,
+  // which means we will at worst report errors when there isn't any, never the
+  // opposite.
+
+  // If inc_num_deferred_ops_function has ever been called, ExecutorState must
+  // wait for all corresponding dec_num_deferred_ops_function calls to happen
+  // regardless of status. This ensures that dec_num_deferred_ops_function can
+  // safely use ExecutorState's resources.
+  {
+    mutex_lock lock(num_deferred_ops_mu_);
+    while (num_deferred_ops_ > 0) {
+      num_deferred_ops_cv_.wait(lock);
+    }
+  }
+
+  // An early exit for devices don't allow sync on completion. Ops that run on
+  // these devices should have used num_deferred_ops correctly to ensure the
+  // device has finished all relevant work at this point.
+  if (!device->AllowsSyncOnCompletion()) {
+    status.Update(device->CurrentStatus());
+    delete this;
+    runner([=]() { done_cb(status); });
+    return;
+  }
+
+  if (sync_on_finish_ && status.ok()) {
     // Block until the device has finished all queued operations. For
     // devices like GPUs that continue to execute Ops after their Compute
     // methods have completed, this ensures that control is not returned to
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 02930168a4b053895827a54d065011bc9d657463..4be60c67713bc801a8249201d65a5dbc26646138 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -88,6 +88,8 @@ class Executor {
     CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
+    // Unique session identifier. Can be empty.
+    string session_handle;
     TensorStore* tensor_store = nullptr;
     ScopedStepContainer* step_container = nullptr;
     CollectiveExecutor* collective_executor = nullptr;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 7eb622dc117f40a68079e6cea1a829227acfed7a..82bbb7e68900164f86c718ec2f799a28c7125e81 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -104,6 +104,10 @@ static Node* AddIdentity(Graph* g, Endpoint input) {
   NodeDef ndef;
   ndef.set_name(g->NewName(kNodeLabel));
   ndef.set_op("Identity");
+  // NOTE(skyewm): we explicitly set the device here to address a multi-GPU
+  // performance issue where this Identity would be placed alone on a GPU,
+  // causing unnecessary device traffic. See b/122483225 for details.
+  ndef.set_device(input.node->def().device());
   ndef.add_input(input.name());
   AddNodeAttr("T", BaseType(input.dtype()), &ndef);
   Status s;
@@ -453,7 +457,9 @@ class CallOp : public AsyncOpKernel {
   CallOp(FunctionLibraryRuntime::Handle handle, OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx), handle_(handle) {}
 
-  ~CallOp() override {}
+  ~CallOp() override {
+    // TODO(iga): Release the cached handle_
+  }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     FunctionLibraryRuntime* lib = ctx->function_library();
@@ -628,11 +634,20 @@ bool FunctionLibraryRuntimeImpl::IsLocalTarget(
     const InstantiateOptions& options) {
   if (device_ == nullptr) return true;
   if (options.target.empty()) return true;
+  if (options.is_multi_device_function) return false;
   Device* target_device;
   if (!device_mgr_->LookupDevice(options.target, &target_device).ok()) {
+    VLOG(1) << "Not instantiating function in FLR because failed to "
+            << "find device " << options.target << " in device manager";
+    return false;
+  }
+  if (target_device != device_) {
+    VLOG(1) << "Not instantiating function in FLR because target device "
+            << options.target
+            << " is different from FLR's device: " << device_->DebugString();
     return false;
   }
-  return target_device == device_;
+  return true;
 }
 
 Status FunctionLibraryRuntimeImpl::Instantiate(
@@ -732,15 +747,32 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
   if (h == kInvalidLocalHandle) {
     return parent_->ReleaseHandle(handle);
   }
-  mutex_lock l(mu_);
-  CHECK_EQ(1, items_.count(h));
-  std::unique_ptr<Item>& item = items_[h];
-  --item->instantiation_counter;
-  if (item->instantiation_counter == 0) {
-    items_.erase(h);
-    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+
+  std::unique_ptr<Item> item_to_delete;
+  Status parent_status;
+  {
+    mutex_lock l(mu_);
+    auto it = items_.find(h);
+    if (it == items_.end()) {
+      return errors::Internal(
+          "Inconsistent FunctionLibraryRuntime. Expected to find an item for "
+          "handle ",
+          h, " but found none");
+    }
+    std::unique_ptr<Item>& item = it->second;
+    --item->instantiation_counter;
+    if (item->instantiation_counter == 0) {
+      // We don't simply erase h's item because that would trigger
+      // item destruction while holding mu_. Item destruction can
+      // trigger graph destruction. If the graph contains kernels like
+      // CallOp or PartitionCallOp, their destructors will release cached
+      // function handles, resulting in deadlock here.
+      item_to_delete = std::move(item);
+      items_.erase(h);
+      parent_status = parent_->RemoveHandle(handle);
+    }
   }
-  return Status::OK();
+  return parent_status;
 }
 
 void DumpGraph(StringPiece label, const Graph* g) {
@@ -754,13 +786,19 @@ void DumpGraph(StringPiece label, const Graph* g) {
   }
 }
 
-void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
+void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g,
+                   const GraphOptimizer::Options& graph_optimizer_options) {
   OptimizerOptions opts;
   opts.set_do_common_subexpression_elimination(true);
   opts.set_do_function_inlining(true);
   opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
-  optimizer.Optimize(lib, lib->env(), lib->device(), g, /*shape_map=*/nullptr);
+  optimizer.Optimize(lib, lib->env(), lib->device(), g,
+                     graph_optimizer_options);
+}
+
+void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
+  OptimizeGraph(lib, g, GraphOptimizer::Options());
 }
 
 namespace {
@@ -780,7 +818,7 @@ void PruneFunctionBody(Graph* g) {
     // TODO(mrry): Investigate whether the `n->IsControlFlow()` test is
     // still needed. It would be preferable to prune entire loops and/or
     // conditionals if they are not used in the graph.
-    if (n->IsControlFlow() ||
+    if (n->IsControlFlow() || n->IsDataset() ||
         (n->op_def().is_stateful() && n->type_string() != kArgOp)) {
       nodes.insert(n);
     }
@@ -1407,6 +1445,12 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     if (override_device || ndef.device().empty()) {
       ndef.set_device(caller->def().device());
     }
+    for (auto& attr : *ndef.mutable_attr()) {
+      if (attr.first == "_class") {
+        attr.second.set_s(
+            strings::StrCat(caller->name(), "/", attr.second.s()));
+      }
+    }
     Node* clone = g->AddNode(ndef, &s);
     TF_CHECK_OK(s);
     node_map[n->id()] = clone;
@@ -1586,6 +1630,13 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
     for (const auto& attr : n->attrs()) {
       (*ndef->mutable_attr())[attr.first] = attr.second;
     }
+
+    if (!n->assigned_device_name().empty()) {
+      ndef->set_device(n->assigned_device_name());
+    } else {
+      ndef->set_device(n->requested_device());
+    }
+
     inputs.clear();
     inputs.resize(n->num_inputs());
     for (const Edge* e : n->in_edges()) {
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index eeca66f5d0bdef6b036b77b170ccd07945be28b7..94b6beeb933ea3f27a1829835bb886d842301c19 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
@@ -133,6 +134,8 @@ void DumpGraph(StringPiece label, const Graph* g);
 // OptimizeGraph mutates **g extensively and replaces '*g' with a
 // complete copy. Therefore, the caller should not keep any references
 // to nodes *g.
+void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g,
+                   const GraphOptimizer::Options& graph_optimizer_options);
 void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g);
 
 // Convert the Graph of a function to a GraphDef.
@@ -157,6 +160,8 @@ FunctionBody* SymbolicGradient(const FunctionBody& f);
 // to "fbody". Replaces the "caller" with fbody->graph and connects
 // edges properly. "override_device" specifies whether inlining should replace
 // explicitly specified devices inside fbody with the callee's device.
+//
+// TODO(ezhulenev): Return Status::error if function inlining failed.
 void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
                         Node* caller, const FunctionBody* fbody,
                         bool override_device = true);
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index cab95cb596858f99285c3cfc5673f87b70368a32..97e46f406cf96cc284ec14718f9500767f5e9861 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -246,9 +246,10 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2))
+        << "Actual status: " << status2.ToString();
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
@@ -316,9 +317,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 1b803736fb881c8f133198ab39e5801a357c5659..1dca25e0064e12c9b21c76102278e1bebdc67a4a 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -149,9 +149,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index d85ca8892f6d19c2c10a5f35368a476506ecc370..4be1bbb7df37c1aa954ea3350f82eee5b15ad1bf 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 #include "cuda/include/cuda.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -41,7 +42,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // allocate with cudaMalloc
   se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
-  CUresult res = cuMemAlloc(&rv, num_bytes);
+  CUresult res = tensorflow::wrap::cuMemAlloc(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
     return nullptr;
@@ -54,7 +55,8 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #ifdef GOOGLE_CUDA
   // free with cudaFree
-  CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+  CUresult res =
+      tensorflow::wrap::cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemFree failed to free " << ptr;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 989ddbe4af53ee200f994ea8e3f2ae42e5bcab7f..c22bfcea2cedab93409d761686d852a5c4bbeeb9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -44,8 +44,9 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
-  if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
-    LOG(FATAL) << "Could not copy debug mask";
+  Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
+  if (!result.ok()) {
+    LOG(FATAL) << "Could not copy debug mask, " << result;
   }
 
   bool ok = true;
@@ -63,8 +64,9 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 
 void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
-  if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
-    LOG(FATAL) << "Could not copy debug mask";
+  Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
+  if (!result.ok()) {
+    LOG(FATAL) << "Could not copy debug mask, " << result;
   }
 }
 
@@ -171,8 +173,10 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   se::DeviceMemory<float> nan_ptr{
       se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  Status result =
+      stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
+  if (!result.ok()) {
+    LOG(ERROR) << "Could not initialize to NaNs, " << result;
   }
 
   return allocated_ptr;
@@ -185,8 +189,10 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
                             std::nanf(""));
     se::DeviceMemory<float> nan_ptr{
         se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-      LOG(ERROR) << "Could not initialize to NaNs";
+    Status result =
+        stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
+    if (!result.ok()) {
+      LOG(ERROR) << "Could not initialize to NaNs, " << result;
     }
   }
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 04d658f0472e3ea07855f4bae6a89ad5199eb2f9..b185ea1fa50fb866b36e928f441cd267a8da4301 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -32,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/collective_order.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
@@ -59,6 +61,7 @@ GraphExecutionState::GraphExecutionState(
     : stateful_placements_(options.stateful_placements),
       device_set_(options.device_set),
       session_options_(options.session_options),
+      session_handle_(options.session_handle),
       flib_def_(new FunctionLibraryDefinition(OpRegistry::Global(),
                                               graph_def->library())),
       graph_(nullptr) {
@@ -198,6 +201,7 @@ Status GraphExecutionState::Extend(
   GraphExecutionStateOptions combined_options;
   combined_options.device_set = device_set_;
   combined_options.session_options = session_options_;
+  combined_options.session_handle = session_handle_;
   combined_options.stateful_placements = stateful_placements_;
 
   // NOTE(mrry): `gdef` is no longer valid after the constructor
@@ -558,6 +562,7 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   RestoreStatefulNodes(new_graph.get());
 
   GraphOptimizationPassOptions optimization_options;
+  optimization_options.session_handle = session_handle_;
   optimization_options.session_options = session_options_;
   optimization_options.graph = &new_graph;
   optimization_options.flib_def = flib_def_.get();
@@ -728,6 +733,7 @@ Status GraphExecutionState::OptimizeGraph(
 Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
                                        std::unique_ptr<ClientGraph>* out) {
   VLOG(1) << "BuildGraph";
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
   if (!graph_) {
     // It is only valid to call this method directly when the original graph
     // was created with the option `place_pruned_graph == false`.
@@ -816,6 +822,12 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
     }
   }
 
+  // Make collective execution order deterministic if needed.
+  if (options.collective_order != GraphCollectiveOrder::kNone) {
+    TF_RETURN_IF_ERROR(
+        OrderCollectives(optimized_graph.get(), options.collective_order));
+  }
+
   // Copy the extracted graph in order to make its node ids dense,
   // since the local CostModel used to record its stats is sized by
   // the largest node id.
@@ -825,7 +837,7 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   CopyGraph(*optimized_graph, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
-
+  metrics::UpdateGraphBuildTime(Env::Default()->NowMicros() - start_time_usecs);
   *out = std::move(dense_copy);
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 9cabe478a68a72252579755dca1e8957242344ba..56315bb1ef7947d788a7ada6ef0fa14f50e2a978 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -41,6 +41,8 @@ struct RewriteGraphMetadata;
 struct GraphExecutionStateOptions {
   const DeviceSet* device_set = nullptr;
   const SessionOptions* session_options = nullptr;
+  // Unique session identifier. Can be empty.
+  string session_handle;
   // A map from node name to device name, representing the unchangeable
   // placement of stateful nodes.
   std::unordered_map<string, string> stateful_placements;
@@ -192,6 +194,8 @@ class GraphExecutionState {
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
   const SessionOptions* session_options_;  // Not owned
+  // Unique session identifier. Can be empty.
+  string session_handle_;
 
   // Map from name to Node for the full graph in placed_.
   NodeNameToCostIdMap node_name_to_cost_id_map_;
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 37a979a8f1929ed6312dc79354a3c206f7c4c5f4..7905944fb18105e38059a892d32b9509273a7742 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -38,8 +38,7 @@ void GraphOptimizer::Optimize(
     std::unique_ptr<Graph>* graph,
     const std::unordered_map<string, std::vector<PartialTensorShape>>*
         shape_map,
-    const std::function<bool(const Node*)>& cse_consider_fn,
-    const std::function<bool(const Node*)>& cf_consider_fn) {
+    const NodePredicate& cse_consider_fn, const NodePredicate& cf_consider_fn) {
   Graph* g = graph->get();
   DumpGraph("Initial", g);
 
@@ -103,4 +102,11 @@ void GraphOptimizer::Optimize(
   DumpGraph("ReCopy", graph->get());
 }
 
+void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env,
+                              Device* device, std::unique_ptr<Graph>* graph,
+                              const Options& options) {
+  Optimize(runtime, env, device, graph, options.shape_map,
+           options.cse_consider_fn, options.cf_consider_fn);
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 789cc5694219e1386bde0fb1821dfdc9928523f1..05150608f02ab52fe135b003dddbcee7783e11a7 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -26,6 +26,28 @@ namespace tensorflow {
 
 class GraphOptimizer {
  public:
+  using NodePredicate = std::function<bool(const Node*)>;
+
+  struct Options {
+    // If not null it maps from nodes in graph to partially-known
+    // shapes of their outputs, and may be used, e.g., in the constant folding
+    // pass. The use of shape_map implies that the mapping from node name to the
+    // vector of partial shapes of its outputs is stable, i.e., no optimization
+    // pass may replace a node with a different node of the same name that has a
+    // different number of outputs, or outputs with different known shapes.
+    // TODO(b/65453533) introduce a unique way to name nodes in a graph.
+    std::unordered_map<string, std::vector<PartialTensorShape>>* shape_map =
+        nullptr;
+
+    // If not null then only nodes for which cse_consider_fn returns true will
+    // be considered for CSE.
+    NodePredicate cse_consider_fn = nullptr;
+
+    // If not null then only nodes for which cf_consider_fn returns true will be
+    // considered for CF.
+    NodePredicate cf_consider_fn = nullptr;
+  };
+
   GraphOptimizer(const OptimizerOptions& opts);
   ~GraphOptimizer();
 
@@ -34,26 +56,17 @@ class GraphOptimizer {
   // on which the 'graph' will execute. It's passed to the optimizers
   // so that they can respect constraints if any, that should be
   // respected.
-  //
-  // If shape_map is not null it maps from nodes in graph to partially-known
-  // shapes of their outputs, and may be used, e.g., in the constant folding
-  // pass. The use of shape_map implies that the mapping from node name to the
-  // vector of partial shapes of its outputs is stable, i.e., no optimization
-  // pass may replace a node with a different node of the same name that has a
-  // different number of outputs, or outputs with different known shapes.
-  // TODO(b/65453533) introduce a unique way to name nodes in a graph.
-  //
-  // If cse_consider_fn is not null then only nodes for which cse_consider_fn
-  // returns true will be considered for CSE.
-  // If cf_consider_fn is not null then only nodes for which cf_consider_fn
-  // returns true will be considered for CF.
+  void Optimize(FunctionLibraryRuntime* runtime, Env* env, Device* device,
+                std::unique_ptr<Graph>* graph,
+                const Options& graph_optimizer_options);
+  // DEPRECATED: Consider passing a GraphOptimizer::Options object instead.
   void Optimize(
       FunctionLibraryRuntime* runtime, Env* env, Device* device,
       std::unique_ptr<Graph>* graph,
       const std::unordered_map<string, std::vector<PartialTensorShape>>*
           shape_map,
-      const std::function<bool(const Node*)>& cse_consider_fn = nullptr,
-      const std::function<bool(const Node*)>& cf_consider_fn = nullptr);
+      const NodePredicate& cse_consider_fn = nullptr,
+      const NodePredicate& cf_consider_fn = nullptr);
 
   const OptimizerOptions& options() { return opts_; }
 
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
index ceb9baad30b214e5d3bec0cdbb470474d84e7227..76392b8e59e904d3bde7739f640ab92ff53aa96b 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
@@ -41,6 +41,11 @@ class HierarchicalTreeBroadcaster : public CollectiveImplementationInterface {
   // and device_locality.  Also saves the CollectiveContext in this object.
   Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
 
+  // No-op for hierarchical tree broadcaster.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
   // Begins async execution of the hierarchical tree broadcast.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 8b68c31a722474e4b73fa9e1d46ccafbc7b66ddd..7552838ca11d5c8e863bddca4398b98caeac2759 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -53,9 +53,10 @@ using NodeOut = NodeBuilder::NodeOut;
 class LowerWhileHelper {
  public:
   static Status Run(Node* while_op, const string& cond_fn_name,
-                    const string& body_fn_name, Graph* graph,
-                    const FunctionLibraryDefinition& flib) {
-    LowerWhileHelper helper(while_op, cond_fn_name, body_fn_name, graph, flib);
+                    const string& body_fn_name, int parallel_iterations,
+                    Graph* graph, const FunctionLibraryDefinition& flib) {
+    LowerWhileHelper helper(while_op, cond_fn_name, body_fn_name,
+                            parallel_iterations, graph, flib);
     return helper.RunInternal();
   }
 
@@ -64,8 +65,8 @@ class LowerWhileHelper {
   // and body functions named `cond_fn_name` and `body_fn_name` respectively in
   // the given graph.
   LowerWhileHelper(Node* while_op, const string& cond_fn_name,
-                   const string& body_fn_name, Graph* graph,
-                   const FunctionLibraryDefinition& flib);
+                   const string& body_fn_name, int parallel_iterations,
+                   Graph* graph, const FunctionLibraryDefinition& flib);
 
   Status RunInternal();
 
@@ -132,6 +133,8 @@ class LowerWhileHelper {
   const FunctionLibraryDefinition& flib_;
   // Name of the `while_op_`.
   string name_;
+  // Max number of parallel_iterations for the while loop.
+  const int parallel_iterations_;
 
   NodeDebugInfo debug_info_;
   NodeBuilder cond_call_builder_;
@@ -147,12 +150,14 @@ class LowerWhileHelper {
 };
 
 LowerWhileHelper::LowerWhileHelper(Node* while_op, const string& cond_fn_name,
-                                   const string& body_fn_name, Graph* graph,
+                                   const string& body_fn_name,
+                                   int parallel_iterations, Graph* graph,
                                    const FunctionLibraryDefinition& flib)
     : while_op_(while_op),
       graph_(graph),
       flib_(flib),
       name_(while_op->name()),
+      parallel_iterations_(parallel_iterations),
       debug_info_(*while_op_),
       cond_call_builder_(NewName("cond"), cond_fn_name, graph->op_registry(),
                          &debug_info_),
@@ -194,6 +199,7 @@ Status LowerWhileHelper::CreateEnterNodes() {
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(edge->src(), edge->src_output()))
                            .Attr("frame_name", name_)
+                           .Attr("parallel_iterations", parallel_iterations_)
                            .Finalize(graph_, &enter_node));
     enter_nodes_[edge->dst_input()] = enter_node;
   }
@@ -392,9 +398,15 @@ Status RewriteWhileNode(Node* n, Graph* g,
   if (body_attr == nullptr) {
     return errors::InvalidArgument("While body function missing");
   }
+  const AttrValue* parallel_iterations_attr =
+      n->attrs().Find("parallel_iterations");
+  if (parallel_iterations_attr == nullptr) {
+    return errors::InvalidArgument("parallel_iterations attr missing");
+  }
 
-  TF_RETURN_IF_ERROR(LowerWhileHelper::Run(n, cond_attr->func().name(),
-                                           body_attr->func().name(), g, flib));
+  TF_RETURN_IF_ERROR(LowerWhileHelper::Run(
+      n, cond_attr->func().name(), body_attr->func().name(),
+      parallel_iterations_attr->i(), g, flib));
   g->RemoveNode(n);
 
   return Status::OK();
diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc
index 24fd4ed5bb5939e066fa5b8d75b9b9c3aaf5895a..fcb10bc75dbe574efee9c4c28ab00dcb55c194d3 100644
--- a/tensorflow/core/common_runtime/lower_while_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_while_op_test.cc
@@ -66,6 +66,7 @@ TEST(LowerWhileOpTest, Simple) {
                    .Attr("T", {DT_INT32})
                    .Attr("cond", cond_func)
                    .Attr("body", body_func)
+                   .Attr("parallel_iterations", 100)
                    .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
                    .Finalize(root.graph(), &while_node));
   TF_ASSERT_OK(root.DoShapeInference(while_node));
@@ -97,6 +98,7 @@ TEST(LowerWhileOpTest, Simple) {
   for (const auto* op : graph->op_nodes()) {
     if (op->IsEnter()) {
       ++enter_count;
+      ASSERT_EQ(op->attrs().Find("parallel_iterations")->i(), 100);
     }
     if (op->IsExit()) {
       ++exit_count;
diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc
index f4c94ed7ec0cb1c5e8b341b75f1d075d30d6125a..a34a580c2ae13e81b8c4e468adfab1669add8287 100644
--- a/tensorflow/core/common_runtime/metrics.cc
+++ b/tensorflow/core/common_runtime/metrics.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/core/lib/monitoring/counter.h"
 
 namespace tensorflow {
-
+namespace metrics {
 namespace {
 
 auto* graph_runs = monitoring::Counter<0>::New(
@@ -28,8 +28,47 @@ auto* graph_runs = monitoring::Counter<0>::New(
 auto* graph_run_time_usecs = monitoring::Counter<0>::New(
     "/tensorflow/core/graph_run_time_usecs",
     "The total time spent on executing graphs in microseconds.");
+
+auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/autotune", "tf.data autotuning", "name");
+
+auto* tf_data_elements_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/elements", "tf.data elements", "name");
+
+auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/optimization", "tf.data optimization", "name");
+
+auto* build_graph_calls = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_build_calls",
+    "The number of times TensorFlow has created a new client graph. "
+    "A client graph is a sub-graph of the full graph, induced by a set of "
+    "options, including the requested feeds and fetches. It includes time "
+    "spent optimizing the graph with Grappler, and time spent pruning the "
+    "sub-graph.");
+
+auto* build_graph_time_usecs = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_build_time_usecs",
+    "The amount of time TensorFlow has spent creating new client graphs in "
+    "microseconds. "
+    "A client graph is a sub-graph of the full graph, induced by a set of "
+    "options, including the requested feeds and fetches. It includes time "
+    "spent optimizing the graph with Grappler, and time spent pruning the "
+    "sub-graph.");
+
 }  // namespace
 
+void RecordTFDataAutotune(const string& name) {
+  tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
+}
+
+void RecordTFDataElements(const string& name, int64 num_elements) {
+  tf_data_elements_counter->GetCell(name)->IncrementBy(num_elements);
+}
+
+void RecordTFDataOptimization(const string& name, int64 num_changes) {
+  tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
+}
+
 void UpdateGraphExecTime(const uint64 running_time_usecs) {
   if (running_time_usecs > 0) {
     graph_runs->GetCell()->IncrementBy(1);
@@ -37,4 +76,12 @@ void UpdateGraphExecTime(const uint64 running_time_usecs) {
   }
 }
 
+void UpdateGraphBuildTime(const uint64 running_time_usecs) {
+  if (running_time_usecs > 0) {
+    build_graph_calls->GetCell()->IncrementBy(1);
+    build_graph_time_usecs->GetCell()->IncrementBy(running_time_usecs);
+  }
+}
+
+}  // namespace metrics
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h
index d3430c9f030998f118c1626e6bbed93dd316a525..49dbddd911f73577480c4d5fada63cedc6bcbd59 100644
--- a/tensorflow/core/common_runtime/metrics.h
+++ b/tensorflow/core/common_runtime/metrics.h
@@ -19,9 +19,42 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace metrics {
+
+// Records that a tf.data dataset op executed by the program used autotuning.
+//
+// The `name` argument identifies the dataset (e.g. "ParallelMap").
+void RecordTFDataAutotune(const string& name);
+
+// Records the number of elements produced by a tf.data dataset.
+//
+// The `name` argument identifies the dataset (e.g. "Batch" or "Map").
+void RecordTFDataElements(const string& name, int64 num_elements);
+
+// Records the number of independent graph changes resulting from the applicaton
+// of a tf.data optimization.
+//
+// The `name` argument identifies the optimization (e.g. "noop_eliminiation").
+void RecordTFDataOptimization(const string& name, int64 num_changes);
 
 void UpdateGraphExecTime(const uint64 running_time_usecs);
 
+// Updates the metrics stored about time spent building graphs.
+//
+// By "GraphBuild", we refer to building a client graph, which is a sub-graph of
+// the full graph, induced by a set of options. In particular, these options
+// include the feeds and fetches requested.
+//
+// This includes time spent:
+//   * optimizing the graphs with Grappler
+//   * pruning the sub-graph (unless the place_pruned_graph option is set)
+//
+// When executing eagerly, this will not record any activity.
+//
+// TODO(jtkeeling): Should we record building/optimizing tf.functions?
+void UpdateGraphBuildTime(const uint64 running_time_usecs);
+
+}  // namespace metrics
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 429b19599b63740370ae49d7dbe9edcdf1e2c0ce..94f1b1f2044b1888062c3c5108c5be678e560fd8 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -39,6 +39,8 @@ typedef unsigned int uint;
 
 namespace tensorflow {
 
+static bool mkl_small_allocator_collect_stats = false;
+
 class MklSubAllocator : public BasicCPUAllocator {
  public:
   MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
@@ -62,15 +64,8 @@ class MklSmallSizeAllocator : public Allocator {
   inline string Name() override { return name_; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    void* ptr = sub_allocator_->Alloc(alignment, num_bytes);
-    if (ptr != nullptr) {
-      std::pair<void*, size_t> map_val(ptr, num_bytes);
-      mutex_lock l(mutex_);
-      // Check that insertion in the hash map was successful.
-      CHECK(map_.insert(map_val).second);
-      // Increment statistics for small-size allocations.
-      IncrementStats(num_bytes);
-    }
+    void* ptr = port::AlignedMalloc(num_bytes, alignment);
+    if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes);
     return ptr;
   }
 
@@ -80,23 +75,11 @@ class MklSmallSizeAllocator : public Allocator {
       return;
     }
 
-    mutex_lock l(mutex_);
-    auto map_iter = map_.find(ptr);
-    if (map_iter != map_.end()) {
-      // Call free visitors.
-      size_t dealloc_bytes = map_iter->second;
-      sub_allocator_->Free(ptr, dealloc_bytes);
-      DecrementStats(dealloc_bytes);
-      map_.erase(map_iter);
-    } else {
-      LOG(ERROR) << "tried to deallocate invalid pointer";
-      return;
+    if (mkl_small_allocator_collect_stats) {
+      const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
+      DecrementStats(alloc_size);
     }
-  }
-
-  inline bool IsSmallSizeAllocation(const void* ptr) const {
-    mutex_lock l(mutex_);
-    return map_.find(ptr) != map_.end();
+    port::AlignedFree(ptr);
   }
 
   void GetStats(AllocatorStats* stats) override {
@@ -111,8 +94,8 @@ class MklSmallSizeAllocator : public Allocator {
 
  private:
   // Increment statistics for the allocator handling small allocations.
-  inline void IncrementStats(size_t alloc_size)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+  inline void IncrementStats(size_t alloc_size) LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
     ++stats_.num_allocs;
     stats_.bytes_in_use += alloc_size;
     stats_.max_bytes_in_use =
@@ -122,8 +105,8 @@ class MklSmallSizeAllocator : public Allocator {
   }
 
   // Decrement statistics for the allocator handling small allocations.
-  inline void DecrementStats(size_t dealloc_size)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+  inline void DecrementStats(size_t dealloc_size) LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
     stats_.bytes_in_use -= dealloc_size;
   }
 
@@ -135,10 +118,6 @@ class MklSmallSizeAllocator : public Allocator {
   // Allocator name
   string name_;
 
-  // Hash map to keep track of "small" allocations
-  // We do not use BFC allocator for small allocations.
-  std::unordered_map<const void*, size_t> map_ GUARDED_BY(mutex_);
-
   // Allocator stats for small allocs
   AllocatorStats stats_ GUARDED_BY(mutex_);
 };
@@ -215,23 +194,52 @@ class MklCPUAllocator : public Allocator {
   }
 
   inline string Name() override { return kName; }
+  inline bool IsSmallSizeAllocation(const void* ptr) const
+      LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
+    return large_allocations_map_.find(ptr) == large_allocations_map_.end();
+  }
+  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
+  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    if (ptr != nullptr) {
+      std::pair<void*, size_t> map_val(ptr, num_bytes);
+      large_allocations_map_.insert(map_val);
+    }
+  }
+  inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    auto map_iter = large_allocations_map_.find(ptr);
+    if (map_iter != large_allocations_map_.end()) {
+      large_allocations_map_.erase(map_iter);
+    } else {
+      LOG(ERROR) << "tried to deallocate invalid pointer";
+    }
+    return;
+  }
 
   inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
     // If the allocation size is less than threshold, call small allocator,
     // otherwise call large-size allocator (BFC). We found that BFC allocator
     // does not deliver good performance for small allocations when
     // inter_op_parallelism_threads is high.
-    return (num_bytes < kSmallAllocationsThreshold)
-               ? small_size_allocator_->AllocateRaw(alignment, num_bytes)
-               : large_size_allocator_->AllocateRaw(alignment, num_bytes);
+    if (num_bytes < kSmallAllocationsThreshold) {
+      return small_size_allocator_->AllocateRaw(alignment, num_bytes);
+    } else {
+      mutex_lock l(mutex_);
+      void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes);
+      AddLargeAllocMap(ptr, num_bytes);
+      return ptr;
+    }
   }
 
   inline void DeallocateRaw(void* ptr) override {
     // Check if ptr is for "small" allocation. If it is, then call Free
     // directly. Otherwise, call BFC to handle free.
-    if (small_size_allocator_->IsSmallSizeAllocation(ptr)) {
+    if (IsSmallSizeAllocation(ptr)) {
       small_size_allocator_->DeallocateRaw(ptr);
     } else {
+      mutex_lock l(mutex_);
+      RemoveLargeAllocMap(ptr);
       large_size_allocator_->DeallocateRaw(ptr);
     }
   }
@@ -299,6 +307,12 @@ class MklCPUAllocator : public Allocator {
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
 
   SubAllocator* sub_allocator_;  // not owned by this class
+  mutable mutex mutex_;
+
+  // Hash map to keep track of "BFC" allocations
+  // We do not use BFC allocator for small allocations.
+  std::unordered_map<const void*, size_t> large_allocations_map_
+      GUARDED_BY(mutex_);
 
   // Size in bytes that defines the upper-bound for "small" allocations.
   // Any allocation below this threshold is "small" allocation.
diff --git a/tensorflow/core/common_runtime/optimization_registry.h b/tensorflow/core/common_runtime/optimization_registry.h
index 6fcd2afd2752007996d16358d5118211357fe6c6..0e31f389aa71a5734b1f11b95a056c0d07aabeb9 100644
--- a/tensorflow/core/common_runtime/optimization_registry.h
+++ b/tensorflow/core/common_runtime/optimization_registry.h
@@ -35,6 +35,7 @@ struct SessionOptions;
 // as a key into a state dictionary if it wants to keep state across
 // calls.
 struct GraphOptimizationPassOptions {
+  // Filled in by DirectSession for PRE_PLACEMENT optimizations. Can be empty.
   string session_handle;
   const SessionOptions* session_options = nullptr;
   const CostModel* cost_model = nullptr;
@@ -94,6 +95,10 @@ class OptimizationPassRegistry {
   void Register(Grouping grouping, int phase,
                 std::unique_ptr<GraphOptimizationPass> pass);
 
+  const std::map<Grouping, GraphOptimizationPasses>& groups() {
+    return groups_;
+  }
+
   // Run all passes in grouping, ordered by phase, with the same
   // options.
   Status RunGrouping(Grouping grouping,
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d51caaea8f1d12b472232718c973749e47146728
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+
+namespace tensorflow {
+
+Status PartitionFunctionGraph(
+    const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+    std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
+  PartitionOptions partition_options;
+  partition_options.node_to_loc = [](const Node* node) {
+    // TODO(iga): To support the distributed case, first split the graph by
+    // worker (e.g,. using the master session's `SplitByWorker` policy), and
+    // then recursively partition the per-worker shards at the remote worker(s).
+    // Currently, we simply split the graph at device boundaries.
+    return node->assigned_device_name();
+  };
+  int64 edge_name_counter = 0;
+  partition_options.new_name = [&edge_name_counter](const string& prefix) {
+    return strings::StrCat(prefix, "/_", ++edge_name_counter);
+  };
+  partition_options.get_incarnation =
+      [&device_set](const string& name) -> int64 {
+    const Device* d = device_set.FindDeviceByName(name);
+    if (d == nullptr) {
+      return PartitionOptions::kIllegalIncarnation;
+    } else {
+      return d->attributes().incarnation();
+    }
+  };
+  partition_options.control_flow_added = false;
+  std::unordered_map<string, GraphDef> partitions;
+  TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
+
+  for (const auto& partition : partitions) {
+    const string& device = partition.first;
+    const GraphDef& graph_def = partition.second;
+    // Each partition gets a copy of all the
+    // std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
+    std::unique_ptr<Graph> subgraph(
+        new Graph(graph->flib_def().ReachableDefinitions(graph_def)));
+    FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
+    TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
+    GraphConstructorOptions opts;
+    opts.allow_internal_ops = true;
+    opts.expect_device_spec = true;
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
+    subgraphs->emplace(device, std::move(subgraph));
+  }
+
+  return Status::OK();
+}
+
+Status UpdateArgAndRetvalMetadata(
+    Graph* subgraph, std::vector<int>* arg_indices,
+    std::vector<int>* ret_indices,
+    std::vector<AllocatorAttributes>* arg_alloc_attrs,
+    std::vector<AllocatorAttributes>* ret_alloc_attrs) {
+  std::vector<std::pair<Node*, int>> arg_nodes;
+  std::vector<std::pair<Node*, int>> ret_nodes;
+  const AttrValue* attr_value;
+
+  // Find the Arg and Retval nodes, along with their corresponding indices
+  // in the original function.
+  for (Node* node : subgraph->op_nodes()) {
+    string node_type = node->type_string();
+    if (node_type == FunctionLibraryDefinition::kArgOp) {
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int index = static_cast<int>(attr_value->i());
+      arg_indices->push_back(index);
+      arg_nodes.push_back(std::make_pair(node, index));
+    } else if (node_type == FunctionLibraryDefinition::kRetOp) {
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int index = static_cast<int>(attr_value->i());
+      ret_indices->push_back(index);
+      ret_nodes.push_back(std::make_pair(node, index));
+    }
+  }
+
+  for (int i = 0; i < arg_nodes.size(); ++i) {
+    Node* arg = arg_nodes[i].first;
+    arg->AddAttr("index", i);
+    TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
+    AllocatorAttributes alloc_attr;
+    DataType type = attr_value->type();
+    if (MTypeFromDType(type) == HOST_MEMORY) {
+      alloc_attr.set_on_host(true);
+    }
+    arg_alloc_attrs->push_back(alloc_attr);
+  }
+  for (int i = 0; i < ret_nodes.size(); ++i) {
+    Node* ret = ret_nodes[i].first;
+    ret->AddAttr("index", i);
+    TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
+    AllocatorAttributes alloc_attr;
+    DataType type = attr_value->type();
+    if (MTypeFromDType(type) == HOST_MEMORY) {
+      alloc_attr.set_on_host(true);
+    }
+    ret_alloc_attrs->push_back(alloc_attr);
+  }
+
+  return Status::OK();
+}
+
+std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
+                                      gtl::ArraySlice<Tensor> arguments) {
+  std::vector<Tensor> args;
+  args.reserve(indices.size());
+  for (int i : indices) {
+    args.push_back(arguments[i]);
+  }
+  return args;
+}
+
+string FunctionNameGenerator::GetName() {
+  for (;; ++counter_) {
+    const string candidate = strings::StrCat(name_, "_", counter_);
+    if (flib_def_->Find(candidate) == nullptr) {
+      return candidate;
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/partitioning_utils.h b/tensorflow/core/common_runtime/partitioning_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c282647e7027414b4f925d1d6d93fcc1624dc81a
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Given a `device_set` and a `graph`, partitions the `graph` into
+// `subgraphs`. `subgraphs` maps device names to the graph assigned to that
+// device. `graph` must have been placed (e.g. by running Placer),
+// i.e. all nodes must have an assigned_device set.
+// `graph` is non-const because the underlying Partition() function transforms
+// the graph to correctly partition distributed control flow.
+Status PartitionFunctionGraph(
+    const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+    std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs);
+
+// Each subgraph produced by partitioning the function body contains a subset
+// of the original `Arg` and `Retval` nodes. This function performs
+// bookkeeping to track which `Arg` and `Retval` nodes were placed on a
+// particular device / subgraph.
+//
+// More specifically, this function
+//  (1) rewrites the indices of the `Arg` and `Retval` nodes placed
+//      on a particular device.  When a function is parittioned each
+//      partition, `subgraph`, get a subset of the arguments and
+//      return values. The `index` attributes of these _Arg and _Retval
+//      nodes reflect the indices of these parameters in the original
+//      function. To convert `subgraph` to a function, we need to replace
+//      there original indices with 0, 1, 2, ... .
+//
+//      The argument and return value order in the partitioned function is
+//      determined by the node iteration order in `subgraph`. This order
+//      is also used in UpdateArgAndRetvalMetadata. This is fine because the
+//      node iteration order is deterministic - it follows the node ids.
+//  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
+//      device in `*_indices`, and
+//  (3) records which `Arg` and `Retval` nodes live in host memory in
+//      `*_alloc_attrs`.
+Status UpdateArgAndRetvalMetadata(
+    Graph* subgraph, std::vector<int>* arg_indices,
+    std::vector<int>* ret_indices,
+    std::vector<AllocatorAttributes>* arg_alloc_attrs,
+    std::vector<AllocatorAttributes>* ret_alloc_attrs);
+
+// Extracts tensors at `indices` from `arguments`.
+std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
+                                      gtl::ArraySlice<Tensor> arguments);
+
+// Utility for generating function names not present in `flib_def`, using
+// given `name` as the base for the name.
+class FunctionNameGenerator {
+ public:
+  // `flib_def` must outlive this.
+  FunctionNameGenerator(const FunctionLibraryDefinition* flib_def,
+                        const string& name)
+      : flib_def_(flib_def), name_(name), counter_(0) {}
+
+  // Returns a function name not present in `flib_def` using `name` as
+  // the base and appending a numeric suffix.
+  string GetName();
+
+ private:
+  const FunctionLibraryDefinition* flib_def_;
+  const string name_;
+  uint32 counter_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d4e36222ba7809dae73fb6eaaceda7fd497288a
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+class PartitioningUtilsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", 2});
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
+                                          &devices));
+    device0_ = devices[0].get();
+    device1_ = devices[1].get();
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
+
+    for (auto d : device_mgr_->ListDevices()) {
+      device_set_.AddDevice(d);
+    }
+  }
+
+  void SwapGraph(Graph* graph, bool assign_device = false) {
+    Scope s = Scope::NewRootScope();
+    if (assign_device) {
+      s = s.WithDevice(device0_->name());
+    }
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto id_x = ops::Identity(s.WithOpName("id_x"), x);
+    auto id_y = ops::Identity(s.WithOpName("id_y"), y);
+    auto dx_retval = ops::_Retval(s.WithOpName("retval1"), id_y, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("retval2"), id_x, 1);
+    TF_ASSERT_OK(s.ToGraph(graph));
+
+    if (assign_device) {
+      Placer placer(graph, &device_set_, nullptr, /* No session options */
+                    device0_);
+      TF_ASSERT_OK(placer.Run());
+    }
+  }
+
+  void TwoDeviceSwapGraph(Graph* graph) {
+    Scope s = Scope::NewRootScope();
+    Scope s1 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:0");
+    Scope s2 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:1");
+    auto x = ops::_Arg(s1.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s2.WithOpName("y"), DT_FLOAT, 1);
+    auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
+    auto id_y = ops::Identity(s2.WithOpName("id_y"), y);
+    auto dx_retval = ops::_Retval(s2.WithOpName("retval1"), id_y, 0);
+    auto dy_retval = ops::_Retval(s1.WithOpName("retval2"), id_x, 1);
+    TF_ASSERT_OK(s.ToGraph(graph));
+    Placer placer(graph, &device_set_, nullptr, /* No session options */
+                  device0_);
+    TF_ASSERT_OK(placer.Run());
+  }
+
+  // Fills subgraph with an identify function arg->identity->ret
+  // where each node has type `dtype` and arg/ret nodes have
+  // indices `arg_index` and `ret_index`.
+  void SubGraph(Graph* subgraph, DataType dtype, int arg_index, int ret_index) {
+    Scope s = Scope::NewRootScope();
+    Scope s1 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:0");
+    auto x = ops::_Arg(s1.WithOpName("x"), dtype, arg_index);
+    auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
+    auto dx_retval = ops::_Retval(s1.WithOpName("retval1"), id_x, ret_index);
+    TF_ASSERT_OK(s.ToGraph(subgraph));
+    Placer placer(subgraph, &device_set_, nullptr, /* No session options */
+                  device0_);
+    TF_ASSERT_OK(placer.Run());
+  }
+
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  DeviceSet device_set_;
+};
+
+TEST_F(PartitioningUtilsTest, GraphWithoutAssignedDevicesFails) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SwapGraph(graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << status.ToString();
+}
+
+TEST_F(PartitioningUtilsTest, OneDevice) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SwapGraph(graph.get(), true);
+  int num_nodes = graph->num_op_nodes();
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  ASSERT_EQ(1, subgraphs.size());
+  const auto& pair = *subgraphs.begin();
+  ASSERT_EQ("/job:a/replica:0/task:0/device:CPU:0", pair.first);
+  ASSERT_EQ(num_nodes, pair.second->num_op_nodes());
+}
+
+TEST_F(PartitioningUtilsTest, TwoDevices) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  TwoDeviceSwapGraph(graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  ASSERT_EQ(2, subgraphs.size());
+
+  const auto& part1 = subgraphs["/job:a/replica:0/task:0/device:CPU:0"];
+  ASSERT_EQ(3, part1->num_op_nodes());
+  const auto& part2 = subgraphs["/job:a/replica:0/task:0/device:CPU:1"];
+  ASSERT_EQ(3, part2->num_op_nodes());
+}
+
+void CheckIndices(const std::vector<int>& expected,
+                  const std::vector<int>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i]) << " at index " << i;
+  }
+}
+
+void CheckAlloc(const std::vector<bool>& expected,
+                const std::vector<AllocatorAttributes>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i].on_host()) << " at index " << i;
+  }
+}
+
+void CheckIndex(const Node& node, int expected_index) {
+  const AttrValue* attr_value;
+  TF_ASSERT_OK(node.attrs().Find("index", &attr_value));
+  int index = static_cast<int>(attr_value->i());
+  ASSERT_EQ(expected_index, index);
+}
+
+TEST_F(PartitioningUtilsTest, UpdateArgsAndRets) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SubGraph(graph.get(), DT_FLOAT, 3, 5);
+
+  std::vector<int> arg_indices;
+  std::vector<int> ret_indices;
+  std::vector<AllocatorAttributes> arg_alloc_attrs;
+  std::vector<AllocatorAttributes> ret_alloc_attrs;
+
+  Status status =
+      UpdateArgAndRetvalMetadata(graph.get(), &arg_indices, &ret_indices,
+                                 &arg_alloc_attrs, &ret_alloc_attrs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  CheckIndices({3}, arg_indices);
+  CheckIndices({5}, ret_indices);
+  CheckAlloc({false}, arg_alloc_attrs);
+  CheckAlloc({false}, ret_alloc_attrs);
+
+  std::unordered_map<string, Node*> nodes = graph->BuildNodeNameIndex();
+  ASSERT_EQ(1, nodes.count("x"));
+  CheckIndex(*nodes["x"], 0);
+  ASSERT_EQ(1, nodes.count("retval1"));
+  CheckIndex(*nodes["retval1"], 0);
+}
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 515c1971d9d5cb179b7b9764ff3462579e742dfc..54f03ab85857710ad1801596675945b7239d5c3e 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -95,6 +95,26 @@ std::vector<Device*> FilterSupportedDevices(
   return filtered_devices;
 }
 
+// Returns true if the node has no inputs and produces outputs
+// that are consumed by a single node.
+//
+// TODO(vrv): Currently this handles only nodes with one output, but
+// this could be extended to handle the case where a node has many
+// outputs that are connected to nodes in the same colocation group.
+bool IsGeneratorNode(const Node* node) {
+  return node->num_inputs() == 0 && node->num_outputs() == 1 &&
+         !IsRefType(node->output_type(0));
+}
+
+bool IsExemptFromResourceInputColocation(const Node* node) {
+  // Note: Partitioned function calls, which place and partition their
+  // function bodies, are exempt from this check: they forward resource and
+  // ref inputs to operations that are appropriately placed, instead of
+  // dereferencing them.
+  const string& op_type = node->op_def().name();
+  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
+}
+
 // This class maintains the connected components of a colocation
 // constraint graph, and uses this information to assign a satisfying
 // device placement to the nodes of the graph.
@@ -123,15 +143,21 @@ std::vector<Device*> FilterSupportedDevices(
 // This implementation uses the Union-Find algorithm to efficiently maintain the
 // connected components and incrementally adds edges via
 // ColocationGraph::ColocateNodes() invocations.
+//
+// ColocationGraph does not assign any devices to graph nodes. The
+// `log_device_placement` argument is used to log messages when requested
+// device is ignored.
 class ColocationGraph {
  public:
-  ColocationGraph(Graph* graph, const DeviceSet* device_set,
-                  bool allow_soft_placement, const Device* default_device)
+  ColocationGraph(const Graph* graph, const DeviceSet* device_set,
+                  const Device* default_device, bool allow_soft_placement,
+                  bool log_device_placement)
       : graph_(graph),
         device_set_(device_set),
         device_types_(device_set->PrioritizedDeviceTypeList()),
+        default_device_(default_device),
         allow_soft_placement_(allow_soft_placement),
-        default_device_(default_device) {
+        log_device_placement_(log_device_placement) {
     members_.resize(graph->num_node_ids());
   }
 
@@ -159,7 +185,7 @@ class ColocationGraph {
     std::unordered_map<StringPiece, const Node*, StringPieceHasher>
         colocation_group_root;
 
-    for (Node* node : graph_->op_nodes()) {
+    for (const Node* node : graph_->op_nodes()) {
       // When adding the node, identify whether it is part of a colocation
       // group.
 
@@ -194,10 +220,84 @@ class ColocationGraph {
     return Status::OK();
   }
 
+  Status ColocateResourceOrRefEdge(Node* src, Node* dst) {
+    // Colocate `src` and `dst` to maintain the invariant that nodes
+    // connected by reference edges are colocated.
+    int src_root_id = FindRoot(src->id());
+    int dst_root_id = FindRoot(dst->id());
+    auto& src_root = members_[src_root_id];
+    auto& dst_root = members_[dst_root_id];
+    // If both the source node and this node have partially
+    // specified a device, then 'dst's device should be
+    // cleared: the reference edge forces 'node' to be on the
+    // same device as the source node.
+    const auto& source_parsed_name = src_root.device_name;
+    const auto& dest_parsed_name = dst_root.device_name;
+    if (DeviceNameUtils::HasSomeDetails(source_parsed_name) &&
+        DeviceNameUtils::HasSomeDetails(dest_parsed_name)) {
+      // Ignore a specified device for 'dst' if the two names were
+      // incompatible.
+      if (!DeviceNameUtils::AreCompatibleDevNames(source_parsed_name,
+                                                  dest_parsed_name)) {
+        TF_RETURN_IF_ERROR(VerifyResourceAndRefInputsCanBeColocated(
+            dst, src, source_parsed_name));
+        if (log_device_placement_) {
+          LOG(INFO) << "Ignoring device specification "
+                    << DeviceNameUtils::ParsedNameToString(dest_parsed_name)
+                    << " for node '" << dst->name()
+                    << "' because the input edge from '" << src->name()
+                    << "' is a reference connection and already has a device "
+                       "field set to "
+                    << DeviceNameUtils::ParsedNameToString(source_parsed_name);
+        }
+
+        // Make 'dst' colocated with the source
+        dst_root.device_name = source_parsed_name;
+      }
+    }
+    Status status = ColocateNodes(*src, src_root_id, *dst, dst_root_id);
+    if (!status.ok()) {
+      return AttachDef(
+          errors::InvalidArgument("Nodes were connected by a "
+                                  "reference connection (requiring them to "
+                                  "be on the same device), but the two nodes "
+                                  "were assigned two different devices: ",
+                                  status.error_message()),
+          *dst);
+    }
+    return Status::OK();
+  }
+
+  Status ColocateResourceAndRefEdges() {
+    // Enumerate the constraint edges, and use them to update the disjoint
+    // node set.
+    // If `node` has an input edge with reference type, add an edge from the
+    // source of that edge to `node`.
+    for (const Edge* edge : graph_->edges()) {
+      if (edge->IsControlEdge()) {
+        continue;
+      }
+      Node* src = edge->src();
+      Node* dst = edge->dst();
+      DataType input_type = dst->input_type(edge->dst_input());
+      if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
+          !IsExemptFromResourceInputColocation(dst)) {
+        TF_RETURN_IF_ERROR(ColocateResourceOrRefEdge(src, dst));
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Initialize() {
+    TF_RETURN_IF_ERROR(InitializeMembers());
+    TF_RETURN_IF_ERROR(ColocateAllNodes());
+    return ColocateResourceAndRefEdges();
+  }
+
   Status ColocateNodeToGroup(
       std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
           colocation_group_root,
-      Node* node, StringPiece colocation_group) {
+      const Node* node, StringPiece colocation_group) {
     const Node*& root_node = (*colocation_group_root)[colocation_group];
     if (root_node == nullptr) {
       // This is the first node of the colocation group, so
@@ -783,34 +883,14 @@ class ColocationGraph {
     return Status::OK();
   }
 
-  Graph* const graph_;  // Not owned.
+  const Graph* const graph_;  // Not owned.
   std::vector<Member> members_;
   const DeviceSet* device_set_;  // Not owned.
   const std::vector<DeviceType> device_types_;
-  const bool allow_soft_placement_;
   const Device* default_device_;
+  const bool allow_soft_placement_;
+  const bool log_device_placement_;
 };
-
-// Returns true if the node has no inputs and produces outputs
-// that are consumed by a single node.
-//
-// TODO(vrv): Currently this handles only nodes with one output, but
-// this could be extended to handle the case where a node has many
-// outputs that are connected to nodes in the same colocation group.
-bool IsGeneratorNode(const Node* node) {
-  return node->num_inputs() == 0 && node->num_outputs() == 1 &&
-         !IsRefType(node->output_type(0));
-}
-
-bool IsExemptFromResourceInputColocation(const Node* node) {
-  // Note: Partitioned function calls, which place and partition their
-  // function bodies, are exempt from this check: they forward resource and
-  // ref inputs to operations that are appropriately placed, instead of
-  // dereferencing them.
-  const string& op_type = node->op_def().name();
-  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
-}
-
 }  // namespace
 
 Placer::Placer(Graph* graph, const DeviceSet* devices,
@@ -833,94 +913,13 @@ Status Placer::Run() {
   }
 
   ColocationGraph colocation_graph(
-      graph_, devices_,
+      graph_, devices_, default_device_,
       options_ == nullptr || options_->config.allow_soft_placement(),
-      default_device_);
-
-  TF_RETURN_IF_ERROR(colocation_graph.InitializeMembers());
-
-  // 1. First add all of the nodes. Note that steps (1) and (2)
-  // requires two passes over the nodes because the graph (and hence
-  // the constraints) may not be acyclic.
-  TF_RETURN_IF_ERROR(colocation_graph.ColocateAllNodes());
-
-  // 2. Enumerate the constraint edges, and use them to update the disjoint
-  // node set.
-
-  // If `node` has an input edge with reference type, add an edge from the
-  // source of that edge to `node`.
-  for (const Edge* edge : graph_->edges()) {
-    if (edge->IsControlEdge()) {
-      continue;
-    }
-    Node* src = edge->src();
-    Node* dst = edge->dst();
-    DataType input_type = dst->input_type(edge->dst_input());
-    if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
-        !IsExemptFromResourceInputColocation(dst)) {
-      // Colocate `src` and `dst` to maintain the invariant that nodes connected
-      // by reference edges are colocated.
-      int src_root_id = colocation_graph.FindRoot(src->id());
-      int dst_root_id = colocation_graph.FindRoot(dst->id());
-      auto& src_root = colocation_graph.members_[src_root_id];
-      auto& dst_root = colocation_graph.members_[dst_root_id];
-      // If both the source node and this node have partially
-      // specified a device, then 'node's device should be
-      // cleared: the reference edge forces 'node' to be on the
-      // same device as the source node.
-      const auto& source_parsed_name = src_root.device_name;
-      const auto& dest_parsed_name = dst_root.device_name;
-      if (DeviceNameUtils::HasSomeDetails(source_parsed_name) &&
-          DeviceNameUtils::HasSomeDetails(dest_parsed_name)) {
-        // Ignore a specified device for 'dst' if the two names were
-        // incompatible.
-        if (!DeviceNameUtils::AreCompatibleDevNames(source_parsed_name,
-                                                    dest_parsed_name)) {
-          TF_RETURN_IF_ERROR(
-              colocation_graph.VerifyResourceAndRefInputsCanBeColocated(
-                  dst, src, source_parsed_name));
-          if (log_device_placement_) {
-            LOG(INFO) << "Ignoring device specification "
-                      << DeviceNameUtils::ParsedNameToString(dest_parsed_name)
-                      << " for node '" << dst->name()
-                      << "' because the input edge from '" << src->name()
-                      << "' is a reference connection and already has a device "
-                         "field set to "
-                      << DeviceNameUtils::ParsedNameToString(
-                             source_parsed_name);
-          }
-
-          // Make 'dst' colocated with the source
-          dst_root.device_name = source_parsed_name;
-        } else {
-          bool source_subset_of_dest = DeviceNameUtils::IsSpecification(
-              source_parsed_name, dest_parsed_name);
-          bool dest_subset_of_source = DeviceNameUtils::IsSpecification(
-              dest_parsed_name, source_parsed_name);
-
-          if (source_subset_of_dest && !dest_subset_of_source) {
-            src_root.device_name = dest_parsed_name;
-          } else {
-            dst_root.device_name = source_parsed_name;
-          }
-        }
-      }
+      log_device_placement_);
 
-      Status status =
-          colocation_graph.ColocateNodes(*src, src_root_id, *dst, dst_root_id);
-      if (!status.ok()) {
-        return AttachDef(
-            errors::InvalidArgument("Nodes were connected by a "
-                                    "reference connection (requiring them to "
-                                    "be on the same device), but the two nodes "
-                                    "were assigned two different devices: ",
-                                    status.error_message()),
-            *dst);
-      }
-    }
-  }
+  TF_RETURN_IF_ERROR(colocation_graph.Initialize());
 
-  // 3. For each node, assign a device based on the constraints in the
+  // For each node, assign a device based on the constraints in the
   // disjoint node set.
   std::vector<Node*> second_pass;
   for (Node* node : graph_->op_nodes()) {
@@ -987,7 +986,7 @@ Status Placer::Run() {
     AssignAndLog(assigned_device, node);
   }
 
-  // 4. Perform a second pass assignment for those nodes explicitly
+  // Perform a second pass assignment for those nodes explicitly
   // skipped during the first pass.
   for (Node* node : second_pass) {
     std::vector<Device*>* devices;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index c43a9d7dc211dd82a1b5771ad22888a2ba275a48..950a93671c7773c83de8c22add3ff23f884a4b1a 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -16,11 +16,27 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/rendezvous_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
 
 namespace tensorflow {
 
@@ -52,13 +68,13 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, this);
   }
 }
@@ -77,13 +93,13 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, std::move(custom_kernel_creator), this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, custom_kernel_creator, this);
   }
 }
@@ -126,7 +142,7 @@ void ProcessFunctionLibraryRuntime::ReceiveTensorsAsync(
 }
 
 Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation(
-    const string& device_name, int64* incarnation) {
+    const string& device_name, int64* incarnation) const {
   FunctionLibraryRuntime* flr = GetFLR(device_name);
   if (flr == nullptr) {
     return errors::InvalidArgument("Device name: ", device_name, " not found");
@@ -136,7 +152,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation(
 }
 
 Status ProcessFunctionLibraryRuntime::GetDeviceContext(
-    const string& device_name, DeviceContext** device_context) {
+    const string& device_name, DeviceContext** device_context) const {
   *device_context = nullptr;
   FunctionLibraryRuntime* flr = GetFLR(device_name);
   if (flr == nullptr) {
@@ -181,9 +197,26 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
     const string& function_key, const string& device_name,
     FunctionLibraryRuntime::LocalHandle local_handle) {
   mutex_lock l(mu_);
+  return AddHandleLocked(function_key, device_name, local_handle);
+}
+
+FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandleLocked(
+    const string& function_key, const string& device_name,
+    FunctionLibraryRuntime::LocalHandle local_handle) {
+  auto h = next_handle_;
+  function_data_[h] =
+      MakeUnique<FunctionData>(device_name, local_handle, function_key);
+  table_[function_key] = h;
+  next_handle_++;
+  return h;
+}
+
+FunctionLibraryRuntime::Handle
+ProcessFunctionLibraryRuntime::AddMultiDeviceHandle(
+    std::unique_ptr<MultiDeviceFunctionData> data, const string& function_key) {
+  mutex_lock l(mu_);
   auto h = next_handle_;
-  function_data_[h] = MakeUnique<FunctionData>(
-      device_name, local_handle, function_key);
+  mdevice_data_[h] = std::move(data);
   table_[function_key] = h;
   next_handle_++;
   return h;
@@ -196,14 +229,20 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
-    const string& device_name, FunctionLibraryRuntime::Handle handle) {
+    const string& device_name, FunctionLibraryRuntime::Handle handle) const {
   return GetHandleOnDevice(device_name, handle) != kInvalidHandle;
 }
 
 FunctionLibraryRuntime::LocalHandle
 ProcessFunctionLibraryRuntime::GetHandleOnDevice(
-    const string& device_name, FunctionLibraryRuntime::Handle handle) {
+    const string& device_name, FunctionLibraryRuntime::Handle handle) const {
   tf_shared_lock l(mu_);
+
+  auto miter = mdevice_data_.find(handle);
+  if (miter != mdevice_data_.end()) {
+    return kInvalidLocalHandle;
+  }
+
   auto iter = function_data_.find(handle);
   if (iter == function_data_.end()) {
     return kInvalidLocalHandle;
@@ -216,7 +255,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
 }
 
 string ProcessFunctionLibraryRuntime::GetDeviceName(
-    FunctionLibraryRuntime::Handle handle) {
+    FunctionLibraryRuntime::Handle handle) const {
   tf_shared_lock l(mu_);
   auto iter = function_data_.find(handle);
   CHECK(iter != function_data_.end());
@@ -224,10 +263,498 @@ string ProcessFunctionLibraryRuntime::GetDeviceName(
   return function_data->target_device();
 }
 
+ProcessFunctionLibraryRuntime::MultiDeviceFunctionData*
+ProcessFunctionLibraryRuntime::IsMultiDevice(
+    FunctionLibraryRuntime::Handle handle) const {
+  tf_shared_lock l(mu_);
+  const auto& it = mdevice_data_.find(handle);
+  if (it != mdevice_data_.end()) {
+    return it->second.get();
+  }
+  return nullptr;
+}
+
+namespace {
+// Sets `group` to the first colocation group specified in `node`. If no
+// group is specified, does not touch `group`.
+void GetColocationGroup(const Node* node, string* group) {
+  // We hoist the conversion from C-style string literal to string here,
+  // so that we can avoid the many repeated calls to strlen().
+  static const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+  const AttrValue* attr_value =
+      node->attrs().Find(kColocationAttrNameStringPiece);
+  if (attr_value != nullptr && attr_value->has_list() &&
+      attr_value->list().s_size() > 0) {
+    *group = attr_value->list().s(0);
+  }
+}
+
+}  // anonymous namespace
+
+Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
+    const std::vector<string>& input_devices,
+    const std::vector<string>& output_devices, const DeviceSet& device_set,
+    Graph* graph) const {
+  // If output_devices are not specified, we want to set the output device
+  // based on the device of the output producing node. The output producing
+  // node can be an arg node because functions can simply return their
+  // arguments. To make sure that the output producing nodes have assigned
+  // devices, we assign them to arguments first.
+  for (Node* node : graph->op_nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kArgOp) {
+      const AttrValue* attr_value;
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int64 index = attr_value->i();
+      node->set_assigned_device_name(input_devices[index]);
+    }
+  }
+
+  for (Node* node : graph->op_nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kRetOp) {
+      if (output_devices.empty()) {
+        // If output_devices are empty, the node producing retval
+        // must have explicitly assigned device or a colocation constraint
+        // to a node with explicitly assigned device.
+        for (const auto& it : node->in_edges()) {
+          if (!it->IsControlEdge()) {
+            Node* src_node = it->src();
+            const string* src_device = &src_node->requested_device();
+            string colocation_group = "";
+            GetColocationGroup(src_node, &colocation_group);
+            while (src_device->empty() && colocation_group.empty() &&
+                   src_node->IsIdentity()) {
+              src_node = *src_node->in_nodes().begin();
+              src_device = &src_node->requested_device();
+              if (src_device->empty()) {
+                // Some node (e.g. _Args) can have no requested_device,
+                // but have assigned_device.
+                src_device = &src_node->assigned_device_name();
+              }
+
+              GetColocationGroup(src_node, &colocation_group);
+            }
+
+            if (!colocation_group.empty()) {
+              AttrValue::ListValue colo_attr;
+              colo_attr.add_s(colocation_group);
+              std::vector<string> colo_slice = {colocation_group};
+              node->AddAttr(kColocationAttrName, colo_slice);
+            } else if (!src_device->empty()) {
+              // src_device can be a partially specified device. Find the
+              // matching device in the device_set.
+              DeviceNameUtils::ParsedName parsed;
+              if (!DeviceNameUtils::ParseFullName(*src_device, &parsed)) {
+                return errors::InvalidArgument(
+                    "Failed to parse explicit device specification ",
+                    *src_device);
+              }
+              std::vector<Device*> matching_devices;
+              device_set.FindMatchingDevices(parsed, &matching_devices);
+              if (matching_devices.empty()) {
+                return errors::InvalidArgument(
+                    "Unable to find any devices for spec ", *src_device);
+              } else if (matching_devices.size() != 1) {
+                // Convert a vector of devices to a string.
+                // Using absl::StrJoin did not work in Android builds.
+                string devices = "[";
+                for (Device* device : matching_devices) {
+                  devices.append(device->name());
+                  devices.append(", ");
+                }
+                if (devices.size() > 2) {
+                  devices.resize(devices.size() - 2);
+                }
+                devices.append("]");
+
+                return errors::InvalidArgument(
+                    "When FunctionLibraryRuntime::Options.output_devices are "
+                    "not specified for a multi-device function, the device "
+                    "specification on the output node must match exactly one "
+                    "device. Matched devices are ",
+                    devices);
+              }
+              node->set_assigned_device_name(matching_devices[0]->name());
+            }
+          }
+        }
+      } else {
+        const AttrValue* attr_value;
+        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+        int64 index = attr_value->i();
+        // output_devices size is checked in InstantiateMultiDevice
+        DCHECK_GT(output_devices.size(), index);
+        node->set_assigned_device_name(output_devices[index]);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+
+Status ValidateNoListArguments(
+    const protobuf::RepeatedPtrField<OpDef::ArgDef>& args, const char* arg_type,
+    const string& function_name) {
+  for (const OpDef::ArgDef& arg : args) {
+    if (!arg.number_attr().empty() || !arg.type_list_attr().empty()) {
+      return errors::InvalidArgument(
+          "Function ", function_name, " has an ", arg_type, " named \"",
+          arg.name(),
+          "\" that is a list of tensors."
+          " Multi-device functions support only single-tensor inputs "
+          " and outputs");
+    }
+  }
+  return Status::OK();
+}
+
+Status ValidateMultiDeviceOptions(
+    const FunctionDef& fdef,
+    const FunctionLibraryRuntime::InstantiateOptions& options) {
+  const OpDef& signature = fdef.signature();
+  // Multi-device functions don't currently support list inputs or outputs
+  TF_RETURN_IF_ERROR(ValidateNoListArguments(signature.input_arg(), "input",
+                                             signature.name()));
+  TF_RETURN_IF_ERROR(ValidateNoListArguments(signature.output_arg(), "output",
+                                             signature.name()));
+
+  if (fdef.attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 &&
+      fdef.attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b()) {
+    return errors::Unimplemented(
+        "Function '", signature.name(), "' has `",
+        FunctionLibraryDefinition::kIntsOnDeviceAttr,
+        "` attribute set. This attribute is not currently supported by "
+        "multi-device functions.");
+  }
+
+  if (options.input_devices.size() != signature.input_arg_size()) {
+    return errors::InvalidArgument(
+        "InstantiateOptions.input_devices must have the same length "
+        "as the number of arguments: input_devices length = ",
+        options.input_devices.size(),
+        " number of arguments = ", signature.input_arg_size());
+  }
+  if (!options.output_devices.empty() &&
+      options.output_devices.size() != signature.output_arg_size()) {
+    return errors::InvalidArgument(
+        "InstantiateOptions.output_devices must either be empty or have "
+        "the same length as the number of arguments: output_devices length "
+        "= ",
+        options.output_devices.size(),
+        " number of arguments = ", signature.output_arg_size());
+  }
+
+  if (!options.state_handle.empty()) {
+    return errors::Unimplemented(
+        "InstantiateOptions.state_handle is not supported for multi-device "
+        "functions. Function: ",
+        signature.name());
+  }
+  if (options.create_kernels_eagerly) {
+    return errors::Unimplemented(
+        "InstantiateOptions.create_kernels_eagerly is not supported for "
+        "multi-device functions. Function: ",
+        signature.name());
+  }
+
+  return Status::OK();
+}
+
+Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
+                       const FunctionDef* fdef,
+                       const FunctionLibraryDefinition* lib_def,
+                       std::unique_ptr<Graph>* graph,
+                       std::vector<string>* ret_node_names) {
+  auto get_func_sig = [lib_def](const string& op, const OpDef** sig) {
+    return lib_def->LookUpOpDef(op, sig);
+  };
+  FunctionBody* tmp_fbody;
+  // TODO(iga): FunctionDefToBodyHelper copies fdef. Avoid this copy.
+  TF_RETURN_IF_ERROR(
+      FunctionDefToBodyHelper(*fdef, attrs, lib_def, get_func_sig, &tmp_fbody));
+  if (tmp_fbody == nullptr) {
+    LOG(ERROR) << "Failed to get FunctionBody for \"" << function_name << "\"";
+    return errors::Internal("Failed to construct FunctionBody for ",
+                            function_name);
+  }
+  std::unique_ptr<FunctionBody> fbody(tmp_fbody);
+  *graph = std::unique_ptr<Graph>(fbody->graph);
+  fbody->graph = nullptr;
+  ret_node_names->reserve(fbody->ret_nodes.size());
+  for (const Node* node : fbody->ret_nodes) {
+    ret_node_names->push_back(node->name());
+  }
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
+    const string& function_name, AttrSlice attrs,
+    const FunctionLibraryRuntime::InstantiateOptions& options,
+    FunctionLibraryRuntime::Handle* handle) {
+  // Check if this function has already been instantiated.
+  const string& function_key = Canonicalize(function_name, attrs, options);
+
+  {
+    mutex_lock l(mu_);
+    const auto& it = table_.find(function_key);
+    if (it != table_.end()) {
+      *handle = it->second;
+      ++mdevice_data_[*handle]->instantiation_counter_;
+      return Status::OK();
+    }
+  }
+
+  VLOG(1) << "Instantiating MultiDevice function \"" << function_name
+          << "\" on default device " << options.target;
+
+  const FunctionLibraryDefinition* lib_def =
+      options.overlay_lib == nullptr ? lib_def_ : options.overlay_lib;
+
+  const FunctionDef* fdef = lib_def->Find(function_name);
+  if (fdef == nullptr) {
+    return errors::InvalidArgument("Failed to find function \"", function_name,
+                                   "\" in function library: ", lib_def);
+  }
+
+  TF_RETURN_IF_ERROR(ValidateMultiDeviceOptions(*fdef, options));
+
+  std::unique_ptr<Graph> graph;
+  std::vector<string> ret_node_names;
+
+  TF_RETURN_IF_ERROR(GetGraphAndRets(function_name, attrs, fdef, lib_def,
+                                     &graph, &ret_node_names));
+
+  DeviceSet device_set;
+  for (auto d : device_mgr_->ListDevices()) {
+    device_set.AddDevice(d);
+  }
+
+  TF_RETURN_IF_ERROR(PinArgsAndRets(
+      options.input_devices, options.output_devices, device_set, graph.get()));
+
+  // Make the FunctionLibraryRuntime's device the default device if
+  // nothing else is hard coded. This allows the same function definition
+  // to be specialized to different devices depending on the
+  // PartitionedCallOp's device.
+  FunctionLibraryRuntime* flr = GetFLR(options.target);
+  if (flr == nullptr) {
+    return errors::InvalidArgument(
+        "Cannot instantiate multi-device function with target device ",
+        options.target);
+  }
+
+  std::unique_ptr<MultiDeviceFunctionData> data =
+      MakeUnique<MultiDeviceFunctionData>(function_name, function_key,
+                                          ret_node_names.size(),
+                                          lib_def->ReachableDefinitions(*fdef));
+
+  GraphOptimizationPassOptions optimization_options;
+  // TODO(iga): Thread other relevant options from SessionOptions.
+  SessionOptions session_options;
+  session_options.env = flr->env();
+  optimization_options.session_options = &session_options;
+  optimization_options.graph = &graph;
+  optimization_options.flib_def = &data->overlay_lib_;
+  optimization_options.device_set = &device_set;
+
+  DumpGraph("Before running PRE_PLACEMENT passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
+
+  DumpGraph("Before calling Placer", graph.get());
+  Placer placer(graph.get(), &device_set, nullptr, /* No session options */
+                flr->device() /* Default device */);
+  TF_RETURN_IF_ERROR(placer.Run());
+
+  DumpGraph("Before running POST_PLACEMENT passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_PLACEMENT, optimization_options));
+
+  Device* cpu_device;
+  TF_RETURN_IF_ERROR(device_mgr_->LookupDevice("CPU:0", &cpu_device));
+
+  if (options.optimize_graph_fn) {
+    DumpGraph("Before running graph optimization fn", graph.get());
+    Status status = options.optimize_graph_fn(std::move(ret_node_names),
+                                              &data->overlay_lib_, device_set,
+                                              cpu_device, &graph);
+    if (!status.ok()) {
+      LOG(WARNING) << "Ignoring multi-device function optimization failure: "
+                   << status.ToString();
+    }
+    DumpGraph("After optimization", graph.get());
+  }
+
+  DumpGraph("Before running POST_REWRITE_FOR_EXEC passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, optimization_options));
+  DumpGraph("After all optimization passes", graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  TF_RETURN_IF_ERROR(
+      PartitionFunctionGraph(device_set, std::move(graph), &subgraphs));
+
+  if (options.graph_collector != nullptr) {
+    for (const auto& pair : subgraphs) {
+      GraphDef def;
+      pair.second->ToGraphDef(&def);
+      options.graph_collector->CollectGraph(def);
+    }
+  }
+
+  int i = 0;
+  FunctionNameGenerator name_generator(&data->overlay_lib_, function_name);
+  for (const auto& pair : subgraphs) {
+    i += 1;
+    // TODO(iga): Fail gracefully if the set of devices corresponds
+    // to more than one address space.
+    const string& target = pair.first;
+    Graph* subgraph = pair.second.get();
+
+    ComponentFunctionData* comp_data = &data->glue_[target];
+    TF_RETURN_IF_ERROR(UpdateArgAndRetvalMetadata(
+        subgraph, &comp_data->arg_indices_, &comp_data->ret_indices_,
+        &comp_data->arg_alloc_attrs_, &comp_data->ret_alloc_attrs_));
+    FunctionDef shard;
+    string unique_name = name_generator.GetName();
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*subgraph, unique_name, &shard));
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    TF_RETURN_IF_ERROR(data->overlay_lib_.AddFunctionDef(shard));
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.executor_type = options.executor_type;
+    opts.target = target;
+    opts.overlay_lib = &data->overlay_lib_;
+    FunctionLibraryRuntime::Handle component_handle;
+
+    TF_RETURN_IF_ERROR(target_flr->Instantiate(
+        unique_name, AttrSlice(&shard.attr()), opts, &component_handle));
+    VLOG(1) << "Instantiated component function " << unique_name
+            << " on device " << target << " with component handle "
+            << component_handle;
+    VLOG(2) << DebugString(shard);
+    comp_data->handle_ = component_handle;
+  }
+
+  *handle = AddMultiDeviceHandle(std::move(data), function_key);
+  VLOG(2) << "Instantiated MultiDevice function \"" << function_name
+          << "\" with handle " << *handle;
+  return Status::OK();
+}
+
+Status ProcessFunctionLibraryRuntime::GetOutputDevices(
+    FunctionLibraryRuntime::Handle handle,
+    std::vector<Device*>* output_devices) const {
+  const MultiDeviceFunctionData* data = IsMultiDevice(handle);
+  if (data == nullptr) {
+    return errors::InvalidArgument(
+        "Failed for find multi-device function handle ", handle);
+  }
+
+  for (const auto& pair : data->glue_) {
+    const ComponentFunctionData& comp_data = pair.second;
+    DCHECK(comp_data.ret_alloc_attrs_.size() == comp_data.ret_indices_.size());
+
+    const string& target = pair.first;
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    Device* target_device = target_flr->device();
+    const FunctionBody* fbody = target_flr->GetFunctionBody(comp_data.handle_);
+    DCHECK(fbody != nullptr);
+
+    output_devices->resize(data->num_outputs_);
+    for (int j = 0; j < comp_data.ret_indices_.size(); ++j) {
+      int ret_index = comp_data.ret_indices_[j];
+      if (fbody->ret_types[j] == DT_RESOURCE) {
+        (*output_devices)[ret_index] = target_device;
+      } else {
+        (*output_devices)[ret_index] =
+            comp_data.ret_alloc_attrs_[j].on_host() ? nullptr : target_device;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+void ProcessFunctionLibraryRuntime::RunMultiDevice(
+    const FunctionLibraryRuntime::Options& opts,
+    FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
+    std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done) const {
+  if (opts.create_rendezvous) {
+    // FLR->Run() is the default entry point. It checks for cancellation,
+    // creates rendezvous, etc.
+    // Letting create_rendezvous through will do the wrong thing - each
+    // component function will get a separate rendezvous created by its FLR.
+    done(
+        errors::Internal("Cannot call ProcessFunctionLibraryRuntime::Run with "
+                         "create_rendezvous=true. Please run the function "
+                         "using FunctionLibraryRuntime::Run"));
+    return;
+  }
+
+  const MultiDeviceFunctionData* data = IsMultiDevice(handle);
+  if (data == nullptr) {
+    done(
+        errors::InvalidArgument("Failed for find multi-device function handle ",
+                                handle, ". Was the function instantiated?"));
+    return;
+  }
+
+  if (data->glue_.empty()) {
+    // Trivial case where the function body is empty.
+    done(Status::OK());
+    return;
+  }
+
+  auto* refcounted_done = new ReffedStatusCallback(std::move(done));
+  for (int i = 0; i < data->glue_.size(); ++i) {
+    refcounted_done->Ref();
+  }
+
+  FunctionLibraryRuntime::Options opts_copy = opts;
+  for (const auto& pair : data->glue_) {
+    const string& target = pair.first;
+    const ComponentFunctionData& comp_data = pair.second;
+    FunctionLibraryRuntime::Handle handle = pair.second.handle_;
+    VLOG(1) << "Running function shard on device " << target << " with handle "
+            << handle;
+
+    opts_copy.args_alloc_attrs = comp_data.arg_alloc_attrs_;
+    opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs_;
+    opts_copy.remote_execution = false;
+    std::vector<Tensor> comp_args =
+        GetArgsForIndices(comp_data.arg_indices_, args);
+    std::vector<Tensor>* comp_rets = new std::vector<Tensor>;
+    rets->resize(data->num_outputs_);
+    GetFLR(target)->Run(
+        opts_copy, handle, comp_args, comp_rets,
+        [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
+          if (!status.ok()) {
+            LOG(ERROR) << "Component function execution failed: " << status;
+            refcounted_done->UpdateStatus(status);
+          } else {
+            for (int i = 0; i < comp_rets->size(); ++i) {
+              (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
+            }
+          }
+          delete comp_rets;
+          // refcounted_done is thread-safe
+          refcounted_done->Unref();
+        });
+  }
+  refcounted_done->Unref();
+}
+
 Status ProcessFunctionLibraryRuntime::Instantiate(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::Handle* handle) {
+  if (options.is_multi_device_function) {
+    return InstantiateMultiDevice(function_name, attrs, options, handle);
+  }
+
   *handle = kInvalidHandle;
   FunctionLibraryRuntime* flr = GetFLR(options.target);
   if (flr != nullptr) {
@@ -247,11 +774,7 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
     FunctionLibraryRuntime::Handle h =
         gtl::FindWithDefault(table_, function_key, kInvalidHandle);
     if (h == kInvalidHandle || function_data_.count(h) == 0) {
-      h = next_handle_;
-      function_data_[h] = MakeUnique<FunctionData>(
-          options.target, kInvalidHandle, function_key);
-      table_[function_key] = h;
-      next_handle_++;
+      h = AddHandleLocked(function_key, options.target, kInvalidHandle);
     }
     f = function_data_[h].get();
     *handle = h;
@@ -272,8 +795,48 @@ Status ProcessFunctionLibraryRuntime::RemoveHandle(
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  std::unique_ptr<MultiDeviceFunctionData> mdata;
+  {
+    mutex_lock l(mu_);
+    auto it = mdevice_data_.find(handle);
+    --it->second->instantiation_counter_;
+    if (it->second->instantiation_counter_ != 0) {
+      return Status::OK();
+    }
+    mdata = std::move(it->second);
+    table_.erase(mdata->function_key_);
+    mdevice_data_.erase(it);
+  }
+
+  // If we are here we are releasing the last instantiation of `handle`.
+  // Release all component function handles.
+  Status overall_status;
+  for (const auto& it : mdata->glue_) {
+    const string& device = it.first;
+    FunctionLibraryRuntime::Handle flr_handle = it.second.handle_;
+    FunctionLibraryRuntime* flr = GetFLR(device);
+    if (flr == nullptr) {
+      return errors::InvalidArgument(
+          "Failed to find FunctionLibraryRuntime for device ", device,
+          " when releasing multi-device function handle ", handle);
+    }
+    Status status = flr->ReleaseHandle(flr_handle);
+    if (!status.ok()) {
+      overall_status = status;
+    }
+  }
+
+  return overall_status;
+}
+
 Status ProcessFunctionLibraryRuntime::ReleaseHandle(
     FunctionLibraryRuntime::Handle handle) {
+  if (IsMultiDevice(handle)) {
+    return ReleaseMultiDeviceHandle(handle);
+  }
+
   FunctionLibraryRuntime* flr = nullptr;
   string target_device;
   {
@@ -291,12 +854,15 @@ Status ProcessFunctionLibraryRuntime::ReleaseHandle(
 void ProcessFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
-    std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) {
-  if (!opts.remote_execution) {
-    done(errors::InvalidArgument(
-        "ProcessFunctionLibraryRuntime::Run should only be called when there ",
-        "is a remote execution."));
-    return;
+    std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done) const {
+  bool multi_device;
+  {
+    tf_shared_lock l(mu_);
+    multi_device = mdevice_data_.find(handle) != mdevice_data_.end();
+  }
+  if (multi_device) {
+    return RunMultiDevice(opts, handle, args, rets, done);
   }
 
   FunctionLibraryRuntime* flr = nullptr;
@@ -313,6 +879,15 @@ void ProcessFunctionLibraryRuntime::Run(
     target_device = function_data->target_device();
     local_handle = function_data->local_handle();
   }
+
+  if (!opts.remote_execution) {
+    done(
+        errors::InvalidArgument("ProcessFunctionLibraryRuntime::Run should "
+                                "only be called for multi-device functions or "
+                                "for remote execution."));
+    return;
+  }
+
   flr = GetFLR(target_device);
   if (flr != nullptr) {
     auto rendezvous = opts.rendezvous;
@@ -374,7 +949,7 @@ Status ProcessFunctionLibraryRuntime::Clone(
     Env* env, int graph_def_version, const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
-    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) {
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) const {
   out_lib_def->reset(new FunctionLibraryDefinition(*lib_def_));
   out_pflr->reset(new ProcessFunctionLibraryRuntime(
       device_mgr_, env, graph_def_version, out_lib_def->get(),
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 53815715d8b9d033f5600320108cb443c36b3e93..a08e84510737190c628775f6a8002a1190056207 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -79,7 +80,8 @@ class ProcessFunctionLibraryRuntime {
   FunctionLibraryRuntime* GetFLR(const string& device_name) const;
 
   // Returns the device incarnation for the given device_name.
-  Status GetDeviceIncarnation(const string& device_name, int64* incarnation);
+  Status GetDeviceIncarnation(const string& device_name,
+                              int64* incarnation) const;
 
   // For a given canonicalized key signature of the function instantiated
   // on device `device_name` and a `local_handle`, creates a handle and returns
@@ -94,14 +96,23 @@ class ProcessFunctionLibraryRuntime {
 
   // For the given handle instantiated on device `device_name` returns the local
   // index of instantiation of that function. If the function was not
-  // instantiated on `device_name` returns kInvalidLocalHandle.
+  // instantiated on `device_name` or the function is multi-device,
+  // returns kInvalidLocalHandle.
   FunctionLibraryRuntime::LocalHandle GetHandleOnDevice(
-      const string& device_name, FunctionLibraryRuntime::Handle handle);
+      const string& device_name, FunctionLibraryRuntime::Handle handle) const;
+
+  // Fills `output_devices` with the devices on which the results will
+  // be produced. If some output is produced on CPU, the corresponding Device*
+  // is set to nullptr. If some output is DT_RESOURCE, the corresponding Device*
+  // is set to the device backing the resource.
+  // REQUIRES: `handle` identifies a multi-device function.
+  Status GetOutputDevices(FunctionLibraryRuntime::Handle handle,
+                          std::vector<Device*>* output_devices) const;
 
   // Returns true if function with handle `handle` was instantiated on device
-  // `device_name`.
+  // `device_name`. Returns false for multi-device functions.
   bool IsInstantiatedOnDevice(const string& device_name,
-                              FunctionLibraryRuntime::Handle handle);
+                              FunctionLibraryRuntime::Handle handle) const;
 
   // Instantiates the function. See framework/function.h for more details.
   // Allows for function_name to be instantiated on different devices
@@ -114,6 +125,9 @@ class ProcessFunctionLibraryRuntime {
   // tells it to release it. If the `handle` isnt' needed at all, the local FLR
   // might call RemoveHandle on this to get rid of the state owned by the Proc
   // FLR.
+  // For multi-device functions, calls ReleaseHandle on local FLRs for each
+  // component function that is part of this multi-device function.
+  // Each local FLR might call RemoveHandle on this.
   Status ReleaseHandle(FunctionLibraryRuntime::Handle handle);
 
   // Runs the function with given `handle`. Function could have been
@@ -121,17 +135,78 @@ class ProcessFunctionLibraryRuntime {
   void Run(const FunctionLibraryRuntime::Options& opts,
            FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
            std::vector<Tensor>* rets,
-           FunctionLibraryRuntime::DoneCallback done);
+           FunctionLibraryRuntime::DoneCallback done) const;
 
  private:
+  friend class FunctionLibraryRuntimeImpl;
+
+  using DeviceAndFHandle = std::pair<string, FunctionLibraryRuntime::Handle>;
+  using ArgAndRetIndices = std::pair<std::vector<int>, std::vector<int>>;
+  using ArgAndRetAllocAttrs = std::pair<std::vector<AllocatorAttributes>,
+                                        std::vector<AllocatorAttributes>>;
+
+  FunctionLibraryRuntime::Handle AddHandleLocked(
+      const string& function_key, const string& device_name,
+      FunctionLibraryRuntime::LocalHandle local_handle)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Structure to keep track of how a component function (a single-device
+  // piece of a multi-device function) fits into the multi-device function.
+  struct ComponentFunctionData {
+    // The handle for the instantiated component function.
+    FunctionLibraryRuntime::Handle handle_;
+    // arg_indices_.size() is the number of arguments to the component function.
+    // The i'th argument of the component function comes from the
+    // `arg_indices_[i]`th argument of the multi-device function.
+    std::vector<int> arg_indices_;
+    // ret_indices_.size() is the number of return value of the component
+    // function.  The i'th return value of the component function goes to the
+    // `ret_indices_[i]`th return value of the multi-device function.
+    std::vector<int> ret_indices_;
+    // arg_alloc_attrs_[i] are the allocator attributes of the i'th argument to
+    // the component function.
+    std::vector<AllocatorAttributes> arg_alloc_attrs_;
+    // ret_alloc_attrs_[i] are the allocator attributes of the i'th return value
+    // of the component function.
+    std::vector<AllocatorAttributes> ret_alloc_attrs_;
+  };
+
+  // Data structure holding information for a single instantiated multi-device
+  // function.
+  // The fields are filled in during instantiation. Once the object is
+  // added to mdevice_data_, all fields are constant.
+  struct MultiDeviceFunctionData {
+    MultiDeviceFunctionData(const string& function_name,
+                            const string& function_key, int num_outputs,
+                            const FunctionLibraryDefinition& overlay_lib)
+        : num_outputs_(num_outputs),
+          instantiation_counter_(1),
+          function_name_(function_name),
+          function_key_(function_key),
+          overlay_lib_(overlay_lib) {}
+
+    // Stored here to resize the output tensor vector when function is run.
+    const int num_outputs_;
+    uint64 instantiation_counter_;
+    const string function_name_;
+    const string function_key_;
+    // The overlay library holding component function definitions as well as
+    // the definitions of functions they call.
+    FunctionLibraryDefinition overlay_lib_;
+
+    // Maps the device name to the information about the component function
+    // be run on this device.
+    std::unordered_map<string, ComponentFunctionData> glue_;
+  };
+
   // For a given device_name, returns a DeviceContext for copying
   // tensors to/from the device.
   Status GetDeviceContext(const string& device_name,
-                          DeviceContext** device_context);
+                          DeviceContext** device_context) const;
 
   // Looks up the information for the given `handle` and returns the name
   // of the device where the function is registered.
-  string GetDeviceName(FunctionLibraryRuntime::Handle handle);
+  string GetDeviceName(FunctionLibraryRuntime::Handle handle) const;
 
   // Removes handle from the state owned by this object.
   Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
@@ -140,12 +215,39 @@ class ProcessFunctionLibraryRuntime {
                const OptimizerOptions& optimizer_options,
                CustomKernelCreator custom_kernel_creator,
                std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
-               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr);
-
-  friend class FunctionLibraryRuntimeImpl;
-
-  mutable mutex mu_;
-
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) const;
+
+  Status ReleaseMultiDeviceHandle(FunctionLibraryRuntime::Handle handle);
+
+  // If handle represents a multi-device function, returns the multi-device
+  // data associated with handle. Else, nullptr.
+  MultiDeviceFunctionData* IsMultiDevice(
+      FunctionLibraryRuntime::Handle handle) const;
+
+  Status InstantiateMultiDevice(
+      const string& function_name, AttrSlice attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      FunctionLibraryRuntime::Handle* handle);
+
+  FunctionLibraryRuntime::Handle AddMultiDeviceHandle(
+      const std::unique_ptr<MultiDeviceFunctionData> data,
+      const string& function_key);
+
+  // TODO(iga): Reword
+  // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
+  // corresponding resource lives. This ensures that the Placer assigns ops that
+  // access these resources to the appropriate devices.
+  Status PinArgsAndRets(const std::vector<string>& input_devices,
+                        const std::vector<string>& output_devices,
+                        const DeviceSet& device_set, Graph* graph) const;
+
+  void RunMultiDevice(const FunctionLibraryRuntime::Options& opts,
+                      FunctionLibraryRuntime::Handle handle,
+                      gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
+                      FunctionLibraryRuntime::DoneCallback done) const;
+
+  // Data structure holding information for a single instantiated remote
+  // (to be executed on `target_device`) function.
   class FunctionData {
    public:
     FunctionData(const string& target_device,
@@ -181,15 +283,26 @@ class ProcessFunctionLibraryRuntime {
     Notification init_done_;
   };
 
+  mutable mutex mu_;
+
   const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
   thread::ThreadPool* default_thread_pool_;
-  // Holds all the function invocations here.
+
+  // Holds all the function instantiations. Maps function_keys to handles.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
+
+  // Function data for instantitated remote functions.
   std::unordered_map<FunctionLibraryRuntime::Handle,
                      std::unique_ptr<FunctionData>>
       function_data_ GUARDED_BY(mu_);
+
+  // Function data for instantiated multi-device functions.
+  std::unordered_map<FunctionLibraryRuntime::Handle,
+                     std::unique_ptr<MultiDeviceFunctionData>>
+      mdevice_data_ GUARDED_BY(mu_);
+
   std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
   int next_handle_ GUARDED_BY(mu_);
   DistributedFunctionLibraryRuntime* const parent_;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 21cb62118aebafa8a03903296b65f0617510f080..b4d3ac0df304e7caf0b742d018d43c9def2d76e6 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -21,7 +21,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -29,6 +31,11 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+#ifdef GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 namespace {
 
@@ -65,9 +72,18 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
                                           &devices));
-    device0_ = devices[0].get();
-    device1_ = devices[1].get();
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
+    TF_CHECK_OK(device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:CPU:0", &device0_));
+    TF_CHECK_OK(device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:CPU:1", &device1_));
+    // If no GPU is available, gpu_device_ will remain nullptr.
+    Status status = device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:GPU:0", &gpu_device_);
+    if (!status.ok()) {
+      CHECK_EQ(nullptr, gpu_device_);
+    }
+
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
@@ -86,6 +102,55 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return proc_flr_->Instantiate(name, attrs, instantiate_opts, handle);
   }
 
+  Tensor GPUToCPU(const Tensor& device_tensor) {
+#ifdef GOOGLE_CUDA
+    CHECK(gpu_device_);
+    CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
+    DeviceContext* device_context =
+        gpu_device_->tensorflow_gpu_device_info()->default_context;
+
+    Notification n;
+    Status status;
+    Tensor cpu_tensor(device_tensor.dtype(), device_tensor.shape());
+    device_context->CopyDeviceTensorToCPU(&device_tensor, "", gpu_device_,
+                                          &cpu_tensor,
+                                          [&n, &status](const Status& s) {
+                                            status = s;
+                                            n.Notify();
+                                          });
+    n.WaitForNotification();
+    CHECK(status.ok());
+    return cpu_tensor;
+#else
+    CHECK(false);
+#endif  // GOOGLE_CUDA
+  }
+
+  Tensor CPUToGPU(const Tensor& cpu_tensor) {
+#ifdef GOOGLE_CUDA
+    CHECK(gpu_device_);
+    CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
+    DeviceContext* device_context =
+        gpu_device_->tensorflow_gpu_device_info()->default_context;
+
+    Notification n;
+    Status status;
+    Tensor device_tensor(gpu_device_->GetAllocator({}), cpu_tensor.dtype(),
+                         cpu_tensor.shape(), {});
+    device_context->CopyCPUTensorToDevice(&cpu_tensor, gpu_device_,
+                                          &device_tensor,
+                                          [&n, &status](const Status& s) {
+                                            status = s;
+                                            n.Notify();
+                                          });
+    n.WaitForNotification();
+    CHECK(status.ok());
+    return device_tensor;
+#else
+    CHECK(false);
+#endif  // GOOGLE_CUDA
+  }
+
   Status Run(const string& name, FunctionLibraryRuntime::Options opts,
              test::function::Attrs attrs,
              const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts,
@@ -135,7 +200,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                      done2.Notify();
                    });
     done2.WaitForNotification();
-    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(errors::IsNotFound(status)) << "Actual status: " << status;
     EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
 
     return Status::OK();
@@ -144,6 +209,8 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
   std::unique_ptr<DeviceMgr> device_mgr_;
   Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  // Remains as nullptr if no GPU is available.
+  Device* gpu_device_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<TestClusterFLR> cluster_flr_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr_;
@@ -345,5 +412,300 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, ClusterFLRParallelTest) {
   rendezvous_->Unref();
 }
 
+bool IsCUDATensor(const Tensor& t) {
+#ifdef GOOGLE_CUDA
+  cudaPointerAttributes attributes;
+  cudaError_t err =
+      cudaPointerGetAttributes(&attributes, t.tensor_data().data());
+  if (err == cudaErrorInvalidValue) return false;
+  CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
+  return (attributes.memoryType == cudaMemoryTypeDevice);
+#else
+  CHECK(false)
+      << "IsCUDATensor should not be called when CUDA is not available";
+#endif  // GOOGLE_CUDA
+}
+
+void TestTwoDeviceMult(
+    ProcessFunctionLibraryRuntimeTest* fixture,
+    const FunctionLibraryRuntime::InstantiateOptions& inst_opts,
+    const string& error = "") {
+  fixture->Init({test::function::TwoDeviceMult()});
+  FunctionLibraryRuntime::Options opts;
+  auto x = test::AsTensor<float>({1, 2, 3});
+  Tensor y_cpu;
+  Tensor y_gpu;
+  Status status = fixture->Run("TwoDeviceMult", opts, {{"T", DT_FLOAT}},
+                               inst_opts, {x}, {&y_cpu, &y_gpu});
+  if (!error.empty()) {
+    EXPECT_TRUE(errors::IsInvalidArgument(status))
+        << "Actual status: " << status;
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
+        << "Actual error message: " << status.error_message();
+    fixture->rendezvous_->Unref();
+    return;
+  }
+
+  EXPECT_TRUE(status.ok()) << "Actual status: " << status;
+  EXPECT_FALSE(IsCUDATensor(y_cpu));
+  test::ExpectTensorEqual<float>(y_cpu, test::AsTensor<float>({2, 4, 6}));
+
+  EXPECT_TRUE(IsCUDATensor(y_gpu));
+  Tensor y_gpu_on_cpu = fixture->GPUToCPU(y_gpu);
+  test::ExpectTensorEqual<float>(y_gpu_on_cpu,
+                                 test::AsTensor<float>({3, 6, 9}));
+  fixture->rendezvous_->Unref();
+}
+
+void TestTwoDeviceInputOutput(
+    ProcessFunctionLibraryRuntimeTest* fixture,
+    const FunctionLibraryRuntime::InstantiateOptions& inst_opts) {
+  if (fixture->gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  fixture->Init({test::function::TwoDeviceInputOutput()});
+  FunctionLibraryRuntime::Options opts;
+  Tensor x1 = test::AsTensor<float>({1, 2});
+  if (str_util::StrContains(inst_opts.input_devices[0], "GPU")) {
+    x1 = fixture->CPUToGPU(x1);
+  }
+  Tensor x2 = test::AsTensor<float>({10, 20});
+  if (str_util::StrContains(inst_opts.input_devices[1], "GPU")) {
+    x2 = fixture->CPUToGPU(x2);
+  }
+
+  Tensor y1;
+  Tensor y2;
+  TF_CHECK_OK(fixture->Run("TwoDeviceInputOutput", opts, {{"T", DT_FLOAT}},
+                           inst_opts, {x1, x2}, {&y1, &y2}));
+
+  if (str_util::StrContains(inst_opts.output_devices[0], "GPU")) {
+    EXPECT_TRUE(IsCUDATensor(y1));
+    y1 = fixture->GPUToCPU(y1);
+  } else {
+    EXPECT_FALSE(IsCUDATensor(y1));
+  }
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({2, 4}));
+
+  if (str_util::StrContains(inst_opts.output_devices[1], "GPU")) {
+    EXPECT_TRUE(IsCUDATensor(y2));
+    y2 = fixture->GPUToCPU(y2);
+  } else {
+    EXPECT_FALSE(IsCUDATensor(y2));
+  }
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({30, 60}));
+
+  fixture->rendezvous_->Unref();
+}
+
+std::vector<string> CompleteDevices(const std::vector<string>& v) {
+  std::vector<string> result;
+  result.reserve(v.size());
+  for (const string& s : v) {
+    result.push_back(strings::StrCat("/job:a/replica:0/task:0/device:", s));
+  }
+  return result;
+}
+
+FunctionLibraryRuntime::InstantiateOptions MakeOptions(
+    const string& target, const std::vector<string>& input_devices,
+    const std::vector<string>& output_devices) {
+  FunctionLibraryRuntime::InstantiateOptions inst_opts;
+  inst_opts.target = target;
+  inst_opts.input_devices = CompleteDevices(input_devices);
+  inst_opts.output_devices = CompleteDevices(output_devices);
+  inst_opts.is_multi_device_function = true;
+  return inst_opts;
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ExplicitOutputDevice) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_InferredOutputDevice) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0"}, {}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenNoInputDevices) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {}, {}),
+                    "input_devices must have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenTooManyInputDevices) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0", "CPU:1"}, {}),
+                    "input_devices must have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenTooManyOutputDevices) {
+  TestTwoDeviceMult(
+      this, MakeOptions("CPU:0", {"CPU:0"}, {"CPU:0", "GPU:0", "CPU:1"}),
+      "output_devices must either be empty or have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenBadTargetDevice) {
+  TestTwoDeviceMult(
+      this, MakeOptions("GPU:11", {"CPU:0"}, {"CPU:0", "GPU:0"}),
+      "Cannot instantiate multi-device function with target device GPU:11");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenListInput) {
+  const FunctionDef& def = test::function::FuncWithListInput();
+  Init({def});
+  FunctionLibraryRuntime::Handle handle;
+  Status status = proc_flr_->Instantiate(
+      "FuncWithListInput", test::function::Attrs({{"T", DT_FLOAT}, {"N", 1}}),
+      MakeOptions("CPU:0", {"CPU:0"}, {}), &handle);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << "Actual status: " << status;
+  ASSERT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "FuncWithListInput has an input named \"x1\" that is a list of tensors"))
+      << "Actual error message: " << status.error_message();
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenListOutput) {
+  const FunctionDef& def = test::function::FuncWithListOutput();
+  Init({def});
+  FunctionLibraryRuntime::Handle handle;
+  Status status = proc_flr_->Instantiate(
+      "FuncWithListOutput", test::function::Attrs({{"T", DT_FLOAT}, {"N", 1}}),
+      MakeOptions("CPU:0", {}, {"CPU:0"}), &handle);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << "Actual status: " << status;
+  ASSERT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "FuncWithListOutput has an output named \"y\" that is a list of tensors"))
+      << "Actual error message: " << status.error_message();
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ExplicitMultiInputOutput) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"CPU:0", "GPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipInputs) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipOutputs) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"CPU:0", "GPU:0"}, {"GPU:0", "CPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipBoth) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"GPU:0", "CPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_EmptyBodySwap) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  FunctionLibraryRuntime::InstantiateOptions inst_opts =
+      MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"CPU:0", "GPU:0"});
+  Init({test::function::EmptyBodySwap()});
+
+  Tensor x1 = CPUToGPU(test::AsTensor<float>({1, 2}));
+  Tensor x2 = test::AsTensor<float>({10, 20});
+  Tensor y1;
+  Tensor y2;
+  TF_CHECK_OK(Run("EmptyBodySwap", {}, {{"T", DT_FLOAT}}, inst_opts, {x1, x2},
+                  {&y1, &y2}));
+
+  EXPECT_FALSE(IsCUDATensor(y1));
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({10, 20}));
+
+  EXPECT_TRUE(IsCUDATensor(y2));
+  y2 = GPUToCPU(y2);
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2}));
+
+  rendezvous_->Unref();
+}
+
+Tensor GetResourceHandle(const string& var_name, const string& container,
+                         const string& device_name) {
+  ResourceHandle handle;
+  handle.set_device(device_name);
+  handle.set_container(container);
+  handle.set_name(var_name);
+  handle.set_hash_code(MakeTypeIndex<Var>().hash_code());
+  handle.set_maybe_type_name(MakeTypeIndex<Var>().name());
+  Tensor tensor(DT_RESOURCE, TensorShape({}));
+  tensor.scalar<ResourceHandle>()() = handle;
+  return tensor;
+}
+
+void TestResourceOutputAndUse(ProcessFunctionLibraryRuntimeTest* fixture,
+                              const string& resource_return_device) {
+  if (fixture->gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  FunctionLibraryRuntime::InstantiateOptions inst_opts = MakeOptions(
+      "CPU:0", {"GPU:0", "GPU:0"}, {resource_return_device, "GPU:0"});
+  fixture->Init({test::function::ResourceOutput(),
+                 test::function::ReadResourceVariable()});
+
+  // Make resource var
+  Tensor resource_value = fixture->CPUToGPU(test::AsTensor<float>({10, 20}));
+  Var* resource = new Var(DT_FLOAT);
+  *resource->tensor() = resource_value;
+  resource->is_initialized = true;
+  ResourceMgr* mgr = fixture->gpu_device_->resource_manager();
+  Status status = mgr->Create(mgr->default_container(), "my_gpu_var", resource);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+
+  // Run the function taking a resource and outputing it
+  Tensor x1 = fixture->CPUToGPU(test::AsTensor<float>({1, 2}));
+  Tensor x2 = GetResourceHandle("my_gpu_var", mgr->default_container(),
+                                "/job:a/replica:0/task:0/device:GPU:0");
+  Tensor returned_handle;
+  Tensor y2;
+  TF_CHECK_OK(fixture->Run("ResourceOutput", {}, {{"T", DT_FLOAT}}, inst_opts,
+                           {x1, x2}, {&returned_handle, &y2}));
+
+  EXPECT_FALSE(IsCUDATensor(returned_handle));
+  EXPECT_TRUE(IsCUDATensor(y2));
+  y2 = fixture->GPUToCPU(y2);
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({2, 4}));
+
+  // Read the variable using the handle returned from previous function to
+  // make sure the handle and read value is on the right device.
+  inst_opts = MakeOptions("GPU:0", {"GPU:0"}, {"GPU:0"});
+  Tensor read_resource;
+  TF_CHECK_OK(fixture->Run("ReadResourceVariable", {}, {{"T", DT_FLOAT}},
+                           inst_opts, {returned_handle}, {&read_resource}));
+  EXPECT_TRUE(IsCUDATensor(read_resource));
+  read_resource = fixture->GPUToCPU(read_resource);
+  test::ExpectTensorEqual<float>(read_resource,
+                                 test::AsTensor<float>({10, 20}));
+
+  fixture->rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ResourceOutput_GPU) {
+  TestResourceOutputAndUse(this, "GPU:0");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ResourceOutput_CPU) {
+  TestResourceOutputAndUse(this, "CPU:0");
+}
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 092f15e49e330de21452e0f7b4d8cc51607a44ed..8ed2fc2f1c973467cfe881b4e5a0a0d870fc58fd 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -394,37 +394,6 @@ void RingReducer::Finish(bool ok) {
   done_(s);
 }
 
-RingReducer::SubContext::SubContext(OpKernelContext* ctx,
-                                    OpKernelContext::Params* params,
-                                    OpKernel* op, Tensor* output, Tensor* input)
-    : sub_params_(*params),
-      sub_inputs_({output, input}),
-      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
-      sub_input_dc_(
-          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
-  sub_params_.op_kernel = op;
-  sub_params_.inputs = &sub_inputs_;
-  sub_params_.input_alloc_attrs = &sub_input_attr_;
-  sub_params_.input_device_contexts = &sub_input_dc_;
-  sub_params_.eigen_gpu_device = nullptr;
-  sub_params_.ensure_eigen_gpu_device();
-  sub_params_.forward_from_array = &forward_from_;
-  sub_ctx_ = new OpKernelContext(&sub_params_, 1);
-}
-
-Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
-                                 Tensor* input) {
-  // Prepare an OpKernelContext that is identical to that of the original Op
-  // (i.e. the collective), except for the input output sizes and identities and
-  // the Op itself.
-  // TODO(tucker): Is it possible to cache and reuse these objects?  They're
-  // mostly identical inside one device execution.
-  std::unique_ptr<SubContext> sub_ctx(
-      new SubContext(col_ctx_->op_ctx, col_ctx_->op_params, op, output, input));
-  device->Compute(op, sub_ctx->sub_ctx_);
-  return sub_ctx->sub_ctx_->status();
-}
-
 // At the beginning of the algorithm initialize a RingField struct for
 // every independent field of the tensor.
 void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
@@ -632,9 +601,9 @@ bool RingReducer::RunAsyncParts() {
           --recv_pending_count;
           if (!rf->second_pass) {
             rf->action = RF_REDUCE;
-            Status s =
-                ComputeBinOp(col_ctx_->device, col_params_->merge_op.get(),
-                             &rf->chunk, &rf->tmp_chunk);
+            Status s = collective_util::ComputeBinOp(
+                col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+                col_params_->merge_op.get(), &rf->chunk, &rf->tmp_chunk);
             if (!s.ok()) {
               aborted = true;
               StartAbort(s);
@@ -647,9 +616,9 @@ bool RingReducer::RunAsyncParts() {
           if (!rf->second_pass && col_params_->final_op.get() && rf->is_final) {
             rf->action = RF_FINALIZE;
             group_size_tensor_ready_.WaitForNotification();
-            Status s =
-                ComputeBinOp(col_ctx_->device, col_params_->final_op.get(),
-                             &rf->chunk, &group_size_tensor_);
+            Status s = collective_util::ComputeBinOp(
+                col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+                col_params_->final_op.get(), &rf->chunk, &group_size_tensor_);
             if (!s.ok()) {
               aborted = true;
               StartAbort(s);
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
index 0848e37b5225b16a82e19943a3bcc57148fd744c..a5aa8fad70caa27e2c3f1f2d6b50a9ec843b8f07 100644
--- a/tensorflow/core/common_runtime/ring_reducer.h
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -40,6 +40,11 @@ class RingReducer : public CollectiveImplementationInterface {
   // and device_locality.  Also saves the CollectiveContext in this object.
   Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
 
+  // No-op for ring reducer.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
   // Begins async execution of the ring reduce algorithm.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
@@ -52,27 +57,8 @@ class RingReducer : public CollectiveImplementationInterface {
   void StartAbort(const Status& s);
   void ContinueAfterInputCopy();
   void Finish(bool ok);
-  Status ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
-                      Tensor* input);
   bool RunAsyncParts();
 
-  // Used for executing a sub-operation, e.g. a merge_op instance, with
-  // an OpKernelContext based on the one passed into this Op.
-  class SubContext {
-   public:
-    OpKernelContext::Params sub_params_;
-    gtl::InlinedVector<TensorValue, 4> sub_inputs_;
-    gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
-    gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
-    // Used only for Binary and Unary Ops for which we require
-    // the calculation to be in-place on the first input.
-    int forward_from_ = 0;
-    OpKernelContext* sub_ctx_;
-    SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
-               OpKernel* op, Tensor* output, Tensor* input);
-    ~SubContext() { delete sub_ctx_; }
-  };
-
   // Current status of a RingField
   enum RingFieldAction {
     RF_INIT = 0,    // Just initialized for a pass
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 9488a447789e67f3a9e73af43a0f3a849457e51f..8f28d2790358456df1414ba201d58e29e80221c9 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/eval_const_tensor.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 591c22b8f625554acfe25d744cb53998f551ff29..e1961b822b45ec8610c8d6c30c897fb8ba38a8b2 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -221,10 +221,14 @@ tf_cc_test(
     deps = [
         ":debug_grpc_testlib",
         ":debug_io_utils",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index e388d3e6f0f5636c044c36ee03c826f1872cac9f..351dbebf797aea999d993cbf37951e809758dfa5 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -587,6 +587,7 @@ tf_cc_test(
         ":collective_param_resolver_distributed",
         ":device_resolver_distributed",
         ":test_utils",
+        "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -594,6 +595,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:collective_ops",
     ],
 )
 
@@ -647,6 +649,7 @@ tf_cuda_cc_test(
         ":remote_device",
         ":worker_interface",
         "//tensorflow:grpc++",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -712,9 +715,14 @@ tf_cuda_cc_test(
     tags = tf_cuda_tests_tags(),
     deps = [
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index 1dd10d309b5f5acad2acab660aa709a9c0e9751d..9f94a24fcdda5ee8f3033a05f44b72540baf4cca 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -181,6 +181,10 @@ void CollectiveParamResolverDistributed::CompleteInstanceAsync(
                           ir->WaitForOutMu(l);
                           response->set_instance_key(cp->instance.instance_key);
                           response->set_source_rank(ir->source_rank);
+                          if (!cp->instance.communicator_key.empty()) {
+                            response->set_communicator_key(
+                                cp->instance.communicator_key);
+                          }
                           done_and_cleanup(fi_status);
                         } else {
                           done_and_cleanup(fi_status);
@@ -283,8 +287,10 @@ void CollectiveParamResolverDistributed::UpdateInstanceCache(
   using InstanceRecPointer = InstanceRec*;
   InstanceRecPointer* irp = new InstanceRecPointer(nullptr);
   int32 source_rank = resp.source_rank();
+  string communicator_key = resp.communicator_key();
 
-  auto continue_with_ir = [this, cp, irp, source_rank, done](const Status& s) {
+  auto continue_with_ir = [cp, irp, source_rank, communicator_key,
+                           done](const Status& s) {
     if (!s.ok()) {
       done(s);
       delete irp;
@@ -306,6 +312,19 @@ void CollectiveParamResolverDistributed::UpdateInstanceCache(
         }
         ir->source_rank = source_rank;
       }
+      if (ir->communicator_key != communicator_key) {
+        if (!ir->communicator_key.empty()) {
+          ir->status = errors::Internal(
+              "UpdateInstanceCache: CompleteInstanceResponse for instance ",
+              cp->instance.instance_key,
+              " gives communicator_key with size =", communicator_key.size(),
+              " but cache already holds communicator_key with size=",
+              ir->communicator_key.size());
+          status = ir->status;
+          break;
+        }
+        ir->communicator_key = communicator_key;
+      }
       if (ir->known_count < cp->group.group_size) {
         ir->known_count = cp->group.group_size;
         if (ir->known.size() != cp->group.group_size) {
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 40b18d321a1cb3fafeaa4b864e737f6d86695842..823d7d5eb980bcbc414e4683cc7fabd154e7c28d 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -268,6 +268,8 @@ class DeviceResDistTest : public ::testing::Test {
         EXPECT_EQ(cp_[idx].instance.device_names[idx], device_name);
         EXPECT_EQ(cp_[idx].instance.task_names[idx], task_name);
         if (idx > 0) {
+          EXPECT_EQ(cp_[0].instance.communicator_key,
+                    cp_[idx].instance.communicator_key);
           for (int i = 0; i < dev_count; ++i) {
             EXPECT_EQ(cp_[0].instance.device_names[i],
                       cp_[idx].instance.device_names[i]);
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
index 55b2657e74ef5c2be8c1b0f11d4a00186e063e31..6f08943e2d2408d0623cca05dd4e69fb74783d5e 100644
--- a/tensorflow/core/distributed_runtime/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -84,6 +84,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index ee5823e314f777f758a6c0d8ef7129c4bbd2916c..1065f021a1b0f97dc955e2b00ff333976575b519 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -451,7 +451,8 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
       cancellation_manager,
       [item, rendezvous, ce_handle, done, start_time_usecs](const Status& s) {
         done(s);
-        UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
+        metrics::UpdateGraphExecTime(Env::Default()->NowMicros() -
+                                     start_time_usecs);
         rendezvous->Unref();
         item->Unref();
         delete ce_handle;
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 5a524eba7625f43116eea762c0e8171a746a8ae6..48b72fb9483f632012a69a1f3f8bf3e099310fbd 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -292,8 +292,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     if (tot >= 0.1 * 1048576.0) {
       bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
     }
-    return strings::StrCat(bytes, stats.node_name(), " = ",
-                           details.type_string, details.detail_text);
+    return strings::StrCat(bytes, stats.node_name(), " = ", details.type_string,
+                           details.detail_text);
   }
 
   // Send/Recv nodes that are the result of client-added
@@ -1081,17 +1081,18 @@ void CopyAndSortStrings(size_t size,
 }  // namespace
 
 void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
+                            const ConfigProto& config,
                             BuildGraphOptions* opts) {
   CallableOptions* callable_opts = &opts->callable_options;
-  CopyAndSortStrings(req.num_feeds(),
-                     [&req](size_t i) { return req.feed_name(i); },
-                     callable_opts->mutable_feed());
-  CopyAndSortStrings(req.num_fetches(),
-                     [&req](size_t i) { return req.fetch_name(i); },
-                     callable_opts->mutable_fetch());
-  CopyAndSortStrings(req.num_targets(),
-                     [&req](size_t i) { return req.target_name(i); },
-                     callable_opts->mutable_target());
+  CopyAndSortStrings(
+      req.num_feeds(), [&req](size_t i) { return req.feed_name(i); },
+      callable_opts->mutable_feed());
+  CopyAndSortStrings(
+      req.num_fetches(), [&req](size_t i) { return req.fetch_name(i); },
+      callable_opts->mutable_fetch());
+  CopyAndSortStrings(
+      req.num_targets(), [&req](size_t i) { return req.target_name(i); },
+      callable_opts->mutable_target());
 
   if (!req.options().debug_options().debug_tensor_watch_opts().empty()) {
     *callable_opts->mutable_run_options()->mutable_debug_options() =
@@ -1100,19 +1101,23 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
 
   opts->collective_graph_key =
       req.options().experimental().collective_graph_key();
+  if (config.experimental().collective_deterministic_sequential_execution()) {
+    opts->collective_order = GraphCollectiveOrder::kEdges;
+  }
 }
 
 void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
                             BuildGraphOptions* opts) {
   CallableOptions* callable_opts = &opts->callable_options;
-  CopyAndSortStrings(req.feed_size(), [&req](size_t i) { return req.feed(i); },
-                     callable_opts->mutable_feed());
-  CopyAndSortStrings(req.fetch_size(),
-                     [&req](size_t i) { return req.fetch(i); },
-                     callable_opts->mutable_fetch());
-  CopyAndSortStrings(req.target_size(),
-                     [&req](size_t i) { return req.target(i); },
-                     callable_opts->mutable_target());
+  CopyAndSortStrings(
+      req.feed_size(), [&req](size_t i) { return req.feed(i); },
+      callable_opts->mutable_feed());
+  CopyAndSortStrings(
+      req.fetch_size(), [&req](size_t i) { return req.fetch(i); },
+      callable_opts->mutable_fetch());
+  CopyAndSortStrings(
+      req.target_size(), [&req](size_t i) { return req.target(i); },
+      callable_opts->mutable_target());
 
   // TODO(cais): Add TFDBG support to partial runs.
 }
@@ -1354,9 +1359,7 @@ Status MasterSession::DeleteWorkerSessions() {
         &workers[i].call_opts, &workers[i].request, &workers[i].response, cb);
   }
 
-  if (!done.WaitFor(std::chrono::milliseconds(10000))) {
-    LOG(WARNING) << "Timeout for closing worker session";
-  }
+  done.Wait();
   for (size_t i = 0; i < workers.size(); ++i) {
     status.Update(workers[i].status);
   }
@@ -1854,7 +1857,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Prepare.
   BuildGraphOptions bgopts;
-  BuildBuildGraphOptions(req, &bgopts);
+  BuildBuildGraphOptions(req, session_opts_.config, &bgopts);
   ReffedClientGraph* rcg = nullptr;
   int64 count;
   TF_RETURN_IF_ERROR(StartStep(bgopts, false, &rcg, &count));
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc b/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
index 5f7c0cb3cae7c97fac4b4c335a617687f31bd3b5..a2b799c3e42bf5609a37edf89fdbb99a96856a68 100644
--- a/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
@@ -139,7 +139,7 @@ TEST_P(StatusPropagationTest, PartialRunDoneFirst) {
 // ExecutorDone and PartialRunDone.
 Status ExecutorError() { return errors::Internal("executor error"); }
 Status PartialRunError() { return errors::Internal("partial run error"); }
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     PartialRunMgr, StatusPropagationTest,
     ::testing::Values(
         StatusTestParam{Status::OK(), Status::OK(), Status::OK()},
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 273709a01fd799f7f4aa8afc80d3bdfc48d36322..a081ec7b67f9ef1e328c29024ac3779bf0b9b7b0 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -314,7 +314,9 @@ tf_cc_binary(
         "//tensorflow:grpc++",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:data_flow",
@@ -330,9 +332,12 @@ tf_cc_binary(
     deps = [
         ":grpc_server_lib",
         "//tensorflow:grpc++",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:testlib",
         "//tensorflow/core/distributed_runtime:server_lib",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index 4f5975bbc11a6217355c1fcf368996a0fca45969..7f63cc9344f87010ceb1225dbe4b031bd5272f2c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -159,7 +159,7 @@ GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
 
   // Set a standard backoff timeout of 1s instead of the
   // (sometimes default) 20s.
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   return ::grpc::CreateCustomChannel(
       /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index ac73182190f6978d5cac11b23e4f09b23b5b4488..1405c760d547485e5f03c5c7c3dbaecf453e9fb4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -112,12 +112,7 @@ GrpcServer::~GrpcServer() {
 
 void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
 
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func,
-    const WorkerCreationFunction& worker_func,
-    const StatsPublisherFactory& stats_factory) {
+Status GrpcServer::Init(const GrpcServerOptions& opts) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
@@ -165,9 +160,9 @@ Status GrpcServer::Init(
   worker_env_.device_mgr = new DeviceMgr(std::move(devices));
   master_env_.local_devices = worker_env_.device_mgr->ListDevices();
   worker_env_.local_devices = worker_env_.device_mgr->ListDevices();
-  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
+  worker_env_.rendezvous_mgr = opts.rendezvous_mgr_func == nullptr
                                    ? new RpcRendezvousMgr(&worker_env_)
-                                   : rendezvous_mgr_func(&worker_env_);
+                                   : opts.rendezvous_mgr_func(&worker_env_);
   string unused;
   string default_worker_name;
   if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
@@ -200,15 +195,16 @@ Status GrpcServer::Init(
   MaybeMutateBuilder(&builder);
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
-  worker_impl_ = worker_func ? worker_func(&worker_env_, config)
-                             : NewGrpcWorker(&worker_env_, config);
-  worker_service_ =
-      NewGrpcWorkerService(worker_impl_.get(), &builder).release();
+  worker_impl_ = opts.worker_func ? opts.worker_func(&worker_env_, config)
+                                  : NewGrpcWorker(&worker_env_, config);
+  worker_service_ = NewGrpcWorkerService(worker_impl_.get(), &builder,
+                                         opts.worker_service_options)
+                        .release();
   eager_service_ = new eager::GrpcEagerServiceImpl(&worker_env_, &builder);
 
   // extra service:
-  if (service_func != nullptr) {
-    service_func(&worker_env_, &builder);
+  if (opts.service_func != nullptr) {
+    opts.service_func(&worker_env_, &builder);
   }
   server_ = builder.BuildAndStart();
 
@@ -222,9 +218,9 @@ Status GrpcServer::Init(
       WorkerCacheFactory(worker_cache_factory_options, &worker_cache));
   CHECK_NE(nullptr, worker_cache);
 
-  if (collective_mgr_func) {
+  if (opts.collective_mgr_func) {
     worker_env_.collective_executor_mgr =
-        collective_mgr_func(config, &worker_env_, worker_cache);
+        opts.collective_mgr_func(config, &worker_env_, worker_cache);
     if (!worker_env_.collective_executor_mgr) {
       return errors::Internal(
           "collective_mgr_func did not return CollectiveExecutorMgr");
@@ -256,6 +252,7 @@ Status GrpcServer::Init(
   master_env_.ops = OpRegistry::Global();
   master_env_.worker_cache = worker_cache;
   master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
+  StatsPublisherFactory stats_factory = opts.stats_factory;
   master_env_.master_session_factory =
       [config, stats_factory](
           SessionOptions options, const MasterEnv* env,
@@ -282,31 +279,6 @@ Status GrpcServer::Init(
   return Status::OK();
 }
 
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func,
-    const WorkerCreationFunction& worker_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
-              worker_func, CreateNoOpStatsPublisher);
-}
-
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
-              nullptr);
-}
-
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, nullptr, nullptr);
-}
-
-Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr, nullptr); }
-
 Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
   for (const auto& job : options.cluster_def->job()) {
@@ -457,7 +429,9 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init();
   if (!s.ok()) {
     LOG(ERROR) << s;
     return s;
@@ -471,8 +445,9 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
                           std::unique_ptr<GrpcServer>* out_server) {
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
-  ServiceInitFunction service_func = nullptr;
-  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
   if (!s.ok()) {
     LOG(ERROR) << s;
     return s;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c7f543e5bfc0655a603da7436eaaca5351b2f07a..f66d7eb82e8d9bcd43868a5b65c08248f7d860da 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 
+// GrpcServer manages the lifecycle of an Eager, Worker and Master service.
+
 #include <memory>
 
 #include "grpcpp/grpcpp.h"
@@ -26,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
@@ -57,6 +60,15 @@ typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*,
                                                   const ConfigProto& config)>
     WorkerCreationFunction;
 
+struct GrpcServerOptions {
+  ServiceInitFunction service_func = nullptr;
+  RendezvousMgrCreationFunction rendezvous_mgr_func = nullptr;
+  CollectiveMgrCreationFunction collective_mgr_func = nullptr;
+  WorkerCreationFunction worker_func = nullptr;
+  StatsPublisherFactory stats_factory = CreateNoOpStatsPublisher;
+  GrpcWorkerServiceOptions worker_service_options;
+};
+
 class GrpcServer : public ServerInterface {
  protected:
   GrpcServer(const ServerDef& server_def, Env* env);
@@ -86,25 +98,7 @@ class GrpcServer : public ServerInterface {
   std::shared_ptr<GrpcChannelCache> channel_cache() { return channel_cache_; }
 
  protected:
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func,
-              const WorkerCreationFunction& worker_func,
-              const StatsPublisherFactory& stats_factory);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func,
-              const WorkerCreationFunction& worker_func);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func);
-
-  Status Init();
+  Status Init(const GrpcServerOptions& opts = GrpcServerOptions());
 
   // A subclass can override this method to support secure credentials.
   virtual std::shared_ptr<::grpc::ServerCredentials> GetServerCredentials(
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index b67f3c4563107882a556e83c07ee20ca69b3f3b4..d73638651f2b78fb935ab8865a776a708826c930 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
 
 namespace tensorflow {
@@ -92,6 +93,12 @@ class RPCState : public GrpcClientCQTag {
       }
     } else {
       VLOG(2) << "Call returned with non-ok status: " << s;
+
+      // Attach additional GRPC error information if any
+      s = Status(s.code(),
+                 strings::StrCat(s.error_message(),
+                                 "\nAdditional GRPC error information:\n",
+                                 context_.debug_error_string()));
       done_(s);
       delete this;
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index de80992095d13fa38172b3a30c5fdd6c177994e1..34bf629c8abf7646c1a8cdf85380a67021a1b9a0 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 
 #include <deque>
+#include <unordered_map>
 
 #include "grpcpp/alarm.h"
 #include "grpcpp/server_builder.h"
@@ -41,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/transport_options.pb.h"
@@ -50,37 +52,6 @@ namespace tensorflow {
 
 namespace {
 
-class GrpcWorkerService : public AsyncServiceInterface {
-  // TODO(ncteisen): consider adding a config var or flag for this
-  static constexpr const size_t kGrpcWorkerServiceThreadCount = 8;
-
- public:
-  GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder)
-      : is_shutdown_(false) {
-    builder->RegisterService(&worker_service_);
-    for (int i = 0; i < kGrpcWorkerServiceThreadCount; i++) {
-      threads_.emplace_back(
-          new GrpcWorkerServiceThread(worker, builder, &worker_service_));
-    }
-  }
-
-  void Shutdown() override {
-    bool did_shutdown = false;
-    {
-      mutex_lock l(service_shutdown_mu_);
-      if (!is_shutdown_) {
-        LOG(INFO) << "Shutting down GrpcWorkerService.";
-        is_shutdown_ = true;
-        did_shutdown = true;
-      }
-    }
-    if (did_shutdown) {
-      for (auto& worker_thread : threads_) {
-        worker_thread->Shutdown();
-      }
-    }
-  }
-
 // This macro creates a new request for the given RPC method name
 // (e.g., `ENQUEUE_REQUEST(GetStatus, false);`), and enqueues it on
 // `this->cq_`.
@@ -105,308 +76,305 @@ class GrpcWorkerService : public AsyncServiceInterface {
     }                                                                        \
   } while (0)
 
-  // This method blocks forever handling requests from the completion queue.
-  void HandleRPCsLoop() override {
-    for (auto& worker_thread : threads_) {
-      worker_thread->Start();
-    }
-    for (auto& worker_thread : threads_) {
-      worker_thread->Join();
-    }
+#define SETUP_FOR_REQUEST(method, default_depth, supports_cancel)              \
+  for (int i = 0;                                                              \
+       i < gtl::FindWithDefault(queue_depth_,                                  \
+                                static_cast<int>(GrpcWorkerMethod::k##method), \
+                                default_depth);                                \
+       ++i) {                                                                  \
+    ENQUEUE_REQUEST(method, supports_cancel);                                  \
   }
 
- private:
-  // Thread wrapping class that drives work over a single gRPC
-  // CompletionQueue.
-  class GrpcWorkerServiceThread {
-   public:
-    explicit GrpcWorkerServiceThread(
-        GrpcWorker* worker, ::grpc::ServerBuilder* builder,
-        grpc::WorkerService::AsyncService* worker_service)
-        : worker_(worker),
-          worker_service_(worker_service),
-          is_shutdown_(false) {
-      cq_ = builder->AddCompletionQueue();
-    }
-
-    void Start() {
-      thread_.reset(worker_->env()->env->StartThread(
-          ThreadOptions(), "grpc_worker_service",
-          [this]() { HandleRPCsLoop(); }));
-    }
-
-    void Join() { thread_.reset(); }  // Blocks until thread exits
-
-    void Shutdown() {
-      {
-        mutex_lock lock(shutdown_mu_);
-        is_shutdown_ = true;
-      }
-      cq_->Shutdown();
-    }
-
-   private:
-    void HandleRPCsLoop() {
-      // TODO(ncteisen): This may require performance engineering. We can
-      // change the number of threads, the number of handlers per thread,
-      // or even decide to specialize certain threads to certain methods.
-      ENQUEUE_REQUEST(GetStatus, false);
-      ENQUEUE_REQUEST(CreateWorkerSession, false);
-      ENQUEUE_REQUEST(DeleteWorkerSession, false);
-      ENQUEUE_REQUEST(CleanupAll, false);
-      ENQUEUE_REQUEST(RegisterGraph, false);
-      ENQUEUE_REQUEST(DeregisterGraph, false);
-
-      // TODO(ncteisen): Determine a better policy for enqueuing the
-      // appropriate number of each request type.
-      for (int i = 0; i < 1000; ++i) {
-        EnqueueRecvTensorRequestRaw();
-      }
-      for (int i = 0; i < 500; ++i) {
-        ENQUEUE_REQUEST(RecvBuf, true);
-      }
-      for (int i = 0; i < 100; ++i) {
-        ENQUEUE_REQUEST(RunGraph, true);
-      }
-      for (int i = 0; i < 100; ++i) {
-        ENQUEUE_REQUEST(CleanupGraph, false);
-      }
-
-      ENQUEUE_REQUEST(Logging, false);
-      ENQUEUE_REQUEST(Tracing, false);
-
-      for (int i = 0; i < 10; ++i) {
-        ENQUEUE_REQUEST(CompleteGroup, true);
-        ENQUEUE_REQUEST(CompleteInstance, true);
-        ENQUEUE_REQUEST(GetStepSequence, true);
-      }
+// GrpcWorkerService spawns one or more GrpcWorkerServiceThreads to service
+// requests.  Each thread operates on an independent completion queue.
+class GrpcWorkerServiceThread {
+ public:
+  explicit GrpcWorkerServiceThread(
+      GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+      std::unordered_map<int, int> queue_depth,
+      grpc::WorkerService::AsyncService* worker_service)
+      : worker_(worker),
+        queue_depth_(queue_depth),
+        worker_service_(worker_service),
+        is_shutdown_(false) {
+    cq_ = builder->AddCompletionQueue();
+  }
 
-      void* tag;
-      bool ok;
+  void Start() {
+    thread_.reset(
+        worker_->env()->env->StartThread(ThreadOptions(), "grpc_worker_service",
+                                         [this]() { HandleRPCsLoop(); }));
+  }
 
-      while (cq_->Next(&tag, &ok)) {
-        UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
-            static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
-        CHECK(callback_tag);
-        callback_tag->OnCompleted(this, ok);
-      }
-    }
+  void Join() { thread_.reset(); }  // Blocks until thread exits
 
-   private:
-    void Schedule(std::function<void()> f) {
-      worker_->env()->compute_pool->Schedule(std::move(f));
+  void Shutdown() {
+    {
+      mutex_lock lock(shutdown_mu_);
+      is_shutdown_ = true;
     }
+    cq_->Shutdown();
+  }
 
-    // The following section contains one request handler method per
-    // RPC. The `FooHandler` method is called (indirectly) by
-    // `HandleRPCsLoop()` when the next Foo RPC is received. Each
-    // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
-    // and is responsible for requesting the next Foo call by calling
-    // `ENQUEUE_REQUEST(Foo)`.
-
-    template <class RequestMessage, class ResponseMessage>
-    using WorkerCall =
-        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
-             RequestMessage, ResponseMessage>;
-
-    void GetStatusHandler(
-        WorkerCall<GetStatusRequest, GetStatusResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->GetStatus(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(GetStatus, false);
+ private:
+  // Add one or more completion queue entries for each worker method, then
+  // begin servicing requests from the completion queue.
+  void HandleRPCsLoop() {
+    // TODO(ncteisen): This may require performance engineering. We can
+    // change the number of threads, the number of handlers per thread,
+    // or even decide to specialize certain threads to certain methods.
+    SETUP_FOR_REQUEST(GetStatus, 1, false);
+    SETUP_FOR_REQUEST(CreateWorkerSession, 1, false);
+    SETUP_FOR_REQUEST(DeleteWorkerSession, 1, false);
+    SETUP_FOR_REQUEST(CleanupAll, 1, false);
+    SETUP_FOR_REQUEST(RegisterGraph, 1, false);
+    SETUP_FOR_REQUEST(DeregisterGraph, 1, false);
+    SETUP_FOR_REQUEST(Logging, 1, false);
+    SETUP_FOR_REQUEST(Tracing, 1, false);
+    SETUP_FOR_REQUEST(CompleteGroup, 10, true);
+    SETUP_FOR_REQUEST(CompleteInstance, 10, true);
+    SETUP_FOR_REQUEST(GetStepSequence, 10, true);
+    SETUP_FOR_REQUEST(RecvBuf, 500, true);
+    SETUP_FOR_REQUEST(RunGraph, 100, true);
+    SETUP_FOR_REQUEST(CleanupGraph, 100, false);
+
+    // TODO(ncteisen): Determine a better policy for enqueuing the
+    // appropriate number of each request type.
+    for (int i = 0;
+         i < gtl::FindWithDefault(
+                 queue_depth_, static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+                 1000);
+         ++i) {
+      EnqueueRecvTensorRequestRaw();
     }
 
-    void CreateWorkerSessionHandler(
-        WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
-            call) {
-      Schedule([this, call]() {
-        Status s =
-            worker_->CreateWorkerSession(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CreateWorkerSession, false);
-    }
+    void* tag;
+    bool ok;
 
-    void DeleteWorkerSessionHandler(
-        WorkerCall<DeleteWorkerSessionRequest, DeleteWorkerSessionResponse>*
-            call) {
-      Schedule([this, call]() {
-        Status s =
-            worker_->DeleteWorkerSession(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(DeleteWorkerSession, false);
+    while (cq_->Next(&tag, &ok)) {
+      UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
+          static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
+      CHECK(callback_tag);
+      callback_tag->OnCompleted(this, ok);
     }
+  }
 
-    void CleanupAllHandler(
-        WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->CleanupAll(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CleanupAll, false);
-    }
+ private:
+  void Schedule(std::function<void()> f) {
+    worker_->env()->compute_pool->Schedule(std::move(f));
+  }
 
-    void RegisterGraphHandler(
-        WorkerCall<RegisterGraphRequest, RegisterGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->RegisterGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(RegisterGraph, false);
-    }
+  // The following section contains one request handler method per
+  // RPC. The `FooHandler` method is called (indirectly) by
+  // `HandleRPCsLoop()` when the next Foo RPC is received. Each
+  // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
+  // and is responsible for requesting the next Foo call by calling
+  // `ENQUEUE_REQUEST(Foo)`.
+  template <class RequestMessage, class ResponseMessage>
+  using WorkerCall =
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+           RequestMessage, ResponseMessage>;
+
+  // Handle all non-cancellable simple methods with a standard wrapper.
+#define HANDLE_CALL(method)                                                   \
+  void method##Handler(WorkerCall<method##Request, method##Response>* call) { \
+    Schedule([this, call]() {                                                 \
+      Status s = worker_->method(&call->request, &call->response);            \
+      if (!s.ok()) {                                                          \
+        VLOG(1) << "Bad response from " << #method << ": " << s;              \
+      }                                                                       \
+      call->SendResponse(ToGrpcStatus(s));                                    \
+    });                                                                       \
+    ENQUEUE_REQUEST(method, false);                                           \
+  }
 
-    void DeregisterGraphHandler(
-        WorkerCall<DeregisterGraphRequest, DeregisterGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->DeregisterGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(DeregisterGraph, false);
-    }
+  HANDLE_CALL(GetStatus);
+  HANDLE_CALL(CreateWorkerSession);
+  HANDLE_CALL(DeleteWorkerSession);
+  HANDLE_CALL(CleanupAll);
+  HANDLE_CALL(RegisterGraph);
+  HANDLE_CALL(DeregisterGraph);
+  HANDLE_CALL(CleanupGraph);
+  HANDLE_CALL(Logging);
+  HANDLE_CALL(Tracing);
+
+#undef HANDLE_CALL
+
+  void GetStepSequenceHandler(
+      WorkerCall<GetStepSequenceRequest, GetStepSequenceResponse>* call) {
+    Schedule([this, call]() {
+      worker_->GetStepSequenceAsync(
+          &call->request, &call->response, [call](const Status& s) {
+            VLOG(1) << "Bad response from GetStepSequence:" << s;
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(GetStepSequence, true);
+  }
 
-    void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        ProtoRunGraphRequest* wrapped_request =
-            new ProtoRunGraphRequest(&call->request);
-        NonOwnedProtoRunGraphResponse* wrapped_response =
-            new NonOwnedProtoRunGraphResponse(&call->response);
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
-                               [call, call_opts, wrapped_request,
-                                wrapped_response](const Status& s) {
-                                 call->ClearCancelCallback();
-                                 delete call_opts;
-                                 delete wrapped_request;
-                                 delete wrapped_response;
-                                 call->SendResponse(ToGrpcStatus(s));
-                               });
-      });
-      ENQUEUE_REQUEST(RunGraph, true);
-    }
+  void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      ProtoRunGraphRequest* wrapped_request =
+          new ProtoRunGraphRequest(&call->request);
+      NonOwnedProtoRunGraphResponse* wrapped_response =
+          new NonOwnedProtoRunGraphResponse(&call->response);
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
+                             [call, call_opts, wrapped_request,
+                              wrapped_response](const Status& s) {
+                               if (!s.ok()) {
+                                 VLOG(1) << "Bad response from RunGraph:" << s;
+                               }
+                               call->ClearCancelCallback();
+                               delete call_opts;
+                               delete wrapped_request;
+                               delete wrapped_response;
+                               call->SendResponse(ToGrpcStatus(s));
+                             });
+    });
+    ENQUEUE_REQUEST(RunGraph, true);
+  }
 
-    void RecvTensorHandlerRaw(
-        WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->GrpcRecvTensorAsync(call_opts, &call->request, &call->response,
-                                     [call, call_opts](const Status& s) {
-                                       call->ClearCancelCallback();
-                                       delete call_opts;
-                                       call->SendResponse(ToGrpcStatus(s));
-                                     });
-      });
-      EnqueueRecvTensorRequestRaw();
-    }
+  void RecvTensorHandlerRaw(
+      WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->GrpcRecvTensorAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from RecvTensor:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    EnqueueRecvTensorRequestRaw();
+  }
 
-    void CleanupGraphHandler(
-        WorkerCall<CleanupGraphRequest, CleanupGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->CleanupGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CleanupGraph, false);
-    }
+  void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->RecvBufAsync(call_opts, &call->request, &call->response,
+                            [call, call_opts](const Status& s) {
+                              call->ClearCancelCallback();
+                              delete call_opts;
+                              if (!s.ok()) {
+                                VLOG(1) << "Bad response from RecvBuf:" << s;
+                              }
+                              call->SendResponse(ToGrpcStatus(s));
+                            });
+    });
+    ENQUEUE_REQUEST(RecvBuf, true);
+  }
 
-    void LoggingHandler(WorkerCall<LoggingRequest, LoggingResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->Logging(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(Logging, false);
-    }
+  void CompleteGroupHandler(
+      WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->CompleteGroupAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from CompleteGroup:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(CompleteGroup, true);
+  }
 
-    void TracingHandler(WorkerCall<TracingRequest, TracingResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->Tracing(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(Tracing, false);
-    }
+  void CompleteInstanceHandler(
+      WorkerCall<CompleteInstanceRequest, CompleteInstanceResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->CompleteInstanceAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from CompleteInstance:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(CompleteInstance, false);
+  }
+#undef ENQUEUE_REQUEST
 
-    void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->RecvBufAsync(call_opts, &call->request, &call->response,
-                              [call, call_opts](const Status& s) {
-                                call->ClearCancelCallback();
-                                delete call_opts;
-                                call->SendResponse(ToGrpcStatus(s));
-                              });
-      });
-      ENQUEUE_REQUEST(RecvBuf, true);
+  void EnqueueRecvTensorRequestRaw() {
+    mutex_lock l(shutdown_mu_);
+    if (!is_shutdown_) {
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+           RecvTensorRequest, ::grpc::ByteBuffer>::
+          EnqueueRequestForMethod(
+              worker_service_, cq_.get(),
+              static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+              &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
+              true /* supports cancel*/);
     }
+  }
 
-    void CompleteGroupHandler(
-        WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->CompleteGroupAsync(call_opts, &call->request, &call->response,
-                                    [call, call_opts](const Status& s) {
-                                      call->ClearCancelCallback();
-                                      delete call_opts;
-                                      call->SendResponse(ToGrpcStatus(s));
-                                    });
-      });
-      ENQUEUE_REQUEST(CompleteGroup, true);
-    }
+  GrpcWorker* const worker_ = nullptr;  // Not owned.
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+  std::unique_ptr<Thread> thread_;
+  std::unordered_map<int, int> queue_depth_;
+  grpc::WorkerService::AsyncService* const worker_service_;
 
-    void CompleteInstanceHandler(
-        WorkerCall<CompleteInstanceRequest, CompleteInstanceResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->CompleteInstanceAsync(call_opts, &call->request,
-                                       &call->response,
-                                       [call, call_opts](const Status& s) {
-                                         call->ClearCancelCallback();
-                                         delete call_opts;
-                                         call->SendResponse(ToGrpcStatus(s));
-                                       });
-      });
-      ENQUEUE_REQUEST(CompleteInstance, false);
-    }
+  mutex shutdown_mu_;
+  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
+};
 
-    void GetStepSequenceHandler(
-        WorkerCall<GetStepSequenceRequest, GetStepSequenceResponse>* call) {
-      Schedule([this, call]() {
-        worker_->GetStepSequenceAsync(
-            &call->request, &call->response,
-            [call](const Status& s) { call->SendResponse(ToGrpcStatus(s)); });
-      });
-      ENQUEUE_REQUEST(GetStepSequence, true);
+class GrpcWorkerService : public AsyncServiceInterface {
+ public:
+  GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+                    GrpcWorkerServiceOptions options)
+      : is_shutdown_(false) {
+    builder->RegisterService(&worker_service_);
+    for (int i = 0; i < options.num_worker_threads; i++) {
+      threads_.emplace_back(new GrpcWorkerServiceThread(
+          worker, builder, options.queue_depth, &worker_service_));
     }
-#undef ENQUEUE_REQUEST
+  }
 
-    void EnqueueRecvTensorRequestRaw() {
-      mutex_lock l(shutdown_mu_);
+  void Shutdown() override {
+    bool did_shutdown = false;
+    {
+      mutex_lock l(service_shutdown_mu_);
       if (!is_shutdown_) {
-        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
-             RecvTensorRequest, ::grpc::ByteBuffer>::
-            EnqueueRequestForMethod(
-                worker_service_, cq_.get(),
-                static_cast<int>(GrpcWorkerMethod::kRecvTensor),
-                &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
-                true /* supports cancel*/);
+        LOG(INFO) << "Shutting down GrpcWorkerService.";
+        is_shutdown_ = true;
+        did_shutdown = true;
       }
     }
+    if (did_shutdown) {
+      for (auto& worker_thread : threads_) {
+        worker_thread->Shutdown();
+      }
+    }
+  }
 
-    GrpcWorker* const worker_ = nullptr;  // Not owned.
-    std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
-    std::unique_ptr<Thread> thread_;
-    grpc::WorkerService::AsyncService* const worker_service_;
-
-    mutex shutdown_mu_;
-    bool is_shutdown_ GUARDED_BY(shutdown_mu_);
-    TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
-  };  // GrpcWorkerServiceThread
+  // This method blocks forever handling requests from the completion queue.
+  void HandleRPCsLoop() override {
+    for (auto& worker_thread : threads_) {
+      worker_thread->Start();
+    }
+    for (auto& worker_thread : threads_) {
+      worker_thread->Join();
+    }
+  }
 
+ private:
   grpc::WorkerService::AsyncService worker_service_;
   std::vector<std::unique_ptr<GrpcWorkerServiceThread>> threads_;
 
@@ -640,9 +608,10 @@ std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env,
 }
 
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
-    GrpcWorker* worker, ::grpc::ServerBuilder* builder) {
+    GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+    GrpcWorkerServiceOptions options) {
   return std::unique_ptr<AsyncServiceInterface>(
-      new GrpcWorkerService(worker, builder));
+      new GrpcWorkerService(worker, builder, options));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 996617d385d1c0e397c30eeceb4f737690fb9490..88beb6c21652d22774f1b0d0b59f18af3f76be2e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 
+#include <unordered_map>
 #include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 #include "tensorflow/core/distributed_runtime/worker.h"
 
 namespace grpc {
@@ -57,9 +59,17 @@ class GrpcWorker : public Worker {
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* worker_env,
                                           const ConfigProto& config);
 
+struct GrpcWorkerServiceOptions {
+  // Map from GrpcWorkerMethod id to queue depth.  If set this overrides the
+  // default queue depth for a method.
+  std::unordered_map<int, int> queue_depth;
+  int num_worker_threads = 8;
+};
+
 // Returns an implementation of WorkerService rpc service.
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
-    GrpcWorker* worker, ::grpc::ServerBuilder* builder);
+    GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+    GrpcWorkerServiceOptions opts = GrpcWorkerServiceOptions());
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 7915c3aafd8a97de2830962d2851b247e7d4db4a..d2ae4eeaeec3c50d5101ec46a468d753cb0f3980 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -88,6 +88,7 @@ enum class GrpcWorkerMethod {
   kCompleteInstance,
   kGetStepSequence,
 };
+
 static const int kGrpcNumWorkerMethods =
     static_cast<int>(GrpcWorkerMethod::kGetStepSequence) + 1;
 
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
index c9581fa00f3e946b212717107809182a6a5d00f2..98eb1467700a5e3259a3635f71c5cebae094751f 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
@@ -56,7 +56,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
   void RetireStepId(int64 graph_key, int64 step_id) override;
 
  protected:
-  CollectiveExecutor* Create(int64 step_id) override;
+  virtual CollectiveExecutor* Create(int64 step_id) override;
 
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   const string task_name_;
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index ce97085494175e57b41215779b32234c1c1d5f3c..7da1727e47cee69f9cfbbb5cb9473bc2a76bb220 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -75,7 +75,7 @@ TEST_P(Bfloat16Test, TruncateTest) {
   EXPECT_EQ(GetParam().expected_rounding, float(rounded));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     Bfloat16Test_Instantiation, Bfloat16Test,
     ::testing::Values(
         Bfloat16TestParam{
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index 7fa58347f258acf327e112f4c9cd58c37134ceee..b83d183f14b28672f8da47ae642a386c69253a9b 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -64,7 +65,9 @@ CollInstanceParams& CollInstanceParams::operator=(
     device_names.assign(other.device_names.begin(), other.device_names.end());
     task_names.assign(other.task_names.begin(), other.task_names.end());
     same_num_devices_per_task = other.same_num_devices_per_task;
+    num_devices_per_task = other.num_devices_per_task;
     gpu_ring_order = other.gpu_ring_order;
+    communicator_key = other.communicator_key;
     impl_details.subdiv_offsets.assign(
         other.impl_details.subdiv_offsets.begin(),
         other.impl_details.subdiv_offsets.end());
@@ -76,6 +79,7 @@ CollInstanceParams& CollInstanceParams::operator=(
     impl_details.subdiv_source_rank.assign(
         other.impl_details.subdiv_source_rank.begin(),
         other.impl_details.subdiv_source_rank.end());
+    impl_details.dependencies = other.impl_details.dependencies;
   }
   return *this;
 }
@@ -91,6 +95,13 @@ string CollInstanceParams::ToString() const {
   for (const auto& n : task_names) {
     strings::StrAppend(&v, n, ", ");
   }
+  strings::StrAppend(&v, "} num_devices_per_task={");
+  for (const auto dpt : num_devices_per_task) {
+    strings::StrAppend(&v, dpt.first, ": ", dpt.second, ", ");
+  }
+  strings::StrAppend(&v, "}, collective_name=", impl_details.collective_name,
+                     ", communicator_key=", str_util::CEscape(communicator_key),
+                     ", subdiv_offsets={");
   strings::StrAppend(&v, "}, subdiv_offsets={");
   for (const auto& d : impl_details.subdiv_offsets) {
     strings::StrAppend(&v, d, ",");
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 0321429702af74dfb18ca631b0314c705150ec06..546e3938a828a1007de43c2bdc188eee174c911e 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -70,6 +70,8 @@ struct CollImplDetails {
   std::vector<std::vector<int>> subdiv_permutations;
   std::vector<int> subdiv_offsets;
   std::vector<int> subdiv_source_rank;  // rank of source in each subdiv
+  std::vector<int32>
+      dependencies;  // collective instances on which this node depends
 };
 
 // Data common to all members of a collective instance.
@@ -85,9 +87,13 @@ struct CollInstanceParams {
   std::vector<string> task_names;
   // True if every task has the same number of devices.
   bool same_num_devices_per_task = false;
+  // Task -> number of devices on that task.
+  std::unordered_map<string, int32> num_devices_per_task;
   // If passed in to GPUOptions in ConfigProto, defines a good ring order for
   // GPUs.  Assumes same GPU configuration at each worker.
   string gpu_ring_order = "";
+  // Valid when using a communicator-based collective mechanism, e.g. NCCL.
+  string communicator_key;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -269,6 +275,21 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
 
   virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
 
+  // `WaitForDependencies` and `Launched` are used for fine-grained control of
+  // execution order between collective instances.  These functions are intended
+  // to be called in `Run` function of collective implementations, and may be
+  // used to make part, or whole, of the collective execution ordered with
+  // respect to other collective instances.
+  //
+  // `WaitForDependencies` will block until it is safe to continue the callee's
+  // execution, where safety is defined as: ordered with respect to the
+  // collective instances defined in the callee's `wait_for` attribute.
+  virtual void WaitForDependencies(const CollectiveParams& col_params) {}
+  // `Launched` unblocks the dependent collective instances by recording that
+  // this callee device has completed the critical portion of the collective
+  // execution.
+  virtual void Launched(const CollectiveParams& col_params) {}
+
   // Used to designate an invalid group or instance key.
   static int64 kInvalidId;
 
@@ -347,7 +368,8 @@ class CollectiveImplementationInterface {
 
   // Initializes the portions of `col_params` specific to this
   // implementation.  Called exactly once for every Collective instance during
-  // the CollectiveParams resolution process when the graph is first executed.
+  // the CollectiveParams resolution process when the graph is first executed,
+  // at the end of `CompleteInstanceLocal()`.
   // NOTE(ayushd): This is effectively a static function because it modifies the
   // `col_params` passed in and should not manipulate any data members.  However
   // because it is virtual and needs to be implemented by every derived class we
@@ -360,6 +382,14 @@ class CollectiveImplementationInterface {
   // object.
   virtual Status InitializeCollectiveContext(CollectiveContext* col_ctx) = 0;
 
+  // Initializes instance params at the beginning of `CompleteInstanceLocal()`,
+  // unlike `InitializeCollectiveParams` which is called at the end.  This
+  // function is called before all devices in the instance are discovered, and
+  // may be used to broadcast data via the shared `InstanceRec` object in
+  // collective param resolution to all devices.
+  virtual Status InitializeInstanceBeforeGroupDiscovery(
+      CollectiveParams* col_params) = 0;
+
   // Processes and moves data according to the logic of this Collective
   // implementation.  Relies on appropriate initialization of op-specific
   // CollectiveParams in InitializeCollectiveParams(), as well as appropriate
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index bf2d902af41c690be25a170da6fc22a4902e2d50..83bc95065e00dfb37a6a732f668270743ac3ab51 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
@@ -37,6 +38,11 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
       *output_size = (input_size - effective_filter_size + stride) / stride;
       *padding_before = *padding_after = 0;
       break;
+    case Padding::EXPLICIT:
+      *output_size = (input_size + *padding_before + *padding_after -
+                      effective_filter_size + stride) /
+                     stride;
+      break;
     case Padding::SAME:
       *output_size = (input_size + stride - 1) / stride;
       const int64 padding_needed =
@@ -71,6 +77,11 @@ Status GetWindowedOutputSizeVerbose(int64 input_size, int64 filter_size,
 Status GetWindowedOutputSize(int64 input_size, int64 filter_size, int64 stride,
                              Padding padding_type, int64* output_size,
                              int64* padding_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSize does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeVerbose instead");
+  }
   int64 padding_after_unused;
   return GetWindowedOutputSizeVerbose(input_size, filter_size, stride,
                                       padding_type, output_size, padding_size,
@@ -81,6 +92,11 @@ Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64 dilation_rate, int64 stride,
                                Padding padding_type, int64* output_size,
                                int64* padding_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSizeV2 does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeVerboseV2 instead");
+  }
   int64 padding_after_unused;
   return GetWindowedOutputSizeVerboseV2(input_size, filter_size, dilation_rate,
                                         stride, padding_type, output_size,
@@ -123,8 +139,8 @@ Status GetWindowedOutputSizeFromDimsV2(
     shape_inference::InferenceContext* c,
     shape_inference::DimensionHandle input_size,
     shape_inference::DimensionOrConstant filter_size, int64 dilation_rate,
-    int64 stride, Padding padding_type,
-    shape_inference::DimensionHandle* output_size) {
+    int64 stride, Padding padding_type, int64 padding_before,
+    int64 padding_after, shape_inference::DimensionHandle* output_size) {
   if (stride <= 0) {
     return errors::InvalidArgument("Stride must be > 0, but got ", stride);
   }
@@ -137,6 +153,11 @@ Status GetWindowedOutputSizeFromDimsV2(
   // See also the parallel implementation in GetWindowedOutputSizeVerbose.
   switch (padding_type) {
     case Padding::VALID:
+      padding_before = padding_after = 0;
+      TF_FALLTHROUGH_INTENDED;
+    case Padding::EXPLICIT:
+      TF_RETURN_IF_ERROR(
+          c->Add(input_size, padding_before + padding_after, &input_size));
       if (dilation_rate > 1) {
         DimensionHandle window_size;
         TF_RETURN_IF_ERROR(
@@ -166,13 +187,26 @@ Status GetWindowedOutputSizeFromDims(
     shape_inference::DimensionHandle input_size,
     shape_inference::DimensionOrConstant filter_size, int64 stride,
     Padding padding_type, shape_inference::DimensionHandle* output_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSizeFromDims does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeFromDimsV2 instead");
+  }
   return GetWindowedOutputSizeFromDimsV2(c, input_size, filter_size,
                                          /*dilation_rate=*/1, stride,
-                                         padding_type, output_size);
+                                         padding_type,
+                                         // Give dummy values of -1 to
+                                         // padding_before and padding_after,
+                                         // since explicit padding is not used.
+                                         -1, -1, output_size);
 }
 
 Status UnchangedShape(shape_inference::InferenceContext* c) {
   c->set_output(0, c->input(0));
+  auto* handle_data = c->input_handle_shapes_and_types(0);
+  if (handle_data != nullptr) {
+    c->set_output_handle_shapes_and_types(0, *handle_data);
+  }
   return Status::OK();
 }
 
@@ -371,7 +405,10 @@ Status ShapeFromDimensions(DimensionHandle batch_dim,
   return tensorflow::Status::OK();
 }
 
-Status Conv2DShape(shape_inference::InferenceContext* c) {
+namespace {
+
+Status Conv2DShapeImpl(shape_inference::InferenceContext* c,
+                       bool supports_explicit_padding) {
   string data_format_str, filter_format_str;
   if (!c->GetAttr("data_format", &data_format_str).ok()) {
     data_format_str = "NHWC";
@@ -464,13 +501,35 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   Padding padding;
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
+  std::vector<int64> explicit_paddings;
+  if (supports_explicit_padding) {
+    Status s = c->GetAttr("explicit_paddings", &explicit_paddings);
+    // Use the default value, which is an empty list, if the attribute is not
+    // found. Otherwise return the error to the caller.
+    if (!s.ok() && !errors::IsNotFound(s)) {
+      return s;
+    }
+    TF_RETURN_IF_ERROR(CheckValidPadding(padding, explicit_paddings,
+                                         /*num_dims=*/4, data_format));
+  } else {
+    DCHECK(padding != Padding::EXPLICIT);
+  }
+
   DimensionHandle output_rows, output_cols;
+  int64 pad_rows_before = -1, pad_rows_after = -1;
+  int64 pad_cols_before = -1, pad_cols_after = -1;
+  if (padding == Padding::EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H',
+                             &pad_rows_before, &pad_rows_after);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W',
+                             &pad_cols_before, &pad_cols_after);
+  }
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, input_spatial_dims[0], filter_rows_dim, dilation_rows, stride_rows,
-      padding, &output_rows));
+      padding, pad_rows_before, pad_rows_after, &output_rows));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, input_spatial_dims[1], filter_cols_dim, dilation_cols, stride_cols,
-      padding, &output_cols));
+      padding, pad_cols_before, pad_cols_after, &output_cols));
 
   ShapeHandle output_shape;
   TF_RETURN_IF_ERROR(
@@ -480,6 +539,19 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+}  // namespace
+
+// Shape function for Conv2D-like operations that support explicit padding.
+Status Conv2DShapeWithExplicitPadding(shape_inference::InferenceContext* c) {
+  return Conv2DShapeImpl(c, true);
+}
+
+// Shape function for Conv2D-like operations that do not support explicit
+// padding.
+Status Conv2DShape(shape_inference::InferenceContext* c) {
+  return Conv2DShapeImpl(c, false);
+}
+
 // TODO(mjanusz): Unify all conv/pooling shape functions.
 Status Conv3DShape(shape_inference::InferenceContext* c) {
   ShapeHandle input_shape;
@@ -551,13 +623,13 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
 
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes,
-      padding, &output_planes));
+      padding, -1, -1, &output_planes));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
-      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding,
-      &output_rows));
+      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding, -1,
+      -1, &output_rows));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
-      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding,
-      &output_cols));
+      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding, -1,
+      -1, &output_cols));
 
   ShapeHandle output_shape;
   if (data_format == "NCDHW") {
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 362899b947b1fd479d227ac5421a5f458405f3c6..14b9688bdc5d41e8cb2e92b1f1a8640fb9687d8c 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -38,11 +38,12 @@ namespace tensorflow {
 //
 // Padding (P): the padding we apply to the input tensor along each
 // dimension. This is usually used to make sure that the spatial dimensions
-// do not shrink when we progress with convolutions. Two types of padding are
-// often used:
+// do not shrink when we progress with convolutions. This function supports two
+// types of padding.
 //   SAME: the pad value is computed so that the output will have size H/S.
 //   VALID: no padding is carried out.
-// The padded area is zero-filled.
+// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerbose must be
+// called instead. Note the padded area is zero-filled.
 //
 // The output dimensions for convolution and many other operations, when given
 // all the parameters above, are as follows:
@@ -95,6 +96,9 @@ Status GetWindowedOutputSize(int64 input_size, int64 filter_size, int64 stride,
 //   When the stride is 1, the expression simplifies to
 //     H' = H-K'+1.
 //
+// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerboseV2 must be
+// called instead
+//
 // TODO(b/67112639): Merge V2 versions and the original versions eventually.
 Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64 dilation_rate, int64 stride,
@@ -102,9 +106,12 @@ Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64* padding_size);
 
 // Returns the same output dimensions as in GetWindowedOutputSize, but returns
-// verbose padding dimensions (before/after). Any excess padding
-// (caused by an odd padding size value) is added to the 'padding_after'
-// dimension.
+// verbose padding dimensions (before/after), and EXPLICIT padding is supported.
+// When padding_type is EXPLICIT, *padding_before and *padding_after must
+// already point to initialized integers with the padding amounts. Otherwise,
+// *padding_before and *padding_after are set by this function, and any
+// excess padding (caused by an odd padding size value) is added to the
+// 'padding_after' dimension.
 Status GetWindowedOutputSizeVerbose(int64 input_size, int64 filter_size,
                                     int64 stride, Padding padding_type,
                                     int64* output_size, int64* padding_before,
@@ -122,7 +129,8 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
 // of the output tensor and padding to be applied to the input tensor at the
 // lower end of every dimension. Use for 3D convolutions, where the input data
 // is padded with zeros, as well as for 3D avg/max pooling, where the input data
-// is padded with invalid values that are not considered for pooling.
+// is padded with invalid values that are not considered for pooling. EXPLICIT
+// padding is not supported.
 Status Get3dOutputSize(const std::array<int64, 3>& input,
                        const std::array<int64, 3>& window,
                        const std::array<int64, 3>& strides,
@@ -140,21 +148,23 @@ Status Get3dOutputSizeV2(const std::array<int64, 3>& input,
 
 namespace shape_inference {
 
-// Like GetWindowedOutputSize, but deals with DimensionHandles.
+// Like GetWindowedOutputSize, but deals with DimensionHandles. Does not support
+// EXPLICIT padding.
 Status GetWindowedOutputSizeFromDims(InferenceContext* c,
                                      DimensionHandle input_size,
                                      DimensionOrConstant filter_size,
                                      int64 stride, Padding padding_type,
                                      DimensionHandle* output_size);
 
-// The V2 version computes the same outputs with arbitrary dilation_rate. For
-// detailed equations, refer to the comments for GetWindowedOutputSizeV2().
-Status GetWindowedOutputSizeFromDimsV2(InferenceContext* c,
-                                       DimensionHandle input_size,
-                                       DimensionOrConstant filter_size,
-                                       int64 dilation_rate, int64 stride,
-                                       Padding padding_type,
-                                       DimensionHandle* output_size);
+// The V2 version computes the same outputs with arbitrary dilation_rate, and
+// supports EXPLICIT padding. For detailed equations, refer to the comments
+// for GetWindowedOutputSizeV2(). The 'padding_before' and 'padding_after'
+// parameters are only used if padding_type == EXPLICIT.
+Status GetWindowedOutputSizeFromDimsV2(
+    InferenceContext* c, DimensionHandle input_size,
+    DimensionOrConstant filter_size, int64 dilation_rate, int64 stride,
+    Padding padding_type, int64 padding_before, int64 padding_after,
+    DimensionHandle* output_size);
 
 // Transfers shape of input(0) to output(0).
 Status UnchangedShape(shape_inference::InferenceContext* c);
@@ -222,7 +232,11 @@ Status BiasAddShape(shape_inference::InferenceContext* c);
 // Shape function for BiasAddGrad-like operations.
 Status BiasAddGradShape(shape_inference::InferenceContext* c);
 
-// Shape function for Conv2D-like operations.
+// Shape function for Conv2D-like operations that support explicit padding.
+Status Conv2DShapeWithExplicitPadding(shape_inference::InferenceContext* c);
+
+// Shape function for Conv2D-like operations that do not support explicit
+// padding.
 Status Conv2DShape(shape_inference::InferenceContext* c);
 
 // Shape function for Conv3D-like operations.
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 7c395679d304ffab1dfeff6804eede0d09b63734..b94925c04ee2794033b072a1bc62cf841081a769 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -408,12 +408,14 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) {
 TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& strides, const string& padding,
-                      const string& data_format, const string& filter_format) {
+                      const string& data_format, const string& filter_format,
+                      const std::vector<int32>& explicit_paddings = {}) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
+                    .Attr("explicit_paddings", explicit_paddings)
                     .Attr("data_format", data_format)
                     .Attr("filter_format", filter_format)
                     .Finalize(&op.node_def));
@@ -536,19 +538,73 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,?,4,1];[?,?,?,?]", "[d0_0,?,2,d1_3]");
   INFER_OK(op, "[1,4,?,1];[?,?,?,?]", "[d0_0,2,?,d1_3]");
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,2,2,d1_3]");
+
+  // Some tests for "EXPLICIT" padding
+
+  // 4x4 input, 1x1 filter, 1x1 stride, [0, 2, 1, 4] padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 2, 1, 4, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,6,9,d1_3]");
+
+  // 3x3 input, 2x2 filter, 1x1 stride, [1, 0, 1, 2] padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 1, 0, 1, 2, 0, 0});
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,3,5,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 stride, [3, 2, 1, 0] padding
+  set_op({{1, 2, 2, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 3, 2, 1, 0, 0, 0});
+  INFER_OK(op, "[1,4,4,2];[2,2,2,3]", "[d0_0,4,2,d1_3]");
+
+  // 2x2 input, 2x1 filter, 1x2 stride, [1, 1, 2, 2] padding
+  set_op({{1, 1, 2, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 1, 1, 2, 2, 0, 0});
+  INFER_OK(op, "[1,2,2,1];[2,1,1,1]", "[d0_0,3,3,d1_3]");
+
+  // Unknown dims in the critical fields lead to partial inference.
+  INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,5,4,d1_3]");
+  INFER_OK(op, "[1,?,4,1];[2,1,1,1]", "[d0_0,?,4,d1_3]");
+  INFER_OK(op, "[1,4,?,1];[2,1,1,1]", "[d0_0,5,?,d1_3]");
+  INFER_OK(op, "[1,4,4,?];[2,1,1,1]", "[d0_0,5,4,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[?,1,1,1]", "[d0_0,?,4,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[2,?,1,1]", "[d0_0,5,?,d1_3]");
+
+  // Explicit padding errors
+  // Negative padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, -1, 0, 0, 0, 0});
+  INFER_ERROR("must be nonnegative", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Too little padding (7 explicit paddings instead of 8)
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must contain 8 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Too much padding (9 explicit paddings instead of 8)
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO",
+         {0, 0, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must contain 8 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding in batch dimension
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {1, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("batch or depth dimensions", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding in depth dimension
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 1, 0});
+  INFER_ERROR("batch or depth dimensions", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding explicit_paddings when padding is not EXPLICIT
+  set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must be empty", op, "[1,2,2,1];[1,1,1,1]");
 }
 
 TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& dilations,
                       const std::vector<int32>& strides, const string& padding,
-                      const string& data_format) {
+                      const string& data_format,
+                      const std::vector<int32>& explicit_paddings = {}) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("dilations", dilations)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
+                    .Attr("explicit_paddings", explicit_paddings)
                     .Attr("data_format", data_format)
                     .Finalize(&op.node_def));
   };
@@ -628,6 +684,28 @@ TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
   // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride
   set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // Some tests for "EXPLICIT" padding
+
+  // 4x4 input, 1x1 filter, 2x1 dilations, 1x1 stride, [0, 2, 1, 4] padding
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 0, 2, 1, 4, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,6,9,d1_3]");
+
+  // 3x3 input, 2x2 filter, 2x2 dilations, 1x1 stride, [1, 0, 1, 2] padding
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 1, 0, 1, 2, 0, 0});
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,2,4,d1_3]");
+
+  // 4x4 input, 2x2 filter, 1x2 dilations, 2x2 stride, [3, 2, 1, 0] padding
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 3, 2, 1, 0, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride, [1, 1, 2, 2] padding
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 1, 1, 2, 2, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,6,d1_3]");
 }
 
 TEST(CommonShapeFnsTest, Conv3DShapeTest) {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 94af4ee580b1e7dc1e760ed7d62575e3f8ddb817..b7adfd0c947b60ff9295c867f4afdf756208b126 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -682,8 +682,9 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
-  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
-                        fdef.attr().at("experimental_ints_on_device").b();
+  bool ints_on_device =
+      fdef.attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 &&
+      fdef.attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b();
 
   FunctionInstantiationHelper helper(get_function, result);
   Status s;
@@ -868,7 +869,8 @@ string FunctionLibraryRuntime::ExecutorType(const InstantiateOptions& options,
 string Canonicalize(const string& funcname, AttrSlice attrs,
                     const FunctionLibraryRuntime::InstantiateOptions& options) {
   std::vector<string> entries;
-  entries.reserve(options.target.empty() ? attrs.size() : (attrs.size() + 1));
+  entries.reserve(attrs.size() + static_cast<int>(options.target.empty()) +
+                  options.input_devices.size());
   for (auto p : attrs) {
     if (p.first != kExecutorAttr) {
       entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
@@ -878,6 +880,14 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
     entries.push_back(
         strings::StrCat("_target", "=", str_util::CEscape(options.target)));
   }
+  for (int i = 0; i < options.input_devices.size(); ++i) {
+    entries.push_back(strings::StrCat(
+        "_input_dev", i, "=", str_util::CEscape(options.input_devices[i])));
+  }
+  for (int i = 0; i < options.output_devices.size(); ++i) {
+    entries.push_back(strings::StrCat(
+        "_output_dev", i, "=", str_util::CEscape(options.output_devices[i])));
+  }
   if (options.overlay_lib) {
     entries.push_back(strings::StrCat(
         "_overlay_lib", "=", reinterpret_cast<uintptr_t>(options.overlay_lib)));
@@ -1491,6 +1501,9 @@ NodeDef FunctionDefHelper::Node::ToNodeDef() const {
   for (const string& d : this->dep) {
     n.add_input(strings::StrCat("^", d));
   }
+  if (!this->device.empty()) {
+    n.set_device(this->device);
+  }
   return n;
 }
 
@@ -1533,6 +1546,7 @@ FunctionDef FunctionDefHelper::Create(
       fdef.mutable_signature()->set_is_stateful(true);
     }
   }
+
   return fdef;
 }
 
@@ -1640,4 +1654,4 @@ Status GetOpGradientCreator(const string& op, Creator* creator) {
 
 }  // end namespace gradient
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 9cf4b0f4cdf1d4c3604eebcf33bb51274578d73c..79755f599cfc80fa3ccdbadc83cef65667d07250 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/selective_registration.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -35,6 +36,8 @@ namespace tensorflow {
 
 class CancellationManager;
 class CollectiveExecutor;
+class DeviceSet;
+class Graph;
 class GraphDef;
 class OpKernel;
 class ProcessFunctionLibraryRuntime;
@@ -114,6 +117,7 @@ class FunctionDefHelper {
     std::vector<string> arg;
     std::vector<std::pair<string, AttrValueWrapper>> attr;
     std::vector<string> dep;
+    string device;
 
     NodeDef ToNodeDef() const;
   };
@@ -382,6 +386,8 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   static constexpr const char* const kDeviceArgOp = "_DeviceArg";
   static constexpr const char* const kRetOp = "_Retval";
   static constexpr const char* const kDeviceRetOp = "_DeviceRetval";
+  static constexpr const char* const kIntsOnDeviceAttr =
+      "experimental_ints_on_device";
 
   static constexpr const char* const kGradientOp = "SymbolicGradient";
   static constexpr const char* const kFuncAttr = "f";
@@ -489,6 +495,27 @@ class FunctionLibraryRuntime {
     // instantiated on the local device.
     string target;
 
+    // Should the function be instantiated as a multi-device function?
+    bool is_multi_device_function = false;
+
+    // For multi-device functions, a vector of canonical device names for
+    // function's inputs. The device of resource inputs must be the device
+    // backing the resource, not the CPU device backing the resource handle.
+    // Must have the same length as number of inputs to the function.
+    std::vector<string> input_devices;
+
+    // For multi-device functions, a vector of canonical device names for
+    // function's outputs. The device of resource outputs should be the CPU
+    // device, not the device backing the resource.
+    // If specified, must have the same length as the number of function
+    // outputs.
+    // If not specified, output devices are picked automatically. If operations
+    // producing the output tensors have explicit device specification, they
+    // will be respected. These device specifications must identify a unique
+    // device, i.e.  a general specification like "job:foo" matching multiple
+    // devices will result in an error.
+    std::vector<string> output_devices;
+
     // This interface is EXPERIMENTAL and subject to change.
     //
     // If non-null, the runtime will use `overlay_lib` to resolve
@@ -523,6 +550,17 @@ class FunctionLibraryRuntime {
     // instantiation time, rather than on the first run. This can be used to
     // surface errors earlier.
     bool create_kernels_eagerly = false;
+
+    // If provided, this optimization function will be invoked before
+    // the placer for multi-device functions.
+    std::function<Status(std::vector<string> /*ret_node_names*/,
+                         FunctionLibraryDefinition*, const DeviceSet&,
+                         Device* /*cpu_device*/, std::unique_ptr<Graph>*)>
+        optimize_graph_fn;
+
+    // If set, partitioned functions will be added to `graph_collector`.
+    // `graph_collector` must be alive during the call to Instantiate.
+    GraphCollector* graph_collector = nullptr;
   };
   typedef uint64 Handle;
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 75d45fa2c84ebc340dfb79b76f7b406d7a099c1f..6a828e9afaaec536d4d5ef51d50dec88fdd6d391 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -505,7 +505,8 @@ TEST(TFunc, IntsOnDeviceArgNotSet) {
 
 TEST(TFunc, IntsOnDeviceArgSet) {
   auto fdef = test::function::XTimesTwoInt32();
-  (*fdef.mutable_attr())["experimental_ints_on_device"].set_b(true);
+  (*fdef.mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr].set_b(
+      true);
   InstantiationResult result;
   TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   EXPECT_EQ(5, result.nodes.size());
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 0445c242e95f490a10e9d54f986dd6b281fb6e0a..0bc07d7f91cf63e93b1188b163d00767fa73a3d8 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -135,6 +135,114 @@ FunctionDef XTimesTwo() {
       });
 }
 
+FunctionDef TwoDeviceMult() {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  const Tensor kThree = test::AsScalar<int64>(3);
+  return FDH::Create(
+      // Name
+      "TwoDeviceMult",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y_cpu: T", "y_gpu: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"num_3"}, "Const", {}, {{"value", kThree}, {"dtype", DT_INT64}}},
+          {{"factor_2"},
+           "Cast",
+           {"num_2:output:0"},
+           {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"factor_3"},
+           "Cast",
+           {"num_3:output:0"},
+           {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y_cpu"},
+           "Mul",
+           {"x", "factor_2:y:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:CPU:0"},
+          {{"y_gpu"},
+           "Mul",
+           {"x", "factor_3:y:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:GPU:0"},
+      },
+      {{"y_cpu", "y_cpu:z:0"}, {"y_gpu", "y_gpu:z:0"}});
+}
+
+FunctionDef TwoDeviceInputOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  const Tensor kThree = test::AsScalar<float>(3);
+  return FDH::Create(
+      // Name
+      "TwoDeviceInputOutput",
+      // Args
+      {"x1: T", "x2: T"},
+      // Return values
+      {"y_cpu: T", "y_gpu: T"},
+      // Attr def
+      {"T: {float}"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"num_3"}, "Const", {}, {{"value", kThree}, {"dtype", DT_FLOAT}}},
+          {{"y_cpu"},
+           "Mul",
+           {"x1", "num_2:output:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:CPU:0"},
+          {{"y_gpu"},
+           "Mul",
+           {"x2", "num_3:output:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:GPU:0"},
+      },
+      {{"y_cpu", "y_cpu:z:0"}, {"y_gpu", "y_gpu:z:0"}});
+}
+
+FunctionDef FuncWithListInput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "FuncWithListInput",
+      // Args
+      {"x1: N * T"},
+      // Return values
+      {},
+      // Attr def
+      {"T: {float}", "N: int >= 1"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+      },
+      {});
+}
+
+FunctionDef FuncWithListOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "FuncWithListOutput",
+      // Args
+      {},
+      // Return values
+      {"y: N * T"},
+      // Attr def
+      {"T: {float}", "N: int >= 1"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+      },
+      {{"y", "num_2:output:0"}});
+}
+
 FunctionDef XAddX() {
   return FDH::Define(
       // Name
@@ -243,6 +351,58 @@ FunctionDef Swap() {
        {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
 }
 
+FunctionDef EmptyBodySwap() {
+  return FDH::Create(
+      // Name
+      "EmptyBodySwap",
+      // Args
+      {"i0: T", "i1: T"},
+      // Return values
+      {"o0: T", "o1: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {},
+      // Output mapping
+      {{"o0", "i1"}, {"o1", "i0"}});
+}
+
+FunctionDef ResourceOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "ResourceOutput",
+      // Args
+      {"x: float", "y: resource"},
+      // Return values
+      {"y_out: resource", "two_x: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"mul"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}, {}},
+      },
+      {{"y_out", "y"}, {"two_x", "mul:z:0"}});
+}
+
+FunctionDef ReadResourceVariable() {
+  return FDH::Create(
+      // Name
+      "ReadResourceVariable",
+      // Args
+      {"x: resource"},
+      // Return values
+      {"y: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"read"}, "ReadVariableOp", {"x"}, {{"dtype", DT_FLOAT}}, {}},
+      },
+      {{"y", "read:value:0"}});
+}
+
 FunctionDef InvalidControlFlow() {
   return FDH::Create(
       // Name
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index a01743423bbfd5c684e82768ee347f1d0734fc04..28532b29d4509105c4b6b7c203e9e81c5780a58f 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -63,6 +63,21 @@ GraphDef GDef(gtl::ArraySlice<NodeDef> nodes,
 // x:T -> x * 2.
 FunctionDef XTimesTwo();
 
+// x:T -> cpu(x * 2) + cpu(x * 3).
+FunctionDef TwoDeviceTimesFive();
+
+// x:T -> cpu(x * 2), gpu(x * 3).
+FunctionDef TwoDeviceMult();
+
+// cpu(x):T, gpu(y):T -> cpu(x * 2), gpu(y * 3).
+FunctionDef TwoDeviceInputOutput();
+
+// Function taking a list of Tensors as input.
+FunctionDef FuncWithListInput();
+
+// Function returning a list of Tensors as output.
+FunctionDef FuncWithListOutput();
+
 // x:T -> x + x.
 FunctionDef XAddX();
 
@@ -90,6 +105,15 @@ FunctionDef RandomUniform();
 // x:T, y:T -> y:T, x:T
 FunctionDef Swap();
 
+// x:T, y:T -> y:T, x:T, the body has no nodes.
+FunctionDef EmptyBodySwap();
+
+// x:float, y:resource -> y:resource, 2*x:float.
+FunctionDef ResourceOutput();
+
+// x:resource -> y:float.
+FunctionDef ReadResourceVariable();
+
 // Contains malformed control flow which can't be run by the executor.
 FunctionDef InvalidControlFlow();
 
diff --git a/tensorflow/core/framework/iterator.proto b/tensorflow/core/framework/iterator.proto
deleted file mode 100644
index f015342e13313ea69838030ae4c1ccda6c1628f7..0000000000000000000000000000000000000000
--- a/tensorflow/core/framework/iterator.proto
+++ /dev/null
@@ -1,18 +0,0 @@
-syntax = "proto3";
-
-package tensorflow;
-option cc_enable_arenas = true;
-option java_outer_classname = "IteratorProtos";
-option java_multiple_files = true;
-option java_package = "org.tensorflow.util";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
-
-// Protocol buffer representing the metadata for an iterator's state stored
-// as a Variant tensor.
-message IteratorStateMetadata {
-  // A user-specified version string.
-  string version = 1;
-
-  // Keys for tensors in the VariantTensorDataProto.
-  repeated string keys = 2;
-}
diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h
index d33945fd1b0c44264855ed518714eb35faf4b29f..7e5dbe5632becb40fd75763eb4be9dfdc09ec82b 100644
--- a/tensorflow/core/framework/lookup_interface.h
+++ b/tensorflow/core/framework/lookup_interface.h
@@ -131,7 +131,7 @@ class LookupInterface : public ResourceBase {
   // - the default_value tensor shape matches the table's value shape.
   Status CheckFindArguments(const Tensor& keys, const Tensor& default_value);
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("A lookup table of size: ", size());
   }
 
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 6dff6fe654a51d3c274f7e2c7ca34961eb4f3c2a..8caea351be4442d348f4405bf4385a1349fc197b 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -62,7 +62,7 @@ void MemoryTypesHelper(const NameRangeMap& name_map,
 
 bool IsFunctionCallOp(const string& op_type) {
   return op_type == "SymbolicGradient" || op_type == "PartitionedCall" ||
-         op_type == "StatefulPartitionedCall";
+         op_type == "StatefulPartitionedCall" || op_type == "While";
 }
 
 }  // namespace
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index b7c6d8091092ac64af8de7ab5daf3e60797970e8..a1c87a3f4210b7fb95597bed03a4d922a81fbfdf 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -29,6 +29,32 @@ std::shared_ptr<Parameter> MakeParameter(const string& name,
 
 namespace {
 
+// Given the average time between output events (`output_time`), the average
+// time between input events (`input_time`) and the buffer size, the method
+// computes the expected time an input event will have to wait.
+//
+// The wait time is approximated as the product of the probability the buffer
+// will be empty and the time it takes to produce an element into the buffer.
+//
+// The formula used for computing the probability is derived by modeling the
+// problem as an M/M/1/K queue
+// (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue).
+int64 ComputeWaitTime(int64 output_time, int64 input_time, int64 buffer_size) {
+  if (output_time == 0 || input_time == 0) {
+    return output_time;
+  }
+  if (input_time == output_time) {
+    const double p_buffer_empty = 1.0L / static_cast<double>(buffer_size + 1);
+    return p_buffer_empty * output_time;
+  }
+  const double alpha = 1.0L / static_cast<double>(input_time);
+  const double beta = 1.0L / static_cast<double>(output_time);
+  const double p_buffer_empty =
+      (1.0L - beta / alpha) /
+      (1.0L - std::pow((beta / alpha), static_cast<double>(buffer_size + 1)));
+  return p_buffer_empty * output_time;
+}
+
 // The first input of InterleaveMany corresponds to the input dataset whose
 // elements are used to create the (derived) input datasets whose elements are
 // interleaved as output.
@@ -119,8 +145,8 @@ class AsyncInterleaveMany : public Node {
         static_cast<double>(OutputTimeForInputs(input_times) -
                             inputs_.front()->OutputTime(input_times)) /
         static_cast<double>(inputs_.size() - 1) / parallelism;
-    return std::max(0LL,
-                    NanosPerElementLocked() + output_time - old_input_time);
+    return ComputeWaitTime(NanosPerElementLocked() + output_time,
+                           old_input_time, parallelism);
   }
 
   int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
@@ -202,7 +228,7 @@ class AsyncKnownRatio : public Node {
     if (ratio_ == 0.0) {
       int64 output_time =
           static_cast<double>(NanosPerElementLocked()) / parallelism;
-      return std::max(0LL, output_time - input_times->back());
+      return ComputeWaitTime(output_time, input_times->back(), parallelism);
     }
     int64 old_input_time = input_times->back();
     int64 new_input_time = static_cast<int64>(
@@ -213,7 +239,7 @@ class AsyncKnownRatio : public Node {
     int64 output_time = static_cast<int64>(
         static_cast<double>(NanosPerElementLocked()) / parallelism +
         ratio_ * OutputTimeForInputs(input_times));
-    return std::max(0LL, output_time - old_input_time);
+    return ComputeWaitTime(output_time, old_input_time, parallelism);
   }
 
   int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
@@ -354,7 +380,12 @@ std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
     output_ = node;
   }
   if (output) {
+    VLOG(3) << "Adding " << node->name() << "(id:" << node->id()
+            << ") as input for " << output->name() << "(id:" << output->id()
+            << ")";
     output->add_input(node);
+  } else {
+    VLOG(3) << "Adding " << node->name() << "(id:" << node->id() << ")";
   }
   collect_resource_usage_ =
       collect_resource_usage_ || node->has_tunable_parameters();
@@ -467,8 +498,12 @@ void Model::RecordStop(const string& name, bool start_output) {
 void Model::RemoveNode(const string& name) {
   mutex_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node && (*node)->output()) {
-    (*node)->output()->remove_input(*node);
+  if (node) {
+    if ((*node)->output()) {
+      (*node)->output()->remove_input(*node);
+    }
+    VLOG(3) << "Removing " << (*node)->name() << "(id:" << (*node)->id() << ")";
+    remove_node_hook_(*node);
   }
   lookup_table_.erase(name);
 }
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index c3a694227c229884aef60374e494ade6ca539383..7fac1753a6332e1db4d01c15e68242ac15b388ca 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -310,7 +310,7 @@ class Node {
   std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
   std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
 
-  // The reference to the output node is not owned so that that deletion of a
+  // The reference to the output node is not owned so that deletion of a
   // node results in recursive deletion of the subtree rooted in the node.
   Node* const output_;
 };
@@ -359,7 +359,19 @@ std::shared_ptr<Node> MakeUnknownNode(Node::Args args);
 // implementation of `DatasetBase` and `DatasetBaseIterator` respectively.
 class Model {
  public:
-  Model() : collect_resource_usage_(false) {}
+  using NodeHook = std::function<void(std::shared_ptr<Node>)>;
+
+  // Creates a new model.
+  //
+  // The `remove_node_hook` argument can be used to specify functionality that
+  // should be invoked before a node is removed from the model. The hook can be
+  // used for dependency injection -- to allow the model to invoke functionality
+  // from modules that it could not depend on statically.
+  Model(NodeHook remove_node_hook)
+      : collect_resource_usage_(false),
+        remove_node_hook_(std::move(remove_node_hook)) {
+    DCHECK(remove_node_hook_ != nullptr);
+  }
 
   // Indicates whether to collect resource usage.
   bool collect_resource_usage() const { return collect_resource_usage_; }
@@ -414,6 +426,9 @@ class Model {
   // tunable parameter (because the information is used for for tuning the value
   // of the parameter) and never stops.
   std::atomic<bool> collect_resource_usage_;
+
+  // A hook invoked immediately before a node is removed from the model.
+  const NodeHook remove_node_hook_;
 };
 
 }  // namespace model
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 90bd570f90cdab2182f3d46e009b2cd972667ef9..1d7f407e180d37a61bfb3191dbd04f9bb1ca60d5 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -57,33 +57,36 @@ TEST_P(AsyncInterleaveManyTest, Model) {
   });
   std::vector<int64> input_times(1, input_time);
   async_interleave_many->add_processing_time(100);
-  EXPECT_EQ(100, async_interleave_many->processing_time());
-  EXPECT_EQ(0, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(0, async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->processing_time(), 100);
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_interleave_many->OutputTime(&input_times), 0);
   async_interleave_many->record_element();
-  EXPECT_EQ(1, async_interleave_many->num_elements());
-  EXPECT_EQ(100, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 100 - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->num_elements(), 1);
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   source1->add_processing_time(200);
   source2->add_processing_time(300);
-  EXPECT_EQ(100, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 100 - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(100 + 250, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 100 + 250 / parallelism - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100 + 250);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times),
+            100 + 250 / parallelism);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   async_interleave_many->record_element();
-  EXPECT_EQ(50 + 250, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 50 + 250 / parallelism - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 50 + 250);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times),
+            50 + 250 / parallelism);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
 }
 
-INSTANTIATE_TEST_CASE_P(Test, AsyncInterleaveManyTest,
-                        ::testing::Combine(::testing::Values(1, 2),
-                                           ::testing::Values(0, 50, 100, 200)));
+INSTANTIATE_TEST_SUITE_P(Test, AsyncInterleaveManyTest,
+                         ::testing::Combine(::testing::Values(1, 2),
+                                            ::testing::Values(0, 50, 100,
+                                                              200)));
 
 class AsyncKnownRatioTest
     : public ::testing::TestWithParam<std::tuple<int64, int64, int64>> {};
@@ -106,53 +109,58 @@ TEST_P(AsyncKnownRatioTest, Model) {
   async_known_many->add_input(source2);
   std::vector<int64> input_times(1, input_time);
   source1->add_processing_time(100);
-  EXPECT_EQ(0, async_known_many->ProcessingTime());
-  EXPECT_EQ(0, async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
   source2->add_processing_time(200);
-  EXPECT_EQ(0, async_known_many->ProcessingTime());
-  EXPECT_EQ(0, async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * 100, async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * 100 - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(), num_inputs_per_output * 100);
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * 100);
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (100 + 200),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (100 + 200) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (100 + 200));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (100 + 200));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 200),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 200) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 200));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 200));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->add_processing_time(128);
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) +
-                              128 / parallelism - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 128);
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 128 / parallelism);
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) +
-                              64 / parallelism - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 64);
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 64 / parallelism);
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
 }
 
-INSTANTIATE_TEST_CASE_P(Test, AsyncKnownRatioTest,
-                        ::testing::Combine(::testing::Values(1, 2, 4, 8),
-                                           ::testing::Values(0, 50, 100, 200),
-                                           ::testing::Values(0, 1, 2, 4)));
+INSTANTIATE_TEST_SUITE_P(Test, AsyncKnownRatioTest,
+                         ::testing::Combine(::testing::Values(1, 2, 4, 8),
+                                            ::testing::Values(0, 50, 100, 200),
+                                            ::testing::Values(0, 1, 2, 4)));
 
 TEST(InterleaveManyTest, Model) {
   std::shared_ptr<Node> interleave_many =
@@ -168,24 +176,24 @@ TEST(InterleaveManyTest, Model) {
   interleave_many->add_input(source2);
   std::vector<int64> input_times(1, 0);
   interleave_many->add_processing_time(100);
-  EXPECT_EQ(100, interleave_many->processing_time());
-  EXPECT_EQ(0, interleave_many->ProcessingTime());
-  EXPECT_EQ(0, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->processing_time(), 100);
+  EXPECT_EQ(interleave_many->ProcessingTime(), 0);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 0);
   interleave_many->record_element();
-  EXPECT_EQ(1, interleave_many->num_elements());
-  EXPECT_EQ(100, interleave_many->ProcessingTime());
-  EXPECT_EQ(100, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->num_elements(), 1);
+  EXPECT_EQ(interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
   source1->add_processing_time(200);
   source2->add_processing_time(300);
-  EXPECT_EQ(100, interleave_many->ProcessingTime());
-  EXPECT_EQ(100, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(350, interleave_many->ProcessingTime());
-  EXPECT_EQ(350, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->ProcessingTime(), 350);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 350);
   interleave_many->record_element();
-  EXPECT_EQ(300, interleave_many->ProcessingTime());
-  EXPECT_EQ(300, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->ProcessingTime(), 300);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 300);
 }
 
 class KnownRatioTest : public ::testing::TestWithParam<int64> {};
@@ -202,59 +210,59 @@ TEST_P(KnownRatioTest, Model) {
   known_many->add_input(source2);
   std::vector<int64> input_times(1, 0);
   source1->add_processing_time(100);
-  EXPECT_EQ(0, known_many->ProcessingTime());
-  EXPECT_EQ(0, known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), 0);
+  EXPECT_EQ(known_many->OutputTime(&input_times), 0);
   source2->add_processing_time(200);
-  EXPECT_EQ(0, known_many->ProcessingTime());
-  EXPECT_EQ(0, known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), 0);
+  EXPECT_EQ(known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * 100, known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * 100, known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * 100);
+  EXPECT_EQ(known_many->OutputTime(&input_times), num_inputs_per_output * 100);
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (100 + 200), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (100 + 200),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (100 + 200));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (100 + 200));
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 200), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 200),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 200));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 200));
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
   known_many->add_processing_time(128);
-  EXPECT_EQ(num_inputs_per_output * (50 + 100), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
   known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
-            known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 128);
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 128);
   known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
-            known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 64);
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 64);
 }
 
-INSTANTIATE_TEST_CASE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4));
+INSTANTIATE_TEST_SUITE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4));
 
 TEST(SourceTest, Model) {
   std::shared_ptr<Node> source = model::MakeSourceNode({0, "source", nullptr});
   std::vector<int64> input_times(1, 0);
   source->add_processing_time(100);
-  EXPECT_EQ(100, source->processing_time());
-  EXPECT_EQ(0, source->ProcessingTime());
-  EXPECT_EQ(0, source->OutputTime(&input_times));
+  EXPECT_EQ(source->processing_time(), 100);
+  EXPECT_EQ(source->ProcessingTime(), 0);
+  EXPECT_EQ(source->OutputTime(&input_times), 0);
   source->record_element();
-  EXPECT_EQ(1, source->num_elements());
-  EXPECT_EQ(100, source->ProcessingTime());
-  EXPECT_EQ(100, source->OutputTime(&input_times));
+  EXPECT_EQ(source->num_elements(), 1);
+  EXPECT_EQ(source->ProcessingTime(), 100);
+  EXPECT_EQ(source->OutputTime(&input_times), 100);
   source->record_element();
-  EXPECT_EQ(2, source->num_elements());
-  EXPECT_EQ(50, source->ProcessingTime());
-  EXPECT_EQ(50, source->OutputTime(&input_times));
+  EXPECT_EQ(source->num_elements(), 2);
+  EXPECT_EQ(source->ProcessingTime(), 50);
+  EXPECT_EQ(source->OutputTime(&input_times), 50);
 }
 
 TEST(UnknownRatioTest, Model) {
@@ -268,24 +276,24 @@ TEST(UnknownRatioTest, Model) {
   unknown_many->add_input(source2);
   std::vector<int64> input_times(1, 0);
   unknown_many->add_processing_time(100);
-  EXPECT_EQ(100, unknown_many->processing_time());
-  EXPECT_EQ(0, unknown_many->ProcessingTime());
-  EXPECT_EQ(0, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->processing_time(), 100);
+  EXPECT_EQ(unknown_many->ProcessingTime(), 0);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 0);
   unknown_many->record_element();
-  EXPECT_EQ(1, unknown_many->num_elements());
-  EXPECT_EQ(100, unknown_many->ProcessingTime());
-  EXPECT_EQ(100, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->num_elements(), 1);
+  EXPECT_EQ(unknown_many->ProcessingTime(), 100);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
   source1->add_processing_time(100);
   source2->add_processing_time(200);
-  EXPECT_EQ(100, unknown_many->ProcessingTime());
-  EXPECT_EQ(100, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->ProcessingTime(), 100);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(400, unknown_many->ProcessingTime());
-  EXPECT_EQ(400, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->ProcessingTime(), 400);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 400);
   unknown_many->record_element();
-  EXPECT_EQ(200, unknown_many->ProcessingTime());
-  EXPECT_EQ(200, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->ProcessingTime(), 200);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 200);
 }
 
 TEST(UnknownTest, Model) {
@@ -299,35 +307,35 @@ TEST(UnknownTest, Model) {
   unknown->add_input(source2);
   std::vector<int64> input_times(1, 0);
   source1->add_processing_time(100);
-  EXPECT_EQ(0, unknown->ProcessingTime());
-  EXPECT_EQ(0, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 0);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 0);
   source2->add_processing_time(100);
-  EXPECT_EQ(0, unknown->ProcessingTime());
-  EXPECT_EQ(0, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 0);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   source2->record_element();
-  EXPECT_EQ(200, unknown->ProcessingTime());
-  EXPECT_EQ(200, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 200);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 200);
   source1->record_element();
-  EXPECT_EQ(150, unknown->ProcessingTime());
-  EXPECT_EQ(150, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 150);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 150);
   source2->record_element();
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   // Unknown node processing time should not affect its ProcessingTime() or
   // OutputTime().
   unknown->add_processing_time(100);
-  EXPECT_EQ(100, unknown->processing_time());
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->processing_time(), 100);
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   // Unknown node number of elements should not affect its ProcessingTime() or
   // OutputTime().
   unknown->record_element();
-  EXPECT_EQ(1, unknown->num_elements());
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->num_elements(), 1);
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
 }
 
 class TestNode : public model::Node {
@@ -355,35 +363,35 @@ class TestNode : public model::Node {
 TEST(SetterGetterTest, Node) {
   std::shared_ptr<TestNode> node =
       std::make_shared<TestNode>(model::Node::Args{-1, "TestNode", nullptr});
-  EXPECT_EQ(-1, node->id());
-  EXPECT_EQ("TestNode", node->name());
-  EXPECT_EQ(nullptr, node->output());
+  EXPECT_EQ(node->id(), -1);
+  EXPECT_EQ(node->name(), "TestNode");
+  EXPECT_EQ(node->output(), nullptr);
 
-  EXPECT_EQ(0, node->buffered_bytes());
+  EXPECT_EQ(node->buffered_bytes(), 0);
   node->add_buffered_bytes(42);
-  EXPECT_EQ(42, node->buffered_bytes());
+  EXPECT_EQ(node->buffered_bytes(), 42);
 
-  EXPECT_EQ(0, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 0);
   node->record_start(1);
-  EXPECT_EQ(0, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 0);
   node->record_stop(41);
-  EXPECT_EQ(40, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 40);
   node->add_processing_time(2);
-  EXPECT_EQ(42, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 42);
 
   std::shared_ptr<TestNode> input =
       std::make_shared<TestNode>(model::Node::Args{-1, "TestInput", node});
-  EXPECT_EQ(node.get(), input->output());
-  EXPECT_EQ(0, node->inputs().size());
+  EXPECT_EQ(input->output(), node.get());
+  EXPECT_EQ(node->inputs().size(), 0);
   node->add_input(input);
-  EXPECT_EQ(1, node->inputs().size());
-  EXPECT_EQ(input, node->inputs().front());
+  EXPECT_EQ(node->inputs().size(), 1);
+  EXPECT_EQ(node->inputs().front(), input);
   node->remove_input(input);
-  EXPECT_EQ(0, node->inputs().size());
+  EXPECT_EQ(node->inputs().size(), 0);
 
-  EXPECT_EQ(0, node->num_elements());
+  EXPECT_EQ(node->num_elements(), 0);
   node->record_element();
-  EXPECT_EQ(1, node->num_elements());
+  EXPECT_EQ(node->num_elements(), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 8071da5b6d454708a10c7d4a9d77b8a3ae6287bd..e369e882a0961e60d20f52e0155e9738bf16415e 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -697,15 +697,23 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
   return Status::OK();
 }
 
-Status AttachDef(const Status& status, const NodeDef& node_def) {
+Status AttachDef(const Status& status, const NodeDef& node_def,
+                 bool allow_multiple_formatted_node) {
   Status ret = status;
-  errors::AppendToMessage(
-      &ret, strings::StrCat(" [[", FormatNodeDefForError(node_def), "]]"));
+  string node_error;
+  if (!allow_multiple_formatted_node &&
+      status.error_message().find("{{node ") != string::npos) {
+    node_error = node_def.name();
+  } else {
+    node_error = FormatNodeDefForError(node_def);
+  }
+  errors::AppendToMessage(&ret, strings::StrCat(" [[", node_error, "]]"));
   return ret;
 }
 
-Status AttachDef(const Status& status, const Node& node) {
-  return AttachDef(status, node.def());
+Status AttachDef(const Status& status, const Node& node,
+                 bool allow_multiple_formatted_node) {
+  return AttachDef(status, node.def(), allow_multiple_formatted_node);
 }
 
 void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def) {
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 619d44e65be97c4e163ca0b922ce53df54be83ba..598a3fb601086d34d72fa795eae1b94aab24f33b 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -315,10 +315,14 @@ void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def);
 // NodeName     = [A-Za-z0-9.], [A-Za-z0-9_./] *
 Status ValidateExternalNodeDefSyntax(const NodeDef& node_def);
 
-// Returns "status" with kernel's NodeDef attached as additional text
-// in the error message.
-Status AttachDef(const Status& status, const NodeDef& node_def);
-Status AttachDef(const Status& status, const Node& node);
+// Returns "status" with formatted NodeDef attached as additional text
+// in the error message. If 'allow_multiple_formatted_node' is false and there
+// is already a formatted NodeDef present in 'status', we simply attach the name
+// of the NodeDef instead of the formatted string.
+Status AttachDef(const Status& status, const NodeDef& node_def,
+                 bool allow_multiple_formatted_node = false);
+Status AttachDef(const Status& status, const Node& node,
+                 bool allow_multiple_formatted_node = false);
 
 // Appends the given prefix and suffix to the original node name in order to
 // make the name unique. If it's an "Enter" node, use the same way to reset
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index d9d437024ab0f330f56901dc8da8faae794c61c4..7b2506336968dc94fc65c84bbb43551d7ebd44ea 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -573,5 +573,29 @@ TEST(FormatNodeForErrorTest, NodeDef) {
   EXPECT_EQ("{{node enter}}", FormatNodeDefForError(node_def));
 }
 
+TEST(AttachDef, AllowMultipleFormattedNode) {
+  NodeDef a;
+  a.set_name("a");
+  NodeDef b;
+  b.set_name("b");
+  Status s = Status(error::CANCELLED, "Error");
+  Status s2 = AttachDef(s, a, true);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]", s2.error_message());
+  Status s3 = AttachDef(s2, b, true);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]\n\t [[{{node b}}]]", s3.error_message());
+}
+
+TEST(AttachDef, DisallowMultipleFormattedNode) {
+  NodeDef a;
+  a.set_name("a");
+  NodeDef b;
+  b.set_name("b");
+  Status s = Status(error::CANCELLED, "Error");
+  Status s2 = AttachDef(s, a, false);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]", s2.error_message());
+  Status s3 = AttachDef(s2, b, false);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]\n\t [[b]]", s3.error_message());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 505ab547755b46e0ff4af9920df6eb8961a4a9db..92a7038a404d2bf7f5bbf1e643f727f8c3dfc74a 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 
 namespace tensorflow {
 
@@ -488,14 +489,21 @@ Status ApiDefMap::LoadFile(Env* env, const string& filename) {
   if (filename.empty()) return Status::OK();
   string contents;
   TF_RETURN_IF_ERROR(ReadFileToString(env, filename, &contents));
-  TF_RETURN_IF_ERROR(LoadApiDef(contents));
+  Status status = LoadApiDef(contents);
+  if (!status.ok()) {
+    // Return failed status annotated with filename to aid in debugging.
+    return Status(status.code(),
+                  strings::StrCat("Error parsing ApiDef file ", filename, ": ",
+                                  status.error_message()));
+  }
   return Status::OK();
 }
 
 Status ApiDefMap::LoadApiDef(const string& api_def_file_contents) {
   const string contents = PBTxtFromMultiline(api_def_file_contents);
   ApiDefs api_defs;
-  protobuf::TextFormat::ParseFromString(contents, &api_defs);
+  TF_RETURN_IF_ERROR(
+      proto_utils::ParseTextFormatFromString(contents, &api_defs));
   for (const auto& api_def : api_defs.op()) {
     // Check if the op definition is loaded. If op definition is not
     // loaded, then we just skip this ApiDef.
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
index e0e77c74495d62d0d0d2bc1c75d50fb1963bdcfd..6b43d7dc68d2120b5bc06cb6eaa12ef460188bda 100644
--- a/tensorflow/core/framework/op_gen_lib_test.cc
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_gen_lib.h"
 
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -39,7 +40,7 @@ constexpr char kTestOpList[] = R"(op {
     version: 123
     explanation: "foo"
   }
-)";
+})";
 
 constexpr char kTestApiDef[] = R"(op {
   graph_op_name: "testop"
@@ -455,6 +456,18 @@ op {
   ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
 }
 
+TEST(OpGenLibTest, ApiDefInvalidSyntax) {
+  const string api_def = R"pb(
+    op { bad_op_name: "testop" }
+  )pb";
+
+  OpList op_list;
+  ApiDefMap api_map(op_list);
+  // Loading with invalid syntax (e.g. unrecognized field name) should fail.
+  auto status = api_map.LoadApiDef(api_def);
+  ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT, status.code());
+}
+
 TEST(OpGenLibTest, ApiDefUpdateDocs) {
   const string op_list1 = R"(op {
   name: "testop"
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 692da603f19652ab285d9c05143faeb87514e780..789f0fda7526fadc667e51046a344062a9532670 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -102,7 +102,8 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       graph_def_version_(context->graph_def_version()),
       is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
-      output_name_map_(context->num_outputs()) {
+      output_name_map_(context->num_outputs()),
+      cost_estimate_(OpKernel::kInitialCostEstimateCycles) {
   OP_REQUIRES_OK(context,
                  NameRangesForNode(*def_, *context->op_def_, &input_name_map_,
                                    &output_name_map_));
@@ -117,6 +118,10 @@ OpKernel::OpKernel(OpKernelConstruction* context,
 
 OpKernel::~OpKernel() {}
 
+const uint64 OpKernel::kInitialCostEstimateCycles;
+const uint64 OpKernel::kOpIsExpensiveThresholdCycles;
+const uint64 OpKernel::kCostDecay;
+
 const string& OpKernel::name() const { return def_->name(); }
 const string& OpKernel::type_string() const { return def_->op(); }
 const string& OpKernel::requested_device() const { return def_->device(); }
@@ -407,7 +412,7 @@ Tensor OpKernelContext::mutable_input(int index, bool lock_held) {
     record_tensor_reference(tensor);
     return tensor;
   } else {
-    mutex_lock l(*input_ref_mutex(index));
+    tf_shared_lock l(*input_ref_mutex(index));
     Tensor& tensor = *((*params_->inputs)[index].tensor);
     record_tensor_reference(tensor);
     return tensor;
@@ -599,7 +604,7 @@ Status OpKernelContext::mutable_input(StringPiece name, Tensor* tensor,
   if (lock_held) {
     *tensor = *(*params_->inputs)[start].tensor;
   } else {
-    mutex_lock l(*input_ref_mutex(start));
+    tf_shared_lock l(*input_ref_mutex(start));
     *tensor = *(*params_->inputs)[start].tensor;
   }
   record_tensor_reference(*tensor);
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 6a25d2b92f26609b47e6913d77abdffb3377ab72..06b90964ad1f7e4c8047f79ec37bee097327be9a 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
 #define TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
 
+#include <atomic>
 #include <functional>
 
 #include <utility>
@@ -47,6 +48,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -116,10 +118,34 @@ class OpKernel {
   virtual AsyncOpKernel* AsAsync() { return nullptr; }
   virtual const AsyncOpKernel* AsAsync() const { return nullptr; }
 
+  // Initial time (in CPU cycles) we expect an operation to take.  Used to
+  // determine whether an operation should be place in a threadpool.  Operations
+  // start out "expensive".
+  static const uint64 kInitialCostEstimateCycles = 100 * 1000 * 1000;
+  static const uint64 kOpIsExpensiveThresholdCycles = 5000;
+  static const uint64 kCostDecay = 10;
+
   // Returns true iff this op kernel is considered "expensive". The
   // runtime may use this flag to optimize graph execution for example
   // to "inline" inexpensive kernels.
-  virtual bool IsExpensive() { return expensive_; }
+  virtual bool IsExpensive() {
+    return expensive_ && (cost_estimate_.load(std::memory_order_relaxed) >
+                          kOpIsExpensiveThresholdCycles);
+  }
+
+  // Updates the dynamic cost estimate, which is used to determine whether this
+  // op is expensive. The new cost estimate is a weighted average of the old
+  // cost estimate and the latest cost.
+  void UpdateCostEstimate(uint64 elapsed_cycles) {
+    // N.B. Updates to `cost_estimate_` are atomic but unlocked.  Simulataneous
+    // updates may result in one or more updates being ignored.  This does not
+    // affect correctness but may slow down the update frequency.
+    cost_estimate_.store(
+        (kCostDecay - 1) * cost_estimate_.load(std::memory_order_relaxed) /
+                kCostDecay +
+            (elapsed_cycles / kCostDecay),
+        std::memory_order_relaxed);
+  }
 
   // Accessors.
   const NodeDef& def() const { return *def_; }
@@ -184,6 +210,7 @@ class OpKernel {
   NameRangeMap input_name_map_;
   NameRangeMap output_name_map_;
   bool expensive_;
+  std::atomic_uint_fast64_t cost_estimate_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
 };
@@ -204,8 +231,6 @@ class AsyncOpKernel : public OpKernel {
   const AsyncOpKernel* AsAsync() const final { return this; }
 
   void Compute(OpKernelContext* context) final;
-
-  bool IsExpensive() override { return true; }
 };
 
 // Wraps a tensor that is held by an Op across calls to Compute(). For
@@ -378,7 +403,9 @@ class OpArgIterator {
   using iterator_category = std::forward_iterator_tag;
   using value_type = ElementType;
   using pointer = ElementType*;
+  using const_pointer = const ElementType*;
   using reference = ElementType&;
+  using const_reference = const ElementType&;
   using difference_type = ptrdiff_t;
 
   OpArgIterator(const ListType* list, int i) : list_(list), i_(i) {}
@@ -407,6 +434,9 @@ class OpArgIterator {
   reference operator*() { return (*list_)[i_]; }
   pointer operator->() { return &(*list_)[i_]; }
 
+  const_reference operator*() const { return (*list_)[i_]; }
+  const_pointer operator->() const { return &(*list_)[i_]; }
+
  private:
   const ListType* const list_;
   int i_;
@@ -576,6 +606,9 @@ class OpKernelContext {
     // The session state for this op.
     SessionState* session_state = nullptr;
 
+    // Unique session identifier. Can be empty.
+    string session_handle;
+
     // The tensor store for this op.
     TensorStore* tensor_store = nullptr;
 
@@ -613,6 +646,10 @@ class OpKernelContext {
     static const int kNoReservation = -1;
     // Values in [0,...) represent reservations for the indexed output.
     const int* forward_from_array = nullptr;
+
+    // For tracking actively running deferred ops.
+    std::function<void()> inc_num_deferred_ops_function = []() {};
+    std::function<void()> dec_num_deferred_ops_function = []() {};
   };
 
   // params must outlive the OpKernelContext.
@@ -1004,6 +1041,9 @@ class OpKernelContext {
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
+  // Unique identifier of the session it belongs to. Can be empty.
+  string session_handle() const { return params_->session_handle; }
+
   // An op kernel can access the tensor store of the run it belongs to.
   TensorStore* tensor_store() const { return params_->tensor_store; }
 
@@ -1136,6 +1176,24 @@ class OpKernelContext {
 
   bool input_is_ref(int index) const;
 
+  // Used by OpKernel implementations to track actively running deferred ops.
+  //
+  // A deferred op is one whose Compute method returns (or whose ComputeAsync
+  // method invokes the callback) when work is scheduled onto a device. At that
+  // point, we don't know when the work will actually complete (or if it has
+  // already completed) on the device. These functions allow the executor to
+  // track the status of deferred ops and act accordingly.
+  //
+  // Deferred OpKernel implementations must use these methods to get two
+  // functions. It then must call these two functions in pairs, before and after
+  // device execution, respectively.
+  TF_MUST_USE_RESULT std::function<void()> inc_num_deferred_ops_function() {
+    return params_->inc_num_deferred_ops_function;
+  }
+  TF_MUST_USE_RESULT std::function<void()> dec_num_deferred_ops_function() {
+    return params_->dec_num_deferred_ops_function;
+  }
+
  private:
   Allocator* get_allocator(AllocatorAttributes attr);
 
diff --git a/tensorflow/core/framework/op_segment.cc b/tensorflow/core/framework/op_segment.cc
index 75ed4a4eaf231839999efa285c88e2bceda61a07..f7e194baeede8deb529aa7d1f4a0ba3ccc44e792 100644
--- a/tensorflow/core/framework/op_segment.cc
+++ b/tensorflow/core/framework/op_segment.cc
@@ -104,7 +104,8 @@ bool OpSegment::ShouldOwnKernel(FunctionLibraryRuntime* lib,
                                 const string& node_op) {
   // OpSegment should not own kernel if the node is stateless, or a function.
   return lib->IsStateful(node_op) &&
-         lib->GetFunctionLibraryDefinition()->Find(node_op) == nullptr;
+         lib->GetFunctionLibraryDefinition()->Find(node_op) == nullptr &&
+         node_op != "PartitionedCall" && node_op != "StatefulPartitionedCall";
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/ops_util.cc b/tensorflow/core/framework/ops_util.cc
index e8cf014ca03457e4673a14765cee5a05746b901a..4e603b9598fc43f894415b9b8aef6f641e484b6a 100644
--- a/tensorflow/core/framework/ops_util.cc
+++ b/tensorflow/core/framework/ops_util.cc
@@ -30,6 +30,9 @@ Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) {
       return Eigen::PADDING_VALID;
     case Padding::SAME:
       return Eigen::PADDING_SAME;
+    case Padding::EXPLICIT:
+      LOG(FATAL) << "Eigen does not have explicit padding enum "  // Crash OK
+                    "value";
   }
   return Eigen::PADDING_SAME;  // Prevent compiler warning about missing return
 }
diff --git a/tensorflow/core/framework/queue_interface.h b/tensorflow/core/framework/queue_interface.h
index 4ca4416c5ac1471247758cd943d52a7c65f7afaf..9395cce1644f7e8fd09cf40a48b2d7a5abb30bb2 100644
--- a/tensorflow/core/framework/queue_interface.h
+++ b/tensorflow/core/framework/queue_interface.h
@@ -85,11 +85,11 @@ class QueueInterface : public ResourceBase {
   virtual Status MatchesNodeDef(const NodeDef& node_def) = 0;
 
   // Returns the number of elements in the queue.
-  virtual int32 size() = 0;
+  virtual int32 size() const = 0;
 
   virtual const DataTypeVector& component_dtypes() const = 0;
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("A Queue of size: ", size());
   }
 
diff --git a/tensorflow/core/framework/reader_interface.h b/tensorflow/core/framework/reader_interface.h
index f894acbe1d5119081f088bb091049342b881f340..e47644cb8f27af63e1a96d9c3d44d84e8a55224d 100644
--- a/tensorflow/core/framework/reader_interface.h
+++ b/tensorflow/core/framework/reader_interface.h
@@ -76,7 +76,7 @@ class ReaderInterface : public ResourceBase {
   // Note: Must Reset on error.
   virtual Status RestoreState(const string& state) = 0;
 
-  string DebugString() override { return "a reader"; }
+  string DebugString() const override { return "a reader"; }
 
  protected:
   virtual ~ReaderInterface() {}
diff --git a/tensorflow/core/framework/resource_handle.h b/tensorflow/core/framework/resource_handle.h
index db213669a3f30b3b5587a4d587e2bfb039dacdda..d1f6771bf31e492ac47eb260c7d701d7a6c97b36 100644
--- a/tensorflow/core/framework/resource_handle.h
+++ b/tensorflow/core/framework/resource_handle.h
@@ -67,6 +67,11 @@ class ResourceHandle {
 
   string DebugString() const;
 
+  // GUID for anonymous resources. Resources with this shared_name will have
+  // their shared_name replaced with a GUID at creation time
+  static constexpr const char* ANONYMOUS_NAME =
+      "cd2c89b7-88b7-44c8-ad83-06c2a9158347";
+
  public:
   string device_;
   string container_;
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 9f3204ab96050a1cc06ab3052741f0044369b83e..6a94ff6642e6f50655083756ae24a2c2b97bc7ec 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <atomic>
+
 #include "tensorflow/core/framework/resource_mgr.h"
 
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -26,6 +28,10 @@ limitations under the License.
 #include "tensorflow/core/platform/demangle.h"
 
 namespace tensorflow {
+
+// Used to generate unique names for anonymous variables
+static std::atomic<int64> current_id_;
+
 ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
                                   const string& name,
                                   const TypeIndex& type_index) {
@@ -38,7 +44,11 @@ ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
     actual_container = ctx->resource_manager()->default_container();
   }
   result.set_container(actual_container);
-  result.set_name(name);
+  if (name == ResourceHandle::ANONYMOUS_NAME) {
+    result.set_name(strings::StrCat("_AnonymousVar", current_id_.fetch_add(1)));
+  } else {
+    result.set_name(name);
+  }
   result.set_hash_code(type_index.hash_code());
   result.set_maybe_type_name(type_index.name());
   return result;
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 3195cd2e9dccaaf26ac6111a78acdb7278ea92e7..9c381e7d6b4e909689591d3a75bfabbecd886a0d 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -77,7 +77,7 @@ namespace tensorflow {
 class ResourceBase : public core::RefCounted {
  public:
   // Returns a debug string for *this.
-  virtual string DebugString() = 0;
+  virtual string DebugString() const = 0;
 
   // Returns memory used by this resource.
   virtual int64 MemoryUsed() const { return 0; }
@@ -619,20 +619,31 @@ ResourceHandleOp<T>::ResourceHandleOp(OpKernelConstruction* context)
 
 template <typename T>
 void ResourceHandleOp<T>::Compute(OpKernelContext* ctx) {
-  if (!initialized_.load()) {
-    mutex_lock ml(mutex_);
-    // Checking again to see if another thread has initialized the resource.
+  if (name_ == ResourceHandle::ANONYMOUS_NAME) {
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    Tensor handle;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), &handle, attr));
+    handle.scalar<ResourceHandle>()() =
+        MakeResourceHandle<T>(ctx, container_, name_);
+    ctx->set_output(0, handle);
+  } else {
     if (!initialized_.load()) {
-      AllocatorAttributes attr;
-      attr.set_on_host(true);
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
-                                             &resource_, attr));
-      resource_.scalar<ResourceHandle>()() =
-          MakeResourceHandle<T>(ctx, container_, name_);
-      initialized_.store(true);
+      mutex_lock ml(mutex_);
+      // Checking again to see if another thread has initialized the resource.
+      if (!initialized_.load()) {
+        AllocatorAttributes attr;
+        attr.set_on_host(true);
+        OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
+                                               &resource_, attr));
+        resource_.scalar<ResourceHandle>()() =
+            MakeResourceHandle<T>(ctx, container_, name_);
+        initialized_.store(true);
+      }
     }
+    ctx->set_output(0, resource_);
   }
-  ctx->set_output(0, resource_);
 }
 
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 7c7f0af0ce46abbde5b66facf4d33db47f9773b8..1c785736e60b2f03899924f34a207066582a590e 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -32,7 +32,7 @@ class Resource : public ResourceBase {
   explicit Resource(const string& label) : label_(label) {}
   ~Resource() override {}
 
-  string DebugString() override { return strings::StrCat("R/", label_); }
+  string DebugString() const override { return strings::StrCat("R/", label_); }
 
  private:
   string label_;
@@ -43,7 +43,7 @@ class Other : public ResourceBase {
   explicit Other(const string& label) : label_(label) {}
   ~Other() override {}
 
-  string DebugString() override { return strings::StrCat("O/", label_); }
+  string DebugString() const override { return strings::StrCat("O/", label_); }
 
  private:
   string label_;
@@ -245,7 +245,7 @@ class StubDevice : public DeviceBase {
 // Empty stub resource for testing resource handles.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
   int value_{0};
 };
 
@@ -305,7 +305,7 @@ TEST(ResourceHandleTest, DifferentDevice) {
 // Other stub resource to test type-checking of resource handles.
 class OtherStubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
 };
 
 TEST(ResourceHandleTest, DifferentType) {
diff --git a/tensorflow/core/framework/resource_op_kernel_test.cc b/tensorflow/core/framework/resource_op_kernel_test.cc
index c1e503dc57643d2023d89f317a6c5ff643a3c60b..7a2a87045bf20970a6a996cb9d32b264af0662c7 100644
--- a/tensorflow/core/framework/resource_op_kernel_test.cc
+++ b/tensorflow/core/framework/resource_op_kernel_test.cc
@@ -46,7 +46,7 @@ class StubDevice : public DeviceBase {
 // Stub resource for testing resource op kernel.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
   int code;
 };
 
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index f5de5dba8854adcfd5b94447da3ba42566a26bd8..9387b6c23c77dadfd423865b23bc7dc5fdf41672 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -67,7 +67,7 @@ class Var : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
                            tensor_.shape().DebugString());
   }
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 4dcc80680ff7c62b31fb266c0f5cd80a9325fe81..18a278f07ff4e5b07061047021a86411e04e2511 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/shape_inference.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -1259,7 +1259,6 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
     return false;
   }
   std::vector<ShapeAndType> new_values(shapes_and_types.size());
-  bool refined = false;
   for (int i = 0; i < shapes_and_types.size(); ++i) {
     const ShapeAndType& existing = (*to_update)[i];
     if (shapes_and_types[i].dtype == existing.dtype) {
@@ -1269,16 +1268,9 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
         return false;
       } else {
         new_values[i].dtype = shapes_and_types[i].dtype;
-        refined = true;
       }
     }
     Relax(existing.shape, shapes_and_types[i].shape, &new_values[i].shape);
-    if (!existing.shape.SameHandle(new_values[i].shape)) {
-      refined = true;
-    }
-  }
-  if (!refined) {
-    return false;
   }
   to_update->swap(new_values);
   return true;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index e3885b7d9e8a3f746d0cc2121dad71221d4ec06b..bf8b633c0137f856932689aed18456e8946eb778 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -588,9 +588,9 @@ class InferenceContext {
   // position idx with the specified shapes and types. This requires idx to be
   // in the [0, num_inputs) range.
   //
-  // If the relax is successful and any of the new shapes differs from the old
-  // one, or any of the old dtypes was DT_INVALID, store the new shapes and
-  // return true.  Return false otherwise.
+  // If the relax is successful (sizes are the same, old dtypes match new ones
+  // or are DT_INVALID), then store the relaxed shapes and return true.
+  // Return false otherwise.
   //
   // See 'RelaxInput' function for full details and examples.
   bool RelaxInputHandleShapesAndMergeTypes(
diff --git a/tensorflow/core/framework/stats_aggregator.h b/tensorflow/core/framework/stats_aggregator.h
index af53ed0a3ca64aefe310db3b2d07ce6a18afa181..7c960840d7446889bee1ba22cdbb4af072acd53e 100644
--- a/tensorflow/core/framework/stats_aggregator.h
+++ b/tensorflow/core/framework/stats_aggregator.h
@@ -83,7 +83,7 @@ class StatsAggregatorResource : public ResourceBase {
     return stats_aggregator_;
   }
 
-  string DebugString() { return "StatsAggregatorResource"; }
+  string DebugString() const override { return "StatsAggregatorResource"; }
 
  private:
   const std::shared_ptr<StatsAggregator> stats_aggregator_;
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 7e841489eb35d4ec3d18fe255472107ef9d60efe..ecbffecd66d691e3e1b1722625381665ce61ffcc 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -650,14 +651,21 @@ void Tensor::CopyFromInternal(const Tensor& other, const TensorShape& shape) {
   }
 }
 
-void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
-                                    const TensorShape& shape) {
+Status Tensor::BitcastFrom(const Tensor& other, DataType dtype,
+                           const TensorShape& shape) {
   int in_size = DataTypeSize(other.dtype());
   int out_size = DataTypeSize(dtype);
-  CHECK_NE(in_size, 0);
-  CHECK_NE(out_size, 0);
-  CHECK_EQ(shape.num_elements() * out_size,
-           other.shape().num_elements() * in_size);
+  if (in_size == 0) {
+    return errors::InvalidArgument("other tensor has zero-sized data type");
+  }
+  if (out_size == 0) {
+    return errors::InvalidArgument("specified output type is zero-sized");
+  }
+  if (shape.num_elements() * out_size !=
+      other.shape().num_elements() * in_size) {
+    return errors::InvalidArgument(
+        "input and output shapes/data type sizes are not compatible");
+  }
   shape_ = shape;
   shape_.set_data_type(dtype);
   if (buf_ != other.buf_) {
@@ -665,6 +673,7 @@ void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
     buf_ = other.buf_;
     RefIfNonNull(buf_);
   }
+  return Status::OK();
 }
 
 // Notice that buf_ either points to a regular TensorBuffer or a SubBuffer.
@@ -932,10 +941,18 @@ namespace {
 // logic is so simple we can just replicate it here, where it is close to its
 // usage and easy to change later. And there's the extra benefit of not
 // accessing an 'internal' namespace.
-inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a) {
+inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a,
+                                                bool print_v2) {
   return a;
 }
-inline float PrintOneElement(const Eigen::half& h) {
+inline string PrintOneElement(const string& a, bool print_v2) {
+  if (print_v2) {
+    return "\"" + str_util::CEscape(a) + "\"";
+  } else {
+    return str_util::CEscape(a);
+  }
+}
+inline float PrintOneElement(const Eigen::half& h, bool print_v2) {
   return static_cast<float>(h);
 }
 
@@ -957,7 +974,7 @@ void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
         return;
       }
       if (i > 0) strings::StrAppend(result, " ");
-      strings::StrAppend(result, PrintOneElement(data[(*data_index)++]));
+      strings::StrAppend(result, PrintOneElement(data[(*data_index)++], false));
     }
     return;
   }
@@ -1000,7 +1017,7 @@ void PrintOneDimV2(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
   // We have recursed beyond all the dimensions into a single element
   // of the tensor.
   if (dim_index == num_dims) {
-    strings::StrAppend(result, PrintOneElement(data[data_index]));
+    strings::StrAppend(result, PrintOneElement(data[data_index], true));
     return;
   }
 
@@ -1048,7 +1065,7 @@ string SummarizeArray(int64 limit, int64 num_elts,
   if (shape.empty()) {
     for (int64 i = 0; i < limit; ++i) {
       if (i > 0) strings::StrAppend(&ret, " ");
-      strings::StrAppend(&ret, PrintOneElement(array[i]));
+      strings::StrAppend(&ret, PrintOneElement(array[i], print_v2));
     }
     if (num_elts > limit) strings::StrAppend(&ret, "...");
     return ret;
@@ -1123,6 +1140,9 @@ string Tensor::SummarizeValue(int64 max_entries, bool print_v2) const {
       // will emit "1 0..." which is more compact.
       return SummarizeArray<bool>(limit, num_elts, shape_, data, print_v2);
       break;
+    case DT_STRING:
+      return SummarizeArray<string>(limit, num_elts, shape_, data, print_v2);
+      break;
     default: {
       // All irregular cases
       string ret;
@@ -1134,9 +1154,6 @@ string Tensor::SummarizeValue(int64 max_entries, bool print_v2) const {
       for (size_t i = 0; i < limit; ++i) {
         if (i > 0) strings::StrAppend(&ret, " ");
         switch (dtype()) {
-          case DT_STRING:
-            strings::StrAppend(&ret, str_util::CEscape(flat<string>()(i)));
-            break;
           case DT_VARIANT: {
             const Variant& v = flat<Variant>()(i);
             strings::StrAppend(&ret, v.DebugString());
@@ -1166,10 +1183,15 @@ bool Tensor::SharesBufferWith(const Tensor& b) const {
          buf_->root_buffer() == b.buf_->root_buffer();
 }
 
-string Tensor::DebugString() const {
+string Tensor::DebugString(int num_values) const {
   return strings::StrCat("Tensor<type: ", DataTypeString(dtype()),
                          " shape: ", shape().DebugString(),
-                         " values: ", SummarizeValue(3), ">");
+                         " values: ", SummarizeValue(num_values), ">");
+}
+
+string Tensor::DeviceSafeDebugString() const {
+  return strings::StrCat("Tensor<type: ", DataTypeString(dtype()),
+                         " shape: ", shape().DebugString(), ">");
 }
 
 void Tensor::FillDescription(TensorDescription* description) const {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 009dd0846d2639eb9cf1ef47f8f12c10994dcb3b..6454cb818f2e3e237ca4bc49070399f3fff31dd7 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -526,7 +526,16 @@ class Tensor {
   string SummarizeValue(int64 max_entries, bool print_v2 = false) const;
 
   /// A human-readable summary of the tensor suitable for debugging.
-  string DebugString() const;
+  // `num_values` is the number of actual data values in the tensor
+  // included in the message. If the tensor might be resident in
+  // GPU/TPU memory use DeviceSafeDebugString instead.
+  string DebugString(int num_values) const;
+  string DebugString() const { return DebugString(3); }
+
+  // Variant of DebugString() that should be used for possibly non-CPU tensors.
+  // If the tensor is not resident on CPU, we can't read its values as
+  // DebugString() does.
+  string DeviceSafeDebugString() const;
 
   /// Fill in the `TensorDescription` proto with metadata about the
   /// tensor that is useful for monitoring and debugging.
@@ -545,12 +554,37 @@ class Tensor {
   /// REQUIRES: `DataTypeCanUseMemcpy(dtype())`.
   StringPiece tensor_data() const;
 
-  /// Copy the other tensor into this tensor and reshape it and reinterpret the
-  /// buffer's datatype.
+  /// Copy the other tensor into this tensor, reshape it and reinterpret the
+  /// buffer's datatype. If Status::OK() is returned, the two tensors now share
+  /// the same underlying storage.
+  ///
+  /// This call requires that the `other` tensor and the given type and shape
+  /// are "compatible" (i.e. they occupy the same number of bytes).
+  ///
+  /// Specifically:
+  ///
+  /// shape.num_elements() * DataTypeSize(type)
+  ///
+  /// must equal
+  ///
+  /// other.num_elements() * DataTypeSize(other.dtype())
   ///
-  /// This tensor shares other's underlying storage.
-  void UnsafeCopyFromInternal(const Tensor&, DataType dtype,
-                              const TensorShape&);
+  /// In addition, this function requires:
+  ///   * DataTypeSize(other.dtype()) != 0
+  ///   * DataTypeSize(type) != 0
+  ///
+  /// If any of the requirements are not met, errors::InvalidArgument is
+  /// returned.
+  Status BitcastFrom(const Tensor& other, DataType dtype,
+                     const TensorShape& shape);
+
+  /// Like BitcastFrom, but CHECK fails if any preconditions are not met.
+  ///
+  /// Deprecated. Use BitcastFrom instead and check the returned Status.
+  void UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
+                              const TensorShape& shape) {
+    TF_CHECK_OK(BitcastFrom(other, dtype, shape));
+  }
 
  private:
   // Returns true if the refcount on buf_ and any possible underlying root
@@ -594,7 +628,7 @@ class Tensor {
       OpKernelContext* ctx, Var* var);  // For access to RefCountIsOne().
   friend Status batch_util::CopyElementToSlice(
       Tensor element, Tensor* parent,
-      int64 index);                // For access to RefCountIsOne().
+      int64 index);  // For access to RefCountIsOne().
   friend Status batch_util::MaybeMoveSliceToElement(
       Tensor* parent, Tensor* element,
       int64 index);  // For access to RefCountIsOne().
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index 5e0b976e1736dff6b8a18c7b801cb6d1ef500f11..7158f1925f65483c3087a6bfc480e5647eacb5d6 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -136,6 +136,89 @@ template <class Shape>
 TensorShapeBase<Shape>::TensorShapeBase(gtl::ArraySlice<int64> dim_sizes) {
   set_tag(REP16);
   set_data_type(DT_INVALID);
+  InitDims(dim_sizes);
+}
+
+// Returns true iff partial is true and val is < 0.
+// REQUIRES: val < kMaxRep16
+// REQUIRES: partial || val >= 0
+static inline bool Set16(bool partial, uint16* dst, int dim, int64 val) {
+  if (partial) {
+    if (val < 0) {
+      dst[dim] = std::numeric_limits<uint16>::max();
+      return true;
+    }
+  } else {
+    CHECK_GE(val, 0);
+  }
+  dst[dim] = val;
+  return false;
+}
+
+template <class Shape>
+void TensorShapeBase<Shape>::InitDims(gtl::ArraySlice<int64> dim_sizes) {
+  DCHECK_EQ(tag(), REP16);
+
+  // Allow sizes that are under kint64max^0.25 so that 4-way multiplication
+  // below cannot overflow.
+  static const uint64 kMaxSmall = 0xd744;
+  static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max,
+                "bad overflow check");
+  bool large_size = false;
+  for (auto s : dim_sizes) {
+    if (s > kMaxSmall) {
+      large_size = true;
+      break;
+    }
+  }
+
+  if (!large_size) {
+    // Every size fits in 16 bits; use fast-paths for dims in {1,2,3,4}.
+    uint16* dst = as16()->dims_;
+    switch (dim_sizes.size()) {
+      case 1: {
+        set_ndims_byte(1);
+        const int64 size = dim_sizes[0];
+        const bool neg = Set16(kIsPartial, dst, 0, size);
+        set_num_elements(neg ? -1 : size);
+        return;
+      }
+      case 2: {
+        set_ndims_byte(2);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        set_num_elements(neg ? -1 : (size0 * size1));
+        return;
+      }
+      case 3: {
+        set_ndims_byte(3);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        const int64 size2 = dim_sizes[2];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        neg |= Set16(kIsPartial, dst, 2, size2);
+        set_num_elements(neg ? -1 : (size0 * size1 * size2));
+        return;
+      }
+      case 4: {
+        set_ndims_byte(4);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        const int64 size2 = dim_sizes[2];
+        const int64 size3 = dim_sizes[3];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        neg |= Set16(kIsPartial, dst, 2, size2);
+        neg |= Set16(kIsPartial, dst, 3, size3);
+        set_num_elements(neg ? -1 : (size0 * size1 * size2 * size3));
+        return;
+      }
+    }
+  }
+
   set_ndims_byte(0);
   set_num_elements(1);
   for (int64 s : dim_sizes) {
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index 625d88ec1bdcdd9765dd64b09a1bad51f7fa3370..3473a441f2cdcc9b6932fcc1e78071ab8b7fa1fd 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -256,6 +256,7 @@ class TensorShapeBase : public TensorShapeRep {
 
  private:
   void RecomputeNumElements();
+  void InitDims(gtl::ArraySlice<int64> dim_sizes);
 
   // True for PartialTensorShape, false for TensorShape
   static constexpr bool kIsPartial =
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 6329aa6d8edf3795ed8018b7802661749683fe41..d25652ce81815e636b8f1a188171eec4cedb9689 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -684,6 +684,15 @@ static std::vector<int64> MakeSizes(int arg) {
   return sizes;
 }
 
+static void BM_TensorShape_Init(int iters, int arg) {
+  auto sizes = MakeSizes(arg);
+  while (--iters > 0) {
+    TensorShape shape(sizes);
+    tensorflow::testing::DoNotOptimize(shape.num_elements());
+  }
+}
+BENCHMARK(BM_TensorShape_Init)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4);
+
 static void BM_TensorShape_Assign(int iters, int arg) {
   TensorShape s(MakeSizes(arg));
   while (--iters > 0) {
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 713f91fe04c6fe498209d88193f6fbb1729ec57c..d4aed387610579dc02a7566fdda44d042d203c35 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1370,7 +1370,7 @@ TEST(SummarizeValue, STRING) {
   EXPECT_EQ("one two three four five", x.SummarizeValue(16));
   x = MkTensor<string>(DT_STRING, TensorShape({5, 1, 5}),
                        {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("one two three four five one...", x.SummarizeValue(6));
+  EXPECT_EQ("[[one two three four five]][[one...]]...", x.SummarizeValue(6));
 }
 
 TEST(SummarizeValue, INT32_PRINT_V2) {
@@ -1423,11 +1423,16 @@ TEST(SummarizeValue, BOOL_PRINT_V2) {
 TEST(SummarizeValue, STRING_PRINT_V2) {
   Tensor x = MkTensor<string>(DT_STRING, TensorShape({5}),
                               {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("[one two three four five]", x.SummarizeValue(16, true));
-  EXPECT_EQ("[one two three four five]", x.SummarizeValue(-1, true));
-  x = MkTensor<string>(DT_STRING, TensorShape({5, 1, 5}),
+  EXPECT_EQ("[\"one\" \"two\" \"three\" \"four\" \"five\"]",
+            x.SummarizeValue(16, true));
+  EXPECT_EQ("[\"one\" \"two\" \"three\" \"four\" \"five\"]",
+            x.SummarizeValue(-1, true));
+  EXPECT_EQ("[\"one\" \"two\" ... \"four\" \"five\"]",
+            x.SummarizeValue(2, true));
+  x = MkTensor<string>(DT_STRING, TensorShape({2, 2}),
                        {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("[one two three four five one...]", x.SummarizeValue(6, true));
+  EXPECT_EQ("[[\"one\" \"two\"]\n [\"three\" \"four\"]]",
+            x.SummarizeValue(16, true));
 }
 
 void BM_CreateAndDestroy(int iters) {
diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h
index 31630028516a4f7896986220f4ff0bd8f09fd37a..b58292b3b0225e6f2df7710347019a1c6d7bc512 100644
--- a/tensorflow/core/framework/tensor_testutil.h
+++ b/tensorflow/core/framework/tensor_testutil.h
@@ -206,7 +206,7 @@ struct Expector<T, true> {
     const T* b = y.flat<T>().data();
     for (int i = 0; i < size; ++i) {
       EXPECT_TRUE(Near(a[i], b[i], abs_err))
-          << "a = " << a[i] << " b = " << b << " index = " << i;
+          << "a = " << a[i] << " b = " << b[i] << " index = " << i;
     }
   }
 };
diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc
index ef5b240aeaa8faef08d4c004f0f6d42e9516c48f..b5107a02a7fa2efeebbfc66a8539590727698882 100644
--- a/tensorflow/core/framework/variant_op_registry.cc
+++ b/tensorflow/core/framework/variant_op_registry.cc
@@ -37,57 +37,6 @@ UnaryVariantOpRegistry* UnaryVariantOpRegistry::Global() {
   return global_unary_variant_op_registry;
 }
 
-UnaryVariantOpRegistry::VariantShapeFn* UnaryVariantOpRegistry::GetShapeFn(
-    const TypeIndex& type_index) {
-  auto found = shape_fns.find(type_index);
-  if (found == shape_fns.end()) return nullptr;
-  return &found->second;
-}
-
-void UnaryVariantOpRegistry::RegisterShapeFn(const TypeIndex& type_index,
-                                             const VariantShapeFn& shape_fn) {
-  VariantShapeFn* existing = GetShapeFn(type_index);
-  CHECK_EQ(existing, nullptr)
-      << "Unary VariantShapeFn for type_index: "
-      << port::MaybeAbiDemangle(type_index.name()) << " already registered";
-  shape_fns.insert(std::pair<TypeIndex, VariantShapeFn>(type_index, shape_fn));
-}
-
-Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape) {
-  CHECK_EQ(variant_tensor.dtype(), DT_VARIANT);
-  CHECK_EQ(variant_tensor.dims(), 0);
-  const Variant& v = variant_tensor.scalar<Variant>()();
-  UnaryVariantOpRegistry::VariantShapeFn* shape_fn =
-      UnaryVariantOpRegistry::Global()->GetShapeFn(v.TypeId());
-  if (shape_fn == nullptr) {
-    return errors::Internal(
-        "No unary variant shape function found for Variant type_index: ",
-        port::MaybeAbiDemangle(v.TypeId().name()));
-  }
-  return (*shape_fn)(v, shape);
-}
-
-// Add some basic registrations for use by others, e.g., for testing.
-namespace {
-template <typename T>
-Status ScalarShape(const T&, TensorShape* shape) {
-  *shape = TensorShape({});
-  return Status::OK();
-}
-}  // namespace
-
-#define REGISTER_VARIANT_SHAPE_TYPE(T) \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(T, ScalarShape<T>);
-
-// No encode/shape registered for std::complex<> and Eigen::half
-// objects yet.
-REGISTER_VARIANT_SHAPE_TYPE(int);
-REGISTER_VARIANT_SHAPE_TYPE(float);
-REGISTER_VARIANT_SHAPE_TYPE(bool);
-REGISTER_VARIANT_SHAPE_TYPE(double);
-
-#undef REGISTER_VARIANT_SHAPE_TYPE
-
 UnaryVariantOpRegistry::VariantDecodeFn* UnaryVariantOpRegistry::GetDecodeFn(
     StringPiece type_name) {
   auto found = decode_fns.find(type_name);
@@ -177,6 +126,37 @@ Status VariantDeviceCopy(
   return (*device_copy_fn)(from, to, copy_fn);
 }
 
+namespace {
+template <typename T>
+Status DeviceCopyPrimitiveType(
+    const T& in, T* out,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copier) {
+  // Dummy copy, we don't actually bother copying to the device and back for
+  // testing.
+  *out = in;
+  return Status::OK();
+}
+}  // namespace
+
+#define REGISTER_VARIANT_DEVICE_COPY_TYPE(T)            \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::HOST_TO_DEVICE,    \
+      DeviceCopyPrimitiveType<T>);                      \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::DEVICE_TO_HOST,    \
+      DeviceCopyPrimitiveType<T>);                      \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::DEVICE_TO_DEVICE,  \
+      DeviceCopyPrimitiveType<T>);
+
+// No zeros_like registered for std::complex<> or Eigen::half objects yet.
+REGISTER_VARIANT_DEVICE_COPY_TYPE(int);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(float);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(double);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(bool);
+
+#undef REGISTER_VARIANT_DEVICE_COPY_TYPE
+
 // Special casing UnaryOpFn per op and per device.
 UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn(
     VariantUnaryOp op, StringPiece device, const TypeIndex& type_index) {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 7eb37e859f51992cf74a12736f5099839db5e1fd..488a606f6ee4564abaa0113f9886166afc76dacd 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -58,7 +58,6 @@ enum VariantDeviceCopyDirection {
 
 class UnaryVariantOpRegistry {
  public:
-  typedef std::function<Status(const Variant& v, TensorShape*)> VariantShapeFn;
   typedef std::function<bool(Variant*)> VariantDecodeFn;
   typedef std::function<Status(OpKernelContext*, const Variant&, Variant*)>
       VariantUnaryOpFn;
@@ -93,13 +92,6 @@ class UnaryVariantOpRegistry {
                                AsyncTensorDeviceCopyFn copy_fn)>
       AsyncVariantDeviceCopyFn;
 
-  // Add a shape lookup function to the registry.
-  void RegisterShapeFn(const TypeIndex& type_index,
-                       const VariantShapeFn& shape_fn);
-
-  // Returns nullptr if no shape function was found for the given TypeIndex.
-  VariantShapeFn* GetShapeFn(const TypeIndex& type_index);
-
   // Add a decode function to the registry.
   void RegisterDecodeFn(const string& type_name,
                         const VariantDecodeFn& decode_fn);
@@ -154,7 +146,6 @@ class UnaryVariantOpRegistry {
     std::size_t operator()(const TypeIndex& x) const { return x.hash_code(); }
   };
 
-  gtl::FlatMap<TypeIndex, VariantShapeFn, TypeIndexHash> shape_fns;
   gtl::FlatMap<StringPiece, VariantDecodeFn, StringPieceHasher> decode_fns;
 
   // Map std::pair<Direction, type_name> to function.
@@ -235,15 +226,6 @@ inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs,
   return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) &&
          (lhs.type_index_ == rhs.type_index_);
 }
-// Gets a TensorShape from a Tensor containing a scalar Variant.
-// Returns an Internal error if the Variant does not have a registered shape
-// function, or if it's a serialized Variant that cannot be decoded.
-//
-// REQUIRES:
-//   variant_tensor.dtype() == DT_VARIANT
-//   variant_tensor.dims() == 0
-//
-Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape);
 
 // Decodes the Variant whose data_type has a registered decode
 // function.  Returns an Internal error if the Variant does not have a
@@ -326,29 +308,6 @@ Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op,
 
 namespace variant_op_registry_fn_registration {
 
-template <typename T>
-class UnaryVariantShapeRegistration {
- public:
-  typedef std::function<Status(const T& t, TensorShape*)> LocalVariantShapeFn;
-
-  UnaryVariantShapeRegistration(const TypeIndex& type_index,
-                                const LocalVariantShapeFn& shape_fn) {
-    const string type_index_name = port::MaybeAbiDemangle(type_index.name());
-    UnaryVariantOpRegistry::Global()->RegisterShapeFn(
-        type_index,
-        [type_index_name, shape_fn](const Variant& v,
-                                    TensorShape* s) -> Status {
-          const T* t = v.get<T>();
-          if (t == nullptr) {
-            return errors::Internal(
-                "VariantShapeFn: Could not access object, type_index: ",
-                type_index_name);
-          }
-          return shape_fn(*t, s);
-        });
-  }
-};
-
 template <typename T>
 class UnaryVariantDecodeRegistration {
  public:
@@ -471,23 +430,6 @@ class UnaryVariantBinaryOpRegistration {
 
 };  // namespace variant_op_registry_fn_registration
 
-// Register a unary shape variant function with the signature:
-//    Status ShapeFn(const T& t, TensorShape* s);
-// to Variants having TypeIndex type_index.
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(T, shape_function) \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ_HELPER(             \
-      __COUNTER__, T, MakeTypeIndex<T>(), shape_function)
-
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ_HELPER(ctr, T, type_index, \
-                                                          shape_function)     \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ(ctr, T, type_index, shape_function)
-
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ(ctr, T, type_index,         \
-                                                   shape_function)             \
-  static variant_op_registry_fn_registration::UnaryVariantShapeRegistration<T> \
-      register_unary_variant_op_shape_registration_fn_##ctr(type_index,        \
-                                                            shape_function)
-
 // Register a unary decode variant function for the given type.
 #define REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, type_name) \
   REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ_HELPER(__COUNTER__, T, type_name)
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index b2443e8676e7b986992fd130d5e162818e5fe075..e1a46ebd59d6ae8503d5ae3b31d4f31c7a6f1be1 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -39,13 +39,6 @@ namespace {
 
 struct VariantValue {
   string TypeName() const { return "TEST VariantValue"; }
-  static Status ShapeFn(const VariantValue& v, TensorShape* s) {
-    if (v.early_exit) {
-      return errors::InvalidArgument("early exit!");
-    }
-    *s = TensorShape({-0xdeadbeef});
-    return Status::OK();
-  }
   static Status CPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v,
                                VariantValue* v_out) {
     if (v.early_exit) {
@@ -89,8 +82,6 @@ struct VariantValue {
   int value;
 };
 
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, VariantValue::ShapeFn);
-
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(VariantValue, "TEST VariantValue");
 
 INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION(
@@ -113,38 +104,6 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
 
 }  // namespace
 
-TEST(VariantOpShapeRegistryTest, TestBasic) {
-  class Blah {};
-  EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetShapeFn(MakeTypeIndex<Blah>()),
-            nullptr);
-
-  auto* shape_fn = UnaryVariantOpRegistry::Global()->GetShapeFn(
-      MakeTypeIndex<VariantValue>());
-  EXPECT_NE(shape_fn, nullptr);
-  TensorShape shape;
-
-  VariantValue vv_early_exit{true /* early_exit */};
-  Variant v = vv_early_exit;
-  Status s0 = (*shape_fn)(v, &shape);
-  EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
-
-  VariantValue vv_ok{false /* early_exit */};
-  v = vv_ok;
-  TF_EXPECT_OK((*shape_fn)(v, &shape));
-  EXPECT_EQ(shape, TensorShape({-0xdeadbeef}));
-}
-
-TEST(VariantOpShapeRegistryTest, TestDuplicate) {
-  UnaryVariantOpRegistry registry;
-  UnaryVariantOpRegistry::VariantShapeFn f;
-  class FjFjFj {};
-  const auto kTypeIndex = MakeTypeIndex<FjFjFj>();
-  registry.RegisterShapeFn(kTypeIndex, f);
-  EXPECT_DEATH(registry.RegisterShapeFn(kTypeIndex, f),
-               "FjFjFj already registered");
-}
-
 TEST(VariantOpDecodeRegistryTest, TestBasic) {
   EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetDecodeFn("YOU SHALL NOT PASS"),
             nullptr);
diff --git a/tensorflow/core/framework/variant_tensor_data.cc b/tensorflow/core/framework/variant_tensor_data.cc
index 3e67e4a86405819925f153400340145821cce414..c169e867ec44947d8a761aa1f3be0e06188f974b 100644
--- a/tensorflow/core/framework/variant_tensor_data.cc
+++ b/tensorflow/core/framework/variant_tensor_data.cc
@@ -43,6 +43,12 @@ Tensor* VariantTensorData::add_tensors() {
   return &(tensors_[tensors_.size() - 1]);
 }
 
+template <typename... TensorConstructorArgs>
+Tensor* VariantTensorData::add_tensor(TensorConstructorArgs&&... args) {
+  tensors_.emplace_back(std::forward<TensorConstructorArgs>(args)...);
+  return &tensors_.back();
+}
+
 void VariantTensorData::ToProto(VariantTensorDataProto* proto) const {
   proto->set_type_name(type_name());
   proto->set_metadata(metadata_);
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index 8c69c870345a68a2c5fc5f1f33015c7bb97c123e..ca99e83f9c7e575de622349749f8dfb98ab37747 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -68,6 +68,11 @@ class VariantTensorData {
   const std::vector<Tensor>& tensors() const;
   Tensor* add_tensors();
 
+  // A more general version of add_tensors. Parameters are perfectly forwarded
+  // to the constructor of the tensor added here.
+  template <typename... TensorConstructorArgs>
+  Tensor* add_tensor(TensorConstructorArgs&&... args);
+
   // Conversion to and from VariantTensorDataProto
   void ToProto(VariantTensorDataProto* proto) const;
   // This allows optimizations via std::move.
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 9b4200e0b47ec37ddbef1e375e1955c6ec814caf..5ad1c19dc1a7bbbd087628a41f613d9d44377147 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -22,25 +22,29 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
-
-void DFS(const Graph& g, const std::function<void(Node*)>& enter,
-         const std::function<void(Node*)>& leave,
-         const NodeComparator& stable_comparator,
-         const EdgeFilter& edge_filter) {
+namespace {
+template <typename T>
+void DFSFromHelper(const Graph& g, gtl::ArraySlice<T> start,
+                   const std::function<void(T)>& enter,
+                   const std::function<void(T)>& leave,
+                   const NodeComparator& stable_comparator,
+                   const EdgeFilter& edge_filter) {
   // Stack of work to do.
   struct Work {
-    Node* node;
+    T node;
     bool leave;  // Are we entering or leaving n?
   };
-  std::vector<Work> stack;
-  stack.push_back(Work{g.source_node(), false});
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
 
   std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
     Work w = stack.back();
     stack.pop_back();
 
-    Node* n = w.node;
+    T n = w.node;
     if (w.leave) {
       leave(n);
       continue;
@@ -80,6 +84,23 @@ void DFS(const Graph& g, const std::function<void(Node*)>& enter,
     }
   }
 }
+}  // namespace
+
+void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+         const std::function<void(Node*)>& leave,
+         const NodeComparator& stable_comparator,
+         const EdgeFilter& edge_filter) {
+  DFSFromHelper(g, {g.source_node()}, enter, leave, stable_comparator,
+                edge_filter);
+}
+
+void DFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+             const std::function<void(const Node*)>& enter,
+             const std::function<void(const Node*)>& leave,
+             const NodeComparator& stable_comparator,
+             const EdgeFilter& edge_filter) {
+  DFSFromHelper(g, start, enter, leave, stable_comparator, edge_filter);
+}
 
 void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
                 const std::function<void(Node*)>& leave,
@@ -222,11 +243,12 @@ bool FixupSourceAndSinkEdges(Graph* g) {
   bool changed = false;
   for (Node* n : g->nodes()) {
     if (!n->IsSource() && n->in_edges().empty()) {
-      g->AddControlEdge(g->source_node(), n);
+      g->AddControlEdge(g->source_node(), n,
+                        true /* skip test for duplicates */);
       changed = true;
     }
     if (!n->IsSink() && n->out_edges().empty()) {
-      g->AddControlEdge(n, g->sink_node());
+      g->AddControlEdge(n, g->sink_node(), true /* skip test for duplicates */);
       changed = true;
     }
   }
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 45f8a29a92d5201af626c77a6aa07daf1a756b6d..3479605df86e37dc52388651d049968d02239e19 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -55,6 +55,18 @@ extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
                 const NodeComparator& stable_comparator = {},
                 const EdgeFilter& edge_filter = {});
 
+// Perform a depth-first-search on g starting at the 'start' nodes.
+// If enter is not empty, calls enter(n) before visiting any children of n.
+// If leave is not empty, calls leave(n) after visiting all children of n.
+// If stable_comparator is set, a stable ordering of visit is achieved by
+// sorting a node's neighbors first before visiting them.
+// If edge_filter is set then ignores edges for which edge_filter returns false.
+extern void DFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                    const std::function<void(const Node*)>& enter,
+                    const std::function<void(const Node*)>& leave,
+                    const NodeComparator& stable_comparator = {},
+                    const EdgeFilter& edge_filter = {});
+
 // Perform a reverse depth-first-search on g starting at the sink node.
 // If enter is not empty, calls enter(n) before visiting any parents of n.
 // If leave is not empty, calls leave(n) after visiting all parents of n.
diff --git a/tensorflow/core/graph/collective_order.cc b/tensorflow/core/graph/collective_order.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c24166cb8e139c75370ff07e8367f5d01b41a039
--- /dev/null
+++ b/tensorflow/core/graph/collective_order.cc
@@ -0,0 +1,202 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/graph/collective_order.h"
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/core/graph/algorithm.h"
+
+namespace tensorflow {
+namespace {
+
+// Find all CollectiveReduce nodes and the existing data dependencies between
+// them.
+Status DiscoverDataDependencies(
+    const Graph* graph, std::vector<Node*>* collective_nodes,
+    std::vector<int32>* instance_keys,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>>* data_dependencies) {
+  Status s;
+  // Algorithm: do Reverse DFS starting at sink.  `node_leave` is called when
+  // all parents of `node` have been visited.  At that point,
+  // `data_dependencies[node]` is a list containing `instance_key` of every
+  // `CollectiveReduce` on which `node` has a data dependency.
+  // For this node's children, add all these instance keys.  Also, if this node
+  // is collective, add as a dependency for the children.
+  auto node_leave = [collective_nodes, instance_keys, data_dependencies,
+                     &s](Node* node) {
+    int32 instance_key;
+    bool enter_node =
+        node->IsCollective() && node->type_string() == "CollectiveReduce";
+    if (enter_node) {
+      Status get_attr_status =
+          GetNodeAttr(node->attrs(), "instance_key", &instance_key);
+      s.Update(get_attr_status);
+      collective_nodes->push_back(node);
+      instance_keys->push_back(instance_key);
+      VLOG(2) << "collective node " << node->DebugString();
+    }
+    const auto& node_deps = (*data_dependencies)[node];
+    for (const Edge* out_edge : node->out_edges()) {
+      auto& child_deps = (*data_dependencies)[out_edge->dst()];
+      child_deps.insert(node_deps.begin(), node_deps.end());
+      if (enter_node && s.ok()) {
+        child_deps.insert(instance_key);
+      }
+    }
+  };
+  ReverseDFS(*graph, nullptr, node_leave);
+  return s;
+}
+
+// Given a list of `collective_nodes` and `data_dependencies` between the
+// collective nodes, create control dependencies between concurrent collectives
+// and store in `dependency_edges`.
+// If there exists an edge a -> b then `dependency_edges[a]` contains `b`
+Status CreateControlDependencies(
+    const std::vector<Node*>& collective_nodes,
+    const std::vector<int32>& instance_keys,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>>* data_dependencies,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>>* dependency_edges) {
+  // If there exists some path a -> ... -> b then `all_paths[a]` contains `b`
+  absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>> all_paths;
+  for (int i = 0; i < collective_nodes.size() - 1; i++) {
+    if (!collective_nodes[i]->IsCollective() ||
+        collective_nodes[i]->type_string() != "CollectiveReduce") {
+      return errors::Internal("Unexpected node ",
+                              collective_nodes[i]->DebugString());
+    }
+    const auto& deps_i = (*data_dependencies)[collective_nodes[i]];
+    for (int j = i + 1; j < collective_nodes.size(); j++) {
+      if (collective_nodes[i]->requested_device() !=
+          collective_nodes[j]->requested_device()) {
+        continue;
+      }
+      if (instance_keys[i] == instance_keys[j]) {
+        return errors::Internal("Unexpected same instance_key ",
+                                instance_keys[i],
+                                " on 2 nodes with the same device ",
+                                collective_nodes[i]->requested_device());
+      }
+      const auto& deps_j = (*data_dependencies)[collective_nodes[j]];
+      if (deps_i.find(instance_keys[j]) == deps_i.end() &&
+          deps_j.find(instance_keys[i]) == deps_j.end()) {
+        int src_idx = instance_keys[i] > instance_keys[j] ? i : j;
+        int dst_idx = instance_keys[i] > instance_keys[j] ? j : i;
+        Node* src_node = collective_nodes[src_idx];
+        Node* dst_node = collective_nodes[dst_idx];
+        VLOG(1) << "Adding control dependency from node " << src_node->name()
+                << " instance " << instance_keys[src_idx] << " to node "
+                << dst_node->name() << " instance " << instance_keys[dst_idx];
+        (*dependency_edges)[src_node].insert(dst_node);
+        auto& src_paths = all_paths[src_node];
+        src_paths.insert(dst_node);
+        for (Node* downstream_node : all_paths[dst_node]) {
+          src_paths.insert(downstream_node);
+        }
+      }
+    }
+  }
+
+  // Prune dependency edges so that if there are edges a -> b, b -> c, and a ->
+  // c, then remove a -> c.  This dependency would be handled naturally during
+  // op scheduling.
+  for (int i = 0; i < collective_nodes.size(); ++i) {
+    Node* node = collective_nodes[i];
+    auto& neighbor_set = (*dependency_edges)[node];
+    std::vector<Node*> neighbor_list(neighbor_set.begin(), neighbor_set.end());
+    // For all n1, n2 in `neighbor_list` if there is a path from n1 -> n2 then
+    // eliminate n2 from `neighbor_set` and `neighbor_list`.  We remove from
+    // `neighbor_list` by replacing with a `nullptr`, hence the `nullptr` checks
+    // below.
+    for (int j = 0; j < neighbor_list.size(); ++j) {
+      Node* n1 = neighbor_list[j];
+      if (n1 == nullptr) continue;
+      auto& n1_paths = all_paths[n1];
+      for (int k = 0; k < neighbor_list.size(); ++k) {
+        Node* n2 = neighbor_list[k];
+        if (j == k || n2 == nullptr) continue;
+        if (n1_paths.find(n2) != n1_paths.end()) {
+          neighbor_set.erase(n2);
+          neighbor_list[k] = nullptr;
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+// Insert control dependencies defined by `dependency_edges` in `graph`.  If
+// `order_type` is `kEdges`, insert explicit control edges, else if `order_type`
+// is `kAttrs`, encode depdencies as an attribute on collective node.
+Status InsertControlDependencies(
+    Graph* graph, GraphCollectiveOrder order_type,
+    const absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>>&
+        dependency_edges) {
+  if (order_type == GraphCollectiveOrder::kEdges) {
+    for (const auto& pair : dependency_edges) {
+      Node* src_node = pair.first;
+      for (Node* dst_node : pair.second) {
+        graph->AddControlEdge(src_node, dst_node);
+      }
+    }
+  } else if (order_type == GraphCollectiveOrder::kAttrs) {
+    // `wait_for` is the inverse of `dependency_edges`, i.e. `wait_for[node]`
+    // contains the list of instance keys for which `node` must wait.
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>> wait_for;
+    for (const auto& pair : dependency_edges) {
+      int32 src_instance;
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(pair.first->attrs(), "instance_key", &src_instance));
+      for (Node* dst_node : pair.second) {
+        wait_for[dst_node].insert(src_instance);
+      }
+    }
+    for (const auto& pair : wait_for) {
+      std::vector<int32> wait_for_list(pair.second.begin(), pair.second.end());
+      pair.first->ClearAttr("wait_for");
+      pair.first->AddAttr("wait_for", wait_for_list);
+    }
+  } else {
+    return errors::Internal("Unexpected GraphCollectiveOrder type ",
+                            static_cast<int>(order_type));
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type) {
+  // `instance_keys[i]` corresponds to `collective_nodes[i]`
+  std::vector<Node*> collective_nodes;
+  std::vector<int32> instance_keys;
+  // node -> set of collectives on which node depends.
+  absl::flat_hash_map<Node*, absl::flat_hash_set<int32>> data_dependencies;
+  TF_RETURN_IF_ERROR(DiscoverDataDependencies(
+      graph, &collective_nodes, &instance_keys, &data_dependencies));
+
+  if (collective_nodes.empty()) return Status::OK();
+
+  absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>> dependency_edges;
+  // For all pairs of collective nodes n1 and n2 on the same device, if n1 does
+  // not depend on n2 and n2 does not depend on n1, then they are potentially
+  // concurrent.  Create an arbitrary, deterministic ordering between them.
+  TF_RETURN_IF_ERROR(CreateControlDependencies(
+      collective_nodes, instance_keys, &data_dependencies, &dependency_edges));
+
+  return InsertControlDependencies(graph, order_type, dependency_edges);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/collective_order.h b/tensorflow/core/graph/collective_order.h
new file mode 100644
index 0000000000000000000000000000000000000000..67a1427a96635f08d0fbe9f77f92d4d213a93dd8
--- /dev/null
+++ b/tensorflow/core/graph/collective_order.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_
+#define TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+enum class GraphCollectiveOrder { kNone, kEdges, kAttrs };
+
+// Introduces a deterministic execution order between potentially concurrent
+// CollectiveOps.  This may be used to execute collectives in the same order
+// across all workers in a distributed execution, if all workers are executing
+// the same graph.
+// If `order_type` is `kEdges`, introduce the ordering in the form of explicit
+// control edges between collective graph nodes.  If `order_type` is `kAttrs`,
+// add an attribute to the node which may be used by collective executor to
+// ensure the required ordering.
+Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_
diff --git a/tensorflow/core/graph/collective_order_test.cc b/tensorflow/core/graph/collective_order_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a158e5c3fd040ca2242249aec51f701e785a4b6
--- /dev/null
+++ b/tensorflow/core/graph/collective_order_test.cc
@@ -0,0 +1,235 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/graph/collective_order.h"
+
+#include <gmock/gmock.h>
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+using ::testing::UnorderedElementsAreArray;
+
+REGISTER_OP("TestParams").Output("o: float");
+
+// Verifies that the list of collective nodes in `graph` matches
+// `expected_collective_nodes`, and that the list of control edges between these
+// collective nodes matches `expected_collective_control_edges`.
+void VerifyGraph(const Graph& graph,
+                 const std::vector<string>& expected_collective_nodes,
+                 const std::vector<std::pair<string, string>>&
+                     expected_collective_control_edges) {
+  std::vector<string> actual_collective_nodes;
+  std::vector<std::pair<string, string>> actual_collective_control_edges;
+  for (const Node* src : graph.nodes()) {
+    if (!src->IsCollective()) {
+      continue;
+    }
+    actual_collective_nodes.push_back(src->name());
+    for (const Edge* edge : src->out_edges()) {
+      VLOG(2) << "collective edge " << edge->src()->name() << " -> "
+              << edge->dst()->name();
+      // Add all control edges found except those to `_SINK`.
+      if (!edge->IsControlEdge() || edge->dst()->name() == "_SINK") {
+        continue;
+      }
+      actual_collective_control_edges.emplace_back(src->name(),
+                                                   edge->dst()->name());
+    }
+  }
+  EXPECT_THAT(actual_collective_nodes,
+              UnorderedElementsAreArray(expected_collective_nodes));
+  EXPECT_THAT(actual_collective_control_edges,
+              UnorderedElementsAreArray(expected_collective_control_edges));
+}
+
+// Verifies that the `wait_for` attribute on collective nodes matches
+// `wait_for_map`.
+void VerifyAttrs(
+    const Graph& graph,
+    const std::unordered_map<string, std::vector<int32>> wait_for_map) {
+  for (const Node* node : graph.nodes()) {
+    if (node->IsCollective() ||
+        wait_for_map.find(node->name()) == wait_for_map.end()) {
+      continue;
+    }
+    std::vector<int32> wait_for_actual;
+    TF_EXPECT_OK(GetNodeAttr(node->attrs(), "wait_for", &wait_for_actual));
+    auto wait_for_expected = wait_for_map.at(node->name());
+    EXPECT_THAT(wait_for_actual, UnorderedElementsAreArray(wait_for_expected));
+  }
+}
+
+Node* CollectiveReduceNode(GraphDefBuilder* builder, Node* input,
+                           const string& name, const string& device,
+                           int instance_key) {
+  Node* collective_node =
+      ops::UnaryOp("CollectiveReduce", input,
+                   builder->opts()
+                       .WithName(name)
+                       .WithDevice(device)
+                       .WithAttr("T", DT_FLOAT)
+                       .WithAttr("group_size", 2)
+                       .WithAttr("group_key", 1)
+                       .WithAttr("instance_key", instance_key)
+                       .WithAttr("merge_op", "Add")
+                       .WithAttr("final_op", "Id")
+                       .WithAttr("subdiv_offsets", {1}));
+  return collective_node;
+}
+
+// Initialize the following graph:
+//
+//       (cpu0) (cpu1)
+//         a      b
+//         |      |
+//         c1     c1
+//         |      |
+//         id     id
+//        /  \   /  \
+//       c2  c3 c2  c3
+//
+// Here ci denotes a collective node with `instance_key` i.  `a` and `b` are
+// inputs, `id` is identity node.
+std::unique_ptr<Graph> InitGraph() {
+  GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  const string dev1 = "/job:localhost/replica:0/task:0/device:CPU:1";
+  Node* a = ops::SourceOp("TestParams",
+                          builder.opts().WithName("a").WithDevice(dev0));
+  Node* b = ops::SourceOp("TestParams",
+                          builder.opts().WithName("b").WithDevice(dev1));
+  Node* c1_0 = CollectiveReduceNode(&builder, a, "c1_0", dev0, 1);
+  Node* c1_1 = CollectiveReduceNode(&builder, b, "c1_1", dev1, 1);
+  Node* id0 = ops::UnaryOp(
+      "Identity", c1_0,
+      builder.opts().WithName("id0").WithDevice(dev0).WithAttr("T", DT_FLOAT));
+  Node* id1 = ops::UnaryOp(
+      "Identity", c1_1,
+      builder.opts().WithName("id1").WithDevice(dev1).WithAttr("T", DT_FLOAT));
+  CollectiveReduceNode(&builder, id0, "c2_0", dev0, 2);
+  CollectiveReduceNode(&builder, id1, "c2_1", dev1, 2);
+  CollectiveReduceNode(&builder, id0, "c3_0", dev0, 3);
+  CollectiveReduceNode(&builder, id1, "c3_1", dev1, 3);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  Status s = GraphDefBuilderToGraph(builder, graph.get());
+  if (!s.ok()) {
+    LOG(FATAL) << "Error building graph " << s;
+  }
+  return graph;
+}
+
+// Tests that in the graph created by `InitGraph`, exactly 2 control edges are
+// added after calling `OrderCollectives`: c3_0 -> c2_0 and c3_1 -> c2_1.
+TEST(CollectiveOrderTest, SimpleOrder) {
+  std::unique_ptr<Graph> graph = InitGraph();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kEdges));
+  VerifyGraph(*graph, {"c1_0", "c1_1", "c2_0", "c2_1", "c3_0", "c3_1"},
+              {{"c3_0", "c2_0"}, {"c3_1", "c2_1"}});
+}
+
+TEST(CollectiveOrderTest, SimpleOrderAttr) {
+  std::unique_ptr<Graph> graph = InitGraph();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kAttrs));
+  VerifyAttrs(*graph, {{"c2_0", {3}}, {"c2_1", {3}}});
+}
+
+// Initialize the following graph:
+//
+//         a
+//         |
+//         c1
+//        /  \
+//       c4  id
+//          /  \
+//         c2  c3
+//
+// Here ci denotes a collective node with `instance_key` i.  `a` is an input,
+// `id` is identity node.
+std::unique_ptr<Graph> InitGraph2() {
+  GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  Node* a = ops::SourceOp("TestParams",
+                          builder.opts().WithName("a").WithDevice(dev0));
+  Node* c1 = CollectiveReduceNode(&builder, a, "c1", dev0, 1);
+  CollectiveReduceNode(&builder, c1, "c4", dev0, 4);
+  Node* id = ops::UnaryOp(
+      "Identity", c1,
+      builder.opts().WithName("id").WithDevice(dev0).WithAttr("T", DT_FLOAT));
+  CollectiveReduceNode(&builder, id, "c2", dev0, 2);
+  CollectiveReduceNode(&builder, id, "c3", dev0, 3);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  Status s = GraphDefBuilderToGraph(builder, graph.get());
+  if (!s.ok()) {
+    LOG(FATAL) << "Error building graph " << s;
+  }
+  return graph;
+}
+
+// Tests that in the graph created by `InitGraph2`, we add the following control
+// edges after calling `OrderCollectives`: c4 -> c3, c3 -> c2.  c4->c2 is
+// pruned because it follows from the other two edges.
+TEST(CollectiveOrderTest, SimpleOrder2) {
+  std::unique_ptr<Graph> graph = InitGraph2();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kEdges));
+  VerifyGraph(*graph, {"c1", "c2", "c3", "c4"}, {{"c4", "c3"}, {"c3", "c2"}});
+}
+
+// Initialize the following graph:
+//
+//         w   x   y   z
+//         |   |   |   |
+//         c1  c2  c3  c4
+//
+std::unique_ptr<Graph> InitGraphForPruning() {
+  GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  Node* w = ops::SourceOp("TestParams",
+                          builder.opts().WithName("w").WithDevice(dev0));
+  Node* x = ops::SourceOp("TestParams",
+                          builder.opts().WithName("x").WithDevice(dev0));
+  Node* y = ops::SourceOp("TestParams",
+                          builder.opts().WithName("y").WithDevice(dev0));
+  Node* z = ops::SourceOp("TestParams",
+                          builder.opts().WithName("z").WithDevice(dev0));
+  CollectiveReduceNode(&builder, w, "c1", dev0, 1);
+  CollectiveReduceNode(&builder, x, "c2", dev0, 2);
+  CollectiveReduceNode(&builder, y, "c3", dev0, 3);
+  CollectiveReduceNode(&builder, z, "c4", dev0, 4);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  Status s = GraphDefBuilderToGraph(builder, graph.get());
+  if (!s.ok()) {
+    LOG(FATAL) << "Error building graph " << s;
+  }
+  return graph;
+}
+
+// Tests that in the graph created by `InitGraphForPruning`, we only add c4 ->
+// c3, c3 -> c2, c2 -> c1, and other edges are pruned away.
+TEST(CollectiveOrderTest, Pruning) {
+  std::unique_ptr<Graph> graph = InitGraphForPruning();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kAttrs));
+  VerifyAttrs(*graph, {{"c3", {4}}, {"c2", {3}}, {"c1", {2}}});
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 8e1e56d29bc474dedf7c0b01dbdf8099ebf86c4d..66237a349796929d17bab473a390e9bba35480ad 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -59,7 +59,7 @@ Status ValidateControlFlowInfo(const Graph* graph,
           "Invalid loop structure: Mismatched parent frames for \"",
           cf.frame_name, "\": \"", parent->name, "\" vs \"", frame.parent->name,
           "\". The node giving this error: ", FormatNodeForError(*node),
-          "This is an internal bug, please file a bug report with "
+          ". This is an internal bug, please file a bug report with "
           "instructions on how to reproduce the error.");
     }
     if (IsLoopCond(node)) {
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 3ea222c13c5aa06f708bce61454cef9c24e56c8b..00d3549312aee9669eb588ace593f347263c1a11 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -85,6 +85,10 @@ const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
         {"CollectiveBcastSend", NC_COLLECTIVE},
         {"CollectiveBcastRecv", NC_COLLECTIVE},
         {"FakeParam", NC_FAKE_PARAM},
+        {"IteratorGetNext", NC_DATASET},
+        {"IteratorGetNextSync", NC_DATASET},
+        {"DatasetToSingleElement", NC_DATASET},
+        {"ReduceDataset", NC_DATASET},
     });
 
 #undef REF_CLASS
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 289a3d2a2307280830e23b8b12513e20feccb153..f65e4b921efb3298bad090198a0e1d32c31b8fd3 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -174,6 +174,8 @@ class Node {
   bool IsMetadata() const { return class_ == NC_METADATA; }
   bool IsFakeParam() const { return class_ == NC_FAKE_PARAM; }
 
+  bool IsDataset() const { return class_ == NC_DATASET; }
+
   template <typename T>
   void AddAttr(const string& name, const T& val) {
     SetAttrValue(val, AddAttrHelper(name));
@@ -254,6 +256,7 @@ class Node {
     NC_SCOPED_ALLOCATOR,
     NC_COLLECTIVE,
     NC_FAKE_PARAM,
+    NC_DATASET,
     NC_OTHER  // Not a special kind of node
   };
 
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index f213eb7c107c92be55d4efcf7b8551f1ac282154..00c7a5b091c0dbfbcf08a3611faaab4d41a08152 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -58,22 +59,15 @@ struct DupRecvKey {
   int src_output_slot;       // Edge's src node output slot
   GraphDef* dst_graph;       // Edge's dst node is in this subgraph
   bool recv_output_on_host;  // The output of recv is on host
-};
 
-struct DupRecvKeyHash {
-  size_t operator()(const DupRecvKey& k) const {
-    size_t h = Hash64(reinterpret_cast<const char*>(&k.src_node_id),
-                      sizeof(k.src_node_id), k.src_output_slot);
-    h = Hash64(reinterpret_cast<const char*>(&k.dst_graph), sizeof(k.dst_graph),
-               h);
-    h = Hash64(reinterpret_cast<const char*>(&k.recv_output_on_host),
-               sizeof(k.recv_output_on_host), h);
-    return h;
+  template <typename H>
+  friend H AbslHashValue(H h, const DupRecvKey& c) {
+    return H::combine(std::move(h), c.src_node_id, c.src_output_slot,
+                      reinterpret_cast<std::uintptr_t>(c.dst_graph),
+                      c.recv_output_on_host);
   }
-};
 
-struct DupRecvKeyEq {
-  bool operator()(const DupRecvKey& x, const DupRecvKey& y) const {
+  friend bool operator==(const DupRecvKey& x, const DupRecvKey& y) {
     return (x.src_node_id == y.src_node_id) &&
            (x.src_output_slot == y.src_output_slot) &&
            (x.dst_graph == y.dst_graph) &&
@@ -88,19 +82,26 @@ struct RecvInfo {
   int64 start_time;
 };
 
-typedef std::unordered_map<DupRecvKey, RecvInfo, DupRecvKeyHash, DupRecvKeyEq>
-    DupRecvTable;
+typedef absl::flat_hash_map<DupRecvKey, RecvInfo> DupRecvTable;
 
-struct PairIntHash {
- public:
-  std::size_t operator()(const std::pair<int, int>& x) const {
-    return std::hash<int>()(x.first) ^ std::hash<int>()(x.second);
-  }
-};
 // A map used to store memory types for the inputs/outputs of every node.
 // The key is a pair of ints consisting of a node id and input/output index.
-typedef std::unordered_map<std::pair<int, int>, MemoryType, PairIntHash>
-    MemoryTypeMap;
+// TODO(power): migrate back to std::pair when absl::Hash is fixed for MSVC.
+struct NodePort {
+  int node_id;
+  int index;
+
+  friend bool operator==(const NodePort& x, const NodePort& y) {
+    return x.node_id == y.node_id && x.index == y.index;
+  }
+
+  template <typename H>
+  friend H AbslHashValue(H h, const NodePort& c) {
+    return H::combine(std::move(h), c.node_id, c.index);
+  }
+};
+
+typedef absl::flat_hash_map<NodePort, MemoryType> MemoryTypeMap;
 
 // We collect the following information about the graph before performing
 // graph partitioning.
@@ -564,10 +565,10 @@ Status BuildMemoryDeviceInfo(const Graph& g, GraphInfo* info) {
 
     int node_id = node->id();
     info->device_types[node_id] = DeviceType(parsed.type);
-    for (size_t i = 0; i < input_memory_types.size(); ++i) {
+    for (int i = 0; i < input_memory_types.size(); ++i) {
       info->input_types[{node_id, i}] = input_memory_types[i];
     }
-    for (size_t i = 0; i < output_memory_types.size(); ++i) {
+    for (int i = 0; i < output_memory_types.size(); ++i) {
       info->output_types[{node_id, i}] = output_memory_types[i];
     }
   }
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 3c868dc22261fae7ebc061ce7a0aec51477dfdc1..1d839208e9e7969820173b8e9e13e0e041058609 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -258,6 +258,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv3d = "Conv3D";
     csinfo_.conv3d_grad_input = "Conv3DBackpropInputV2";
     csinfo_.conv3d_grad_filter = "Conv3DBackpropFilterV2";
+    csinfo_.depthwise_conv2d = "DepthwiseConv2dNative";
+    csinfo_.depthwise_conv2d_grad_input = "DepthwiseConv2dNativeBackpropInput";
+    csinfo_.depthwise_conv2d_grad_filter =
+        "DepthwiseConv2dNativeBackpropFilter";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
     csinfo_.fused_conv2d = "_FusedConv2D";
@@ -277,10 +281,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
         "_MklConv2DBackpropFilterWithBias";
+    csinfo_.mkl_depthwise_conv2d_grad_input =
+        "_MklDepthwiseConv2dNativeBackpropInput";
+    csinfo_.mkl_depthwise_conv2d_grad_filter =
+        "_MklDepthwiseConv2dNativeBackpropFilter";
     csinfo_.mkl_fused_conv2d = "_MklFusedConv2D";
     csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D";
+    csinfo_.mkl_pad_with_fused_conv2d = "_MklPadWithFusedConv2D";
     csinfo_.pad = "Pad";
     csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
+    csinfo_.pad_with_fused_conv2d = "__MklDummyPadWithFusedConv2D";
 // Temporarily don't convert quantized operators into MKL versions for now.
 // TODO(Intel-tf) Once all the relevant PRs have been merged then remove
 // the ifdef.
@@ -377,6 +387,17 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.conv3d_grad_input,
                       mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_input),
                       CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.depthwise_conv2d,
+                      mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d),
+                      CopyAttrsConv2DDepthwise, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.depthwise_conv2d_grad_input,
+         mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_input),
+         CopyAttrsConv2DDepthwise, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.depthwise_conv2d_grad_filter,
+         mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_filter),
+         CopyAttrsConv2DDepthwise, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
@@ -419,6 +440,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d,
                       CopyAttrsPadWithConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.pad_with_fused_conv2d,
+                      csinfo_.mkl_pad_with_fused_conv2d,
+                      CopyAttrsPadWithFusedConv2D, AlwaysRewrite});
 #ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.quantized_avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
@@ -529,10 +553,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
                       GetConv2DBackpropFilterOrBiasAddGrad});
-    minfo_.push_back(
-        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
     // Merge Pad and Conv2d, only if the pad op is "Pad"
     // Doesn't merge if pad op is "PadV2" or "MirrorPad"
+    minfo_.push_back(
+        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
+
+    minfo_.push_back({csinfo_.pad, csinfo_.fused_conv2d,
+                      csinfo_.pad_with_fused_conv2d, GetPadOrFusedConv2D});
 
     // The fusion patterns in "finfo_" that show up first will get applied
     // first, for example, graph "A->B->C-D" and finfo_ is {A->B->C to ABC,
@@ -675,6 +702,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string conv3d;
     string conv3d_grad_input;
     string conv3d_grad_filter;
+    string depthwise_conv2d;
+    string depthwise_conv2d_grad_input;
+    string depthwise_conv2d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
     string fused_conv2d;
@@ -694,11 +724,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_grad_filter;
     string mkl_conv2d_grad_filter_with_bias;
     string mkl_conv2d_with_bias;
+    string mkl_depthwise_conv2d_grad_input;
+    string mkl_depthwise_conv2d_grad_filter;
     string mkl_fused_conv2d;
     string mkl_pad_with_conv2d;
+    string mkl_pad_with_fused_conv2d;
     string mul;
     string pad;
     string pad_with_conv2d;
+    string pad_with_fused_conv2d;
     string quantized_avg_pool;
     string quantized_conv2d;
     string quantized_conv2d_with_requantize;
@@ -838,6 +872,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_NOTNULL(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     if (m->type_string() == csinfo_.bias_add) {
       // If a is BiasAdd, then Conv2D is 0th input of BiasAdd.
       TF_CHECK_OK(m->input_node(0, &n));
@@ -872,6 +912,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     DCHECK(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     const Node* conv_node;
     if (m->type_string() == csinfo_.pad) {
       // If m is Pad, then Conv2D is the output of Pad.
@@ -912,6 +958,59 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     return n;
   }
+
+  // Find Pad or _FusedConv2D node that can be merged with input node 'm'.
+  // If input 'm' is Pad, then check if there exists _FusedConv2D node that can
+  // be merged with 'm'. If input 'm' is _FusedConv2D, then check if there
+  // exists Pad node that can be merged with 'm'.
+  static Node* GetPadOrFusedConv2D(const Node* m) {
+    DCHECK(m);
+    Node* n = nullptr;
+
+    const Node* conv_node;
+    if (m->type_string() == csinfo_.pad) {
+      // If m is Pad, then _FusedConv2D is the output of Pad.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.fused_conv2d) {
+          n = e->dst();
+          conv_node = n;
+          break;
+        }
+      }
+    } else {
+      DCHECK_EQ(m->type_string(), csinfo_.fused_conv2d);
+      // If m is _FusedConv2D, Go over all input edges
+      // and search for Pad node.
+      for (const Edge* e : m->in_edges()) {
+        if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) {
+          n = e->src();
+          conv_node = m;
+          break;
+        }
+      }
+    }
+    // Check if only VALID type of padding is used or not.
+    if (n != nullptr) {
+      string padding;
+      TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding));
+      if (padding != "VALID") {
+        // Then do not merge.
+        n = nullptr;
+        VLOG(1) << "MklLayoutRewritePass: Could match Pad and _FusedConv2D "
+                << "nodes but cannot merge them. Only conv ops with padding "
+                << "type VALID can be merged with Pad op Input node: "
+                << m->DebugString();
+      }
+    } else {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Pad and _FusedConv2D node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
+
   // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
   // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
   // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
@@ -930,6 +1029,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_NOTNULL(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     if (m->type_string() == csinfo_.bias_add_grad) {
       // Get 1st input 'g' of BiasAddGrad.
       Node* g = nullptr;
@@ -1025,7 +1130,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
             e->dst_input() == kPermTensorIndex) {
           // we find the "perm" node, now try to retrieve its value.
           const TensorProto* proto = nullptr;
-          DCHECK(GetNodeAttr(perm_node->def(), "value", &proto).ok());
+          TF_CHECK_OK(GetNodeAttr(perm_node->def(), "value", &proto));
 
           DataType type;
           GetNodeAttr(perm_node->def(), "dtype", &type);
@@ -1386,6 +1491,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                               bool change_format = false);
   static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb,
                                 bool change_format = false);
+  static void CopyAttrsConv2DDepthwise(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
   static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
                             bool change_format = false);
   static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb,
@@ -1400,9 +1507,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                            bool change_format = false);
   static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb,
                                      bool change_format = false);
+  static void CopyAttrsPadWithFusedConv2D(const Node* orig_node,
+                                          NodeBuilder* nb,
+                                          bool change_format = false);
   static void CopyAttrsFromPadAndConv2D(const Node* orig_node1,
                                         const Node* orig_node2, NodeBuilder* nb,
                                         bool change_format = false);
+  static void CopyAttrsFromPadAndFusedConv2D(const Node* orig_node1,
+                                             const Node* orig_node2,
+                                             NodeBuilder* nb,
+                                             bool change_format = false);
   static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb,
                                bool change_format = false);
   static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb,
@@ -1620,6 +1734,7 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     for (const Edge* e : filter_node->out_edges()) {
       if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
            e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_pad_with_fused_conv2d ||
            e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias ||
            e->dst()->type_string() == csinfo_.mkl_fused_conv2d) &&
           e->dst_input() == kConv2DFilterInputSlotIdx
@@ -2008,6 +2123,24 @@ void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node,
   nb->Attr("Tpaddings", Tpaddings);
 }
 
+void MklLayoutRewritePass::CopyAttrsPadWithFusedConv2D(const Node* orig_node,
+                                                       NodeBuilder* nb,
+                                                       bool change_format) {
+  DataType Tpaddings;
+
+  CopyAttrsFusedConv2D(orig_node, nb, change_format);
+
+  // Get attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
+  // Check if filter is a constant.
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
+  // Add attributes to new node.
+  nb->Attr("Tpaddings", Tpaddings);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
+}
+
 // Used with MergePadWithConv2D
 void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
                                                      const Node* orig_node2,
@@ -2042,6 +2175,66 @@ void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
   nb->Attr("Tpaddings", Tpaddings);
 }
 
+void MklLayoutRewritePass::CopyAttrsFromPadAndFusedConv2D(
+    const Node* fused_conv2d, const Node* pad, NodeBuilder* nb,
+    bool change_format) {
+  DataType T;
+  int num_args;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  float epsilon;
+  std::vector<string> fused_ops;
+  DataType Tpaddings;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "num_args", &num_args));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "fused_ops", &fused_ops));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(pad->def(), "Tpaddings", &Tpaddings));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_args", num_args);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("dilations", dilations);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("Tpaddings", Tpaddings);
+  nb->Attr("fused_ops", fused_ops);
+}
+
+void MklLayoutRewritePass::CopyAttrsConv2DDepthwise(const Node* orig_node,
+                                                    NodeBuilder* nb,
+                                                    bool change_format) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+}
+
 void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
                                          bool change_format) {
   DataType T;
@@ -2410,7 +2603,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gnu;
+  bool use_cudnn_on_gpu;
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
@@ -2418,7 +2611,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   TF_CHECK_OK(GetNodeAttr(pred->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
   // We check to ensure that data formats of both succ and pred are same.
   // We expect them to be same, so we can enforce this as assert.
   // But assert can be too strict, so we enforce this as a check.
@@ -2543,11 +2736,15 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
 
 Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
                                                 Node* m, Node* n) {
-  DCHECK(((m->type_string() == csinfo_.pad &&
-           n->type_string() == csinfo_.conv2d)) ||
-         ((n->type_string() == csinfo_.pad &&
-           m->type_string() == csinfo_.conv2d)));
-
+  DCHECK((m->type_string() == csinfo_.pad &&
+          (n->type_string() == csinfo_.conv2d ||
+           n->type_string() == csinfo_.fused_conv2d)) ||
+         (n->type_string() == csinfo_.pad &&
+          (m->type_string() == csinfo_.conv2d ||
+           m->type_string() == csinfo_.fused_conv2d)));
+
+  bool is_fused_conv2d = n->type_string() == csinfo_.fused_conv2d ||
+                         m->type_string() == csinfo_.fused_conv2d;
   // Conv2D is successor node, and Pad predecessor node.
   Node* pred = m->type_string() == csinfo_.pad ? m : n;
   Node* succ = m->type_string() == csinfo_.pad ? n : m;
@@ -2558,18 +2755,14 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gnu;
+
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations));
-  // Data format for pad is not available and not necessary, thus
-  // dont need to match data format for Pad
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
-  // Check if the data types and devices of both succ and pred are the same.
-  // Assert is not used,  because it can be too strict.
+  // Check if the devices of both succ and pred are the same.
+  // Assert is not used because it can be too strict.
   // Don't need to check for data formats because it is not available in Pad.
   if (T_pred != T_succ ||
       pred->assigned_device_name() != succ->assigned_device_name() ||
@@ -2613,29 +2806,45 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   }
   DCHECK_EQ(PadDataInputEdges, 2);
 
-  // Conv2D must have 2 data inputs: pad output and Filter
+  // Conv2D must have 2 data inputs: Pad output and Filter
+  // FusedConv2D have 3 data inputs: Pad output, Filter and Args;
   int ConvDataInputEdges = 0;
   for (const Edge* e : succ->in_edges()) {
     if (!e->IsControlEdge()) {
       ConvDataInputEdges++;
     }
   }
-  DCHECK_EQ(ConvDataInputEdges, 2);
+
+  DCHECK_EQ(ConvDataInputEdges, is_fused_conv2d ? 3 : 2);
 
   // We will use the node name of Conv2D as the name of new node
   // Build new node. We use same name as original node, but change the op
   // name.
-  NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d);
+
+  NodeBuilder nb(succ->name(), is_fused_conv2d ? csinfo_.pad_with_fused_conv2d
+                                               : csinfo_.pad_with_conv2d);
   nb.Input(pred_in[0].first, pred_in[0].second);  // In1 (input data)  of Pad
   // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 (filter) of conv2d
   // In1 of Conv2D is same as output of Pad.
   // Thus, only need to add In2 of Conv2D
-  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
 
-  // Copy attributes from Pad and conv2D to PadWithConv2D.
-  CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
-                            const_cast<const Node*>(pred), &nb);
+  if (is_fused_conv2d) {
+    // FusedConv2D has one additional input, args
+    std::vector<NodeBuilder::NodeOut> args;
+    args.emplace_back(succ_in[2].first, succ_in[2].second);
+    nb.Input(gtl::ArraySlice<NodeBuilder::NodeOut>{
+        args});                                     // In3 (args) of FusedConv2D
+    nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+    // Copy attributes from Pad and FusedConv2D to PadWithFusedConv2D.
+    CopyAttrsFromPadAndFusedConv2D(const_cast<const Node*>(succ),
+                                   const_cast<const Node*>(pred), &nb);
+  } else {
+    nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+    // Copy attributes from Pad and conv2D to PadWithConv2D.
+    CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
+                              const_cast<const Node*>(pred), &nb);
+  }
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -2833,10 +3042,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
         m->type_string() == csinfo_.conv2d))) {
     return this->MergeConv2DWithBiasAdd(g, m, n);
   }
-  if (((m->type_string() == csinfo_.pad &&
-        n->type_string() == csinfo_.conv2d)) ||
-      ((n->type_string() == csinfo_.pad &&
-        m->type_string() == csinfo_.conv2d))) {
+  if ((m->type_string() == csinfo_.pad &&
+       (n->type_string() == csinfo_.conv2d ||
+        (n->type_string() == csinfo_.fused_conv2d && FusedConv2DRewrite(n)))) ||
+      (n->type_string() == csinfo_.pad &&
+       (m->type_string() == csinfo_.conv2d ||
+        (m->type_string() == csinfo_.fused_conv2d && FusedConv2DRewrite(m))))) {
     return this->MergePadWithConv2D(g, m, n);
   }
 
@@ -2990,6 +3201,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // names do not match Mkl node names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
       n->type_string() != csinfo_.pad_with_conv2d &&
+      n->type_string() != csinfo_.pad_with_fused_conv2d &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
       n->type_string() != csinfo_.fused_conv2d &&
       !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
@@ -3110,8 +3322,9 @@ Status MklLayoutRewritePass::FuseTransposeMklOpTranspose(
   for (const Edge* e : transpose_to_nchw->out_edges()) {
     if (!e->IsControlEdge()) {
       const int kTransposeWithMklOpOutputSlot = 0;
-      DCHECK((*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot, e->dst(),
-                           e->dst_input()));
+      auto new_edge = (*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot,
+                                    e->dst(), e->dst_input());
+      DCHECK(new_edge);
     }
   }
 
@@ -3312,7 +3525,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
 
-#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
   for (Node* n : order) {
@@ -3334,7 +3546,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
     }
   }
   DumpGraph("After running MklLayoutRewritePass(NodeFusion)", &**g);
-#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
 
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 197ec0c4aebcaab0a57c6b021dc146a9c6534db1..987d56f15d1c1b98ebdd30b67fc8948b97753362 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -707,7 +707,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) {
       "C:control->DMT/_0:control;C:control->DMT/_1:control;"
       "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1");
 }
-#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+
 TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Positive) {
   InitGraph(
       "node { name: 'Input0' op: 'Input'}"
@@ -1016,7 +1016,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Negative) {
       "Transpose0:control->DMT/"
       "_1:control;Transpose1->Relu;Transpose1:control->DMT/_2:control");
 }
-#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to rewriting node to Mkl node
@@ -1045,6 +1044,28 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
             "DMT/_1->C:3");
 }
 
+// Test case for the Depthwise FWD pass
+TEST_F(MklLayoutPassTest, NodeRewrite_DepthwiseConv2dNative_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'DepthwiseConv2dNative'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Input);C(_MklDepthwiseConv2dNative);D(Zeta);DMT/_0(Const);"
+      "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+      "DMT/_1->C:3");
+}
+
 // 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
 // have 2 outputs, both of which will be inputs to next Conv2D.
 TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
@@ -1222,6 +1243,338 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Negative2) {
             "D(_FusedConv2D);E(Zeta)|A->D;B->D:1;C->D:2;C->E:1;D->E");
 }
 
+// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E)
+// After layout pass
+// _MklPadWithFusedConv2D(A, D, E, B, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+      "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+      "G(Zeta)|A->F;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+      "A:control->DMT/_2:control;A:control->DMT/_3:control;B->F:3;D->F:1;DMT/"
+      "_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/_3->F:7;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd+Relu fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E) (With relu)
+// G = Zeta(F, E)
+// After layout pass
+// _MklPadWithFusedConv2D(A, D, E, B, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta)|A->F;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->F:3;"
+            "D->F:1;DMT/_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/"
+            "_3->F:7;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with unsupported fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias),
+// F = _FusedConv2D(C, D, E) (With Unsupported), G = Zeta(F, E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'Unsupported'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);E(Input);F(_FusedConv2D);G("
+            "Zeta)|A->C;B->C:1;C->F;D->F:1;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
+// padding is SAME type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C,D,E)
+// G = Zeta(F,E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/"
+            "_1(Const);DMT/_2(Const);E(Input);F(_MklFusedConv2D);G(Zeta)|A->C;"
+            "B->C:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "C:control->DMT/_2:control;D->F:1;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd+Relu fusion
+// padding is SAME type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C,D,E)(With relu)
+// G = Zeta(F,E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/"
+            "_1(Const);DMT/_2(Const);E(Input);F(_MklFusedConv2D);G(Zeta)|A->C;"
+            "B->C:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "C:control->DMT/_2:control;D->F:1;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;E->G:1;F->G");
+}
+
+// Tests that there are no duplicate input control edges after merge.
+// If both the merging ops have input control edges from a common op
+// then, the merged op will have only one control edge from that
+// common op. This test only add additional input control edge check
+// based on the previous test NodeMerge_PadWithFusedConv2D_Positive1
+// padding is VALID type
+// A = input(image), X = input, B = input(paddings),
+// C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E)
+// X:control->C:control
+// X:control->F:control
+// After layout pass:
+// _MklPadWithFusedConv2D(A, D, B, F, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+// X:control->E:control (only one control edge)
+TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithFusedConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['F', 'E']}");
+  Node* x = FindNode("X");
+  Node* c = FindNode("C");
+  Node* f = FindNode("F");
+  const Edge* edge = graph_.AddControlEdge(x, c);
+  const Edge* edge_1 = graph_.AddControlEdge(x, f);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta);X(Input)|A->F;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;"
+            "A:control->DMT/_3:control;B->F:3;D->F:1;DMT/_0->F:4;"
+            "DMT/_1->F:5;DMT/_2->F:6;DMT/_3->F:7;E->F:2;E->G:1;F->G;"
+            "X:control->F:control");
+}
+
+// ts that there are no duplicate output control edges after merge.
+// If both the merging ops have output control edge to a common op,
+// then after merge, the merged op will have only one control edge
+// to that commom op. This test only add additional output control edge check
+// based on the previous test NodeMerge_PadWithFusedConv2D_Positive1
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E), X = input
+// C:control->X:control
+// F:control->X:control
+// After layout pass:
+// _MklPadWithFusedConv2D(A, D, B, F, DMT/_0, DMT/_1, DMT/_2, DMT/_2)
+// F:control->X:control (only one control edge)
+TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithFusedConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['F', 'E']}");
+  Node* x = FindNode("X");
+  Node* c = FindNode("C");
+  Node* f = FindNode("F");
+  const Edge* edge = graph_.AddControlEdge(c, x);
+  const Edge* edge_1 = graph_.AddControlEdge(f, x);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta);X(Input)|A->F;A:control->DMT/_0:control;A:control->DMT/"
+            "_1:control;A:control->DMT/_2:control;A:control->DMT/"
+            "_3:control;B->F:3;D->F:1;DMT/_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/"
+            "_3->F:7;E->F:2;E->G:1;F->G;F:control->X:control");
+}
+
+// Pad + _FusedConv2D with padding is VALID,
+// Input node pointing to both Pad and _FusedConv2D
+// Output of both Pad and _FusedConv2D feeds one node (G as Output2)
+// A = input(as image), B = input(as paddings), C = Pad(A, B)
+// E = input(as bias), F = _FusedConv2D(C, A, E), G = Output(C, F)
+// After layout pass - No merging, since Pad and _FusedConv2D both
+// feed to the same node (Z)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Common_InOutput) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'A', 'E']}"
+      "node { name: 'G' op: 'Output2'"
+      " input: ['C', 'F']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);E(Input);F(_MklFusedConv2D);G(Output2)|A->C;A->F:1;B->C:"
+            "1;C->F;C->G;C:control->DMT/_0:control;C:control->DMT/"
+            "_1:control;C:control->DMT/_2:control;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;F->G:1");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -1268,6 +1621,55 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
             "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
+TEST_F(MklLayoutPassTest,
+       NodeRewrite_DepthwiseConv2dNativeGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_"
+            "MklDepthwiseConv2dNativeBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_DepthwiseConv2dNativeGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_"
+            "MklDepthwiseConv2dNativeBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
 // Check that we never rewrite BiasAddGrad.
 TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
   InitGraph(
@@ -2301,6 +2703,29 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
 }
 
+TEST_F(MklLayoutPassTest,
+       NodeRewrite_DepthwiseConv2dNativeGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D("
+            "DepthwiseConv2dNativeBackpropFilter);E(Zeta)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
diff --git a/tensorflow/core/graph/validate.cc b/tensorflow/core/graph/validate.cc
index e44eb91d4883f3e8a6ad34e96d8dcd9d9076298b..4487f738c8e97e803618ae483b4551b47fd14c33 100644
--- a/tensorflow/core/graph/validate.cc
+++ b/tensorflow/core/graph/validate.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/graph/validate.h"
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -113,5 +115,16 @@ Status ValidateGraphHasNoCycle(const Graph& graph) {
   return Status::OK();
 }
 
+Status VerifyNoDuplicateNodeNames(const GraphDef& graph) {
+  absl::flat_hash_set<absl::string_view> nodes;
+  for (const auto& node : graph.node()) {
+    if (nodes.contains(node.name())) {
+      return errors::AlreadyExists("Node already exists: ", node.name());
+    }
+    nodes.insert(node.name());
+  }
+  return Status::OK();
+}
+
 }  // namespace graph
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/validate.h b/tensorflow/core/graph/validate.h
index 08879dca6037bcab21f4cbf107b3829c1b6600e8..bfb3a25ac91761449b1762fa2125d7758cc8c560 100644
--- a/tensorflow/core/graph/validate.h
+++ b/tensorflow/core/graph/validate.h
@@ -59,6 +59,9 @@ void GetOpListForValidation(
 // be less than the total node count.
 Status ValidateGraphHasNoCycle(const Graph& graph);
 
+// Returns OK if the graph has no duplicate node names.
+Status VerifyNoDuplicateNodeNames(const GraphDef& graph);
+
 }  // namespace graph
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index d58cdc3c5baf02f89cff52ef0396816cb00b48a3..f6a0d2614acfe147eb65b75fb843bc84d0b6dbeb 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -147,5 +147,36 @@ TEST(GetOpListForValidationTest, ShouldStripDocs) {
   EXPECT_TRUE(found_has_docs);
 }
 
+TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) {
+  const string graph_def_str =
+      "node { name: 'A' op: 'FloatInput' }"
+      "node { name: 'B' op: 'Int32Input' }"
+      "node { "
+      "       name: 'C' op: 'Sum' "
+      "       attr { key: 'T' value { type: DT_FLOAT } }"
+      "       input: ['A', 'B'] "
+      "}";
+  GraphDef graph_def;
+  auto parser = protobuf::TextFormat::Parser();
+  CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
+  TF_ASSERT_OK(graph::VerifyNoDuplicateNodeNames(graph_def));
+}
+
+TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) {
+  const string graph_def_str =
+      "node { name: 'A' op: 'FloatInput' }"
+      "node { name: 'A' op: 'Int32Input' }"
+      "node { "
+      "       name: 'C' op: 'Sum' "
+      "       attr { key: 'T' value { type: DT_FLOAT } }"
+      "       input: ['A', 'A'] "
+      "}";
+  GraphDef graph_def;
+  auto parser = protobuf::TextFormat::Parser();
+  CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
+  EXPECT_EQ(graph::VerifyNoDuplicateNodeNames(graph_def).code(),
+            tensorflow::error::ALREADY_EXISTS);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 6de12192ba87fe2020a5ae2474dc1fd59b2ac366..9fe699360feec6686312a4d11b67b4f411832126 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -27,6 +27,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -62,6 +63,36 @@ tf_cuda_library(
     ],
 )
 
+cc_library(
+    name = "graph_topology_view",
+    srcs = ["graph_topology_view.cc"],
+    hdrs = ["graph_topology_view.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_view",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "graph_topology_view_test",
+    srcs = ["graph_topology_view_test.cc"],
+    deps = [
+        ":graph_topology_view",
+        ":graph_view",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "graph_view",
     srcs = ["graph_view.cc"],
@@ -178,9 +209,12 @@ cc_library(
         ":graph_view",
         ":op_types",
         ":utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -201,5 +235,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/tensorflow/core/grappler/clusters/utils_test.cc b/tensorflow/core/grappler/clusters/utils_test.cc
index 3863d62980fb20611285d3efeade1aa998f1a1f3..3cf72fd8170ca271124d59135d592e2db1ba9b67 100644
--- a/tensorflow/core/grappler/clusters/utils_test.cc
+++ b/tensorflow/core/grappler/clusters/utils_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
@@ -82,12 +83,14 @@ TEST(UtilsTest, GetDeviceInfo) {
 
 #if GOOGLE_CUDA
   // Invalid platform GPU id.
-  GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100));
+  TF_ASSERT_OK(
+      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
   properties = GetDeviceInfo(device);
   EXPECT_EQ("UNKNOWN", properties.type());
 
   // Valid platform GPU id.
-  GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0));
+  TF_ASSERT_OK(
+      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
   device.id = 1;
   properties = GetDeviceInfo(device);
   EXPECT_EQ("GPU", properties.type());
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index dbd8f26c286f07107a63e9c745c442b171f29aaa..118f74e8b01171e3780317b4ea36750c66a22b98 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -67,13 +67,17 @@ Status VirtualCluster::Run(const GraphDef& graph,
                            const std::vector<string>& fetch,
                            RunMetadata* metadata) {
   // Initialize a virtual scheduler to process the graph. Make sure to use
-  // static shape inference to prevent the schedulrer from calling the Run
-  // method on the cluster, and create an infinite loop.
+  // static shape inference to prevent the scheduler from calling the Run
+  // method on the cluster and creating an infinite loop.
   GrapplerItem item;
   item.graph = graph;
   item.feed = feed;
   item.fetch = fetch;
-  VirtualScheduler scheduler(true, this, node_manager_.get());
+  // Note that we do not use aggressive shape inference to preserve unknown
+  // shapes from the input graph.
+  VirtualScheduler scheduler(/*use_static_shapes=*/true,
+                             /*use_aggressive_shape_inference=*/false, this,
+                             node_manager_.get());
   TF_RETURN_IF_ERROR(scheduler.Init(&item));
 
   if (metadata) {
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 15dc7074b9c035bc31e3b1ed6132329b1c7768f5..35ca93d9345d30c834c753e9c3ef7b25ca5ed8d5 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -41,8 +41,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler:mutable_graph_view",
@@ -171,6 +169,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index b7804ffaa5378c67028b39819a07fc00719c9896..5baf306f6fe39e80fc006ed1183eb70aa5fb5180 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -104,19 +104,19 @@ AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
                                                  bool use_static_shapes)
     : AnalyticalCostEstimator(
           cluster, absl::make_unique<OpLevelCostEstimator>(),
-          ReadyNodeManagerFactory("FirstReady"), use_static_shapes, nullptr) {}
+          ReadyNodeManagerFactory("FirstReady"), use_static_shapes) {}
 
 AnalyticalCostEstimator::AnalyticalCostEstimator(
     Cluster* cluster, std::unique_ptr<OpLevelCostEstimator> node_estimator,
-    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes,
-    RunMetadata* run_metadata)
+    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes)
     : cluster_(cluster),
       node_estimator_(std::move(node_estimator)),
       node_manager_(std::move(node_manager)),
-      use_static_shapes_(use_static_shapes),
-      run_metadata_(run_metadata) {
-  scheduler_ = absl::make_unique<VirtualScheduler>(use_static_shapes_, cluster_,
-                                                   node_manager_.get());
+      use_static_shapes_(use_static_shapes) {
+  // Use aggressive static shape inference to minimize unknown shapes.
+  scheduler_ = absl::make_unique<VirtualScheduler>(
+      use_static_shapes_,
+      /*use_aggressive_shape_inference=*/true, cluster_, node_manager_.get());
 }
 
 Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
@@ -124,9 +124,8 @@ Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
   return Status::OK();
 }
 
-// TODO(b/67607683): unify logic with VirtualCluster logic
 Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
-                                             CostGraphDef* cost_graph,
+                                             RunMetadata* run_metadata,
                                              Costs* costs) const {
   GrapplerItem item = item_;
   item.graph = optimized_graph;
@@ -138,7 +137,9 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   }
 
   gtl::FlatMap<string, CostGraphDef::Node*> name_to_cost_node;
-  if (cost_graph) {
+  CostGraphDef* cost_graph = nullptr;
+  if (run_metadata) {
+    cost_graph = run_metadata->mutable_cost_graph();
     // TODO(pcma): Clear nodes in cost_graph after we make sure we always pass
     // in an empty cost_graph (a non-empty but incomplete cost_graph will cause
     // problems, e.g., no node_id in cost_graph)
@@ -179,18 +180,13 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
   }
 
-  *costs = scheduler_->Summary(run_metadata_);
-  // run_metadata_ gets step_stats and parition_graphs from Summary.
-  // Note that cost_graph could already point to the cost_graph field of
-  // run_metadata_, since both are set by the caller.
-  if (run_metadata_ && cost_graph &&
-      run_metadata_->mutable_cost_graph() != cost_graph)
-    *run_metadata_->mutable_cost_graph() = *cost_graph;
+  // run_metadata gets step_stats and partition_graphs from Summary.
+  *costs = scheduler_->Summary(run_metadata);
 
   if (VLOG_IS_ON(1)) {
     bool verbose = VLOG_IS_ON(2);
-    if (run_metadata_) {
-      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata_, verbose);
+    if (run_metadata) {
+      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata, verbose);
     } else {
       RunMetadata run_metadata;
       scheduler_->GenerateRunMetadata(&run_metadata);
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
index 2629672459c512a22a861bd5c0dfe0207afc38a0..d058ba411527f0c001d59ac4aaa8aeea3d422c77 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -34,21 +34,16 @@ class Cluster;
 struct GrapplerItem;
 
 // Estimate the cost of running a Grappler item based on the theoretical
-// performance of the hardware that will run the model.
+// performance of the hardware that will run the model. Note that this
+// internally uses aggressive shape inference with static shape inference.
 class AnalyticalCostEstimator : public CostEstimator {
  public:
   // Does not take ownership of cluster.
   AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
-  // Does not take ownership of cluster or run_metadata
-  //
-  // When metadata is provided, step_stats and partition_graphs fields will
-  // always be filled during PredictCosts, and the cost_graph field of metadata
-  // will be filled only when cost_graph is not nullptr when invoking
-  // PredictCosts.
   AnalyticalCostEstimator(Cluster* cluster,
                           std::unique_ptr<OpLevelCostEstimator> node_estimator,
                           std::unique_ptr<ReadyNodeManager> node_manager,
-                          bool use_static_shapes, RunMetadata* run_metadata);
+                          bool use_static_shapes);
   ~AnalyticalCostEstimator() override {}
 
   // Initializes the estimator for the specified grappler item.
@@ -56,10 +51,10 @@ class AnalyticalCostEstimator : public CostEstimator {
   Status Initialize(const GrapplerItem& item) override;
 
   // Predict the performance of each node of the optimized graph and annotate
-  // the CostGraphDef with the corresponding estimates. Also returns the
+  // the RunMetadata with the corresponding estimates. Also returns the
   // expected cost for the whole graph.
-  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
-                      Costs* cost) const override;
+  Status PredictCosts(const GraphDef& optimized_graph,
+                      RunMetadata* run_metadata, Costs* cost) const override;
 
   const VirtualScheduler* GetScheduler() const { return scheduler_.get(); }
 
@@ -70,8 +65,6 @@ class AnalyticalCostEstimator : public CostEstimator {
   std::unique_ptr<ReadyNodeManager> node_manager_;
   bool use_static_shapes_;
   std::unique_ptr<VirtualScheduler> scheduler_;
-
-  RunMetadata* run_metadata_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index a9a1abfa989c9d8276b6ae263b95e7a71be41c8a..eb7ee8dc0a10147d6bfe201f21d437579850b6d9 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -98,9 +98,9 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   AnalyticalCostEstimator estimator(cluster_.get(), true);
   TF_ASSERT_OK(estimator.Initialize(item));
 
-  CostGraphDef cost_graph;
+  RunMetadata run_metadata;
   Costs summary;
-  TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
+  TF_ASSERT_OK(estimator.PredictCosts(item.graph, &run_metadata, &summary));
 
   EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
   // Note there are totally 17 nodes (RandomUniform creates 2 nodes), but
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index e3b3a36b096da807d05bee50d52a7a5c37884b52..d85ae0b77f923e9c7678eb9d8dd0a9f128ac5846 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <cmath>
 #include <unordered_map>
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 class GraphDef;
@@ -215,14 +216,14 @@ class CostEstimator {
 
   // Predicts the cost of running the given optimized version of the grappler
   // item.
-  // If a CostGraphDef is passed, it will be populated with detailed information
+  // If a RunMetadata is passed, it will be populated with detailed information
   // about the cost of running each operation of the optimized graph.
   // if a double value is passed, it will be set to a value that reflects the
   // overall cost of running the graph (e.g. the latency of the computation).
   // Returns a status that indicate is the performance could be estimated or
   // not.
   virtual Status PredictCosts(const GraphDef& optimized_graph,
-                              CostGraphDef* cost_graph, Costs* cost) const = 0;
+                              RunMetadata* run_metadata, Costs* cost) const = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index d69997989643972b4dfe7159ecbd9fa0901c7381..d0ac87c962b48d0fa39126a1b6c7c4e7c89f8bb8 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -15,12 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
-#include <limits>
-#include <list>
-#include <queue>
-#include <unordered_map>
-#include <unordered_set>
-#include "absl/memory/memory.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -44,7 +38,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-using TensorVector = gtl::InlinedVector<TensorValue, 4>;
 
 namespace {
 
@@ -52,6 +45,7 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeAndType;
 using shape_inference::ShapeHandle;
+using TensorVector = gtl::InlinedVector<TensorValue, 4>;
 
 template <typename Handle>
 struct HashHandle {
@@ -416,6 +410,7 @@ NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
   return MakeConstNodeDefFromTensorProto(
       ic, MakeTensorProtoFromShape(ic, shape, tensor_as_shape, dtype), dtype);
 }
+
 }  // namespace
 
 // Queue of nodes to process. Nodes can be enqueued in any order, but will be
@@ -425,9 +420,11 @@ NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
 // information is refined.
 class TopoQueue {
  public:
-  explicit TopoQueue(const std::unordered_map<const NodeDef*, int>& topo_order)
-      : topo_order_(topo_order) {}
+  explicit TopoQueue(const std::vector<const NodeDef*>& topo_order)
+      : topo_order_(TopoOrder(topo_order)) {}
+
   void push(const NodeDef* n) { queue_.emplace(n, topo_order_.at(n)); }
+
   const NodeDef* pop() {
     CHECK(!empty());
     auto it = queue_.begin();
@@ -448,10 +445,49 @@ class TopoQueue {
       return lhs.second < rhs.second;
     }
   };
-  const std::unordered_map<const NodeDef*, int>& topo_order_;
+
+  const std::unordered_map<const NodeDef*, int> TopoOrder(
+      const std::vector<const NodeDef*>& topo_order) const {
+    std::unordered_map<const NodeDef*, int> map;
+    map.reserve(topo_order.size());
+    for (int i = 0; i < topo_order.size(); ++i) {
+      map.emplace(topo_order[i], i);
+    }
+    return map;
+  }
+
+  const std::unordered_map<const NodeDef*, int> topo_order_;
   std::set<NodeAndId, OrderByIdAscending> queue_;
 };
 
+bool IsNumericType(const DataType dtype) {
+  static const gtl::FlatSet<DataType>* const kRealNumberTypes =
+      CHECK_NOTNULL((new gtl::FlatSet<DataType>{
+          // Floating point.
+          DT_BFLOAT16,
+          DT_HALF,
+          DT_FLOAT,
+          DT_DOUBLE,
+          // Int / UInt.
+          DT_INT8,
+          DT_INT16,
+          DT_INT32,
+          DT_INT64,
+          DT_UINT8,
+          DT_UINT16,
+          DT_UINT32,
+          DT_UINT64,
+          // Quantized Int.
+          DT_QINT8,
+          DT_QUINT8,
+          DT_QINT16,
+          DT_QINT32,
+          // Bool.
+          DT_BOOL,
+      }));
+  return kRealNumberTypes->find(dtype) != kRealNumberTypes->end();
+}
+
 bool IsWhiteListedOpTypeForEvaluateNode(const string& op_type) {
   static const gtl::FlatSet<string>* const kOpTpeWhitelist =
       CHECK_NOTNULL((new gtl::FlatSet<string>{
@@ -496,6 +532,7 @@ bool IsWhiteListedOpTypeForEvaluateNode(const string& op_type) {
           "Split",
           "Range",
           "Fill",
+          "Cast",
       }));
   return kOpTpeWhitelist->find(op_type) != kOpTpeWhitelist->end();
 }
@@ -594,8 +631,8 @@ class SymbolicShapeRefiner {
                                           " was not found in the graph.");
       }
 
-      InferenceContext* input_inference_context = GetContext(input_node);
-      if (input_inference_context == nullptr) {
+      InferenceContext* input_ic = GetContext(input_node);
+      if (input_ic == nullptr) {
         return errors::FailedPrecondition(
             "Inference context has not been created for ", input_tensor.node());
       }
@@ -603,8 +640,8 @@ class SymbolicShapeRefiner {
       int output_port_num = input_tensor.index();
       AttrValue attr_output_shape;
       TensorShapeProto proto;
-      const auto& handle = input_inference_context->output(output_port_num);
-      input_inference_context->ShapeHandleToProto(handle, &proto);
+      const auto& handle = input_ic->output(output_port_num);
+      input_ic->ShapeHandleToProto(handle, &proto);
       // There may be dim.size < -1 in SymbolicShapeRefiner. Change those to -1.
       for (int i = 0; i < proto.dim_size(); i++) {
         if (proto.dim(i).size() < -1) {
@@ -656,7 +693,7 @@ class SymbolicShapeRefiner {
     ctx->output_tensor_protos.resize(grappler_function_item.output_size(),
                                      nullptr);
     for (auto const& out_arg : grappler_function_item.outputs()) {
-      if (out_arg.output_tensors.size() > 1) {
+      if (out_arg.output_nodes.size() > 1) {
         // TODO(jmdecker): Handle case of multiple output tensors
         return errors::Unimplemented(
             "Output arguments with multiple output tensors are not yet "
@@ -665,7 +702,7 @@ class SymbolicShapeRefiner {
 
       // It is guaranteed that output_tensors does not contain any control
       // inputs, so port_id >= 0.
-      TensorId out_tensor = ParseTensorName(out_arg.output_tensors[0]);
+      TensorId out_tensor = ParseTensorName(out_arg.output_nodes[0]);
 
       const NodeDef* retnode = gv.GetNode(out_tensor.node());
       if (retnode == nullptr) {
@@ -702,145 +739,132 @@ class SymbolicShapeRefiner {
     return Status::OK();
   }
 
+  // Prepares input shapes/values/handles, then runs shape inference, and
+  // finally sets output shapes/values/handles.
   Status UpdateNode(const NodeDef* node, bool* refined) {
-    NodeContext* node_context = GetNodeContext(node);
-    if (node_context == nullptr) {
+    NodeContext* ctx = GetNodeContext(node);
+    if (ctx == nullptr) {
       TF_RETURN_IF_ERROR(AddNode(node));
-      node_context = CHECK_NOTNULL(GetNodeContext(node));
+      ctx = CHECK_NOTNULL(GetNodeContext(node));
       *refined = true;
     }
 
     // Check if the shapes of the nodes in the fan-in of this node have changed,
     // and if they have, update the node input shapes.
-    InferenceContext* inference_context = node_context->inference_context.get();
-    std::vector<Tensor> const_values(inference_context->num_inputs());
-    std::vector<const Tensor*> input_tensors(inference_context->num_inputs(),
-                                             nullptr);
-    std::vector<ShapeHandle> input_tensors_as_shapes(
-        inference_context->num_inputs());
-    node_context->input_tensor_protos.resize(inference_context->num_inputs(),
-                                             nullptr);
-
-    for (int dst_input = 0; dst_input < inference_context->num_inputs();
-         ++dst_input) {
-      GraphView::InputPort port(node, dst_input);
-      for (const GraphView::OutputPort fanin : graph_.GetFanin(port)) {
-        int src_output = fanin.port_id;
-        const NodeDef* input = fanin.node;
-        NodeContext* c = GetNodeContext(input);
-        if (c == nullptr) {
-          return errors::FailedPrecondition(
-              "Input ", dst_input, " ('", input->name(), "') for '",
-              node->name(),
-              "' was not previously added to SymbolicShapeRefiner.");
-        }
+    InferenceContext* ic = ctx->inference_context.get();
+    std::vector<Tensor> const_values(ic->num_inputs());
+    std::vector<const Tensor*> input_tensors(ic->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes(ic->num_inputs());
+    ctx->input_tensor_protos.resize(ic->num_inputs(), nullptr);
+
+    for (int dst_input = 0; dst_input < ic->num_inputs(); ++dst_input) {
+      const GraphView::InputPort port(node, dst_input);
+      const GraphView::OutputPort fanin = graph_.GetRegularFanin(port);
+      int src_output = fanin.port_id;
+      const NodeDef* src = fanin.node;
+      NodeContext* src_ctx = GetNodeContext(src);
+      InferenceContext* src_ic = src_ctx->inference_context.get();
+      if (src_ctx == nullptr) {
+        return errors::FailedPrecondition(
+            "Input ", dst_input, " ('", src->name(), "') for '", node->name(),
+            "' was not previously added to SymbolicShapeRefiner.");
+      }
 
-        if (src_output >= c->inference_context->num_outputs())
-          return errors::OutOfRange("src_output = ", src_output,
-                                    ", but num_outputs is only ",
-                                    c->inference_context->num_outputs());
+      if (src_output >= src_ic->num_outputs()) {
+        return errors::OutOfRange("src_output = ", src_output,
+                                  ", but num_outputs is only ",
+                                  src_ic->num_outputs());
+      }
 
-        // Propagate input node's NodeContext info to the current node's
-        // NodeContext:
-        // output_tensor_protos to input_tensor_protos and input_tensors, and
-        // output_tensors_as_shapes to input_tensors_as_shapes.
+      // Propagate input node's NodeContext info to the current node's
+      // NodeContext:
+      // output_tensor_protos to input_tensor_protos and input_tensors, and
+      // output_tensors_as_shapes to input_tensors_as_shapes.
 
-        if (c->output_tensors_as_shapes.size() > src_output) {
-          input_tensors_as_shapes[dst_input] =
-              c->output_tensors_as_shapes[src_output];
-        }
+      if (src_ctx->output_tensors_as_shapes.size() > src_output) {
+        input_tensors_as_shapes[dst_input] =
+            src_ctx->output_tensors_as_shapes[src_output];
+      }
 
-        if (c->output_tensor_protos.size() > src_output) {
-          auto* tensor_proto = c->output_tensor_protos[src_output];
-          if (tensor_proto != nullptr &&
-              const_values[dst_input].FromProto(*tensor_proto)) {
-            input_tensors[dst_input] = &const_values[dst_input];
-            node_context->input_tensor_protos[dst_input] = tensor_proto;
-
-            if (!inference_context->FullyDefined(
-                    input_tensors_as_shapes[dst_input])) {
-              // Shape from a Const is not fully defined when the Const has
-              // value -1 (e.g., Reshape(x, Const(-1)) to reshape an arbitrary
-              // tensor x to a vector).
-              // It's possible that the same Const with -1 is used in many
-              // places, but that doesn't mean the resultant shapes are
-              // identical. e.g., x1 = Reshape(x, c) and y1 = Reshape(y, c),
-              // where c is -1. In this case, shape inference yields both x1 and
-              // y1 as rank 1, size unknown, but still the shapes of x1 and y1
-              // can be different. (even if we use different Const(-1) for x1
-              // and x2, graph optimzier may merge them to single Const through
-              // duplicate removal.)
-              // If we reuse output_tensors_as_shapes to input_tensors_as_shapes
-              // by copying ShapeHandle, they share the same Shape object, and
-              // SymbolicShapeManager, later in InferStatically(), assigns the
-              // same symbolic dim value (unique value < -1); in the above
-              // Reshape example, the shapes of x1 and y1 become, for example,
-              // [-278] and graph optimizer may yield incorrect output 'cause it
-              // assumes x1 and y1 have the same shape.
-              // To prevent this, we re-create a ShapeHandle from the Const
-              // tensor, instead of reusing output_tensors_as_shapes (so that
-              // ShapeHandles of the const fanouts have the same values,
-              // but different Shape objects -- SymbolicShapeManager assigns
-              // different symbol id to each fanout shape).
-              // TODO(dyoon): clean up the way values are propagated.
-              MaybeTensorValueToShape(inference_context,
-                                      const_values[dst_input],
-                                      &input_tensors_as_shapes[dst_input]);
-            }
+      if (src_ctx->output_tensor_protos.size() > src_output) {
+        auto* tensor_proto = src_ctx->output_tensor_protos[src_output];
+        if (tensor_proto != nullptr &&
+            const_values[dst_input].FromProto(*tensor_proto)) {
+          input_tensors[dst_input] = &const_values[dst_input];
+          ctx->input_tensor_protos[dst_input] = tensor_proto;
+
+          if (!ic->FullyDefined(input_tensors_as_shapes[dst_input])) {
+            // Shape from a Const is not fully defined when the Const has
+            // value -1 (e.g., Reshape(x, Const(-1)) to reshape an arbitrary
+            // tensor x to a vector).
+            // It's possible that the same Const with -1 is used in many
+            // places, but that doesn't mean the resultant shapes are
+            // identical. e.g., x1 = Reshape(x, c) and y1 = Reshape(y, c),
+            // where c is -1. In this case, shape inference yields both x1 and
+            // y1 as rank 1, size unknown, but still the shapes of x1 and y1
+            // can be different. (even if we use different Const(-1) for x1
+            // and x2, graph optimzier may merge them to single Const through
+            // duplicate removal.)
+            // If we reuse output_tensors_as_shapes to input_tensors_as_shapes
+            // by copying ShapeHandle, they share the same Shape object, and
+            // SymbolicShapeManager, later in InferStatically(), assigns the
+            // same symbolic dim value (unique value < -1); in the above
+            // Reshape example, the shapes of x1 and y1 become, for example,
+            // [-278] and graph optimizer may yield incorrect output 'cause it
+            // assumes x1 and y1 have the same shape.
+            // To prevent this, we re-create a ShapeHandle from the Const
+            // tensor, instead of reusing output_tensors_as_shapes (so that
+            // ShapeHandles of the const fanouts have the same values,
+            // but different Shape objects -- SymbolicShapeManager assigns
+            // different symbol id to each fanout shape).
+            // TODO(dyoon): clean up the way values are propagated.
+            MaybeTensorValueToShape(ic, const_values[dst_input],
+                                    &input_tensors_as_shapes[dst_input]);
           }
         }
+      }
 
-        DCHECK_GE(dst_input, 0);
-        // NOTE: we check only shape is refined; we do not (yet) check whether
-        // tensor value is refined.
-        if (!*refined && !inference_context->input(dst_input).SameHandle(
-                             c->inference_context->output(src_output))) {
-          *refined = true;
-        }
-        inference_context->SetInput(dst_input,
-                                    c->inference_context->output(src_output));
-
-        if (!*refined &&
-            inference_context->requested_input_tensor_as_partial_shape(
-                dst_input)) {
-          // The input value may have changed. Since we have no way to know if
-          // that's indeed the case, err on the safe side.
-          *refined = true;
-        }
+      // NOTE: we check only shape is refined; we do not (yet) check whether
+      // tensor value is refined.
+      if (!*refined &&
+          !ic->input(dst_input).SameHandle(src_ic->output(src_output))) {
+        *refined = true;
+      }
+      ic->SetInput(dst_input, src_ic->output(src_output));
 
-        // Also propagate handle shape and dtype of edges which are carrying
-        // resource handles.
-        if (node_context->input_types[dst_input] == DT_RESOURCE) {
-          auto* outputs =
-              c->inference_context->output_handle_shapes_and_types(src_output);
-          if (!outputs) continue;
-          auto* inputs =
-              inference_context->input_handle_shapes_and_types(dst_input);
-
-          if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
-            *refined = true;
-          }
-          inference_context->set_input_handle_shapes_and_types(dst_input,
-                                                               *outputs);
-        }
+      if (!*refined && ic->requested_input_tensor_as_partial_shape(dst_input)) {
+        // The input value may have changed. Since we have no way to know if
+        // that's indeed the case, err on the safe side.
+        *refined = true;
+      }
+
+      // Also propagate handle shape and dtype of edges which are carrying
+      // resource handles.
+      if (ctx->input_types[dst_input] == DT_RESOURCE) {
+        auto* outputs = src_ic->output_handle_shapes_and_types(src_output);
+        if (!outputs) continue;
+        auto* inputs = ic->input_handle_shapes_and_types(dst_input);
+
+        if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs))
+          *refined = true;
+        ic->set_input_handle_shapes_and_types(dst_input, *outputs);
       }
     }
 
     // Make sure we schedule the fanout of resources (which have no input)
     // whenever the resources are updated.
-    *refined |= inference_context->num_inputs() == 0;
+    *refined |= ic->num_inputs() == 0;
 
     if (!*refined) {
       // No input shape has changed, we're done.
       return Status::OK();
     }
 
-    node_context->inference_context->set_input_tensors(input_tensors);
-    node_context->inference_context->set_input_tensors_as_shapes(
-        input_tensors_as_shapes);
+    ic->set_input_tensors(input_tensors);
+    ic->set_input_tensors_as_shapes(input_tensors_as_shapes);
 
     // Properly handle function nodes.
-    if (node_context->op_data && node_context->op_data->is_function_op) {
+    if (ctx->op_data && ctx->op_data->is_function_op) {
       // TODO(jmdecker): Detect if the input shapes have changed for this
       // function. Note that when we hit a function call node, refined will be
       // true, as the updates to the call node will have changed, even if it's
@@ -857,7 +881,7 @@ class SymbolicShapeRefiner {
     }
 
     // Update the shapes of the outputs.
-    return InferShapes(*node, node_context);
+    return InferShapes(*node, ctx);
   }
 
   Status SetUnknownShape(const NodeDef* node, int output_port) {
@@ -1074,15 +1098,20 @@ class SymbolicShapeRefiner {
         c->output_tensor_protos.size() < ic->num_outputs()) {
       return false;
     } else {
+      // Checks if we can get output value via either output_tensor_proto or
+      // output_tensors_as_shapes.
       for (int i = 0; i < ic->num_outputs(); i++) {
-        if (c->output_tensor_protos.size() <= i ||
-            c->output_tensor_protos[i] == nullptr) {
-          return false;
+        if (c->output_tensor_protos.size() > i &&
+            c->output_tensor_protos[i] != nullptr) {
+          continue;
         }
-        if (c->output_tensors_as_shapes.size() <= i ||
-            !ic->FullyDefined(c->output_tensors_as_shapes[i])) {
-          return false;
+        if (c->output_tensors_as_shapes.size() > i &&
+            ic->FullyDefined(c->output_tensors_as_shapes[i])) {
+          continue;
         }
+
+        // Unknown for output[i].
+        return false;
       }
     }
     return true;
@@ -1120,16 +1149,16 @@ class SymbolicShapeRefiner {
       return false;
     }
 
-    // Check input dtypes are integer.
+    // Check input dtypes are number types.
     for (const auto& input_type : c->input_types) {
-      if (input_type != DT_INT32 && input_type != DT_INT64) {
+      if (!IsNumericType(input_type)) {
         return false;
       }
     }
 
-    // Check output dtypes are integer.
+    // Check output dtypes are number types.
     for (const auto& output_type : c->output_types) {
-      if (output_type != DT_INT32 && output_type != DT_INT64) {
+      if (!IsNumericType(output_type)) {
         return false;
       }
     }
@@ -1450,9 +1479,9 @@ class SymbolicShapeRefiner {
       // Due to the cost of EvaluateNode(), we run it only for certain op types
       // (white listed) and small integer tensors.
 
-      const int max_elelment_size = 17;  // Max up to 4x4 matrix or similar.
+      const int max_element_size = 17;  // Max up to 4x4 matrix or similar.
       if (AllOutputValuesKnown(c) || !AllInputValuesKnown(c) ||
-          !ShouldUpdateOutputValues(c, max_elelment_size)) {
+          !ShouldUpdateOutputValues(c, max_element_size)) {
         return Status::OK();
       }
       UpdateOutputValues(node, c).IgnoreError();  // This is optional.
@@ -1663,45 +1692,52 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
 
 // Compute the output shape of the merge node as the union of the available
 // input shapes.
-Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                                        const NodeDef* node,
-                                        bool* new_shapes) const {
-  InferenceContext* c = shape_refiner->GetContext(node);
-  if (!c) {
+Status GraphProperties::UpdateMerge(SymbolicShapeRefiner* shape_refiner,
+                                    const NodeDef* node,
+                                    bool* new_shapes) const {
+  InferenceContext* ic = shape_refiner->GetContext(node);
+  if (!ic) {
     // Now we can run shape inference
     TF_RETURN_IF_ERROR(shape_refiner->AddNode(node));
-    c = CHECK_NOTNULL(shape_refiner->GetContext(node));
+    ic = CHECK_NOTNULL(shape_refiner->GetContext(node));
     *new_shapes = true;
 
     // Infer the shape of the second output once and for all since it never
     // changes.
-    ShapeHandle out1 = c->Scalar();
-    c->set_output(1, out1);
+    ShapeHandle out1 = ic->Scalar();
+    ic->set_output(1, out1);
   }
 
   ShapeHandle out;
+  const std::vector<ShapeAndType>* out_handle = nullptr;
   bool out_initialized = false;
-  for (const GraphView::Edge fanin :
-       shape_refiner->graph().GetFaninEdges(*node, false)) {
-    InferenceContext* in = shape_refiner->GetContext(fanin.src.node);
-    if (!in) {
+  for (const GraphView::Edge fanin : shape_refiner->graph().GetFaninEdges(
+           *node, /*include_controlling_edges=*/false)) {
+    InferenceContext* src_ic = shape_refiner->GetContext(fanin.src.node);
+    if (!src_ic) {
       // Handling a loop for the first time, the back edge won't have any shape
       // info.
       continue;
     }
-    ShapeHandle input = in->output(fanin.src.port_id);
-    CHECK_EQ(fanin.dst.node, node);
-    c->SetInput(fanin.dst.port_id, input);
+    ShapeHandle input = src_ic->output(fanin.src.port_id);
+    ic->SetInput(fanin.dst.port_id, input);
+    auto* input_handle =
+        src_ic->output_handle_shapes_and_types(fanin.src.port_id);
+    if (input_handle)
+      ic->set_input_handle_shapes_and_types(fanin.dst.port_id, *input_handle);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
-      continue;
+      out_handle = input_handle;
+    } else {
+      // Note here only out, not out_handle, is modified.
+      out = shape_refiner->OutputAsUnion(node, 0, input, out);
     }
-    out = shape_refiner->OutputAsUnion(node, 0, input, out);
   }
 
-  if (*new_shapes || !shape_refiner->EquivalentShapes(out, c->output(0))) {
-    c->set_output(0, out);
+  if (*new_shapes || !shape_refiner->EquivalentShapes(out, ic->output(0))) {
+    ic->set_output(0, out);
+    if (out_handle) ic->set_output_handle_shapes_and_types(0, *out_handle);
     *new_shapes = true;
   }
 
@@ -1711,26 +1747,26 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 // Manually propagate the input shape for Enter nodes.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                                     const NodeDef* node, bool* new_shapes) {
-  auto enter_ctx = shape_refiner->GetContext(node);
-  if (!enter_ctx) {
+  InferenceContext* ic = shape_refiner->GetContext(node);
+  if (!ic) {
     TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, new_shapes));
-    enter_ctx = shape_refiner->GetContext(node);
+    ic = shape_refiner->GetContext(node);
   }
 
-  GraphView::InputPort inp(node, 0);
-  GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(inp);
+  GraphView::InputPort port(node, 0);
+  GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(port);
 
-  InferenceContext* in = shape_refiner->GetContext(fanin.node);
-  ShapeHandle input = in->output(fanin.port_id);
-  if (!enter_ctx->output(0).SameHandle(input)) {
-    enter_ctx->SetInput(0, input);
-    enter_ctx->set_output(0, input);
+  InferenceContext* src_ic = shape_refiner->GetContext(fanin.node);
+  ShapeHandle input = src_ic->output(fanin.port_id);
+  if (!ic->output(0).SameHandle(input)) {
+    ic->SetInput(0, input);
+    ic->set_output(0, input);
     *new_shapes = true;
   }
-  auto* outputs = in->output_handle_shapes_and_types(fanin.port_id);
+  auto* outputs = src_ic->output_handle_shapes_and_types(fanin.port_id);
   if (outputs) {
-    enter_ctx->set_input_handle_shapes_and_types(0, *outputs);
-    enter_ctx->set_output_handle_shapes_and_types(0, *outputs);
+    ic->set_input_handle_shapes_and_types(0, *outputs);
+    ic->set_output_handle_shapes_and_types(0, *outputs);
     *new_shapes = true;
   }
   return Status::OK();
@@ -1746,7 +1782,7 @@ Status GraphProperties::UpdateShapes(
     TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, new_shapes));
   } else if (IsMerge(*n)) {
     // Properly handle merge nodes.
-    TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, new_shapes));
+    TF_RETURN_IF_ERROR(UpdateMerge(shape_refiner, n, new_shapes));
   } else if (IsEnqueue(*n)) {
     // Make sure the shapes of enqueued tensors are propagated to the queue
     // itself.
@@ -1793,8 +1829,8 @@ Status GraphProperties::PropagateShapes(
       TF_RETURN_IF_ERROR(
           UpdateShapes(shape_refiner, resource_handles, n, &updated));
       if (updated) {
-        for (const GraphView::InputPort& fanout :
-             shape_refiner->graph().GetFanouts(*n, false)) {
+        for (const auto& fanout : shape_refiner->graph().GetFanouts(
+                 *n, /*include_controlled_nodes=*/false)) {
           new_shapes->push(fanout.node);
         }
         // Make sure the corresponding queue nodes are (re)processed.
@@ -1819,7 +1855,7 @@ Status GraphProperties::PropagateShapes(
 Status GraphProperties::UpdateQueue(const NodeDef* queue_node,
                                     SymbolicShapeRefiner* shape_refiner,
                                     bool* new_shapes) {
-  auto ctx = shape_refiner->GetNodeContext(queue_node);
+  auto* ctx = shape_refiner->GetNodeContext(queue_node);
   if (!ctx) {
     TF_RETURN_IF_ERROR(shape_refiner->AddNode(queue_node));
     ctx = CHECK_NOTNULL(shape_refiner->GetNodeContext(queue_node));
@@ -1970,7 +2006,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds,
   }
 
   std::unordered_map<const NodeDef*, const NodeDef*> resource_handles;
-  std::vector<std::pair<const NodeDef*, const NodeDef*>> extra_deps;
+  std::vector<TopologicalDependency> extra_deps;
   for (const auto& resource : resources) {
     for (const NodeDef* src : resource.second.first) {
       resource_handles[src] = resource.first;
@@ -1982,8 +2018,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds,
     }
   }
 
-  std::unordered_map<const NodeDef*, int> topo_order;
-  Status s = ComputeTopologicalOrder(item_.graph, &topo_order, &extra_deps);
+  std::vector<const NodeDef*> topo_order;
+  Status s = ComputeTopologicalOrder(item_.graph, extra_deps, &topo_order);
   if (!s.ok()) {
     if (extra_deps.empty()) {
       return s;
@@ -1992,8 +2028,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds,
       // order. This will make the shape inference less precise but since this
       // isn't common it's not worth to figure out where to break the loop and
       // do a proper relaxation.
-      TF_RETURN_IF_ERROR(
-          ComputeTopologicalOrder(item_.graph, &topo_order, nullptr));
+      TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order));
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 0635222fe960d0f97e3b02741a106feb8f0e01b4..3fcad6eb1b17e0c0239c5daf17bfcf717b5e3305 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -111,8 +111,8 @@ class GraphProperties {
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.
-  Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                         const NodeDef* node, bool* new_shapes) const;
+  Status UpdateMerge(SymbolicShapeRefiner* shape_refiner, const NodeDef* node,
+                     bool* new_shapes) const;
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                             const NodeDef* node, bool* new_shapes);
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index c7f53b2cdee08bc7594d4d093c8ca77693424bfd..0a7697a21324d3c27b80e04d6c79f5eeb539bc20 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -341,6 +341,44 @@ TEST_F(GraphPropertiesTest, VarHandles) {
   EXPECT_EQ(7, prop.shape().dim(1).size());
 }
 
+TEST_F(GraphPropertiesTest, WhileLoopWithVarHandleOpInput) {
+  // Test graph is first generated in python using:
+  /*
+    i0 = tf.constant(0)
+    v = tf.get_variable(initializer=i0, name='loop_var', use_resource=True)
+    def cond(i, x):
+      return i < 3
+    def body(i, x):
+      return i + 1, x + x
+    v, y = tf.while_loop(cond, body, loop_vars=[v, tf.constant(1)])
+  */
+  // and then modified by hand such that the ReadVariableOp is inside the loop
+  // body instead of outside the while loop (which is the case when constructed
+  // using the python API), such that we have the following pattern: VarHandleOp
+  // -> Enter -> Switch -> ReadVariableOp -> other parts of loop body. Note
+  // DT_RESOURCE is passed all the way until ReadVariableOp.
+  GrapplerItem item;
+  string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
+                                 "while_loop_var_handle_op.pbtxt");
+  TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+
+  std::vector<string> resource_nodes{
+      "loop_var",       "while/Enter",         "while/Merge", "while/Switch",
+      "while/Identity", "while/NextIteration", "while/Exit"};
+  for (const string& node : resource_nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    EXPECT_GE(props.size(), 1);  // Merge has 2 outputs.
+    EXPECT_EQ("resource: []", PropToString(props[0]));
+  }
+
+  // After ReadVariableOp, the shape should be recovered.
+  const auto props = properties.GetOutputProperties("while/ReadVariableOp");
+  EXPECT_EQ(1, props.size());
+  EXPECT_EQ("int32: []", PropToString(props[0]));
+}
+
 TEST_F(GraphPropertiesTest, QueueWithOnlyDequeue_NoShapeAttr) {
   tensorflow::Scope root = tensorflow::Scope::NewRootScope();
   auto q1 = ops::FIFOQueue(root.WithOpName("Queue1"), {DataType::DT_FLOAT});
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
index 415c347a1d2d563099490b780e10008508259027..d4e23e901a46a8524c2b2ef7d2311b9cf48850e7 100644
--- a/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
@@ -511,6 +511,13 @@ library {
           s: "VALID"
         }
       }
+      attr {
+        key: "explicit_paddings"
+        value {
+          list {
+          }
+        }
+      }
       attr {
         key: "strides"
         value {
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop_var_handle_op.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop_var_handle_op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..048b8a23dca5b7c10b1b2e131863e1f0665fdc73
--- /dev/null
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop_var_handle_op.pbtxt
@@ -0,0 +1,291 @@
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "loop_var"
+  op: "VarHandleOp"
+  attr {
+    key: "_class"
+    value { list { s: "loc:@loop_var" } }
+  }
+  attr {
+    key: "container"
+    value { s: "" }
+  }
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "shape"
+    value { shape {} }
+  }
+  attr {
+    key: "shared_name"
+    value { s: "loop_var" }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "loop_var"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+  attr {
+    key: "frame_name"
+    value { s: "while/while_context" }
+  }
+  attr {
+    key: "is_constant"
+    value { b: false }
+  }
+  attr {
+    key: "parallel_iterations"
+    value { i: 10 }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "Const_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "frame_name"
+    value { s: "while/while_context" }
+  }
+  attr {
+    key: "is_constant"
+    value { b: false }
+  }
+  attr {
+    key: "parallel_iterations"
+    value { i: 10 }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value { i: 2 }
+  }
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value { i: 2 }
+  }
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node { name: "while/LoopCond" op: "LoopCond" input: "while/Less" }
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+  attr {
+    key: "_class"
+    value { list { s: "loc:@while/Merge" } }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "_class"
+    value { list { s: "loc:@while/Merge_1" } }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/ReadVariableOp"
+  op: "ReadVariableOp"
+  input: "while/Identity"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/ReadVariableOp"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/ReadVariableOp"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/add_1"
+  op: "Add"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Identity"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/add_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+versions { producer: 27 }
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index 833205ac6f12a73d96c93455bb355ee511d6700a..088ce566580c4f23c9927adfb927fbf9afd34017 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -51,8 +51,12 @@ Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
 }
 
 Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
-                                            CostGraphDef* cost_graph,
+                                            RunMetadata* run_metadata,
                                             Costs* costs) const {
+  CostGraphDef* cost_graph = nullptr;
+  if (run_metadata) {
+    cost_graph = run_metadata->mutable_cost_graph();
+  }
   const bool running_simulation = (cluster_->type() == "virtual");
 
   std::vector<double> times(measurement_steps_);
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
index 3e741c91997403e7eae438d2dd72c9a70da9316a..67145f5241ef8a5c101d5305889ff5fee823cceb 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -54,12 +54,12 @@ class MeasuringCostEstimator : public CostEstimator {
   // This implementation always returns OK.
   Status Initialize(const GrapplerItem& item) override;
 
-  // Runs the optimized version of the graph on the cluster, measure
-  // the runtimes of each operation, and annotated the CostGraphDef
-  // with the corresponding measurements.
+  // Runs the optimized version of the graph on the cluster, measures
+  // the runtimes of each operation, and annotates the CostGraphDef of
+  // RunMetadata with the corresponding measurements.
   // Returns the average latency for the whole graph.
-  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
-                      Costs* overall_cost) const override;
+  Status PredictCosts(const GraphDef& optimized_graph,
+                      RunMetadata* run_metadata, Costs* cost) const override;
 
  private:
   Cluster* cluster_;  // Not owned.
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 55eb391d2b344778df7d23528dbe42596321b95f..59d20f1fb9a730f45cfe1135526321a0e132fda1 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -54,6 +54,7 @@ constexpr char kVariable[] = "Variable";
 constexpr char kVariableV2[] = "VariableV2";
 constexpr char kRank[] = "Rank";
 constexpr char kShape[] = "Shape";
+constexpr char kShapeN[] = "ShapeN";
 constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
@@ -72,25 +73,25 @@ static const Costs::Duration kMinComputeTime(1);
 
 namespace {
 
-string GetDataFormat(const OpInfo& op_features) {
+string GetDataFormat(const OpInfo& op_info) {
   string data_format = "NHWC";  // Default format.
-  if (op_features.attr().find("data_format") != op_features.attr().end()) {
-    data_format = op_features.attr().at("data_format").s();
+  if (op_info.attr().find("data_format") != op_info.attr().end()) {
+    data_format = op_info.attr().at("data_format").s();
   }
   return data_format;
 }
 
-string GetFilterFormat(const OpInfo& op_features) {
+string GetFilterFormat(const OpInfo& op_info) {
   string filter_format = "HWIO";  // Default format.
-  if (op_features.attr().find("filter_format") != op_features.attr().end()) {
-    filter_format = op_features.attr().at("filter_format").s();
+  if (op_info.attr().find("filter_format") != op_info.attr().end()) {
+    filter_format = op_info.attr().at("filter_format").s();
   }
   return filter_format;
 }
 
-Padding GetPadding(const OpInfo& op_features) {
-  if (op_features.attr().find("padding") != op_features.attr().end() &&
-      op_features.attr().at("padding").s() == "VALID") {
+Padding GetPadding(const OpInfo& op_info) {
+  if (op_info.attr().find("padding") != op_info.attr().end() &&
+      op_info.attr().at("padding").s() == "VALID") {
     return Padding::VALID;
   }
   return Padding::SAME;  // Default padding.
@@ -107,11 +108,11 @@ bool IsTraining(const OpInfo& op_info) {
 // TODO(dyoon): support non-4D tensors in the c ost functions of convolution
 // related ops (Conv, Pool, BatchNorm, and their backprops) and the related
 // helper functions.
-std::vector<int64> GetStrides(const OpInfo& op_features) {
-  if (op_features.attr().find("strides") != op_features.attr().end()) {
-    const auto strides = op_features.attr().at("strides").list().i();
-    CHECK(strides.size() == 4) << "Attr strides is not a length-4 vector: "
-                               << op_features.DebugString();
+std::vector<int64> GetStrides(const OpInfo& op_info) {
+  if (op_info.attr().find("strides") != op_info.attr().end()) {
+    const auto strides = op_info.attr().at("strides").list().i();
+    CHECK(strides.size() == 4)
+        << "Attr strides is not a length-4 vector: " << op_info.DebugString();
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
@@ -264,6 +265,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kShapeN, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kMaxPool, wrap(&OpLevelCostEstimator::PredictMaxPool)},
       {kMaxPoolGrad, wrap(&OpLevelCostEstimator::PredictMaxPoolGrad)},
@@ -359,21 +361,21 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 }
 
 Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  auto it = device_cost_impl_.find(op_features.op());
+  const auto& op_info = op_context.op_info;
+  auto it = device_cost_impl_.find(op_info.op());
   if (it == device_cost_impl_.end()) {
-    if (elementwise_ops_.find(op_features.op()) != elementwise_ops_.end()) {
+    if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
       return PredictCwiseOp(op_context);
     }
 
-    VLOG(1) << "Missing accurate estimator for op: " << op_features.op();
+    VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
 
     return PredictCostOfAnUnknownOp(op_context);
   }
 
   std::function<Costs(const OpContext&)> estimator = it->second;
   Costs costs = estimator(op_context);
-  VLOG(1) << "Operation " << op_features.op() << " takes "
+  VLOG(1) << "Operation " << op_info.op() << " takes "
           << costs.execution_time.count() << " ns.";
   return costs;
 }
@@ -430,39 +432,38 @@ DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
 }
 
 Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   // For unary or binary element-wise operations, op count is the element count
   // of any input. We use the count for the largest input here to be more robust
   // in case that the shape is unknown or partially known for other input.
-  int64 op_count =
-      CalculateLargestInputCount(op_features, &found_unknown_shapes);
+  int64 op_count = CalculateLargestInputCount(op_info, &found_unknown_shapes);
   // If output shape is available, try use the element count calcuated from
   // that.
-  if (op_features.outputs_size() > 0) {
-    op_count =
-        std::max(op_count, CalculateTensorElementCount(op_features.outputs(0),
-                                                       &found_unknown_shapes));
+  if (op_info.outputs_size() > 0) {
+    op_count = std::max(
+        op_count,
+        CalculateTensorElementCount(op_info.outputs(0), &found_unknown_shapes));
   }
   // For binary ops, calculate the output shape possibly resulting from
   // broadcasting.
-  if (op_features.inputs_size() >= 2) {
-    op_count = std::max(op_count,
-                        CwiseOutputElementCount(op_features.inputs(0).shape(),
-                                                op_features.inputs(1).shape()));
+  if (op_info.inputs_size() >= 2) {
+    op_count =
+        std::max(op_count, CwiseOutputElementCount(op_info.inputs(0).shape(),
+                                                   op_info.inputs(1).shape()));
   }
 
   int op_cost = 1;
   bool is_known_elementwise_op = false;
-  auto it = elementwise_ops_.find(op_features.op());
+  auto it = elementwise_ops_.find(op_info.op());
   if (it != elementwise_ops_.end()) {
     op_cost = it->second;
     is_known_elementwise_op = true;
   } else {
-    LOG(WARNING) << "Not a cwise op: " << op_features.op();
+    LOG(WARNING) << "Not a cwise op: " << op_info.op();
   }
 
-  Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_features);
+  Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_info);
   if (found_unknown_shapes || !is_known_elementwise_op) {
     costs.inaccurate = true;
   }
@@ -527,8 +528,10 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
                       device_info.intermediate_write_gb_per_sec)
           : 0;
 
-  Costs::NanoSeconds intermediate_memory_cost(intermediate_read_time +
-                                              intermediate_write_time);
+  Costs::NanoSeconds intermediate_memory_cost =
+      compute_memory_overlap_
+          ? std::max(intermediate_read_time, intermediate_write_time)
+          : (intermediate_read_time + intermediate_write_time);
   VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
           << " Intermediate Memory Time (ns):"
           << intermediate_memory_cost.count();
@@ -542,17 +545,17 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
 }
 
 int64 OpLevelCostEstimator::CountConv2DOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  return CountConv2DOperations(op_features, nullptr, found_unknown_shapes);
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  return CountConv2DOperations(op_info, nullptr, found_unknown_shapes);
 }
 
 // Helper to translate the positional arguments into named fields.
 OpLevelCostEstimator::ConvolutionDimensions
 OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     const TensorShapeProto& original_image_shape,
-    const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+    const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
     bool* found_unknown_shapes) {
-  VLOG(2) << "op features: " << op_features.DebugString();
+  VLOG(2) << "op features: " << op_info.DebugString();
   VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
   VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString();
   auto image_shape =
@@ -563,7 +566,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   VLOG(2) << "Filter shape: " << filter_shape.DebugString();
 
   int x_index, y_index, channel_index;
-  const string& data_format = GetDataFormat(op_features);
+  const string& data_format = GetDataFormat(op_info);
   if (data_format == "NCHW") {
     x_index = 2;
     y_index = 3;
@@ -574,7 +577,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     y_index = 2;
     channel_index = 3;
   }
-  const string& filter_format = GetFilterFormat(op_features);
+  const string& filter_format = GetFilterFormat(op_info);
   int filter_x_index, filter_y_index, in_channel_index, out_channel_index;
   if (filter_format == "HWIO") {
     filter_x_index = 0;
@@ -594,8 +597,8 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   int64 iz = image_shape.dim(channel_index).size();
   int64 kx = filter_shape.dim(filter_x_index).size();
   int64 ky = filter_shape.dim(filter_y_index).size();
-  std::vector<int64> strides = GetStrides(op_features);
-  const auto padding = GetPadding(op_features);
+  std::vector<int64> strides = GetStrides(op_info);
+  const auto padding = GetPadding(op_info);
   int64 sx = strides[x_index];
   int64 sy = strides[y_index];
   int64 ox = GetOutputSize(ix, kx, sx, padding);
@@ -623,14 +626,13 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
 }
 
 int64 OpLevelCostEstimator::CountConv2DOperations(
-    const OpInfo& op_features, ConvolutionDimensions* conv_info,
+    const OpInfo& op_info, ConvolutionDimensions* conv_info,
     bool* found_unknown_shapes) const {
-  DCHECK(op_features.op() == kConv2d ||
-         op_features.op() == kDepthwiseConv2dNative)
+  DCHECK(op_info.op() == kConv2d || op_info.op() == kDepthwiseConv2dNative)
       << "Invalid Operation: not Conv2D nor DepthwiseConv2dNative";
 
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      op_features.inputs(0).shape(), op_features.inputs(1).shape(), op_features,
+      op_info.inputs(0).shape(), op_info.inputs(1).shape(), op_info,
       found_unknown_shapes);
 
   //  in DepthwiseConv2dNative conv_dims.oz is actually the channel depth
@@ -641,7 +643,7 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   int64 ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2d) {
+  if (op_info.op() == kConv2d) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // To ensure output tensor dims to be correct for DepthwiseConv2DNative,
@@ -658,32 +660,32 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
 }
 
 int64 OpLevelCostEstimator::CountMatMulOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  return CountMatMulOperations(op_features, nullptr, found_unknown_shapes);
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  return CountMatMulOperations(op_info, nullptr, found_unknown_shapes);
 }
 
 // TODO(nishantpatil): Create separate estimator for Sparse Matmul
 int64 OpLevelCostEstimator::CountMatMulOperations(
-    const OpInfo& op_features, MatMulDimensions* mat_mul,
+    const OpInfo& op_info, MatMulDimensions* mat_mul,
     bool* found_unknown_shapes) const {
   double ops = 0;
 
-  if (op_features.inputs_size() < 2) {
-    LOG(ERROR) << "Need 2 inputs but got " << op_features.inputs_size();
+  if (op_info.inputs_size() < 2) {
+    LOG(ERROR) << "Need 2 inputs but got " << op_info.inputs_size();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
 
-  auto& a_matrix = op_features.inputs(0);
-  auto& b_matrix = op_features.inputs(1);
+  auto& a_matrix = op_info.inputs(0);
+  auto& b_matrix = op_info.inputs(1);
 
   bool transpose_a = false;
   bool transpose_b = false;
 
   double m_dim, n_dim, k_dim, k_dim_b = 0;
 
-  for (const auto& item : op_features.attr()) {
+  for (const auto& item : op_info.attr()) {
     VLOG(1) << "Key:" << item.first
             << " Value:" << SummarizeAttrValue(item.second);
     if (item.first == "transpose_a" && item.second.b() == true)
@@ -735,23 +737,23 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
 }
 
 int64 OpLevelCostEstimator::CountBatchMatMulOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  if (op_features.op() != kBatchMatMul) {
-    LOG(ERROR) << "Invalid Operation: " << op_features.op();
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  if (op_info.op() != kBatchMatMul) {
+    LOG(ERROR) << "Invalid Operation: " << op_info.op();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
-  if (op_features.inputs_size() != 2) {
-    LOG(ERROR) << "Expected 2 inputs but got " << op_features.inputs_size();
+  if (op_info.inputs_size() != 2) {
+    LOG(ERROR) << "Expected 2 inputs but got " << op_info.inputs_size();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
 
   double ops = 0;
-  const auto& a_input = op_features.inputs(0);
-  const auto& b_input = op_features.inputs(1);
+  const auto& a_input = op_info.inputs(0);
+  const auto& b_input = op_info.inputs(1);
 
   // BatchMatMul requires inputs of at least matrix shape (rank 2).
   // The two most minor dimensions of each input are matrices that
@@ -801,24 +803,24 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
 
   // Build the MatMul. Note that values are ignored here since we are just
   // counting ops (e.g. only shapes matter).
-  OpInfo matmul_op_features;
-  matmul_op_features.set_op("MatMul");
+  OpInfo matmul_op_info;
+  matmul_op_info.set_op("MatMul");
 
   AttrValue transpose_a;
   transpose_a.set_b(false);
-  if (op_features.attr().find("adj_x") != op_features.attr().end()) {
-    transpose_a.set_b(op_features.attr().at("adj_x").b());
+  if (op_info.attr().find("adj_x") != op_info.attr().end()) {
+    transpose_a.set_b(op_info.attr().at("adj_x").b());
   }
-  (*matmul_op_features.mutable_attr())["transpose_a"] = transpose_a;
+  (*matmul_op_info.mutable_attr())["transpose_a"] = transpose_a;
 
   AttrValue transpose_b;
   transpose_b.set_b(false);
-  if (op_features.attr().find("adj_y") != op_features.attr().end()) {
-    transpose_b.set_b(op_features.attr().at("adj_y").b());
+  if (op_info.attr().find("adj_y") != op_info.attr().end()) {
+    transpose_b.set_b(op_info.attr().at("adj_y").b());
   }
-  (*matmul_op_features.mutable_attr())["transpose_b"] = transpose_b;
+  (*matmul_op_info.mutable_attr())["transpose_b"] = transpose_b;
 
-  OpInfo::TensorProperties* a_matrix = matmul_op_features.add_inputs();
+  OpInfo::TensorProperties* a_matrix = matmul_op_info.add_inputs();
   a_matrix->set_dtype(a_input.dtype());
   TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape();
   for (int i = std::max(0, a_input_shape.dim_size() - matrix_rank);
@@ -826,7 +828,7 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
     *(a_matrix_shape->add_dim()) = a_input_shape.dim(i);
   }
 
-  OpInfo::TensorProperties* b_matrix = matmul_op_features.add_inputs();
+  OpInfo::TensorProperties* b_matrix = matmul_op_info.add_inputs();
   b_matrix->set_dtype(b_input.dtype());
   TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape();
   for (int i = std::max(0, b_input_shape.dim_size() - matrix_rank);
@@ -836,7 +838,7 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
 
   for (int i = 0; i < num_matmuls; ++i) {
     bool matmul_unknown_shapes = false;
-    ops += CountMatMulOperations(matmul_op_features, &matmul_unknown_shapes);
+    ops += CountMatMulOperations(matmul_op_info, &matmul_unknown_shapes);
     *found_unknown_shapes |= matmul_unknown_shapes;
   }
   return ops;
@@ -894,16 +896,16 @@ bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
 
 // TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
 int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
-    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK(op_features.op() == kConv2dBackpropInput ||
-         op_features.op() == kDepthwiseConv2dNativeBackpropInput)
+  DCHECK(op_info.op() == kConv2dBackpropInput ||
+         op_info.op() == kDepthwiseConv2dNativeBackpropInput)
       << "Invalid Operation: not kConv2dBackpropInput nor"
          "kDepthwiseConv2dNativeBackpropInput";
 
-  if (op_features.inputs_size() < 2) {
+  if (op_info.inputs_size() < 2) {
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
@@ -911,12 +913,12 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
 
   TensorShapeProto input_shape;
   bool shape_found = false;
-  if (op_features.inputs(0).has_value()) {
-    const TensorProto& value = op_features.inputs(0).value();
+  if (op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
     shape_found = GetTensorShapeProtoFromTensorProto(value, &input_shape);
   }
-  if (!shape_found && op_features.outputs_size() == 1) {
-    input_shape = op_features.outputs(0).shape();
+  if (!shape_found && op_info.outputs_size() == 1) {
+    input_shape = op_info.outputs(0).shape();
     shape_found = true;
   }
   if (!shape_found) {
@@ -929,13 +931,12 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
 
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      input_shape, op_features.inputs(1).shape(), op_features,
-      found_unknown_shapes);
+      input_shape, op_info.inputs(1).shape(), op_info, found_unknown_shapes);
 
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2dBackpropInput) {
+  if (op_info.op() == kConv2dBackpropInput) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // conv_dims always use forward path definition regardless
@@ -944,7 +945,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
   ops *= kOpsPerMac;
 
-  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
+  VLOG(1) << "Operations for" << op_info.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -953,23 +954,23 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
 }
 
 int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
-    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK(op_features.op() == kConv2dBackpropFilter ||
-         op_features.op() == kDepthwiseConv2dNativeBackpropFilter)
+  DCHECK(op_info.op() == kConv2dBackpropFilter ||
+         op_info.op() == kDepthwiseConv2dNativeBackpropFilter)
       << "Invalid Operation: not kConv2dBackpropFilter nor"
          "kDepthwiseConv2dNativeBackpropFilter";
 
   TensorShapeProto filter_shape;
   bool shape_found = false;
-  if (op_features.inputs_size() >= 2 && op_features.inputs(1).has_value()) {
-    const TensorProto& value = op_features.inputs(1).value();
+  if (op_info.inputs_size() >= 2 && op_info.inputs(1).has_value()) {
+    const TensorProto& value = op_info.inputs(1).value();
     shape_found = GetTensorShapeProtoFromTensorProto(value, &filter_shape);
   }
-  if (!shape_found && op_features.outputs_size() == 1) {
-    filter_shape = op_features.outputs(0).shape();
+  if (!shape_found && op_info.outputs_size() == 1) {
+    filter_shape = op_info.outputs(0).shape();
     shape_found = true;
   }
   if (!shape_found) {
@@ -981,19 +982,18 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     *found_unknown_shapes = true;
   }
 
-  if (op_features.inputs_size() < 1) {
+  if (op_info.inputs_size() < 1) {
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
   }
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      op_features.inputs(0).shape(), filter_shape, op_features,
-      found_unknown_shapes);
+      op_info.inputs(0).shape(), filter_shape, op_info, found_unknown_shapes);
 
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2dBackpropFilter) {
+  if (op_info.op() == kConv2dBackpropFilter) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // conv_dims always use forward path definition regardless
@@ -1001,7 +1001,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     ops *= conv_dims.oz;
   }
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
+  VLOG(1) << "Operations for" << op_info.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -1032,9 +1032,9 @@ int64 OpLevelCostEstimator::CalculateTensorSize(
 }
 
 int64 OpLevelCostEstimator::CalculateInputSize(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 total_input_size = 0;
-  for (auto& input : op_features.inputs()) {
+  for (auto& input : op_info.inputs()) {
     int64 input_size = CalculateTensorSize(input, found_unknown_shapes);
     total_input_size += input_size;
     VLOG(1) << "Input Size: " << input_size
@@ -1044,9 +1044,9 @@ int64 OpLevelCostEstimator::CalculateInputSize(
 }
 
 int64 OpLevelCostEstimator::CalculateLargestInputCount(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 largest_input_count = 0;
-  for (auto& input : op_features.inputs()) {
+  for (auto& input : op_info.inputs()) {
     int64 input_count =
         CalculateTensorElementCount(input, found_unknown_shapes);
     if (input_count > largest_input_count) {
@@ -1059,10 +1059,10 @@ int64 OpLevelCostEstimator::CalculateLargestInputCount(
 }
 
 int64 OpLevelCostEstimator::CalculateOutputSize(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 total_output_size = 0;
   // use float as default for calculations
-  for (const auto& output : op_features.outputs()) {
+  for (const auto& output : op_info.outputs()) {
     DataType dt = output.dtype();
     const auto& original_output_shape = output.shape();
     int64 output_size = DataTypeSize(BaseType(dt));
@@ -1080,10 +1080,10 @@ int64 OpLevelCostEstimator::CalculateOutputSize(
 }
 
 Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs = PredictOpCountBasedCost(
-      CountConv2DOperations(op_features, &found_unknown_shapes), op_features);
+      CountConv2DOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1091,12 +1091,12 @@ Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
 
 Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs =
       PredictOpCountBasedCost(CountConv2DBackpropInputOperations(
-                                  op_features, nullptr, &found_unknown_shapes),
-                              op_features);
+                                  op_info, nullptr, &found_unknown_shapes),
+                              op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1104,12 +1104,12 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
 
 Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs =
       PredictOpCountBasedCost(CountConv2DBackpropFilterOperations(
-                                  op_features, nullptr, &found_unknown_shapes),
-                              op_features);
+                                  op_info, nullptr, &found_unknown_shapes),
+                              op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1204,26 +1204,26 @@ Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
 }
 
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs = PredictOpCountBasedCost(
-      CountMatMulOperations(op_features, &found_unknown_shapes), op_features);
+      CountMatMulOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
 Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   return Costs::ZeroCosts();
 }
 
 Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
-  result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  result.max_memory = CalculateOutputSize(op_info, &result.inaccurate);
   result.num_ops_with_unknown_shapes = result.inaccurate;
   // Assign the minimum amount of time we can represent to the identity op since
   // it tends to be really cheap.
@@ -1233,11 +1233,10 @@ Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
 }
 
 Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
-  result.persistent_memory =
-      CalculateOutputSize(op_features, &result.inaccurate);
+  result.persistent_memory = CalculateOutputSize(op_info, &result.inaccurate);
   result.num_ops_with_unknown_shapes = result.inaccurate;
 
   result.compute_time = kMinComputeTime;
@@ -1247,20 +1246,19 @@ Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
 
 Costs OpLevelCostEstimator::PredictBatchMatMul(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   Costs costs = PredictOpCountBasedCost(
-      CountBatchMatMulOperations(op_features, &found_unknown_shapes),
-      op_features);
+      CountBatchMatMulOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
 Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   Costs costs = Costs::ZeroCosts();
-  costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
+  costs.max_memory = CalculateOutputSize(op_info, &costs.inaccurate);
   costs.num_ops_with_unknown_shapes = costs.inaccurate;
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 84dd9213f773b538db71f0999c7ffd0b34e1881c..f8ba8c6637d9aade6610a6af8dd6c9f3e0be01af 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -16,10 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
 
-#include <functional>
-#include <map>
-#include <string>
-
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
 #include "tensorflow/core/grappler/costs/op_context.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
@@ -79,24 +75,23 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
-  int64 CountConv2DOperations(const OpInfo& op_features,
+  int64 CountConv2DOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const;
-  int64 CountConv2DOperations(const OpInfo& op_features,
+  int64 CountConv2DOperations(const OpInfo& op_info,
                               ConvolutionDimensions* conv_info,
                               bool* found_unknown_shapes) const;
-  int64 CountMatMulOperations(const OpInfo& op_features,
+  int64 CountMatMulOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const;
-  int64 CountMatMulOperations(const OpInfo& op_features,
-                              MatMulDimensions* mat_mul,
+  int64 CountMatMulOperations(const OpInfo& op_info, MatMulDimensions* mat_mul,
                               bool* found_unknown_shapes) const;
-  int64 CountBatchMatMulOperations(const OpInfo& op_features,
+  int64 CountBatchMatMulOperations(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const;
-  int64 CountConv2DBackpropInputOperations(const OpInfo& op_features,
-                                           ConvolutionDimensions* conv_info,
-                                           bool* found_unknown_shapes) const;
-  int64 CountConv2DBackpropFilterOperations(const OpInfo& op_features,
-                                            ConvolutionDimensions* conv_info,
-                                            bool* found_unknown_shapes) const;
+  int64 CountConv2DBackpropInputOperations(
+      const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
+      bool* found_unknown_shapes) const;
+  int64 CountConv2DBackpropFilterOperations(
+      const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
+      bool* found_unknown_shapes) const;
 
   // Calculate the element count of an input/output tensor.
   int64 CalculateTensorElementCount(const OpInfo::TensorProperties& tensor,
@@ -108,17 +103,17 @@ class OpLevelCostEstimator {
 
   // Calculate the element count of the largest
   // input of specified TensorFlow op.
-  int64 CalculateLargestInputCount(const OpInfo& op_features,
+  int64 CalculateLargestInputCount(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const;
 
   // Calculate the total size in bytes of the all
   // the inputs of specified TensorFlow op.
-  int64 CalculateInputSize(const OpInfo& op_features,
+  int64 CalculateInputSize(const OpInfo& op_info,
                            bool* found_unknown_shapes) const;
 
   // Calculate the total size in bytes of the all
   // the outputs of specified TensorFlow op.
-  int64 CalculateOutputSize(const OpInfo& op_features,
+  int64 CalculateOutputSize(const OpInfo& op_info,
                             bool* found_unknown_shapes) const;
 
   // This family of routines predicts the costs to
@@ -205,4 +200,5 @@ class OpLevelCostEstimator {
 
 }  // end namespace grappler
 }  // end namespace tensorflow
+
 #endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 9a59877ac51c850ec59caad61db9d999cb0e17bb..aa0fc9d6c2af621fd0beef75d95c7154e9e88233 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -28,9 +28,34 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+// TODO(dyoon): Consider to use this Test class for all the test cases, and then
+// remove friend in the OpLevelCostEstimator class header.
+class TestOpLevelCostEstimator : public OpLevelCostEstimator {
+ public:
+  TestOpLevelCostEstimator() {
+    compute_memory_overlap_ = true;
+    device_info_ = DeviceInfo();
+  }
+  ~TestOpLevelCostEstimator() override {}
+
+  void SetDeviceInfo(const DeviceInfo& device_info) {
+    device_info_ = device_info;
+  }
+
+  void SetComputeMemoryOverlap(bool value) { compute_memory_overlap_ = value; }
+
+ protected:
+  DeviceInfo GetDeviceInfo(const DeviceProperties& device) const override {
+    return device_info_;
+  }
+
+  DeviceInfo device_info_;
+};
+
 // Wrangles the minimum number of proto fields to set up a matrix.
-void DescribeMatrix(int rows, int columns, OpInfo* op_features) {
-  auto input = op_features->add_inputs();
+void DescribeMatrix(int rows, int columns, OpInfo* op_info) {
+  auto input = op_info->add_inputs();
   auto shape = input->mutable_shape();
   auto shape_rows = shape->add_dim();
   shape_rows->set_size(rows);
@@ -39,8 +64,8 @@ void DescribeMatrix(int rows, int columns, OpInfo* op_features) {
   input->set_dtype(DT_FLOAT);
 }
 
-void SetCpuDevice(OpInfo* op_features) {
-  auto device = op_features->mutable_device();
+void SetCpuDevice(OpInfo* op_info) {
+  auto device = op_info->mutable_device();
   device->set_type("CPU");
   device->set_num_cores(10);
   device->set_bandwidth(10000000);  // 10000000 KB/s = 10 GB/s
@@ -413,15 +438,14 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
     return estimator_.PredictCosts(op_context);
   }
 
-  int64 CountMatMulOperations(const OpInfo& op_features,
+  int64 CountMatMulOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const {
-    return estimator_.CountMatMulOperations(op_features, found_unknown_shapes);
+    return estimator_.CountMatMulOperations(op_info, found_unknown_shapes);
   }
 
-  int64 CountBatchMatMulOperations(const OpInfo& op_features,
+  int64 CountBatchMatMulOperations(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const {
-    return estimator_.CountBatchMatMulOperations(op_features,
-                                                 found_unknown_shapes);
+    return estimator_.CountBatchMatMulOperations(op_info, found_unknown_shapes);
   }
 
   void SetComputeMemoryOverlap(bool value) {
@@ -946,7 +970,7 @@ TEST_F(OpLevelCostEstimatorTest, PredictMaxPoolGrad) {
   };
 
   {
-    // Typical 3xz3 window with 2x2 stride.
+    // Typical 3x3 window with 2x2 stride.
     auto costs = predict_max_pool_grad(10, 20, 384, 3, 2, "SAME");
     EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
     EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
@@ -987,7 +1011,7 @@ TEST_F(OpLevelCostEstimatorTest, PredictAvgPool) {
   };
 
   {
-    // Typical 3xz3 window with 2x2 stride.
+    // Typical 3x3 window with 2x2 stride.
     auto costs = predict_avg_pool(10, 20, 384, 3, 2, "SAME");
     EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
     EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
@@ -1209,5 +1233,59 @@ TEST_F(OpLevelCostEstimatorTest, MaybeGetMinimumShape) {
     ExpectTensorShape({10, 20}, y);
   }
 }
+
+TEST_F(OpLevelCostEstimatorTest, IntermediateRdWrBandwidth) {
+  TestOpLevelCostEstimator estimator;
+
+  // Compute limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/1,
+                                     /*gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  auto cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(3548774400), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(3551112192), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+
+  // Memory limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/99999,
+                                     /*gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2337792), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.memory_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2373281), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+
+  // Intermediate memory bandwidth limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/99999,
+                                     /*gb_per_sec=*/9999,
+                                     /*intermediate_read_gb_per_sec=*/1,
+                                     /*intermediate_write_gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2337792), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.intermediate_memory_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2373515), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index ae5200b359232153f96c9ffa21a505d2a056d55d..0aac0348b512d2e8040a9ac1337ceb9c12a09206 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
-#include <math.h>
-
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -38,6 +36,12 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+// Optional attribute name for Switch op as a vector of int that tells
+// which branch the Switch output is taken on every round of execution.
+// We use this side information, if provided, for scheduling ops after Switch
+// correctly (e.g., While loop).
+constexpr char kOutputSlots[] = "_output_slot_vector";
+
 Costs CombineCosts(const Costs& left, const Costs& right) {
   CHECK_NE(left.max_memory, kMemoryUnknown);
   CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
@@ -306,43 +310,25 @@ ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
   LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
 }
 
-VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
-                                   const bool use_static_shapes,
-                                   Cluster* cluster,
-                                   ReadyNodeManager* ready_nodes)
-    : ready_nodes_(ready_nodes),
-      graph_costs_(Costs::ZeroCosts()),
-      graph_properties_(new GraphProperties(*grappler_item)),
-      cluster_(cluster),
-      grappler_item_(grappler_item),
-      use_static_shapes_(use_static_shapes),
-      placer_(cluster) {
-  graph_costs_.num_ops_total = 0;
-  initialized_ = false;
-}
-
 VirtualScheduler::VirtualScheduler(const bool use_static_shapes,
+                                   const bool use_aggressive_shape_inference,
                                    Cluster* cluster,
                                    ReadyNodeManager* ready_nodes)
     : ready_nodes_(ready_nodes),
       graph_costs_(Costs::ZeroCosts()),
       cluster_(cluster),
       use_static_shapes_(use_static_shapes),
+      use_aggressive_shape_inference_(use_aggressive_shape_inference),
       placer_(cluster) {
   graph_costs_.num_ops_total = 0;
   initialized_ = false;
+  track_mem_usage_snapshot_ = VLOG_IS_ON(1);
 }
 
 Status VirtualScheduler::Init(const GrapplerItem* item) {
   grappler_item_ = item;
   graph_properties_ = absl::make_unique<GraphProperties>(*item);
 
-  return Init();
-}
-
-// TODO(pcma): Merge with Init(const GrapplerItem* item) when this
-// deprecated API is deleted
-Status VirtualScheduler::Init() {
   initialized_ = false;
 
   // Clear all internal states so that the VirtualScheduler is reusable for
@@ -366,7 +352,8 @@ Status VirtualScheduler::Init() {
 
   // Construct graph properties.
   if (use_static_shapes_) {
-    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(true));
+    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(
+        true, use_aggressive_shape_inference_));
   } else {
     TF_RETURN_IF_ERROR(graph_properties_->InferDynamically(cluster_));
   }
@@ -400,6 +387,8 @@ Status VirtualScheduler::Init() {
     name_to_node[node->name()] = node;
   }
 
+  // Traverse the graph to check if the graph is annotated with Switch outputs.
+  // Also record _Send nodes.
   // TODO(dyoon): Instead of identifying _Send node here manually, add _Send
   // to _Recv as control dependency when creating GrapplerItem.
   std::unordered_map<string, const NodeDef*> name_to_send;
@@ -408,6 +397,11 @@ Status VirtualScheduler::Init() {
       const auto& attr = node.attr();
       name_to_send[attr.at("tensor_name").s()] = &node;
     }
+
+    if (IsSwitch(node)) {
+      const auto& attr = node.attr();
+      if (attr.count(kOutputSlots) > 0) switch_outputs_annotated_ = true;
+    }
   }
 
   // To reuse _Recv ops.
@@ -562,7 +556,7 @@ void VirtualScheduler::MaybeUpdateInputOutput(const NodeDef* node) {
       inputs.push_back(control_message);
       outputs.push_back(control_message);
     } else {
-      auto output_properties =
+      const auto& output_properties =
           graph_properties_->GetOutputProperties(NodeName(input_source_name));
       // Like with HasInputProperties, if a node does not have output
       // properties, it's likely it was pruned during the shape inference run.
@@ -769,6 +763,82 @@ Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
   return it->second;
 }
 
+// Check Switch outputs in updated MetaGraphDef, add corresponding nodes to
+// ready queue.
+// Fallback to add all outputs if fail to find the actual output.
+bool VirtualScheduler::AddSwitchOutputsToReadyQueue(
+    const NodeDef* node, int curr_iter, const Costs::Duration& curr_time) {
+  if (node->attr().count(kOutputSlots) == 0) return false;
+
+  auto& node_state = node_map_[node];
+  const auto& slot_vector = node->attr().at(kOutputSlots);
+  if (slot_vector.list().i_size() <= curr_iter) {
+    // Sometimes we encounter infinite loop. Fall back to add all outputs.
+    return false;
+  }
+
+  int slot = slot_vector.list().i(curr_iter);
+  for (const auto& port_num_output_pair : node_state.outputs) {
+    if (port_num_output_pair.first != slot) continue;
+
+    for (auto* output_node : port_num_output_pair.second) {
+      auto& output_state = node_map_[output_node];
+      output_state.num_inputs_ready++;
+      // Execute a node as soon as all its inputs are ready. Merge nodes
+      // are special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output_node);
+        VLOG(3) << "Node " << node->name() << " iter " << curr_iter << "/"
+                << slot_vector.list().i_size() << " Add Switch output " << slot
+                << ": " << output_node->name();
+      }
+    }
+    return true;
+  }
+
+  return false;
+}
+
+void VirtualScheduler::AddOutputNodesToReadyQueue(
+    const NodeDef* node, const Costs::Duration& curr_time) {
+  auto& node_state = node_map_[node];
+  int curr_iter = node_state.num_executed_times;
+  ++node_state.num_executed_times;
+
+  if (switch_outputs_annotated_) {
+    // If the graph is annotated with StepStats, reset num_inputs_ready so we
+    // can schedule the node multiple times.
+    node_state.num_inputs_ready = 0;
+
+    // For Switch node, get output branch from updated MetaGraphDef.
+    if (IsSwitch(*node) &&
+        AddSwitchOutputsToReadyQueue(node, curr_iter, curr_time))
+      return;
+  }
+
+  // Increment num_inputs_ready of the output nodes and maybe add to ready
+  // nodes.
+  for (const auto& port_num_output_pair : node_state.outputs) {
+    for (auto* output_node : port_num_output_pair.second) {
+      auto& output_state = node_map_[output_node];
+      output_state.num_inputs_ready++;
+      // Execute a node as soon as all its inputs are ready. Merge nodes are
+      // special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output_node);
+      }
+    }
+  }
+}
+
 bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Update graph_costs_ and per-op costs.
   graph_costs_ = CombineCosts(graph_costs_, node_costs);
@@ -778,13 +848,16 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
   op_cost = CombineCosts(op_cost, node_costs);
 
-  // Also keep track of op counts and costs per op (with their shapes).
-  OpContext op_context = GetCurrNode();
-  string node_description = GetOpDescription(op_context.op_info);
-  op_counts_[node_description] += 1;
-  op_costs_[node_description] =
-      std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
-                     !node_costs.inaccurate);
+  if (VLOG_IS_ON(2)) {
+    // Also keep track of op counts and costs per op (with their shapes).
+    OpContext op_context = GetCurrNode();
+
+    string node_description = GetOpDescription(op_context.op_info);
+    op_counts_[node_description] += 1;
+    op_costs_[node_description] =
+        std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+                       !node_costs.inaccurate);
+  }
 
   // Update node and device states.
   auto& node_state = node_map_[node];
@@ -793,6 +866,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Node is scheduled when the device is available AND all the inputs are
   // ready; hence, time_scheduled is time_ready if time_ready > device curr
   // time.
+  // TODO(andiryxu): Current node_state result only records the last execution.
+  // With annotated MetaGraph we can schedule a node for multiple times.
+  // Refine NodeState structure accordingly, e.g. record time_scheduled in a
+  // vector.
   node_state.time_scheduled =
       std::max(device.GetCurrTime(), node_state.time_ready);
   // Override device curr time with the time_scheduled.
@@ -826,22 +903,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Increment num_inputs_ready of the output nodes and maybe add to ready nodes
-  for (const auto& port_num_output_pair : node_state.outputs) {
-    for (auto* output_node : port_num_output_pair.second) {
-      auto& output_state = node_map_[output_node];
-      output_state.num_inputs_ready++;
-      // Execute a node as soon as all its inputs are ready. Merge nodes are
-      // special since they run as soon as one of their inputs becomes
-      // available.
-      if (output_state.num_inputs_ready == output_state.inputs.size() ||
-          IsMerge(*output_node)) {
-        // This output node is now ready.
-        output_state.time_ready = curr_time;
-        ready_nodes_->AddNode(output_node);
-      }
-    }
-  }
+  // Check outputs, add ready nodes to queue.
+  AddOutputNodesToReadyQueue(node, curr_time);
 
   // Increment num_outputs_executed of the input nodes and maybe update memory.
   for (const auto& input_port : node_state.inputs) {
@@ -868,7 +931,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     // check max memory usage.
     if (device.memory_usage > device.max_memory_usage) {
       device.max_memory_usage = device.memory_usage;
-      device.mem_usage_snapshot_at_peak = device.nodes_in_memory;
+
+      if (track_mem_usage_snapshot_) {
+        device.mem_usage_snapshot_at_peak = device.nodes_in_memory;
+      }
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 6a835f32d16d0850c06891f656b2bec910e26b78..d96371bcab5db2d3ef730bf1eec8fe7f733bf4f6 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -70,11 +70,15 @@ struct NodeState {
   // Each output port uses up memory space from time_scheduled to its
   // time_no_references.
 
+  // How many times this node has been executed, e.g. in a while loop.
+  int num_executed_times;
+
   NodeState() {
     num_inputs_ready = 0;
     time_ready = Costs::Duration::max();
     time_scheduled = Costs::Duration::max();
     time_finished = Costs::Duration::max();
+    num_executed_times = 0;
     // Note that num_outputs_executed and time_no_references are not initialized
     // here, since we don't know the size (i.e., # outputs for this node).
   }
@@ -256,16 +260,9 @@ std::unique_ptr<ReadyNodeManager> ReadyNodeManagerFactory(
 // dependencies, device, etc.
 class VirtualScheduler {
  public:
-  // TODO(pcma): Modify power_analyzer.cc to use new API's.
-  // DEPRECATED
-  VirtualScheduler(const GrapplerItem* grappler_item,
-                   const bool use_static_shapes, Cluster* cluster,
-                   ReadyNodeManager* ready_nodes);
-  // DEPRECATED
-  Status Init();
-
   // Does not take ownership of cluster or ready_nodes.
-  VirtualScheduler(bool use_static_shapes, Cluster* cluster,
+  VirtualScheduler(const bool use_static_shapes,
+                   const bool use_aggressive_shape_inference, Cluster* cluster,
                    ReadyNodeManager* ready_nodes);
   // Initializes the scheduler for the specific grappler item.
   // Should be called immediately after the c'tor or when the scheduler will be
@@ -305,6 +302,8 @@ class VirtualScheduler {
     return &node_map_;
   }
 
+  void enable_mem_usage_tracking() { track_mem_usage_snapshot_ = true; }
+
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
@@ -328,6 +327,10 @@ class VirtualScheduler {
                           std::map<string, Costs>* op_cost);
   float Round2(const float x) const;
   bool IsPersistentNode(const NodeDef* node) const;
+  bool AddSwitchOutputsToReadyQueue(const NodeDef* node, int curr_iter,
+                                    const Costs::Duration& curr_time);
+  void AddOutputNodesToReadyQueue(const NodeDef* node,
+                                  const Costs::Duration& curr_time);
 
   // Scheduler states:
   ReadyNodeManager* ready_nodes_;  // Not owned.
@@ -356,6 +359,12 @@ class VirtualScheduler {
   const GrapplerItem* grappler_item_;  // Not owned.
   bool use_static_shapes_;
   bool initialized_;
+  bool track_mem_usage_snapshot_;
+  const bool use_aggressive_shape_inference_;
+
+  // Whether the input graph includes Switch nodes annotated with output slots
+  // information.
+  bool switch_outputs_annotated_ = false;
 
   VirtualPlacer placer_;  // owned.
 };
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 0a695458e17a576ecda631b576d4ace4aa947dbc..128cb986f11ba4f4bb13583cb293183194e1c744 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -30,8 +30,13 @@ namespace grappler {
 // Class for testing virtual scheduler.
 class TestVirtualScheduler : public VirtualScheduler {
  public:
-  TestVirtualScheduler(const bool use_static_shapes, Cluster* cluster)
-      : VirtualScheduler(use_static_shapes, cluster, &ready_node_manager_) {}
+  TestVirtualScheduler(const bool use_static_shapes,
+                       const bool use_aggressive_shape_inference,
+                       Cluster* cluster)
+      : VirtualScheduler(use_static_shapes, use_aggressive_shape_inference,
+                         cluster, &ready_node_manager_) {
+    enable_mem_usage_tracking();
+  }
 
   FRIEND_TEST(VirtualSchedulerTest, MemoryUsage);
   FRIEND_TEST(VirtualSchedulerTest, ControlDependency);
@@ -66,7 +71,8 @@ class VirtualSchedulerTest : public ::testing::Test {
     devices[kCPU1] = cpu_device;
     cluster_ = absl::make_unique<VirtualCluster>(devices);
     scheduler_ = absl::make_unique<TestVirtualScheduler>(
-        /* use_static_shapes = */ true, cluster_.get());
+        /*use_static_shapes=*/true,
+        /*use_aggressive_shape_inference=*/true, cluster_.get());
   }
 
   NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
@@ -867,6 +873,439 @@ versions {
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
+  // A simple while loop strengthened with Switch outputs.
+  void CreateGrapplerItemWithLoopSwitchOutputs() {
+    // Test graph produced in python using:
+    /*
+      with tf.Graph().as_default():
+      i0 = tf.constant(0)
+      m0 = tf.ones([2, 2])
+      c = lambda i, m: i < 10
+      b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+      r = tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+      with open('/tmp/graph.pbtxt', 'w') as f:
+      f.write(str(tf.get_default_graph().as_graph_def()))
+    */
+    const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+  attr {
+    key: "_output_slot_vector"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_slot_vector"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
+  }
+
+  // Create a FusedBatchNorm op that has multiple output ports.
   void CreateGrapplerItemWithInterDeviceTransfers() {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
 
@@ -1940,6 +2379,89 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   ValidateDependencyChain(start_times, {"while/Switch_1", "while/Exit_1"});
 }
 
+TEST_F(VirtualSchedulerTest, WhileLoopWithSwitchOutputs) {
+  // Init.
+  CreateGrapplerItemWithLoopSwitchOutputs();
+  InitScheduler();
+
+  // Runs the scheduler.
+  RunScheduler("");
+
+  RunMetadata metadata;
+  scheduler_->Summary(&metadata);
+
+  // Nodes in topological order:
+  // * const, ones
+  // * while/Enter, while/Enter_1
+  // * while/Merge, while/Merge_1
+  // * while/Less/y
+  // * while/Less
+  // * while/LoopCond
+  // * while/Switch, while/Switch_1
+  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1
+  // * while/add/y, while/concat/axis
+  // * while/add, while/concat
+  // * while/NextIteration, while/NextIteration_1
+
+  int num_next_iteration = 0;
+  int num_next_iteration_1 = 0;
+  int num_exit = 0;
+  int num_exit_1 = 0;
+  int64 next_iter_start_micro;
+  int64 next_iter_1_start_micro;
+  int64 exit_start_micro;
+  int64 exit_1_start_micro;
+
+  std::unordered_map<string, int64> start_times;
+  for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+    for (const auto& stats : device_step_stats.node_stats()) {
+      start_times[stats.node_name()] = stats.all_start_micros();
+      if (stats.node_name() == "while/NextIteration") {
+        ++num_next_iteration;
+        next_iter_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/NextIteration_1") {
+        ++num_next_iteration_1;
+        next_iter_1_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/Exit") {
+        ++num_exit;
+        exit_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/Exit_1") {
+        ++num_exit_1;
+        exit_1_start_micro = stats.all_start_micros();
+      }
+    }
+  }
+
+  // Makes sure we run the loop body for ten times.
+  EXPECT_EQ(10, num_next_iteration);
+  EXPECT_EQ(10, num_next_iteration_1);
+  EXPECT_EQ(1, num_exit);
+  EXPECT_EQ(1, num_exit_1);
+
+  // Start times of while/NextIteration and while/NextIteration_1 should be
+  // different, so should be those of while/Exit and while/Exit_1.
+  EXPECT_NE(next_iter_start_micro, next_iter_1_start_micro);
+  EXPECT_NE(exit_start_micro, exit_1_start_micro);
+
+  // Checks dependency among the nodes; no matter what scheduling mechanism we
+  // use, the scheduled ops should follow these dependency chains.
+  // We have to break the loop into two parts, identified by Switch outputs.
+  ValidateDependencyChain(
+      start_times,
+      {"Const", "while/Enter", "while/Merge", "while/Less/y", "while/Less",
+       "while/LoopCond", "while/Switch", "while/Exit"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add/y",
+                                        "while/add", "while/NextIteration"});
+  ValidateDependencyChain(
+      start_times, {"ones", "while/Enter_1", "while/Merge_1", "while/Switch_1",
+                    "while/Exit_1"});
+  ValidateDependencyChain(start_times, {"while/Identity_1", "while/concat",
+                                        "while/NextIteration_1"});
+  ValidateDependencyChain(
+      start_times, {"while/Identity", "while/concat/axis", "while/concat"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add"});
+}
+
 TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
   // Init.
   CreateGrapplerItemWithInterDeviceTransfers();
diff --git a/tensorflow/core/grappler/graph_topology_view.cc b/tensorflow/core/grappler/graph_topology_view.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79e2f9a92fda7bbbc9018f678aeee2b95d763ffc
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view.cc
@@ -0,0 +1,191 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
+
+#include <algorithm>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+template <typename T>
+inline void SortAndRemoveDuplicates(T* v) {
+  std::sort(v->begin(), v->end());
+  v->erase(std::unique(v->begin(), v->end()), v->end());
+}
+
+}  // namespace
+
+Status GraphTopologyView::InitializeFromGraph(
+    const GraphDef& graph,
+    const absl::Span<const GraphView::Edge> ephemeral_edges) {
+  if (graph_ != nullptr) {
+    return errors::InvalidArgument("GraphTopologyView is already initialized.");
+  }
+
+  graph_ = &graph;
+  num_nodes_ = graph.node_size();
+  index_to_node_name_.resize(num_nodes_);
+  node_name_to_index_.rehash(num_nodes_);
+  fanins_.resize(num_nodes_);
+  fanouts_.resize(num_nodes_);
+
+  // Build map from name to index and vice versa.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    node_name_to_index_.emplace(node.name(), node_idx);
+    index_to_node_name_.emplace_back(node.name());
+  }
+
+  // 1. Add ephemeral edges to the adjacency lists.
+  for (const GraphView::Edge& edge : ephemeral_edges) {
+    const auto src = node_name_to_index_.find(edge.src.node->name());
+    const bool valid_src = src != node_name_to_index_.end();
+
+    if (!valid_src) {
+      const string error_message =
+          absl::StrCat("Non-existent src node: ", edge.src.node->name());
+      if (skip_invalid_edges_) {
+        VLOG(0) << "Skip error: " << error_message;
+      } else {
+        return errors::InvalidArgument(error_message);
+      }
+    }
+
+    const auto dst = node_name_to_index_.find(edge.dst.node->name());
+    const bool valid_dst = dst != node_name_to_index_.end();
+
+    if (!valid_dst) {
+      const string error_message =
+          absl::StrCat("Non-existent dst node: ", edge.dst.node->name());
+      if (skip_invalid_edges_) {
+        VLOG(0) << "Skip error: " << error_message;
+      } else {
+        return errors::InvalidArgument(error_message);
+      }
+    }
+
+    if (valid_dst && valid_src) {
+      const int src_idx = src->second;
+      const int dst_idx = dst->second;
+      fanins_[dst_idx].push_back(src_idx);
+      fanouts_[src_idx].push_back(dst_idx);
+    }
+  }
+
+  // 2. Add graph edges to the adjacency lists.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    fanins_[node_idx].reserve(node.input_size());
+
+    for (const string& input : node.input()) {
+      TensorId tensor = ParseTensorName(input);
+      const auto it = node_name_to_index_.find(tensor.node());
+      const bool valid_input = it != node_name_to_index_.end();
+
+      if (!valid_input) {
+        const string error_message = absl::StrCat("Non-existent input ", input,
+                                                  " in node ", node.name());
+        if (skip_invalid_edges_) {
+          VLOG(3) << "Skip error: " << error_message;
+        } else {
+          return errors::InvalidArgument(error_message);
+        }
+      }
+
+      if (valid_input) {
+        const int input_idx = it->second;
+        fanins_[node_idx].push_back(input_idx);
+        fanouts_[input_idx].push_back(node_idx);
+      }
+    }
+
+    // Dedup the input list while it's still hot in cache.
+    SortAndRemoveDuplicates(&fanins_[node_idx]);
+  }
+
+  // Dedup outputs for all the graph nodes.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    SortAndRemoveDuplicates(&fanouts_[node_idx]);
+  }
+
+  return Status::OK();
+}
+
+Status GraphTopologyView::InitializeFromGraph(const GraphDef& graph) {
+  return InitializeFromGraph(graph, absl::Span<GraphView::Edge>());
+}
+
+bool GraphTopologyView::HasNode(const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  return it != node_name_to_index_.end();
+}
+
+const NodeDef* GraphTopologyView::GetNode(
+    const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  return it == node_name_to_index_.end() ? nullptr : &graph_->node(it->second);
+}
+
+const NodeDef* GraphTopologyView::GetNode(int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  DCHECK(node_idx >= 0 && node_idx < num_nodes_) << "node_idx is out of range";
+  return &graph_->node(node_idx);
+}
+
+const absl::optional<int> GraphTopologyView::GetNodeIndex(
+    const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  DCHECK(it != node_name_to_index_.end()) << "Node doesn't exist in a graph";
+  return it == node_name_to_index_.end() ? absl::nullopt
+                                         : absl::make_optional(it->second);
+}
+
+const absl::optional<int> GraphTopologyView::GetNodeIndex(
+    const NodeDef& node) const {
+  return GetNodeIndex(node.name());
+}
+
+const absl::InlinedVector<int, 4>& GraphTopologyView::GetFanin(
+    int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const bool is_valid_node_idx = node_idx >= 0 && node_idx < num_nodes_;
+  DCHECK(is_valid_node_idx) << "node_idx is out of range";
+  return is_valid_node_idx ? fanins_[node_idx] : empty_fanin_;
+}
+
+const absl::InlinedVector<int, 2>& GraphTopologyView::GetFanout(
+    int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const bool is_valid_node_idx = node_idx >= 0 && node_idx < num_nodes_;
+  DCHECK(is_valid_node_idx) << "node_idx is out of range";
+  return is_valid_node_idx ? fanouts_[node_idx] : empty_fanout_;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_topology_view.h b/tensorflow/core/grappler/graph_topology_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..c40d0093b9063f4e0dadaa6c607154fdbb4986ab
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view.h
@@ -0,0 +1,111 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// GraphTopologyView is a helper class to simplify `node-to-node` connectivity
+// traversals. Regular `GraphView` simplifies `tensor-to-tensor` traversals:
+// connections between output tensors and inputs of a consumer nodes. For the
+// topology view we are focused on nodes connected to nodes, and it's irrelevant
+// if this connection is formed by one or multiple individual tensors.
+//
+// Example:
+//   a = Placeholder(..)
+//   b = Placeholder(..)
+//   c = AddN([a, a, b])
+//
+// GraphView edges:         [a:0 -> c:0, a:0 -> c:1, b:0 -> c:3]
+// GraphTopologyView edges: [a -> c, b -> c]
+//
+// GraphView is used for exploring single node fanins and fanouts, and
+// GraphTopologyView is focused on efficient full graph traversals (computing
+// graph node properties from transitive fanouts, etc...).
+class GraphTopologyView {
+ public:
+  GraphTopologyView() = default;
+  explicit GraphTopologyView(bool skip_invalid_edges)
+      : skip_invalid_edges_(skip_invalid_edges) {}
+
+  // Initialize graph topology view from the graph. It's possible to pass
+  // additional edges that do not exist in a graph, but must be respected when
+  // computing graph topology. Example: Tensorflow runtime allows concurrent
+  // execution of dequeue/enqueue ops from the same queue resource, but we might
+  // want to enforce ordering between them for the purpose of graph analysis.
+  Status InitializeFromGraph(const GraphDef& graph,
+                             absl::Span<const GraphView::Edge> ephemeral_edges);
+  Status InitializeFromGraph(const GraphDef& graph);
+
+  bool is_initialized() const { return graph_ != nullptr; }
+  int num_nodes() const { return num_nodes_; }
+  const GraphDef* graph() const { return graph_; }
+
+  // Returns true iff the node exists in the underlying graph.
+  bool HasNode(absl::string_view node_name) const;
+
+  // Finds a node by name or returns `nullptr` if it's not in the graph.
+  const NodeDef* GetNode(absl::string_view node_name) const;
+  // Returns a node corresponding to the given node index.
+  const NodeDef* GetNode(int node_idx) const;
+
+  // Returns a node index for the given node name, if the name exists in the
+  // underlying graph. Otherwise returns empty optional.
+  const absl::optional<int> GetNodeIndex(absl::string_view node_name) const;
+  // Returns a node index for the given node, if the node belongs to the
+  // underlying graph. Otherwise returns empty optional.
+  const absl::optional<int> GetNodeIndex(const NodeDef& node) const;
+
+  // Returns all the node indexes that are in the direct fanin of the given
+  // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector.
+  const absl::InlinedVector<int, 4>& GetFanin(int node_idx) const;
+  // Returns all the node indexes that are in the direct fanout of the given
+  // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector.
+  const absl::InlinedVector<int, 2>& GetFanout(int node_idx) const;
+
+ private:
+  // If true, all invalid edges and inputs (srd, dst or input node not found in
+  // a graph) will be skipped, otherwise initialization will fail with error.
+  bool skip_invalid_edges_ = false;
+
+  // WARN: `graph_` must outlive this object and graph nodes must not be
+  // destructed, because node names captured with absl::string_view.
+  const GraphDef* graph_ = nullptr;  // do not own
+  int num_nodes_ = 0;
+  std::vector<absl::string_view> index_to_node_name_;
+  absl::flat_hash_map<absl::string_view, int> node_name_to_index_;
+  std::vector<absl::InlinedVector<int, 4>> fanins_;   // node_idx->input nodes
+  std::vector<absl::InlinedVector<int, 2>> fanouts_;  // node_idx->output nodes
+
+  // We need a valid reference to return from GetFanin/GetFanout if the
+  // `node_idx` argument is outside of the [0, num_nodes_) range.
+  absl::InlinedVector<int, 4> empty_fanin_;
+  absl::InlinedVector<int, 2> empty_fanout_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
diff --git a/tensorflow/core/grappler/graph_topology_view_test.cc b/tensorflow/core/grappler/graph_topology_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36d3a2017cc5ef965a26b0bdbbbdde441fb633db
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class GraphTopologyViewTest : public ::testing::Test {
+ protected:
+  using NodeConfig = std::pair<string, std::vector<string>>;
+
+  static GraphDef CreateGraph(const std::vector<NodeConfig>& nodes) {
+    GraphDef graph;
+
+    for (const NodeConfig& node : nodes) {
+      const auto& node_name = node.first;
+      const auto& node_inputs = node.second;
+
+      NodeDef node_def;
+      node_def.set_name(node_name);
+      for (const string& input : node_inputs) {
+        node_def.add_input(input);
+      }
+
+      *graph.add_node() = std::move(node_def);
+    }
+
+    return graph;
+  }
+};
+
+TEST_F(GraphTopologyViewTest, SimpleGraph) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},          // idx: 0
+      {"b", {}},          // idx: 1
+      {"c", {"a", "b"}},  // idx: 2
+      {"d", {"a", "c"}},  // idx: 3
+  });
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+
+  EXPECT_TRUE(graph_view.is_initialized());
+
+  const NodeDef* a_by_name = graph_view.GetNode("a");
+  const NodeDef* a_by_idx = graph_view.GetNode(0);
+  ASSERT_TRUE(a_by_name);
+  ASSERT_TRUE(a_by_idx);
+  EXPECT_EQ(a_by_name, a_by_idx);
+
+  const NodeDef* b_by_name = graph_view.GetNode("b");
+  const NodeDef* b_by_idx = graph_view.GetNode(1);
+  ASSERT_TRUE(b_by_name);
+  ASSERT_TRUE(b_by_idx);
+  EXPECT_EQ(b_by_name, b_by_idx);
+
+  const absl::optional<int> b_idx = graph_view.GetNodeIndex(*b_by_name);
+  ASSERT_TRUE(b_idx.has_value());
+  EXPECT_EQ(b_idx.value(), 1);
+
+  const absl::optional<int> c_idx = graph_view.GetNodeIndex("c");
+  ASSERT_TRUE(c_idx.has_value());
+  EXPECT_EQ(c_idx.value(), 2);
+
+  using Fanin = absl::InlinedVector<int, 4>;
+  EXPECT_EQ(graph_view.GetFanin(0), Fanin());
+  EXPECT_EQ(graph_view.GetFanin(1), Fanin());
+  EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1}));
+  EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+  using Fanout = absl::InlinedVector<int, 2>;
+  EXPECT_EQ(graph_view.GetFanout(0), Fanout({2, 3}));
+  EXPECT_EQ(graph_view.GetFanout(1), Fanout({2}));
+  EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+  EXPECT_EQ(graph_view.GetFanout(3), Fanout());
+}
+
+TEST_F(GraphTopologyViewTest, GraphWithALoop) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},               // idx: 0
+      {"b", {}},               // idx: 1
+      {"c", {"a", "b", "d"}},  // idx: 2 <<<--- 'c' and 'd' have a loop
+      {"d", {"a", "c"}},       // idx: 3
+  });
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  EXPECT_TRUE(graph_view.is_initialized());
+
+  using Fanin = absl::InlinedVector<int, 4>;
+  EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1, 3}));
+  EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+  using Fanout = absl::InlinedVector<int, 2>;
+  EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+  EXPECT_EQ(graph_view.GetFanout(3), Fanout({2}));
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index ba9d2eb32181940bc430771db281c6cea8cb48c4..be9b9c36c71c6f8282862de85a211358fa826186 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -66,28 +66,27 @@ int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
 bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
                          int port) {
   const auto output = GraphView::OutputPort(node, port);
-  const auto fanout = graph_view.GetFanout(output);
-  return fanout.size() <= 1;
+  return graph_view.GetFanout(output).size() <= 1;
 }
 
 bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port) {
   const auto output = GraphView::OutputPort(node, port);
-  const auto fanout = graph_view.GetFanout(output);
-  return !fanout.empty();
+  return !graph_view.GetFanout(output).empty();
 }
 
-bool NoControlFanin(const GraphView& graph_view, const NodeDef* node) {
-  const auto control_port = GraphView::InputPort(node, -1);
-  return graph_view.GetFanin(control_port).empty();
+bool HasControlFanin(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::InputPort(node, Graph::kControlSlot);
+  return !graph_view.GetFanin(control_port).empty();
 }
 
-bool NoControlFanout(const GraphView& graph_view, const NodeDef* node) {
-  const auto control_port = GraphView::OutputPort(node, -1);
-  return graph_view.GetFanout(control_port).empty();
+bool HasControlFanout(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::OutputPort(node, Graph::kControlSlot);
+  return !graph_view.GetFanout(control_port).empty();
 }
 
-bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
-  return NoControlFanin(graph_view, node) && NoControlFanout(graph_view, node);
+bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
+  return HasControlFanin(graph_view, node) ||
+         HasControlFanout(graph_view, node);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 16156d0f2042763a7518d5de2c57440343e50f2d..63c58a0aede059c6def5eca322ce3c491ea709b7 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -143,13 +143,20 @@ class GraphViewInternal {
 
   // Gets the output port(s) in the immediate fanin of an input port.
   absl::flat_hash_set<OutputPort> GetFanin(const InputPort& port) const {
-    if (port.port_id >= 0) return {GetRegularFanin(port)};
+    if (port.port_id >= 0) {
+      OutputPort regular_fanin = GetRegularFanin(port);
+      if (regular_fanin.node == nullptr) {
+        return {};
+      }
+      return {regular_fanin};
+    }
 
     // Collect fanin for the control input.
     absl::flat_hash_set<OutputPort> result;
-    for (int i = port.node->input_size() - 1; i >= 0; --i) {
+    const int first_control_port =
+        gtl::FindWithDefault(max_regular_input_port_, port.node, -1) + 1;
+    for (int i = first_control_port; i < port.node->input_size(); ++i) {
       TensorId tensor_id = ParseTensorName(port.node->input(i));
-      if (tensor_id.index() >= 0) break;  // we reached regular inputs
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
@@ -158,23 +165,36 @@ class GraphViewInternal {
   }
 
   // Special case: regular (i.e. non-control) input ports can only have one
-  // fanin.
+  // fanin. If port.port_id is out of range or is a control dependency, then an
+  // empty OutputPort is returned.
   const OutputPort GetRegularFanin(const InputPort& port) const {
-    DCHECK_GE(port.port_id, 0);
-    if (port.port_id < 0) return OutputPort();
+    if (port.port_id < 0 ||
+        port.port_id >
+            gtl::FindWithDefault(max_regular_input_port_, port.node, -1)) {
+      return OutputPort();
+    }
 
     TensorId tensor_id = ParseTensorName(port.node->input(port.port_id));
     return GetOutputPort(tensor_id.node(), tensor_id.index());
   }
 
   // Checks if a tensor id is a fanin of the node.
-  bool HasFanin(const NodeDef& node, const TensorId& fanin) const {
-    if (fanin.index() < -1) {
+  bool HasFanin(const NodeDefT& node, const TensorId& fanin) const {
+    int end = node.input_size();
+    if (end == 0 || fanin.index() < -1) {
       return false;
     }
-    string fanin_string = TensorIdToString(fanin);
-    for (int i = 0; i < node.input_size(); ++i) {
-      if (node.input(i) == fanin_string) {
+
+    const int num_regular_fanins =
+        gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1;
+    int start = 0;
+    if (fanin.index() > -1) {
+      end = num_regular_fanins;
+    } else {
+      start = num_regular_fanins;
+    }
+    for (int i = start; i < end; ++i) {
+      if (ParseTensorName(node.input(i)) == fanin) {
         return true;
       }
     }
@@ -184,14 +204,14 @@ class GraphViewInternal {
   // Gets all the input ports in the immediate fanout of a node. Include the
   // controlled nodes iff include_controlled_nodes is true.
   absl::flat_hash_set<InputPort> GetFanouts(
-      const NodeDef& node, bool include_controlled_nodes) const {
+      const NodeDefT& node, bool include_controlled_nodes) const {
     absl::flat_hash_set<InputPort> result;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -206,11 +226,14 @@ class GraphViewInternal {
   // Gets all the output ports in the immediate fanin of a node. Include the
   // controlling nodes iff include_controlling_nodes is true.
   absl::flat_hash_set<OutputPort> GetFanins(
-      const NodeDef& node, bool include_controlling_nodes) const {
+      const NodeDefT& node, bool include_controlling_nodes) const {
     absl::flat_hash_set<OutputPort> result;
-    for (int i = 0; i < node.input_size(); ++i) {
+    const int max_input_port =
+        include_controlling_nodes
+            ? node.input_size() - 1
+            : gtl::FindWithDefault(max_regular_input_port_, &node, -1);
+    for (int i = 0; i <= max_input_port; ++i) {
       TensorId tensor_id = ParseTensorName(node.input(i));
-      if (tensor_id.index() < 0 && !include_controlling_nodes) break;
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
@@ -220,27 +243,23 @@ class GraphViewInternal {
 
   // Gets the number of ports in the immediate fanin of a node. Count the
   // controlling nodes iff include_controlling_nodes is true.
-  int NumFanins(const NodeDef& node, bool include_controlling_nodes) const {
-    int count = 0;
-    for (const string& input : node.input()) {
-      if (!include_controlling_nodes && IsControlInput(input)) {
-        break;
-      }
-      count += 1;
+  int NumFanins(const NodeDefT& node, bool include_controlling_nodes) const {
+    if (include_controlling_nodes) {
+      return node.input_size();
     }
-    return count;
+    return gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1;
   }
 
   // Gets the number of ports in the immediate fanout of a node. Count the
   // controlled nodes iff include_controlled_nodes is true.
-  int NumFanouts(const NodeDef& node, bool include_controlled_nodes) const {
+  int NumFanouts(const NodeDefT& node, bool include_controlled_nodes) const {
     int count = 0;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -254,7 +273,7 @@ class GraphViewInternal {
   // Gets all the edges in the immediate fanout of a node. Include the
   // controlled edges iff include_controlled_edges is true.
   absl::flat_hash_set<Edge> GetFanoutEdges(
-      const NodeDef& node, bool include_controlled_edges) const {
+      const NodeDefT& node, bool include_controlled_edges) const {
     absl::flat_hash_set<Edge> result;
 
     OutputPort port;
@@ -268,8 +287,7 @@ class GraphViewInternal {
       auto it = fanouts_.find(port);
       if (it != fanouts_.end()) {
         for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
-          result.emplace(/*src=*/OutputPort(const_cast<NodeDefT*>(&node), i),
-                         /*dst=*/*itr);
+          result.emplace(/*src=*/port, /*dst=*/*itr);
         }
       }
     }
@@ -279,11 +297,14 @@ class GraphViewInternal {
   // Gets all the edges in the immediate fanin of a node. Include the
   // controlling edges iff include_controlling_edges is true.
   absl::flat_hash_set<Edge> GetFaninEdges(
-      const NodeDef& node, bool include_controlling_edges) const {
+      const NodeDefT& node, bool include_controlling_edges) const {
     absl::flat_hash_set<Edge> result;
-    for (int i = 0; i < node.input_size(); ++i) {
+    const int max_input_port =
+        include_controlling_edges
+            ? node.input_size() - 1
+            : gtl::FindWithDefault(max_regular_input_port_, &node, -1);
+    for (int i = 0; i <= max_input_port; ++i) {
       TensorId tensor_id = ParseTensorName(node.input(i));
-      if (tensor_id.index() < 0 && !include_controlling_edges) break;
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) {
@@ -297,14 +318,24 @@ class GraphViewInternal {
  protected:
   explicit GraphViewInternal(GraphDefT* graph) : graph_(graph) {}
 
+  Status AddUniqueNode(NodeDefT* node) {
+    auto inserted = nodes_.emplace(node->name(), node);
+    return inserted.second
+               ? Status::OK()
+               : errors::InvalidArgument("Non unique node name detected: ",
+                                         node->name());
+  }
+
+  // TODO(ezhulenev): Remove this function.
   void AddUniqueNodeOrDie(NodeDefT* node) {
-    auto result = nodes_.emplace(node->name(), node);
-    // TODO(ezhulenev): Replace CHECK with factory method returning
-    // absl::StatusOr (when available).
-    CHECK(result.second) << "Non unique node name detected: " << node->name();
+    Status st = AddUniqueNode(node);
+    CHECK(st.ok()) << st.error_message();
   }
 
+  // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins
+  // exist, and all regular fanins come before controlling fanins.
   void AddFanouts(NodeDefT* node) {
+    int max_input_port = -1;
     for (int i = 0; i < node->input_size(); ++i) {
       TensorId tensor_id = ParseTensorName(node->input(i));
       OutputPort output(nodes_[tensor_id.node()], tensor_id.index());
@@ -312,11 +343,15 @@ class GraphViewInternal {
       if (output.port_id < 0) {
         fanouts_[output].emplace(node, -1);
       } else {
+        max_input_port = i;
         max_regular_output_port_[output.node] =
             std::max(max_regular_output_port_[output.node], output.port_id);
         fanouts_[output].emplace(node, i);
       }
     }
+    if (max_input_port > -1) {
+      max_regular_input_port_[node] = max_input_port;
+    }
   }
 
   // Access to the mutable internal state for MutableGraphView.
@@ -326,7 +361,11 @@ class GraphViewInternal {
     return fanouts_;
   }
 
-  absl::flat_hash_map<const NodeDef*, int>& max_regular_output_port() {
+  absl::flat_hash_map<const NodeDefT*, int>& max_regular_input_port() {
+    return max_regular_input_port_;
+  }
+
+  absl::flat_hash_map<const NodeDefT*, int>& max_regular_output_port() {
     return max_regular_output_port_;
   }
 
@@ -339,10 +378,13 @@ class GraphViewInternal {
   // A mapping from the output port to all inputs that read from it.
   absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>> fanouts_;
 
+  // Keep a maximum index of input tensors of the node.
+  absl::flat_hash_map<const NodeDefT*, int> max_regular_input_port_;
+
   // Keep a maximum index of tensor fetched from the node. It doesn't guarantee
   // that all tensors in the [0, max_regular_output_port] range are actually
   // fetched by other nodes.
-  absl::flat_hash_map<const NodeDef*, int> max_regular_output_port_;
+  absl::flat_hash_map<const NodeDefT*, int> max_regular_output_port_;
 
   // If the node has no fanouts at given output port (output tensor consumers)
   // we return a reference to this set from `GetFanout` (we can't construct new
@@ -370,10 +412,12 @@ bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
 
 // Returns true if node has at least one fanout node at given output port.
 bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0);
-
-bool NoControlFanin(const GraphView& graph_view, const NodeDef* node);
-bool NoControlFanout(const GraphView& graph_view, const NodeDef* node);
-bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one input control dependency.
+bool HasControlFanin(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one output control dependency.
+bool HasControlFanout(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one input or output control dependency.
+bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index 404dcd30c12781f2f9581ac6a1cb5986bb75f187..839057065b4e3f13dc55b9c0a7ddcfd94a165376 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -42,26 +43,24 @@ TEST_F(GraphViewTest, OpPortIdToArgIdShapeN) {
 
   const OpDef* a_op_def = nullptr;
   const OpDef* b_op_def = nullptr;
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(a_node_def.op(), &a_op_def).ok());
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(a_node_def.op(), &a_op_def));
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def));
 
   // Const has 0 inputs, 1 output.
-  EXPECT_EQ(-1, OpInputPortIdToArgId(a_node_def, *a_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(a_node_def, *a_op_def, 0));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(a_node_def, *a_op_def, 1));
+  EXPECT_EQ(OpInputPortIdToArgId(a_node_def, *a_op_def, 0), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(a_node_def, *a_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(a_node_def, *a_op_def, 1), -1);
 
   // ShapeN has N=3 inputs and outputs.
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
-  EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 1));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 2));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 3));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 4));
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 1), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 2), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 3), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 1), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 2), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 3), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 4), -1);
 }
 
 TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
@@ -76,22 +75,21 @@ TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
 
     const NodeDef& b_node_def = *graph_view.GetNode("b");
     const OpDef* b_op_def = nullptr;
-    EXPECT_TRUE(
-        OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
+    TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def));
 
     // We have 4 inputs.
-    EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
-    EXPECT_EQ(1, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
-    EXPECT_EQ(2, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
-    EXPECT_EQ(3, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
-    EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 4));
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 1), 1);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 2), 2);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 3), 3);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 4), -1);
 
     for (int port_id = 0; port_id <= num_splits * 3; ++port_id) {
       int arg_id = -1;
       if (port_id < num_splits * 3) {
         arg_id = port_id / num_splits;
       }
-      EXPECT_EQ(arg_id, OpOutputPortIdToArgId(b_node_def, *b_op_def, port_id));
+      EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, port_id), arg_id);
     }
   }
 }
@@ -110,18 +108,17 @@ TEST_F(GraphViewTest, ParseSingleExample) {
   const NodeDef& c_node_def = *graph_view.GetNode("c");
 
   const OpDef* c_op_def = nullptr;
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(c_node_def.op(), &c_op_def).ok());
-
-  EXPECT_EQ(0, OpOutputPortIdToArgId(c_node_def, *c_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(c_node_def, *c_op_def, 1));
-  EXPECT_EQ(1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 2));
-  EXPECT_EQ(1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 3));
-  EXPECT_EQ(2, OpOutputPortIdToArgId(c_node_def, *c_op_def, 4));
-  EXPECT_EQ(2, OpOutputPortIdToArgId(c_node_def, *c_op_def, 5));
-  EXPECT_EQ(3, OpOutputPortIdToArgId(c_node_def, *c_op_def, 6));
-  EXPECT_EQ(3, OpOutputPortIdToArgId(c_node_def, *c_op_def, 7));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 8));
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(c_node_def.op(), &c_op_def));
+
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 1), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 2), 1);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 3), 1);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 4), 2);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 5), 2);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 6), 3);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 7), 3);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 8), -1);
 }
 
 TEST_F(GraphViewTest, BasicGraph) {
@@ -132,26 +129,26 @@ TEST_F(GraphViewTest, BasicGraph) {
   GraphView graph(&item.graph);
 
   GraphView::InputPort input = graph.GetInputPort("AddN", 0);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(0, input.port_id);
+  EXPECT_EQ(input.node->name(), "AddN");
+  EXPECT_EQ(input.port_id, 0);
   GraphView::OutputPort fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  EXPECT_EQ(fanin.node->name(), "Square");
+  EXPECT_EQ(fanin.port_id, 0);
 
   input = graph.GetInputPort("AddN", 1);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(1, input.port_id);
+  EXPECT_EQ(input.node->name(), "AddN");
+  EXPECT_EQ(input.port_id, 1);
   fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square_1", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  EXPECT_EQ(fanin.node->name(), "Square_1");
+  EXPECT_EQ(fanin.port_id, 0);
 
   GraphView::OutputPort output = graph.GetOutputPort("AddN", 0);
-  EXPECT_EQ("AddN", output.node->name());
-  EXPECT_EQ(0, output.port_id);
-  EXPECT_EQ(2, graph.GetFanout(output).size());
+  EXPECT_EQ(output.node->name(), "AddN");
+  EXPECT_EQ(output.port_id, 0);
+  EXPECT_EQ(graph.GetFanout(output).size(), 2);
   for (auto fanout : graph.GetFanout(output)) {
     if (fanout.node->name() == "AddN_2" || fanout.node->name() == "AddN_3") {
-      EXPECT_EQ(0, fanout.port_id);
+      EXPECT_EQ(fanout.port_id, 0);
     } else {
       // Invalid fanout
       EXPECT_FALSE(true);
@@ -159,7 +156,7 @@ TEST_F(GraphViewTest, BasicGraph) {
   }
 
   const NodeDef* add_node = graph.GetNode("AddN");
-  EXPECT_NE(nullptr, add_node);
+  EXPECT_NE(add_node, nullptr);
 
   absl::flat_hash_set<string> fanouts;
   absl::flat_hash_set<string> expected_fanouts = {"AddN_2:0", "AddN_3:0"};
@@ -190,44 +187,44 @@ TEST_F(GraphViewTest, ControlDependencies) {
   GraphView graph(&item.graph);
 
   GraphView::OutputPort output = graph.GetOutputPort("a", -1);
-  EXPECT_EQ("a", output.node->name());
-  EXPECT_EQ(-1, output.port_id);
+  EXPECT_EQ(output.node->name(), "a");
+  EXPECT_EQ(output.port_id, -1);
   auto fanout = graph.GetFanout(output);
-  EXPECT_EQ(1, fanout.size());
-  EXPECT_EQ("d", (*fanout.begin()).node->name());
-  EXPECT_EQ(-1, (*fanout.begin()).port_id);
+  EXPECT_EQ(fanout.size(), 1);
+  EXPECT_EQ((*fanout.begin()).node->name(), "d");
+  EXPECT_EQ((*fanout.begin()).port_id, -1);
 
   output = graph.GetOutputPort("a", 0);
-  EXPECT_EQ("a", output.node->name());
-  EXPECT_EQ(0, output.port_id);
+  EXPECT_EQ(output.node->name(), "a");
+  EXPECT_EQ(output.port_id, 0);
   fanout = graph.GetFanout(output);
-  EXPECT_EQ(1, fanout.size());
-  EXPECT_EQ("b", (*fanout.begin()).node->name());
-  EXPECT_EQ(0, (*fanout.begin()).port_id);
+  EXPECT_EQ(fanout.size(), 1);
+  EXPECT_EQ((*fanout.begin()).node->name(), "b");
+  EXPECT_EQ((*fanout.begin()).port_id, 0);
 
   GraphView::InputPort input = graph.GetInputPort("d", -1);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(-1, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, -1);
   auto fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("a", (*fanin.begin()).node->name());
-  EXPECT_EQ(-1, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "a");
+  EXPECT_EQ((*fanin.begin()).port_id, -1);
 
   input = graph.GetInputPort("d", 0);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(0, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, 0);
   fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("b", (*fanin.begin()).node->name());
-  EXPECT_EQ(0, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "b");
+  EXPECT_EQ((*fanin.begin()).port_id, 0);
 
   input = graph.GetInputPort("d", 1);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(1, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, 1);
   fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("c", (*fanin.begin()).node->name());
-  EXPECT_EQ(0, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "c");
+  EXPECT_EQ((*fanin.begin()).port_id, 0);
 }
 
 TEST_F(GraphViewTest, HasNode) {
@@ -238,8 +235,8 @@ TEST_F(GraphViewTest, HasNode) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   GraphView graph(&item.graph);
 
-  EXPECT_EQ(true, graph.HasNode("a"));
-  EXPECT_EQ(false, graph.HasNode("b"));
+  EXPECT_EQ(graph.HasNode("a"), true);
+  EXPECT_EQ(graph.HasNode("b"), false);
 }
 
 TEST_F(GraphViewTest, HasFanin) {
@@ -254,14 +251,42 @@ TEST_F(GraphViewTest, HasFanin) {
   GraphView graph(&item.graph);
 
   const NodeDef* d_node = graph.GetNode("d");
-  EXPECT_NE(nullptr, d_node);
-
-  EXPECT_EQ(true, graph.HasFanin(*d_node, {"a", Graph::kControlSlot}));
-  EXPECT_EQ(false, graph.HasFanin(*d_node, {"a", 0}));
-  EXPECT_EQ(true, graph.HasFanin(*d_node, {"b", 0}));
-  EXPECT_EQ(false, graph.HasFanin(*d_node, {"b", Graph::kControlSlot}));
-  EXPECT_EQ(true, graph.HasFanin(*d_node, {"c", 0}));
-  EXPECT_EQ(false, graph.HasFanin(*d_node, {"c", Graph::kControlSlot}));
+  EXPECT_NE(d_node, nullptr);
+
+  EXPECT_EQ(graph.HasFanin(*d_node, {"a", Graph::kControlSlot}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"a", 0}), false);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"b", 0}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"b", Graph::kControlSlot}), false);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"c", 0}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"c", Graph::kControlSlot}), false);
+}
+
+TEST_F(GraphViewTest, GetRegularFaninPortOutOfBounds) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Square(s.WithOpName("b"), {});
+  Output c = ops::Sqrt(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(a), {b, c});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  const NodeDef* b_node = graph.GetNode("b");
+  EXPECT_NE(b_node, nullptr);
+  const NodeDef* c_node = graph.GetNode("c");
+  EXPECT_NE(c_node, nullptr);
+  const NodeDef* d_node = graph.GetNode("d");
+  EXPECT_NE(d_node, nullptr);
+
+  auto d_output_0 = graph.GetRegularFanin({d_node, 0});
+  EXPECT_EQ(d_output_0, GraphView::OutputPort(b_node, 0));
+  auto d_output_1 = graph.GetRegularFanin({d_node, 1});
+  EXPECT_EQ(d_output_1, GraphView::OutputPort(c_node, 0));
+  auto d_output_2 = graph.GetRegularFanin({d_node, 2});
+  EXPECT_EQ(d_output_2, GraphView::OutputPort());
+  auto d_output_control = graph.GetRegularFanin({d_node, Graph::kControlSlot});
+  EXPECT_EQ(d_output_control, GraphView::OutputPort());
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 2d71ac54cc7af2b40e42ef34d198fd42f4b0a3d4..e27d2b049c2edefca32cb14886441497a11d8b9e 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -43,7 +43,7 @@ GrapplerItem GrapplerItem::WithGraph(GraphDef&& graph_def) const {
   item.save_restore_loc_tensor = save_restore_loc_tensor;
   item.queue_runners = queue_runners;
   item.devices_ = devices_;
-  item.allowed_optimizations_ = allowed_optimizations_;
+  item.optimization_options_ = optimization_options_;
   item.graph.Swap(&graph_def);
   return item;
 }
@@ -115,9 +115,15 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
     }
   }
 
-  if (!allowed_optimizations_.prune_ops_with_side_effects) {
+  // Tensorflow functions do not prune stateful or dataset-output ops from
+  // the function body (see PruneFunctionBody in common_runtime/function.cc).
+  //
+  // We also keep placeholders in the functions body, because it's a bug to have
+  // placeholders inside functions, and we want to catch such invalid graphs
+  // early.
+  if (optimization_options_.is_function_instantiation) {
     for (const NodeDef& node : graph.node()) {
-      if (!IsFreeOfSideEffect(node)) {
+      if (IsStateful(node) || IsDataset(node) || IsPlaceholder(node)) {
         result.insert(node.name());
       }
     }
@@ -175,13 +181,13 @@ Status GrapplerItem::InferDevicesFromGraph() {
 
 void GrapplerItem::ClearDevices() { devices_.clear(); }
 
-const GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations()
+const GrapplerItem::OptimizationOptions& GrapplerItem::optimization_options()
     const {
-  return allowed_optimizations_;
+  return optimization_options_;
 }
 
-GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations() {
-  return allowed_optimizations_;
+GrapplerItem::OptimizationOptions& GrapplerItem::optimization_options() {
+  return optimization_options_;
 }
 
 std::vector<const NodeDef*> ComputeTransitiveFanin(
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 1ae551f5ac9f5ed09dbaf2c399bf1a464dfab138..75712e9f92cc47007caae65be9a4e265458fa619 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -81,17 +81,17 @@ struct GrapplerItem {
   // fetch nodes, keep_ops, init_ops.
   std::unordered_set<string> NodesToPreserve() const;
 
-  // Restrict types of optimizations that are allowed for this GrapplerItem.
-  struct AllowedOptimizations {
+  struct OptimizationOptions {
     // Is it allowed to add nodes to the graph that do not have registered
     // gradient function.
-    bool non_differentiable_rewrites = true;
-
-    // By default we are allowed to prune ops with side-effects from the main
-    // graph if they are not in transitive fanin of the fetch nodes. If we are
-    // optimizing a graph that was instantiated by a function definition, we
-    // must keep all side effects intact.
-    bool prune_ops_with_side_effects = true;
+    bool allow_non_differentiable_rewrites = true;
+
+    // Tensorflow function execution semantics is slightly different from the
+    // main Tensorflow graph, and we need to make sure that we do not change it
+    // by running Grappler optimizer passes. One main difference is that
+    // functions do not prune ops with side-effects and dataset-output ops (see
+    // PruneFunctionBody in common_runtime/function.cc).
+    bool is_function_instantiation = false;
   };
 
   const std::unordered_set<string>& devices() const;
@@ -108,8 +108,8 @@ struct GrapplerItem {
   // Clears a set of available devices.
   void ClearDevices();
 
-  const AllowedOptimizations& allowed_optimizations() const;
-  AllowedOptimizations& allowed_optimizations();
+  const OptimizationOptions& optimization_options() const;
+  OptimizationOptions& optimization_options();
 
  private:
   // TODO(ezhulenev) Make GrapplerItem a class and hide all public data members.
@@ -120,7 +120,7 @@ struct GrapplerItem {
   // Example of a fully defined name: "/job:work/replica:1/task:1/device:CPU:0"
   std::unordered_set<string> devices_;
 
-  AllowedOptimizations allowed_optimizations_;
+  OptimizationOptions optimization_options_;
 };
 
 // Return the transitive fanin of a set of terminal nodes.
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 9224ee7849211f849c3655d6faea18dcc32b8e17..fc55fb5b3d2f905fc0fab837a9345b7e396acd13 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -103,7 +103,12 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
 
   // Instantiate all variables for function library runtime creation.
   std::vector<std::unique_ptr<Device>> devices;
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
+  // Only CPU device is used so instead of calling DeviceFactory::AddDevices()
+  // with dummy session config, which will conflict with user defined options
+  // and create unwanted devices, call cpu_factory->CreateDevices() to get CPU
+  // only devices.
+  DeviceFactory* cpu_factory = DeviceFactory::GetFactory("CPU");
+  TF_RETURN_IF_ERROR(cpu_factory->CreateDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
   Device* cpu_device = devices[0].get();
   std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(std::move(devices)));
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index ffa204028cca828147810c99277fdcd9cb05f5ee..286c30cd356baf408bb227236d9369f81ab8b1ad 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -49,7 +49,11 @@ cc_library(
     deps = [
         ":input_yielder",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/kernels:aggregate_ops",
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index ca4d5255c0fb321fa3c744480d7b81f975a02589..724efcd21e347f019a67bf92f71ba2cd2c9589c6 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -18,16 +18,21 @@ limitations under the License.
 #include <algorithm>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -39,8 +44,212 @@ bool IsTensorIdPortValid(const TensorId& tensor_id) {
   return tensor_id.index() >= Graph::kControlSlot;
 }
 
+bool IsTensorIdRegular(const TensorId& tensor_id) {
+  return tensor_id.index() > Graph::kControlSlot;
+}
+
+bool IsTensorIdControlling(const TensorId& tensor_id) {
+  return tensor_id.index() == Graph::kControlSlot;
+}
+
+bool IsOutputPortRegular(const MutableGraphView::OutputPort& port) {
+  return port.port_id > Graph::kControlSlot;
+}
+
+bool IsOutputPortControlling(const MutableGraphView::OutputPort& port) {
+  return port.port_id == Graph::kControlSlot;
+}
+
+// Determines if node is an Identity where it's first regular input is a Switch
+// node.
+bool IsIdentityConsumingSwitch(const MutableGraphView& graph,
+                               const NodeDef& node) {
+  if ((IsIdentity(node) || IsIdentityNSingleInput(node)) &&
+      node.input_size() > 0) {
+    TensorId tensor_id = ParseTensorName(node.input(0));
+    if (IsTensorIdControlling(tensor_id)) {
+      return false;
+    }
+
+    NodeDef* input_node = graph.GetNode(tensor_id.node());
+    return IsSwitch(*input_node);
+  }
+  return false;
+}
+
+// Determines if node input can be deduped by regular inputs when used as a
+// control dependency. Specifically, if a node is an Identity that leads to a
+// Switch node, when used as a control dependency, that control dependency
+// should not be deduped even though the same node is used as a regular input.
+bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
+                                     const NodeDef& control_node) {
+  return !IsIdentityConsumingSwitch(graph, control_node);
+}
+
+// Determines if node input can be deduped by regular inputs when used as a
+// control dependency. Specifically, if a node is an Identity that leads to a
+// Switch node, when used as a control dependency, that control dependency
+// should not be deduped even though the same node is used as a regular input.
+bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
+                                     absl::string_view control_node_name) {
+  NodeDef* control_node = graph.GetNode(control_node_name);
+  return CanDedupControlWithRegularInput(graph, *control_node);
+}
+
+Status MutationError(absl::string_view function_name, absl::string_view params,
+                     absl::string_view msg) {
+  return errors::InvalidArgument(absl::Substitute(
+      "MutableGraphView::$0($1) error: $2.", function_name, params, msg));
+}
+
+using ErrorHandler = std::function<Status(absl::string_view)>;
+
+ErrorHandler UpdateFanoutsError(absl::string_view from_node_name,
+                                absl::string_view to_node_name) {
+  return [from_node_name, to_node_name](absl::string_view msg) {
+    string params = absl::Substitute("from_node_name='$0', to_node_name='$1'",
+                                     from_node_name, to_node_name);
+    return MutationError("UpdateFanouts", params, msg);
+  };
+}
+
+Status CheckFaninIsRegular(const TensorId& fanin, ErrorHandler handler) {
+  if (!IsTensorIdRegular(fanin)) {
+    return handler(absl::Substitute("fanin '$0' must be a regular tensor id",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckFaninIsValid(const TensorId& fanin, ErrorHandler handler) {
+  if (!IsTensorIdPortValid(fanin)) {
+    return handler(absl::Substitute("fanin '$0' must be a valid tensor id",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckAddingFaninToSelf(absl::string_view node_name,
+                              const TensorId& fanin, ErrorHandler handler) {
+  if (node_name == fanin.node()) {
+    return handler(
+        absl::Substitute("can't add fanin '$0' to self", fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckRemovingFaninFromSelf(absl::string_view node_name,
+                                  const TensorId& fanin, ErrorHandler handler) {
+  if (node_name == fanin.node()) {
+    return handler(absl::Substitute("can't remove fanin '$0' from self",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+string NodeMissingErrorMsg(absl::string_view node_name) {
+  return absl::Substitute("node '$0' was not found", node_name);
+}
+
+Status CheckNodeExists(absl::string_view node_name, NodeDef* node,
+                       ErrorHandler handler) {
+  if (node == nullptr) {
+    return handler(NodeMissingErrorMsg(node_name));
+  }
+  return Status::OK();
+}
+
+Status CheckPortRange(int port, int min, int max, ErrorHandler handler) {
+  if (port < min || port > max) {
+    if (max < min) {
+      return handler("no available ports as node has no regular fanins");
+    }
+    return handler(
+        absl::Substitute("port must be in range [$0, $1]", min, max));
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
+void MutableGraphView::AddAndDedupFanouts(NodeDef* node) {
+  // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins
+  // exist, and all regular fanins come before controlling fanins.
+  absl::flat_hash_set<absl::string_view> fanins;
+  absl::flat_hash_set<absl::string_view> controlling_fanins;
+  int max_input_port = -1;
+  int pos = 0;
+  const int last_idx = node->input_size() - 1;
+  int last_pos = last_idx;
+  while (pos <= last_pos) {
+    TensorId tensor_id = ParseTensorName(node->input(pos));
+    absl::string_view input_node_name = tensor_id.node();
+    bool is_control_input = IsTensorIdControlling(tensor_id);
+    bool can_dedup_control_with_regular_input =
+        CanDedupControlWithRegularInput(*this, input_node_name);
+    bool can_dedup_control =
+        is_control_input && (can_dedup_control_with_regular_input ||
+                             (!can_dedup_control_with_regular_input &&
+                              controlling_fanins.contains(input_node_name)));
+    if (!gtl::InsertIfNotPresent(&fanins, input_node_name) &&
+        can_dedup_control) {
+      node->mutable_input()->SwapElements(pos, last_pos);
+      --last_pos;
+    } else {
+      OutputPort output(nodes()[input_node_name], tensor_id.index());
+
+      if (is_control_input) {
+        fanouts()[output].emplace(node, Graph::kControlSlot);
+      } else {
+        max_input_port = pos;
+        max_regular_output_port()[output.node] =
+            std::max(max_regular_output_port()[output.node], output.port_id);
+        fanouts()[output].emplace(node, pos);
+      }
+      ++pos;
+    }
+    if (is_control_input) {
+      controlling_fanins.insert(input_node_name);
+    }
+  }
+
+  if (last_pos < last_idx) {
+    node->mutable_input()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
+  }
+
+  if (max_input_port > -1) {
+    max_regular_input_port()[node] = max_input_port;
+  }
+}
+
+void MutableGraphView::UpdateMaxRegularOutputPortForRemovedFanin(
+    const OutputPort& fanin,
+    const absl::flat_hash_set<InputPort>& fanin_fanouts) {
+  int max_port = max_regular_output_port()[fanin.node];
+  if (!fanin_fanouts.empty() || max_port != fanin.port_id) {
+    return;
+  }
+  bool updated_max_port = false;
+  for (int i = fanin.port_id - 1; i >= 0; --i) {
+    OutputPort fanin_port(fanin.node, i);
+    if (!fanouts()[fanin_port].empty()) {
+      max_regular_output_port()[fanin.node] = i;
+      updated_max_port = true;
+      break;
+    }
+  }
+  if (!updated_max_port) {
+    max_regular_output_port().erase(fanin.node);
+  }
+}
+
+void MutableGraphView::UpdateMaxRegularOutputPortForAddedFanin(
+    const OutputPort& fanin) {
+  if (max_regular_output_port()[fanin.node] < fanin.port_id) {
+    max_regular_output_port()[fanin.node] = fanin.port_id;
+  }
+}
+
 const absl::flat_hash_set<MutableGraphView::InputPort>&
 MutableGraphView::GetFanout(const GraphView::OutputPort& port) const {
   return GetFanout(MutableGraphView::OutputPort(const_cast<NodeDef*>(port.node),
@@ -65,30 +274,79 @@ NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
 
   AddUniqueNodeOrDie(node_in_graph);
 
-  AddFanouts(node_in_graph);
+  AddAndDedupFanouts(node_in_graph);
   return node_in_graph;
 }
 
-void MutableGraphView::UpdateFanouts(absl::string_view from_node,
-                                     absl::string_view to_node) {
-  NodeDef* from_node_ptr = GetNode(from_node);
-  NodeDef* to_node_ptr = GetNode(to_node);
-  if (from_node_ptr && to_node_ptr) {
-    UpdateFanouts(from_node_ptr, to_node_ptr);
-  } else if (!from_node_ptr) {
-    LOG(WARNING) << absl::Substitute(
-        "Can't update fanouts from '$0' to '$1', from node was not found.",
-        from_node, to_node);
-  } else {
-    LOG(WARNING) << absl::Substitute(
-        "Can't update fanouts from '$0' to '$1', to node was not found.",
-        from_node, to_node);
+Status MutableGraphView::AddSubgraph(GraphDef&& subgraph) {
+  // 1. Add all new functions and check that functions with the same name
+  // have identical definition.
+  const int function_size = subgraph.library().function_size();
+  if (function_size > 0) {
+    absl::flat_hash_map<absl::string_view, const FunctionDef*> graph_fdefs;
+    for (const FunctionDef& fdef : graph()->library().function()) {
+      graph_fdefs.emplace(fdef.signature().name(), &fdef);
+    }
+
+    for (FunctionDef& fdef : *subgraph.mutable_library()->mutable_function()) {
+      const auto graph_fdef = graph_fdefs.find(fdef.signature().name());
+
+      if (graph_fdef == graph_fdefs.end()) {
+        VLOG(3) << "Add new function definition: " << fdef.signature().name();
+        graph()->mutable_library()->add_function()->Swap(&fdef);
+      } else {
+        if (!FunctionDefsEqual(fdef, *graph_fdef->second)) {
+          return MutationError(
+              "AddSubgraph",
+              absl::Substitute("function_size=$0", function_size),
+              absl::StrCat(
+                  "Found different function definition with the same name: ",
+                  fdef.signature().name()));
+        }
+      }
+    }
+  }
+
+  // 2. Add all nodes to the underlying graph.
+  int node_size_before = graph()->node_size();
+
+  for (NodeDef& node : *subgraph.mutable_node()) {
+    auto* node_in_graph = graph()->add_node();
+    node_in_graph->Swap(&node);
+    TF_RETURN_IF_ERROR(AddUniqueNode(node_in_graph));
+  }
+
+  // TODO(ezhulenev, lyandy): Right now AddAndDedupFanouts do not check that
+  // fanins actually exists in the graph, and there is already TODO for that.
+
+  for (int i = node_size_before; i < graph()->node_size(); ++i) {
+    NodeDef* node = graph()->mutable_node(i);
+    AddAndDedupFanouts(node);
   }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateFanouts(absl::string_view from_node_name,
+                                       absl::string_view to_node_name) {
+  NodeDef* from_node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(from_node_name, from_node,
+                      UpdateFanoutsError(from_node_name, to_node_name)));
+  NodeDef* to_node = GetNode(to_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(
+      to_node_name, to_node, UpdateFanoutsError(from_node_name, to_node_name)));
+
+  return UpdateFanoutsInternal(from_node, to_node);
 }
 
-void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
+Status MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
+                                               NodeDef* to_node) {
   VLOG(2) << absl::Substitute("Update fanouts from '$0' to '$1'.",
                               from_node->name(), to_node->name());
+  if (from_node == to_node) {
+    return Status::OK();
+  }
 
   // Update internal state with the new output_port->input_port edge.
   const auto add_edge = [this](const OutputPort& output_port,
@@ -102,6 +360,32 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
     fanouts()[output_port].erase(input_port);
   };
 
+  // For the control fanouts we do not know the input index in a NodeDef,
+  // so we have to traverse all control inputs.
+
+  auto control_fanouts =
+      GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
+
+  bool to_node_is_switch = IsSwitch(*to_node);
+  for (const InputPort& control_port : control_fanouts) {
+    // Node can't be control dependency of itself.
+    if (control_port.node == to_node) continue;
+
+    // Can't add Switch node as a control dependency.
+    if (to_node_is_switch) {
+      // Trying to add a Switch as a control dependency, which if allowed will
+      // make the graph invalid.
+      return UpdateFanoutsError(from_node->name(), to_node->name())(
+          absl::Substitute("can't update fanouts to node '$0' as it will "
+                           "become a Switch control dependency",
+                           to_node->name()));
+    }
+
+    NodeDef* node = control_port.node;
+    RemoveControllingFaninInternal(node, from_node);
+    AddFaninInternal(node, {to_node, Graph::kControlSlot});
+  }
+
   // First we update regular fanouts. For the regular fanouts
   // `input_port:port_id` is the input index in NodeDef.
 
@@ -120,51 +404,23 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
     // AddAndUpdateFanoutsWithoutSelfLoops test for an example).
     if (input_port.node == to_node) {
       keep_max_regular_output_port =
-          std::max(keep_max_regular_output_port, input_port.port_id);
+          std::max(keep_max_regular_output_port, output_port.port_id);
       continue;
     }
 
     // Update input at destination node.
     input_port.node->set_input(
         input_port.port_id,
-        output_port.port_id == 0
-            ? to_node->name()
-            : absl::StrCat(to_node->name(), ":", output_port.port_id));
+        TensorIdToString({to_node->name(), output_port.port_id}));
 
     // Remove old edge between the `from_node` and the fanout node.
     remove_edge(output_port, input_port);
     // Add an edge between the `to_node` and new fanout node.
     add_edge(OutputPort(to_node, output_port.port_id), input_port);
-  }
-
-  // For the control fanouts we do not know the input index in a NodeDef,
-  // so we have to traverse all control inputs.
-
-  auto control_fanouts =
-      GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
-  if (control_fanouts.empty()) return;
-
-  const string from_control_input = absl::StrCat("^", from_node->name());
-  const string to_control_input = absl::StrCat("^", to_node->name());
-
-  for (const InputPort& control_port : control_fanouts) {
-    // Node can't be control dependency of itself.
-    if (control_port.node == to_node) continue;
-
-    // Find and update input corresponding to control dependency.
-    NodeDef* node = control_port.node;
-    for (int i = node->input_size() - 1; i >= 0; --i) {
-      const string& input = node->input(i);
-      if (!IsControlInput(input)) break;  // we reached regular inputs
-      if (input == from_control_input) {
-        node->set_input(i, to_control_input);
-      }
+    // Dedup control dependency.
+    if (CanDedupControlWithRegularInput(*this, *to_node)) {
+      RemoveControllingFaninInternal(input_port.node, to_node);
     }
-
-    // Remove old edge between the `from_node` and the fanout node.
-    remove_edge(OutputPort(from_node, Graph::kControlSlot), control_port);
-    // Add an edge between the `to_node` and new fanout node.
-    add_edge(OutputPort(to_node, Graph::kControlSlot), control_port);
   }
 
   // Because we update all regular fanouts of `from_node`, we can just copy
@@ -177,319 +433,697 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
   } else {
     max_regular_output_port().erase(from_node);
   }
+
+  return Status::OK();
 }
 
 bool MutableGraphView::AddFaninInternal(NodeDef* node,
                                         const OutputPort& fanin) {
-  int num_non_controlling_fanins =
+  int num_regular_fanins =
       NumFanins(*node, /*include_controlling_nodes=*/false);
+  bool input_is_control = IsOutputPortControlling(fanin);
+  bool can_dedup_control_with_regular_input =
+      CanDedupControlWithRegularInput(*this, *fanin.node);
+  // Don't add duplicate control dependencies.
+  if (input_is_control) {
+    const int start =
+        can_dedup_control_with_regular_input ? 0 : num_regular_fanins;
+    for (int i = start; i < node->input_size(); ++i) {
+      if (ParseTensorName(node->input(i)).node() == fanin.node->name()) {
+        return false;
+      }
+    }
+  }
+
   InputPort input;
   input.node = node;
-  input.port_id = fanin.port_id == Graph::kControlSlot
-                      ? Graph::kControlSlot
-                      : num_non_controlling_fanins;
+  input.port_id = input_is_control ? Graph::kControlSlot : num_regular_fanins;
 
-  if (!gtl::InsertIfNotPresent(&fanouts()[fanin], input)) {
-    return false;
-  }
   node->add_input(TensorIdToString({fanin.node->name(), fanin.port_id}));
-  if (fanin.port_id > Graph::kControlSlot) {
-    int node_input_size = node->input_size() - 1;
+  if (!input_is_control) {
+    const int last_node_input = node->input_size() - 1;
     // If there are control dependencies in node, move newly inserted fanin to
     // be before such control dependencies.
-    if (num_non_controlling_fanins < node_input_size) {
-      node->mutable_input()->SwapElements(node_input_size,
-                                          num_non_controlling_fanins);
+    if (num_regular_fanins < last_node_input) {
+      node->mutable_input()->SwapElements(last_node_input, num_regular_fanins);
+    }
+  }
+
+  fanouts()[fanin].insert(input);
+  if (max_regular_output_port()[fanin.node] < fanin.port_id) {
+    max_regular_output_port()[fanin.node] = fanin.port_id;
+  }
+
+  // Update max input port and dedup control dependencies.
+  if (!input_is_control) {
+    max_regular_input_port()[node] = num_regular_fanins;
+    if (can_dedup_control_with_regular_input) {
+      RemoveControllingFaninInternal(node, fanin.node);
     }
   }
+
   return true;
 }
 
-bool MutableGraphView::AddFaninInternal(NodeDef* node, const TensorId& fanin) {
+Status MutableGraphView::AddRegularFanin(absl::string_view node_name,
+                                         const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("AddRegularFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
   NodeDef* fanin_node = GetNode(fanin.node());
-  if (fanin_node == nullptr) {
-    return false;
-  }
-  return AddFaninInternal(node, {fanin_node, fanin.index()});
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  AddFaninInternal(node, {fanin_node, fanin.index()});
+  return Status::OK();
 }
 
-bool MutableGraphView::AddFanin(absl::string_view node_name,
-                                const TensorId& fanin) {
-  if (!IsTensorIdPortValid(fanin)) {
-    return false;
+Status MutableGraphView::AddRegularFaninByPort(absl::string_view node_name,
+                                               int port,
+                                               const TensorId& fanin) {
+  auto error_status = [node_name, port, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', port=$1, fanin='$2'",
+                                     node_name, port, fanin.ToString());
+    return MutationError("AddRegularFaninByPort", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, num_regular_fanins, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  const int last_node_input = node->input_size();
+  node->add_input(TensorIdToString(fanin));
+  node->mutable_input()->SwapElements(num_regular_fanins, last_node_input);
+  for (int i = num_regular_fanins - 1; i >= port; --i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase({node, i});
+    fanouts_set->insert({node, i + 1});
+    node->mutable_input()->SwapElements(i, i + 1);
   }
+
+  OutputPort fanin_port(fanin_node, fanin.index());
+  fanouts()[fanin_port].insert({node, port});
+  UpdateMaxRegularOutputPortForAddedFanin(fanin_port);
+
+  max_regular_input_port()[node] = num_regular_fanins;
+  if (CanDedupControlWithRegularInput(*this, *fanin_node)) {
+    RemoveControllingFaninInternal(node, fanin_node);
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::AddControllingFanin(absl::string_view node_name,
+                                             const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("AddControllingFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
   NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  if (!IsSwitch(*fanin_node)) {
+    AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
+  } else {
+    if (IsTensorIdControlling(fanin)) {
+      // Can't add a Switch node control dependency.
+      return error_status(absl::Substitute(
+          "can't add fanin '$0' as it will become a Switch control dependency",
+          fanin.ToString()));
+    }
+    // We can't anchor control dependencies directly on the switch node: unlike
+    // other nodes only one of the outputs of the switch node will be generated
+    // when the switch node is executed, and we need to make sure the control
+    // dependency is only triggered when the corresponding output is triggered.
+    // We start by looking for an identity node connected to the output of the
+    // switch node, and use it to anchor the control dependency.
+    auto fanouts = GetFanouts(*fanin_node, /*include_controlled_nodes=*/false);
+    for (auto fanout : fanouts) {
+      if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
+        if (ParseTensorName(fanout.node->input(0)) == fanin) {
+          if (fanout.node->name() == node_name) {
+            return error_status(
+                absl::Substitute("can't add found fanin '$0' to self",
+                                 AsControlDependency(fanout.node->name())));
+          }
+          AddFaninInternal(node, {fanout.node, Graph::kControlSlot});
+          return Status::OK();
+        }
+      }
+    }
+    // We haven't found an existing node where we can anchor the control
+    // dependency: add a new identity node.
+    string ctrl_dep_name = AddPrefixToNodeName(
+        absl::StrCat(fanin.node(), "_", fanin.index()), kMutableGraphViewCtrl);
+    if (node_name == ctrl_dep_name) {
+      return error_status(
+          absl::Substitute("can't add generated fanin '$0' to self",
+                           AsControlDependency(ctrl_dep_name)));
+    }
+
+    // Reuse a previously created node, if possible.
+    NodeDef* ctrl_dep_node = GetNode(ctrl_dep_name);
+    if (ctrl_dep_node == nullptr) {
+      NodeDef new_node;
+      new_node.set_name(ctrl_dep_name);
+      new_node.set_op("Identity");
+      new_node.set_device(fanin_node->device());
+      (*new_node.mutable_attr())["T"].set_type(
+          fanin_node->attr().at("T").type());
+      new_node.add_input(TensorIdToString(fanin));
+      ctrl_dep_node = AddNode(std::move(new_node));
+    }
+    AddFaninInternal(node, {ctrl_dep_node, Graph::kControlSlot});
   }
-  return AddFaninInternal(node, fanin);
+  return Status::OK();
 }
 
-bool MutableGraphView::RemoveFanins(NodeDef* node,
-                                    absl::Span<const TensorId> fanins) {
-  bool modified = false;
+bool MutableGraphView::RemoveRegularFaninInternal(NodeDef* node,
+                                                  const OutputPort& fanin) {
+  auto remove_input = [this, node](const OutputPort& fanin_port,
+                                   int node_input_port, bool update_max_port) {
+    InputPort input(node, node_input_port);
+
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase(input);
+    if (update_max_port) {
+      UpdateMaxRegularOutputPortForRemovedFanin(fanin_port, *fanouts_set);
+    }
+    return fanouts_set;
+  };
+
   auto mutable_inputs = node->mutable_input();
+  bool modified = false;
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  int i;
   int curr_pos = 0;
-  int num_inputs = node->input_size();
-  for (int i = 0; i < num_inputs; ++i) {
+  for (i = 0; i < num_regular_fanins; ++i) {
     TensorId tensor_id = ParseTensorName(node->input(i));
-    bool remove_fanin =
-        std::find(fanins.begin(), fanins.end(), tensor_id) != fanins.end();
-    bool update_fanin = !remove_fanin && modified;
-    if (remove_fanin || update_fanin) {
-      OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
-
-      InputPort input;
-      input.node = node;
-      input.port_id =
-          tensor_id.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
-
-      if (remove_fanin) {
-        fanouts()[fanin].erase(input);
-      } else {
-        // Shift inputs to be retained.
-        if (tensor_id.index() > Graph::kControlSlot) {
-          fanouts()[fanin].erase(input);
-          fanouts()[fanin].insert(InputPort(node, i));
-        }
-        mutable_inputs->SwapElements(i, curr_pos++);
-      }
-
+    if (tensor_id.node() == fanin.node->name() &&
+        tensor_id.index() == fanin.port_id) {
+      remove_input(fanin, i, /*update_max_port=*/true);
       modified = true;
+    } else if (modified) {
+      // Regular inputs will need to have their ports updated.
+      OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+      auto fanouts_set = remove_input(fanin_port, i, /*update_max_port=*/false);
+      fanouts_set->insert({node, curr_pos});
+      // Shift inputs to be retained.
+      mutable_inputs->SwapElements(i, curr_pos);
+      ++curr_pos;
     } else {
       // Skip inputs to be retained until first modification.
-      curr_pos++;
+      ++curr_pos;
     }
   }
+
   if (modified) {
-    mutable_inputs->DeleteSubrange(curr_pos, num_inputs - curr_pos);
+    const int last_regular_input_port = curr_pos - 1;
+    if (last_regular_input_port < 0) {
+      max_regular_input_port().erase(node);
+    } else {
+      max_regular_input_port()[node] = last_regular_input_port;
+    }
+    if (curr_pos < i) {
+      // Remove fanins from node inputs.
+      mutable_inputs->DeleteSubrange(curr_pos, i - curr_pos);
+    }
   }
+
   return modified;
 }
 
-bool MutableGraphView::RemoveFanin(absl::string_view node_name,
-                                   const TensorId& fanin) {
-  if (!IsTensorIdPortValid(fanin)) {
-    return false;
-  }
+Status MutableGraphView::RemoveRegularFanin(absl::string_view node_name,
+                                            const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("RemoveRegularFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(
+      CheckRemovingFaninFromSelf(node_name, fanin, error_status));
   NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  RemoveRegularFaninInternal(node, {fanin_node, fanin.index()});
+  return Status::OK();
+}
+
+Status MutableGraphView::RemoveRegularFaninByPort(absl::string_view node_name,
+                                                  int port) {
+  auto error_status = [node_name, port](absl::string_view msg) {
+    string params =
+        absl::Substitute("node_name='$0', port=$1", node_name, port);
+    return MutationError("RemoveRegularFaninByPort", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, last_regular_fanin_port, error_status));
+
+  TensorId tensor_id = ParseTensorName(node->input(port));
+  OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+  fanouts()[fanin_port].erase({node, port});
+  auto mutable_inputs = node->mutable_input();
+  for (int i = port + 1; i <= last_regular_fanin_port; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase({node, i});
+    fanouts_set->insert({node, i - 1});
+    mutable_inputs->SwapElements(i - 1, i);
+  }
+  const int last_node_input = node->input_size() - 1;
+  if (last_regular_fanin_port < last_node_input) {
+    mutable_inputs->SwapElements(last_regular_fanin_port, last_node_input);
   }
-  return RemoveFanins(node, {fanin});
+  mutable_inputs->RemoveLast();
+
+  const int updated_last_regular_input_port = last_regular_fanin_port - 1;
+  if (updated_last_regular_input_port < 0) {
+    max_regular_input_port().erase(node);
+  } else {
+    max_regular_input_port()[node] = updated_last_regular_input_port;
+  }
+
+  return Status::OK();
+}
+
+bool MutableGraphView::RemoveControllingFaninInternal(NodeDef* node,
+                                                      NodeDef* fanin_node) {
+  for (int i = node->input_size() - 1; i >= 0; --i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (tensor_id.index() > Graph::kControlSlot) {
+      break;
+    }
+    if (tensor_id.node() == fanin_node->name()) {
+      fanouts()[{fanin_node, Graph::kControlSlot}].erase(
+          {node, Graph::kControlSlot});
+      node->mutable_input()->SwapElements(i, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+      return true;
+    }
+  }
+  return false;
 }
 
-bool MutableGraphView::RemoveAllFanins(absl::string_view node_name,
-                                       bool keep_controlling_fanins) {
+Status MutableGraphView::RemoveControllingFanin(
+    absl::string_view node_name, absl::string_view fanin_node_name) {
+  auto error_status = [node_name, fanin_node_name](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin_node_name='$1'",
+                                     node_name, fanin_node_name);
+    return MutationError("RemoveControllingFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckRemovingFaninFromSelf(
+      node_name, {fanin_node_name, Graph::kControlSlot}, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin_node_name);
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(fanin_node_name, fanin_node, error_status));
+
+  RemoveControllingFaninInternal(node, fanin_node);
+  return Status::OK();
+}
+
+Status MutableGraphView::RemoveAllFanins(absl::string_view node_name,
+                                         bool keep_controlling_fanins) {
   NodeDef* node = GetNode(node_name);
-  if (node == nullptr || node->input().empty()) {
-    return false;
+  if (node == nullptr) {
+    string params =
+        absl::Substitute("node_name='$0', keep_controlling_fanins=$1",
+                         node_name, keep_controlling_fanins);
+    return MutationError("RemoveAllFanins", params,
+                         NodeMissingErrorMsg(node_name));
+  }
+
+  if (node->input().empty()) {
+    return Status::OK();
   }
+
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
   RemoveFaninsInternal(node, keep_controlling_fanins);
   if (keep_controlling_fanins) {
-    int num_non_controlling_fanins =
-        NumFanins(*node, /*include_controlling_nodes=*/false);
-    if (num_non_controlling_fanins == 0) {
-      return false;
-    } else if (num_non_controlling_fanins < node->input_size()) {
-      node->mutable_input()->DeleteSubrange(0, num_non_controlling_fanins);
+    if (num_regular_fanins == 0) {
+      return Status::OK();
+    } else if (num_regular_fanins < node->input_size()) {
+      node->mutable_input()->DeleteSubrange(0, num_regular_fanins);
     } else {
       node->clear_input();
     }
   } else {
     node->clear_input();
   }
-  return true;
+  return Status::OK();
 }
 
-bool MutableGraphView::UpdateFanin(absl::string_view node_name,
-                                   const TensorId& from_fanin,
-                                   const TensorId& to_fanin) {
-  if (from_fanin == to_fanin || !IsTensorIdPortValid(from_fanin) ||
-      !IsTensorIdPortValid(to_fanin)) {
-    return false;
-  }
+Status MutableGraphView::UpdateFanin(absl::string_view node_name,
+                                     const TensorId& from_fanin,
+                                     const TensorId& to_fanin) {
+  auto error_status = [node_name, from_fanin, to_fanin](absl::string_view msg) {
+    string params =
+        absl::Substitute("node_name='$0', from_fanin='$1', to_fanin='$2'",
+                         node_name, from_fanin.ToString(), to_fanin.ToString());
+    return MutationError("UpdateFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(from_fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(to_fanin, error_status));
   NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
-  }
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(from_fanin.node(), from_fanin_node, error_status));
+  NodeDef* to_fanin_node = GetNode(to_fanin.node());
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(to_fanin.node(), to_fanin_node, error_status));
 
-  bool is_from_fanin_control = from_fanin.index() == Graph::kControlSlot;
-  bool is_to_fanin_control = to_fanin.index() == Graph::kControlSlot;
   // When replacing a non control dependency fanin with a control dependency, or
   // vice versa, remove and add, so ports can be updated properly in fanout(s).
-  if (is_from_fanin_control || is_to_fanin_control) {
-    bool modified = RemoveFanins(node, {from_fanin});
-    if (!HasFanin(*node, to_fanin)) {
-      modified |= AddFaninInternal(node, to_fanin);
-    }
-    return modified;
+  bool to_fanin_is_control = IsTensorIdControlling(to_fanin);
+  if (to_fanin_is_control && IsSwitch(*to_fanin_node)) {
+    // Can't add Switch node as a control dependency.
+    return error_status(
+        absl::Substitute("can't update to fanin '$0' as it will become a "
+                         "Switch control dependency",
+                         to_fanin.ToString()));
+  }
+  if (node_name == from_fanin.node() || node_name == to_fanin.node()) {
+    return error_status("can't update fanin to or from self");
   }
 
-  // In place mutation, requires no shifting of ports.
-  NodeDef* from_fanin_node = GetNode(from_fanin.node());
-  NodeDef* to_fanin_node = GetNode(to_fanin.node());
-  if (from_fanin_node == nullptr || to_fanin_node == nullptr) {
-    return false;
+  if (from_fanin == to_fanin) {
+    return Status::OK();
+  }
+
+  bool from_fanin_is_control = IsTensorIdControlling(from_fanin);
+  if (from_fanin_is_control || to_fanin_is_control) {
+    bool modified = false;
+    if (from_fanin_is_control) {
+      modified |= RemoveControllingFaninInternal(node, from_fanin_node);
+    } else {
+      modified |= RemoveRegularFaninInternal(
+          node, {from_fanin_node, from_fanin.index()});
+    }
+    if (modified) {
+      AddFaninInternal(node, {to_fanin_node, to_fanin.index()});
+    }
+    return Status::OK();
   }
 
+  // In place mutation of regular fanins, requires no shifting of ports.
   string to_fanin_string = TensorIdToString(to_fanin);
-  int num_inputs = node->input_size();
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
   bool modified = false;
-  for (int i = 0; i < num_inputs; ++i) {
+  absl::flat_hash_set<InputPort>* from_fanin_port_fanouts = nullptr;
+  absl::flat_hash_set<InputPort>* to_fanin_port_fanouts = nullptr;
+  for (int i = 0; i < num_regular_fanins; ++i) {
     if (ParseTensorName(node->input(i)) == from_fanin) {
-      OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
-      InputPort old_input;
-      old_input.node = node;
-      old_input.port_id =
-          from_fanin.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
-      fanouts()[from_fanin_port].erase(old_input);
-
-      OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
-      InputPort new_input;
-      new_input.node = node;
-      new_input.port_id =
-          to_fanin.index() == Graph::kControlSlot ? Graph::kControlSlot : i;
-      fanouts()[to_fanin_port].insert(new_input);
+      InputPort input(node, i);
+      if (from_fanin_port_fanouts == nullptr) {
+        OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
+        from_fanin_port_fanouts = &fanouts()[from_fanin_port];
+      }
+      from_fanin_port_fanouts->erase(input);
+
+      if (to_fanin_port_fanouts == nullptr) {
+        OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
+        to_fanin_port_fanouts = &fanouts()[to_fanin_port];
+      }
+      to_fanin_port_fanouts->insert(input);
 
       node->set_input(i, to_fanin_string);
       modified = true;
     }
   }
 
-  return modified;
-}
-
-bool MutableGraphView::DedupControllingFanins(NodeDef* node) {
-  absl::flat_hash_set<absl::string_view> fanins;
-  absl::flat_hash_set<string> removed_fanins;
-  int pos = 0;
-  const int last_idx = node->input_size() - 1;
-  int last_pos = last_idx;
-  while (pos <= last_pos) {
-    const string& input = node->input(pos);
-    TensorId tensor_id = ParseTensorName(input);
-    if (!gtl::InsertIfNotPresent(&fanins, tensor_id.node()) &&
-        IsControlInput(tensor_id)) {
-      node->mutable_input()->SwapElements(pos, last_pos--);
-      removed_fanins.insert(input);
-    } else {
-      ++pos;
+  // Dedup control dependencies and update max regular output ports.
+  if (modified) {
+    UpdateMaxRegularOutputPortForRemovedFanin(
+        {from_fanin_node, from_fanin.index()}, *from_fanin_port_fanouts);
+    if (max_regular_output_port()[to_fanin_node] < to_fanin.index()) {
+      max_regular_output_port()[to_fanin_node] = to_fanin.index();
     }
-  }
-
-  if (last_pos < last_idx) {
-    absl::flat_hash_set<string> retained_fanins(
-        node->input().begin(), node->input().begin() + last_pos + 1);
-    for (const auto& removed : removed_fanins) {
-      if (!retained_fanins.contains(removed)) {
-        OutputPort fanin(nodes()[ParseTensorName(removed).node()],
-                         Graph::kControlSlot);
-        fanouts()[fanin].erase({node, Graph::kControlSlot});
-      }
+    if (CanDedupControlWithRegularInput(*this, *to_fanin_node)) {
+      RemoveControllingFaninInternal(node, to_fanin_node);
     }
-    node->mutable_input()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
-    return true;
   }
 
-  return false;
+  return Status::OK();
 }
 
-bool MutableGraphView::DedupControllingFanins(absl::string_view node_name) {
+Status MutableGraphView::UpdateRegularFaninByPort(absl::string_view node_name,
+                                                  int port,
+                                                  const TensorId& fanin) {
+  auto error_status = [node_name, port, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', port=$1, fanin='$2'",
+                                     node_name, port, fanin.ToString());
+    return MutationError("UpdateRegularFaninByPort", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
   NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, last_regular_fanin_port, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  TensorId tensor_id = ParseTensorName(node->input(port));
+  if (tensor_id == fanin) {
+    return Status::OK();
   }
-  return DedupControllingFanins(node);
-}
 
-bool MutableGraphView::DedupControllingFanins() {
-  const int num_nodes = graph()->node_size();
-  bool modified = false;
-  for (int i = 0; i < num_nodes; ++i) {
-    modified |= DedupControllingFanins(graph()->mutable_node(i));
+  InputPort input(node, port);
+  OutputPort from_fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+  absl::flat_hash_set<InputPort>* from_fanouts = &fanouts()[from_fanin_port];
+  from_fanouts->erase(input);
+  UpdateMaxRegularOutputPortForRemovedFanin(from_fanin_port, *from_fanouts);
+
+  OutputPort to_fanin_port(fanin_node, fanin.index());
+  fanouts()[to_fanin_port].insert(input);
+  UpdateMaxRegularOutputPortForAddedFanin(to_fanin_port);
+
+  node->set_input(port, TensorIdToString(fanin));
+
+  if (CanDedupControlWithRegularInput(*this, *fanin_node)) {
+    RemoveControllingFaninInternal(node, fanin_node);
   }
-  return modified;
+
+  return Status::OK();
 }
 
-bool MutableGraphView::AddControllingFanin(absl::string_view node_name,
-                                           const TensorId& fanin) {
+Status MutableGraphView::SwapRegularFaninsByPorts(absl::string_view node_name,
+                                                  int from_port, int to_port) {
+  auto error_status = [node_name, from_port, to_port](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', from_port=$1, to_port=$2",
+                                     node_name, from_port, to_port);
+    return MutationError("SwapRegularFaninsByPorts", params, msg);
+  };
+
   NodeDef* node = GetNode(node_name);
-  if (node == nullptr) {
-    return false;
-  }
-  NodeDef* fanin_node = GetNode(fanin.node());
-  if (fanin_node == nullptr) {
-    return false;
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(CheckPortRange(from_port, /*min=*/0,
+                                    last_regular_fanin_port, error_status));
+  TF_RETURN_IF_ERROR(CheckPortRange(to_port, /*min=*/0, last_regular_fanin_port,
+                                    error_status));
+
+  if (from_port == to_port) {
+    return Status::OK();
   }
-  if (fanin.index() == Graph::kControlSlot) {
-    return AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
+  TensorId from_fanin = ParseTensorName(node->input(from_port));
+  TensorId to_fanin = ParseTensorName(node->input(to_port));
+  if (from_fanin == to_fanin) {
+    return Status::OK();
   }
 
-  if (!IsSwitch(*fanin_node)) {
-    return AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
-  } else {
-    // We can't anchor control dependencies directly on the switch node: unlike
-    // other nodes only one of the outputs of the switch node will be generated
-    // when the switch node is executed, and we need to make sure the control
-    // dependency is only triggered when the corresponding output is triggered.
-    // We start by looking for an identity node connected to the output of the
-    // switch node, and use it to anchor the control dependency.
-    auto fanouts = GetFanouts(*fanin_node, /*include_controlled_nodes=*/false);
-    for (auto fanout : fanouts) {
-      if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
-        if (ParseTensorName(fanout.node->input(0)) == fanin) {
-          return AddFaninInternal(node, {fanout.node, Graph::kControlSlot});
+  InputPort from_input(node, from_port);
+  InputPort to_input(node, to_port);
+  NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  absl::flat_hash_set<InputPort>* from_fanouts =
+      &fanouts()[{from_fanin_node, from_fanin.index()}];
+  from_fanouts->erase(from_input);
+  from_fanouts->insert(to_input);
+  NodeDef* to_fanin_node = GetNode(to_fanin.node());
+  absl::flat_hash_set<InputPort>* to_fanouts =
+      &fanouts()[{to_fanin_node, to_fanin.index()}];
+  to_fanouts->erase(to_input);
+  to_fanouts->insert(from_input);
+
+  node->mutable_input()->SwapElements(from_port, to_port);
+
+  return Status::OK();
+}
+
+Status MutableGraphView::CheckNodesCanBeDeleted(
+    const absl::flat_hash_set<string>& nodes_to_delete) {
+  std::vector<string> missing_nodes;
+  std::vector<string> nodes_with_fanouts;
+  for (const string& node_name_to_delete : nodes_to_delete) {
+    NodeDef* node = GetNode(node_name_to_delete);
+    if (node == nullptr) {
+      // Can't delete missing node.
+      missing_nodes.push_back(node_name_to_delete);
+      continue;
+    }
+    const int max_port = gtl::FindWithDefault(max_regular_output_port(), node,
+                                              Graph::kControlSlot);
+    for (int i = Graph::kControlSlot; i <= max_port; ++i) {
+      auto it = fanouts().find({node, i});
+      bool has_retained_fanout = false;
+      if (it != fanouts().end()) {
+        for (const auto& fanout : it->second) {
+          // Check if fanouts are of nodes to be deleted, and if so, they can be
+          // ignored, as they will be removed also.
+          if (!nodes_to_delete.contains(fanout.node->name())) {
+            // Removing node will leave graph in an invalid state.
+            has_retained_fanout = true;
+            break;
+          }
         }
       }
+      if (has_retained_fanout) {
+        nodes_with_fanouts.push_back(node_name_to_delete);
+        break;
+      }
     }
-    // We haven't found an existing node where we can anchor the control
-    // dependency: add a new identity node.
-    string ctrl_dep_name = AddPrefixToNodeName(
-        absl::StrCat(fanin.node(), "_", fanin.index()), kMutableGraphViewCtrl);
+  }
 
-    NodeDef* ctrl_dep_node = GetNode(ctrl_dep_name);
-    if (ctrl_dep_node == nullptr) {
-      NodeDef new_node;
-      new_node.set_name(ctrl_dep_name);
-      new_node.set_op("Identity");
-      new_node.set_device(fanin_node->device());
-      (*new_node.mutable_attr())["T"].set_type(
-          fanin_node->attr().at("T").type());
-      new_node.add_input(TensorIdToString(fanin));
-      ctrl_dep_node = AddNode(std::move(new_node));
+  // Error message can get quite long, so we only show the first 5 node names.
+  auto sort_and_sample = [](std::vector<string>* s) {
+    constexpr int kMaxNodeNames = 5;
+    std::sort(s->begin(), s->end());
+    if (s->size() > kMaxNodeNames) {
+      return absl::StrCat(
+          absl::StrJoin(s->begin(), s->begin() + kMaxNodeNames, ", "), ", ...");
     }
-    return AddFaninInternal(node, {ctrl_dep_node, Graph::kControlSlot});
+    return absl::StrJoin(*s, ", ");
+  };
+
+  if (!missing_nodes.empty()) {
+    VLOG(2) << absl::Substitute("Attempting to delete missing node(s) [$0].",
+                                sort_and_sample(&missing_nodes));
+  }
+  if (!nodes_with_fanouts.empty()) {
+    std::vector<string> input_node_names(nodes_to_delete.begin(),
+                                         nodes_to_delete.end());
+    string params = absl::Substitute("nodes_to_delete={$0}",
+                                     sort_and_sample(&input_node_names));
+    string error_msg =
+        absl::Substitute("can't delete node(s) with retained fanouts(s) [$0]",
+                         sort_and_sample(&nodes_with_fanouts));
+    return MutationError("DeleteNodes", params, error_msg);
   }
+
+  return Status::OK();
 }
 
-void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
-  for (const string& node_name_to_delete : nodes_to_delete)
-    RemoveFaninsInternal(nodes().at(node_name_to_delete),
-                         /*keep_controlling_fanins=*/false);
-  for (const string& node_name_to_delete : nodes_to_delete)
+Status MutableGraphView::DeleteNodes(
+    const absl::flat_hash_set<string>& nodes_to_delete) {
+  TF_RETURN_IF_ERROR(CheckNodesCanBeDeleted(nodes_to_delete));
+
+  // Find nodes in internal state and delete.
+  for (const string& node_name_to_delete : nodes_to_delete) {
+    NodeDef* node = GetNode(node_name_to_delete);
+    if (node != nullptr) {
+      RemoveFaninsInternal(node, /*keep_controlling_fanins=*/false);
+      RemoveFanoutsInternal(node);
+    }
+  }
+  for (const string& node_name_to_delete : nodes_to_delete) {
     nodes().erase(node_name_to_delete);
-  EraseNodesFromGraph(nodes_to_delete, graph());
+  }
+
+  // Find nodes in graph and delete by partitioning into nodes to retain and
+  // nodes to delete based on input set of nodes to delete by name.
+  // TODO(lyandy): Use a node name->idx hashmap if this is a performance
+  // bottleneck.
+  int pos = 0;
+  const int last_idx = graph()->node_size() - 1;
+  int last_pos = last_idx;
+  while (pos <= last_pos) {
+    if (nodes_to_delete.contains(graph()->node(pos).name())) {
+      graph()->mutable_node()->SwapElements(pos, last_pos);
+      --last_pos;
+    } else {
+      ++pos;
+    }
+  }
+  if (last_pos < last_idx) {
+    graph()->mutable_node()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
+  }
+
+  return Status::OK();
 }
 
 void MutableGraphView::RemoveFaninsInternal(NodeDef* deleted_node,
                                             bool keep_controlling_fanins) {
   for (int i = 0; i < deleted_node->input_size(); ++i) {
     TensorId tensor_id = ParseTensorName(deleted_node->input(i));
-    if (keep_controlling_fanins && tensor_id.index() < 0) {
+    bool is_control = IsTensorIdControlling(tensor_id);
+    if (keep_controlling_fanins && is_control) {
       break;
     }
     OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
     input.node = deleted_node;
-    if (tensor_id.index() < 0)
-      input.port_id = Graph::kControlSlot;
-    else
-      input.port_id = i;
+    input.port_id = is_control ? Graph::kControlSlot : i;
+
+    auto it = fanouts().find(fanin);
+    if (it != fanouts().end()) {
+      absl::flat_hash_set<InputPort>* fanouts_set = &it->second;
+      fanouts_set->erase(input);
+      UpdateMaxRegularOutputPortForRemovedFanin(fanin, *fanouts_set);
+    }
+  }
+  max_regular_input_port().erase(deleted_node);
+}
 
-    fanouts()[fanin].erase(input);
+void MutableGraphView::RemoveFanoutsInternal(NodeDef* deleted_node) {
+  const int max_port =
+      gtl::FindWithDefault(max_regular_output_port(), deleted_node, -1);
+  for (int i = Graph::kControlSlot; i <= max_port; ++i) {
+    fanouts().erase({deleted_node, i});
   }
+  max_regular_output_port().erase(deleted_node);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index f7c2a1118e5f879fecca2a1fc37d2e906df19ec4..16ef832a4eec9714f6bd9834fb0cae474dcbb92a 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -24,8 +24,11 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -41,7 +44,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
  public:
   explicit MutableGraphView(GraphDef* graph) : GraphViewInternal(graph) {
     for (NodeDef& node : *graph->mutable_node()) AddUniqueNodeOrDie(&node);
-    for (NodeDef& node : *graph->mutable_node()) AddFanouts(&node);
+    for (NodeDef& node : *graph->mutable_node()) AddAndDedupFanouts(&node);
   }
 
   // Lookup fanouts/fanins using immutable ports.
@@ -60,63 +63,54 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // node in graph.
   NodeDef* AddNode(NodeDef&& node);
 
-  // Updates all fanouts (input ports fetching output tensors) from `from_node`
-  // to the `to_node`, including control dependencies.
-  //
-  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
-  //   1. foo1(bar:0, bar:1, other:0, ^bar)
-  //   2. foo2(bar:1, other:1)
+  // Adds all nodes from the `subgraph` to the underlying graph and updates the
+  // view. `subgraph` doesn't have to be a valid graph definition on it's own,
+  // it can have edges to the nodes that are not in it, however after adding
+  // it to the underlying graph, final graph must be valid.
   //
-  // After calling ForwardOutputs(bar, new_bar):
-  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
-  //   2. foo2(new_bar:1, other:1)
-  void UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
-
-  // Add fanin to node `node_name`. If the node or fanin do not exist in the
-  // graph, nothing will be modified in the graph. If fanin is a control
-  // dependency, existing control dependencies will be checked first before
-  // adding. Otherwise fanin will be added after existing non control dependency
-  // inputs.
+  // If subgraph function library is not empty, all new functions will be added
+  // to the graph. Functions that appear with the same name in both subgraph and
+  // the graph represented by *this, must have identical function definitions.
   //
-  // This will return true iff the node is modified. If a control dependency
-  // already exists, the node will not be modified.
-  bool AddFanin(absl::string_view node_name, const TensorId& fanin);
+  // IMPORTANT: All nodes and functions of the given subgraph moved into the
+  // underlying graph, which leaves subgraph in valid but undefined state.
+  Status AddSubgraph(GraphDef&& subgraph);
 
-  // Remove fanin from node `node_name`. If the node or fanin do not exist in
-  // the graph, nothing will be modified in the graph. If there are multiple
-  // inputs that match the fanin, all of them will be removed.
+  // Updates all fanouts (input ports fetching output tensors) from
+  // `from_node_name` to the `to_node_name`, including control dependencies.
   //
-  // This will return true iff the node is modified. If no inputs match the
-  // fanin, the node will not be modified.
-  bool RemoveFanin(absl::string_view node_name, const TensorId& fanin);
-
-  // Remove all fanins from node `node_name`. Control dependencies will be
-  // retained if keep_controlling_fanins is true.
-  //
-  // This will return true iff the node is modified.
-  bool RemoveAllFanins(absl::string_view node_name,
-                       bool keep_controlling_fanins);
-
-  // Replace all fanins `from_fanin` with `to_fanin` in node `node_name`. If
-  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Example: We have 3 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0)
+  //   2. foo2(bar:1, other:1)
+  //   3. foo3(other:2, ^bar)
   //
-  // This will return true iff the node is modified.
-  bool UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
-                   const TensorId& to_fanin);
+  // After calling ForwardOutputs(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0)
+  //   2. foo2(new_bar:1, other:1)
+  //   3. foo3(other:2, ^new_bar)
+  Status UpdateFanouts(absl::string_view from_node_name,
+                       absl::string_view to_node_name);
 
-  // Removes redundant control fanins from node `node_name`.
-  //
-  // This will return true iff the node is modified.
-  // TODO(lyandy): Measure performance of deduping on every AddFanin compared to
-  // deduping once at the end.
-  bool DedupControllingFanins(absl::string_view node_name);
+  // Adds regular fanin `fanin` to node `node_name`. If the node or fanin do not
+  // exist in the graph, nothing will be modified in the graph. Otherwise fanin
+  // will be added after existing non control dependency fanins. Control
+  // dependencies will be deduped. To add control dependencies, use
+  // AddControllingFanin.
+  Status AddRegularFanin(absl::string_view node_name, const TensorId& fanin);
 
-  // Removes redundant control fanins from all nodes in the graph.
+  // Adds regular fanin `fanin` to node `node_name` at port `port`. If the node
+  // or fanin do not exist in the graph, nothing will be modified in the graph.
+  // Otherwise fanin will be inserted at port `port`. Control dependencies will
+  // be deduped. To add control dependencies, use AddControllingFanin.
   //
-  // This will return true iff the node is modified.
-  bool DedupControllingFanins();
+  // If the port is not a valid port (less than 0 or greater than the number of
+  // regular fanins), this will result in an error and the node will not be
+  // modified.
+  Status AddRegularFaninByPort(absl::string_view node_name, int port,
+                               const TensorId& fanin);
 
-  // Adds a control dependency to the target node named `node_name`.
+  // Adds control dependency `fanin` to the target node named `node_name`. To
+  // add regular fanins, use AddRegularFanin.
   //
   // Case 1: If the fanin is not a Switch node, the control dependency is simply
   // added to the target node:
@@ -133,55 +127,140 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   //
   //   fanin -> Identity{N} -^> target node.
   //
-  // This will return true iff the node is modified.
-  bool AddControllingFanin(absl::string_view node_name, const TensorId& fanin);
+  // If the control dependency being added is redundant (control dependency
+  // already exists or control dependency can be deduped from regular fanins),
+  // this will not result in an error and the node will not be modified.
+  Status AddControllingFanin(absl::string_view node_name,
+                             const TensorId& fanin);
+
+  // Removes regular fanin `fanin` from node `node_name`. If the node or fanin
+  // do not exist in the graph, nothing will be modified in the graph. If there
+  // are multiple inputs that match the fanin, all of them will be removed. To
+  // remove controlling fanins, use RemoveControllingFanin.
+  //
+  // If the fanin being removed doesn't exist in the node's inputs, this will
+  // not result in an error and the node will not be modified.
+  Status RemoveRegularFanin(absl::string_view node_name, const TensorId& fanin);
+
+  // Removes regular fanin at port `port` from node `node_name`. If the node
+  // does not exist in the graph, nothing will be modified in the graph.
+  // To remove controlling fanins, use RemoveControllingFanin.
+  //
+  // If the port is not a valid port (less than 0 or greater than the last index
+  // of the regular fanins), this will result in an error and the node will not
+  // be modified.
+  Status RemoveRegularFaninByPort(absl::string_view node_name, int port);
+
+  // Removes control dependency `fanin_node_name` from the target node named
+  // `node_name`. If the node or fanin do not exist in the graph, nothing will
+  // be modified in the graph. To remove regular fanins, use RemoveRegualrFanin.
+  //
+  // If the fanin being removed doesn't exist in the node's inputs, this will
+  // not result in an error and the node will not be modified.
+  Status RemoveControllingFanin(absl::string_view node_name,
+                                absl::string_view fanin_node_name);
 
-  // Deletes nodes from the graph.
-  void DeleteNodes(const std::set<string>& nodes_to_delete);
+  // Removes all fanins from node `node_name`. Control dependencies will be
+  // retained if keep_controlling_fanins is true.
+  //
+  // If no fanins are removed, this will not result in an error and the node
+  // will not be modified.
+  Status RemoveAllFanins(absl::string_view node_name,
+                         bool keep_controlling_fanins);
+
+  // Replaces all fanins `from_fanin` with `to_fanin` in node `node_name`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Control dependencies will be deduped.
+  //
+  // If the fanin being updated doesn't exist in the node's inputs, this will
+  // not result in an error and the node will not be modified.
+  Status UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
+                     const TensorId& to_fanin);
+
+  // Replaces fanin at port `port` in node `node_name` with fanin `fanin`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Control dependencies will be deduped.
+  //
+  // If the port is not a valid port (less than 0 or greater than the last index
+  // of the regular fanins), this will result in an error and the node will not
+  // be modified.
+  Status UpdateRegularFaninByPort(absl::string_view node_name, int port,
+                                  const TensorId& fanin);
+
+  // Swaps fanins at ports `from_port` and `to_port` in node `node_name`. If the
+  // node does not exist, nothing will be modified in the graph.
+  //
+  // If the ports are not a valid port (less than 0 or greater than the last
+  // index of the regular fanins), this will result in an error and the node
+  // will not be modified.
+  Status SwapRegularFaninsByPorts(absl::string_view node_name, int from_port,
+                                  int to_port);
+
+  // Deletes nodes from the graph. If a node can't be safely removed,
+  // specifically if a node still has fanouts, an error will be returned. Nodes
+  // that can't be found are ignored.
+  Status DeleteNodes(const absl::flat_hash_set<string>& nodes_to_delete);
 
  private:
+  // Adds fanouts for fanins of node to graph, while deduping control
+  // dependencies from existing control dependencies and regular fanins. Note,
+  // node inputs will be mutated if control dependencies can be deduped.
+  void AddAndDedupFanouts(NodeDef* node);
+
+  // Finds next output port smaller than fanin.port_id and update. The
+  // max_regular_output_port is only updated if fanin.port_id is the same as the
+  // current max_regular_output_port and if the fanouts set is empty. If there
+  // are no regular outputs, max_regular_output_port will be erased.
+  void UpdateMaxRegularOutputPortForRemovedFanin(
+      const OutputPort& fanin,
+      const absl::flat_hash_set<InputPort>& fanin_fanouts);
+
+  // Updates max regular output port for newly added fanin by checking the
+  // current max and updating if the newly added fanin is of a larger port.
+  void UpdateMaxRegularOutputPortForAddedFanin(const OutputPort& fanin);
+
   // Updates all fanouts (input ports fetching output tensors) from `from_node`
   // to the `to_node`, including control dependencies.
   //
-  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
-  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  // Example: We have 3 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0)
   //   2. foo2(bar:1, other:1)
+  //   3. foo3(other:2, ^bar)
   //
   // After calling ForwardOutputs(bar, new_bar):
-  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
+  //   3. foo3(other:2, ^new_bar)
   //
   // IMPORTANT: If `from_node` or `to_node` is not in the underlying graph, the
   // behavior is undefined.
-  void UpdateFanouts(NodeDef* from_node, NodeDef* to_node);
-
-  // Removes fanins of the deleted node from internal state. Control
-  // dependencies are retained iff keep_controlling_fanins is true.
-  void RemoveFaninsInternal(NodeDef* deleted_node,
-                            bool keep_controlling_fanins);
+  Status UpdateFanoutsInternal(NodeDef* from_node, NodeDef* to_node);
 
-  // Add fanin to node. If fanin is a control dependency, existing control
+  // Adds fanin to node. If fanin is a control dependency, existing control
   // dependencies will be checked first before adding. Otherwise fanin will be
   // added after existing non control dependency inputs.
-  //
-  // This will return true iff the node is modified. If a control dependency
-  // already exists, the node will not be modified.
   bool AddFaninInternal(NodeDef* node, const OutputPort& fanin);
 
-  // Add fanin to node. If the node or fanin do not exist in the graph, nothing
-  // will be modified in the graph. If fanin is a control dependency, existing
-  // control dependencies will be checked first before adding. Otherwise fanin
-  // will be added after existing non control dependency inputs.
-  //
-  // This will return true iff the node is modified. If a control dependency
-  // already exists, the node will not be modified.
-  bool AddFaninInternal(NodeDef* node, const TensorId& fanin);
+  // Removes all instances of regular fanin `fanin` from node `node`.
+  bool RemoveRegularFaninInternal(NodeDef* node, const OutputPort& fanin);
 
-  // Removes any fanin in node that matches to a fanin in fanins.
-  bool RemoveFanins(NodeDef* node, absl::Span<const TensorId> fanins);
+  // Removes controlling fanin `fanin_node` from node if such controlling fanin
+  // exists.
+  bool RemoveControllingFaninInternal(NodeDef* node, NodeDef* fanin_node);
+
+  // Checks if nodes to be deleted are missing or have any fanouts that will
+  // remain in the graph. If node is removed in either case, the graph will
+  // enter an invalid state.
+  Status CheckNodesCanBeDeleted(
+      const absl::flat_hash_set<string>& nodes_to_delete);
+
+  // Removes fanins of the deleted node from internal state. Control
+  // dependencies are retained iff keep_controlling_fanins is true.
+  void RemoveFaninsInternal(NodeDef* deleted_node,
+                            bool keep_controlling_fanins);
 
-  // Removes redundant control fanins from node.
-  bool DedupControllingFanins(NodeDef* node);
+  // Removes fanouts of the deleted node from internal state.
+  void RemoveFanoutsInternal(NodeDef* deleted_node);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index cdc212f6f9ecf9575e011e76a4ea1126ae534b6d..f4ee4d89b1998281678701a6d95fb79eb904c326 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "absl/strings/substitute.h"
+#include "absl/types/span.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -21,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -28,6 +31,183 @@ namespace grappler {
 namespace {
 
 using ::tensorflow::test::function::NDef;
+using FDH = FunctionDefHelper;
+
+void CompareNodeFanins(const MutableGraphView& graph, NodeDef* node,
+                       absl::Span<const string> fanins) {
+  ASSERT_EQ(node->input_size(), fanins.size());
+  for (int i = 0; i < node->input_size(); ++i) {
+    TensorId tensor_id = ParseTensorName(fanins[i]);
+    EXPECT_EQ(ParseTensorName(node->input(i)), tensor_id);
+    int port;
+    if (tensor_id.index() == Graph::kControlSlot) {
+      port = Graph::kControlSlot;
+    } else {
+      port = i;
+    }
+    MutableGraphView::InputPort input_port(node, port);
+    MutableGraphView::OutputPort output_port =
+        graph.GetOutputPort(tensor_id.node(), tensor_id.index());
+    EXPECT_TRUE(graph.GetFanin(input_port).contains(output_port));
+    EXPECT_TRUE(graph.GetFanout(output_port).contains(input_port));
+  }
+}
+
+void CompareNodeFanouts(const MutableGraphView& graph, NodeDef* node,
+                        absl::Span<const string> fanouts) {
+  auto node_fanouts =
+      graph.GetFanouts(*node, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(node_fanouts.size(), fanouts.size());
+  for (const string& fanout : fanouts) {
+    TensorId tensor_id = ParseTensorName(fanout);
+    MutableGraphView::InputPort input_port(graph.GetNode(tensor_id.node()),
+                                           tensor_id.index());
+    EXPECT_TRUE(node_fanouts.contains(input_port));
+  }
+}
+
+void CheckNode(const MutableGraphView& graph, absl::string_view node_name,
+               absl::string_view op, absl::string_view device,
+               absl::Span<const std::pair<string, FDH::AttrValueWrapper>> attrs,
+               absl::Span<const string> fanins,
+               absl::Span<const string> fanouts) {
+  NodeDef* node = graph.GetNode(node_name);
+  ASSERT_NE(node, nullptr);
+  EXPECT_EQ(node->op(), op);
+  EXPECT_EQ(node->device(), device);
+  EXPECT_EQ(node->attr_size(), attrs.size());
+  for (const auto& attr : attrs) {
+    auto it = node->attr().find(attr.first);
+    ASSERT_NE(it, node->attr().end());
+    EXPECT_TRUE(AreAttrValuesEqual(it->second, attr.second.proto));
+  }
+  CompareNodeFanins(graph, node, fanins);
+  CompareNodeFanouts(graph, node, fanouts);
+}
+
+void CheckGraph(const MutableGraphView& mutable_graph) {
+  GraphView immutable_graph(mutable_graph.graph());
+  EXPECT_EQ(mutable_graph.graph()->node_size(),
+            immutable_graph.graph()->node_size());
+  EXPECT_EQ(mutable_graph.graph(), immutable_graph.graph());
+
+  auto check_edges =
+      [](const absl::flat_hash_set<MutableGraphView::Edge>& mutable_edges,
+         const absl::flat_hash_set<GraphView::Edge>& immutable_edges) {
+        EXPECT_EQ(mutable_edges.size(), immutable_edges.size());
+        for (const auto& fanin_edge : mutable_edges) {
+          GraphView::Edge immutable_edge(
+              {fanin_edge.src.node, fanin_edge.src.port_id},
+              {fanin_edge.dst.node, fanin_edge.dst.port_id});
+          EXPECT_TRUE(immutable_edges.contains(immutable_edge));
+        }
+      };
+
+  // Check graph connectivity.
+  for (auto& node : *mutable_graph.graph()->mutable_node()) {
+    EXPECT_EQ(&node, immutable_graph.GetNode(node.name()));
+
+    auto mutable_fanins =
+        mutable_graph.GetFanins(node, /*include_controlling_nodes=*/true);
+    auto immutable_fanins =
+        immutable_graph.GetFanins(node, /*include_controlling_nodes=*/true);
+    EXPECT_EQ(mutable_fanins.size(), immutable_fanins.size());
+    for (const auto& fanin : mutable_fanins) {
+      GraphView::OutputPort immutable_fanin(fanin.node, fanin.port_id);
+      EXPECT_TRUE(immutable_fanins.contains(immutable_fanin));
+    }
+
+    auto mutable_fanouts =
+        mutable_graph.GetFanouts(node, /*include_controlled_nodes=*/true);
+    auto immutable_fanouts =
+        immutable_graph.GetFanouts(node, /*include_controlled_nodes=*/true);
+    EXPECT_EQ(mutable_fanouts.size(), immutable_fanouts.size());
+    for (const auto& fanout : mutable_fanouts) {
+      GraphView::InputPort immutable_fanout(fanout.node, fanout.port_id);
+      EXPECT_TRUE(immutable_fanouts.contains(immutable_fanout));
+    }
+
+    auto mutable_fanin_edges =
+        mutable_graph.GetFaninEdges(node, /*include_controlling_edges=*/true);
+    auto immutable_fanin_edges =
+        immutable_graph.GetFaninEdges(node, /*include_controlling_edges=*/true);
+    check_edges(mutable_fanin_edges, immutable_fanin_edges);
+
+    auto mutable_fanout_edges =
+        mutable_graph.GetFanoutEdges(node, /*include_controlled_edges=*/true);
+    auto immutable_fanout_edges =
+        immutable_graph.GetFanoutEdges(node, /*include_controlled_edges=*/true);
+    check_edges(mutable_fanout_edges, immutable_fanout_edges);
+  }
+}
+
+TEST(MutableGraphViewTest, AddSubgraph) {
+  GraphDef graph_def = test::function::GDef(
+      {
+          NDef("foo", "NotImportant", {}, {}),
+          NDef("bar", "NotImportant", {}, {}),
+          NDef("baz", "NotImportant", {"foo", "bar"}),
+      },
+      /*funcs=*/{});
+  MutableGraphView graph(&graph_def);
+
+  // `s/bar` node has inputs that are valid only if we add subgraph into the
+  // original graph.
+  GraphDef subgraph = test::function::GDef(
+      {
+          NDef("s/n0", "NotImportant", {}, {}),
+          NDef("s/n1", "NotImportant", {"bar", "s/n0"}, {}),
+      },
+      /*funcs=*/{});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+
+  // Fanins and fanouts must be updated for the nodes of the original graph, and
+  // added subgraph.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"baz:1", "s/n1"});
+  CheckNode(graph, "s/n1", "NotImportant", "", {}, {"bar", "s/n0"}, {});
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndAddFunction) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+  EXPECT_EQ(graph_def.library().function_size(), 1);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndSkipSameFunction) {
+  FunctionDef x_times_two = test::function::XTimesTwo();
+
+  GraphDef graph_def = test::function::GDef({}, {x_times_two});
+  MutableGraphView graph(&graph_def);
+
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+  EXPECT_EQ(graph_def.library().function_size(), 1);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndFailIfFunctionDifferent) {
+  FunctionDef x_times_four = test::function::XTimesFour();
+  x_times_four.mutable_signature()->set_name("XTimesTwo");
+
+  GraphDef graph_def = test::function::GDef({}, {x_times_four});
+  MutableGraphView graph(&graph_def);
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  Status status = graph.AddSubgraph(std::move(subgraph));
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(status.error_message(),
+            "MutableGraphView::AddSubgraph(function_size=1) error: Found "
+            "different function definition with the same name: XTimesTwo.");
+}
 
 TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
   // Actual node.op() is not important in this test.
@@ -35,82 +215,139 @@ TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
       {NDef("bar", "NotImportant", {}, {}),
        NDef("other", "NotImportant", {}, {}),
        NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
-       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"}),
+       NDef("foo_3", "NotImportant", {"other:2", "^bar"})},
       /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
   NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NotImportant", {}, {}));
-  NodeDef* bar = graph.GetNode("bar");
-
-  graph.UpdateFanouts(bar->name(), new_bar->name());
-
-  // Fanout nodes must have their inputs updated.
-  NodeDef* foo_1 = graph.GetNode("foo_1");
-  ASSERT_NE(foo_1, nullptr);
-  ASSERT_EQ(foo_1->input_size(), 4);
-  EXPECT_EQ(foo_1->input(0), "new_bar");
-  EXPECT_EQ(foo_1->input(1), "other");
-  EXPECT_EQ(foo_1->input(2), "new_bar:1");
-  EXPECT_EQ(foo_1->input(3), "^new_bar");
-
-  NodeDef* foo_2 = graph.GetNode("foo_2");
-  ASSERT_NE(foo_2, nullptr);
-  ASSERT_EQ(foo_2->input_size(), 3);
-  EXPECT_EQ(foo_2->input(0), "other:1");
-  EXPECT_EQ(foo_2->input(1), "new_bar:2");
-  EXPECT_EQ(foo_2->input(2), "^new_bar");
-
-  // And fanouts mapping must be also updated for both nodes.
-  bool include_control_fanouts = true;
-  auto old_node_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
-  auto new_node_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
-
-  EXPECT_TRUE(old_node_fanouts.empty());
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 0)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 2)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, -1)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
+
+  TF_EXPECT_OK(graph.UpdateFanouts("bar", new_bar->name()));
+
+  // Fanins and fanouts must be updated.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "other", "NotImportant", "", {}, {},
+            {"foo_1:1", "foo_2", "foo_3"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"new_bar", "other", "new_bar:1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"other:1", "new_bar:2"},
+            {});
+  CheckNode(graph, "foo_3", "NotImportant", "", {}, {"other:2", "^new_bar"},
+            {});
+  CheckNode(graph, "new_bar", "NotImportant", "", {}, {},
+            {"foo_1:0", "foo_1:2", "foo_2:1", "^foo_3"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddAndUpdateFanoutsKeepControls) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "Identity", {"bar_1:2"}));
+
+  TF_EXPECT_OK(graph.UpdateFanouts("bar_2", new_bar->name()));
+
+  // Fanins and fanouts must be updated.
+  CheckNode(graph, "bar_1", "Switch", "", {}, {}, {"bar_2", "new_bar"});
+  CheckNode(graph, "bar_2", "Identity", "", {}, {"bar_1:1"}, {});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_1:1", "foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"new_bar", "other", "new_bar:1", "^new_bar"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"other:1", "new_bar:2", "^new_bar"}, {});
+  CheckNode(graph, "new_bar", "Identity", "", {}, {"bar_1:2"},
+            {"foo_1", "foo_1:2", "^foo_1", "foo_2:1", "^foo_2"});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def =
       test::function::GDef({NDef("bar", "NotImportant", {}, {}),
-                            NDef("foo", "NotImportant", {"bar", "^bar"})},
+                            NDef("foo_1", "NotImportant", {"bar", "^bar"}),
+                            NDef("foo_2", "NotImportant", {"^bar"})},
                            /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
   // `new_bar` reads the output of an original `bar` node.
   NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NewBar", {"bar"}, {}));
-  NodeDef* bar = graph.GetNode("bar");
 
-  graph.UpdateFanouts("bar", new_bar->name());
+  TF_EXPECT_OK(graph.UpdateFanouts("bar", new_bar->name()));
 
-  // Foo node must read from `new_bar`.
-  NodeDef* foo = graph.GetNode("foo");
-  ASSERT_NE(foo, nullptr);
-  ASSERT_EQ(foo->input_size(), 2);
-  EXPECT_EQ(foo->input(0), "new_bar");
-  EXPECT_EQ(foo->input(1), "^new_bar");
+  // Fanins and fanouts must be updated.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"new_bar"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {}, {"new_bar"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"^new_bar"}, {});
+  CheckNode(graph, "new_bar", "NewBar", "", {}, {"bar"}, {"foo_1", "^foo_2"});
 
-  // And the `new_bar` should read from the original `bar`.
-  ASSERT_EQ(new_bar->input_size(), 1);
-  ASSERT_EQ(new_bar->input(0), "bar");
+  CheckGraph(graph);
+}
 
-  // And fanouts mapping must be also updated for both nodes.
-  bool include_control_fanouts = true;
-  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
-  auto new_bar_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
+TEST(MutableGraphViewTest, UpdateFanoutsToSwitchWithControlFromSwitch) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {}),
+       NDef("c", "NotImportant", {}, {}), NDef("d", "NotImportant", {}, {}),
+       NDef("e", "NotImportant", {"c", "b", "^a", "^d"})},
+      /*funcs=*/{});
 
-  EXPECT_EQ(bar_fanouts.size(), 1);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(new_bar, 0)), 1);
+  MutableGraphView graph(&graph_def);
 
-  EXPECT_EQ(new_bar_fanouts.size(), 2);
-  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, 0)), 1);
-  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, -1)), 1);
+  Status s = graph.UpdateFanouts("a", "b");
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::UpdateFanouts(from_node_name='a', to_node_name='b') "
+      "error: can't update fanouts to node 'b' as it will become a Switch "
+      "control dependency.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+  s = graph.UpdateFanouts("d", "b");
+  EXPECT_FALSE(s.ok());
+  expected_msg =
+      "MutableGraphView::UpdateFanouts(from_node_name='d', to_node_name='b') "
+      "error: can't update fanouts to node 'b' as it will become a Switch "
+      "control dependency.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 5);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {"e:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {"e:0"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c", "b", "^a", "^d"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateFanoutsToSwitchWithNoControlFromSwitch) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {}),
+       NDef("c", "NotImportant", {}, {}), NDef("d", "NotImportant", {}, {}),
+       NDef("e", "NotImportant", {"c", "b", "^a", "^d"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanouts("c", "b"));
+
+  EXPECT_EQ(graph.graph()->node_size(), 5);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {"e:0", "e:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"b", "b", "^a", "^d"}, {});
+
+  CheckGraph(graph);
 }
 
 GraphDef SimpleMutateFaninGraph() {
@@ -128,107 +365,314 @@ GraphDef SimpleMutateFaninGraph() {
   return graph_def;
 }
 
-void CompareNodeInputs(const MutableGraphView& graph, const NodeDef* expected,
-                       NodeDef* actual) {
-  ASSERT_EQ(actual->input_size(), expected->input_size());
-  int port;
-  for (int i = 0; i < actual->input_size(); ++i) {
-    EXPECT_EQ(actual->input(i), expected->input(i));
-    TensorId tensor_id = ParseTensorName(expected->input(i));
-    if (tensor_id.index() == Graph::kControlSlot) {
-      port = Graph::kControlSlot;
-    } else {
-      port = i;
+absl::flat_hash_map<string, std::vector<string>> GetNodeInputsFromGraph(
+    const GraphDef& graph, absl::string_view node_to_exclude) {
+  absl::flat_hash_map<string, std::vector<string>> node_inputs;
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
     }
-    MutableGraphView::InputPort input_port(actual, port);
-    MutableGraphView::OutputPort output_port =
-        graph.GetOutputPort(tensor_id.node(), tensor_id.index());
-    EXPECT_EQ(graph.GetFanin(input_port).contains(output_port), true);
-    EXPECT_EQ(graph.GetFanout(output_port).contains(input_port), true);
+    node_inputs[node.name()] =
+        std::vector<string>(node.input().begin(), node.input().end());
   }
+  return node_inputs;
 }
 
-void TestAddFanin(absl::string_view node_name, const TensorId& fanin_to_add,
-                  bool modified, const NodeDef* expected_node) {
+void CheckUnmodifiedNodeFanins(
+    const GraphDef& graph, absl::string_view node_to_exclude,
+    const absl::flat_hash_map<string, std::vector<string>>&
+        unmodified_node_inputs) {
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    auto it = unmodified_node_inputs.find(node.name());
+    ASSERT_NE(it, unmodified_node_inputs.end());
+    ASSERT_EQ(it->second.size(), node.input_size());
+    for (int i = 0; i < node.input_size(); ++i) {
+      EXPECT_EQ(node.input(i), it->second[i]);
+    }
+  }
+}
+
+void TestAddRegularFanin(absl::string_view node_name, bool node_exists,
+                         const TensorId& fanin_to_add, bool success,
+                         const string& error_msg,
+                         absl::Span<const string> expected_fanins) {
   GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
 
-  auto node = graph.GetNode(node_name);
-  if (expected_node == nullptr) {
-    EXPECT_EQ(node, nullptr);
-  } else {
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
     EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
   }
 
-  EXPECT_EQ(modified, graph.AddFanin(node_name, fanin_to_add));
-  if (expected_node != nullptr) {
-    CompareNodeInputs(graph, expected_node, node);
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.AddRegularFanin(node_name, fanin_to_add);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
   }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, AddFanin) {
-  NodeDef expected_node;
+TEST(MutableGraphViewTest, AddRegularFanin) {
+  string error_msg;
   // Add input to node with 1 input 0 controls.
-  expected_node = NDef("", "", {"a", "b:1"});
-  TestAddFanin("foo_1", {"b", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_1", /*node_exists=*/true, {"b", 1}, /*success=*/true,
+                      error_msg, {"a", "b:1"});
   // Add input to node with multiple inputs and 0 controls.
-  expected_node = NDef("", "", {"b", "a:1", "a:1", "b:2"});
-  TestAddFanin("foo_3", {"b", 2}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_3", /*node_exists=*/true, {"b", 2}, /*success=*/true,
+                      error_msg, {"b", "a:1", "a:1", "b:2"});
   // Add input to node with 1 input multiple controls.
-  expected_node = NDef("", "", {"b", "a", "^c", "^a"});
-  TestAddFanin("foo_2", {"a", 0}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_2", /*node_exists=*/true, {"a", 0}, /*success=*/true,
+                      error_msg, {"b", "a", "^c"});
   // Add input to node with multiple inputs and controls.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "a:1", "^d", "^c"});
-  TestAddFanin("foo_4", {"a", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_4", /*node_exists=*/true, {"a", 1}, /*success=*/true,
+                      error_msg, {"a", "b:2", "b:2", "a:1", "^d", "^c"});
   // Add input to node with 0 inputs 0 controls.
-  expected_node = NDef("", "", {"a:1"});
-  TestAddFanin("foo_5", {"a", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_5", /*node_exists=*/true, {"a", 1}, /*success=*/true,
+                      error_msg, {"a:1"});
   // Add input to node with 0 inputs multiple controls.
-  expected_node = NDef("", "", {"c:1", "^b", "^a"});
-  TestAddFanin("foo_6", {"c", 1}, /*modified=*/true, &expected_node);
+  TestAddRegularFanin("foo_6", /*node_exists=*/true, {"c", 1}, /*success=*/true,
+                      error_msg, {"c:1", "^b", "^a"});
 
   // Add control to node with 1 input 0 controls.
-  expected_node = NDef("", "", {"a", "^b"});
-  TestAddFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_1', fanin='^b') error: "
+      "fanin '^b' must be a regular tensor id.";
+  TestAddRegularFanin("foo_1", /*node_exists=*/true, {"b", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"a"});
   // Add control to node with multiple inputs and 0 controls.
-  expected_node = NDef("", "", {"b", "a:1", "a:1", "^c"});
-  TestAddFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_3', fanin='^c') error: "
+      "fanin '^c' must be a regular tensor id.";
+  TestAddRegularFanin("foo_3", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
   // Add control to node with 1 input multiple controls.
-  expected_node = NDef("", "", {"b", "^a", "^c", "^d"});
-  TestAddFanin("foo_2", {"d", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_2', fanin='^d') error: "
+      "fanin '^d' must be a regular tensor id.";
+  TestAddRegularFanin("foo_2", /*node_exists=*/true, {"d", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"b", "^a", "^c"});
   // Add control to node with multiple input multiple controls.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c", "^d", "^a"});
-  TestAddFanin("foo_4", {"a", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_4', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
+  TestAddRegularFanin("foo_4", /*node_exists=*/true, {"a", Graph::kControlSlot},
+                      /*success=*/false, error_msg,
+                      {"a", "b:2", "b:2", "^c", "^d"});
   // Add control to node with 0 inputs 0 controls.
-  expected_node = NDef("", "", {"^a"});
-  TestAddFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_5', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
+  TestAddRegularFanin("foo_5", /*node_exists=*/true, {"a", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {});
   // Add control to node with 0 inputs multiple controls.
-  expected_node = NDef("", "", {"^a", "^b", "^c"});
-  TestAddFanin("foo_6", {"c", Graph::kControlSlot}, /*modified=*/true,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_6', fanin='^c') error: "
+      "fanin '^c' must be a regular tensor id.";
+  TestAddRegularFanin("foo_6", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"^a", "^b"});
   // Add control to node with control that already exists.
-  expected_node = NDef("", "", {"b", "^a", "^c"});
-  TestAddFanin("foo_2", {"a", Graph::kControlSlot}, /*modified=*/false,
-               &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_2', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
+  TestAddRegularFanin("foo_2", /*node_exists=*/true, {"a", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"b", "^a", "^c"});
+
+  // Add fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', fanin='a:0') "
+      "error: node 'foo_missing' was not found.";
+  TestAddRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
+                      /*success=*/false, error_msg, {});
+  // Add fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_1', "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestAddRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
+                      /*success=*/false, error_msg, {"a"});
+  // Add fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', "
+      "fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFanin("foo_missing", /*node_exists=*/false, {"bar_missing", 0},
+                      /*success=*/false, error_msg, {});
+  // Add control fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', "
+      "fanin='^bar_missing') error: fanin '^bar_missing' must be a regular "
+      "tensor id.";
+  TestAddRegularFanin("foo_missing", /*node_exists=*/false,
+                      {"bar_missing", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {});
+
+  // Add self to create cycle.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_6', fanin='foo_6:2') "
+      "error: can't add fanin 'foo_6:2' to self.";
+  TestAddRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
+                      /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestAddRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                               int port, const TensorId& fanin_to_add,
+                               bool success, const string& error_msg,
+                               absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.AddRegularFaninByPort(node_name, port, fanin_to_add);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddRegularFaninByPort) {
+  string error_msg;
+  // Add input at start to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"d:2", "b", "a:1", "a:1"});
+  // Add input at end to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/3, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "a:1", "a:1", "d:2"});
+  // Add input in middle to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/2, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "a:1", "d:2", "a:1"});
+  // Add input at start to node with some inputs and some controls.
+  TestAddRegularFaninByPort("foo_2", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"d:2", "b", "^c", "^a"});
+  // Add input at end to node with some inputs and some controls.
+  TestAddRegularFaninByPort("foo_2", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "d:2", "^c", "^a"});
+  // Add input in middle to node with some inputs and some controls, and dedup
+  // controls.
+  TestAddRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/2, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"a", "b:2", "d:2", "b:2", "^c"});
+  // Add input to node with no inputs and no controls.
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg, {"d:2"});
+  // Add input to node with no inputs and some controls.
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg, {"d:2", "^b", "^a"});
+  // Add fanin should dedup control.
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0, {"b", 2},
+                            /*success=*/true, error_msg, {"b:2", "^a"});
+
+  // Add controlling fanin.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=2, "
+      "fanin='^d') error: fanin '^d' must be a regular tensor id.";
+  TestAddRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/2, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Add fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_5', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/-1,
+                            {"d", 2},
+                            /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_5', port=1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/-1,
+                            {"d", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 3].";
+  TestAddRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=4, "
+      "fanin='d:2') error: port must be in range [0, 3].";
+  TestAddRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/4, {"d", 2},
+                            /*success=*/false, error_msg,
+                            {"a", "b:2", "b:2", "^c", "^d"});
 
   // Add fanin to node where node is missing.
-  TestAddFanin("foo_missing", {"a", 0}, /*modified=*/false, nullptr);
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='a:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                            {"a", 0},
+                            /*success=*/false, error_msg, {});
   // Add fanin to node where fanin is missing.
-  expected_node = NDef("", "", {"a"});
-  TestAddFanin("foo_1", {"bar_missing", 0}, /*modified=*/false, &expected_node);
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_1', port=0, "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestAddRegularFaninByPort("foo_1", /*node_exists=*/true, /*port=*/0,
+                            {"bar_missing", 0},
+                            /*success=*/false, error_msg, {"a"});
   // Add fanin to node where node and fanin are missing.
-  TestAddFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
-               /*expected_node=*/nullptr);
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                            {"bar_missing", 0},
+                            /*success=*/false, error_msg, {});
+
+  // Add self to create cycle.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='foo_6:2') error: can't add fanin 'foo_6:2' to self.";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                            {"foo_6", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
 }
 
-void CheckFanout(const MutableGraphView& graph, const TensorId& fanin,
-                 absl::string_view node_name) {
+void CheckFanoutRemoved(const MutableGraphView& graph, const TensorId& fanin,
+                        absl::string_view node_name) {
   MutableGraphView::OutputPort output_port =
       graph.GetOutputPort(fanin.node(), fanin.index());
   auto fanouts = graph.GetFanout(output_port);
@@ -237,311 +681,1197 @@ void CheckFanout(const MutableGraphView& graph, const TensorId& fanin,
   }
 }
 
-void TestRemoveFanin(absl::string_view node_name,
-                     const TensorId& fanin_to_remove, bool modified,
-                     const NodeDef* expected_node) {
+void TestRemoveRegularFanin(absl::string_view node_name, bool node_exists,
+                            const TensorId& fanin_to_remove, bool success,
+                            const string& error_msg,
+                            absl::Span<const string> expected_fanins) {
   GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
 
-  auto node = graph.GetNode(node_name);
-  if (expected_node == nullptr) {
-    EXPECT_EQ(nullptr, node);
-  } else {
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
     EXPECT_NE(nullptr, node);
+  } else {
+    EXPECT_EQ(nullptr, node);
   }
 
-  EXPECT_EQ(modified, graph.RemoveFanin(node_name, fanin_to_remove));
-  if (expected_node != nullptr) {
-    CompareNodeInputs(graph, expected_node, node);
-    if (modified) {
-      CheckFanout(graph, fanin_to_remove, node_name);
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveRegularFanin(node_name, fanin_to_remove);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+    if (success) {
+      CheckFanoutRemoved(graph, fanin_to_remove, node_name);
     }
   }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, RemoveFanin) {
-  NodeDef expected_node;
+TEST(MutableGraphViewTest, RemoveRegularFanin) {
+  string error_msg;
   // Remove input from node with 1 input 0 controls.
-  expected_node = NDef("", "", {});
-  TestRemoveFanin("foo_1", {"a", 0}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true, {"a", 0},
+                         /*success=*/true, error_msg, {});
   // Remove input from node with multiple inputs and 0 controls.
-  expected_node = NDef("", "", {"b"});
-  TestRemoveFanin("foo_3", {"a", 1}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_3", /*node_exists=*/true, {"a", 1},
+                         /*success=*/true, error_msg, {"b"});
   // Remove input from node with 1 input multiple controls.
-  expected_node = NDef("", "", {"^a", "^c"});
-  TestRemoveFanin("foo_2", {"b", 0}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_2", /*node_exists=*/true, {"b", 0},
+                         /*success=*/true, error_msg, {"^a", "^c"});
   // Remove input from node with multiple inputs and controls.
-  expected_node = NDef("", "", {"a", "^c", "^d"});
-  TestRemoveFanin("foo_4", {"b", 2}, /*modified=*/true, &expected_node);
+  TestRemoveRegularFanin("foo_4", /*node_exists=*/true, {"b", 2},
+                         /*success=*/true, error_msg, {"a", "^c", "^d"});
+  // Remove input from node with multiple inputs and controls, and results in
+  // shifting of ports.
+  TestRemoveRegularFanin("foo_4", /*node_exists=*/true, {"a", 0},
+                         /*success=*/true, error_msg,
+                         {"b:2", "b:2", "^c", "^d"});
 
   // Remove control from node with 1 input multiple controls.
-  expected_node = NDef("", "", {"b", "^c"});
-  TestRemoveFanin("foo_2", {"a", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_2', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_2", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"b", "^a", "^c"});
   // Remove control from node with multiple input multiple controls.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^c"});
-  TestRemoveFanin("foo_4", {"d", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_4', fanin='^d') "
+      "error: fanin '^d' must be a regular tensor id.";
+  TestRemoveRegularFanin(
+      "foo_4", /*node_exists=*/true, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
   // Remove control from node with 0 inputs multiple controls.
-  expected_node = NDef("", "", {"^b"});
-  TestRemoveFanin("foo_6", {"a", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_6', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"^a", "^b"});
 
   // Remove input from node with 0 inputs 0 controls.
-  expected_node = NDef("", "", {});
-  TestRemoveFanin("foo_5", {"a", 1}, /*modified=*/false, &expected_node);
+  error_msg = "";
+  TestRemoveRegularFanin("foo_5", /*node_exists=*/true, {"a", 1},
+                         /*success=*/true, error_msg, {});
   // Remove input from node with 0 inputs multiple controls.
-  expected_node = NDef("", "", {"^a", "^b"});
-  TestRemoveFanin("foo_6", {"a", 1}, /*modified=*/false, &expected_node);
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true, {"a", 1},
+                         /*success=*/true, error_msg, {"^a", "^b"});
+
   // Remove control from node with 1 input 0 controls.
-  expected_node = NDef("", "", {"a"});
-  TestRemoveFanin("foo_1", {"b", Graph::kControlSlot}, /*modified=*/false,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_1', fanin='^b') "
+      "error: fanin '^b' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true,
+                         {"b", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"a"});
   // Remove control from node with multiple inputs and 0 controls.
-  expected_node = NDef("", "", {"b", "a:1", "a:1"});
-  TestRemoveFanin("foo_3", {"c", Graph::kControlSlot}, /*modified=*/false,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_3', fanin='^c') "
+      "error: fanin '^c' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_3", /*node_exists=*/true,
+                         {"c", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"b", "a:1", "a:1"});
   // Remove control from node with 0 inputs 0 controls.
-  expected_node = NDef("", "", {});
-  TestRemoveFanin("foo_5", {"a", Graph::kControlSlot}, /*modified=*/false,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_5', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_5", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {});
 
   // Remove fanin from node where node is missing.
-  TestRemoveFanin("foo_missing", {"a", 0}, /*modified=*/false,
-                  /*expected_node=*/nullptr);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='a:0') error: node 'foo_missing' was not found.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
+                         /*success=*/false, error_msg, {});
   // Remove fanin from node where fanin is missing.
-  expected_node = NDef("", "", {"a"});
-  TestRemoveFanin("foo_1", {"bar_missing", 0}, /*modified=*/false,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_1', "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
+                         /*success=*/false, error_msg, {"a"});
   // Remove fanin from node where node and fanin are missing.
-  TestRemoveFanin("foo_missing", {"bar_missing", 0}, /*modified=*/false,
-                  /*expected_node=*/nullptr);
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
+                         {"bar_missing", 0}, /*success=*/false, error_msg, {});
+  // Remove control from node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='^bar_missing') error: fanin '^bar_missing' must be a regular "
+      "tensor id.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
+                         {"bar_missing", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {});
+
+  // Remove self.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_6', "
+      "fanin='foo_6:2') error: can't remove fanin 'foo_6:2' from self.";
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
+                         /*success=*/false, error_msg, {"^a", "^b"});
 }
 
-void TestRemoveAllFanins(absl::string_view node_name,
-                         bool keep_controlling_nodes, bool modified,
-                         const NodeDef* expected_node) {
+void TestRemoveRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                                  int port, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
   GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
 
-  auto node = graph.GetNode(node_name);
-  absl::flat_hash_set<string> fanin_strings;
-  if (expected_node == nullptr) {
-    EXPECT_EQ(node, nullptr);
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(nullptr, node);
   } else {
+    EXPECT_EQ(nullptr, node);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveRegularFaninByPort(node_name, port);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveRegularFaninByPort) {
+  string error_msg;
+  // Remove input at start of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/true, error_msg, {"a:1", "a:1"});
+  // Remove input at end of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/2,
+                               /*success=*/true, error_msg, {"b", "a:1"});
+  // Remove input in middle of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/true, error_msg, {"b", "a:1"});
+  // Remove input at start of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/true, error_msg,
+                               {"b:2", "b:2", "^d", "^c"});
+  // Remove input at end of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/2,
+                               /*success=*/true, error_msg,
+                               {"a", "b:2", "^d", "^c"});
+  // Remove input in middle of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/true, error_msg,
+                               {"a", "b:2", "^d", "^c"});
+
+  // Remove input from node with no inputs and no controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_5', port=0) "
+      "error: no available ports as node has no regular fanins.";
+  TestRemoveRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/false, error_msg, {});
+  // Remove input from node with no inputs and some controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_6', port=1) "
+      "error: no available ports as node has no regular fanins.";
+  TestRemoveRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/false, error_msg, {"^a", "^b"});
+
+  // Remove fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_3', port=-1) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/-1,
+                               /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_3', port=3) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/3,
+                               /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_4', port=-1) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/-1,
+                               /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_4', port=3) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/3,
+                               /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Remove fanin from node where node is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_missing', "
+      "port=0) error: node 'foo_missing' was not found.";
+  TestRemoveRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                               /*success=*/false, error_msg, {});
+}
+
+void TestRemoveAllFanins(absl::string_view node_name, bool node_exists,
+                         bool keep_controlling_nodes, bool success,
+                         const string& error_msg,
+                         absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  absl::flat_hash_set<string> fanin_strings;
+  if (node_exists) {
     EXPECT_NE(node, nullptr);
     fanin_strings.insert(node->input().begin(), node->input().end());
+  } else {
+    EXPECT_EQ(node, nullptr);
   }
 
-  EXPECT_EQ(modified, graph.RemoveAllFanins(node_name, keep_controlling_nodes));
-  if (expected_node != nullptr) {
-    CompareNodeInputs(graph, expected_node, node);
-    if (modified) {
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveAllFanins(node_name, keep_controlling_nodes);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+    if (success) {
       TensorId tensor_id;
       auto retained_inputs = absl::flat_hash_set<string>(node->input().begin(),
                                                          node->input().end());
       for (const string& fanin : fanin_strings) {
         if (!retained_inputs.contains(fanin)) {
           tensor_id = ParseTensorName(fanin);
-          CheckFanout(graph, tensor_id, node_name);
+          CheckFanoutRemoved(graph, tensor_id, node_name);
         }
       }
     }
   }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, RemoveAllFanins) {
-  NodeDef expected_node;
+  string error_msg;
   // Remove all fanins from node with no control dependencies.
-  expected_node = NDef("", "", {});
-  TestRemoveAllFanins("foo_3", /*keep_controlling_nodes=*/false,
-                      /*modified=*/true, &expected_node);
+  TestRemoveAllFanins("foo_3", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
   // Remove all fanins from node with control dependencies.
-  TestRemoveAllFanins("foo_4", /*keep_controlling_nodes=*/false,
-                      /*modified=*/true, &expected_node);
+  TestRemoveAllFanins("foo_4", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
 
   // Remove all fanins from node with no control dependencies and preserve
   // control dependencies.
-  TestRemoveAllFanins("foo_3", /*keep_controlling_nodes=*/true,
-                      /*modified=*/true, &expected_node);
+  TestRemoveAllFanins("foo_3", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {});
   // Remove all fanins from node with control dependencies and preserve control
   // dependencies.
-  expected_node = NDef("", "", {"^c", "^d"});
-  TestRemoveAllFanins("foo_4", /*keep_controlling_nodes=*/true,
-                      /*modified=*/true, &expected_node);
+  TestRemoveAllFanins("foo_4", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {"^c", "^d"});
 
   // Remove all fanins from node with no fanins.
-  expected_node = NDef("", "", {});
-  TestRemoveAllFanins("foo_5", /*keep_controlling_nodes=*/false,
-                      /*modified=*/false, &expected_node);
-  TestRemoveAllFanins("foo_5", /*keep_controlling_nodes=*/true,
-                      /*modified=*/false, &expected_node);
+  TestRemoveAllFanins("foo_5", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
+  TestRemoveAllFanins("foo_5", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {});
 
   // Remove all fanins from node with only control dependencies.
-  TestRemoveAllFanins("foo_6", /*keep_controlling_nodes=*/false,
-                      /*modified=*/true, &expected_node);
-  expected_node = NDef("", "", {"^a", "^b"});
-  TestRemoveAllFanins("foo_6", /*keep_controlling_nodes=*/true,
-                      /*modified=*/false, &expected_node);
+  TestRemoveAllFanins("foo_6", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
+  TestRemoveAllFanins("foo_6", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {"^a", "^b"});
 
   // Remove all fanins from node where node is missing.
-  TestRemoveAllFanins("foo_missing", /*keep_controlling_nodes=*/false,
-                      /*modified=*/false, /*expected_node=*/nullptr);
-  TestRemoveAllFanins("foo_missing", /*keep_controlling_nodes=*/true,
-                      /*modified=*/false, /*expected_node=*/nullptr);
+  error_msg =
+      "MutableGraphView::RemoveAllFanins(node_name='foo_missing', "
+      "keep_controlling_fanins=false) error: node 'foo_missing' was not found.";
+  TestRemoveAllFanins("foo_missing", /*node_exists=*/false,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::RemoveAllFanins(node_name='foo_missing', "
+      "keep_controlling_fanins=true) error: node 'foo_missing' was not found.";
+  TestRemoveAllFanins("foo_missing", /*node_exists=*/false,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/false, error_msg, {});
 }
 
-void TestUpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
-                     const TensorId& to_fanin, bool modified,
-                     const NodeDef* expected_node) {
+void TestUpdateFanin(absl::string_view node_name, bool node_exists,
+                     const TensorId& from_fanin, const TensorId& to_fanin,
+                     bool success, const string& error_msg,
+                     absl::Span<const string> expected_fanins) {
   GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
 
-  auto node = graph.GetNode(node_name);
-  if (expected_node == nullptr) {
-    EXPECT_EQ(node, nullptr);
-  } else {
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
     EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
   }
 
-  EXPECT_EQ(modified, graph.UpdateFanin(node_name, from_fanin, to_fanin));
-  if (expected_node != nullptr) {
-    CompareNodeInputs(graph, expected_node, node);
-    if (modified) {
-      CheckFanout(graph, from_fanin, node_name);
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateFanin(node_name, from_fanin, to_fanin);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+    if (success) {
+      CheckFanoutRemoved(graph, from_fanin, node_name);
     }
   }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, UpdateFanin) {
-  NodeDef expected_node;
+  string error_msg;
   // Update fanin from non control to non control.
-  expected_node = NDef("", "", {"a", "b:3", "b:3", "^c", "^d"});
-  TestUpdateFanin("foo_4", {"b", 2}, {"b", 3}, /*modified=*/true,
-                  &expected_node);
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2}, {"b", 3},
+                  /*success=*/true, error_msg, {"a", "b:3", "b:3", "^c", "^d"});
   // Update fanin from non control to control.
-  expected_node = NDef("", "", {"a", "^c", "^d", "^b"});
-  TestUpdateFanin("foo_4", {"b", 2}, {"b", Graph::kControlSlot},
-                  /*modified=*/true, &expected_node);
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2},
+                  {"b", Graph::kControlSlot},
+                  /*success=*/true, error_msg, {"a", "^c", "^d", "^b"});
   // Update fanin from control to non control.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "d:1", "^c"});
-  TestUpdateFanin("foo_4", {"d", Graph::kControlSlot}, {"d", 1},
-                  /*modified=*/true, &expected_node);
+  TestUpdateFanin(
+      "foo_4", /*node_exists=*/true, {"d", Graph::kControlSlot}, {"d", 1},
+      /*success=*/true, error_msg, {"a", "b:2", "b:2", "d:1", "^c"});
   // Update fanin from control to control.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^d", "^b"});
-  TestUpdateFanin("foo_4", {"c", Graph::kControlSlot},
-                  {"b", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                  {"b", Graph::kControlSlot}, /*success=*/true, error_msg,
+                  {"a", "b:2", "b:2", "^d"});
   // Update fanin from control to existing control.
-  expected_node = NDef("", "", {"a", "b:2", "b:2", "^d"});
-  TestUpdateFanin("foo_4", {"c", Graph::kControlSlot},
-                  {"d", Graph::kControlSlot}, /*modified=*/true,
-                  &expected_node);
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                  {"d", Graph::kControlSlot}, /*success=*/true, error_msg,
+                  {"a", "b:2", "b:2", "^d"});
 
   // Update fanin of node where from and to fanins are the same.
-  expected_node = NDef("", "", {"a"});
-  TestUpdateFanin("foo_1", {"a", -1}, {"a", -1}, /*modified=*/false,
-                  &expected_node);
-  TestUpdateFanin("foo_1", {"a", 0}, {"a", 0}, /*modified=*/false,
-                  &expected_node);
-  TestUpdateFanin("foo_1", {"a", 1}, {"a", 1}, /*modified=*/false,
-                  &expected_node);
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", -1}, {"a", -1},
+                  /*success=*/true, error_msg, {"a"});
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0}, {"a", 0},
+                  /*success=*/true, error_msg, {"a"});
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 1}, {"a", 1},
+                  /*success=*/true, error_msg, {"a"});
+
   // Update fanin of node where node is missing.
-  TestUpdateFanin("foo_missing", {"a", 0}, {"a", 1}, /*modified=*/false,
-                  /*expected_node=*/nullptr);
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='a:0', to_fanin='a:1') error: node 'foo_missing' was not "
+      "found.";
+  TestUpdateFanin("foo_missing", /*node_exists=*/false, {"a", 0}, {"a", 1},
+                  /*success=*/false, error_msg, {});
   // Update fanin of node where from fanin is missing.
-  TestUpdateFanin("foo_1", {"from_bar_missing", 0}, {"a", 1},
-                  /*modified=*/false, &expected_node);
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', "
+      "from_fanin='from_bar_missing:0', to_fanin='a:1') error: node "
+      "'from_bar_missing' was not found.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"from_bar_missing", 0},
+                  {"a", 1},
+                  /*success=*/false, error_msg, {"a"});
   // Update fanin of node where to fanin is missing.
-  TestUpdateFanin("foo_1", {"a", 0}, {"to_bar_missing", 1}, /*modified=*/false,
-                  &expected_node);
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:0', "
+      "to_fanin='to_bar_missing:1') error: node 'to_bar_missing' was not "
+      "found.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0},
+                  {"to_bar_missing", 1}, /*success=*/false, error_msg, {"a"});
   // Update fanin of node where from/to fanins and node are missing.
-  TestUpdateFanin("foo_missing", {"from_bar_missing", 0}, {"to_bar_missing", 1},
-                  /*modified=*/false, /*expected_node=*/nullptr);
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='from_bar_missing:0', to_fanin='to_bar_missing:1') error: "
+      "node 'foo_missing' was not found.";
+  TestUpdateFanin("foo_missing", /*node_exists=*/false, {"from_bar_missing", 0},
+                  {"to_bar_missing", 1},
+                  /*success=*/false, error_msg, {});
+  // Update fanin of node where from fanin is invalid.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:-2', "
+      "to_fanin='a:0') error: fanin 'a:-2' must be a valid tensor id.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", -2}, {"a", 0},
+                  /*success=*/false, error_msg, {"a"});
+  // Update fanin of node where to fanin is invalid.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:0', "
+      "to_fanin='a:-2') error: fanin 'a:-2' must be a valid tensor id.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0}, {"a", -2},
+                  /*success=*/false, error_msg, {"a"});
+  // Update fanin of node where from/to fanins are invalid and missing and node
+  // is missing.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='from_bar_missing:-2', to_fanin='to_bar_missing:-3') error: "
+      "fanin 'from_bar_missing:-2' must be a valid tensor id.";
+  TestUpdateFanin("foo_missing", /*node_exists=*/false,
+                  {"from_bar_missing", -2}, {"to_bar_missing", -3},
+                  /*success=*/false, error_msg, {});
+
+  // Update to self to create cycle.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='b:2', "
+      "to_fanin='foo_4:3') error: can't update fanin to or from self.";
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2}, {"foo_4", 3},
+                  /*success=*/false, error_msg,
+                  {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='b:2', "
+      "to_fanin='^foo_4') error: can't update fanin to or from self.";
+  TestUpdateFanin(
+      "foo_4", /*node_exists=*/true, {"b", 2}, {"foo_4", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='^c', "
+      "to_fanin='foo_4:4') error: can't update fanin to or from self.";
+  TestUpdateFanin(
+      "foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot}, {"foo_4", 4},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='^c', "
+      "to_fanin='^foo_4') error: can't update fanin to or from self.";
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                  {"foo_4", Graph::kControlSlot}, /*success=*/false, error_msg,
+                  {"a", "b:2", "b:2", "^c", "^d"});
+}
+
+void TestUpdateFaninFromFaninToNodeAsSwitchControl(const TensorId& fanin) {
+  string tensor_id_str = TensorIdToString(fanin);
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {}),
+       NDef("c", "NotImportant", {tensor_id_str})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.UpdateFanin("c", fanin, {"b", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  string expected_msg = absl::Substitute(
+      "MutableGraphView::UpdateFanin(node_name='c', from_fanin='$0', "
+      "to_fanin='^b') error: can't update to fanin '^b' as it will become a "
+      "Switch control dependency.",
+      fanin.ToString());
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+
+  string fanout = IsControlInput(fanin) ? AsControlDependency("c") : "c";
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {fanout});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {tensor_id_str}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateFaninToNodeAsSwitchControl) {
+  TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", 0});
+  TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", 1});
+  TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", Graph::kControlSlot});
+}
+
+void TestUpdateRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                                  int port, const TensorId& fanin, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateRegularFaninByPort(node_name, port, fanin);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateRegularFaninByPort) {
+  string error_msg;
+  // Update input at start to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/0, {"d", 2},
+      /*success=*/true, error_msg, {"d:2", "a:1", "a:1"});
+  // Update input at end to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/2, {"d", 2},
+      /*success=*/true, error_msg, {"b", "a:1", "d:2"});
+  // Update input in middle to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/1, {"d", 2},
+      /*success=*/true, error_msg, {"b", "d:2", "a:1"});
+  // Update input at start to node with some inputs and some controls, and dedup
+  // controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/0, {"d", 2},
+      /*success=*/true, error_msg, {"d:2", "b:2", "b:2", "^c"});
+  // Update input at end to node with some inputs and some controls, and dedup
+  // controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/2, {"d", 2},
+      /*success=*/true, error_msg, {"a", "b:2", "d:2", "^c"});
+  // Update input in middle to node with some inputs and some controls and
+  // dedup controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/1, {"d", 2},
+      /*success=*/true, error_msg, {"a", "d:2", "b:2", "^c"});
+
+  // Update input to controlling fanin.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=1, "
+      "fanin='^d') error: fanin '^d' must be a regular tensor id.";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/1, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Update fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=-1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/-1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=0, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=-1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/-1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_3', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_3', port=3, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/3, {"d", 2},
+      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=3, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/3, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Update fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='a:0') error: node 'foo_missing' was not found.";
+  TestUpdateRegularFaninByPort("foo_missing", /*node_exists=*/false,
+                               /*port=*/0, {"a", 0},
+                               /*success=*/false, error_msg, {});
+  // Update fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_1', port=0, "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not "
+      "found.";
+  TestUpdateRegularFaninByPort("foo_1", /*node_exists=*/true, /*port=*/0,
+                               {"bar_missing", 0},
+                               /*success=*/false, error_msg, {"a"});
+  // Update fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestUpdateRegularFaninByPort("foo_missing", /*node_exists=*/false,
+                               /*port=*/0, {"bar_missing", 0},
+                               /*success=*/false, error_msg, {});
+
+  // Update self to create cycle.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='foo_6:2') error: can't add fanin 'foo_6:2' to self.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                               {"foo_6", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestSwapRegularFaninsByPorts(absl::string_view node_name, bool node_exists,
+                                  int from_port, int to_port, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.SwapRegularFaninsByPorts(node_name, from_port, to_port);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, SwapRegularFaninsByPorts) {
+  string error_msg;
+  // Swapping first and last regular fanins
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"a:1", "a:1", "b"});
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/2,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"a:1", "a:1", "b"});
+  // Swapping first and last regular fanins, in node with controls.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"b:2", "b:2", "a", "^c", "^d"});
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/2,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"b:2", "b:2", "a", "^c", "^d"});
+  // Swapping middle regular fanin.
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"a:1", "b", "a:1"});
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"a:1", "b", "a:1"});
+  // Swapping middle regular fanin, in node with controls.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"b:2", "a", "b:2", "^c", "^d"});
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"b:2", "a", "b:2", "^c", "^d"});
+  // Swapping same port.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  // Swapping same fanin but different port.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Swaping fanins at out of bounds ports.
+  // Node with no regular fanins and no controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=-1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=-1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  // Node with no regular fanins and some controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=-1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=-1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  // Node with regular fanins and no controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=-1, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=0, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=0, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=3, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=-1, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=3, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  // Node with regular fanins and controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=-1, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=0, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=0, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=3, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=-1, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=3, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Swapping fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_missing', "
+      "from_port=0, to_port=1) error: node 'foo_missing' was not found.";
+  TestSwapRegularFaninsByPorts("foo_missing", /*node_exists=*/false,
+                               /*from_port=*/0, /*to_port=*/1,
+                               /*success=*/false, error_msg, {});
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnGraphInit) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "Switch", {}, {}), NDef("d", "Identity", {"c:1"}),
+       NDef("foo_1", "IdentityN", {"a", "b:1", "^b"}),
+       NDef("foo_2", "IdentityN", {"a", "^b", "^b"}),
+       NDef("foo_3", "IdentityN", {"a", "b:1", "^b", "^b"}),
+       NDef("foo_4", "IdentityN", {"a:2", "b:1", "^b", "^b", "^a", "^a"}),
+       NDef("foo_5", "NotImportant", {"a:2", "b:1", "^b", "^b", "^a", "^a"}),
+       NDef("foo_6", "Identity", {"d", "^d"}),
+       NDef("foo_7", "NotImportant",
+            {"a:3", "b:2", "d", "^d", "^d", "^a", "^b", "^a", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_EQ(graph.graph()->node_size(), 11);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {},
+            {"foo_1", "foo_2", "foo_3", "foo_4", "foo_5", "foo_7"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {},
+            {"foo_1:1", "^foo_2", "foo_3:1", "foo_4:1", "foo_5:1", "foo_7:1"});
+  CheckNode(graph, "c", "Switch", "", {}, {}, {"d"});
+  CheckNode(graph, "d", "Identity", "", {}, {"c:1"},
+            {"foo_6", "^foo_6", "foo_7:2", "^foo_7"});
+  CheckNode(graph, "foo_1", "IdentityN", "", {}, {"a", "b:1"}, {});
+  CheckNode(graph, "foo_2", "IdentityN", "", {}, {"a", "^b"}, {});
+  CheckNode(graph, "foo_3", "IdentityN", "", {}, {"a", "b:1"}, {});
+  CheckNode(graph, "foo_4", "IdentityN", "", {}, {"a:2", "b:1"}, {});
+  CheckNode(graph, "foo_5", "NotImportant", "", {}, {"a:2", "b:1"}, {});
+  CheckNode(graph, "foo_6", "Identity", "", {}, {"d", "^d"}, {});
+  CheckNode(graph, "foo_7", "NotImportant", "", {}, {"a:3", "b:2", "d", "^d"},
+            {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"^a"}),
+       NDef("c", "NotImportant", {"a:1"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFanin("b", {"a", 2}));
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"a", Graph::kControlSlot}));
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:1"}, {});
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b:0", "c:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnAddFanin) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "", {}, {}), NDef("d", "", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"b", 2}));
+  CheckNode(graph, "c", "", "", {}, {"b:2"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"b", 2}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "b:2", "^b"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"^b"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"^b"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"},
+            {"c:0", "c:1", "^c", "^d"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFaninByPort) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def =
+      test::function::GDef({NDef("a", "NotImportant", {}, {}),
+                            NDef("b", "NotImportant", {"c", "^a"}),
+                            NDef("c", "NotImportant", {"a:1"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("b", 0, {"a", 2}));
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2", "c"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"a", Graph::kControlSlot}));
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:1"}, {"b:1"});
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b:0", "c:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnAddFaninByPort) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "", {}, {}), NDef("d", "", {"c:2"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("d", 1, {"b", 2}));
+  CheckNode(graph, "d", "", "", {}, {"c:2", "b:2"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"c:2", "b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("d", 0, {"b", 2}));
+  CheckNode(graph, "d", "", "", {}, {"b:2", "c:2", "b:2", "^b"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {"d:0", "d:2", "^d"});
+  CheckNode(graph, "c", "", "", {}, {}, {"d:1"});
+
+  CheckGraph(graph);
 }
 
-GraphDef SimpleDuplicateControllingFaninsGraph() {
+TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFanin) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
-       NDef("foo_1", "NotImportant", {"a", "b:1", "^b"}),
-       NDef("foo_2", "NotImportant", {"a", "^b", "^b"}),
-       NDef("foo_3", "NotImportant", {"a", "b:1", "^b", "^b"}),
-       NDef("foo_4", "NotImportant", {"a:2", "b:1", "^b", "^b", "^a", "^a"})},
+       NDef("c", "NotImportant", {"a:1", "^b"})},
       /*funcs=*/{});
-  return graph_def;
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanin("c", {"a", 1}, {"b", 2}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:2"}, {});
+
+  CheckGraph(graph);
 }
 
-void CheckDedupControllingFaninsForNode(MutableGraphView* graph,
-                                        absl::string_view node_name,
-                                        const NodeDef* expected_node) {
-  // Deduping again should result in no change.
-  EXPECT_FALSE(graph->DedupControllingFanins(node_name));
-  NodeDef* node = graph->GetNode(node_name);
-  ASSERT_NE(node, nullptr);
-  ASSERT_EQ(node->input_size(), expected_node->input_size());
-  CompareNodeInputs(*graph, expected_node, node);
-  for (int i = 0; i < node->input_size(); ++i) {
-    TensorId tensor_id = ParseTensorName(node->input(i));
-    if (tensor_id.index() > Graph::kControlSlot) {
-      CheckFanout(*graph, {tensor_id.node(), Graph::kControlSlot}, node_name);
-    }
-  }
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnUpdateFanin) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
+       NDef("e", "NotImportant", {"b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanin("d", {"b", Graph::kControlSlot},
+                                 {"c", Graph::kControlSlot}));
+  CheckNode(graph, "d", "NotImportant", "", {}, {"c", "^c"}, {});
+
+  TF_EXPECT_OK(graph.UpdateFanin("e", {"b", 0}, {"c", 3}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c:3", "^c"}, {});
+
+  TF_EXPECT_OK(graph.UpdateFanin("e", {"c", 3}, {"c", Graph::kControlSlot}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"^c"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0", "c:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {});
+  CheckNode(graph, "c", "Identity", "", {}, {"a:2"}, {"d:0", "^d", "^e"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFaninByPort) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {"a:1", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("c", 0, {"b", 2}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:2"}, {});
+
+  CheckGraph(graph);
 }
 
-void TestDedupControllingFaninsForNode(MutableGraphView* graph,
-                                       absl::string_view node_name,
-                                       const NodeDef* expected_node) {
-  EXPECT_TRUE(graph->DedupControllingFanins(node_name));
-  CheckDedupControllingFaninsForNode(graph, node_name, expected_node);
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnUpdateFaninByPort) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
+       NDef("e", "NotImportant", {"b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("d", 0, {"b", 1}));
+  CheckNode(graph, "d", "NotImportant", "", {}, {"b:1", "^b"}, {});
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("e", 0, {"c", 2}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c:2", "^c"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0", "c:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {"d:0", "^d"});
+  CheckNode(graph, "c", "Identity", "", {}, {"a:2"}, {"e:0", "^e"});
+
+  CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, DedupControllingFaninsForNode) {
-  GraphDef graph_def = SimpleDuplicateControllingFaninsGraph();
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnAddFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"^b"})},
+      /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
-  NodeDef expected_node;
-  // Remove redundant control dependency '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  TestDedupControllingFaninsForNode(&graph, "foo_1", &expected_node);
-  // Remove extra control dependency '^b'.
-  expected_node = NDef("", "", {"a", "^b"});
-  TestDedupControllingFaninsForNode(&graph, "foo_2", &expected_node);
-  // Remove redundant and extra control dependencies '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  TestDedupControllingFaninsForNode(&graph, "foo_3", &expected_node);
-  // Remove multiple redundant control dependencies.
-  expected_node = NDef("", "", {"a:2", "b:1"});
-  TestDedupControllingFaninsForNode(&graph, "foo_4", &expected_node);
-  // Missing node.
-  EXPECT_FALSE(graph.DedupControllingFanins("missing"));
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"a", 3}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:1"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:3", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnRemoveFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveRegularFanin("c", {"a", 2}));
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:1"}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, KeepMaxRegularOutputPortOnRemoveFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveRegularFanin("b", {"a", 1}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:2"}, {});
+
+  CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, DedupControllingFaninsForGraph) {
-  GraphDef graph_def = SimpleDuplicateControllingFaninsGraph();
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnUpdateFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
-  EXPECT_TRUE(graph.DedupControllingFanins());
-  // Deduping again should result in no change.
-  EXPECT_FALSE(graph.DedupControllingFanins());
 
-  NodeDef expected_node;
-  // Remove redundant control dependency '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_1", &expected_node);
-  // Remove extra control dependency '^b'.
-  expected_node = NDef("", "", {"a", "^b"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_2", &expected_node);
-  // Remove redundant and extra control dependencies '^b'.
-  expected_node = NDef("", "", {"a", "b:1"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_3", &expected_node);
-  // Remove multiple redundant control dependencies.
-  expected_node = NDef("", "", {"a:2", "b:1"});
-  CheckDedupControllingFaninsForNode(&graph, "foo_4", &expected_node);
+  TF_EXPECT_OK(graph.UpdateFanin("c", {"a", 2}, {"b", 3}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:1"}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:3"}, {});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninMissing) {
@@ -552,19 +1882,33 @@ TEST(MutableGraphViewTest, AddControllingFaninMissing) {
 
   MutableGraphView graph(&graph_def);
   // Missing fanin.
-  EXPECT_FALSE(graph.AddControllingFanin("a", {"c", Graph::kControlSlot}));
+  Status s = graph.AddControllingFanin("a", {"c", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^c') error: "
+      "node 'c' was not found.";
+  EXPECT_EQ(s.error_message(), expected_msg);
   // Missing node.
-  EXPECT_FALSE(graph.AddControllingFanin("d", {"a", Graph::kControlSlot}));
+  s = graph.AddControllingFanin("d", {"a", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='d', fanin='^a') error: "
+      "node 'd' was not found.";
+  EXPECT_EQ(s.error_message(), expected_msg);
   // Missing node and fanin.
-  EXPECT_FALSE(graph.AddControllingFanin("c", {"d", Graph::kControlSlot}));
+  s = graph.AddControllingFanin("c", {"d", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='c', fanin='^d') error: "
+      "node 'c' was not found.";
+  EXPECT_EQ(s.error_message(), expected_msg);
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
-  NodeDef* a = graph.GetNode("a");
-  ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 0);
-  NodeDef* b = graph.GetNode("b");
-  ASSERT_NE(b, nullptr);
-  ASSERT_EQ(b->input_size(), 0);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninExistingControl) {
@@ -574,17 +1918,15 @@ TEST(MutableGraphViewTest, AddControllingFaninExistingControl) {
       /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
-  EXPECT_TRUE(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
-  EXPECT_FALSE(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
-  NodeDef* a = graph.GetNode("a");
-  ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^b");
-  NodeDef* b = graph.GetNode("b");
-  ASSERT_NE(b, nullptr);
-  ASSERT_EQ(b->input_size(), 0);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {"^b"}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^a"});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninNotSwitch) {
@@ -594,17 +1936,37 @@ TEST(MutableGraphViewTest, AddControllingFaninNotSwitch) {
       /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
-  EXPECT_TRUE(graph.AddControllingFanin("a", {"b", 2}));
-  EXPECT_FALSE(graph.AddControllingFanin("a", {"b", 2}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", 2}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", 2}));
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
-  NodeDef* a = graph.GetNode("a");
-  ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^b");
-  NodeDef* b = graph.GetNode("b");
-  ASSERT_NE(b, nullptr);
-  ASSERT_EQ(b->input_size(), 0);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {"^b"}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSwitch) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.AddControllingFanin("a", {"b", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^b') error: "
+      "can't add fanin '^b' as it will become a Switch control dependency.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  ASSERT_EQ(graph.graph()->node_size(), 2);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninSwitchWithIdentity) {
@@ -615,14 +1977,16 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitchWithIdentity) {
 
   MutableGraphView graph(&graph_def);
 
-  EXPECT_TRUE(graph.AddControllingFanin("a", {"switch", 0}));
-  EXPECT_FALSE(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
 
   ASSERT_EQ(graph.graph()->node_size(), 3);
-  NodeDef* a = graph.GetNode("a");
-  ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^identity");
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {"^identity"}, {});
+  CheckNode(graph, "switch", "Switch", "", {}, {}, {"identity"});
+  CheckNode(graph, "identity", "Identity", "", {}, {"switch"}, {"^a"});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninSwitchWithNoExistingIdentity) {
@@ -634,40 +1998,222 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitchWithNoExistingIdentity) {
 
   MutableGraphView graph(&graph_def);
 
-  EXPECT_TRUE(graph.AddControllingFanin("a", {"switch", 0}));
-  EXPECT_FALSE(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
 
   ASSERT_EQ(graph.graph()->node_size(), 3);
-  NodeDef* a = graph.GetNode("a");
-  ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^ConstantFoldingCtrl/switch_0");
-  NodeDef* identity = graph.GetNode("ConstantFoldingCtrl/switch_0");
-  ASSERT_NE(identity, nullptr);
-  ASSERT_EQ(identity->input_size(), 1);
-  EXPECT_EQ(identity->input(0), "switch");
-  EXPECT_EQ(identity->op(), "Identity");
-  EXPECT_EQ(identity->device(), kDevice);
-  ASSERT_TRUE(identity->attr().count("T"));
-  EXPECT_EQ(identity->attr().at("T").type(), DT_FLOAT);
+
+  CheckNode(graph, "a", "NotImportant", "", {},
+            {"^ConstantFoldingCtrl/switch_0"}, {});
+  CheckNode(graph, "switch", "Switch", kDevice, {{"T", DT_FLOAT}}, {},
+            {"ConstantFoldingCtrl/switch_0"});
+  CheckNode(graph, "ConstantFoldingCtrl/switch_0", "Identity", kDevice,
+            {{"T", DT_FLOAT}}, {"switch"}, {"^a"});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninSwitchWithExistingAddedIdentity) {
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "NotImportant", {}, {}), NDef("switch", "Switch", {}, {}),
-       NDef("ConstantFoldingCtrl/switch_0", "Identity", {}, {})},
+       NDef("ConstantFoldingCtrl/switch_0", "Identity", {"switch"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {},
+            {"^ConstantFoldingCtrl/switch_0"}, {});
+  CheckNode(graph, "switch", "Switch", "", {}, {},
+            {"ConstantFoldingCtrl/switch_0"});
+  CheckNode(graph, "ConstantFoldingCtrl/switch_0", "Identity", "", {},
+            {"switch"}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+void TestAddControllingFaninSelfLoops(absl::string_view node_name,
+                                      const TensorId& fanin,
+                                      const string& error_msg) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("b", "Switch", {}, {{"T", DT_FLOAT}}),
+       NDef("c", "Identity", {"b:0"}), NDef("d", "Identity", {"b:1"}),
+       NDef("e", "NotImportant", {"^a"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.AddControllingFanin(node_name, fanin);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 5);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "b", "Switch", "", {{"T", DT_FLOAT}}, {}, {"c", "d"});
+  CheckNode(graph, "c", "Identity", "", {}, {"b"}, {});
+  CheckNode(graph, "d", "Identity", "", {}, {"b:1"}, {});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"^a"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSelfLoops) {
+  string error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^a') error: "
+      "can't add fanin '^a' to self.";
+  TestAddControllingFaninSelfLoops("a", {"a", Graph::kControlSlot}, error_msg);
+
+  // Adding Switch control dependency to Identity consumer. Node `c` is
+  // consuming `b:0`, so adding `b:0` as a control dependency, because it is a
+  // Switch, should trigger a lookup of outputs. As `c` is a consumer and an
+  // Identity, this will introduce a self loop, so no control dependency should
+  // be added.
+  error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='c', fanin='b:0') "
+      "error: can't add found fanin '^c' to self.";
+  TestAddControllingFaninSelfLoops("c", {"b", 0}, error_msg);
+
+  // Adding Switch control dependency to Identity consumer. Node `d` is
+  // consuming `b:1`, so adding `b:1` as a control dependency, because it is a
+  // Switch, should trigger a lookup of outputs. As `d` is a consumer and an
+  // Identity, this will introduce a self loop, so no control dependency should
+  // be added.
+  error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='d', fanin='b:1') "
+      "error: can't add found fanin '^d' to self.";
+  TestAddControllingFaninSelfLoops("d", {"b", 1}, error_msg);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSelfLoopsGeneratedIdentity) {
+  GraphDef graph_def =
+      test::function::GDef({NDef("a", "NotImportant", {}, {}),
+                            NDef("b", "Switch", {}, {{"T", DT_FLOAT}}),
+                            NDef("c", "NotImportant", {}),
+                            NDef("ConstantFoldingCtrl/b_1", "Identity", {})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  // Adding Switch control dependency to Identity node of the same name as a
+  // generated Identity node for pinning the control dependency. Because there
+  // are no consumers of `b:1`, there will be an attempt to generate an Identity
+  // node, with name `ConstantFoldingCtrl/b_1`. As the input node is of the same
+  // name, we will introduce a self loop, so no control dependency should be
+  // added.
+  Status s = graph.AddControllingFanin("ConstantFoldingCtrl/b_1", {"b", 1});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='ConstantFoldingCtrl/"
+      "b_1', fanin='b:1') error: can't add generated fanin "
+      "'^ConstantFoldingCtrl/b_1' to self.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "Switch", "", {{"T", DT_FLOAT}}, {}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "ConstantFoldingCtrl/b_1", "Identity", "", {}, {}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninMissing) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {}, {}),
+       NDef("d", "NotImportant", {"^a", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveControllingFanin("d", "c"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^a", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninExisting) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {}, {}),
+       NDef("d", "NotImportant", {"^a", "^b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveControllingFanin("d", "a"));
+  TF_EXPECT_OK(graph.RemoveControllingFanin("d", "a"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^c", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninOnRegularFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {"a", "b"})},
       /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
-  EXPECT_TRUE(graph.AddControllingFanin("a", {"switch", 0}));
-  EXPECT_FALSE(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.RemoveControllingFanin("c", "a"));
+  TF_EXPECT_OK(graph.RemoveControllingFanin("c", "b"));
 
   ASSERT_EQ(graph.graph()->node_size(), 3);
-  NodeDef* a = graph.GetNode("a");
-  ASSERT_NE(a, nullptr);
-  ASSERT_EQ(a->input_size(), 1);
-  EXPECT_EQ(a->input(0), "^ConstantFoldingCtrl/switch_0");
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a"}, {"c:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a", "b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninSelfLoop) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {"a", "b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.RemoveControllingFanin("c", "c");
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::RemoveControllingFanin(node_name='c', "
+      "fanin_node_name='c') error: can't remove fanin '^c' from "
+      "self.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a"}, {"c:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a", "b"}, {});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, DeleteNodes) {
@@ -682,24 +2228,159 @@ TEST(MutableGraphViewTest, DeleteNodes) {
   MutableGraphView graph(&graph_def);
 
   EXPECT_NE(graph.GetNode("foo_1"), nullptr);
-  graph.DeleteNodes({"foo_1"});
+  TF_EXPECT_OK(graph.DeleteNodes({"foo_1"}));
 
+  EXPECT_EQ(graph.graph()->node_size(), 3);
   EXPECT_EQ(graph.GetNode("foo_1"), nullptr);
 
-  NodeDef* bar = graph.GetNode("bar");
-  NodeDef* other = graph.GetNode("other");
-  NodeDef* foo_2 = graph.GetNode("foo_2");
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"foo_2:1"});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_2"});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"other:1", "bar:2"}, {});
+
+  CheckGraph(graph);
+}
+
+GraphDef SimpleDeleteNodeGraph() {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"a:5", "^b"}), NDef("d", "NotImportant", {}),
+       NDef("e", "NotImportant", {"d:2"}),
+       NDef("f", "NotImportant", {"d:3", "^e"})},
+      /*funcs=*/{});
+  return graph_def;
+}
+
+TEST(MutableGraphViewTest, DeleteNodesWithFanoutsBeingDeleted) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+  EXPECT_NE(graph.GetNode("a"), nullptr);
+  EXPECT_NE(graph.GetNode("b"), nullptr);
+  EXPECT_NE(graph.GetNode("c"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"c", "a", "b"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+  EXPECT_EQ(graph.GetNode("a"), nullptr);
+  EXPECT_EQ(graph.GetNode("b"), nullptr);
+  EXPECT_EQ(graph.GetNode("c"), nullptr);
+
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"e", "f"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteMissingNodes) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
 
-  bool include_control_fanouts = true;
-  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
-  auto other_fanouts = graph.GetFanouts(*other, include_control_fanouts);
+  EXPECT_EQ(graph.GetNode("g"), nullptr);
+  EXPECT_EQ(graph.GetNode("h"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"g", "h"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 6);
+  EXPECT_EQ(graph.GetNode("g"), nullptr);
+  EXPECT_EQ(graph.GetNode("h"), nullptr);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:5", "^b"}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"e", "f"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {});
+
+  CheckGraph(graph);
+}
 
-  EXPECT_EQ(bar_fanouts.size(), 2);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
+TEST(MutableGraphViewTest, DeleteMissingNodesAndNodesWithFanoutsBeingDeleted) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_NE(graph.GetNode("d"), nullptr);
+  EXPECT_NE(graph.GetNode("e"), nullptr);
+  EXPECT_NE(graph.GetNode("f"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"d", "e", "f", "g", "h"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+  EXPECT_EQ(graph.GetNode("d"), nullptr);
+  EXPECT_EQ(graph.GetNode("e"), nullptr);
+  EXPECT_EQ(graph.GetNode("f"), nullptr);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:5", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteNodesWithError) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.DeleteNodes({"b", "a"});
+  EXPECT_FALSE(s.ok());
+  string error_msg =
+      "MutableGraphView::DeleteNodes(nodes_to_delete={a, b}) error: can't "
+      "delete node(s) with retained fanouts(s) [a, b].";
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 6);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:5", "^b"}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"e", "f"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteNodesWithLargeError) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"^b"}), NDef("d", "NotImportant", {"c:6"}),
+       NDef("e", "NotImportant", {"d:2"}),
+       NDef("f", "NotImportant", {"d:3", "^e"}),
+       NDef("g", "NotImportant", {"f"}), NDef("h", "NotImportant", {"a"}),
+       NDef("i", "NotImportant", {"b"}), NDef("j", "NotImportant", {"c"}),
+       NDef("k", "NotImportant", {"d"}), NDef("l", "NotImportant", {"e"}),
+       NDef("m", "NotImportant", {"f"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
 
-  EXPECT_EQ(other_fanouts.size(), 1);
-  EXPECT_EQ(other_fanouts.count(MutableGraphView::InputPort(foo_2, 0)), 1);
+  Status s = graph.DeleteNodes({"a", "b", "c", "d", "e", "f"});
+  EXPECT_FALSE(s.ok());
+  string error_msg =
+      "MutableGraphView::DeleteNodes(nodes_to_delete={a, b, c, d, e, ...}) "
+      "error: can't delete node(s) with retained fanouts(s) [a, b, c, d, e, "
+      "...].";
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 13);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "h"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c", "i"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"^b"}, {"d", "j"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"c:6"}, {"e", "f", "k"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f", "l"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {"g", "m"});
+  CheckNode(graph, "g", "NotImportant", "", {}, {"f"}, {});
+  CheckNode(graph, "h", "NotImportant", "", {}, {"a"}, {});
+  CheckNode(graph, "i", "NotImportant", "", {}, {"b"}, {});
+  CheckNode(graph, "j", "NotImportant", "", {}, {"c"}, {});
+  CheckNode(graph, "k", "NotImportant", "", {}, {"d"}, {});
+  CheckNode(graph, "l", "NotImportant", "", {}, {"e"}, {});
+  CheckNode(graph, "m", "NotImportant", "", {}, {"f"}, {});
+
+  CheckGraph(graph);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index b201c3a7172a717d0d88003cf15b411721afdd34..27fba4fd11a671d5aecf53d4821e1935be25fb73 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -355,6 +355,10 @@ bool IsRandomShuffle(const NodeDef& node) {
 
 bool IsRank(const NodeDef& node) { return node.op() == "Rank"; }
 
+bool IsReadVariableOp(const NodeDef& node) {
+  return node.op() == "ReadVariableOp";
+}
+
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
 
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
@@ -561,6 +565,29 @@ bool MaybeHasRefInput(const NodeDef& node) {
   return false;
 }
 
+bool IsDataset(const NodeDef& node) {
+  const string& op = node.op();
+  // See `GetNodeClassForOp` in core/graph/graph.cc.
+  return op == "IteratorGetNext" || op == "IteratorGetNextSync" ||
+         op == "DatasetToSingleElement" || op == "ReduceDataset";
+}
+
+bool IsStateful(const NodeDef node, const OpRegistryInterface* op_registry) {
+  const OpDef* op_def = nullptr;
+  const string& op_name = node.op();
+  Status status = op_registry->LookUpOpDef(op_name, &op_def);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to lookup OpDef for " << op_name
+                 << ". Error: " << status.error_message();
+    return false;
+  }
+  return op_def->is_stateful();
+}
+
+bool IsStateful(const NodeDef node) {
+  return IsStateful(node, OpRegistry::Global());
+}
+
 bool IsFreeOfSideEffect(const NodeDef& node,
                         const OpRegistryInterface* op_registry) {
   // Placeholders must be preserved to keep the graph feedable.
@@ -706,7 +733,6 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Asin",
           "Asinh",
           "Atan",
-          "Atan2",
           "Atanh",
           "Ceil",
           "ComplexAbs",
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index cb7781ec6ef4c131325b7103952754335653d674..a1ee2533692284ae7c05007799adc34b783f39b3 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -115,6 +115,7 @@ bool IsPow(const NodeDef& node);
 bool IsQueue(const NodeDef& node);
 bool IsRandomShuffle(const NodeDef& node);
 bool IsRank(const NodeDef& node);
+bool IsReadVariableOp(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
 bool IsRelu(const NodeDef& node);
@@ -183,6 +184,14 @@ bool IsCommutative(const NodeDef& node);
 // value.
 bool IsPersistent(const NodeDef& node);
 
+// Returns true if the node belongs to the NC_DATASET class (see graph/graph.h).
+bool IsDataset(const NodeDef& node);
+
+// Returns true if the node op is marked as stateful, or if it was not found in
+// op_registry.
+bool IsStateful(const NodeDef node, const OpRegistryInterface* op_registry);
+bool IsStateful(const NodeDef node);  // use OpRegistry::Global()
+
 bool IsFreeOfSideEffect(const NodeDef& node,
                         const OpRegistryInterface* op_registry);
 bool IsFreeOfSideEffect(const NodeDef& node);  // use OpRegistry::Global()
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 79578cb3ce0733bcfce1a382414c20881879e3e3..4af8fc987416c5c82e7d1bb421d713b795d09341 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -145,12 +145,16 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -252,12 +256,16 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/utils:traversal",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -409,6 +417,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
@@ -524,10 +533,12 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "@com_google_absl//absl/container:flat_hash_set",
+        "//tensorflow/core/grappler/verifiers:graph_verifier",
+        "//tensorflow/core/grappler/verifiers:structure_verifier",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -540,6 +551,7 @@ tf_cuda_cc_test(
         ":custom_graph_optimizer_registry",
         ":meta_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
@@ -606,16 +618,18 @@ cc_library(
         ":constant_folding",
         ":evaluation_utils",
         ":graph_optimizer",
-        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -674,7 +688,7 @@ tf_cc_test(
     ],
 )
 
-cc_library(
+tf_kernel_library(
     name = "remapper",
     srcs = ["remapper.cc"],
     hdrs = [
@@ -691,6 +705,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
@@ -702,6 +717,7 @@ tf_cuda_cc_test(
     deps = [
         ":remapper",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -758,7 +774,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index e28f991e2dfa50c559c42f06e06d475f8017b323..2168dbd623e0ddbfe8c6b078776ca893fb34eaa5 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -38,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -3316,25 +3320,6 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   return true;
 }
 
-namespace {
-
-bool FeedsInPlaceOp(const SimpleGraphView& graph_view, const NodeDef& node) {
-  const std::unordered_set<string> op_types_to_traverse = {
-      node.op(),    "Identity", "IdentityN", "Reshape",
-      "ExpandDims", "Enter",    "Switch",    "Merge"};
-  int node_idx = graph_view.index(node.name());
-  std::set<int> node_fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, node_idx, &node_fanout);
-  for (int fanout : node_fanout) {
-    if (ModifiesInputsInPlace(graph_view.graph()->node(fanout))) {
-      return true;
-    }
-  }
-  return false;
-}
-
-}  // namespace
-
 bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
@@ -3353,20 +3338,37 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
 }
 
 void ArithmeticOptimizer::DedupComputations() {
-  bool stop = true;
-  SimpleGraphView graph_view;
-  if (!graph_view.Initialize(*optimized_graph_).ok()) {
-    LOG(WARNING) << "Failed to build SimpleGraphView.";
+  GraphTopologyView graph_view;
+  if (!graph_view.InitializeFromGraph(*optimized_graph_).ok()) {
+    LOG(WARNING) << "Failed to initialize GraphTopologyView.";
     return;
   }
-  std::set<int> duplicates;
+
+  const absl::flat_hash_set<string> ops_to_traverse = {
+      "Identity", "IdentityN", "Reshape", "ExpandDims",
+      "Enter",    "Switch",    "Merge"};
+
   // Populate feed_inplace_op;
-  std::unordered_set<NodeDef*> feeds_inplace_op;
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    if (FeedsInPlaceOp(graph_view, optimized_graph_->node(i))) {
-      feeds_inplace_op.insert(optimized_graph_->mutable_node(i));
+  absl::flat_hash_set<const NodeDef*> feeds_inplace_op;
+
+  for (const NodeDef& root : optimized_graph_->node()) {
+    if (feeds_inplace_op.find(&root) != feeds_inplace_op.end()) continue;
+
+    if (ModifiesInputsInPlace(root)) {
+      const auto is_continue_traversal = [&](const NodeDef* node) -> bool {
+        return node->op() == root.op() || ops_to_traverse.count(node->op()) > 0;
+      };
+
+      DfsTraversal(graph_view, {&root}, TraversalDirection::kFollowInputs,
+                   DfsPredicates::Advance(is_continue_traversal),
+                   DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                     feeds_inplace_op.insert(node);
+                   }));
     }
   }
+
+  bool stop = true;
+  std::set<int> duplicates;
   do {
     stop = true;
     UniqueNodes nodes;
@@ -3574,7 +3576,7 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
 
   // Disable restricted graph rewrites.
   options_.unary_ops_composition &=
-      item.allowed_optimizations().non_differentiable_rewrites;
+      item.optimization_options().allow_non_differentiable_rewrites;
 
   if (options_.dedup_computations) {
     DedupComputations();
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 3882e3b3a9a0fa5788a298f0900ca545b792f56e..b0c3c5b5181be4b744128fb18ac288c122c59f2a 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -901,8 +901,8 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node,
 
 // static
 Status ConstantFolding::CreateNodeDef(const string& name,
-                                      const TensorValue& tensor,
-                                      NodeDef* node) {
+                                      const TensorValue& tensor, NodeDef* node,
+                                      size_t original_size) {
   node->set_name(name);
   node->set_op("Const");
 
@@ -980,11 +980,12 @@ Status ConstantFolding::CreateNodeDef(const string& name,
   }
   node->mutable_attr()->insert({"value", attr_tensor});
 
-  if (encoded_size < 10 * 1024 * 1024) {
-    return Status::OK();
+  if (encoded_size > original_size && encoded_size >= 10 * 1024 * 1024) {
+    return errors::InvalidArgument(
+        strings::StrCat("Can't fold ", name, ", its size would be too large (",
+                        encoded_size, " >= ", 10 * 1024 * 1024, " bytes)"));
   }
-  return errors::InvalidArgument(
-      strings::StrCat("Can't fold ", name, ", its size would be too large"));
+  return Status::OK();
 }
 
 Status ConstantFolding::EvaluateNode(const NodeDef& node,
@@ -1010,6 +1011,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     }
   });
 
+  size_t total_inputs_size = 0;
   for (const auto& input : node.input()) {
     const TensorId input_tensor = ParseTensorName(input);
     if (input_tensor.index() < 0) {
@@ -1027,6 +1029,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     Tensor* value = new Tensor(raw_val.dtype(), raw_val.tensor_shape());
     CHECK(value->FromProto(raw_val));
     inputs.emplace_back(value);
+    total_inputs_size += value->TotalBytes();
   }
 
   TF_RETURN_IF_ERROR(EvaluateNode(node, inputs, &output_tensors));
@@ -1041,7 +1044,8 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
       node_name = strings::StrCat(node_name, "-", i);
     }
     if (output_tensors[i].tensor) {
-      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i));
+      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i),
+                               total_inputs_size);
       if (!s.ok()) {
         *result_too_large = true;
         return s;
@@ -1697,12 +1701,12 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
     return Status::OK();
   }
 
-  if (ConstantPushDown(node)) {
+  if (ConstantPushDown(optimized_graph, node)) {
     graph_modified_ = true;
     return Status::OK();
   }
 
-  if (MulConvPushDown(node, *properties)) {
+  if (MulConvPushDown(optimized_graph, node, *properties)) {
     graph_modified_ = true;
     return Status::OK();
   }
@@ -2612,7 +2616,8 @@ bool ConstantFolding::ReduceDivToReciprocalMul(GraphDef* optimized_graph,
   return false;
 }
 
-bool ConstantFolding::ConstantPushDown(NodeDef* node) {
+bool ConstantFolding::ConstantPushDown(GraphDef* optimized_graph,
+                                       NodeDef* node) {
   // Consider the transformation
   //
   //                      +                +       = parent
@@ -2680,9 +2685,10 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
       // edge. We can replace such a control edge with a control edge from A
       // to C.
       CHECK(MaybeRemoveControlInput(op_child_node->name(), const_child_node,
-                                    graph_, node_map_.get()));
-      NodeDef* other_leaf = left_leaf_is_constant ? left_leaf : right_leaf;
-      MaybeAddControlInput(other_leaf->name(), const_child_node, graph_,
+                                    optimized_graph, node_map_.get()));
+      string other_leaf_input = left_leaf_is_constant ? op_child_node->input(0)
+                                                      : op_child_node->input(1);
+      MaybeAddControlInput(other_leaf_input, const_child_node, optimized_graph,
                            node_map_.get());
     }
 
@@ -2699,7 +2705,7 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
   return false;
 }
 
-bool ConstantFolding::MulConvPushDown(NodeDef* node,
+bool ConstantFolding::MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
                                       const GraphProperties& properties) {
   // Push down multiplication on ConvND.
   //                       *                  ConvND
@@ -2791,12 +2797,13 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
     }
     // Make sure we don't introduce loops in the graph by removing control
     // dependencies from the conv2d node to c2.
-    NodeDef* conv_const_node =
-        conv_left_is_constant ? conv_left_child : conv_right_child;
-    if (MaybeRemoveControlInput(conv_node->name(), const_node, graph_,
+    string conv_const_input =
+        conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
+    if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
                                 node_map_.get())) {
       // Add a control dep from c1 to c2 to ensure c2 is in the right frame
-      *const_node->add_input() = AsControlDependency(*conv_const_node);
+      MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
+                           node_map_.get());
     }
 
     conv_node->set_name(node->name());
@@ -2808,6 +2815,8 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
       node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
       conv_node->set_input(1, mul_new_name);
     }
+    NodeDef* conv_const_node =
+        conv_left_is_constant ? conv_left_child : conv_right_child;
     if (left_child_is_constant) {
       node->set_input(1, conv_const_node->name());
     } else {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 0b778882d7d4d89d83de5d6bd5a6f9c827cf5bf8..99200925cb351478bd188361c33b88634caffa26 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -35,8 +35,10 @@ const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl";
 // Constant folding optimization for a graph.
 class ConstantFolding : public GraphOptimizer {
  public:
+  // The size limit will only be considered if the newly created node is greater
+  // than original_size (optional).
   static Status CreateNodeDef(const string& name, const TensorValue& tensor,
-                              NodeDef* node);
+                              NodeDef* node, size_t original_size = 0);
   static string AddControlDependency(const string& input_name, GraphDef* graph,
                                      NodeMap* node_map);
 
@@ -124,11 +126,12 @@ class ConstantFolding : public GraphOptimizer {
 
   // Pushes down constants on '+' and '*' operators if applicable. Returns true
   // the transformation applied successfully.
-  bool ConstantPushDown(NodeDef* node);
+  bool ConstantPushDown(GraphDef* optimized_graph, NodeDef* node);
 
   // Aggregate constants present around a conv operator. Returns true if the
   // transformation was applied successfully.
-  bool MulConvPushDown(NodeDef* node, const GraphProperties& properties);
+  bool MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
+                       const GraphProperties& properties);
 
   // Strength reduces floating point division by a constant Div(x, const) to
   // multiplication by the reciprocal Mul(x, Reciprocal(const)).
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 192f48272f9ed08b2b6424f3c8e33d1afafdb56d..d7cabf5a8b8ad6659937e868df7635292936d48c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -1601,7 +1601,7 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   AddNode("split_dim", "Const", {}, {}, &want);
   AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
           &want);
-  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("s2", "Split", {"split_dim", "in2"}, {}, &want);
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 7593023ff4d649c623db9be98ac52ef6b799219f..682e7cd6c4820147d2adfb74d98a3604a44ec948 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -3,16 +3,42 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 
+package(default_visibility = [
+    "//tensorflow/core/grappler/optimizers/data:__subpackages__",
+    "//tensorflow/core/kernels/data:__pkg__",
+])
+
+cc_library(
+    name = "data",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":filter_fusion",
+        ":hoist_random_uniform",
+        ":latency_all_edges",
+        ":make_numa_aware",
+        ":make_sloppy",
+        ":map_and_batch_fusion",
+        ":map_and_filter_fusion",
+        ":map_fusion",
+        ":map_parallelization",
+        ":map_vectorization",
+        ":meta_optimizer",
+        ":noop_elimination",
+        ":shuffle_and_repeat_fusion",
+    ],
+)
+
 cc_library(
     name = "filter_fusion",
     srcs = ["filter_fusion.cc"],
     hdrs = [
         "filter_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":fusion_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
@@ -20,16 +46,15 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "filter_fusion_test",
     srcs = ["filter_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":filter_fusion",
         ":graph_test_utils",
@@ -48,7 +73,6 @@ cc_library(
     hdrs = [
         "fusion_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":function_utils",
@@ -60,6 +84,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:functional_ops",
         "//tensorflow/core/kernels:control_flow_ops",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
@@ -68,7 +93,6 @@ cc_library(
 tf_cc_test(
     name = "fusion_utils_test",
     srcs = ["fusion_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":fusion_utils",
@@ -87,7 +111,6 @@ cc_library(
     hdrs = [
         "function_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         "//tensorflow/core:framework",
@@ -101,19 +124,19 @@ cc_library(
 tf_cc_test(
     name = "function_utils_test",
     srcs = ["function_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
+        ":graph_utils",
         "//tensorflow/core:framework",
+        "//tensorflow/core:ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/tools/graph_transforms:transform_utils",
-    ],
+    ] + tf_protos_all(),
 )
 
 cc_library(
@@ -122,7 +145,6 @@ cc_library(
     hdrs = [
         "graph_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -136,7 +158,6 @@ cc_library(
 tf_cc_test(
     name = "graph_utils_test",
     srcs = ["graph_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         "//tensorflow/core:core_cpu",
@@ -145,7 +166,6 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ],
 )
@@ -157,7 +177,6 @@ cc_library(
     hdrs = [
         "graph_test_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -174,26 +193,26 @@ cc_library(
     hdrs = [
         "hoist_random_uniform.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "hoist_random_uniform_test",
     srcs = ["hoist_random_uniform_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -212,42 +231,54 @@ cc_library(
     hdrs = [
         "latency_all_edges.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "latency_all_edges_test",
+    srcs = ["latency_all_edges_test.cc"],
+    deps = [
+        ":graph_utils",
+        ":latency_all_edges",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
 )
 
 cc_library(
     name = "make_numa_aware",
     srcs = ["make_numa_aware.cc"],
     hdrs = ["make_numa_aware.h"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "make_numa_aware_test",
     srcs = ["make_numa_aware_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -264,21 +295,20 @@ cc_library(
     name = "make_sloppy",
     srcs = ["make_sloppy.cc"],
     hdrs = ["make_sloppy.h"],
-    visibility = ["//visibility:public"],
     deps = [
+        ":optimizer_base",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "make_sloppy_test",
     srcs = ["make_sloppy_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -297,24 +327,24 @@ cc_library(
     hdrs = [
         "map_and_batch_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_and_batch_fusion_test",
     srcs = ["map_and_batch_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":map_and_batch_fusion",
@@ -331,27 +361,27 @@ cc_library(
     hdrs = [
         "map_and_filter_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":fusion_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_and_filter_fusion_test",
     srcs = ["map_and_filter_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -370,10 +400,11 @@ cc_library(
     hdrs = [
         "map_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":fusion_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
@@ -381,20 +412,20 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_fusion_test",
     srcs = ["map_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
         ":map_fusion",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -410,25 +441,24 @@ cc_library(
     hdrs = [
         "map_parallelization.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
+        ":function_utils",
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_parallelization_test",
     srcs = ["map_parallelization_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -447,62 +477,93 @@ cc_library(
     hdrs = [
         "map_vectorization.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":graph_utils",
+        ":optimizer_base",
         ":vectorization_utils",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_vectorization_test",
     srcs = ["map_vectorization_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":map_vectorization",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/kernels:math",
+        "//tensorflow/core/kernels/data",
     ],
 )
 
+cc_library(
+    name = "meta_optimizer",
+    srcs = ["meta_optimizer.cc"],
+    hdrs = ["meta_optimizer.h"],
+    deps = [
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:arithmetic_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler/optimizers:dependency_optimizer",
+        "//tensorflow/core/grappler/optimizers:function_optimizer",
+        "//tensorflow/core/grappler/optimizers:model_pruner",
+        "//tensorflow/core/grappler/optimizers:shape_optimizer",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ptr_util",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
 cc_library(
     name = "noop_elimination",
     srcs = ["noop_elimination.cc"],
     hdrs = [
         "noop_elimination.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "noop_elimination_test",
     srcs = ["noop_elimination_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":noop_elimination",
@@ -513,30 +574,42 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "optimizer_base",
+    srcs = ["optimizer_base.cc"],
+    hdrs = [
+        "optimizer_base.h",
+    ],
+    deps = [
+        "//tensorflow/core:metrics",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+    ],
+)
+
 cc_library(
     name = "shuffle_and_repeat_fusion",
     srcs = ["shuffle_and_repeat_fusion.cc"],
     hdrs = [
         "shuffle_and_repeat_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "shuffle_and_repeat_fusion_test",
     srcs = ["shuffle_and_repeat_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":shuffle_and_repeat_fusion",
@@ -547,47 +620,12 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "data",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":filter_fusion",
-        ":hoist_random_uniform",
-        ":latency_all_edges",
-        ":make_numa_aware",
-        ":make_sloppy",
-        ":map_and_batch_fusion",
-        ":map_and_filter_fusion",
-        ":map_fusion",
-        ":map_parallelization",
-        ":map_vectorization",
-        ":noop_elimination",
-        ":shuffle_and_repeat_fusion",
-    ],
-    alwayslink = 1,
-)
-
-tf_cc_test(
-    name = "latency_all_edges_test",
-    srcs = ["latency_all_edges_test.cc"],
-    deps = [
-        ":graph_utils",
-        ":latency_all_edges",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "//tensorflow/core/grappler:grappler_item",
-    ],
-)
-
 cc_library(
     name = "vectorization_utils",
     srcs = ["vectorization_utils.cc"],
     hdrs = [
         "vectorization_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":graph_utils",
@@ -608,7 +646,6 @@ cc_library(
 tf_cc_test(
     name = "vectorization_utils_test",
     srcs = ["vectorization_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":function_utils",
@@ -620,16 +657,27 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         # For ops we need registered
         "//tensorflow/core/kernels/data:dataset_ops",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:logging_ops",
+        "//tensorflow/core:logging_ops_op_lib",
         "//tensorflow/core/kernels:math",
+        "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core/kernels:nn",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core/kernels:parsing",
+        "//tensorflow/core:parsing_ops_op_lib",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
 )
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
index 89b568ecf161cda08f1b71b369c3edb1d43f2a7f..7a20b8042bf27b4151e7063dad1e2b188ca2d3a4 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/filter_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -57,14 +58,16 @@ NodeDef MakeFusedFilterNode(const NodeDef& first_filter_node,
 
 }  // namespace
 
-Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                              GraphDef* output) {
+Status FilterFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                             const GrapplerItem& item,
+                                             GraphDef* output,
+                                             OptimizationStats* stats) {
   GraphDef sorted_old_graph = item.graph;
   TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
   *output = sorted_old_graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              output->library());
 
@@ -109,7 +112,8 @@ Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_filter_node = graph.AddNode(MakeFusedFilterNode(
         *first_filter_node, *second_filter_node, *fused_predicate, &graph));
 
-    graph.UpdateFanouts(second_filter_node->name(), fused_filter_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(second_filter_node->name(),
+                                           fused_filter_node->name()));
 
     // TODO(prazek): we should run some optimizations on the fused filter
     // functions, or make sure that optimization passes run after filter
@@ -119,9 +123,10 @@ Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     // they are not used anymore.
     nodes_to_delete.insert(first_filter_node->name());
     nodes_to_delete.insert(second_filter_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -132,5 +137,5 @@ void FilterFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(FilterFusion, "filter_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.h b/tensorflow/core/grappler/optimizers/data/filter_fusion.h
index 91a0364a46121aefbd7140ef5fc0a72291c5bf82..ac0326c0ec24bea74d0473ef8ca2fb95cb97e4c8 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This optimization fuses filter transformations.
-class FilterFusion : public CustomGraphOptimizer {
+class FilterFusion : public TFDataOptimizerBase {
  public:
   FilterFusion() = default;
   ~FilterFusion() override = default;
@@ -34,14 +34,15 @@ class FilterFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.cc b/tensorflow/core/grappler/optimizers/data/function_utils.cc
index 311df15bc2728a57a66e58cbe3217d3cf03e44dd..20536910db12607bcef9155d739251648696a0c7 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.cc
@@ -171,6 +171,57 @@ void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function,
   node->set_name(std::move(name));
 }
 
-}  // end namespace function_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def, bool skip_assert) {
+  if (!function_def.signature().is_stateful()) return false;
+
+  for (const NodeDef& node_def : function_def.node_def()) {
+    if (IsNodeStateful(library, node_def, skip_assert)) return true;
+  }
+  return false;
+}
+
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert) {
+  const OpDef* op_def;
+  Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+
+  if (!s.ok()) return true;
+
+  if (!op_def->is_stateful()) return false;
+
+  if (skip_assert && op_def->name() == "Assert") {
+    return false;
+  }
+
+  if (op_def->name() == "If") {
+    const FunctionDef* then_func =
+        library.Find(node.attr().at("then_branch").func().name());
+    const FunctionDef* else_func =
+        library.Find(node.attr().at("else_branch").func().name());
+    if ((then_func != nullptr &&
+         !IsFunctionStateful(library, *then_func, skip_assert)) &&
+        (else_func != nullptr &&
+         !IsFunctionStateful(library, *else_func, skip_assert))) {
+      return false;
+    }
+  }
+
+  if (op_def->name() == "While") {
+    const FunctionDef* cond_func =
+        library.Find(node.attr().at("cond").func().name());
+    const FunctionDef* body_func =
+        library.Find(node.attr().at("body").func().name());
+    if ((cond_func != nullptr &&
+         !IsFunctionStateful(library, *cond_func, skip_assert)) &&
+        (body_func != nullptr &&
+         !IsFunctionStateful(library, *body_func, skip_assert))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace function_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.h b/tensorflow/core/grappler/optimizers/data/function_utils.h
index d4ce824652beaca77198a87a6fcb5c342a35b4b1..79271e8ad0c330318ed4538c46158967758e5747 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.h
@@ -101,6 +101,22 @@ int FindFunctionNodeWithOp(StringPiece op, const FunctionDef& function);
 void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function,
                                NodeDef* node);
 
+// Checks if the function is stateful by checking the function graph for
+// stateful ops. Because the "If" and "While" ops are conservatively marked as
+// stateful, the check recurses into their graph to determine whether they are
+// actually stateful. The `skip_assert` argument determines whether the "Assert"
+// op should be treated as stateful or not.
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def,
+                        bool skip_assert = false);
+
+// Checks if the node is stateful. Because the "If" or "While" ops are
+// conservatively marked as stateful, the check recurses into their graph to
+// determine whether they are actually stateful. The `skip_assert` argument
+// determines whether the "Assert" op  should be treated as stateful or not.
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert = false);
+
 }  // end namespace function_utils
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils_test.cc b/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
index 3739e20eb1444fa24ec5553b8a133d8d96c5d714..8ae0cde4cd1ba20c8259ae9ac7e7a767f7b542e4 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -158,6 +160,692 @@ TEST(FunctionUtilsTest, AddNodeToFunctionDef) {
   }
 }
 
+// Graph containing function with "If" and "Assert" Op.
+/*
+  @eager_function.defun
+  def test_function():
+    pred = constant_op.constant(True)
+
+    def fn1():
+      return control_flow_ops.no_op()
+
+    def fn2():
+      return control_flow_ops.Assert(False, ["Wrong branch!!!"])
+
+    return control_flow_ops.cond(pred, fn1, fn2)
+
+  r = test_function()
+*/
+// Following proto is generated in python using the above code block, to
+// regenerate get the graph_def from the default graph/specified graph for the
+// code block (e.g ops.get_default_graph.as_graph_def()).
+constexpr char kCondGraphProto[] = R"proto(
+  node {
+    name: "StatefulPartitionedCall"
+    op: "StatefulPartitionedCall"
+    attr {
+      key: "Tin"
+      value { list {} }
+    }
+    attr {
+      key: "Tout"
+      value { list { type: DT_BOOL } }
+    }
+    attr {
+      key: "_gradient_op_type"
+      value { s: "PartitionedCall-20" }
+    }
+    attr {
+      key: "config"
+      value { s: "" }
+    }
+    attr {
+      key: "config_proto"
+      value { s: "" }
+    }
+    attr {
+      key: "executor_type"
+      value { s: "" }
+    }
+    attr {
+      key: "f"
+      value { func { name: "__inference_test_function_19" } }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "cond_true_3"
+        input_arg { name: "identity_const" type: DT_BOOL }
+        output_arg { name: "identity_1" type: DT_BOOL }
+      }
+      node_def { name: "NoOp" op: "NoOp" }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "identity_const"
+        input: "^NoOp"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "Identity:output:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+    }
+    function {
+      signature {
+        name: "cond_false_4"
+        input_arg { name: "identity_const" type: DT_BOOL }
+        output_arg { name: "identity_1" type: DT_BOOL }
+        is_stateful: true
+      }
+      node_def {
+        name: "Assert/Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_STRING }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_STRING
+              tensor_shape {}
+              string_val: "Wrong branch!!!"
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert/condition"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_BOOL
+              tensor_shape {}
+              bool_val: false
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert/data_0"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_STRING }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_STRING
+              tensor_shape {}
+              string_val: "Wrong branch!!!"
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert"
+        op: "Assert"
+        input: "Assert/Assert/condition:output:0"
+        input: "Assert/Assert/data_0:output:0"
+        attr {
+          key: "T"
+          value { list { type: DT_STRING } }
+        }
+        attr {
+          key: "summarize"
+          value { i: 3 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "identity_const"
+        input: "^Assert/Assert"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "Identity:output:0"
+        input: "^Assert/Assert"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+    }
+    function {
+      signature {
+        name: "__inference_test_function_19"
+        output_arg { name: "identity" type: DT_BOOL }
+        is_stateful: true
+      }
+      node_def {
+        name: "Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_BOOL
+              tensor_shape {}
+              bool_val: true
+            }
+          }
+        }
+      }
+      node_def {
+        name: "cond"
+        op: "If"
+        input: "Const:output:0"
+        input: "Const:output:0"
+        attr {
+          key: "Tcond"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "Tin"
+          value { list { type: DT_BOOL } }
+        }
+        attr {
+          key: "Tout"
+          value { list { type: DT_BOOL } }
+        }
+        attr {
+          key: "_lower_using_switch_merge"
+          value { b: true }
+        }
+        attr {
+          key: "else_branch"
+          value { func { name: "cond_false_4" } }
+        }
+        attr {
+          key: "output_shapes"
+          value { list { shape {} } }
+        }
+        attr {
+          key: "then_branch"
+          value { func { name: "cond_true_3" } }
+        }
+      }
+      node_def {
+        name: "cond/Identity"
+        op: "Identity"
+        input: "cond:output:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "cond/Identity:output:0"
+        input: "^cond"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 })proto";
+
+// Graph containing function with "While" Op in python.
+/*
+  @eager_function.defun
+  def test_function():
+    return control_flow_ops.while_loop(
+        lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
+
+  r = test_function()
+*/
+// Following proto is generated in python using the above code block, to
+// regenerate get the graph_def from the default graph/specified graph for the
+// code block (e.g ops.get_default_graph.as_graph_def()).
+constexpr char kWhileGraphProto[] = R"proto(
+  node {
+    name: "StatefulPartitionedCall"
+    op: "StatefulPartitionedCall"
+    attr {
+      key: "Tin"
+      value { list {} }
+    }
+    attr {
+      key: "Tout"
+      value { list { type: DT_INT32 } }
+    }
+    attr {
+      key: "_gradient_op_type"
+      value { s: "PartitionedCall-35" }
+    }
+    attr {
+      key: "config"
+      value { s: "" }
+    }
+    attr {
+      key: "config_proto"
+      value { s: "" }
+    }
+    attr {
+      key: "executor_type"
+      value { s: "" }
+    }
+    attr {
+      key: "f"
+      value { func { name: "__inference_test_function_34" } }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "while_body_5"
+        input_arg { name: "while_loop_counter" type: DT_INT32 }
+        input_arg { name: "const" type: DT_INT32 }
+        input_arg { name: "maximum_iterations" type: DT_INT32 }
+        output_arg { name: "identity" type: DT_INT32 }
+        output_arg { name: "identity_1" type: DT_INT32 }
+        output_arg { name: "identity_2" type: DT_INT32 }
+      }
+      node_def {
+        name: "add/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "add"
+        op: "Add"
+        input: "const"
+        input: "add/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "add_1/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "add_1"
+        op: "Add"
+        input: "while_loop_counter"
+        input: "add_1/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "add_1:z:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "add:z:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity_2"
+        op: "Identity"
+        input: "maximum_iterations"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+      ret { key: "identity_2" value: "Identity_2:output:0" }
+    }
+    function {
+      signature {
+        name: "__inference_test_function_34"
+        output_arg { name: "identity" type: DT_INT32 }
+        is_stateful: true
+      }
+      node_def {
+        name: "maximum_iterations"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 0
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while/loop_counter"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 0
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while"
+        op: "While"
+        input: "while/loop_counter:output:0"
+        input: "Const:output:0"
+        input: "maximum_iterations:output:0"
+        attr {
+          key: "T"
+          value { list { type: DT_INT32 type: DT_INT32 type: DT_INT32 } }
+        }
+        attr {
+          key: "_lower_using_switch_merge"
+          value { b: true }
+        }
+        attr {
+          key: "body"
+          value { func { name: "while_body_5" } }
+        }
+        attr {
+          key: "cond"
+          value { func { name: "while_cond_4" } }
+        }
+        attr {
+          key: "output_shapes"
+          value {
+            list {
+              shape {}
+              shape {}
+              shape {}
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while/Identity"
+        op: "Identity"
+        input: "while:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "while/Identity_1"
+        op: "Identity"
+        input: "while:output:1"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "while/Identity_2"
+        op: "Identity"
+        input: "while:output:2"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "while/Identity_1:output:0"
+        input: "^while"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+    function {
+      signature {
+        name: "while_cond_4"
+        input_arg { name: "while_loop_counter" type: DT_INT32 }
+        input_arg { name: "const" type: DT_INT32 }
+        input_arg { name: "less_maximum_iterations" type: DT_INT32 }
+        output_arg { name: "identity" type: DT_BOOL }
+      }
+      node_def {
+        name: "Less"
+        op: "Less"
+        input: "while_loop_counter"
+        input: "less_maximum_iterations"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Less_1/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 3
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Less_1"
+        op: "Less"
+        input: "const"
+        input: "Less_1/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "LogicalAnd"
+        op: "LogicalAnd"
+        input: "Less:z:0"
+        input: "Less_1:z:0"
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "LogicalAnd:z:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 })proto";
+
+// TODO(shivaniagrawal): split the test into multiple tests for better
+// readability and add full coverage i.e. add/separate out the tests for all
+// branches of IsNodeStateful and IsFunctionStateful:
+// - test for IsNodeStateful for Cond that has a stateful branch
+// - test for IsNodeStateful for Cond that does not have a stateful branches
+// - test for IsNodeStateful for While that has a stateful branch
+// - test for IsNodeStateful for While that does not have a stateful branches
+// - test for IsNodeStateful for Assert
+// - test for IsNodeStateful for a stateful op
+// - test for IsNodeStateful for a stateless op
+//
+// - test for IsFunctionStateful for a function that contains a Cond
+// - test for IsFunctionStateful for a function that contains a While
+// - test for IsFunctionStateful for a function that contains an Assert (and no
+//   other stateful op)
+// - test for IsFunctionStateful for a function that contains a stateful op
+//   other than Assert
+// - test for IsFunctionStateful for a function that does not contain a stateful
+//   op
+
+TEST(FunctionUtilsTest, IsFunctionStateful) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* nodeA = graph_utils::AddNode("", "A", {}, {}, &graph);
+  FunctionDef* function = graph_def.mutable_library()->add_function();
+  *function = test::function::XTimesTwo();
+
+  FunctionLibraryDefinition lib_def(OpRegistry::Global(),
+                                    *graph_def.mutable_library());
+
+  EXPECT_FALSE(IsFunctionStateful(lib_def, *function));
+
+  // Op "A" is not a registered Op.
+  EXPECT_TRUE(IsNodeStateful(lib_def, *nodeA));
+
+  // Get graph_def for the graph `kCondGraphProto`, graph with function
+  // containing "If" and "Assert" Op.
+
+  GraphDef graph_def_cond;
+  protobuf::TextFormat::ParseFromString(kCondGraphProto, &graph_def_cond);
+  FunctionLibraryDefinition cond_lib(OpRegistry::Global(),
+                                     graph_def_cond.library());
+
+  const FunctionDef* no_op_fnc = cond_lib.Find("cond_true_3");
+
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *no_op_fnc));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *no_op_fnc, true));
+
+  const FunctionDef* assert_func = cond_lib.Find("cond_false_4");
+
+  EXPECT_TRUE(IsFunctionStateful(cond_lib, *assert_func));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *assert_func, true));
+
+  EXPECT_TRUE(ContainsFunctionNodeWithOp("Const", *assert_func));
+  EXPECT_TRUE(ContainsFunctionNodeWithOp("Assert", *assert_func));
+
+  for (auto node : assert_func->node_def()) {
+    if (node.op() == "Const") {
+      EXPECT_FALSE(IsNodeStateful(lib_def, node));
+    }
+    if (node.op() == "Assert") {
+      EXPECT_TRUE(IsNodeStateful(lib_def, node));
+      EXPECT_FALSE(IsNodeStateful(lib_def, node, true));
+    }
+  }
+
+  const FunctionDef* cond_func = cond_lib.Find("__inference_test_function_19");
+
+  EXPECT_TRUE(IsFunctionStateful(cond_lib, *cond_func));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *cond_func, true));
+
+  // Get graph def for the graph `kWhileGraphProto`, graph with function
+  // containing "While" Op.
+
+  GraphDef graph_def_while;
+  protobuf::TextFormat::ParseFromString(kWhileGraphProto, &graph_def_while);
+
+  FunctionLibraryDefinition while_lib(OpRegistry::Global(),
+                                      graph_def_while.library());
+  const FunctionDef* while_function =
+      while_lib.Find("__inference_test_function_34");
+  EXPECT_FALSE(IsFunctionStateful(while_lib, *while_function));
+  EXPECT_FALSE(IsFunctionStateful(while_lib, *while_function, true));
+}
 }  // namespace
 }  // namespace function_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
index b3bfee138ffd9254e4a28bf87906b543defb95bc..d5308ad31a87f3cb0d129721af899c52787de3f2 100644
--- a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
@@ -471,6 +471,6 @@ FunctionDef* FuseFunctions(
   return fused_function;
 }
 
-}  // end namespace fusion_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace fusion_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
index 9d8b388a3a8bca1fb560e5acc94d50f3d82ed30d..82ca0146b97c2503371042bc070611cefbc40678 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
@@ -42,7 +42,7 @@ NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
                             StringPiece function_name) {
   return test::function::NDef(
       name, "ExperimentalMapAndBatchDataset",
-      {string(input_node_name), "", string(batch_size_node_name),
+      {string(input_node_name), string(batch_size_node_name),
        string(num_parallel_calls_node_name), string(drop_remainder_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
        {"Targuments", {}},
@@ -68,7 +68,7 @@ NodeDef MakeParallelInterleaveNode(StringPiece name,
                                    StringPiece function_name, bool sloppy) {
   return test::function::NDef(
       name, "ParallelInterleaveDatasetV2",
-      {string(input_node_name), "", string(cycle_length_node_name),
+      {string(input_node_name), string(cycle_length_node_name),
        string(block_length_node_name), string(num_parallel_calls_node_name)},
       {
           {"f", FunctionDefHelper::FunctionRef(string(function_name))},
@@ -107,6 +107,6 @@ NodeDef MakeParseExampleNode(StringPiece name, StringPiece input_node_name,
       });
 }
 
-}  // end namespace graph_tests_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_tests_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.h b/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
index a2707ee7b7f3888212f2402617d2063f1feb9c8d..3750e2d5cce66a6644eea69cac7531efb308d055 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
@@ -56,8 +56,8 @@ NodeDef MakeParseExampleNode(StringPiece name, StringPiece input_node_name,
                              StringPiece num_parallel_calls_node_name,
                              bool sloppy);
 
-}  // end namespace graph_tests_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_tests_utils
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_TEST_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index 90208c1fba6b089f57b303827cf1327ad43bf736..cbafb9dc8231509181ace4a1bb02ef5f2191728b 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -293,6 +293,6 @@ Status EnsureNodeNamesUnique(Graph* g) {
 
   return Status::OK();
 }
-}  // end namespace graph_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index d130fee2047e5be49857dea6ac6489f93088aa50..8f2872c146ba6201436e83b94037bc529efba37c 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -140,8 +140,8 @@ void ConcatAttributeList(const string& attribute_name, const NodeDef& first,
 // and renaming nodes does not mutate any edges.
 Status EnsureNodeNamesUnique(Graph* g);
 
-}  // end namespace graph_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_utils
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index 5c0f03dca8774d64395c8bc0f2c1334a45bfe9dc..3b6d223fd36b68cf187a7da2a00a47b0757c997b 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -109,7 +109,7 @@ TEST(GraphUtilsTest, ContainsGraphNodeWithName) {
   AddNode("A", "OpA", {}, {}, &graph);
   EXPECT_TRUE(ContainsGraphNodeWithName("A", *graph.graph()));
 
-  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(graph.DeleteNodes({"A"}).ok());
   EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.graph()));
 }
 
@@ -131,7 +131,7 @@ TEST(GraphUtilsTest, ContainsNodeWithOp) {
   AddNode("A", "OpA", {}, {}, &graph);
   EXPECT_TRUE(ContainsNodeWithOp("OpA", *graph.graph()));
 
-  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(graph.DeleteNodes({"A"}).ok());
   EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.graph()));
 }
 
@@ -143,7 +143,7 @@ TEST(GraphUtilsTest, FindGraphNodeWithName) {
   AddNode("A", "OpA", {}, {}, &graph);
   EXPECT_NE(FindGraphNodeWithName("A", *graph.graph()), -1);
 
-  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(graph.DeleteNodes({"A"}).ok());
   EXPECT_EQ(FindGraphNodeWithName("A", *graph.graph()), -1);
 }
 
@@ -164,10 +164,10 @@ TEST(GraphUtilsTest, FindGraphNodeWithOp) {
 
   AddNode("A", "OpA", {}, {}, &graph);
   AddNode("B", "OpB", {"A"}, {}, &graph);
-  AddNode("A2", "OpA", {"B"}, {}, &graph);
+  AddNode("A2", "OpA", {"A"}, {}, &graph);
   EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.graph()), 0);
 
-  graph.DeleteNodes({"B"});
+  EXPECT_TRUE(graph.DeleteNodes({"B"}).ok());
   EXPECT_EQ(FindGraphNodeWithOp("OpB", *graph.graph()), -1);
   EXPECT_EQ(FindGraphNodeWithName("A2", *graph.graph()), 1);
 }
@@ -186,7 +186,7 @@ TEST(GraphUtilsTest, FindAllGraphNodesWithOp) {
   EXPECT_EQ(result_indices.at(0), 0);
   EXPECT_EQ(result_indices.at(1), 2);
 
-  graph.DeleteNodes({"A2"});
+  EXPECT_TRUE(graph.DeleteNodes({"A2"}).ok());
   std::vector<int> result_indices_new =
       FindAllGraphNodesWithOp("OpA", *graph.graph());
   EXPECT_EQ(result_indices_new.size(), 1);
@@ -201,7 +201,7 @@ TEST(GraphUtilsTest, SetUniqueGraphNodeName) {
   NodeDef* node2 = AddNode("", "A", {}, {}, &graph);
   EXPECT_NE(node1->name(), node2->name());
 
-  graph.DeleteNodes({node1->name()});
+  EXPECT_TRUE(graph.DeleteNodes({node1->name()}).ok());
   NodeDef* node3 = AddNode("", "A", {}, {}, &graph);
   EXPECT_NE(node2->name(), node3->name());
 }
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index 60755256d83d74287748125e18ccd8a63a1b4759..e29b620140236aa8852d7bd36799b99ce62c1f0d 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -173,7 +174,7 @@ const FunctionDef* MakeLessStatefulFunction(const FunctionDef& map_function,
   return stateless_function;
 }
 // This function returns true if function is stateful and has single
-// RandomUniform op and no other stateful ops except Assert.
+// RandomUniform op and no other stateful ops except Assert and If/While.
 // `is_stateful_after_hoisting` is set to true if RandomUniform is the only
 // stateful op and hoisting can be performed.
 bool CanHoistRandomUniform(const FunctionDef& map_function,
@@ -188,10 +189,10 @@ bool CanHoistRandomUniform(const FunctionDef& map_function,
   for (const auto& node : map_function.node_def()) {
     const OpDef* op_def;
     TF_CHECK_OK(library.LookUpOpDef(node.op(), &op_def));
-    // Skip stateless nodes and assert, as it does not actually have a state.
     if (!op_def->is_stateful()) continue;
 
-    if (op_def->name() == "Assert") {
+    if (!function_utils::IsNodeStateful(library, node, true)) {
+      // Skip ops that are marked stateful but are in fact not stateful.
       have_other_stateful_ops = true;
       continue;
     }
@@ -220,12 +221,14 @@ int NumberOfPlaceholders(const NodeDef& map_node) {
 
 }  // namespace
 
-Status HoistRandomUniform::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* output) {
+Status HoistRandomUniform::OptimizeAndCollectStats(Cluster* cluster,
+                                                   const GrapplerItem& item,
+                                                   GraphDef* output,
+                                                   OptimizationStats* stats) {
   *output = item.graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
 
@@ -266,14 +269,16 @@ Status HoistRandomUniform::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* stateless_map = graph.AddNode(
         MakeStatelessMap(*map_node, *zip_node, *stateless_func, &graph));
 
-    graph.UpdateFanouts(map_node->name(), stateless_map->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(map_node->name(), stateless_map->name()));
 
     // TODO(b/116285210): we could also remove map functions from library if
     // they are not used anymore.
     nodes_to_delete.insert(map_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -285,5 +290,5 @@ void HoistRandomUniform::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(HoistRandomUniform, "hoist_random_uniform");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h
index d1bcf6782d60b6a41482730b9d7ec9f2c4b43119..94db9f72a453e5567d493434682c5d2e8d59cf82 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_HOIST_RANDOM_UNIFORM_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_HOIST_RANDOM_UNIFORM_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -30,7 +30,7 @@ namespace grappler {
 // `stateless_random_uniform`.
 // TODO(prazek): for now only `RandomUniform` is handled, but we could handle
 // `RandomUniformInt` similarly.
-class HoistRandomUniform : public CustomGraphOptimizer {
+class HoistRandomUniform : public TFDataOptimizerBase {
  public:
   HoistRandomUniform() = default;
   ~HoistRandomUniform() override = default;
@@ -42,14 +42,15 @@ class HoistRandomUniform : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_HOIST_RANDOM_UNIFORM_H_
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 52b4b785a3d09ca7f3bec3373d9dd1c8de444a87..4529b89bd4aa2d6ab47884bba13922fa20c568bc 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -63,8 +64,10 @@ NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
 
 }  // namespace
 
-Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* output) {
+Status LatencyAllEdges::OptimizeAndCollectStats(Cluster* cluster,
+                                                const GrapplerItem& item,
+                                                GraphDef* output,
+                                                OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
 
@@ -72,10 +75,8 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
   // TODO(shivaniagrawal): Add Op to return Latency for the particular Op than
   // for the edge (e2 - e1?).
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op().rfind("Dataset") != node.op().size() - strlen("Dataset") ||
-        node.attr().empty() ||
-        node.name().rfind("_generated") ==
-            node.name().size() - strlen("_generated")) {
+    if (!str_util::EndsWith(node.op(), "Dataset") || node.attr().empty() ||
+        str_util::EndsWith(node.name(), "_generated")) {
       // TODO(b/111805951): Replace this with non-approximate way to check if
       // node corresponds to a `Dataset` op.
       continue;
@@ -89,15 +90,15 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
     } else {  // fanout will have size 0 for last dataset node in the pipeline.
       if (fanout.size() == 1) {
         NodeDef* output_node = (*(fanout.begin())).node;
-        if (output_node->name().rfind("_generated") ==
-            output_node->name().size() - strlen("_generated")) {
+        if (str_util::EndsWith(output_node->name(), "_generated")) {
           continue;
         }
       }
     }
 
     NodeDef* latency_node = graph.AddNode(MakeLatencyNode(node, &graph));
-    graph.UpdateFanouts(node.name(), latency_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), latency_node->name()));
+    stats->num_changes++;
   }
   return Status::OK();
 }
@@ -109,5 +110,5 @@ void LatencyAllEdges::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(LatencyAllEdges, "latency_all_edges");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.h b/tensorflow/core/grappler/optimizers/data/latency_all_edges.h
index f6c71a9ec7d8c9c98a5d4e58894f11b35e7b8772..313d108286b7595f2370ef2f9276353e9ef7e58f 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.h
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_LATENCY_ALL_EDGES_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_LATENCY_ALL_EDGES_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class LatencyAllEdges : public CustomGraphOptimizer {
+class LatencyAllEdges : public TFDataOptimizerBase {
  public:
   LatencyAllEdges() = default;
   ~LatencyAllEdges() override = default;
@@ -33,14 +33,15 @@ class LatencyAllEdges : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_LATENCY_ALL_EDGES_H_
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
index d428d04a66659cd3b961428e3762ea3ab81ad69e..426c1dca5bb2c112d47b440a672b5a720a994cdf 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
@@ -30,9 +30,9 @@ TEST(LatencyAllEdgesTest, AddLatenciesAfterTensorMapPrefetch) {
   using test::function::NDef;
   GrapplerItem item;
   NodeDef component_node =
-      NDef("component_nodes", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}});
+      NDef("component_node", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}});
   NodeDef from_tensor_node =
-      NDef("from_tensor_nodes", "TensorDataset", {"component_nodes"},
+      NDef("from_tensor_node", "TensorDataset", {"component_node"},
            {{"Toutput_types", {}}, {"output_shapes", {}}});
 
   NodeDef captured_input_node = NDef("captured_input_node", "Const", {},
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
index 72c27a1d4afb8f3766a1f7c56ade37b1e161a039..221f4c252583c6f29aba4d22920a60a75568115f 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/make_numa_aware.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -37,20 +38,23 @@ NodeDef MakeNumaAwareNode(const NodeDef& node, MutableGraphView* graph) {
 
 }  // namespace
 
-Status MakeNumaAware::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* output) {
+Status MakeNumaAware::OptimizeAndCollectStats(Cluster* cluster,
+                                              const GrapplerItem& item,
+                                              GraphDef* output,
+                                              OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
 
   for (const NodeDef& node : item.graph.node()) {
     if (node.op() != "ExperimentalMapAndBatchDataset") continue;
 
     auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
-    graph.UpdateFanouts(node.name(), numa_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), numa_node->name()));
     nodes_to_delete.insert(node.name());
+    stats->num_changes++;
   }
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h b/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
index 48a7d8145f0157c6cea1633edb68d9ee3ee08de1..81dbb31e6d55c3a8f86be945afcef588efe2d6e3 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MakeNumaAware : public CustomGraphOptimizer {
+class MakeNumaAware : public TFDataOptimizerBase {
  public:
   MakeNumaAware() = default;
   ~MakeNumaAware() override = default;
@@ -33,8 +33,9 @@ class MakeNumaAware : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override {}
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy.cc b/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
index 1cfaef3ffb270cc338aaaef601f5f6037740112e..1de0c46427aa7812329aa657fd2c1f0611655ad3 100644
--- a/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
@@ -25,8 +25,10 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-Status MakeSloppy::Optimize(Cluster* cluster, const GrapplerItem& item,
-                            GraphDef* output) {
+Status MakeSloppy::OptimizeAndCollectStats(Cluster* cluster,
+                                           const GrapplerItem& item,
+                                           GraphDef* output,
+                                           OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
 
@@ -35,6 +37,7 @@ Status MakeSloppy::Optimize(Cluster* cluster, const GrapplerItem& item,
         node.op() == "ParallelMapDataset" ||
         node.op() == "ParseExampleDataset") {
       (*node.mutable_attr())["sloppy"].set_b(true);
+      stats->num_changes++;
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy.h b/tensorflow/core/grappler/optimizers/data/make_sloppy.h
index 9dcab1038de3f6c39c4db4954903465bc0a6146d..cf42e841989da351c7203da6d01dac9c398c0cc9 100644
--- a/tensorflow/core/grappler/optimizers/data/make_sloppy.h
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MakeSloppy : public CustomGraphOptimizer {
+class MakeSloppy : public TFDataOptimizerBase {
  public:
   MakeSloppy() = default;
   ~MakeSloppy() override = default;
@@ -33,8 +33,9 @@ class MakeSloppy : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override {}
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 84c4d82f6a38dd81e88374c6ce6a7a6082451a38..5d26d1abe48fa9cc9217b34d3990b306d3f6a494 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -98,11 +99,13 @@ NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
 
 }  // namespace
 
-Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                   GraphDef* output) {
+Status MapAndBatchFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                                  const GrapplerItem& item,
+                                                  GraphDef* output,
+                                                  OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
     if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
       continue;
@@ -120,14 +123,16 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* new_node =
         graph.AddNode(MakeMapAndBatchNode(*map_node, batch_node, &graph));
-    graph.UpdateFanouts(batch_node.name(), new_node->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(batch_node.name(), new_node->name()));
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
     nodes_to_delete.insert(batch_node.name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -139,5 +144,5 @@ void MapAndBatchFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapAndBatchFusion, "map_and_batch_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
index 2c64831105295391f77e7e8be554b25fa85a5779..ef3a218bf340d96e9b95eb0175d5cb6167c5a208 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MapAndBatchFusion : public CustomGraphOptimizer {
+class MapAndBatchFusion : public TFDataOptimizerBase {
  public:
   MapAndBatchFusion() = default;
   ~MapAndBatchFusion() override = default;
@@ -33,14 +33,15 @@ class MapAndBatchFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index 233d7968c8965a5ec2389aa297da72a9708b9257..e257683b35d7ca8a60d0dc7324ffd5ad7f270175 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -92,8 +93,10 @@ NodeDef MakeFilterByLastComponentNode(const NodeDef& fused_map_node,
 
 }  // namespace
 
-Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* output) {
+Status MapAndFilterFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                                   const GrapplerItem& item,
+                                                   GraphDef* output,
+                                                   OptimizationStats* stats) {
   GraphDef sorted_old_graph = item.graph;
   TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
   // TODO(prazek): We might have some problems with performance if we copy
@@ -101,7 +104,7 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
   *output = sorted_old_graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
   auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
@@ -155,16 +158,18 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* filter_by_component = graph.AddNode(
         MakeFilterByLastComponentNode(*fused_maps, *filter_node, &graph));
 
-    graph.UpdateFanouts(filter_node->name(), filter_by_component->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(filter_node->name(), filter_by_component->name()));
     TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_function));
 
     // TODO(prazek): we could also remove functions from library if they are not
     // used anymore.
     nodes_to_delete.insert(map_node->name());
     nodes_to_delete.insert(filter_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -176,5 +181,5 @@ void MapAndFilterFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapAndFilterFusion, "map_and_filter_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h
index ba25ca0591043989b97c62a7adb32eeeb193694e..8b3c95d37c109e2752c80b3696462d06a0797680 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -26,7 +26,7 @@ namespace grappler {
 // component. The FilterDataset is transformed to FilterByLastComponent - a
 // custom kernel that filters elements based on a value of the boolean
 // component.
-class MapAndFilterFusion : public CustomGraphOptimizer {
+class MapAndFilterFusion : public TFDataOptimizerBase {
  public:
   MapAndFilterFusion() = default;
   ~MapAndFilterFusion() override = default;
@@ -38,14 +38,15 @@ class MapAndFilterFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index 6b8015f96a29ac2fa2de3871a678a1b82efb12ff..ce41f7069cc5d54287ba6c8d546e57ca7293de8b 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -77,14 +78,16 @@ NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
 
 }  // namespace
 
-Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                           GraphDef* output) {
+Status MapFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                          const GrapplerItem& item,
+                                          GraphDef* output,
+                                          OptimizationStats* stats) {
   GraphDef sorted_old_graph = item.graph;
   TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
   *output = sorted_old_graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
 
@@ -130,7 +133,8 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_maps_node = graph.AddNode(
         MakeFusedNode(*parent_map_node, *map_node, *fused_function, &graph));
 
-    graph.UpdateFanouts(map_node->name(), fused_maps_node->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(map_node->name(), fused_maps_node->name()));
 
     // TODO(prazek): we should run some optimizations on the fused map
     // functions, or make sure that optimization passes run after map
@@ -141,9 +145,10 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     // they are not used anymore.
     nodes_to_delete.insert(parent_map_node->name());
     nodes_to_delete.insert(map_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -154,5 +159,5 @@ void MapFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapFusion, "map_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.h b/tensorflow/core/grappler/optimizers/data/map_fusion.h
index a6a06592b80823458ee6ae3b655aecacbdfbb93b..c9960c721789002daeeea91f5fbbfe0dc9f30968 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This optimization fuses map transformations by merging their map functions.
-class MapFusion : public CustomGraphOptimizer {
+class MapFusion : public TFDataOptimizerBase {
  public:
   MapFusion() = default;
   ~MapFusion() override = default;
@@ -34,14 +34,15 @@ class MapFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
index 8e49f908a77288c8e99b62706578d86a272ab682..90dd885c7fc75954e4207876ac154bec0e9d3093 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
@@ -15,12 +15,14 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_parallelization.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 
@@ -28,33 +30,21 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-bool CanParallelize(const FunctionDef& function,
-                    const FunctionLibraryDefinition& library) {
-  if (!function.signature().is_stateful()) return true;
-
-  for (const auto& node : function.node_def()) {
-    const OpDef* op_def;
-    TF_CHECK_OK(library.LookUpOpDef(node.op(), &op_def));
-    // Assert is marked as stateful, but it does not have any state (except
-    // changing io).  Similarly to CUDA, we do not give guarantee that the
-    // assert operation that would fail would be the first one, so that we can
-    // parallelize it.
-    if (op_def->is_stateful() && op_def->name() != "Assert") return false;
-  }
-
-  return true;
-}
-
-NodeDef MakeParallelMap(const NodeDef& map_node, MutableGraphView* graph) {
-  NodeDef parallel_map = map_node;
-  graph_utils::SetUniqueGraphNodeName("parallel_map", graph->graph(),
+constexpr char kMapDataset[] = "MapDataset";
+constexpr char kParallelMapDataset[] = "ParallelMapDataset";
+constexpr int kAutotune = -1;
+
+NodeDef MakeParallelMap(const string& name, MutableGraphView* graph) {
+  // The inputs of the node to be parallelized could be changed by the
+  // optimization pass, so we need to look it up in the modified graph.
+  int index = graph_utils::FindGraphNodeWithName(name, *graph->graph());
+  DCHECK_NE(index, -1) << "Failed to find node " << name
+                       << " in the optimized graph.";
+  NodeDef parallel_map = graph->graph()->node(index);
+  graph_utils::SetUniqueGraphNodeName(kParallelMapDataset, graph->graph(),
                                       &parallel_map);
-  parallel_map.set_op("ParallelMapDataset");
-  // TODO(b/114475558): We want to set `num_parallel_calls` to a special value,
-  // so that dynamic tunning will pick the optimal value at runtime. Because
-  // this feature is not yet implemented, we set it to 2, which is the smallest
-  // value that introduces parallelism.
-  auto* num_parallel_calls = graph_utils::AddScalarConstNode(2, graph);
+  parallel_map.set_op(kParallelMapDataset);
+  auto* num_parallel_calls = graph_utils::AddScalarConstNode(kAutotune, graph);
   parallel_map.add_input(num_parallel_calls->name());
 
   return parallel_map;
@@ -62,15 +52,17 @@ NodeDef MakeParallelMap(const NodeDef& map_node, MutableGraphView* graph) {
 
 }  // namespace
 
-Status MapParallelization::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* output) {
+Status MapParallelization::OptimizeAndCollectStats(Cluster* cluster,
+                                                   const GrapplerItem& item,
+                                                   GraphDef* output,
+                                                   OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
   auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
-    if (node.op() == "MapDataset") return &node;
+    if (node.op() == kMapDataset) return &node;
     return nullptr;
   };
 
@@ -80,14 +72,18 @@ Status MapParallelization::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* function =
         function_library.Find(map_node->attr().at("f").func().name());
-    if (!CanParallelize(*function, function_library)) continue;
+    if (function_utils::IsFunctionStateful(function_library, *function, true))
+      continue;
 
-    auto* parallel_map = graph.AddNode(MakeParallelMap(*map_node, &graph));
-    graph.UpdateFanouts(map_node->name(), parallel_map->name());
+    auto* parallel_map =
+        graph.AddNode(MakeParallelMap(map_node->name(), &graph));
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(map_node->name(), parallel_map->name()));
     nodes_to_delete.insert(map_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -99,5 +95,5 @@ void MapParallelization::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapParallelization, "map_parallelization");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.h b/tensorflow/core/grappler/optimizers/data/map_parallelization.h
index ac9cf7e12af344da2079637db9f3c51012c5ccd5..8e71dadcb858bbee5f94a3e51038350e46f542ce 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.h
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This optimization parallelizes MapDataset when function is stateless.
-class MapParallelization : public CustomGraphOptimizer {
+class MapParallelization : public TFDataOptimizerBase {
  public:
   MapParallelization() = default;
   ~MapParallelization() override = default;
@@ -34,14 +34,15 @@ class MapParallelization : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index 3401dcc6f23bae1b2e77d5ea18a94f382fee4fb8..3da238fb7dfc7d3391102e6c01a257247ac972da 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/map_vectorization.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization_utils.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -28,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -35,6 +38,20 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+constexpr char kCastOp[] = "Cast";
+constexpr char kRealDivOp[] = "RealDiv";
+constexpr char kSubOp[] = "Sub";
+constexpr char kMulOp[] = "Mul";
+constexpr char kAddOp[] = "Add";
+constexpr char kEqualOp[] = "Equal";
+constexpr char kCeilOp[] = "Ceil";
+constexpr char kBatchOp[] = "BatchDataset";
+constexpr char kBatchV2Op[] = "BatchDatasetV2";
+constexpr char kExperimentalMapAndBatchOp[] = "ExperimentalMapAndBatchDataset";
+constexpr char kMapOp[] = "MapDataset";
+constexpr char kParallelMapOp[] = "ParallelMapDataset";
+constexpr int kAutotune = -1;
+
 // Returns a FunctionDef containing a MapDefun op that wraps the original
 // function.
 FunctionDef* CreateMapDefunWrapper(const NodeDef& map_node,
@@ -100,7 +117,6 @@ FunctionDef* AddVectorizedFunction(const NodeDef& map_node,
   const NodeDef& map_defun_node = vectorized_func->node_def(0);
   DCHECK_EQ(map_defun_node.op(), "MapDefun");
 
-  // TODO(b/116285210): Unreferenced functions should get cleaned up later
   FunctionDef* result;
   Status s = vectorization_utils::VectorizeMapDefun(
       *vectorized_func, map_defun_node, library, &result);
@@ -120,6 +136,7 @@ bool IsOutputShapesFullyDefined(const NodeDef& node) {
   const auto& shapes = shapes_attr->list().shape();
 
   for (const TensorShapeProto& shape : shapes) {
+    if (shape.unknown_rank()) return false;
     for (const auto& dim : shape.dim()) {
       if (dim.size() == -1) {
         return false;
@@ -129,34 +146,68 @@ bool IsOutputShapesFullyDefined(const NodeDef& node) {
   return true;
 }
 
-bool IsStatefulFn(const FunctionLibraryDefinition& library,
-                  const FunctionDef& function_def) {
-  for (const NodeDef& node_def : function_def.node_def()) {
-    const OpDef* op_def;
-    Status s = library.LookUpOpDef(node_def.op(), &op_def);
-    if (!s.ok() || op_def->is_stateful()) {
-      return true;
-    }
+// Returns a mapping from input names to the [start, end) indices of the input
+// in the node's input list.
+Status GetInputMap(const NodeDef& node, NameRangeMap* result) {
+  const OpRegistrationData* op_reg_data;  // Owned by global op registry
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUp(node.op(), &op_reg_data));
+
+  return NameRangesForNode(node, op_reg_data->op_def, result,
+                           /*outputs=*/nullptr);
+}
+
+Status CopyInputs(StringPiece input_name, const NameRangeMap& input_map,
+                  const NodeDef& from, NodeDef* to) {
+  const auto* range = gtl::FindOrNull(input_map, input_name);
+  if (range == nullptr) {
+    return errors::Internal(
+        "Failed to copy inputs: did not find inputs with name: ", input_name,
+        ", in node with name: ", from.name());
   }
-  return false;
+  for (int i = range->first; i < range->second; ++i) {
+    to->add_input(from.input(i));
+  }
+
+  return Status::OK();
 }
 
-NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
-                         const NodeDef& input_node,
-                         const FunctionDef& vectorized_func,
-                         MutableGraphView* graph) {
+Status GetInputNodeName(StringPiece input_name, const NameRangeMap& input_map,
+                        const NodeDef& node, string* result) {
+  const auto* range = gtl::FindOrNull(input_map, input_name);
+  if (range == nullptr) {
+    return errors::Internal(
+        "Failed to get input node name: did not find input with name: ",
+        input_name, ", in node with name: ", node.name());
+  }
+  if (range->second - range->first > 1) {
+    return errors::Internal("Tried to get single input name for a list input.");
+  }
+  *result = node.input(range->first);
+  return Status::OK();
+}
+
+Status AddNewBatchNode(const NodeDef& old_batch_node, const NodeDef& input_node,
+                       const FunctionDef& vectorized_func,
+                       MutableGraphView* graph, NodeDef** new_batch_node) {
   NodeDef batch_node;
-  batch_node.set_op(old_batch_node.op());
+  batch_node.set_op(old_batch_node.op() == kBatchOp ? kBatchOp : kBatchV2Op);
   graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->graph(),
                                       &batch_node);
 
   // Set the `input_dataset` input argument
   batch_node.add_input(input_node.name());
-  // Set the `batch_size` input_argument
-  batch_node.add_input(old_batch_node.input(1));
-  if (batch_node.op() == "BatchDatasetV2") {
-    // Set the `drop_remainder` input argument
-    batch_node.add_input(old_batch_node.input(2));
+
+  NameRangeMap input_map;
+  TF_RETURN_IF_ERROR(GetInputMap(old_batch_node, &input_map));
+
+  // Set the `batch_size` input argument
+  TF_RETURN_IF_ERROR(
+      CopyInputs("batch_size", input_map, old_batch_node, &batch_node));
+
+  // Set the `drop_remainder` input argument
+  if (batch_node.op() != kBatchOp) {
+    TF_RETURN_IF_ERROR(
+        CopyInputs("drop_remainder", input_map, old_batch_node, &batch_node));
   }
 
   // Set attrs
@@ -166,6 +217,9 @@ NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
   }
   (*batch_node.mutable_attr())["output_types"] = output_types;
 
+  // It is safe to assume that input_node has the "output_shapes" attr here,
+  // because earlier we checked that the input node has fully defined output
+  // shapes.
   auto& output_shapes_attr = (*batch_node.mutable_attr())["output_shapes"];
   const auto& input_shapes =
       input_node.attr().at("output_shapes").list().shape();
@@ -177,23 +231,204 @@ NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
     dim->set_size(batch_size);
     shape->MergeFrom(input_shapes.Get(i));
   }
-  return batch_node;
+
+  *new_batch_node = graph->AddNode(std::move(batch_node));
+  return Status::OK();
 }
 
-NodeDef MakeNewMapNode(const NodeDef& old_map_node,
-                       const NodeDef& old_batch_node,
-                       const NodeDef& new_batch_node,
-                       const FunctionDef& vectorized_func,
+NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t,
+                     MutableGraphView* graph) {
+  NodeDef cast_node;
+  cast_node.set_op(kCastOp);
+  cast_node.add_input(input);
+  graph_utils::SetUniqueGraphNodeName(cast_node.op(), graph->graph(),
+                                      &cast_node);
+  AddNodeAttr("SrcT", src_t, &cast_node);
+  AddNodeAttr("DstT", dst_t, &cast_node);
+
+  return graph->AddNode(std::move(cast_node));
+}
+
+NodeDef* AddEqualityNode(const string& input_x, const string& input_y,
+                         DataType t, MutableGraphView* graph) {
+  NodeDef equal_node;
+  equal_node.set_op(kEqualOp);
+  equal_node.add_input(input_x);
+  equal_node.add_input(input_y);
+  graph_utils::SetUniqueGraphNodeName(equal_node.op(), graph->graph(),
+                                      &equal_node);
+  AddNodeAttr("T", t, &equal_node);
+
+  return graph->AddNode(std::move(equal_node));
+}
+
+NodeDef* AddCeilNode(const string& input, MutableGraphView* graph) {
+  NodeDef ceil_node;
+  ceil_node.set_op(kCeilOp);
+  graph_utils::SetUniqueGraphNodeName(ceil_node.op(), graph->graph(),
+                                      &ceil_node);
+  AddNodeAttr("T", DT_FLOAT, &ceil_node);
+  ceil_node.add_input(input);
+
+  return graph->AddNode(std::move(ceil_node));
+}
+
+NodeDef* AddBinaryNode(const string& input_x, const string& input_y,
+                       const string& op, DataType type,
                        MutableGraphView* graph) {
+  NodeDef node;
+  node.set_op(op);
+  node.add_input(input_x);
+  node.add_input(input_y);
+  graph_utils::SetUniqueGraphNodeName(op, graph->graph(), &node);
+  AddNodeAttr("T", type, &node);
+
+  return graph->AddNode(std::move(node));
+}
+
+NodeDef* AddIntAddNode(const string& input_x, const string& input_y,
+                       MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kAddOp, DT_INT32, graph);
+}
+
+NodeDef* AddFloatDivNode(const string& input_x, const string& input_y,
+                         MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kRealDivOp, DT_FLOAT, graph);
+}
+
+NodeDef* AddIntSubNode(const string& input_x, const string& input_y,
+                       MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kSubOp, DT_INT32, graph);
+}
+
+NodeDef* AddIntMulNode(const string& input_x, const string& input_y,
+                       MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kMulOp, DT_INT32, graph);
+}
+
+// Create a new node for the num_parallel_calls input argument according to the
+// following formula:
+//
+// Let N = old num_parallel_calls, N' = new num_parallel_calls, and B =
+// batch_size.
+//     N' = ceil(N // B) * (1 - (N == -1)) + N * (N == -1)
+//
+// i.e. N' = -1 if N = -1 (autotune)
+//      N' = ceil(N // B) otherwise.
+// Note that "ceil" is necessary so N' != 0.
+//
+// For non-autotune values of `num_parallel_call`, we divide it by `batch_size`
+// to limit memory consumption by the map buffer.
+//
+// TODO(rachelim): Evaluate the performance of other potential transformations
+// to `num_parallel_calls`:
+//   1) use the autotune value (i.e. -1)
+//   2) use the original value
+Status MakeNumParallelCallsInput(const NodeDef& old_map_node,
+                                 const NodeDef& old_batch_node,
+                                 const NameRangeMap& input_map,
+                                 MutableGraphView* graph, string* result) {
+  string num_parallel_calls_name;
+  TF_RETURN_IF_ERROR(GetInputNodeName("num_parallel_calls", input_map,
+                                      old_map_node, &num_parallel_calls_name));
+
+  NodeDef* float_num_parallel_calls;
+  NodeDef* float_batch_size;
+  NodeDef* bool_is_autotune;
+
+  // Cast the old num_parallel_calls and batch_size arguments to DT_FLOAT before
+  // dividing.
+  if (old_map_node.op() == kExperimentalMapAndBatchOp) {
+    auto autotune_val =
+        graph_utils::AddScalarConstNode(static_cast<int64>(kAutotune), graph);
+    bool_is_autotune = AddEqualityNode(
+        autotune_val->name(), num_parallel_calls_name, DT_INT64, graph);
+
+    float_num_parallel_calls =
+        AddCastNode(num_parallel_calls_name, DT_INT64, DT_FLOAT, graph);
+
+    string batch_size_name;
+    TF_RETURN_IF_ERROR(GetInputNodeName("batch_size", input_map, old_map_node,
+                                        &batch_size_name));
+
+    float_batch_size = AddCastNode(batch_size_name, DT_INT64, DT_FLOAT, graph);
+  } else {
+    auto autotune_val =
+        graph_utils::AddScalarConstNode(static_cast<int>(kAutotune), graph);
+    bool_is_autotune = AddEqualityNode(
+        autotune_val->name(), num_parallel_calls_name, DT_INT32, graph);
+
+    float_num_parallel_calls =
+        AddCastNode(num_parallel_calls_name, DT_INT32, DT_FLOAT, graph);
+
+    float_batch_size =
+        AddCastNode(old_batch_node.input(1), DT_INT64, DT_FLOAT, graph);
+  }
+
+  // Divide
+  auto div_node = AddFloatDivNode(float_num_parallel_calls->name(),
+                                  float_batch_size->name(), graph);
+
+  // Ceil
+  auto float_ceil_node = AddCeilNode(div_node->name(), graph);
+
+  // Cast back to DT_INT32
+  auto int_ceil_node =
+      AddCastNode(float_ceil_node->name(), DT_FLOAT, DT_INT32, graph);
+
+  // is_autotune = int(num_parallel_calls == -1)
+  auto int_is_autotune =
+      AddCastNode(bool_is_autotune->name(), DT_BOOL, DT_INT32, graph);
+
+  // is_not_autotune = 1 - is_autotune
+  auto int_is_not_autotune =
+      AddIntSubNode(graph_utils::AddScalarConstNode(1, graph)->name(),
+                    int_is_autotune->name(), graph);
+
+  auto mul_1 =
+      AddIntMulNode(int_ceil_node->name(), int_is_not_autotune->name(), graph);
+
+  NodeDef* mul_2;
+  if (old_map_node.op() == kExperimentalMapAndBatchOp) {
+    auto int_num_parallel_calls =
+        AddCastNode(num_parallel_calls_name, DT_INT64, DT_INT32, graph);
+    mul_2 = AddIntMulNode(int_num_parallel_calls->name(),
+                          int_is_autotune->name(), graph);
+  } else {
+    mul_2 =
+        AddIntMulNode(num_parallel_calls_name, int_is_autotune->name(), graph);
+  }
+
+  auto add_node = AddIntAddNode(mul_1->name(), mul_2->name(), graph);
+
+  *result = add_node->name();
+  return Status::OK();
+}
+
+Status AddNewMapNode(const NodeDef& old_map_node, const NodeDef& old_batch_node,
+                     const NodeDef& new_batch_node,
+                     const FunctionDef& vectorized_func,
+                     MutableGraphView* graph, NodeDef** new_map_node) {
   NodeDef map_node;
-  map_node.set_op(old_map_node.op());
+  map_node.set_op(old_map_node.op() == kMapOp ? kMapOp : kParallelMapOp);
   graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->graph(), &map_node);
 
   // Set the `input_dataset` input argument
   map_node.add_input(new_batch_node.name());
-  for (int i = 1; i < old_map_node.input_size(); i++) {
-    // Set the `other_arguments` and `num_parallel_calls` input arguments
-    map_node.add_input(old_map_node.input(i));
+
+  NameRangeMap input_map;
+  TF_RETURN_IF_ERROR(GetInputMap(old_map_node, &input_map));
+
+  // Set the `other_arguments` input argument
+  TF_RETURN_IF_ERROR(
+      CopyInputs("other_arguments", input_map, old_map_node, &map_node));
+
+  // Set the `num_parallel_calls` input argument
+  if (old_map_node.op() != kMapOp) {
+    string num_parallel_calls;
+    TF_RETURN_IF_ERROR(MakeNumParallelCallsInput(
+        old_map_node, old_batch_node, input_map, graph, &num_parallel_calls));
+    map_node.add_input(std::move(num_parallel_calls));
   }
 
   // Set attrs
@@ -206,71 +441,111 @@ NodeDef MakeNewMapNode(const NodeDef& old_map_node,
   }
 
   (*map_node.mutable_attr())["use_inter_op_parallelism"].set_b(true);
+  *new_map_node = graph->AddNode(std::move(map_node));
+  return Status::OK();
+}
+
+// Given an input pipeline graph and a query node, tries to the node to the
+// 'batch' node in a input_dataset->map->batch pattern, or the 'map_and_batch'
+// node in an input_dataset->map_and_batch pattern.
+bool FindMapAndBatchPattern(const MutableGraphView& graph, const NodeDef& node,
+                            const FunctionLibraryDefinition& function_library,
+                            const NodeDef** batch_node_output,
+                            const NodeDef** map_node_output,
+                            const NodeDef** input_node_output,
+                            const FunctionDef** map_fn_output) {
+  const FunctionDef*& map_fn = *map_fn_output;
+  const NodeDef*& batch_node = *batch_node_output;
+  const NodeDef*& map_node = *map_node_output;
+  const NodeDef*& input_node = *input_node_output;
+
+  if (node.op() == kExperimentalMapAndBatchOp) {
+    batch_node = &node;
+    map_node = &node;
+  } else if (node.op() == kBatchOp || node.op() == kBatchV2Op) {
+    batch_node = &node;
+    map_node = graph_utils::GetInputNode(*batch_node, graph);
+    if (map_node->op() != kMapOp && map_node->op() != kParallelMapOp) {
+      return false;
+    }
+    if (!IsOutputShapesFullyDefined(*map_node)) {
+      // If any of the map func outputs have an unknown shape, don't
+      // optimize, so that batching errors surface as before.
+      VLOG(1) << "Cannot vectorize dataset.map().batch() because the map "
+                 "dataset does not have fully defined output shapes.";
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  // Input to the map node
+  input_node = graph_utils::GetInputNode(*map_node, graph);
+  DCHECK_NE(input_node, nullptr);
+
+  if (!IsOutputShapesFullyDefined(*input_node)) {
+    // If any of the inputs have an unknown shape, don't optimize, since
+    // inputs might not be batchable.
+    VLOG(1) << "Cannot vectorize dataset.map().batch() because the input "
+               "dataset does not have fully defined output shapes.";
+    return false;
+  }
 
-  return map_node;
+  map_fn = function_library.Find(map_node->attr().at("f").func().name());
+
+  if (function_utils::IsFunctionStateful(function_library, *map_fn)) {
+    VLOG(1) << "Cannot vectorize dataset.map().batch() because the map "
+               "function is stateful.";
+    return false;
+  }
+
+  return true;
 }
 
 }  // namespace
 
-Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                  GraphDef* output) {
+Status MapVectorization::OptimizeAndCollectStats(Cluster* cluster,
+                                                 const GrapplerItem& item,
+                                                 GraphDef* output,
+                                                 OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
 
-  for (const NodeDef& node : item.graph.node()) {
-    // Find Map->Batch nodes.
-    // TODO(rachelim): Optimize MapAndBatchDataset[V2] as well.
-    if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
-      continue;
-    }
-
-    const NodeDef& batch_node(node);
-    NodeDef* node2 = graph_utils::GetInputNode(batch_node, graph);
-    if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
-      continue;
-    }
-
-    // Use a more descriptive variable name now that we know the node type.
-    NodeDef* map_node = node2;
-    // Input to the map node
-    NodeDef* input_node = graph_utils::GetInputNode(*map_node, graph);
-    CHECK_NOTNULL(input_node);
-
-    FunctionDefLibrary* library = output->mutable_library();
+  FunctionDefLibrary* library = output->mutable_library();
 
+  for (const NodeDef& node : item.graph.node()) {
     FunctionLibraryDefinition function_library(OpRegistry::Global(), *library);
-    const FunctionDef* orig_func =
-        function_library.Find(map_node->attr().at("f").func().name());
-
-    // Check that this is a valid optimization.
-    if (!IsOutputShapesFullyDefined(*input_node) ||
-        !IsOutputShapesFullyDefined(*map_node) ||
-        IsStatefulFn(function_library, *orig_func)) {
-      // 1. If any of the inputs have an unknown shape, don't optimize, since
-      // inputs might not be batchable.
-      // 2. If any of the map func outputs have an unknown shape, don't
-      // optimize, so that batching errors surface as before.
-      // 3. If the function is stateful, don't vectorize it.
+    const NodeDef* map_node;
+    const NodeDef* batch_node;
+    const NodeDef* input_node;
+    const FunctionDef* map_func;
+    if (!FindMapAndBatchPattern(graph, node, function_library, &batch_node,
+                                &map_node, &input_node, &map_func)) {
       continue;
     }
 
     FunctionDef* vectorized_func =
-        AddVectorizedFunction(*map_node, *orig_func, library);
+        AddVectorizedFunction(*map_node, *map_func, library);
     CHECK_NOTNULL(vectorized_func);
 
-    auto* new_batch_node = graph.AddNode(
-        MakeNewBatchNode(batch_node, *input_node, *vectorized_func, &graph));
+    NodeDef* new_batch_node;
+    TF_RETURN_IF_ERROR(AddNewBatchNode(
+        *batch_node, *input_node, *vectorized_func, &graph, &new_batch_node));
 
-    auto* new_map_node = graph.AddNode(MakeNewMapNode(
-        *map_node, batch_node, *new_batch_node, *vectorized_func, &graph));
-    graph.UpdateFanouts(batch_node.name(), new_map_node->name());
+    NodeDef* new_map_node;
+    TF_RETURN_IF_ERROR(AddNewMapNode(*map_node, *batch_node, *new_batch_node,
+                                     *vectorized_func, &graph, &new_map_node));
 
+    // Make output of Batch point to Map instead.
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(batch_node->name(), new_map_node->name()));
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
-    nodes_to_delete.insert(batch_node.name());
+    nodes_to_delete.insert(batch_node->name());
+    stats->num_changes++;
   }
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -282,5 +557,5 @@ void MapVectorization::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapVectorization, "map_vectorization");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.h b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
index cc56a8ee5e4e2d0b180047da5368c82ac719ddc1..3e170aae278f804f47380900acc09597e9e8e0ed 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.h
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
@@ -16,12 +16,18 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MapVectorization : public CustomGraphOptimizer {
+// This optimizer rewrites dataset.map(map_fn, ...).batch(...) and
+// dataset.apply(tf.data.experimental.map_and_batch(map_fn, ...)) patterns in an
+// input pipeline. It vectorizes the map_fn, such that this segment can be
+// rewritten as dataset.batch().map(vectorized_map_fn). This is more performant
+// when the map_fn is cheap, because it amortizes the cost of running a map
+// function over a larger batch.
+class MapVectorization : public TFDataOptimizerBase {
  public:
   MapVectorization() = default;
   ~MapVectorization() override = default;
@@ -33,14 +39,15 @@ class MapVectorization : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
index f4faf415496f306cb9ced961c1a8c12e11cb167c..0565d5f6439cccaa70b38be80c0d77086e750386 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
@@ -17,195 +17,466 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-using test::function::GDef;
+constexpr char kConstOp[] = "Const";
+constexpr char kRangeOp[] = "RangeDataset";
+constexpr char kBatchOp[] = "BatchDataset";
+constexpr char kBatchV2Op[] = "BatchDatasetV2";
+constexpr char kExperimentalMapAndBatchOp[] = "ExperimentalMapAndBatchDataset";
+constexpr char kMapOp[] = "MapDataset";
+constexpr char kParallelMapOp[] = "ParallelMapDataset";
+constexpr char kAttrNameF[] = "f";
+constexpr char kAttrNameTarguments[] = "Targuments";
+constexpr char kAttrNameOutputTypes[] = "output_types";
+constexpr char kAttrNameOutputShapes[] = "output_shapes";
+constexpr char kAttrNameInterOpParallelism[] = "use_inter_op_parallelism";
+constexpr char kAttrNamePreserveCardinality[] = "preserve_cardinality";
+constexpr char kAttrNameSloppy[] = "sloppy";
+constexpr char kAttrNameValue[] = "value";
+constexpr char kAttrNameDtype[] = "dtype";
+
 using test::function::NDef;
 
-NodeDef MakeMapNodeHelper(StringPiece name, StringPiece input_node_name,
-                          StringPiece function_name, StringPiece map_op_name,
-                          gtl::ArraySlice<PartialTensorShape> output_shapes,
-                          gtl::ArraySlice<DataType> output_types) {
-  return test::function::NDef(
-      name, map_op_name, {string(input_node_name)},
-      {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
-       {"Targuments", {}},
-       {"output_shapes", output_shapes},
-       {"output_types", output_types}});
-}
-
-NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
-                    StringPiece function_name,
-                    gtl::ArraySlice<PartialTensorShape> output_shapes,
-                    gtl::ArraySlice<DataType> output_types) {
-  return MakeMapNodeHelper(name, input_node_name, function_name, "MapDataset",
-                           output_shapes, output_types);
-}
-
-NodeDef MakeBatchNode(StringPiece name, StringPiece input_node_name,
-                      StringPiece input_batch_size_name,
-                      gtl::ArraySlice<PartialTensorShape> output_shapes,
-                      gtl::ArraySlice<DataType> output_types) {
-  return NDef(
-      name, "BatchDataset",
-      {string(input_node_name), string(input_batch_size_name)},
-      {{"output_types", output_types}, {"output_shapes", output_shapes}});
-}
-
-NodeDef MakeBatchV2Node(StringPiece name, StringPiece input_node_name,
-                        StringPiece input_batch_size_name,
-                        StringPiece input_drop_remainder_name,
-                        gtl::ArraySlice<PartialTensorShape> output_shapes,
-                        gtl::ArraySlice<DataType> output_types) {
-  return NDef(
-      name, "BatchDatasetV2",
-      {string(input_node_name), string(input_batch_size_name),
-       string(input_drop_remainder_name)},
-      {{"output_types", output_types}, {"output_shapes", output_shapes}});
-}
-
-NodeDef MakeRangeNode(StringPiece name, gtl::ArraySlice<string> inputs) {
-  return NDef(name, "RangeDataset", inputs,
-              {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})},
-               {"output_types", gtl::ArraySlice<DataType>({DT_INT64})}});
-}
-
-TEST(MapVectorizationTest, VectorizeMapWithBatch) {
+// Adds a simple vectorizable map function that is akin to
+// dataset.map(lambda x: tf.identity(x))
+FunctionDef* AddMapFn(MutableGraphView* graph) {
+  FunctionDef* map_fn = graph->graph()->mutable_library()->add_function();
+  *map_fn = FunctionDefHelper::Create(
+      /*function_name=*/"map_fn",
+      /*in_def=*/{"x: int64"},
+      /*out_def=*/{"res: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{{{"node"}, "Identity", {"x"}, {{"T", DT_INT64}}}},
+      /*ret_def=*/{{"res", "node:output"}});
+
+  return map_fn;
+}
+
+NodeDef* AddMapNode(MutableGraphView* graph, const string& input_dataset,
+                    const string& map_fn, int num_parallel_calls = 0) {
+  NodeDef result;
+  if (num_parallel_calls) {
+    auto num_parallel_calls_node =
+        graph_utils::AddScalarConstNode(num_parallel_calls, graph);
+    result =
+        NDef(/*name=*/"map", /*op=*/kParallelMapOp,
+             /*inputs=*/{input_dataset, num_parallel_calls_node->name()},
+             /*attrs=*/
+             {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+              {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+              {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+              {kAttrNameInterOpParallelism, false},
+              {kAttrNameSloppy, true},
+              {kAttrNamePreserveCardinality, true}});
+  } else {
+    result =
+        NDef(/*name=*/"map", /*op=*/kMapOp,
+             /*inputs=*/{input_dataset},
+             /*attrs=*/
+             {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+              {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+              {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+              {kAttrNameInterOpParallelism, false},
+              {kAttrNamePreserveCardinality, true}});
+  }
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddBatchNode(MutableGraphView* graph, const string& input_dataset,
+                      bool v2 = false, int64 batch_size = 10) {
+  NodeDef result;
+  auto batch_size_node = graph_utils::AddScalarConstNode(batch_size, graph);
+
+  if (v2) {
+    // BatchDatasetV2
+    auto drop_remainder = graph_utils::AddScalarConstNode(true, graph);
+    result = NDef(
+        /*name=*/"batch", /*op=*/kBatchV2Op,
+        /*inputs=*/
+        {input_dataset, batch_size_node->name(), drop_remainder->name()},
+        /*attrs=*/
+        {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+         {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{10, 1}})}});
+  } else {
+    result =
+        NDef(/*name=*/"batch", /*op=*/kBatchOp,
+             /*inputs=*/{input_dataset, batch_size_node->name()},
+             /*attrs=*/
+             {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes,
+               gtl::ArraySlice<PartialTensorShape>({{v2 ? 10 : -1, 1}})}});
+  }
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddRangeNode(MutableGraphView* graph) {
+  auto start = graph_utils::AddScalarConstNode(static_cast<int64>(0), graph);
+  auto stop = graph_utils::AddScalarConstNode(static_cast<int64>(10), graph);
+  auto step = graph_utils::AddScalarConstNode(static_cast<int64>(1), graph);
+
+  NodeDef result =
+      NDef(/*name=*/"range", /*op=*/kRangeOp,
+           /*inputs=*/{start->name(), stop->name(), step->name()},
+           /*attrs=*/
+           {{kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+            {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})}});
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+void CheckNotVectorized(const GraphDef& output, const string& map_op,
+                        const string& batch_op, const string& map_input_name) {
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(map_op, output).size(), 1);
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(batch_op, output).size(), 1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp(map_op, output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp(batch_op, output));
+  EXPECT_EQ(map_node.input(0), map_input_name);
+  EXPECT_EQ(batch_node.input(0), map_node.name());
+}
+
+void CheckVectorized(const GraphDef& output, const string& map_op,
+                     const string& batch_op, const string& map_input_name) {
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(map_op, output).size(), 1);
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(batch_op, output).size(), 1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp(map_op, output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp(batch_op, output));
+  EXPECT_EQ(map_node.input(0), batch_node.name());
+  EXPECT_EQ(batch_node.input(0), map_input_name);
+
+  // Check that the function is actually vectorized.
+  // The vectorization of the identity function is itself.
+  string function_name = map_node.attr().at(kAttrNameF).func().name();
+  int found =
+      graph_utils::FindGraphFunctionWithName(function_name, output.library());
+  ASSERT_NE(found, -1);
+  const auto& function = output.library().function(found);
+  EXPECT_EQ(function.node_def(0).op(), "Identity");
+}
+
+class MapThenBatchTest
+    : public ::testing::TestWithParam<std::tuple<int, bool>> {};
+
+TEST_P(MapThenBatchTest, IsVectorized) {
+  int num_parallel_calls = std::get<0>(GetParam());
+  bool use_batch_v2 = std::get<1>(GetParam());
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node = AddMapNode(&graph, range_node->name(),
+                             map_fn->signature().name(), num_parallel_calls);
+  auto batch_node = AddBatchNode(&graph, map_node->name(), use_batch_v2);
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, map_node->op(), batch_node->op(), range_node->name());
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
-            1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+INSTANTIATE_TEST_SUITE_P(MapThenBatchTest, MapThenBatchTest,
+                         ::testing::Combine(::testing::Values(0, 12),
+                                            ::testing::Bool()));
+
+NodeDef* AddMapAndBatchNode(MutableGraphView* graph,
+                            const string& input_dataset, const string& map_fn,
+                            int64 batch_size = 10,
+                            int64 num_parallel_calls = 12) {
+  auto batch_size_node = graph_utils::AddScalarConstNode(batch_size, graph);
+  auto num_parallel_calls_node =
+      graph_utils::AddScalarConstNode(num_parallel_calls, graph);
+  auto drop_remainder = graph_utils::AddScalarConstNode(true, graph);
+
+  NodeDef result =
+      NDef(/*name=*/"map_and_batch",
+           /*op=*/kExperimentalMapAndBatchOp,
+           /*inputs=*/
+           {input_dataset, batch_size_node->name(),
+            num_parallel_calls_node->name(), drop_remainder->name()},
+           /*attrs=*/
+           {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+            {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+            {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+            {kAttrNameOutputShapes,
+             gtl::ArraySlice<PartialTensorShape>({{10, 1}})}});
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
 }
 
-TEST(MapVectorizationTest, VectorizeMapWithBatchV2) {
+TEST(MapVectorizationTest, VectorizeExperimentalMapAndBatch) {
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("drop_remainder", "Const", {},
-            {{"value", false}, {"dtype", DT_BOOL}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchV2Node("batch", "map", "batch_size", "drop_remainder", {{-1}},
-                       {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_and_batch_node = AddMapAndBatchNode(&graph, range_node->name(),
+                                               map_fn->signature().name());
+  ASSERT_NE(map_and_batch_node, nullptr);
+
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, kParallelMapOp, kBatchV2Op, "range");
+}
+
+void EvaluateNodes(const GraphDef& graph,
+                   const std::vector<string>& output_tensor_names,
+                   std::vector<Tensor>* output_tensors) {
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph));
+  TF_CHECK_OK(session->Run({}, output_tensor_names, {}, output_tensors));
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(
-      graph_utils::FindAllGraphNodesWithOp("BatchDatasetV2", output).size(), 1);
+void CheckNumParallelCalls(const GraphDef& output,
+                           int expected_num_parallel_calls) {
+  // Run the graph to see that the new num_parallel_calls is computed correctly.
   const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+      output.node(graph_utils::FindGraphNodeWithOp(kParallelMapOp, output));
+  const string& num_parallel_calls = map_node.input(1);
+  std::vector<Tensor> output_tensors;
+  EvaluateNodes(output, {num_parallel_calls}, &output_tensors);
+
+  test::ExpectTensorEqual<int>(
+      output_tensors.at(0),
+      Tensor(static_cast<int32>(expected_num_parallel_calls)));
+}
+
+struct TestStruct {
+  int original_num_parallel_calls;
+  int batch_size;
+  int expected_num_parallel_calls;
+};
+
+class NumParallelCallsTest : public ::testing::TestWithParam<TestStruct> {};
+
+TEST_P(NumParallelCallsTest, TestCorrectNumParallelCalls) {
+  auto params = GetParam();
+
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, range_node->name(), map_fn->signature().name(),
+                 params.original_num_parallel_calls);
+  auto batch_node = AddBatchNode(&graph, map_node->name(), /*v2=*/true,
+                                 /*batch_size=*/params.batch_size);
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, map_node->op(), batch_node->op(), range_node->name());
+
+  CheckNumParallelCalls(output, params.expected_num_parallel_calls);
 }
 
-TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShape) {
+TEST_P(NumParallelCallsTest, TestCorrectNumParallelCallsFused) {
+  auto params = GetParam();
+
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("input", "InputDataset", {},
-            {{"output_types", gtl::ArraySlice<DataType>({DT_INT32})}}),
-       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_and_batch_node =
+      AddMapAndBatchNode(&graph, range_node->name(), map_fn->signature().name(),
+                         params.batch_size, params.original_num_parallel_calls);
+  ASSERT_NE(map_and_batch_node, nullptr);
+
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, kParallelMapOp, kBatchV2Op, range_node->name());
+
+  CheckNumParallelCalls(output, params.expected_num_parallel_calls);
 }
 
-TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+INSTANTIATE_TEST_SUITE_P(
+    NumParallelCalls, NumParallelCallsTest,
+    ::testing::Values(TestStruct({1, 1, 1}), TestStruct({2, 10, 1}),
+                      TestStruct({4, 3, 2}), TestStruct({10, 1, 10}),
+                      TestStruct({-1, 1, -1}), TestStruct({-1, 10, -1})));
+
+class ChainedMapAndBatchTest
+    : public ::testing::TestWithParam<std::tuple<bool, bool>> {};
+
+// Tests:
+// 1) map.batch.map.batch
+// 2) map.batch.map_and_batch
+// 3) map_and_batch.map.batch
+// 4) map_and_batch.map_and_batch
+TEST_P(ChainedMapAndBatchTest, IsVectorized) {
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  auto input_node = AddRangeNode(&graph);
+
+  auto map_fn = AddMapFn(&graph);
+
+  auto make_map_and_batch = [&graph, map_fn](NodeDef* input, bool fuse) {
+    if (fuse) {
+      return AddMapAndBatchNode(&graph, input->name(),
+                                map_fn->signature().name());
+    }
+    auto map_node =
+        AddMapNode(&graph, input->name(), map_fn->signature().name(), true);
+    auto batch_node = AddBatchNode(&graph, map_node->name(), true);
+    return batch_node;
+  };
+
+  auto map_and_batch_0 =
+      make_map_and_batch(input_node, std::get<0>(GetParam()));
+  auto map_and_batch_1 =
+      make_map_and_batch(map_and_batch_0, std::get<1>(GetParam()));
+  ASSERT_NE(map_and_batch_1, nullptr);
+
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  TF_ASSERT_OK(TopologicalSort(&output));
+  std::vector<int> map_nodes =
+      graph_utils::FindAllGraphNodesWithOp(kParallelMapOp, output);
+  std::vector<int> batch_nodes =
+      graph_utils::FindAllGraphNodesWithOp(kBatchV2Op, output);
+  ASSERT_EQ(map_nodes.size(), 2);
+  ASSERT_EQ(batch_nodes.size(), 2);
+  const NodeDef& range_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kRangeOp, output));
+
+  const NodeDef& batch_node_0 = output.node(batch_nodes[0]);
+  EXPECT_EQ(batch_node_0.input(0), range_node.name());
+  const NodeDef& map_node_0 = output.node(map_nodes[0]);
+  EXPECT_EQ(map_node_0.input(0), batch_node_0.name());
+  const NodeDef& batch_node_1 = output.node(batch_nodes[1]);
+  EXPECT_EQ(batch_node_1.input(0), map_node_0.name());
+  const NodeDef& map_node_1 = output.node(map_nodes[1]);
+  EXPECT_EQ(map_node_1.input(0), batch_node_1.name());
+}
+
+INSTANTIATE_TEST_SUITE_P(ChainedMapAndBatchTest, ChainedMapAndBatchTest,
+                         ::testing::Combine(::testing::Bool(),
+                                            ::testing::Bool()));
+
+// Not all dataset types have "output_shapes" and "output_types"
+// attrs defined. Add a generic input node which may not have these attrs
+// defined.
+NodeDef* AddArbitraryInputNode(MutableGraphView* graph,
+                               std::vector<PartialTensorShape>* output_shapes,
+                               std::vector<DataType>* output_types) {
+  std::vector<std::pair<string, FunctionDefHelper::AttrValueWrapper>> attrs;
+  if (output_shapes) {
+    attrs.push_back({kAttrNameOutputShapes, *output_shapes});
+  }
+  if (output_types) {
+    attrs.push_back({kAttrNameOutputTypes, *output_types});
+  }
+
+  NodeDef result = NDef(/*name=*/"input", /*op=*/"InputDataset",
+                        /*inputs=*/{},
+                        /*attrs=*/attrs);
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShapes) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // doesn't have an output_shapes attr defined. In this case, the map and
+  // batch swap does not occur.
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("input", "InputDataset", {},
-            {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})}}),
-       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, nullptr, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
 }
 
-TEST(MapVectorizationTest, VectorizeWithFullyDefinedFunction) {
+TEST(MapVectorizationTest, VectorizeWithUnknownRank) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // has components with unknown rank. In this case, the optimization does not
+  // occur.
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "Func", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {FunctionDefHelper::Create(
-          "Func", {"x: int64", "y: int64"}, {"res: int64", "res2: int64"}, {},
-          {{{"o"}, "Mul", {"x", "x"}, {{"T", DT_INT64}}}},
-          {{"res", "o:z"}, {"res2", "o:z"}})});
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{}});
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
-            1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+TEST(MapVectorizationTest, VectorizeWithUnknownDim) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // has components with unknown dimensions. In this case, the optimization does
+  // not occur.
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{-1, 2}});
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
 }
 
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+  // Tests that the optimization doesn't break when the input doesn't have
+  // an output_types attr defined. The output_types of the input node, even
+  // if not present, can be inferred from the map function input signature.
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{1}});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, nullptr);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, map_node->op(), batch_node->op(), input_node->name());
+}
+
+// TODO(rachelim): Add test that has a polymorphic function.
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..584759f85d468157bbda142a5ebf654d264753cf
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/meta_optimizer.h"
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                     GraphDef* output) {
+  // Stores the optimized item so far.
+  GrapplerItem optimized_item = item;
+
+  // Perform optimizations in a meaningful order.
+  for (const auto& optimization :
+       {"noop_elimination",
+        "shuffle_and_repeat_fusion",
+        "map_fusion",
+        "filter_fusion",
+        "map_and_filter_fusion",
+        "hoist_random_uniform",
+        "map_parallelization",
+        "map_and_batch_fusion",
+        "map_vectorization",
+        "make_numa_aware",
+        "latency_all_edges",
+        "make_sloppy",
+        "pruning",
+        "function",
+        "shape",
+        "arithmetic",
+        "dependency"}) {
+    TF_RETURN_IF_ERROR(
+        ApplyOptimization(optimization, cluster, &optimized_item));
+  }
+
+  // Store the final result of all the optimizations in `output`.
+  output->Swap(&optimized_item.graph);
+  return Status::OK();
+}
+
+Status TFDataMetaOptimizer::ApplyOptimization(const string& name,
+                                              Cluster* cluster,
+                                              GrapplerItem* item) const {
+  GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
+
+  const auto* optimizer = gtl::FindOrNull(enabled_optimizers_, name);
+  if (!optimizer) {
+    return Status::OK();
+  }
+
+  GraphDef result;
+  (*optimizer)->set_deadline_usec(this->deadline_usec());
+  TF_RETURN_IF_ERROR((*optimizer)->Optimize(cluster, *item, &result));
+  item->graph.Swap(&result);
+
+  return Status::OK();
+}
+
+Status TFDataMetaOptimizer::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  if (!config) return Status::OK();
+
+  // Initialize custom tf.data optimizers based on config.
+  auto& optimizers = config->parameter_map().at("optimizers").list().s();
+  for (const auto& optimizer_name : optimizers) {
+    auto optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+    if (optimizer) {
+      // None of our data optimizers implement a meaningful Init function.
+      // This returns an error in case any of them does.
+      TF_RETURN_IF_ERROR(optimizer->Init());
+      enabled_optimizers_[optimizer_name] = std::move(optimizer);
+    } else {
+      // This should never happen.
+      return errors::Internal(
+          "Tried to register a dataset optimizer that doesn't exist: ",
+          optimizer_name);
+    }
+  }
+
+  // Initialize standard grappler optimizers.
+  enabled_optimizers_["pruning"] = MakeUnique<ModelPruner>();
+  enabled_optimizers_["function"] =
+      MakeUnique<FunctionOptimizer>(RewriterConfig::ON);
+  enabled_optimizers_["shape"] = MakeUnique<ShapeOptimizer>();
+  enabled_optimizers_["arithmetic"] = MakeUnique<ArithmeticOptimizer>();
+  enabled_optimizers_["dependency"] = MakeUnique<DependencyOptimizer>();
+
+  return Status::OK();
+}
+
+void TFDataMetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                   const GraphDef& optimize_output,
+                                   double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(TFDataMetaOptimizer, "tf_data_meta_optimizer");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.h b/tensorflow/core/grappler/optimizers/data/meta_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b65e7027777b165737b444106897c0bb97778450
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimizer performs tf.data-specific optimizations by invoking
+// other optimizers.
+class TFDataMetaOptimizer : public CustomGraphOptimizer {
+ public:
+  TFDataMetaOptimizer() = default;
+  ~TFDataMetaOptimizer() override = default;
+
+  string name() const override { return "tf_data_meta_optimizer"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  absl::flat_hash_map<string, std::unique_ptr<GraphOptimizer>>
+      enabled_optimizers_;
+
+  // Applies an optimization with the specified name on `item`, and stores
+  // the result in `item.graph`
+  Status ApplyOptimization(const string& name, Cluster* cluster,
+                           GrapplerItem* item) const;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index bd405c8329464793ee42757bc7ee1a3f34826bd9..851bbbdc1a28b91742bbfef3e98a4562b340a6c0 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/noop_elimination.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -70,21 +71,24 @@ bool IsNoOp(const NodeDef& node, const MutableGraphView& graph) {
 
 }  // namespace
 
-Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* output) {
+Status NoOpElimination::OptimizeAndCollectStats(Cluster* cluster,
+                                                const GrapplerItem& item,
+                                                GraphDef* output,
+                                                OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
     if (!IsNoOp(node, graph)) continue;
 
     NodeDef* const parent = graph_utils::GetInputNode(node, graph);
-    graph.UpdateFanouts(node.name(), parent->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), parent->name()));
 
     nodes_to_delete.insert(node.name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -95,5 +99,5 @@ void NoOpElimination::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(NoOpElimination, "noop_elimination");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.h b/tensorflow/core/grappler/optimizers/data/noop_elimination.h
index a65fccd882b782d4c6ead5ef9cb15e2cebd05e6f..11d86ad2a388da852cd4495b23277d6aecc143b6 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.h
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.h
@@ -16,14 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This class eliminates tf.data transformations such as `take(n)` (for n < 0),
 // `skip(0)`, `repeat(1)`, or `prefetch(0)`.
-class NoOpElimination : public CustomGraphOptimizer {
+class NoOpElimination : public TFDataOptimizerBase {
  public:
   NoOpElimination() = default;
   ~NoOpElimination() override = default;
@@ -35,14 +35,15 @@ class NoOpElimination : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/optimizer_base.cc b/tensorflow/core/grappler/optimizers/data/optimizer_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7fc0da357953906be87b02b2da10795b6e668cba
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/optimizer_base.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
+
+#include "tensorflow/core/common_runtime/metrics.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status TFDataOptimizerBase::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                     GraphDef* output) {
+  OptimizationStats stats;
+  Status s = OptimizeAndCollectStats(cluster, item, output, &stats);
+  if (s.ok() && stats.num_changes > 0) {
+    metrics::RecordTFDataOptimization(name(), stats.num_changes);
+  }
+  return s;
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/optimizer_base.h b/tensorflow/core/grappler/optimizers/data/optimizer_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..45af5a4b7d4dcea9f3a1d6e31a8f8f10880f9d0b
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/optimizer_base.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A base class for tf.data optimizers.
+class TFDataOptimizerBase : public CustomGraphOptimizer {
+ public:
+  struct OptimizationStats {
+    // Identifies the number of independent graph changes for an optimization.
+    int64 num_changes = 0;
+  };
+
+  TFDataOptimizerBase() = default;
+  ~TFDataOptimizerBase() override = default;
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) final;
+
+  virtual Status OptimizeAndCollectStats(Cluster* cluster,
+                                         const GrapplerItem& item,
+                                         GraphDef* output,
+                                         OptimizationStats* stats) = 0;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index d9af78d38cd590f5eecefe4d70c7e45dd94985c0..ff64ff1adbcd71d916a1bd6f842b9decc4a68d96 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -34,12 +35,12 @@ constexpr char kFusedOpName[] = "ShuffleAndRepeatDataset";
 
 }  // namespace
 
-Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
-                                        const GrapplerItem& item,
-                                        GraphDef* output) {
+Status ShuffleAndRepeatFusion::OptimizeAndCollectStats(
+    Cluster* cluster, const GrapplerItem& item, GraphDef* output,
+    OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
 
   auto make_shuffle_and_repeat_node = [&output](const NodeDef& shuffle_node,
                                                 const NodeDef& repeat_node) {
@@ -86,14 +87,16 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
 
     NodeDef* shuffle_and_repeat_node =
         graph.AddNode(make_shuffle_and_repeat_node(shuffle_node, repeat_node));
-    graph.UpdateFanouts(repeat_node.name(), shuffle_and_repeat_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(repeat_node.name(),
+                                           shuffle_and_repeat_node->name()));
 
     // Mark the `Shuffle` and `Repeat` nodes for removal.
     nodes_to_delete.insert(shuffle_node.name());
     nodes_to_delete.insert(repeat_node.name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -107,5 +110,5 @@ void ShuffleAndRepeatFusion::Feedback(Cluster* cluster,
 REGISTER_GRAPH_OPTIMIZER_AS(ShuffleAndRepeatFusion,
                             "shuffle_and_repeat_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
index c8fa53edce38531671aa481c1dffbc5b8a28046b..3738d141c3a582fb9b214686a58e36e8869cea4e 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class ShuffleAndRepeatFusion : public CustomGraphOptimizer {
+class ShuffleAndRepeatFusion : public TFDataOptimizerBase {
  public:
   ShuffleAndRepeatFusion() = default;
   ~ShuffleAndRepeatFusion() override = default;
@@ -33,14 +33,15 @@ class ShuffleAndRepeatFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index 60c557d557e31173135cf9639efbf345a586faa1..c57a7b125693af2b53d52e772bb4264bfbe00b23 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -46,7 +46,7 @@ namespace {
 // Describes a tensor with its operation Node and output position
 typedef std::pair<Node*, int> TensorDesc;
 
-const char* const kRetValOp = "_Retval";
+constexpr char kRetValOp[] = "_Retval";
 
 void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
                         Graph* graph) {
@@ -643,6 +643,6 @@ Status VectorizeMapDefun(const FunctionDef& outer_scope,
   return Vectorization(lib).Vectorize(outer_scope, map_defun_node, result);
 }
 
-}  // end namespace vectorization_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace vectorization_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.h b/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
index bd7d3909003d0b32938d939fbf87b809b4aed0dd..f5183fd4ff905baf3ba52dc1a1bae53928603657 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
@@ -90,8 +90,8 @@ Status VectorizeMapDefun(const FunctionDef& outer_scope,
                          const NodeDef& map_defun_node, FunctionDefLibrary* lib,
                          FunctionDef** result);
 
-}  // end namespace vectorization_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace vectorization_utils
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 7fee3ae9d51bcdb234945a6000985fb5531000a0..8b81cb2430ca9a34926217312f2894cf283c1dd2 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -205,14 +205,6 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
     num_cross_out += static_cast<int>(output_node->device() != node_dev);
   }
 
-  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
-      num_cross_out > 0) {
-    // This identity node follows a device crossing, so it might be
-    // following a _Recv node after partioning. Do not remove such nodes,
-    // unless they only have consumers on the same device as themselves.
-    return false;
-  }
-
   // Make sure we do not increase the number of device crossings.
   const int num_cross_before = num_cross_in + num_cross_out;
   int num_cross_after = 0;
@@ -225,6 +217,15 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
   if (num_cross_after > num_cross_before) {
     return false;
   }
+
+  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
+      num_cross_out > 0 && num_cross_after > 0) {
+    // This identity node follows a device crossing, so it might be
+    // following a _Recv node after partioning. Do not remove such nodes,
+    // unless they only have consumers on the same device as themselves.
+    return false;
+  }
+
   return true;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 8d70d9d5c73690e87d84cf941c749948e47ace26..5883fcb92681f13c0f1d7f4d623b409274d6f962 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -356,6 +356,32 @@ TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentityOps_IdenticalDevices) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  auto id_a = ops::Identity(s.WithOpName("id_a").WithDevice("/CPU:1"), x);
+  Output id =
+      ops::Identity(s.WithControlDependencies(id_a).WithDevice("/CPU:0"), id_a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.name(), "id_a");
+    if (node.name() == "Identity") {
+      EXPECT_EQ(node.input(0), "x");
+    }
+  }
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 4ec68c7543c998f3551c374056efb8092d200133..6913f9b55b9bbd74d93d3e9763d0136b6a326507 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 
-#include <unordered_map>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
 #include "absl/strings/substitute.h"
@@ -38,11 +39,13 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace tensorflow {
@@ -65,6 +68,11 @@ constexpr char kFuncAttrName[] = "f";
 
 constexpr char kNoInlineAttr[] = "_noinline";
 
+// Names of the nodes that used to anchor incoming/outgoing control edges for
+// inlined function calls (see InlineIndirectFunctionCall).
+constexpr char kControlInputNodeName[] = "control_input";
+constexpr char kControlOutputNodeName[] = "control_output";
+
 bool AttrIsTrue(const FunctionDef& func, const string& attr) {
   return func.attr().count(attr) != 0 && func.attr().at(attr).b();
 }
@@ -163,10 +171,10 @@ struct FunctionSpecializationSignature {
 
   string func_name;
   bool is_in_fetch_set;
-  gtl::FlatSet<OutputPort> active_outputs;
-  std::unordered_map<string, DataType> type_parameters;
-  std::unordered_map<string, AttrValue> body_parameters;
-  std::unordered_map<InputPort, string> const_inputs;
+  absl::flat_hash_set<OutputPort> active_outputs;
+  absl::flat_hash_map<string, DataType> type_parameters;
+  absl::flat_hash_map<string, AttrValue> body_parameters;
+  absl::flat_hash_map<InputPort, string> const_inputs;
 
   bool operator==(const FunctionSpecializationSignature& other) const {
     bool equals = func_name == other.func_name &&
@@ -189,48 +197,45 @@ struct FunctionSpecializationSignature {
     return true;
   }
 
-  // TODO(ezhulenev): Migrate to AbslHashValue.
-  // TODO(ezhulenev): Optimize performance by computing hashes of unordered
-  // values first, and then compute a hash of sorted hashes.
-  struct Hash {
-    uint64 operator()(FunctionSpecializationSignature const& s) const {
-      uint64 h = Hash64(s.func_name);
-      h = Hash64Combine(std::hash<bool>()(s.is_in_fetch_set), h);
-
-      // Use std::set/std::map for deterministic iteration order.
-
-      std::set<OutputPort> active_outputs(s.active_outputs.begin(),
-                                          s.active_outputs.end());
-      for (const auto& active_output : active_outputs) {
-        h = Hash64Combine(std::hash<int>()(active_output), h);
-      }
-
-      std::map<string, DataType> types(s.type_parameters.begin(),
-                                       s.type_parameters.end());
-      for (const auto& pair : types) {
-        AttrValue attr_value;
-        attr_value.set_type(pair.second);
-        h = Hash64Combine(Hash64(pair.first), h);
-        h = Hash64Combine(AttrValueHash(attr_value), h);
-      }
-
-      std::map<string, AttrValue> body(s.body_parameters.begin(),
-                                       s.body_parameters.end());
-      for (const auto& pair : body) {
-        h = Hash64Combine(Hash64(pair.first), h);
-        h = Hash64Combine(FastAttrValueHash(pair.second), h);
-      }
-
-      std::map<InputPort, string> inputs(s.const_inputs.begin(),
-                                         s.const_inputs.end());
-      for (const auto& pair : inputs) {
-        h = Hash64Combine(std::hash<int>()(pair.first), h);
-        h = Hash64Combine(Hash64(pair.second), h);
-      }
-
-      return h;
-    }
-  };
+  template <typename H>
+  friend H AbslHashValue(H h, const FunctionSpecializationSignature& s) {
+    H base = H::combine(std::move(h), s.func_name, s.is_in_fetch_set);
+
+    // First pre-compute hashes for all values in collections with
+    // non-deterministic iteration order.
+    std::vector<uint64> hashes;
+    hashes.reserve(s.active_outputs.size()         //
+                   + s.type_parameters.size() * 2  //
+                   + s.body_parameters.size() * 2  //
+                   + s.const_inputs.size() * 2);
+
+    absl::c_transform(s.active_outputs, std::back_inserter(hashes),
+                      hash<OutputPort>());
+
+    using TypeParam = std::pair<const string, DataType>;
+    absl::c_for_each(s.type_parameters, [&hashes](const TypeParam& type_param) {
+      AttrValue attr_value;
+      attr_value.set_type(type_param.second);
+      hashes.push_back(Hash64(type_param.first));
+      hashes.push_back(AttrValueHash(attr_value));
+    });
+
+    using BodyParam = std::pair<const string, AttrValue>;
+    absl::c_for_each(s.body_parameters, [&hashes](const BodyParam& body_param) {
+      hashes.push_back(Hash64(body_param.first));
+      hashes.push_back(FastAttrValueHash(body_param.second));
+    });
+
+    using ConstInput = std::pair<const InputPort, string>;
+    absl::c_for_each(s.const_inputs, [&hashes](const ConstInput& const_input) {
+      hashes.push_back(hash<InputPort>()(const_input.first));
+      hashes.push_back(Hash64(const_input.second));
+    });
+
+    // Combine all pre-computed hashes in a deterministic order.
+    absl::c_sort(hashes);
+    return H::combine_contiguous(std::move(base), hashes.data(), hashes.size());
+  }
 };
 
 struct FunctionSpecialization {
@@ -238,39 +243,39 @@ struct FunctionSpecialization {
   // True if the function caller node is in GrapplerItem fetch set.
   bool is_in_fetch_set;
   // Names of the tensors that were pushed down into the function body.
-  gtl::FlatSet<string> const_inputs;
+  absl::flat_hash_set<string> const_inputs;
   // Control dependencies of pushed down const inputs have to be attached to
   // function caller node.
-  gtl::FlatSet<string> control_deps;
+  absl::flat_hash_set<string> control_deps;
   // Output tensors (ports) that consumed by other nodes in the graph or in a
   // GrapplerItem fetch set.
-  gtl::FlatSet<int> active_outputs;
+  absl::flat_hash_set<int> active_outputs;
   // Mapping from original function output port to the output port of
   // specialized function. If function specialization changes the number of
   // function outputs it's required to update all node consumers.
   std::vector<std::pair<int, int>> output_mapping;
 };
 
+// Function optimizer context initialized once for each optimization pass, and
+// it uses the latest available graph (for the first iteration it will be the
+// GrapplerItem.graph, for next iterations it will be the output of previous
+// function optimizer pass).
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
-                                    const GrapplerItem& item)
-      : grappler_item_id_(item.id),
-        graph_version_(item.graph.versions().producer()),
+  explicit FunctionOptimizerContext(const GrapplerItem& item,
+                                    RewriterConfig::Toggle opt_level,
+                                    const GraphDef& graph)
+      : item_(&item),
         opt_level_(opt_level),
-        allowed_optimizations_(item.allowed_optimizations()),
-        function_library_(OpRegistry::Global(), item.graph.library()),
-        available_device_names_(item.devices().begin(), item.devices().end()),
-        graph_view_(&item.graph) {
-    InitializeTrulyConstNodes(item);
-    InitializeFetchNodes(item);
-  }
+        function_library_(OpRegistry::Global(), graph.library()),
+        truly_const_nodes_(InferTrulyConstNodes(item, graph)),
+        graph_view_(&graph) {}
 
-  const RewriterConfig::Toggle opt_level() const { return opt_level_; }
+  const GrapplerItem& item() const { return *item_; }
 
-  const GrapplerItem::AllowedOptimizations& allowed_optimizations() const {
-    return allowed_optimizations_;
-  }
+  const int graph_version() const { return item_->graph.versions().producer(); }
+
+  RewriterConfig::Toggle opt_level() const { return opt_level_; }
 
   const FunctionLibraryDefinition& function_library() const {
     return function_library_;
@@ -285,25 +290,22 @@ class FunctionOptimizerContext {
     return flr_;
   }
 
-  const gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
+  const absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
   tensor_mapping() const {
     return tensor_mapping_;
   }
 
-  const gtl::FlatMap<string, std::vector<string>>& control_overrides() const {
+  const absl::flat_hash_map<string, std::vector<string>>& control_overrides()
+      const {
     return control_overrides_;
   }
 
   const GraphView& graph_view() const { return graph_view_; }
 
-  const string& grappler_item_id() const { return grappler_item_id_; }
-
-  const gtl::FlatSet<string>& fetch_tensors() const { return fetch_tensors_; }
-
   const DeviceSet* devices() const {
     // Create fake devices lazily only if we need a DeviceSet.
-    if (available_devices_.empty() && !available_device_names_.empty()) {
-      for (const string& name : available_device_names_) {
+    if (available_devices_.empty() && !item_->devices().empty()) {
+      for (const string& name : item_->devices()) {
         auto device = absl::make_unique<FakeDevice>(name);
         available_device_set_.AddDevice(device.get());
         available_devices_.push_back(std::move(device));
@@ -313,7 +315,9 @@ class FunctionOptimizerContext {
   }
 
   bool IsFetchNode(const string& node_name) const {
-    return fetch_nodes_.find(node_name) != fetch_nodes_.end();
+    return absl::c_any_of(item_->fetch, [&](const string& fetch) {
+      return ParseTensorName(fetch).node() == node_name;
+    });
   }
 
   bool IsTrulyConst(const string& name) const {
@@ -335,6 +339,11 @@ class FunctionOptimizerContext {
   }
 
   void AddTensorMapping(const SafeTensorId& from, const SafeTensorId& to) {
+    DCHECK(from.index() != Graph::kControlSlot)
+        << "Tensor mapping must be from regular tensor";
+    DCHECK(to.index() != Graph::kControlSlot)
+        << "Tensor mapping must be to regular tensor";
+
     auto inserted = tensor_mapping_.insert({from, to});
     DCHECK(inserted.second)
         << "Failed to insert duplicated tensor mapping: "
@@ -349,8 +358,7 @@ class FunctionOptimizerContext {
       if (from_idx != to_idx) {
         SafeTensorId from_tensor(func_node, from_idx);
         SafeTensorId to_tensor(func_node, to_idx);
-        auto inserted = tensor_mapping_.insert({from_tensor, to_tensor});
-        DCHECK(inserted.second);
+        AddTensorMapping(from_tensor, to_tensor);
       }
     }
   }
@@ -364,24 +372,21 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeTrulyConstNodes(const GrapplerItem& item) {
-    gtl::FlatSet<string> feed_nodes;
+  static absl::flat_hash_map<string, const NodeDef*> InferTrulyConstNodes(
+      const GrapplerItem& item, const GraphDef& graph) {
+    absl::flat_hash_set<absl::string_view> feed_nodes;
     for (const auto& feed : item.feed) {
-      feed_nodes.insert(NodeName(feed.first));
+      feed_nodes.insert(feed.first);
     }
 
-    for (const NodeDef& node : item.graph.node()) {
-      if (IsConstant(node) && feed_nodes.count(node.name()) == 0) {
-        truly_const_nodes_[node.name()] = &node;
+    absl::flat_hash_map<string, const NodeDef*> const_nodes;
+    for (const NodeDef& node : graph.node()) {
+      if (IsConstant(node) && !feed_nodes.contains(node.name())) {
+        const_nodes[node.name()] = &node;
       }
     }
-  }
 
-  void InitializeFetchNodes(const GrapplerItem& item) {
-    for (const string& fetch : item.fetch) {
-      fetch_tensors_.insert(fetch);
-      fetch_nodes_.insert(NodeName(fetch));
-    }
+    return const_nodes;
   }
 
   void InitializeFunctionLibraryRuntime() {
@@ -393,16 +398,16 @@ class FunctionOptimizerContext {
       OptimizerOptions optimizer_opts;
       optimizer_opts.set_do_function_inlining(true);
       process_flr_.reset(new ProcessFunctionLibraryRuntime(
-          device_mgr_.get(), env, graph_version_, &function_library_,
-          optimizer_opts));
+          device_mgr_.get(), env, item_->graph.versions().producer(),
+          &function_library_, optimizer_opts));
       flr_ = process_flr_->GetFLR(device_mgr_->ListDevices()[0]->name());
     }
   }
 
-  const string grappler_item_id_;
-  const int graph_version_;
-  const RewriterConfig::Toggle opt_level_;
-  const GrapplerItem::AllowedOptimizations allowed_optimizations_;
+  const GrapplerItem* item_;  // must outlive this object
+  RewriterConfig::Toggle opt_level_;
+
+  // Function library constructed from current graph.
   FunctionLibraryDefinition function_library_;
 
   // These fields initialized lazily only if needed.
@@ -410,28 +415,20 @@ class FunctionOptimizerContext {
   std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
   FunctionLibraryRuntime* flr_ = nullptr;
 
-  // Fully defined names of the devices available to the GrapplerItem.
-  const gtl::FlatSet<string> available_device_names_;
-
   // List of available `FakedDevices` (lazily initialized, see devices()).
   mutable std::vector<std::unique_ptr<Device>> available_devices_;
 
   // DeviceSet of fake devices (`FakeDevice`) constructed from
-  // available_devices_ (lazily initialized).
+  // item_.devices() (lazily initialized).
   mutable DeviceSet available_device_set_;
 
   // Nodes that are Const and not in feed.
-  std::unordered_map<string, const NodeDef*> truly_const_nodes_;
+  absl::flat_hash_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
-  std::unordered_map<FunctionSpecializationSignature,
-                     const FunctionSpecialization,
-                     FunctionSpecializationSignature::Hash>
+  absl::flat_hash_map<FunctionSpecializationSignature,
+                      const FunctionSpecialization>
       specialized_functions_;
 
-  // GrapplerItem.fetch is a vector of tensors.
-  gtl::FlatSet<string> fetch_tensors_;  // format: node_name:port
-  gtl::FlatSet<string> fetch_nodes_;    // format: node_name
-
   // After function inlining and specialization, the optimized graph might be in
   // invalid state, nodes can read from non-existing function call nodes that
   // were inlined, or they can read from output index that is no longer valid
@@ -439,7 +436,7 @@ class FunctionOptimizerContext {
   //
   // Tensor mapping that has to be applied to the graph after all functions
   // optimizations (invalidated tensor id -> optimized graph tensor id).
-  gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
+  absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
       tensor_mapping_;
 
   // When we inline a function into the optimized graph, we no longer have the
@@ -448,7 +445,7 @@ class FunctionOptimizerContext {
   // to all side-effectful ops inside the function body.
   //
   // Invalidated function call node name -> Inlined side-effectful nodes
-  gtl::FlatMap<string, std::vector<string>> control_overrides_;
+  absl::flat_hash_map<string, std::vector<string>> control_overrides_;
 
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
@@ -472,10 +469,10 @@ const FunctionDef* FindFunctionCall(const FunctionOptimizerContext& ctx,
   return ctx.function_library().Find(node.op());
 }
 
-gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
-                                   const FunctionOptimizerContext& ctx,
-                                   int size_hint = 0) {
-  gtl::FlatSet<int> active_outputs;
+absl::flat_hash_set<int> GetActiveOutputs(const NodeDef& node,
+                                          const FunctionOptimizerContext& ctx,
+                                          int size_hint = 0) {
+  absl::flat_hash_set<int> active_outputs;
   active_outputs.reserve(static_cast<size_t>(size_hint));
 
   // 1. Output can be consumed by the other graph node.
@@ -486,9 +483,11 @@ gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
   }
 
   // 2. Or it can be in a fetch set.
-  for (const string& fetch_tensor : ctx.fetch_tensors()) {
-    int port = NodePositionIfSameNode(fetch_tensor, node.name());
-    if (port >= 0) active_outputs.insert(port);
+  for (const string& fetch : ctx.item().fetch) {
+    TensorId fetch_tensor = ParseTensorName(fetch);
+    if (fetch_tensor.node() == node.name()) {
+      active_outputs.insert(fetch_tensor.index());
+    }
   }
 
   return active_outputs;
@@ -508,7 +507,7 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
   // number of output args is the same as number of possible function caller
   // node outputs.
   int num_outputs = func.signature().output_arg_size();
-  const gtl::FlatSet<int> active_outputs =
+  const absl::flat_hash_set<int> active_outputs =
       GetActiveOutputs(func_node, ctx, /*size_hind*/ num_outputs);
 
   return active_outputs.size() != num_outputs;
@@ -519,7 +518,7 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
 FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
                                         const GraphDef& optimized_graph) {
   FunctionLibraryDefinition pruned_flib =
-      ReachableFunctionLibraryDefinition(flib, optimized_graph);
+      flib.ReachableDefinitions(optimized_graph);
 
   int pruned_functions = static_cast<int>(pruned_flib.num_functions()) -
                          static_cast<int>(flib.num_functions());
@@ -534,8 +533,8 @@ FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
 Status PushDownConstInputs(const NodeDef& func_node,
                            const FunctionOptimizerContext& ctx,
                            GrapplerFunctionItem* item,
-                           gtl::FlatSet<string>* const_inputs,
-                           gtl::FlatSet<string>* control_deps) {
+                           absl::flat_hash_set<string>* const_inputs,
+                           absl::flat_hash_set<string>* control_deps) {
   // Record node control dependencies in the control_deps set.
   const auto record_control_deps = [&](const NodeDef* const_input) {
     for (int i = const_input->input_size() - 1; i >= 0; --i) {
@@ -585,7 +584,7 @@ void RemovePushedDownConstInputs(const FunctionSpecialization& specialization,
 
   // Attach control dependencies of pushed down const input to the caller node.
   if (!specialization.control_deps.empty()) {
-    gtl::FlatSet<string> existing_control_deps;
+    absl::flat_hash_set<string> existing_control_deps;
 
     for (const string& input : keep_inputs) {
       existing_control_deps.insert(AsControlDependency(NodeName(input)));
@@ -746,14 +745,12 @@ Status InitializeFunctionSpecializationSignature(
 string SpecializedFunctionName(const FunctionOptimizerContext& ctx,
                                const FunctionDef& func,
                                const NodeDef& func_node) {
-  return absl::Substitute("$0_specialized_for_$1_at_$2",
-                          func.signature().name(),
-                          absl::StrReplaceAll(func_node.name(), {{"/", "_"}}),
-                          ctx.grappler_item_id());
+  return absl::Substitute(
+      "$0_specialized_for_$1_at_$2", func.signature().name(),
+      absl::StrReplaceAll(func_node.name(), {{"/", "_"}}), ctx.item().id);
 }
 
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
-                          const int graph_def_version,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
   VLOG(2) << "Specialize function call: " << SummarizeNodeDef(func_node);
@@ -792,13 +789,13 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Make a GrapplerFunctionItem and convert it back to FunctionDef after
   // pushing all constant inputs into the function body.
   GrapplerFunctionItem item;
-  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_instantiation_attr,
-                                              flib, graph_def_version, &item));
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+      func, func_instantiation_attr, flib, ctx->graph_version(), &item));
 
   // Push const inputs into the function body, and keep track of their control
   // dependencies.
-  gtl::FlatSet<string> const_inputs;
-  gtl::FlatSet<string> control_deps;
+  absl::flat_hash_set<string> const_inputs;
+  absl::flat_hash_set<string> control_deps;
   TF_RETURN_IF_ERROR(PushDownConstInputs(func_node, *ctx, &item, &const_inputs,
                                          &control_deps));
 
@@ -806,8 +803,17 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // update outputs for the fetch nodes, so we just skip them.
   std::vector<std::pair<int, int>> output_mapping;
   if (!signature.is_in_fetch_set) {
-    TF_RETURN_IF_ERROR(
-        RemoveUnusedOutputs(signature.active_outputs, &item, &output_mapping));
+    int num_func_outputs = 0;
+    for (const auto& out_arg : item.outputs()) {
+      num_func_outputs += out_arg.output_nodes.size();
+    }
+
+    absl::flat_hash_set<int> remove;
+    for (int i = 0; i < num_func_outputs; ++i) {
+      if (!signature.active_outputs.count(i)) remove.insert(i);
+    }
+
+    TF_RETURN_IF_ERROR(RemoveFunctionOutputs(remove, &item, &output_mapping));
   }
 
   // TODO(ezhulenev): Push down known input shapes.
@@ -962,8 +968,10 @@ NodeDef InlinedFunctionInputsNode(const NodeDef& func_node,
 
 // Create an IdentityN node to hook the function outputs to: this ensures that
 // the function body is fully evaluated before its fanout gets scheduled.
-NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
-                                   const GrapplerFunctionItem& item) {
+NodeDef InlinedFunctionOutputsNode(
+    const NodeDef& func_node, const GrapplerFunctionItem& item,
+    const absl::flat_hash_map<absl::string_view, absl::string_view>
+        output_tensors) {
   NodeDef outputs;
   outputs.set_name(func_node.name());
   outputs.set_op("IdentityN");
@@ -972,7 +980,8 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
       (*outputs.mutable_attr())["T"].mutable_list();
 
   for (const OutputArgExpansion& output_arg : item.outputs()) {
-    for (const string& output_tensor : output_arg.output_tensors) {
+    for (const string& output_node : output_arg.output_nodes) {
+      const absl::string_view output_tensor = output_tensors.at(output_node);
       type_list->add_type(output_arg.data_type);
       outputs.add_input(strings::StrCat(func_node.name(), "/", output_tensor));
     }
@@ -983,7 +992,6 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
 
 Status InlineDirectFunctionCall(const NodeDef& func_node,
                                 const FunctionDef& func,
-                                const int graph_def_version,
                                 const FunctionOptimizerContext& ctx,
                                 GraphDef* optimized_graph) {
   VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
@@ -995,7 +1003,7 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
   GrapplerFunctionItem item;
   Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
                                                 ctx.function_library(),
-                                                graph_def_version, &item);
+                                                ctx.graph_version(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -1004,29 +1012,51 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
   }
 
   // Mapping from input placeholder name to function input position.
-  int idx = 0;
-  std::unordered_map<string, int> input_placeholders_idx;
+  absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
     for (const string& placeholder : input_arg.placeholders) {
-      input_placeholders_idx[placeholder] = idx++;
+      const int idx = input_placeholders_idx.size();
+      input_placeholders_idx[placeholder] = idx;
+    }
+  }
+
+  // Bypass identity nodes added to the graph in place of function outputs.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
     }
   }
 
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<absl::string_view, absl::string_view> output_tensors;
+
   // Hook inlined function inputs to IdentityN node.
   NodeDef* func_inputs = optimized_graph->add_node();
   *func_inputs = InlinedFunctionInputsNode(func_node, item);
 
   for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      // Turn input placeholders into identity nodes.
+    const string& node_name = func_body_node.name();
+
+    // Skip output identity node, and update a mapping to the output tensor.
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      output_tensors.emplace(node_name, func_body_node.input(0));
+      continue;
+    }
+
+    // Turn placeholders added in place of input arguments into identity nodes.
+    const auto input_placeholder_idx = input_placeholders_idx.find(node_name);
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
       CHECK_EQ(0, func_body_node.input_size());
       func_body_node.set_op("Identity");
       (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
       func_body_node.mutable_attr()->erase("dtype");
       func_body_node.mutable_attr()->erase("shape");
-      int input_idx = input_placeholders_idx[func_body_node.name()];
-      func_body_node.add_input(
-          strings::StrCat(func_inputs->name(), ":", input_idx));
+      func_body_node.add_input(strings::StrCat(func_inputs->name(), ":",
+                                               input_placeholder_idx->second));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
@@ -1050,41 +1080,16 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
     // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Move the function body node to the optimized graph.
-    const auto move_node_to_optimized_graph = [&]() {
-      // Annotate the node with the function attributes.
-      for (const auto& attr : func.attr()) {
-        func_body_node.mutable_attr()->insert(attr);
-      }
-      // Move the node to the main graph.
-      optimized_graph->add_node()->Swap(&func_body_node);
-    };
-
-    // Check if a body node is itself a function call and can be inlined.
-    const FunctionDef* func_body_node_func =
-        FindFunctionCall(ctx, func_body_node);
-
-    if (func_body_node_func != nullptr) {
-      Status inlinable = IsInlinableDirectFunctionCall(
-          ctx, *func_body_node_func, func_body_node);
-      if (inlinable.ok()) {
-        TF_RETURN_IF_ERROR(
-            InlineDirectFunctionCall(func_body_node, *func_body_node_func,
-                                     graph_def_version, ctx, optimized_graph));
-      } else {
-        VLOG(2) << "Can't inline nested direct function call: "
-                << inlinable.error_message();
-        move_node_to_optimized_graph();
-      }
-
-    } else {
-      move_node_to_optimized_graph();
-    }
+    // Move the node to the main graph.
+    optimized_graph->add_node()->Swap(&func_body_node);
   }
 
+  DCHECK(output_tensors.size() == item.output_size())
+      << "Each function output must be mapped to an output tensor";
+
   // Hook inlined function outputs to IdentityN node.
   NodeDef* func_outputs = optimized_graph->add_node();
-  *func_outputs = InlinedFunctionOutputsNode(func_node, item);
+  *func_outputs = InlinedFunctionOutputsNode(func_node, item, output_tensors);
 
   return Status::OK();
 }
@@ -1134,12 +1139,35 @@ Status InlineSymbolicGradient(const NodeDef& node,
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
 
-  // Recursively inline the functions until there is nothing more to inline. We
-  // should at least expand one function.
-  int counter = 0;
-  while (counter < 50 && ExpandInlineFunctions(
-                             ctx->mutable_function_library_runtime(), &graph)) {
-    ++counter;
+  FunctionLibraryRuntime* flr = ctx->mutable_function_library_runtime();
+
+  // 1. Inline symbolic gradient node.
+  const bool expanded = ExpandInlineFunctions(flr, &graph);
+  DCHECK(expanded) << "Didn't expand SymbolicGradient op";
+
+  // TODO(ezhulenev): InlineFunctionBody in common_runtime/function silently
+  // fails to inline function into the graph, and leaves the graph unmodified.
+  // We check that graph has our symbolic gradient inlined, otherwise we return
+  // a error.
+  const auto is_symbolic_gradient_op = [&](const Node* node) {
+    return node->name() == inlined->name() &&
+           node->type_string() == "SymbolicGradient";
+  };
+  for (Node* node : graph.nodes()) {
+    if (is_symbolic_gradient_op(node)) {
+      return errors::Internal("Failed to inline symbolic gradient node: ",
+                              SummarizeNode(*node));
+    }
+  }
+
+  // 2. Recursively inline nested function calls.
+  int iteration = 0;
+  while (ExpandInlineFunctions(flr, &graph)) {
+    if (++iteration >= 50) {
+      VLOG(2) << "Break symbolic gradient inlining loop at iteration #"
+              << iteration;
+      break;
+    }
   }
 
   GraphDef inlined_graph_def;
@@ -1196,12 +1224,26 @@ Status InlineSymbolicGradient(const NodeDef& node,
 // dependency tracking via input/output control edges, and we relax some of the
 // constraints that we have for direct function call inlining.
 //
-// "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data type)
-// input argument it "captures" the mutable resource.  This is implemented by
-// automatically adding a incoming control edge from the previous side-effectful
-// op touching that resource, and an outgoing control edge to the next
-// side-effectful op using the same resource. This serializes the mutations of
-// the resource to make graph execution deterministic.
+// Automatic control dependency rules:
+//
+// 1) "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data
+//    type) input argument it "captures" the mutable resource.  This is
+//    implemented by automatically adding a incoming control edge from the
+//    previous side-effectful op touching that resource, and an outgoing control
+//    edge to the next side-effectful op using the same resource. This
+//    serializes the mutations of the resource to make graph execution
+//    deterministic.
+//
+// 2) All stateful ops inside a function body are guaranteed to execute in
+//    program order, this is achieved by adding control edges between stateful
+//    ops at graph construction time.
+//
+// 3) Furthermore, all ops accepting the same resource as an input are
+//    guaranteed to run in program order. This is also done by adding control
+//    edges at graph construction time. The last op touching the resource
+//    will have an outgoing control edge to all function return nodes, which
+//    will guarantee that all side effects to the resource will happen before
+//    function completion.
 //
 // Function call inlining must preserve side effect visibility:
 //
@@ -1210,17 +1252,99 @@ Status InlineSymbolicGradient(const NodeDef& node,
 // 2) All side effects to the captured resources, that happened inside function
 //    body, must be visible to every op/function using that resource after the
 //    function call completed.
-
-// To guarantee that these properties are preserved after inlining we do:
 //
-// 1) Forward all input control dependencies from the function call node to the
-//    inlined function inputs (Identity nodes).
-// 2) Each side-effectful op inside function body adds itself as a control
-//    dependency to all the nodes in output control set of function call node.
+// To guarantee that these properties are preserved after inlining we:
+//
+// 1) Create "input_control" NoOp. Function call node incoming control edges
+//    will be forwarded *to* this node. Function inputs (Identity nodes) will
+//    have a control edge *from* this node. If function has no inputs, by
+//    construction it must have nodes without inputs in the function body, and
+//    in this case these nodes will have a control edge *from* this node.
+
+// 2) Create "output_control" NoOp. All nodes that have incoming control edge
+//    *from* the function call node, will be forwarded to this node. Function
+//    outputs (Identity nodes) will have a control edge *to* this node. This
+//    will guarantee that nodes that have control dependency on the function
+//    call, will observe all side-effects (guaranteed by graph construction with
+//    automatic control dependencies tracking).
+//
+// If after function instantiation we find a stateful or a dataset op inside
+// the function body, that is not reachable from any of the function outputs (or
+// if the function has no outputs), we do not inline it, because we can't
+// guarantee that these nodes will be executed in correct order (or executed at
+// all) after inlining.
 //
-// We do not add any other control dependencies to/from function body nodes,
-// because they are pure functions of input tensors, and can be freely
-// reordered.
+// We do not try to add any extra control edges to make sure that all
+// side-effectful nodes will be executed, that should be handled at graph
+// construction time.
+
+struct MaybeDeadOutput {
+  const NodeDef* dead_tensor_src;
+  const NodeDef* output_node_dst;
+};
+
+// Finds all function outputs that might return a dead tensor. This can happen
+// if there is no `Merge` node on the path from the `Switch` node, to the
+// function output.
+Status MaybeDeadOutputs(const FunctionOptimizerContext& ctx,
+                        const GrapplerFunctionItem& item,
+                        std::vector<MaybeDeadOutput>* maybe_dead) {
+  DCHECK(maybe_dead->empty()) << "Input argument must be an empty vector";
+
+  std::vector<const NodeDef*> dead_tensor_srcs;
+  for (const NodeDef& node : item.graph.node()) {
+    if (IsSwitch(node)) {
+      dead_tensor_srcs.push_back(&node);
+      continue;
+    }
+
+    // Regular (aka 'direct') function call can also produce dead tensors if
+    // the function body has mergeless switches.
+    const FunctionDef* func = ctx.function_library().Find(node.op());
+    if (func != nullptr) {
+      GrapplerFunctionItem func_item;
+      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+          *func, FunctionInstantiationAttributes(*func, node),
+          ctx.function_library(), ctx.graph_version(), &func_item));
+
+      std::vector<MaybeDeadOutput> func_dead_outputs;
+      TF_RETURN_IF_ERROR(MaybeDeadOutputs(ctx, func_item, &func_dead_outputs));
+
+      if (!func_dead_outputs.empty()) dead_tensor_srcs.push_back(&node);
+    }
+  }
+
+  // If we do not have dead tensor sources in the function body, it's
+  // guaranteed that all output tensors can't become dead.
+  if (dead_tensor_srcs.empty()) return Status::OK();
+
+  // Names of the function body nodes that return function output values.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const auto& output_expansion : item.outputs()) {
+    for (const auto& output_node : output_expansion.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
+
+  GraphTopologyView topology_view;
+  TF_RETURN_IF_ERROR(topology_view.InitializeFromGraph(item.graph));
+
+  for (const NodeDef* dead_tensor_src : dead_tensor_srcs) {
+    DfsTraversal(topology_view, {dead_tensor_src},
+                 TraversalDirection::kFollowOutputs,
+                 // Stop traversal when reached first `Merge` node.
+                 DfsPredicates::Advance(
+                     [](const NodeDef* node) { return !IsMerge(*node); }),
+                 // If we reached output node, add MaybeDeadOutput edge.
+                 DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                   if (output_nodes.find(node->name()) != output_nodes.end()) {
+                     maybe_dead->push_back({dead_tensor_src, node});
+                   }
+                 }));
+  }
+
+  return Status::OK();
+}
 
 // Returns `Status::OK()` iff `node` is an indirect function call of `func`, and
 // we know how to inline it into the main graph, otherwise returns and error
@@ -1256,20 +1380,6 @@ Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
         SummarizeNodeDef(func_node));
   }
 
-  // We can't inline functions with `Switch` nodes in the function body, because
-  // they might have dead tensors as a function output argument (we need all
-  // intermediate tensors to compute the function gradient). `PartitionedCallOp`
-  // invokes functions with `allow_dead_tensors = true` to reset dead flag,
-  // and return default initialized tensors instead of a dead tensors.
-  // TODO(ezhulenev): Do the liveness analysis and add
-  // `IdentitytWithResurrection` nodes after all potentially dead output
-  // tensors?
-  if (absl::c_any_of(func.node_def(), IsSwitch)) {
-    return errors::FailedPrecondition(
-        "Can't inline function with `Switch` nodes in the function body: ",
-        SummarizeNodeDef(func_node));
-  }
-
   // TODO(b/120991525, b/120986912): We need to lower `If` and `While` nodes to
   // `Switch` nodes after function inlining (one more PRE_PLACEMENT pass?), but
   // because of the reason described above we are not sure that it's safe, for
@@ -1287,9 +1397,139 @@ Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
   return Status::OK();
 }
 
+// Checks that all side-effects will be executed in well defined order. We do it
+// by checking if there is a path from stateful/dataset ops to one of the output
+// nodes.
+Status CheckThatSideEffectsWillExecute(
+    const FunctionOptimizerContext& ctx,
+    const GraphTopologyView& graph_topo_view,
+    const absl::flat_hash_set<string> output_nodes) {
+  // We ignore side-effects safety check in aggressive mode.
+  const bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
+
+  for (const NodeDef& func_body_node : graph_topo_view.graph()->node()) {
+    const bool node_must_execute =
+        IsDataset(func_body_node) ||
+        IsStateful(func_body_node, &ctx.function_library());
+
+    // If op has DT_RESOURCE argument it will be marked as stateful, though if
+    // it only reads from that resource, it's allowed to prune it, because it
+    // can't produce any visible side-effects.
+    const bool read_only = IsReadVariableOp(func_body_node);
+
+    if (read_only || !node_must_execute) continue;
+
+    VLOG(3) << "Check that node " << func_body_node.name()
+            << " will execute after inlining.";
+    bool will_execute = false;
+
+    // Check if we reached one of the output nodes.
+    const auto callbacks = DfsCallbacks::PreOrder([&](const NodeDef* node) {
+      if (output_nodes.count(node->name())) will_execute = true;
+    });
+
+    // Stop if we already proved that node will execute.
+    const auto predicates = DfsPredicates::Enter(
+        [&](const NodeDef* node) { return !will_execute; });
+
+    DfsTraversal(graph_topo_view, {&func_body_node},
+                 TraversalDirection::kFollowOutputs, predicates, callbacks);
+
+    if (!will_execute && !aggressive) {
+      return errors::Internal(
+          "Can't guarantee execution of a side-effectful node, that is not "
+          "reachable from function outputs. Function body node: ",
+          SummarizeNodeDef(func_body_node));
+    }
+
+    if (!will_execute && aggressive) {
+      LOG(WARNING)
+          << "Can't guarantee execution of a side-effectful node, that is not "
+             "reachable from function outputs. Function body node: "
+          << SummarizeNodeDef(func_body_node);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status PlaceInlinedFunctionBody(
+    const FunctionOptimizerContext& ctx, const NodeDef& func_node,
+    const GrapplerFunctionItem& item,
+    const absl::flat_hash_map<absl::string_view, int>& input_placeholders_idx,
+    GraphDef* placed_graph_def) {
+  // Control flow lowering and Placer works with a Graph object.
+  std::unique_ptr<Graph> func_body_graph =
+      absl::make_unique<Graph>(ctx.function_library());
+
+  GraphConstructorOptions opts;
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(opts, item.graph, func_body_graph.get()));
+
+  // TODO(ezhulenev): Lower If/While ops.
+
+  // ------------------------------------------------------------------------ //
+  // Before placing the function body nodes we pin input placeholders to the
+  // same device as their corresponding input nodes.
+
+  for (Node* func_body_node : func_body_graph->nodes()) {
+    const auto input_placeholder_idx =
+        input_placeholders_idx.find(func_body_node->name());
+
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
+      const int input_idx = input_placeholder_idx->second;
+      const GraphView::OutputPort output_port =
+          ctx.graph_view().GetRegularFanin({&func_node, input_idx});
+
+      VLOG(3) << "Pin inlined function input node '" << func_body_node->name()
+              << "' to the '" << output_port.node->device() << "' device.";
+      func_body_node->set_requested_device(output_port.node->device());
+    }
+  }
+
+  // ------------------------------------------------------------------------ //
+  // After placing nodes corresponding to the function inputs, we need to assign
+  // device placements to all other function body nodes.
+
+  const DeviceSet* devices = ctx.devices();
+
+  if (devices->devices().empty()) {
+    // If there are no devices available for placer, we just put all nodes to
+    // the same device as a function caller node. This can happen if Grappler is
+    // running "offline", without active runtime session, for example as a part
+    // of a batch job for graph analysis/optimization.
+    VLOG(3) << "Assign function call node device to all function body nodes. "
+            << "Device: " << func_node.device();
+    for (Node* func_body_node : func_body_graph->nodes()) {
+      func_body_node->set_requested_device(func_node.device());
+    }
+  } else {
+    // If we are running in an active runtime session, Grappler will get the
+    // graph after initial placing is done, and we should have devices for the
+    // placer.
+    VLOG(3) << "Run placer for instantiated function body. Devices: ["
+            << absl::StrJoin(
+                   devices->devices(), ", ",
+                   [](string* out, const Device* d) { out->append(d->name()); })
+            << "]";
+
+    // Use function caller node device as a default for placer.
+    const Device* default_device =
+        devices->FindDeviceByName(func_node.device());
+
+    Placer placer(func_body_graph.get(), devices,
+                  nullptr /* No session options */, default_device);
+    TF_RETURN_IF_ERROR(placer.Run());
+  }
+
+  // Convert Graph back to the placed GraphDef.
+  func_body_graph->ToGraphDef(placed_graph_def);
+
+  return Status::OK();
+}
+
 Status InlineIndirectFunctionCall(const NodeDef& func_node,
                                   const FunctionDef& func,
-                                  const int graph_def_version,
                                   FunctionOptimizerContext* ctx,
                                   GraphDef* optimized_graph) {
   VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
@@ -1301,7 +1541,7 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   GrapplerFunctionItem item;
   Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
                                                 ctx->function_library(),
-                                                graph_def_version, &item);
+                                                ctx->graph_version(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -1309,6 +1549,26 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
                                    ". Error: ", item_status.error_message());
   }
 
+  // `PartitionedCallOp` invokes functions with `allow_dead_tensors = true` to
+  // reset dead flag, and return default initialized tensors instead of a dead
+  // tensors. There is no way to express this in a regular Tensorflow graph, so
+  // we choose not to inline if a function can have dead tensors as an output
+  // position. In practice `mergeless switches` should not exists in a function
+  // body, because tf-eager will only use v2 control flow ops.
+  std::vector<MaybeDeadOutput> maybe_dead_outputs;
+  TF_RETURN_IF_ERROR(MaybeDeadOutputs(*ctx, item, &maybe_dead_outputs));
+  if (!maybe_dead_outputs.empty()) {
+    struct MaybeDeadOutputFormatter {
+      void operator()(string* out, const MaybeDeadOutput& md) const {
+        absl::StrAppend(out, SummarizeNodeDef(*md.dead_tensor_src));
+      }
+    };
+    return errors::FailedPrecondition(
+        "Can't inline function with dead outputs. Dead tensor sources (size = ",
+        maybe_dead_outputs.size(), "): ",
+        absl::StrJoin(maybe_dead_outputs, "\n", MaybeDeadOutputFormatter()));
+  }
+
   GraphView::InputPort control_input_port =
       ctx->graph_view().GetInputPort(func_node.name(), Graph::kControlSlot);
   GraphView::OutputPort control_output_port =
@@ -1342,111 +1602,140 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     inputs.push_back(tensor_id);
   }
 
-  // If we have a node inside the function body without inputs (e.g. Const), we
-  // must attach a control dependency to it, to make sure that if a function
-  // call happens inside a loop, the node will be evaluated in correct frame.
-  //
-  // If the function call node has no inputs and no control dependencies, it
-  // means that it can't be a function call inside a loop, and we can safely
-  // insert that node without inputs into the main graph.
-  //
-  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
-  // the function is called inside a loop.
-  std::vector<string> empty_inputs_hook;
-  if (!item.inputs().empty()) {
-    const InputArgExpansion& arg0 = item.inputs()[0];
-    DCHECK(!arg0.placeholders.empty());
-    empty_inputs_hook.push_back(AsControlDependency(AddPrefixToNodeName(
-        arg0.placeholders[0], /*prefix=*/func_node.name())));
-  } else if (!happens_before.empty()) {
-    empty_inputs_hook.push_back(AsControlDependency(happens_before[0]));
-  }
-
   // Mapping from input placeholder name to function input position.
-  int idx = 0;
   absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
     for (const string& placeholder : input_arg.placeholders) {
-      input_placeholders_idx[placeholder] = idx++;
+      const int idx = input_placeholders_idx.size();
+      input_placeholders_idx[placeholder] = idx;
     }
   }
 
   const string prefix = strings::StrCat(func_node.name(), "/");
 
   // ------------------------------------------------------------------------ //
-  // First we need to assign device placements to all function body nodes.
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<string, string> output_tensors;
 
-  GraphDef placed_graph_def;
+  // Unique names of nodes producing tensors in `output_tensors`.
+  absl::flat_hash_set<string> output_tensors_nodes;
 
-  const DeviceSet* devices = ctx->devices();
+  // Identity nodes added to the function body in place of function outputs.
+  absl::flat_hash_set<string> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
 
-  if (devices->devices().empty()) {
-    // If there are no devices available for placer, we just put all nodes to
-    // the same device as a function caller node. This can happen if Grappler is
-    // running "offline", without active runtime session, for example as a part
-    // of a batch job for graph analysis/optimization.
-    VLOG(3) << "Assign function call node device to all function body nodes. "
-            << "Device: " << func_node.device();
-    placed_graph_def = item.mutable_function_body();
-    for (NodeDef& node : *placed_graph_def.mutable_node()) {
-      node.set_device(func_node.device());
+  for (const NodeDef& func_body_node : item.graph.node()) {
+    const string& node_name = func_body_node.name();
+
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      const string& output_tensor = func_body_node.input(0);
+      output_tensors.emplace(node_name, output_tensor);
+
+      SafeTensorId tensor_id = ParseTensorName(output_tensor);
+      output_tensors_nodes.insert(tensor_id.node());
     }
-  } else {
-    // If we are running in an active runtime session, Grappler will get the
-    // graph after initial placing is done, and we should have devices for the
-    // placer.
-    VLOG(3) << "Run placer for instantiated function body. Devices: ["
-            << absl::StrJoin(
-                   devices->devices(), ", ",
-                   [](string* out, const Device* d) { out->append(d->name()); })
-            << "]";
+  }
 
-    // Construct a Graph object from the instantiated function body.
-    GraphConstructorOptions opts;
-    Graph graph(ctx->function_library());
-    TF_RETURN_IF_ERROR(
-        ConvertGraphDefToGraph(opts, item.function_body(), &graph));
+  // ------------------------------------------------------------------------ //
+  // To guarantee side-effects execution order we add NoOp control_input and
+  // control_output nodes:
+  // 1) 'control_input' node will have incoming control edges from all nodes in
+  //    'happens_before' set.
+  // 2) 'control_output' node will have outgoing control edges to all nodes in
+  //    'happens_after' set.
 
-    // Use function caller node device as a default for placer.
-    const Device* default_device =
-        devices->FindDeviceByName(func_node.device());
+  NodeDef* control_input = nullptr;
+  NodeDef* control_output = nullptr;
 
-    Placer placer(&graph, devices, nullptr, /* No session options */
-                  default_device);
-    TF_RETURN_IF_ERROR(placer.Run());
+  // IMPORTANT: Actual control inputs will be added to these nodes at the very
+  // last stage, because we don't want to have invalid edges in a function body
+  // graph (control edges depend on the nodes in the "outer" optimized graph).
+
+  if (!happens_before.empty()) {
+    control_input = item.graph.add_node();
+    control_input->set_op("NoOp");
+    control_input->set_name(kControlInputNodeName);
+  }
+
+  if (!happens_after.empty()) {
+    control_output = item.graph.add_node();
+    control_output->set_op("NoOp");
+    control_output->set_name(kControlOutputNodeName);
+  }
 
-    // Convert Graph back to the GraphDef.
-    graph.ToGraphDef(&placed_graph_def);
+  // ------------------------------------------------------------------------ //
+  // If we have a node inside the function body without inputs (e.g. Const), we
+  // must attach a control dependency to it, to make sure that if a function
+  // call happens inside a loop, the node will be evaluated in correct frame.
+  //
+  // If the function call node has no inputs and no control dependencies, it
+  // means that it can't be a function call inside a loop, and we can safely
+  // insert that node without inputs into the main graph.
+  //
+  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
+  // the function is called inside a loop.
+  std::vector<string> empty_inputs_hook;
+  if (!item.inputs().empty()) {
+    const InputArgExpansion& arg0 = item.inputs()[0];
+    empty_inputs_hook.push_back(arg0.placeholders[0]);
+  } else if (control_input != nullptr) {
+    empty_inputs_hook.push_back(control_input->name());
   }
 
+  // ------------------------------------------------------------------------ //
+  // Grappler called after PRE_PLACEMENT and PLACEMENT passes, so we have to
+  // make sure that after inlining all nodes will have valid device assignment.
+
+  GraphDef placed_graph_def;
+  TF_RETURN_IF_ERROR(PlaceInlinedFunctionBody(
+      *ctx, func_node, item, input_placeholders_idx, &placed_graph_def));
+
   // ------------------------------------------------------------------------ //
   // After all nodes placed we need to prepare them for inlining into the
   // optimized graph: turn placeholders into identities, update nodes
   // connectivity, etc...
 
+  const auto inlined_node_name = [&func_node](const string& name) -> string {
+    return AddPrefixToNodeName(name, /*prefix=*/func_node.name());
+  };
+
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      // Turn input placeholders into identity node.
+    const string& node_name = func_body_node.name();
+
+    // Turn placeholders added in place of input arguments into identity nodes.
+    const auto input_placeholder_idx = input_placeholders_idx.find(node_name);
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
       DCHECK_EQ(0, func_body_node.input_size());
       func_body_node.set_op("Identity");
       (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
       func_body_node.mutable_attr()->erase("dtype");
       func_body_node.mutable_attr()->erase("shape");
-      int input_idx = input_placeholders_idx[func_body_node.name()];
-      func_body_node.add_input(strings::StrCat(inputs[input_idx].ToString()));
+      const int input_idx = input_placeholder_idx->second;
+      func_body_node.add_input(inputs[input_idx].ToString());
 
       // All side effects must happen before inputs can start executing.
-      for (const string& hb_node : happens_before) {
-        func_body_node.add_input(AsControlDependency(hb_node));
+      if (control_input) {
+        func_body_node.add_input(
+            AsControlDependency(inlined_node_name(control_input->name())));
       }
-
     } else {
       // Update inputs of the regular function body nodes.
       for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+        input = inlined_node_name(input);
       }
-      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
-        *func_body_node.add_input() = empty_inputs_hook[0];
+      // Add control input to ensure node executed in correct frame.
+      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty() &&
+          func_body_node.name() != kControlInputNodeName &&
+          func_body_node.name() != kControlOutputNodeName) {
+        *func_body_node.add_input() =
+            AsControlDependency(inlined_node_name(empty_inputs_hook[0]));
       }
     }
 
@@ -1464,96 +1753,197 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     AddDefaultsToNodeDef(*op_def, &func_body_node);
   }
 
-  // Construct a graph view for the preprocessed function body graph.
-  GraphView placed_graph_view(&placed_graph_def);
+  // ------------------------------------------------------------------------ //
+  // Check that after inlining all side-effects will be executed in well defined
+  // order. We do it by checking if there is a path from stateful/dataset ops to
+  // one of the output nodes.
+
+  // Because we rename all the nodes before inlining, we need a copy of
+  // output_nodes with a new names.
+  absl::flat_hash_set<string> inlined_output_nodes;
+  for (const string& output_node : output_nodes) {
+    inlined_output_nodes.insert(inlined_node_name(output_node));
+  }
+  const auto is_inlined_output_node = [&](const NodeDef& node) -> bool {
+    return inlined_output_nodes.find(node.name()) != inlined_output_nodes.end();
+  };
 
-  // Keep track of side-effectful ops inside function body. Each outgoing
-  // control edge from the function call node, must be replaced with control
-  // edges from inlined side-effectful ops.
-  std::vector<string> side_effectful_nodes;
+  // Construct a graph topology view for DFS traversals (skip invalid edges for
+  // input nodes connected to nodes in the optimized graph).
+  GraphTopologyView placed_topo_view(/*skip_invalid_edges=*/true);
+  TF_RETURN_IF_ERROR(placed_topo_view.InitializeFromGraph(placed_graph_def));
+  TF_RETURN_IF_ERROR(CheckThatSideEffectsWillExecute(*ctx, placed_topo_view,
+                                                     inlined_output_nodes));
 
-  // We have to make sure that all side-effectful nodes inside a function body
-  // will be executed after function inlining.
-  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) {
-      int num_fanouts = placed_graph_view.NumFanouts(
-          func_body_node, /*include_controlling_nodes=*/true);
-
-      // If the node doesn't have any outgoing edges and we do not have any
-      // nodes in the `happens_after` set, we can't inline a function and
-      // guarantee that side-effects will be executed. The only exception if we
-      // do function library optimization, and the GrapplerItem was constructed
-      // for the function body, because functions have strict semantics.
-
-      if (num_fanouts == 0 && happens_after.empty() &&
-          ctx->allowed_optimizations().prune_ops_with_side_effects) {
-        return errors::Internal(
-            "Can't inline a function with a side-effectful op with empty "
-            "fanouts and empty output control edge set. Function body node: ",
-            SummarizeNodeDef(func_body_node));
-      }
+  // ------------------------------------------------------------------------ //
+  // Move all the nodes to the optimized graph after successful preprocessing.
+
+  if (control_input != nullptr) {
+    string inlined_node = inlined_node_name(control_input->name());
+    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
 
-      side_effectful_nodes.push_back(func_body_node.name());
+    for (const string& node_name : happens_before) {
+      placed_graph_def.mutable_node(*node_idx)->add_input(
+          AsControlDependency(node_name));
     }
   }
 
-  // Move all the nodes to the optimized graph after successful preprocessing.
+  if (control_output != nullptr) {
+    string inlined_node = inlined_node_name(control_output->name());
+    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
+
+    // Add control edges from all nodes producing output tensors.
+    for (const string& node_name : output_tensors_nodes) {
+      placed_graph_def.mutable_node(*node_idx)->add_input(
+          AsControlDependency(inlined_node_name(node_name)));
+    }
+
+    // Forward all control dependencies in the optimized graph to the new node.
+    ctx->AddControlOverrides(func_node, {inlined_node});
+  }
+
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    // Skip output identity nodes.
+    if (IsIdentity(func_body_node) && is_inlined_output_node(func_body_node))
+      continue;
+
     optimized_graph->add_node()->Swap(&func_body_node);
   }
 
-  // TODO(ezhulenev): Inline nested indirect function calls.
-
   // Indirect function call is fully inlined into the optimized graph, and we do
   // not copy the original function call node, so we have to setup tensor
   // mapping from old output tensors, to the outputs of inlined nodes.
   int output_idx = 0;
   for (const OutputArgExpansion& output : item.outputs()) {
-    for (const string& output_tensor : output.output_tensors) {
+    for (const string& output_node : output.output_nodes) {
+      const string& output_tensor = output_tensors.at(output_node);
+
       const SafeTensorId from_tensor(func_node.name(), output_idx++);
-      const SafeTensorId to_tensor = ParseTensorName(
-          AddPrefixToNodeName(output_tensor, /*prefix=*/func_node.name()));
-      ctx->AddTensorMapping(from_tensor, to_tensor);
+      const SafeTensorId to_tensor = ParseTensorName(output_tensor);
+
+      const SafeTensorId inlined_to_tensor =
+          SafeTensorId(absl::StrCat(func_node.name(), "/", to_tensor.node()),
+                       to_tensor.index());
+
+      ctx->AddTensorMapping(from_tensor, inlined_to_tensor);
     }
   }
 
-  // After inlining we'll have to forward all control dependencies from function
-  // call node to all side-effectful ops inside function body.
-  ctx->AddControlOverrides(func_node, side_effectful_nodes);
-
   VLOG(3) << "Successfully inlined indirect function call: "
           << SummarizeNodeDef(func_node);
+
   return Status::OK();
 }
 
-}  // namespace
+// Restores graph invariants after function specialization and inlining: all
+// inputs must be connected to valid nodes.
+Status RestoreGraphInvariants(const FunctionOptimizerContext& ctx,
+                              GraphDef* optimized_graph) {
+  // After function specialization and inlining graph might be in invalid
+  // state, and some nodes can read tensors that do not exists anymore in the
+  // optimized graph: function call node was fully inlined into the graph, or
+  // output index was invalidated by the output pruning.
 
-Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
-                                   GraphDef* optimized_graph) {
-  // Nothing to do here.
-  if (item.graph.library().function_size() == 0) {
-    *optimized_graph = item.graph;
-    return Status::OK();
+  if (!ctx.tensor_mapping().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      for (int idx = 0; idx < node.input_size(); ++idx) {
+        TensorId input_tensor = ParseTensorName(node.input(idx));
+        if (input_tensor.index() == Graph::kControlSlot) break;
+
+        auto mapping = ctx.tensor_mapping().find(input_tensor);
+        if (mapping != ctx.tensor_mapping().end()) {
+          node.set_input(idx, mapping->second.ToString());
+        }
+      }
+    }
+  }
+
+  // Function inlining instantiates function body directly into the optimized
+  // graph, and we might end up with control dependencies to the nodes that no
+  // longer exist in a graph. We need to apply control overrides to all
+  // invalidated nodes, and rewire control dependencies to the control outputs
+  // node (it's also possible to rewrite singe control edge into multiple edges
+  // to inlined side-effectful nodes).
+
+  if (!ctx.control_overrides().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      // Keep track of new control inputs to the node.
+      absl::flat_hash_set<string> add_ctrl_inputs;
+
+      // Remove all invalidated control inputs.
+      for (int idx = 0; idx < node.input_size(); /* see below */) {
+        // TODO(ezhulenev): Use non-allocating TensorId after migrating
+        // `control_overrides()` to absl::flat_hash_set.
+        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+
+        auto overrides = ctx.control_overrides().find(input_tensor.node());
+        if (overrides != ctx.control_overrides().end()) {
+          // If this happens it's a bug in the function inlining.
+          if (input_tensor.index() != Graph::kControlSlot) {
+            return errors::Internal(
+                "Illegal input edge from inlined function call node");
+          }
+          // Remove control dependency to the inlined function call node.
+          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
+          node.mutable_input()->RemoveLast();
+
+          // Keep track of all overrides.
+          for (const string& override : overrides->second) {
+            add_ctrl_inputs.insert(AsControlDependency(override));
+          }
+        } else {
+          // Go to the next input only if the current one was not invalidated,
+          // otherwise we need to check the swapped input as well.
+          ++idx;
+        }
+      }
+
+      // Add overrides to the node inputs.
+      for (const string& ctrl_input : add_ctrl_inputs) {
+        node.add_input(ctrl_input);
+      }
+    }
   }
 
-  FunctionOptimizerContext ctx(opt_level_, item);
+  return Status::OK();
+}
+
+}  // namespace
+
+Status FunctionOptimizer::RunFunctionOptimizerPass(
+    const GrapplerItem& item, const GraphDef& graph, const int iteration,
+    std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
+    bool* graph_has_unoptimized_function_calls) const {
+  VLOG(3) << absl::Substitute(
+      "Run function optimizer pass (iteration = $0): grappler_item_id = $1",
+      iteration, item.id);
+
+  FunctionOptimizerContext ctx(item, opt_level_, graph);
 
   bool inline_gradients = options_.enable_symbolic_gradient_inlining;
   bool inline_func = options_.enable_function_inlining;
   bool specialize_func = options_.enable_function_specialization;
 
-  for (const NodeDef& node : item.graph.node()) {
+  for (const NodeDef& node : graph.node()) {
     // Each node optimization can modify optimized graph only by adding new
     // nodes, we can check node size to make sure that graph was not modified.
     const int num_nodes_before = optimized_graph->node_size();
     const auto is_graph_modified = [&]() {
       int num_nodes = optimized_graph->node_size();
-      CHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
+      DCHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
       return num_nodes > num_nodes_before;
     };
 
-    // Add a copy of an input graph node to the optimized graph.
-    const auto add_node_copy = [&]() { *optimized_graph->add_node() = node; };
+    // Copy node from the `graph` to the `optimized_graph`.
+    const auto copy_node = [&]() { *optimized_graph->add_node() = node; };
+
+    // If we already failed to optimize this node during one of the previous
+    // passes, we just give up, and do not try on more time.
+    if (skip_nodes->find(node.name()) != skip_nodes->end()) {
+      VLOG(3) << "Skip optimization for node: " << node.name();
+      copy_node();
+      continue;
+    }
 
 // Skip errors if optimized graph was not modified before error happened.
 #define TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(...)                     \
@@ -1563,7 +1953,8 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       return _status;                                              \
     if (TF_PREDICT_FALSE(!_status.ok() && !is_graph_modified())) { \
       VLOG(3) << "Skip error: " << _status.error_message();        \
-      add_node_copy();                                             \
+      skip_nodes->insert(node.name());                             \
+      copy_node();                                                 \
     }                                                              \
   } while (0)
 
@@ -1581,6 +1972,9 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
             InlineSymbolicGradient(node, &ctx, optimized_graph));
         continue;
+      } else {
+        VLOG(2) << "Skip SymbolicGradient inlining: function=" << f_name;
+        skip_nodes->insert(node.name());
       }
     }
 
@@ -1593,7 +1987,6 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
 
     if (func != nullptr) {
       const string& func_name = func->signature().name();
-      const int graph_def_version = item.graph.versions().producer();
 
       const bool is_direct_func = IsDirectFunctionCall(*func, node);
       const bool is_indirect_func = IsIndirectFunctionCall(*func, node);
@@ -1602,11 +1995,12 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       if (inline_func && is_direct_func) {
         Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, node);
         if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
-              node, *func, graph_def_version, ctx, optimized_graph));
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+              InlineDirectFunctionCall(node, *func, ctx, optimized_graph));
           continue;
         } else {
           VLOG(2) << inlinable.error_message();
+          skip_nodes->insert(node.name());
         }
       }
 
@@ -1614,11 +2008,12 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       if (inline_func && is_indirect_func) {
         Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, node);
         if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineIndirectFunctionCall(
-              node, *func, graph_def_version, &ctx, optimized_graph));
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+              InlineIndirectFunctionCall(node, *func, &ctx, optimized_graph));
           continue;
         } else {
           VLOG(2) << inlinable.error_message();
+          skip_nodes->insert(node.name());
         }
       }
 
@@ -1635,95 +2030,95 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            SpecializeFunction(node, *func, item.graph.versions().producer(),
-                               &ctx, optimized_graph));
+            SpecializeFunction(node, *func, &ctx, optimized_graph));
         continue;
+      } else {
+        VLOG(2) << "Skip function specialization: " << func->signature().name();
+        skip_nodes->insert(node.name());
       }
     }
 
     // ---------------------------------------------------------------------- //
     // If we reached this point, node was not handled by any of the stages
-    // (inline, specialize), simply add a copy to the graph.
-    add_node_copy();
+    // (inline, specialize), simply copy the node to the optimized graph.
+    copy_node();
 
 #undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
-  // After function specialization and inlining graph might be in invalid
-  // state, and some nodes can read tensors that do not exists anymore in the
-  // optimized graph: function call node was fully inlined into the graph, or
-  // output index was invalidated by the output pruning.
+  TF_RETURN_IF_ERROR(RestoreGraphInvariants(ctx, optimized_graph));
 
-  if (!ctx.tensor_mapping().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      for (int idx = 0; idx < node.input_size(); ++idx) {
-        TensorId input_tensor = ParseTensorName(node.input(idx));
-        if (input_tensor.index() == Graph::kControlSlot) break;
+  // Preserve the graph version.
+  *optimized_graph->mutable_versions() = graph.versions();
 
-        auto mapping = ctx.tensor_mapping().find(input_tensor);
-        if (mapping != ctx.tensor_mapping().end()) {
-          node.set_input(idx, mapping->second.ToString());
-        }
-      }
+  // Prune unreachable function from the library.
+  if (options_.enable_trim_function_library) {
+    *optimized_graph->mutable_library() =
+        PruneFunctionLibrary(ctx.function_library(), *optimized_graph);
+  } else {
+    *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  }
+
+  // Before returning we check if after single optimization pass we have more
+  // unoptimized function calls.
+  *graph_has_unoptimized_function_calls = false;
+  for (const NodeDef& node : optimized_graph->node()) {
+    // Check if we can inline symbolic gradient.
+    if (IsSymbolicGradient(node) && inline_gradients &&
+        skip_nodes->count(node.name()) == 0) {
+      *graph_has_unoptimized_function_calls = true;
+      break;
     }
-  }
 
-  // Function inlining instantiates function body directly into the optimized
-  // graph, and we might end up with control dependencies to the nodes that no
-  // longer exist in a graph. We need to apply control overrides to all
-  // invalidated nodes, and rewire control dependencies to the inlined
-  // side-effectful function body nodes.
+    // Check if after inlining we have unoptimized function calls.
+    const FunctionDef* func = FindFunctionCall(ctx, node);
+    if (func != nullptr && !MarkedSpecialized(*func) &&
+        skip_nodes->count(node.name()) == 0) {
+      *graph_has_unoptimized_function_calls = true;
+      break;
+    }
+  }
 
-  // TODO(ezhulenev): With nested function call inlining, single pass over
-  // `control_overrides` might not bring the graph into a valid state,
-  // continue until it converges and all invalidated control dependencies
-  // removed.
+  return Status::OK();
+}
 
-  if (!ctx.control_overrides().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      // Keep track of new control inputs to the node.
-      gtl::FlatSet<string> add_ctrl_inputs;
+Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
+                                   GraphDef* optimized_graph) {
+  // Nothing to do here.
+  if (item.graph.library().function_size() == 0) {
+    *optimized_graph = item.graph;
+    return Status::OK();
+  }
 
-      // Remove all invalidated control inputs.
-      for (int idx = 0; idx < node.input_size(); /* see below */) {
-        // TODO(ezhulenev): Use non-allocating TensorId after migrating
-        // `control_overrides()` to absl::flat_hash_set.
-        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+  // Do not retry failed function inlining or specialization.
+  std::unordered_set<string> skip_nodes;
+  bool graph_has_unoptimized_function_calls = false;
 
-        auto overrides = ctx.control_overrides().find(input_tensor.node());
-        if (overrides != ctx.control_overrides().end()) {
-          // If this happens it's a bug in the function inlining.
-          if (input_tensor.index() != Graph::kControlSlot) {
-            return errors::Internal(
-                "Illegal input edge from inlined function call node");
-          }
-          // Remove control dependency to the inlined function call node.
-          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
-          node.mutable_input()->RemoveLast();
+  // We'll keep running function optimizer pass until we inlined and optimized
+  // all function call nodes.
+  int iteration = 0;
+  constexpr int kMaxIterations = 50;
 
-          // Keep track of all overrides.
-          for (const string& override : overrides->second) {
-            add_ctrl_inputs.insert(AsControlDependency(override));
-          }
-        } else {
-          // Go to the next input only if the current one was not invalidated,
-          // otherwise we need to check the swapped input as well.
-          ++idx;
-        }
-      }
+  // 1. Run first optimizer pass with GrapplerItem.graph.
+  TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
+      item, item.graph, 0, &skip_nodes, optimized_graph,
+      &graph_has_unoptimized_function_calls));
 
-      // Add overrides to the node inputs.
-      for (const string& ctrl_input : add_ctrl_inputs) {
-        node.add_input(ctrl_input);
-      }
+  // 2. If after function inlining we have unoptimized function calls, we have
+  // to run function optimization pass one more time.
+  while (graph_has_unoptimized_function_calls) {
+    if (iteration++ > kMaxIterations) {
+      VLOG(1) << "Break function optimizer loop at iteration #" << iteration;
+      break;
     }
-  }
 
-  *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() =
-      options_.enable_trim_function_library
-          ? PruneFunctionLibrary(ctx.function_library(), *optimized_graph)
-          : ctx.function_library().ToProto();
+    GraphDef workspace_graph;
+    workspace_graph.Swap(optimized_graph);
+
+    TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
+        item, workspace_graph, iteration, &skip_nodes, optimized_graph,
+        &graph_has_unoptimized_function_calls));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 4352555064c43c8db40157ace2fca9479907df8e..ab90281509fc1f4a80a82bd6e1ab830e22200838 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -48,6 +48,16 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_trim_function_library = true;
   };
 
+  // Runs a single function optimizer pass over the `graph`. All nodes that are
+  // not function calls will be copied from the `graph` to the
+  // `optimized_graph`. Function call nodes inlined or specialized, and
+  // instantiated function body or specialized function call nodes will be added
+  // to the `optimized_graph`.
+  Status RunFunctionOptimizerPass(
+      const GrapplerItem& item, const GraphDef& graph, const int iteration,
+      std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
+      bool* graph_has_unoptimized_function_calls) const;
+
   RewriterConfig::Toggle opt_level_;
   FunctionOptimizerOptions options_;
 };
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index c971eec3f4dae5cc3457ad802700ee4f3086eb90..fc7fffe7b2a06afe9fbe30162fa4c0173be26e02 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -660,7 +660,7 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
-TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradientNoInlineFunc) {
   FunctionOptimizer optimizer(RewriterConfig::ON);
 
   FunctionDef func = FunctionDefHelper::Define(
@@ -831,36 +831,43 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
             kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("f1/x", "Identity", {"a:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/y", "Identity", {"b:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/v", "Identity", {"v:0", "^init_v"}, {{"T", DT_RESOURCE}},
+       NDef("f1/control_input", "NoOp", {"^init_v"}, {}, kDevice),
+       NDef("f1/x", "Identity", {"a:0", "^f1/control_input"}, {{"T", DT_FLOAT}},
             kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^f1/control_input"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f1/v", "Identity", {"v:0", "^f1/control_input"},
+            {{"T", DT_RESOURCE}}, kDevice),
        NDef("f1/one", "Const", {"^f1/x"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
        NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
        NDef("f1/mul", "Mul", {"f1/x", "f1/y", "^f1/add"}, {{"T", DT_FLOAT}},
             kDevice),
+       NDef("f1/control_output", "NoOp", {"^f1/mul"}, {}, kDevice),
 
        // Function body of a second function call also inlined into the graph,
        // and input nodes read directly from the inlined nodes of the first
        // function call.
-       NDef("f2/x", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f2/v", "Identity", {"v:0", "^f1/add"}, {{"T", DT_RESOURCE}},
-            kDevice),
+       NDef("f2/control_input", "NoOp", {"^f1/control_output"}, {}, kDevice),
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/control_input"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/control_input"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/v", "Identity", {"v:0", "^f2/control_input"},
+            {{"T", DT_RESOURCE}}, kDevice),
        NDef("f2/one", "Const", {"^f2/x"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
        NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
        NDef("f2/mul", "Mul", {"f2/x", "f2/y", "^f2/add"}, {{"T", DT_FLOAT}},
             kDevice),
+       NDef("f2/control_output", "NoOp", {"^f2/mul"}, {}, kDevice),
 
        // Return values read directly from inlined nodes.
        NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("out_2", "ReadVariableOp", {"v", "^f1/add", "^f2/add"},
+       NDef("out_2", "ReadVariableOp",
+            {"v", "^f1/control_output", "^f2/control_output"},
             {{"dtype", DT_FLOAT}}, kDevice)},
 
       // Function library.
@@ -924,8 +931,9 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
 
-       // Function must be inlined and `mul` node placed on a requested device.
-       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu1),
+       // Function must be inlined and `mul` node placed on a requested device,
+       // and input `Identity` nodes must be colocated with their source nodes.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu0),
        NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, cpu1),
        NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
 
@@ -936,7 +944,8 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
   CompareGraphs(expected, optimized_graph);
 }
 
-TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
+TEST_F(FunctionOptimizerTest,
+       InlineIndirectFunctionWithControlDependencyAndNoSideEffects) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
 
@@ -997,12 +1006,16 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
        NDef("f1/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("f1/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/control_output", "NoOp", {"^f1/mul"}, {}, kDevice),
 
        // Function body of a second function call also inlined into the graph,
        // and input nodes read directly from the inlined nodes of the first
        // function call, and control dependency edge removed.
-       NDef("f2/x", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/control_input", "NoOp", {"^f1/control_output"}, {}, kDevice),
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/control_input"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/control_input"},
+            {{"T", DT_FLOAT}}, kDevice),
        NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
 
        // Return directly from inlined node of f2.
@@ -1024,6 +1037,238 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionDoNotInlineDeadOutputs) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  // Function output can be dead.
+  FunctionDef dead_outputs = FunctionDefHelper::Create(
+      "DeadOutputs", {"x:T", "cond:bool"}, {"z:T"}, {"T: {float, double}"},
+      {
+          {{"switch"}, "Switch", {"x", "cond"}, {{"T", "$T"}}},
+          {{"if_false"}, "Identity", {"switch:output_false:0"}, {{"T", "$T"}}},
+          {{"if_true"}, "Identity", {"switch:output_true:0"}, {{"T", "$T"}}},
+      },
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "if_false:output:0"}});
+
+  // Simple proxy functions that calls DeadOutputs from the function body.
+  FunctionDef proxy_func = FunctionDefHelper::Create(
+      "Proxy", {"x:T", "cond:bool"}, {"z:T"}, {"T: {float, double}"},
+      {{{"dead"}, "DeadOutputs", {"x", "cond"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "dead:z:0"}});
+
+  // Build a graph to compute:
+  //   a: float
+  //   b: bool
+  //   fn0 = DeadOutputs(x, b)
+  //   fn1 = Proxy(x, b)
+  //   out0 = Identity(fn0)
+  //   out1 = Identity(fn1)
+  //   return [out0, out1]
+  //
+  GrapplerItem item;
+  item.fetch = {"out0", "out1"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+
+       NDef("fn0", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_BOOL}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("DeadOutputs", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       NDef("fn1", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_BOOL}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("Proxy", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       NDef("out0", "Identity", {"fn0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out1", "Identity", {"fn1"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {dead_outputs, proxy_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = item.graph;
+  CompareGraphs(expected, optimized_graph);
+
+  const Tensor one = test::AsScalar<float>(1.0);
+  item.feed.emplace_back("a", one);
+  item.feed.emplace_back("b", test::AsScalar<bool>(false));
+
+  auto tensors = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors.size(), 2);
+  test::ExpectTensorEqual<float>(tensors[0], one);
+  test::ExpectTensorEqual<float>(tensors[1], one);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithMergedDeadTensors) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  // Function output can't be dead because it goes through the Merge node.
+  FunctionDef no_dead_outputs = FunctionDefHelper::Create(
+      "NoDeadOutputs", {"x:T", "cond:bool"}, {"z:T"}, {"T: {float, double}"},
+      {
+          {{"switch"}, "Switch", {"x", "cond"}, {{"T", "$T"}}},
+          {{"if_false"}, "Identity", {"switch:output_false:0"}, {{"T", "$T"}}},
+          {{"if_true"}, "Identity", {"switch:output_true:0"}, {{"T", "$T"}}},
+          {{"merge"},
+           "Merge",
+           {"if_false:output:0", "if_true:output:0"},
+           {{"T", "$T"}, {"N", 2}}},
+      },
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "merge:output:0"}});
+
+  // Build a graph to compute:
+  //   a: float
+  //   b: bool
+  //   d = DeadOutputs(x, b)
+  //   out = Identity(d)
+  //   return out
+  //
+  GrapplerItem item;
+  item.fetch = {"out"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+
+       NDef("fn", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_BOOL}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("NoDeadOutputs", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       NDef("out", "Identity", {"fn"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {no_dead_outputs});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("fn/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("fn/cond", "Identity", {"b:0"}, {{"T", DT_BOOL}}, kDevice),
+       NDef("fn/switch", "Switch", {"fn/x:0", "fn/cond:0"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/if_false", "Identity", {"fn/switch:0"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/if_true", "Identity", {"fn/switch:1"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/merge", "Merge", {"fn/if_false:0", "fn/if_true:0"},
+            {{"T", DT_FLOAT}, {"N", 2}}, kDevice),
+
+       // Return directly from inlined node.
+       NDef("out", "Identity", {"fn/merge:0"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {no_dead_outputs});
+
+  CompareGraphs(expected, optimized_graph);
+
+  const Tensor one = test::AsScalar<float>(1.0);
+  item.feed.emplace_back("a", one);
+  item.feed.emplace_back("b", test::AsScalar<bool>(false));
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors.size(), 1);
+
+  test::ExpectTensorEqual<float>(tensors[0], tensors_expected[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithNestedFunctionCall) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // `Square` implemented in terms of PartitionedCall to `MyMul`.
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"output:T"}, {"T: {float, double}"},
+      {{{"square"},
+        "PartitionedCall",
+        {"x", "x"},
+        {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+         {"Tout", DataTypeSlice{DT_FLOAT}},
+         {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"output", "square:output:0"}});
+
+  // Build a graph to compute:
+  //   b = Square(a)
+  //   c = Identity(b)
+  //   return c
+  GrapplerItem item;
+  item.fetch = {"c"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "PartitionedCall", {"a"},
+            {{"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MySquare", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("c", "Identity", {"b"}, {{"T", DT_FLOAT}}, kDevice)},
+      /* Function library */
+      {mul_func, square_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Inlined inputs of `c` node.
+       NDef("b/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Inlined inputs of `square` node inside inlined `MySquare` function.
+       NDef("b/square/x", "Identity", {"b/x:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("b/square/y", "Identity", {"b/x:0"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Inlined mul node from the `MyMul` function.
+       NDef("b/square/mul", "Mul", {"b/square/x", "b/square/y"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("c", "Identity", {"b/square/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  Tensor three = test::AsScalar<float>(3.0f);
+  item.feed.emplace_back("a", three);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  ASSERT_EQ(tensors.size(), tensors_expected.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   using test::function::NDef;
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index e587a2b2af74cb417ac58f672a4cc5526335d0a8..44dfe0de7890f09feb0b2cbfc450ddb9e37fc3cd 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -39,7 +39,7 @@ class GraphOptimizer {
   // Routine called to allow an algorithm to propose a rewritten graph
   // for the graph, feeds and fetches in "item" to run more efficiently
   // on "cluster".
-  // Returns true iff it managed to generate a solution, false otherwise.
+  // Returns an error status if it failed to generate a solution.
   virtual Status Optimize(Cluster* cluster, const GrapplerItem& item,
                           GraphDef* optimized_graph) = 0;
 
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 8f25a1c8c1c48281fb44c01a142348863836d5aa..e9b706a58371cad72ef4b0652bc86364d7c4f5c0 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -503,6 +503,7 @@ class NodeProcessor : public GraphProcessor {
       UpdateAttrKSize();
       UpdateAttrStrides();
       UpdateAttrDilations();
+      UpdateAttrExplicitPaddings();
       UpdateAttrShape();
       TF_RETURN_IF_ERROR(AddLayoutTransposeToInputs());
       TF_RETURN_IF_ERROR(AddLayoutTransposeToOutputs());
@@ -753,6 +754,28 @@ class NodeProcessor : public GraphProcessor {
     }
   }
 
+  void UpdateAttrExplicitPaddings() {
+    if (node_->attr().find("explicit_paddings") != node_->attr().end()) {
+      auto list = node_->mutable_attr()->at("explicit_paddings").mutable_list();
+      int size = list->i_size();
+      if (size == 8) {
+        int64 height_before = list->i(2);
+        int64 height_after = list->i(3);
+        int64 width_before = list->i(4);
+        int64 width_after = list->i(5);
+        list->set_i(2, 0);
+        list->set_i(3, 0);
+        list->set_i(4, height_before);
+        list->set_i(5, height_after);
+        list->set_i(6, width_before);
+        list->set_i(7, width_after);
+      } else if (size != 0) {
+        LOG(ERROR) << "Cannot handle explicit_paddings attribute of size "
+                   << size;
+      }
+    }
+  }
+
   void UpdateAttrDataFormat() {
     if (node_->attr().find("data_format") != node_->attr().end()) {
       if (node_->attr().at("data_format").s().compare("NHWC") == 0) {
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 20e47c1b26b173c18eefd01ba7bdb87781a4c59b..eb2a8e87dde605d7a5867ca84f1c5260c42077e4 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
@@ -80,8 +81,13 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output filter =
         ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
 
+    ops::Conv2D::Attrs attrs;
+    if (padding == "EXPLICIT") {
+      attrs = attrs.ExplicitPaddings({0, 0, 1, 2, 3, 4, 0, 0});
+    }
+
     Output conv = ops::Conv2D(s->WithOpName("Conv2D").WithDevice(device), input,
-                              filter, {1, stride, stride, 1}, padding);
+                              filter, {1, stride, stride, 1}, padding, attrs);
     return conv;
   }
 
@@ -100,6 +106,28 @@ class LayoutOptimizerTest : public GrapplerTest {
     int input_depth = 3;
     int filter_count = 2;
     int stride = 1;
+    int dilation = dilated ? 2 : 1;
+    int64 padding_top = 1;
+    int64 padding_bottom = 2;
+    int64 padding_left = 3;
+    int64 padding_right = 4;
+    int64 output_height;
+    int64 output_width;
+    Padding padding_enum;
+    if (padding == "SAME") {
+      padding_enum = SAME;
+    } else if (padding == "VALID") {
+      padding_enum = VALID;
+    } else {
+      CHECK_EQ(padding, "EXPLICIT");
+      padding_enum = EXPLICIT;
+    }
+    TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+        input_height, filter_size, dilation, stride, padding_enum,
+        &output_height, &padding_top, &padding_bottom));
+    TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+        input_width, filter_size, dilation, stride, padding_enum, &output_width,
+        &padding_left, &padding_right));
     TensorShape input_sizes_shape({4});
     Tensor input_data(DT_INT32, input_sizes_shape);
     test::FillValues<int>(&input_data,
@@ -112,8 +140,6 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output filter =
         ops::Variable(s->WithOpName("Filter"), filter_shape, DT_FLOAT);
 
-    int output_height = input_height;
-    int output_width = input_width;
     TensorShape output_shape(
         {batch_size, output_height, output_width, filter_count});
     Tensor output_data(DT_FLOAT, output_shape);
@@ -124,10 +150,21 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output conv_backprop_input;
     Output input_sizes_i =
         ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
-    ops::Conv2DBackpropInput::Attrs attrs;
-    if (dilated) {
-      attrs = attrs.Dilations({1, 2, 2, 1});
+    std::vector<int> dilations{1, dilation, dilation, 1};
+    std::vector<int> explicit_paddings;
+    if (padding == "EXPLICIT") {
+      explicit_paddings = {0,
+                           0,
+                           static_cast<int>(padding_top),
+                           static_cast<int>(padding_bottom),
+                           static_cast<int>(padding_left),
+                           static_cast<int>(padding_right),
+                           0,
+                           0};
     }
+    auto attrs =
+        ops::Conv2DBackpropInput::Attrs().Dilations(dilations).ExplicitPaddings(
+            explicit_paddings);
     if (const_input_size) {
       conv_backprop_input = ops::Conv2DBackpropInput(
           s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
@@ -186,7 +223,7 @@ class LayoutOptimizerTest : public GrapplerTest {
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME");
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "EXPLICIT");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -306,6 +343,19 @@ TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
+TEST_F(LayoutOptimizerTest, ExplicitPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "EXPLICIT");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
+}
+
 TEST_F(LayoutOptimizerTest, Pad) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv = SimpleConv2D(&s, 4, 2, "VALID");
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 36064738408c744db53cb9e95645d6a2968b1746..cf5e4db29f418ac560c6a4c6381d4a7f3d88088e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -30,12 +30,14 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -451,16 +453,29 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
 }
 
 std::vector<int> GetStackPushNodesToConvert(
-    const SimpleGraphView& graph_view,
+    const GraphTopologyView& graph_view,
     const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
   VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+
   const std::unordered_set<string> op_types_to_traverse(
       {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
        "Identity", "RefIdentity"});
+  const auto is_op_to_traverse = [&](const NodeDef* node) -> bool {
+    return op_types_to_traverse.find(node->op()) != op_types_to_traverse.end();
+  };
+
   std::vector<int> nodes_to_convert;
-  std::set<int> fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
-  for (int fanout_idx : fanout) {
+  std::vector<int> fanouts;
+
+  DfsTraversal(graph_view, {graph_view.GetNode(stack_node_idx)},
+               TraversalDirection::kFollowOutputs,
+               DfsPredicates::Advance(is_op_to_traverse),
+               DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                 const absl::optional<int> idx = graph_view.GetNodeIndex(*node);
+                 fanouts.push_back(idx.value());
+               }));
+
+  for (int fanout_idx : fanouts) {
     const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
     VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
     if (IsStackPushOp(fanout_node)) {
@@ -468,13 +483,12 @@ std::vector<int> GetStackPushNodesToConvert(
       // happen when the graph we have contains only the forward pass for a loop
       // (as when the forward and backward passes are split across different
       // functions).
-      if (graph_view.has_node(fanout_node.input(0))) {
-        const NodeDef* stack_node =
-            &graph_view.node(graph_view.index(fanout_node.input(0)));
+      if (graph_view.HasNode(fanout_node.input(0))) {
+        const NodeDef* stack_node = graph_view.GetNode(fanout_node.input(0));
         while (stack_node->op() != "Stack" && stack_node->op() != "StackV2" &&
                stack_node->input_size() > 0 &&
-               graph_view.has_node(stack_node->input(0))) {
-          stack_node = &graph_view.node(graph_view.index(stack_node->input(0)));
+               graph_view.HasNode(stack_node->input(0))) {
+          stack_node = graph_view.GetNode(stack_node->input(0));
         }
         if (nodes_to_preserve.find(stack_node->name()) ==
             nodes_to_preserve.end()) {
@@ -488,7 +502,7 @@ std::vector<int> GetStackPushNodesToConvert(
                    op_types_to_traverse.end()) {
       continue;
     } else if (!IsStackPopOp(fanout_node) ||
-               (!graph_view.outputs(fanout_idx).empty() ||
+               (!graph_view.GetFanout(fanout_idx).empty() ||
                 nodes_to_preserve.find(fanout_node.name()) !=
                     nodes_to_preserve.end())) {
       // The node is either a stack pop with consumers or something unexpected
@@ -497,14 +511,16 @@ std::vector<int> GetStackPushNodesToConvert(
       break;
     }
   }
+
   return nodes_to_convert;
 }
 
 Status RemoveStackOps(const std::unordered_set<string>& nodes_to_preserve,
                       GraphDef* optimized_graph) {
   NodeMap node_map(optimized_graph);
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+
   for (int node_idx = 0; node_idx < optimized_graph->node_size(); ++node_idx) {
     if (IsStackOp(optimized_graph->node(node_idx))) {
       for (int push_node_idx : GetStackPushNodesToConvert(
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 042e9fa32b12235f07113c576155bcdd01cf472e..b50d50f84245a5910ccf9cde5166465f4d9e9310 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -188,13 +189,14 @@ std::vector<RecomputedSubGraph> GetOpGroupsToRecompute(
       }
     }
     // Recompute only nodes which eventually feed into a target node.
-    connected_subgraph(node_map,
-                       true,   // Collect inputs
-                       false,  // Collect outputs
-                       [&unpruned_recompute_nodes](const NodeDef& node) {
-                         return unpruned_recompute_nodes.count(&node) != 0;
-                       },
-                       &current_recomputation.recomputed_source_nodes);
+    connected_subgraph(
+        node_map,
+        true,   // Collect inputs
+        false,  // Collect outputs
+        [&unpruned_recompute_nodes](const NodeDef& node) {
+          return unpruned_recompute_nodes.count(&node) != 0;
+        },
+        &current_recomputation.recomputed_source_nodes);
     if (current_recomputation.target_nodes.empty()) {
       continue;
     }
@@ -498,6 +500,16 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
   // Look for AddN nodes (and equivalent) and record input names.
   MutableGraphView view(&item->graph);
 
+  // It's ok to use immutable GraphTopologyView here, because we do not destroy
+  // any of the nodes in the underlying graph, we only add new nodes.
+  GraphTopologyView graph_topology;
+  Status initialized_topology = graph_topology.InitializeFromGraph(item->graph);
+  if (!initialized_topology.ok()) {
+    VLOG(1) << "Failed to initialize graph topology view: "
+            << initialized_topology.error_message();
+    return false;
+  }
+
   std::unordered_map<string, std::unordered_set<NodeDef*>> addn_list;
   for (NodeDef& node : *item->graph.mutable_node()) {
     if (!IsAddN(node) && node.op() != "AccumulateNV2") {
@@ -579,12 +591,11 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
 
     // Compute a topological ordering for the node fanin.
     std::unordered_map<const NodeDef*, int> topo_order;
-    ReverseDfs(view, {node}, nullptr,
-               [&topo_order](const NodeDef* n) {
-                 int topo_index = topo_order.size();
-                 topo_order[n] = topo_index;
-               },
-               nullptr);
+    DfsTraversal(graph_topology, {node}, TraversalDirection::kFollowInputs,
+                 DfsCallbacks::PostOrder([&topo_order](const NodeDef* n) {
+                   int topo_index = static_cast<int>(topo_order.size());
+                   topo_order[n] = topo_index;
+                 }));
 
     std::vector<int> input_topo_index;
 
@@ -1259,46 +1270,55 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
     return Status::OK();
   }
 
-  std::unordered_set<int> optimized_nodes;
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+  std::unordered_set<const NodeDef*> optimized_nodes;
+
   for (int i : assign_nodes) {
-    if (optimized_nodes.find(i) == optimized_nodes.end()) {
-      const NodeDef& assign_node = optimized_graph->node(i);
-      optimized_nodes.insert(i);
-      std::vector<int> assign_nodes_in_fanout;
-      assign_nodes_in_fanout.push_back(i);
-      std::set<int> transitive_fanout;
-      graph_view.DepthFirstSearch(std::unordered_set<string>{}, i,
-                                  &transitive_fanout);
+    const NodeDef& assign_node = optimized_graph->node(i);
+
+    if (optimized_nodes.find(&assign_node) == optimized_nodes.end()) {
+      std::vector<const NodeDef*> assign_nodes_in_fanout;
+      optimized_nodes.insert(&assign_node);
+      assign_nodes_in_fanout.push_back(&assign_node);
+
+      std::vector<const NodeDef*> transitive_fanout;
+      DfsTraversal(graph_view, {graph_view.GetNode(i)},
+                   TraversalDirection::kFollowOutputs,
+                   DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                     transitive_fanout.push_back(node);
+                   }));
+
       bool relax_constraint = true;
       // If all nodes in the transitive fanout are on the same device as the
       // assign node, there is no need to allocate the output in pinned memory.
-      for (int fanout : transitive_fanout) {
-        const NodeDef& fanout_node = optimized_graph->node(fanout);
+      for (const NodeDef* fanout_node : transitive_fanout) {
+        // const NodeDef& fanout_node = optimized_graph->node(fanout);
         if (relax_constraint &&
-            (IsSend(fanout_node) ||
-             CrossesTaskOrCpuGpuBoundary(fanout_node, assign_node))) {
+            (IsSend(*fanout_node) ||
+             CrossesTaskOrCpuGpuBoundary(*fanout_node, assign_node))) {
           relax_constraint = false;
           break;
         }
-        if (optimized_nodes.find(fanout) == optimized_nodes.end() &&
-            IsAssign(fanout_node)) {
-          assign_nodes_in_fanout.push_back(fanout);
+        if (optimized_nodes.find(fanout_node) == optimized_nodes.end() &&
+            IsAssign(*fanout_node)) {
+          assign_nodes_in_fanout.push_back(fanout_node);
         }
       }
 
       if (relax_constraint) {
-        for (int assign_idx : assign_nodes_in_fanout) {
+        for (const NodeDef* assign_node_in_fanout : assign_nodes_in_fanout) {
           // If all devices match in fanout of node(i) then, by transitivity,
           // they must also match in the fanout of other assign nodes
           // in the fanout of node(i), so we can process them here,
           // and save computing their transitive fanout later.
-          optimized_nodes.insert(assign_idx);
+          optimized_nodes.insert(assign_node_in_fanout);
 
           // Set an attribute telling AssignOp to ignore allocator constraints.
+          const absl::optional<int> assign_node_idx =
+              graph_view.GetNodeIndex(*assign_node_in_fanout);
           NodeDef* assign_node_to_relax =
-              optimized_graph->mutable_node(assign_idx);
+              optimized_graph->mutable_node(assign_node_idx.value());
           (*assign_node_to_relax
                 ->mutable_attr())["_grappler_relax_allocator_constraints"]
               .set_b(true);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 7b788c613c9c1c42e62f69bf2dab1122b08c4f9a..aa19d33f46d2233e14eaf01a308efe41fd619120 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -37,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/dump_graph.h"
@@ -282,6 +285,20 @@ MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const {
   return nullptr;
 }
 
+void MetaOptimizer::InitializeVerifiers(
+    std::vector<std::unique_ptr<GraphVerifier>>* inter_optimizer_verifiers,
+    std::vector<std::unique_ptr<GraphVerifier>>* post_optimization_verifiers)
+    const {
+  if (cfg_.inter_optimizer_verifier_config().structure_verifier() ==
+      VerifierConfig::ON) {
+    inter_optimizer_verifiers->push_back(MakeUnique<StructureVerifier>());
+  }
+  if (cfg_.post_optimization_verifier_config().structure_verifier() ==
+      VerifierConfig::ON) {
+    post_optimization_verifiers->push_back(MakeUnique<StructureVerifier>());
+  }
+}
+
 #define RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer)                            \
   {                                                                            \
     const Status status = RunOptimizer(optimizer, cluster, &optimized_item,    \
@@ -312,6 +329,23 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
     TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
   }
 
+  // Initialize the configured verifiers.
+  std::vector<std::unique_ptr<GraphVerifier>> inter_optimizer_verifiers;
+  std::vector<std::unique_ptr<GraphVerifier>> post_optimization_verifiers;
+  InitializeVerifiers(&inter_optimizer_verifiers, &post_optimization_verifiers);
+  if (inter_optimizer_verifiers.empty()) {
+    VLOG(2) << "No inter optimizer verifiers have been configured";
+  } else {
+    VLOG(2) << inter_optimizer_verifiers.size()
+            << " inter optimizer verifiers have been configured";
+  }
+  if (post_optimization_verifiers.empty()) {
+    VLOG(2) << "No post optimization verifiers have been configured";
+  } else {
+    VLOG(2) << post_optimization_verifiers.size()
+            << " post optimization verifiers have been configured";
+  }
+
   VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id
           << " num_optimizers=" << optimizers.size()
           << ", num nodes = " << item.graph.node_size();
@@ -356,6 +390,14 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
         continue;
       }
       RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer.get());
+      for (const auto& verifier : inter_optimizer_verifiers) {
+        // TODO(ashwinm): Need to enforce verification_deadline.
+        TF_RETURN_IF_ERROR(verifier->Verify(*optimized_graph));
+      }
+    }
+    // TODO(ashwinm): Need to enforce verification_deadline.
+    for (const auto& verifier : post_optimization_verifiers) {
+      TF_RETURN_IF_ERROR(verifier->Verify(*optimized_graph));
     }
   }
 
@@ -425,6 +467,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   VLOG(1) << "Starting optimization for grappler item: " << item.id;
   optimization_results_.clear();
 
+  // Constructs a FunctionLibraryDefinition with functions that are reachable
+  // from the nodes of the graph.
+  const auto minimized_flib =
+      [](const GraphDef& graph) -> FunctionLibraryDefinition {
+    return FunctionLibraryDefinition(OpRegistry::Global(), graph.library())
+        .ReachableDefinitions(graph);
+  };
+
   // 0. Original graph might contain a huge function library, that is mostly
   // unused. This library copied over by each individual Grappler optimizer,
   // which adds a huge overhead. Before starting optimization passes we just
@@ -434,11 +484,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   GraphDef trimmed_graph;  // do not copy graph with a potentially huge library
   *trimmed_graph.mutable_node() = item.graph.node();
   *trimmed_graph.mutable_versions() = item.graph.versions();
-  *trimmed_graph.mutable_library() =
-      grappler::ReachableFunctionLibraryDefinition(
-          FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()),
-          item.graph)
-          .ToProto();
+  *trimmed_graph.mutable_library() = minimized_flib(item.graph).ToProto();
 
   GrapplerItem trimmed_item = item.WithGraph(std::move(trimmed_graph));
 
@@ -470,10 +516,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   // 2. Optimize functions reachable from the optimized graph.
-  FunctionLibraryDefinition flib = ReachableFunctionLibraryDefinition(
-      FunctionLibraryDefinition(OpRegistry::Global(),
-                                optimized_graph->library()),
-      *optimized_graph);
+  FunctionLibraryDefinition flib = minimized_flib(*optimized_graph);
 
   // Find functions for which we might need to compute a gradient at runtime.
   absl::flat_hash_set<string> differentiable_functions;
@@ -524,7 +567,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // can't perform non-differentiable rewrites.
       if (differentiable_functions.find(func_name) !=
           differentiable_functions.end()) {
-        func_item.allowed_optimizations().non_differentiable_rewrites = false;
+        func_item.optimization_options().allow_non_differentiable_rewrites =
+            false;
       }
 
       // Function item is allowed to use all devices from the main graph.
@@ -533,10 +577,11 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
         VLOG(3) << added_devices.error_message();
       }
 
-      // We are not allowed to prune side effects from the graph instantiated
-      // by the function definition, because we must guarantee function
-      // execution semantics wrt side effects (see function_optimizer.cc).
-      func_item.allowed_optimizations().prune_ops_with_side_effects = false;
+      // We are not allowed to prune certain types of ops from the graph
+      // instantiated by the function definition, because we must guarantee
+      // function execution semantics wrt side effects (see
+      // function_optimizer.cc).
+      func_item.optimization_options().is_function_instantiation = true;
 
       // Optimize function body graph.
       GraphDef optimized_func_graph;
@@ -626,5 +671,81 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
   return status;
 }
 
+Status OptimizeGraph(
+    std::vector<string> ret_node_names, FunctionLibraryDefinition* flib,
+    const DeviceSet& device_set, Device* cpu_device,
+    const ConfigProto& config_proto,
+    const GrapplerItem::OptimizationOptions& optimization_options,
+    std::unique_ptr<tensorflow::Graph>* g) {
+  if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto)) {
+    return Status::OK();
+  }
+
+  tensorflow::grappler::GrapplerItem item;
+  item.optimization_options() = optimization_options;
+
+  // Add all available devices so that inlined function can be placed.
+  for (const Device* d : device_set.devices()) {
+    Status added_device = item.AddDevice(d->name());
+    if (!added_device.ok()) VLOG(3) << added_device.error_message();
+  }
+
+  // Add fetches so that the graph can be pruned.
+  item.fetch.swap(ret_node_names);
+
+  (*g)->ToGraphDef(&item.graph);
+
+  if (flib) {
+    *item.graph.mutable_library() = flib->ToProto();
+  }
+
+  tensorflow::GraphDef out_graph;
+
+  tensorflow::grappler::VirtualCluster cluster(&device_set);
+
+  // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
+  // proto (which also contain the OptimizerOptions).
+  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+      item, config_proto, cpu_device, &cluster, &out_graph));
+
+  std::unique_ptr<tensorflow::Graph> optimized_graph(
+      new tensorflow::Graph(OpRegistry::Global()));
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(),
+                                            out_graph, optimized_graph.get()));
+
+  // Copy optimized functions back to the overlay lib.
+  if (flib) {
+    for (const FunctionDef& fdef : out_graph.library().function()) {
+      const string& func_name = fdef.signature().name();
+      if (flib->Contains(func_name)) {
+        TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
+      } else {
+        TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
+      }
+    }
+  }
+
+  *g = std::move(optimized_graph);
+
+  // The graph conversion sets the requested device names but not the
+  // assigned device names. However, since at this point the graph is
+  // placed TF expects an assigned device name for every node. Therefore
+  // we copy the requested device into the assigned device field.
+  for (Node* node : (*g)->nodes()) {
+    if (node->IsOp() && node->assigned_device_name().empty()) {
+      if (node->requested_device().empty()) {
+        return errors::Internal(
+            "Either placer did not place the node or Grappler did not "
+            "copy the assigned device. Contact Grappler team since latter "
+            "is more likely. Node=",
+            node->name(), " Graph: ", (*g)->ToGraphDefDebug().DebugString());
+      }
+      node->set_assigned_device_name(node->requested_device());
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index a06da4394e4b8a4d8e75855a0a432114f7d7fcb3..ec78cc5771f5923345d2f9eab1bcc200963862eb 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -16,12 +16,17 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
+#include "tensorflow/core/protobuf/verifier_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -60,6 +65,12 @@ class MetaOptimizer : public GraphOptimizer {
   const RewriterConfig::CustomGraphOptimizer* GetCustomGraphOptimizerConfig(
       const string& name) const;
 
+  // Initialiaze active verifiers from the RewriterConfig toggles.
+  void InitializeVerifiers(
+      std::vector<std::unique_ptr<GraphVerifier>>* inter_optimizer_verifiers,
+      std::vector<std::unique_ptr<GraphVerifier>>* post_optimization_verifiers)
+      const;
+
   // Run optimization pass over a single GrapplerItem. Meta optimizer might run
   // multiple such passes: 1) for the main graph 2) for the function library
   Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
@@ -99,6 +110,30 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph);
 
+// Wrapper around RunMetaOptimizer convenient for optimizing
+// function graphs.
+//
+// Runs grappler optimizations on `g` based on `config_proto`.
+// `ret_node_names`: a vector of node names whose outputs are returned,
+//    aka fetches. when `g` represent a function, these are _Retval nodes.
+// `lib`: function library to use with `g`.
+// `device_set`: the set of devices that graph can refer to.
+// `cpu_device`: the CPU device.
+// `config_proto`: Grapper configuration.
+// `optimization_options`: Grappler optimization constraints that are known only
+//    at runtime.
+//
+// **g is a graph constructed based on the runtime library 'lib'.
+// OptimizeGraph mutates **g extensively and replaces '*g' with a
+// complete copy. Therefore, the caller should not keep any references
+// to nodes *g.
+Status OptimizeGraph(
+    std::vector<string> ret_node_names, FunctionLibraryDefinition* lib,
+    const DeviceSet& device_set, Device* cpu_device,
+    const ConfigProto& config_proto,
+    const GrapplerItem::OptimizationOptions& optimization_options,
+    std::unique_ptr<tensorflow::Graph>* g);
+
 }  // namespace grappler
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 12db5d6ca9b001fa04e42e6d228fe6289d87726e..55f14b3b68ecd181f4833b717b4c080e70855868 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -15,9 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
+#include "absl/strings/match.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -27,7 +29,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -87,12 +91,12 @@ REGISTER_GRAPH_OPTIMIZER(TestOptimizerWithParams);
 // Record various properties of the GrapplerItems passed for optimization.
 class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
  public:
-  static void SetAllowedOptimizations(
-      gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
-          allowed_optimizations) {
-    allowed_optimizations_ = allowed_optimizations;
+  static void SetOptimizationOptions(
+      gtl::FlatMap<string, GrapplerItem::OptimizationOptions>*
+          optimization_options) {
+    optimization_options_ = optimization_options;
   }
-  static void ResetAllowedOptimizations() { allowed_optimizations_ = nullptr; }
+  static void ResetOptimizationOptions() { optimization_options_ = nullptr; }
 
   GrapplerItemPropertiesAccumulator() {}
   string name() const override {
@@ -107,8 +111,8 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override {
     *optimized_graph = item.graph;
-    if (allowed_optimizations_) {
-      allowed_optimizations_->insert({item.id, item.allowed_optimizations()});
+    if (optimization_options_) {
+      optimization_options_->insert({item.id, item.optimization_options()});
     }
     return Status::OK();
   }
@@ -117,12 +121,12 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
                 const GraphDef& optimized_graph, double result) override {}
 
  private:
-  static gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
-      allowed_optimizations_;
+  static gtl::FlatMap<string, GrapplerItem::OptimizationOptions>*
+      optimization_options_;
 };
 
-gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
-    GrapplerItemPropertiesAccumulator::allowed_optimizations_;
+gtl::FlatMap<string, GrapplerItem::OptimizationOptions>*
+    GrapplerItemPropertiesAccumulator::optimization_options_;
 
 REGISTER_GRAPH_OPTIMIZER(GrapplerItemPropertiesAccumulator);
 
@@ -231,7 +235,7 @@ TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) {
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   using test::function::NDef;
 
-  // Enable ony function optimization.
+  // Enable only function optimization.
   ConfigProto config_proto;
   auto& rewriter_config =
       *config_proto.mutable_graph_options()->mutable_rewrite_options();
@@ -254,13 +258,13 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   FunctionDef mul_func = FunctionDefHelper::Create(
       "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "mul:z:0"}});
 
   FunctionDef square_func = FunctionDefHelper::Create(
       "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "my_mul:z:0"}});
   (*square_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -268,7 +272,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
       "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
        {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "quadratic:z:0"}});
   (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -290,7 +294,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
        // Forward outputs
        NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {mul_func, square_func, quadratic_func});
 
   GraphDef output;
@@ -300,7 +304,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
                                            output.library());
 
   // Specialized and optimized functions should be added to the graph.
-  EXPECT_EQ(6, optimized_flib.num_functions());
+  EXPECT_EQ(5, optimized_flib.num_functions());
 
   // Get a specialized function name.
   const auto specialized_name = [](const string& fn, const string& node,
@@ -314,25 +318,22 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
       specialized_name("MyQuadratic", "quadratic", "tf_graph");
 
   // MySquare should be specialized and optimized for 3 instantiations:
-  //   1. 'square' node in the main graph
-  //   2. 'square' node in the MyQuadratic specialization (not in a fetch set)
-  //   3. 'quadratic' node in the MyQuadratic specialization (is in a fetch set)
+  //   1.  'square' node in the main graph
+  //   2.  'square' node in the MyQuadratic specialization
+  //   3*. 'quadratic' node in the MyQuadratic specialization
+  //        has identical instantiation context to #2
 
   const string optimized_1 = specialized_name("MySquare", "square", "tf_graph");
   const string optimized_2 =
       specialized_name("MySquare", "square", optimized_0);
-  const string optimized_3 =
-      specialized_name("MySquare", "quadratic", optimized_0);
 
   const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
   const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
   const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
-  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
 
   ASSERT_NE(optimized_func_0, nullptr);
   ASSERT_NE(optimized_func_1, nullptr);
   ASSERT_NE(optimized_func_2, nullptr);
-  ASSERT_NE(optimized_func_3, nullptr);
 
   // Graph should call optimized function.
   int count = 0;
@@ -351,13 +352,13 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
     if (node.name() == "square" && ++count) {
       EXPECT_EQ(optimized_2, node.op());
     } else if (node.name() == "quadratic" && ++count) {
-      EXPECT_EQ(optimized_3, node.op());
+      EXPECT_EQ(optimized_2, node.op());
     }
   }
   EXPECT_EQ(2, count);
 
-  const std::vector<const FunctionDef*> optimized_funcs = {
-      optimized_func_1, optimized_func_2, optimized_func_3};
+  const std::vector<const FunctionDef*> optimized_funcs = {optimized_func_1,
+                                                           optimized_func_2};
 
   // MyMul should be inlined into all optimized versions of MySquare.
   for (const FunctionDef* optimized_func : optimized_funcs) {
@@ -403,6 +404,97 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneUnusedOutputs) {
+  using test::function::NDef;
+
+  ConfigProto config_proto;
+  MetaOptimizer optimizer(nullptr, config_proto);
+
+  // MyMul computes x*y three times and has three output values.
+  FunctionDef my_mul = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
+      {{{"output0"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /*ret_def=*/
+      {{"z0", "output0:z:0"}, {"z1", "output1:z:0"}, {"z2", "output2:z:0"}});
+
+  // Call MyMyl and forward all three outputs.
+  FunctionDef my_fwd = FunctionDefHelper::Create(
+      "Fwd", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
+      {{{"output"}, "MyMul", {"x", "y"}, {{"T", "$T"}}}},
+      /*ret_def=*/
+      {{"z0", "output:z0:0"}, {"z1", "output:z1:0"}, {"z2", "output:z2:0"}});
+
+  // Mark both functions as `_noinline` to trigger specialization.
+  (*my_mul.mutable_attr())["_noinline"].set_b(true);
+  (*my_fwd.mutable_attr())["_noinline"].set_b(true);
+  /*funcs=*/
+  std::vector<FunctionDef> function_library = {my_mul, my_fwd};
+
+  // Tensorflow graph:
+  //   a = Placeholder[T=float]
+  //   b = Placeholder[T=float]
+  //   fwd = Fwd(a, b)
+  //
+  // Fetch fwd:2 via Identity node.
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.fetch = {"ret"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("fwd", "Fwd", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("ret", "Identity", {"fwd:2"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized functions should be added to the graph.
+  EXPECT_EQ(3, optimized_flib.num_functions());
+
+  // Expected names of the specialized functions.
+  const string specialized_my_fwd = "Fwd_specialized_for_fwd_at_tf_graph";
+  const string specialized_my_mul =
+      absl::StrCat("MyMul_specialized_for_output_at_", specialized_my_fwd);
+
+  // Specialized MyMul should have just one output argument.
+  FunctionDef expected_my_mul = FunctionDefHelper::Create(
+      specialized_my_mul, {"x:float", "y:float"}, {"z2:float"}, {},
+      {{{"output2"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /*ret_def=*/
+      {{"z2", "output2:z:0"}});
+
+  // Specialized Fwd should also have just one output argument.
+  FunctionDef expected_my_fwd = FunctionDefHelper::Create(
+      specialized_my_fwd, {"x:float", "y:float"}, {"z2:float"}, {},
+      {{{"output"}, specialized_my_mul, {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /*ret_def=*/
+      {{"z2", "output:z2:0"}});
+
+  const FunctionDef* my_mul_spec = optimized_flib.Find(specialized_my_mul);
+  const FunctionDef* my_fwd_spec = optimized_flib.Find(specialized_my_fwd);
+
+  ASSERT_NE(my_mul_spec, nullptr);
+  ASSERT_NE(my_fwd_spec, nullptr);
+
+  CompareFunctions(expected_my_mul, *my_mul_spec);
+  CompareFunctions(expected_my_fwd, *my_fwd_spec);
+
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<float>(4.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   using test::function::NDef;
 
@@ -425,7 +517,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
       "MyFunc", {"x:T", "y:T"}, {"z1:T", "z2:T"}, {"T: {float, double}"},
       {{{"mul1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
        {{"mul2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z1", "mul1:z:0"}, {"z2", "mul2:z:0"}});
   (*my_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -449,7 +541,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
        // Read outputs of function call nodes
        NDef("out_fn1", "Identity", {"fn1:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("out_fn2", "Identity", {"fn2:1"}, {{"T", DT_FLOAT}}, kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {my_func});
 
   GraphDef output;
@@ -515,10 +607,9 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
 
   // We will record what type of optimizations meta optimizer allows for each
   // GrapplerItem (main graph and graphs for each function).
-  gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>
-      allowed_optimizations;
-  GrapplerItemPropertiesAccumulator::SetAllowedOptimizations(
-      &allowed_optimizations);
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
 
   // Just record properties of optimized Grappler items.
   ConfigProto config_proto;
@@ -532,17 +623,17 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
   MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define simple function library with two identical mul functions.
-  FunctionDef mul_func_1 = FunctionDefHelper::Create(
-      "MyMul1", {"x:float", "y:float"}, {"z:float"}, {},
-      {{{"mul"}, "Mul", {"x", "y"}, {}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
-
-  FunctionDef mul_func_2 = FunctionDefHelper::Create(
-      "MyMul2", {"x:float", "y:float"}, {"z:float"}, {},
-      {{{"mul"}, "Mul", {"x", "y"}, {}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
 
   // Tensorflow graph:
   //
@@ -568,7 +659,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
              {"Tin", DataTypeSlice{DT_FLOAT}},
              {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
             kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {mul_func_1, mul_func_2});
   item.fetch = {"mul_1", "mul_2", "dx"};
 
@@ -577,22 +668,23 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
 
   // Our custom optimizer must be called for the main graph and for the two
   // functions.
-  ASSERT_EQ(allowed_optimizations.size(), 3);
-
-  auto allowed_optimizations_main =
-      gtl::FindOrNull(allowed_optimizations, "main");
-  ASSERT_NE(allowed_optimizations_main, nullptr);
-  EXPECT_TRUE(allowed_optimizations_main->non_differentiable_rewrites);
-
-  auto allowed_optimizations_my_mul_1 =
-      gtl::FindOrNull(allowed_optimizations, "MyMul1");
-  ASSERT_NE(allowed_optimizations_my_mul_1, nullptr);
-  EXPECT_TRUE(allowed_optimizations_my_mul_1->non_differentiable_rewrites);
-
-  auto allowed_optimizations_my_mul_2 =
-      gtl::FindOrNull(allowed_optimizations, "MyMul2");
-  ASSERT_NE(allowed_optimizations_my_mul_2, nullptr);
-  EXPECT_FALSE(allowed_optimizations_my_mul_2->non_differentiable_rewrites);
+  ASSERT_EQ(optimization_options.size(), 3);
+
+  auto optimization_options_main =
+      gtl::FindOrNull(optimization_options, "main");
+  ASSERT_NE(optimization_options_main, nullptr);
+  EXPECT_TRUE(optimization_options_main->allow_non_differentiable_rewrites);
+
+  auto optimization_options_my_mul_1 =
+      gtl::FindOrNull(optimization_options, "MyMul1");
+  ASSERT_NE(optimization_options_my_mul_1, nullptr);
+  EXPECT_TRUE(optimization_options_my_mul_1->allow_non_differentiable_rewrites);
+
+  auto optimization_options_my_mul_2 =
+      gtl::FindOrNull(optimization_options, "MyMul2");
+  ASSERT_NE(optimization_options_my_mul_2, nullptr);
+  EXPECT_FALSE(
+      optimization_options_my_mul_2->allow_non_differentiable_rewrites);
 }
 
 class SleepingOptimizer : public CustomGraphOptimizer {
@@ -660,6 +752,190 @@ TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
   EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
 }
 
+TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnValidGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config_proto;
+  auto& post_optimization_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_post_optimization_verifier_config();
+  post_optimization_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnValidGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config_proto;
+  auto& inter_optimizer_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_inter_optimizer_verifier_config();
+  inter_optimizer_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnInvalidGraph) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      /*funcs=*/
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+
+  // Call Optimize with post optimization verifiers.
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+  auto& post_optimization_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_post_optimization_verifier_config();
+  post_optimization_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer_with_post_verifiers(nullptr, config_proto);
+  Status status =
+      optimizer_with_post_verifiers.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.code(), errors::Code::NOT_FOUND);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Op type not registered"));
+}
+
+TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnInvalidGraph) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      /*funcs=*/
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+
+  // Call Optimize with post optimization verifiers.
+  ConfigProto config_proto;
+  // Call Optimize with inter optimizer verifiers.
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+  auto& inter_optimizer_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_inter_optimizer_verifier_config();
+  inter_optimizer_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer_with_inter_verifiers(nullptr, config_proto);
+  Status status =
+      optimizer_with_inter_verifiers.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.code(), errors::Code::NOT_FOUND);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Op type not registered"));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index f0c81f29e687aa792df74b69b4c063090a707e61..193772fcda23378850485db105fc2d3ebef1d8ab 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -60,17 +61,30 @@ struct RemapperContext {
 
 // FusedBatchNorm that can be replaced with a cheaper set of primitives.
 struct FusedBatchNorm {
+  FusedBatchNorm() = default;
+  explicit FusedBatchNorm(const NodeDef* fused_batch_norm)
+      : fused_batch_norm(fused_batch_norm) {}
+
   const NodeDef* fused_batch_norm = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd.
 struct Conv2DWithBiasAdd {
+  Conv2DWithBiasAdd() = default;
+  Conv2DWithBiasAdd(const NodeDef* conv2d, const NodeDef* bias_add)
+      : conv2d(conv2d), bias_add(bias_add) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* bias_add = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd and Relu.
 struct Conv2DWithBiasAddAndRelu {
+  Conv2DWithBiasAddAndRelu() = default;
+  Conv2DWithBiasAddAndRelu(const NodeDef* conv2d, const NodeDef* bias_add,
+                           const NodeDef* relu)
+      : conv2d(conv2d), bias_add(bias_add), relu(relu) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* bias_add = nullptr;
   const NodeDef* relu = nullptr;
@@ -78,6 +92,11 @@ struct Conv2DWithBiasAddAndRelu {
 
 // Conv2D node followed by a Squeeze and BiasAdd.
 struct Conv2DWithSqueezeAndBiasAdd {
+  Conv2DWithSqueezeAndBiasAdd() = default;
+  Conv2DWithSqueezeAndBiasAdd(const NodeDef* conv2d, const NodeDef* squeeze,
+                              const NodeDef* bias_add)
+      : conv2d(conv2d), squeeze(squeeze), bias_add(bias_add) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* squeeze = nullptr;
   const NodeDef* bias_add = nullptr;
@@ -85,6 +104,11 @@ struct Conv2DWithSqueezeAndBiasAdd {
 
 // Conv2D node followed by a FusedBatchNorm.
 struct Conv2DWithBatchNorm {
+  Conv2DWithBatchNorm() = default;
+  Conv2DWithBatchNorm(const NodeDef* conv2d, const NodeDef* fused_batch_norm,
+                      float epsilon = 0.0)
+      : conv2d(conv2d), fused_batch_norm(fused_batch_norm), epsilon(epsilon) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* fused_batch_norm = nullptr;
   float epsilon = 0.0;
@@ -92,16 +116,23 @@ struct Conv2DWithBatchNorm {
 
 // Conv2D node followed by a FusedBatchNorm and Relu.
 struct Conv2DWithBatchNormAndRelu {
+  Conv2DWithBatchNormAndRelu() = default;
+  Conv2DWithBatchNormAndRelu(const NodeDef* conv2d,
+                             const NodeDef* fused_batch_norm,
+                             const NodeDef* relu, float epsilon = 0.0)
+      : conv2d(conv2d),
+        fused_batch_norm(fused_batch_norm),
+        relu(relu),
+        epsilon(epsilon) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* fused_batch_norm = nullptr;
   const NodeDef* relu = nullptr;
   float epsilon = 0.0;
 };
 
-bool IsFloatOrDoubleDataType(const NodeDef* node,
-                             const string& type_attr = "T") {
-  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
-  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
+  return ctx.nodes_to_preserve.count(node->name()) > 0;
 }
 
 bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
@@ -119,91 +150,165 @@ bool HasDataType(const NodeDef* node, const DataType& expected,
   return dtype == expected;
 }
 
-bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
-  return ctx.nodes_to_preserve.count(node->name()) > 0;
+bool IsCpuCompatibleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+}
+
+bool IsGpuCompatibleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT;
+}
+
+bool IsCpuCompatibleDataFormat(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  const string& data_format = conv2d->attr().at(kDataFormat).s();
+  return data_format == "NHWC";
+}
+
+bool IsGpuCompatibleDataFormat(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  const string& data_format = conv2d->attr().at(kDataFormat).s();
+  return data_format == "NHWC" || data_format == "NCHW";
 }
 
-bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* node,
-                        Conv2DWithBiasAdd* matched) {
+bool IsCpuCompatibleConv2D(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  return NodeIsOnCpu(conv2d) && IsCpuCompatibleDataType(conv2d) &&
+         IsCpuCompatibleDataFormat(conv2d);
+}
+
+bool IsGpuCompatibleConv2D(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  return NodeIsOnGpu(conv2d) && IsGpuCompatibleDataType(conv2d) &&
+         IsGpuCompatibleDataFormat(conv2d);
+}
+
+// Checks if we can rewrite a pattern to the `_FusedConv2D` on CPU device.
+template <typename Pattern>
+bool IsCpuCompatible(const Pattern& matched) {
+  return IsCpuCompatibleConv2D(matched.conv2d);
+}
+
+// Checks if we can rewrite a pattern to the `_FusedConv2D` on GPU device.
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithBiasAddAndRelu& matched) {
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      ctx.graph_properties.GetInputProperties(matched.conv2d->name());
+  const TensorShapeProto& filter_shape =
+      input_props.size() >= 2 ? input_props[1].shape() : TensorShapeProto();
+
+  // FusedConv2D on GPU with 1x1 convolution is marginally faster than
+  // in-graph computation in micro benchmarks (see kernels/conv_ops_test.cc),
+  // and significantly slower in large scale benchmarks.
+  bool is_spatial_conv = Rank(filter_shape) == 4 &&          //
+                         IsKnown(filter_shape.dim(1)) &&     //
+                         IsKnown(filter_shape.dim(2)) &&     //
+                         filter_shape.dim(1).size() != 1 &&  //
+                         filter_shape.dim(2).size() != 1;
+
+  return is_spatial_conv && IsGpuCompatibleConv2D(matched.conv2d);
+}
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithBiasAdd& matched) {
+  return false;
+}
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithSqueezeAndBiasAdd& matched) {
+  return false;
+}
+
+// Returns true if the given pattern is supported on the assigned device.
+template <typename Pattern>
+bool IsDeviceCompatible(const RemapperContext& ctx, Pattern& matched) {
+  return IsCpuCompatible(matched) || IsGpuCompatible(ctx, matched);
+}
+
+bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* bias_add,
+                        Conv2DWithBiasAdd* matched,
+                        bool check_device_compatible = true) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a BiasAdd.
-  if (!node) return false;
-  if (!IsBiasAdd(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (bias_add == nullptr || !IsBiasAdd(*bias_add) ||
+      HasControlFaninOrFanout(ctx.graph_view, bias_add))
+    return false;
 
-  // Input to the BiasAdd must be a Conv2D in NHWC format.
-  const auto input_port = GraphView::InputPort(node, 0);
+  // Input to the BiasAdd must be a Conv2D.
+  const auto input_port = GraphView::InputPort(bias_add, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
-  if (!conv2d.node) return false;
-  if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||
+      !HaveSameDataType(bias_add, conv2d.node) ||
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithBiasAdd pattern{conv2d.node, bias_add};
+  if (check_device_compatible && !IsDeviceCompatible(ctx, pattern)) {
+    return false;
+  }
 
   // We successfully found a Conv2D+BiasAdd pattern.
-  matched->conv2d = conv2d.node;
-  matched->bias_add = node;
+  *matched = pattern;
 
   return true;
 }
 
-bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* node,
+bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* relu,
                                Conv2DWithBiasAddAndRelu* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a Relu.
-  if (!node) return false;
-  if (!IsRelu(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!relu || !IsRelu(*relu) || HasControlFaninOrFanout(ctx.graph_view, relu))
+    return false;
 
   // And input to Relu must match Conv2DWithBiasAdd pattern.
-  const auto input_port = GraphView::InputPort(node, 0);
+  const auto input_port = GraphView::InputPort(relu, 0);
   const auto bias_add = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBiasAdd base;
-  if (!FindConv2DWithBias(ctx, bias_add.node, &base)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, base.bias_add)) return false;
-  if (!HaveSameDataType(node, base.bias_add)) return false;
-  if (IsInPreserveSet(ctx, base.bias_add)) return false;
+  if (!FindConv2DWithBias(ctx, bias_add.node, &base,
+                          /*check_device_compatible=*/false) ||
+      !HasSingleFanoutNode(ctx.graph_view, base.bias_add) ||
+      !HaveSameDataType(relu, base.bias_add) ||
+      IsInPreserveSet(ctx, base.bias_add))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithBiasAddAndRelu pattern{base.conv2d, base.bias_add, relu};
+  if (!IsDeviceCompatible(ctx, pattern)) return false;
 
   // We successfully found a Conv2D+BiasAdd+Relu pattern.
-  matched->conv2d = base.conv2d;
-  matched->bias_add = base.bias_add;
-  matched->relu = node;
+  *matched = pattern;
 
   return true;
 }
 
 bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
-                                  const NodeDef* node,
+                                  const NodeDef* bias_add,
                                   Conv2DWithSqueezeAndBiasAdd* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a BiasAdd.
-  if (node == nullptr) return false;
-  if (node->op() != "BiasAdd") return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!bias_add || !IsBiasAdd(*bias_add) ||
+      HasControlFaninOrFanout(ctx.graph_view, bias_add))
+    return false;
 
   // Input to the BiasAdd must be a Squeeze.
-  const auto bias_input_port = GraphView::InputPort(node, 0);
+  const auto bias_input_port = GraphView::InputPort(bias_add, 0);
   const auto squeeze = ctx.graph_view.GetRegularFanin(bias_input_port);
-  if (squeeze.node == nullptr) return false;
-  if (squeeze.node->op() != "Squeeze") return false;
-  if (!NodeIsOnCpu(squeeze.node)) return false;
-  if (!HaveSameDataType(node, squeeze.node, "T")) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, squeeze.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, squeeze.node)) return false;
-  if (IsInPreserveSet(ctx, squeeze.node)) return false;
+
+  if (!squeeze.node || !IsSqueeze(*squeeze.node) ||
+      !HaveSameDataType(bias_add, squeeze.node, "T") ||
+      HasControlFaninOrFanout(ctx.graph_view, squeeze.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, squeeze.node) ||
+      IsInPreserveSet(ctx, squeeze.node))
+    return false;
 
   // Squeeze must not squeeze output channel dimension.
   std::vector<int32> dims;
@@ -212,67 +317,72 @@ bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
     if (dim == 3) return false;
   }
 
-  // Input to the Squeeze must be a Conv2D in NHWC format.
+  // Input to the Squeeze must be a Conv2D.
   const auto squeeze_input_port = GraphView::InputPort(squeeze.node, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(squeeze_input_port);
-  if (conv2d.node == nullptr) return false;
-  if (conv2d.node->op() != "Conv2D") return false;
-  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||
+      !HaveSameDataType(bias_add, conv2d.node, "T") ||
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithSqueezeAndBiasAdd pattern{conv2d.node, squeeze.node,
+                                            bias_add};
+  if (!IsDeviceCompatible(ctx, pattern)) return false;
 
   // We successfully found a Conv2D+Squeeze+BiasAdd pattern.
-  matched->conv2d = conv2d.node;
-  matched->squeeze = squeeze.node;
-  matched->bias_add = node;
+  *matched = pattern;
 
   return true;
 }
 
-bool FindConv2DWithBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+bool FindConv2DWithBatchNorm(const RemapperContext& ctx,
+                             const NodeDef* batch_norm,
                              Conv2DWithBatchNorm* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a FusedBatchNorm or a FusedBatchNormV2.
-  if (node == nullptr) return false;
-  if (!IsFusedBatchNorm(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!HasDataType(node, DT_FLOAT)) return false;
+  if (!batch_norm || !IsFusedBatchNorm(*batch_norm)) return false;
 
   // V2 has a separate data type for the scale/offset/mean/variance inputs.
-  if (node->op() == "FusedBatchNormV2" && !HasDataType(node, DT_FLOAT, "U"))
+  if (batch_norm->op() == "FusedBatchNormV2" &&
+      !HasDataType(batch_norm, DT_FLOAT, "U"))
     return false;
 
   // Check that batch normalization is in inference mode.
-  const auto& attr = node->attr();
+  const auto& attr = batch_norm->attr();
   if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
 
   // Check that only 0th output is consumed by other nodes.
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
-  if (HasFanouts(ctx.graph_view, node, 1)) return false;  // batch_mean
-  if (HasFanouts(ctx.graph_view, node, 2)) return false;  // batch_variance
-  if (HasFanouts(ctx.graph_view, node, 3)) return false;  // reserve_space_1
-  if (HasFanouts(ctx.graph_view, node, 4)) return false;  // reserve_space_2
+  if (HasControlFaninOrFanout(ctx.graph_view, batch_norm) ||
+      HasFanouts(ctx.graph_view, batch_norm, 1) ||  // batch_mean
+      HasFanouts(ctx.graph_view, batch_norm, 2) ||  // batch_variance
+      HasFanouts(ctx.graph_view, batch_norm, 3) ||  // reserve_space_1
+      HasFanouts(ctx.graph_view, batch_norm, 4))    // reserve_space_2
+    return false;
 
-  // Input to the FusedBatchNorm must be a Conv2D in NHWC format.
-  const auto input_port = GraphView::InputPort(node, 0);
+  // Input to the FusedBatchNorm must be a Conv2D.
+  const auto input_port = GraphView::InputPort(batch_norm, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
-  if (conv2d.node == nullptr) return false;
-  if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||               //
+      !NodeIsOnCpu(conv2d.node) ||                             //
+      !HaveSameDataType(batch_norm, conv2d.node) ||            //
+      !IsCpuCompatibleDataType(conv2d.node) ||                 //
+      !IsCpuCompatibleDataFormat(conv2d.node) ||               //
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||  //
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||     //
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
 
   // We successfully found a Conv2D+FusedBatchNorm pattern.
   matched->conv2d = conv2d.node;
-  matched->fused_batch_norm = node;
-  if (!GetNodeAttr(*node, "epsilon", &matched->epsilon).ok()) return false;
+  matched->fused_batch_norm = batch_norm;
+  if (!GetNodeAttr(*batch_norm, "epsilon", &matched->epsilon).ok())
+    return false;
 
   return true;
 }
@@ -283,21 +393,19 @@ bool FindConv2DWithBatchNormAndRelu(const RemapperContext& ctx,
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a Relu.
-  if (node == nullptr) return false;
-  if (!IsRelu(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!node || !IsRelu(*node) || HasControlFaninOrFanout(ctx.graph_view, node))
+    return false;
 
   // And input to Relu must match Conv2DWithBatchNorm pattern.
   const auto input_port = GraphView::InputPort(node, 0);
   const auto batch_norm = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBatchNorm base;
-  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm)) return false;
-  if (!HaveSameDataType(node, base.fused_batch_norm)) return false;
-  if (IsInPreserveSet(ctx, base.fused_batch_norm)) return false;
+  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base) ||
+      !HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm) ||
+      !HaveSameDataType(node, base.fused_batch_norm) ||
+      IsInPreserveSet(ctx, base.fused_batch_norm))
+    return false;
 
   // We successfully found a Conv2D+FusedBatchNorm+Relu pattern.
   matched->conv2d = base.conv2d;
@@ -355,9 +463,7 @@ bool FindFusedBatchNorm(const RemapperContext& ctx, const NodeDef* node,
   return true;
 }
 
-void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
-                          const std::vector<string>& fused_ops = {},
-                          int num_args = 1, float epsilon = 0.0) {
+void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d) {
   auto* attr = fused_conv2d->mutable_attr();
   auto src_attr = conv2d->attr();
 
@@ -367,53 +473,65 @@ void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
   (*attr)["dilations"] = src_attr.at("dilations");
   (*attr)["data_format"] = src_attr.at("data_format");
   (*attr)["use_cudnn_on_gpu"] = src_attr.at("use_cudnn_on_gpu");
+}
 
-  auto* fused_ops_attr = (*attr)["fused_ops"].mutable_list();
-  for (const string& fused_op : fused_ops) {
-    fused_ops_attr->add_s(fused_op);
-  }
-
+void SetFusedConv2DAttributes(
+    NodeDef* fused_conv2d, const absl::Span<const absl::string_view> fused_ops,
+    int num_args = 1, float epsilon = 0.0) {
+  auto* attr = fused_conv2d->mutable_attr();
+  SetAttrValue(fused_ops, &(*attr)["fused_ops"]);
   SetAttrValue(num_args, &(*attr)["num_args"]);
-  // Required only for FusedBatchNorm.
-  SetAttrValue(epsilon, &(*attr)["epsilon"]);
+  SetAttrValue(epsilon, &(*attr)["epsilon"]);  // required only for BatchNorm
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithBiasAdd& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithBiasAdd& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
-  VLOG(2) << "Fuse Conv2D with BiasAdd: bias_add=" << matched.bias_add->name()
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
+  VLOG(2) << "Fuse Conv2D with BiasAdd: "
+          << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
-  fused_conv2d->set_name(matched.bias_add->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.bias_add->device());
+  fused_conv2d->set_name(matched.bias_add->name());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd"});
 
   invalidated_nodes->insert(matched.bias_add);
   invalidated_nodes->insert(matched.conv2d);
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithBiasAddAndRelu& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithBiasAddAndRelu& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
-  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: relu=" << matched.relu->name()
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
+  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: "
+          << " relu=" << matched.relu->name()
           << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.relu->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd", "Relu"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd", "Relu"});
 
   invalidated_nodes->insert(matched.relu);
   invalidated_nodes->insert(matched.bias_add);
@@ -421,8 +539,12 @@ void AddFusedConv2DNode(
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithSqueezeAndBiasAdd& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithSqueezeAndBiasAdd& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
   VLOG(2) << "Fuse Conv2D with Squeeze and BiasAdd: "
           << " bias_add=" << matched.bias_add->name()
           << " squeeze=" << matched.squeeze->name()
@@ -432,13 +554,14 @@ void AddFusedConv2DNode(
   // has single consumer (only the squeeze node).
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.conv2d->name());
-  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_op(kFusedConv2D);
   fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd"});
 
   // Replace BiasAdd node with a Squeeze.
   NodeDef* remapped_squeeze = optimized_graph->add_node();
@@ -461,7 +584,7 @@ void AddFusedConv2DNode(
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.fused_batch_norm->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
   fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
@@ -469,8 +592,9 @@ void AddFusedConv2DNode(
   fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
   fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm"},
-                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"FusedBatchNorm"},
+                           /*num_args=*/4, /*epsilon=*/matched.epsilon);
 
   invalidated_nodes->insert(matched.fused_batch_norm);
   invalidated_nodes->insert(matched.conv2d);
@@ -487,7 +611,7 @@ void AddFusedConv2DNode(
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
   fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
@@ -495,8 +619,9 @@ void AddFusedConv2DNode(
   fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
   fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm", "Relu"},
-                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"FusedBatchNorm", "Relu"},
+                           /*num_args=*/4, /*epsilon=*/matched.epsilon);
 
   invalidated_nodes->insert(matched.relu);
   invalidated_nodes->insert(matched.fused_batch_norm);
@@ -680,21 +805,25 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
 
     // Remap Conv2D+BiasAdd into the _FusedConv2D.
     if (FindConv2DWithBias(ctx, &node, &conv2d_with_bias)) {
-      AddFusedConv2DNode(conv2d_with_bias, optimized_graph, &invalidated_nodes);
+      AddFusedConv2DNode(ctx, conv2d_with_bias, optimized_graph,
+                         &invalidated_nodes);
       continue;
     }
 
     // Remap Conv2D+BiasAdd+Relu into the _FusedConv2D.
     if (FindConv2DWithBiasAndRelu(ctx, &node, &conv2d_with_bias_and_relu)) {
-      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph,
+      AddFusedConv2DNode(ctx, conv2d_with_bias_and_relu, optimized_graph,
                          &invalidated_nodes);
       continue;
     }
 
+// TODO(penporn):
+// Remove this once TF-MKL supports _FusedConv2D with these operations.
+#ifndef INTEL_MKL
     // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze.
     if (FindConv2DWithSqueezeAndBias(ctx, &node,
                                      &conv2d_with_squeeze_and_bias)) {
-      AddFusedConv2DNode(conv2d_with_squeeze_and_bias, optimized_graph,
+      AddFusedConv2DNode(ctx, conv2d_with_squeeze_and_bias, optimized_graph,
                          &invalidated_nodes);
       continue;
     }
@@ -713,6 +842,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
                          &invalidated_nodes);
       continue;
     }
+#endif  // !INTEL_MKL
 
     // Infer properties lazily in case they are not needed.
     if (!ctx.inferred_graph_properties && IsFusedBatchNormCandidate(node)) {
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 90ad04cf47b7ec7d8d80f90d65ea4aafa7722464..375c3e56c80aa65cd9e5ab0e2248b81d3e3db776 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -166,10 +168,10 @@ string AddPrefixToNodeName(const string& name, const string& prefix,
                            const string& delimiter) {
   if (!name.empty()) {
     if (name[0] == '^') {
-      return strings::StrCat("^", prefix, delimiter, name.substr(1));
+      return absl::StrCat("^", prefix, delimiter, name.substr(1));
     }
   }
-  return strings::StrCat(prefix, delimiter, name);
+  return absl::StrCat(prefix, delimiter, name);
 }
 
 string AddPrefixToNodeName(const string& name, const string& prefix) {
@@ -193,20 +195,26 @@ bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
 }
 
 string AsControlDependency(const NodeDef& node) {
-  return strings::StrCat("^", node.name());
+  return absl::StrCat("^", node.name());
 }
 
 string AsControlDependency(const string& node_name) {
   CHECK(!node_name.empty());
   return (!node_name.empty() && node_name[0] == '^')
              ? node_name
-             : strings::StrCat("^", node_name);
+             : absl::StrCat("^", node_name);
 }
 
 bool NodeIsOnCpu(const NodeDef* node) {
   string task, device;
   return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
-         str_util::StartsWith(device, DEVICE_CPU);
+         absl::StartsWith(device, DEVICE_CPU);
+}
+
+bool NodeIsOnGpu(const NodeDef* node) {
+  string task, device;
+  return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+         absl::StartsWith(device, DEVICE_GPU);
 }
 
 int NumOutputs(const NodeDef& node, GraphDef* graph) {
@@ -402,123 +410,6 @@ void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
   EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
 }
 
-Status SimpleGraphView::Initialize(
-    const GraphDef& graph,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies,
-    bool dedup_inputs, bool dedup_outputs) {
-  graph_ = &graph;
-  const int num_nodes = graph.node_size();
-  inputs_.clear();
-  inputs_.resize(num_nodes);
-  outputs_.clear();
-  outputs_.resize(num_nodes);
-  name_to_index_.clear();
-  name_to_index_.reserve(num_nodes);
-  index_to_name_.clear();
-  index_to_name_.reserve(num_nodes);
-
-  // Build map from name to index and vice versa.
-  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = graph.node(node_idx);
-    name_to_index_.emplace(node.name(), node_idx);
-    index_to_name_.push_back(node.name());
-  }
-
-  if (extra_dependencies) {
-    for (const auto& dep : *extra_dependencies) {
-      auto itr_src = name_to_index_.find(dep.first->name());
-      if (itr_src == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent src ", dep.first->name());
-      }
-      auto itr_tgt = name_to_index_.find(dep.second->name());
-      if (itr_tgt == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent tgt ", dep.second->name());
-      }
-      const int src_idx = itr_src->second;
-      const int tgt_idx = itr_tgt->second;
-      inputs_[tgt_idx].push_back(src_idx);
-      outputs_[src_idx].push_back(tgt_idx);
-    }
-  }
-
-  // Build forward and reverse adjacency lists.
-  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = graph.node(node_idx);
-    inputs_[node_idx].reserve(node.input_size());
-    for (const string& input : node.input()) {
-      auto it = name_to_index_.find(NodeName(input));
-      if (it == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent input ", input,
-                                       " for node ", node.name());
-      }
-      const int input_idx = it->second;
-      inputs_[node_idx].push_back(input_idx);
-      outputs_[input_idx].push_back(node_idx);
-    }
-    if (dedup_inputs) {
-      // Dedup the input list while it's still hot in cache.
-      STLSortAndRemoveDuplicates(&inputs_[node_idx]);
-    }
-  }
-
-  // Dedup outputs.
-  if (dedup_outputs) {
-    for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-      STLSortAndRemoveDuplicates(&outputs_[node_idx]);
-    }
-  }
-  return Status::OK();
-}
-
-void SimpleGraphView::DepthFirstSearch(
-    const std::unordered_set<string>& op_types_to_traverse, int root_node,
-    std::set<int>* nodes_found) const {
-  nodes_found->clear();
-  const string& op_type = graph_->node(root_node).op();
-  if (!op_types_to_traverse.empty() &&
-      op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
-    return;
-  }
-  std::vector<int> stack;
-  stack.reserve(32);
-  stack.push_back(root_node);
-  while (!stack.empty()) {
-    const int node_idx = stack.back();
-    stack.pop_back();
-    nodes_found->insert(node_idx);
-    const string& op_type = graph_->node(node_idx).op();
-    if (op_types_to_traverse.empty() ||
-        op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
-      for (auto output_idx : this->outputs(node_idx)) {
-        if (nodes_found->find(output_idx) == nodes_found->end()) {
-          stack.push_back(output_idx);
-        }
-      }
-    }
-  }
-}
-
-string SimpleGraphView::PrintToString() const {
-  string str;
-  for (int i = 0; i < num_nodes(); ++i) {
-    strings::StrAppend(&str, "Node ", i, "'", node_name(i), "'\n", "Inputs: [");
-    for (int input : inputs(i)) {
-      strings::StrAppend(&str, input, " '", node_name(input), "', ");
-    }
-    strings::StrAppend(&str, "]\n", "Outputs: [");
-    for (int j = 0; j < outputs(i).size(); ++j) {
-      const int output = outputs(i)[j];
-      if (j > 0) {
-        strings::StrAppend(&str, ", ");
-      }
-      strings::StrAppend(&str, output, " '", node_name(output), "'");
-    }
-    strings::StrAppend(&str, "]\n");
-  }
-  return str;
-}
-
 #define HANDLE_CASE(DTYPE)                                          \
   case DTYPE:                                                       \
     if (!SafeSetScalarTensorValue<EnumToDataType<DTYPE>::Type>(     \
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 89a87af323a4b40e3ce0a997d4a68a243498b046..9053ae4c07dae96c96bac416cf9e175c88462c33 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -242,6 +242,9 @@ string AsControlDependency(const string& node);
 // Returns true if the node is assigned to run on CPU device.
 bool NodeIsOnCpu(const NodeDef* node);
 
+// Returns true if the node is assigned to run on GPU device.
+bool NodeIsOnGpu(const NodeDef* node);
+
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
@@ -302,68 +305,6 @@ void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph);
 void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
                          GraphDef* graph);
 
-class SimpleGraphView {
- public:
-  // Build a graph view for the specified graphdef.
-  Status Initialize(const GraphDef& graph) {
-    return Initialize(graph, nullptr, true, true);
-  }
-  // Build a graph view for the specified graphdef augmented with the additional
-  // edges specified in 'extra_dependencies' if any. Note that
-  // extra_dependencies can be null.
-  Status Initialize(
-      const GraphDef& graph,
-      const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-          extra_dependencies) {
-    return Initialize(graph, extra_dependencies, true, true);
-  }
-  Status Initialize(
-      const GraphDef& graph,
-      const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-          extra_dependencies,
-      bool dedup_inputs, bool dedup_outputs);
-
-  const GraphDef* graph() const { return graph_; }
-  inline int num_nodes() const { return index_to_name_.size(); }
-  inline bool has_node(const string& node_name) const {
-    return name_to_index_.find(node_name) != name_to_index_.end();
-  }
-  inline const int index(const string& node_name) const {
-    const auto& it = name_to_index_.find(node_name);
-    DCHECK(it != name_to_index_.end());
-    return it == name_to_index_.end() ? -1 : it->second;
-  }
-  inline const NodeDef& node(int node_idx) const {
-    return graph_->node(node_idx);
-  }
-  inline const string& node_name(int node_idx) const {
-    return index_to_name_[node_idx];
-  }
-  inline const gtl::InlinedVector<int, 4>& inputs(int node_idx) const {
-    return inputs_[node_idx];
-  }
-  inline const gtl::InlinedVector<int, 2>& outputs(int node_idx) const {
-    return outputs_[node_idx];
-  }
-
-  // Traverse the graph starting at `node_idx`, collecting indices of nodes
-  // visited in nodes_found. If a node has an op in `op_types_to_traverse`, the
-  // walk continues to its children. It is assumed that *graph_ was not modified
-  // after the call to Initialize().
-  // If `op_types_to_traverse` is empty the DFS will traverse any node type.
-  void DepthFirstSearch(const std::unordered_set<string>& op_types_to_traverse,
-                        int node_idx, std::set<int>* nodes_found) const;
-
-  string PrintToString() const;
-
- private:
-  const GraphDef* graph_;  // Not owned.
-  std::vector<string> index_to_name_;
-  gtl::FlatMap<string, int> name_to_index_;
-  std::vector<gtl::InlinedVector<int, 4>> inputs_;
-  std::vector<gtl::InlinedVector<int, 2>> outputs_;
-};
-
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index c0f19d3828ac1581a937531318ff62875fbf3bc7..1fd0a02b65e3a212780b6fdabadce98833b3ebda 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -48,8 +48,11 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -58,10 +61,11 @@ tf_cc_test(
     srcs = ["topological_sort_test.cc"],
     deps = [
         ":topological_sort",
-        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -101,8 +105,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
-        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:graph_topology_view",
         "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
@@ -116,6 +119,8 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -138,6 +143,7 @@ cc_library(
         "//tensorflow/core:test",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -173,6 +179,10 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -191,6 +201,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index f2894a942bd3dac3e22748787eaa24717ed61555..357a0b3b47a233e33a1d686eab2eed7ca9b6cc28 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -14,8 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/grappler/utils/functions.h"
 
-#include <unordered_map>
-
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
@@ -76,16 +76,6 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
 
 }  // namespace
 
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return flib.ReachableDefinitions(graph);
-}
-
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return flib.ReachableDefinitions(func);
-}
-
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
     InputArgExpansion input_arg_expansion) {
   string input_name = input_arg_expansion.input_name;
@@ -94,7 +84,7 @@ void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
     input_arg_placeholders_.insert(
-        {placeholder, InputArgPlaceholder{input_name, /*input_position=*/i}});
+        {placeholder, InputArgPlaceholder{input_name, /*input_index=*/i}});
   }
   input_arg_expansions_.insert(
       {std::move(input_name), std::move(input_arg_expansion)});
@@ -193,7 +183,7 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
           // If position is not defined expand node output range
           for (int i = output_range.first; i < output_range.second; ++i) {
             graph_def_inputs->push_back(
-                i == 0 ? node_name : strings::StrCat(node_name, ":", i));
+                i == 0 ? node_name : absl::StrCat(node_name, ":", i));
           }
         } else {
           if (position > (output_range.second - output_range.first)) {
@@ -203,7 +193,7 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
           }
           int pos = output_range.first + position;
           graph_def_inputs->push_back(
-              pos == 0 ? node_name : strings::StrCat(node_name, ":", pos));
+              pos == 0 ? node_name : absl::StrCat(node_name, ":", pos));
         }
 
         return Status::OK();
@@ -232,39 +222,39 @@ Status GrapplerFunctionConnectivity::ExpandNodeInputs(
 
 Status GrapplerFunctionConnectivity::AsFunctionDefInput(
     const string& graph_def_input, string* func_def_input) const {
-  using gtl::FindOrNull;
-
   if (IsControlInput(graph_def_input)) {
     *func_def_input = graph_def_input;
     return Status::OK();
   }
 
-  int position;
-  string node_name = ParseNodeName(graph_def_input, &position);
-  CHECK_GE(position, 0);
+  const TensorId tensor = ParseTensorName(graph_def_input);
+  DCHECK_GE(tensor.index(), 0);
+
+  const absl::string_view node_name = tensor.node();
+  const int index = tensor.index();
 
   // Check if it's an input arg placeholder
-  if (position == 0) {
-    const InputArgPlaceholder* placeholder =
-        FindOrNull(input_arg_placeholders_, node_name);
-    if (placeholder != nullptr) {
-      *func_def_input = strings::StrCat(placeholder->input_name, ":",
-                                        placeholder->input_position);
+  if (tensor.index() == 0) {
+    const auto is_input_placeholder = input_arg_placeholders_.find(node_name);
+    if (is_input_placeholder != input_arg_placeholders_.end()) {
+      const InputArgPlaceholder& placeholder = is_input_placeholder->second;
+      *func_def_input =
+          absl::StrCat(placeholder.input_name, ":", placeholder.input_index);
       return Status::OK();
     }
   }
 
   // It must be output from one of the function body nodes
-  const tensorflow::NameRangeMap* outputs_range_map =
-      FindOrNull(function_body_outputs_, node_name);
-  if (outputs_range_map != nullptr) {
-    for (const auto& el : *outputs_range_map) {
+  const auto is_body_output = function_body_outputs_.find(tensor.node());
+  if (is_body_output != function_body_outputs_.end()) {
+    const tensorflow::NameRangeMap& outputs_range_map = is_body_output->second;
+
+    for (const auto& el : outputs_range_map) {
       const auto& output_name = el.first;
       const auto& output_range = el.second;
-      if (position >= output_range.first && position < output_range.second) {
-        int pos = position - output_range.first;
-        *func_def_input =
-            strings::StrCat(node_name, ":", output_name, ":", pos);
+      if (index >= output_range.first && index < output_range.second) {
+        int pos = index - output_range.first;
+        *func_def_input = absl::StrCat(node_name, ":", output_name, ":", pos);
         return Status::OK();
       }
     }
@@ -338,19 +328,18 @@ GrapplerFunctionItem::GrapplerFunctionItem(
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
       feed.push_back({placeholder, Tensor()});
-      input_arg_placeholders_.insert(placeholder);
     }
   }
   // Fill the fetch nodes with outputs.
   for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
-    for (const string& output_tensor : output_arg.output_tensors) {
-      fetch.push_back(output_tensor);
+    for (const string& output_node : output_arg.output_nodes) {
+      fetch.push_back(output_node);
     }
   }
 
-  // It's unsafe to prune side-effectful ops from the graph instantiated from a
-  // function definition (see inlining in function_optimizer.cc).
-  allowed_optimizations().prune_ops_with_side_effects = false;
+  // Tensorflow functions execution semantics is different from the main graph,
+  // and we need to preserve it when we do graph optimizations.
+  optimization_options().is_function_instantiation = true;
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -367,11 +356,6 @@ const std::size_t GrapplerFunctionItem::input_size() const {
   return input_arg_expansions_.size();
 }
 
-bool GrapplerFunctionItem::IsInputPlaceholder(const string& node_name) const {
-  return input_arg_placeholders_.find(node_name) !=
-         input_arg_placeholders_.end();
-}
-
 const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
   return output_arg_expansions_;
 }
@@ -426,7 +410,7 @@ bool IsParametrized(const FunctionDef& func) {
 
 Status InstantiationTypeParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, DataType>* type_parameters) {
+    absl::flat_hash_map<string, DataType>* type_parameters) {
   if (!type_parameters->empty()) {
     return errors::InvalidArgument("Type parameters output map must be empty");
   }
@@ -454,7 +438,7 @@ Status InstantiationTypeParameters(
 
 Status InstantiationBodyParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, AttrValue>* body_parameters) {
+    absl::flat_hash_map<string, AttrValue>* body_parameters) {
   if (!body_parameters->empty()) {
     return errors::InvalidArgument("Body parameters output map must be empty");
   }
@@ -514,8 +498,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // Function body shares the library with the graph that instantiated it. We do
   // not need a full copy of the function library, just the reachable subset.
-  *function_body.mutable_library() =
-      ReachableFunctionLibraryDefinition(flib, func).ToProto();
+  *function_body.mutable_library() = flib.ReachableDefinitions(func).ToProto();
 
   VLOG(3) << absl::Substitute(
       "Deleted $0 unreachable functions from the Grappler function item "
@@ -525,12 +508,18 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
 
-  // Make sure that there is no tensor sequences in outputs
+  // Make sure that there are no tensor lists in inputs or outputs.
+  for (const OpDef::ArgDef& input : signature.input_arg()) {
+    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Inputs with lists of tensors are not supported. Input: ",
+          input.name());
+    }
+  }
   for (const OpDef::ArgDef& output : signature.output_arg()) {
     if (!output.type_list_attr().empty() || !output.number_attr().empty()) {
       return errors::InvalidArgument(
-          "Outputs with sequence of tensors are not supported. Unsupported "
-          "output: ",
+          "Outputs with lists of tensors are not supported. Output: ",
           output.name());
     }
   }
@@ -540,13 +529,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // For each input argument create a placeholder in function body.
   for (const OpDef::ArgDef& input : signature.input_arg()) {
-    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
-      return errors::InvalidArgument(
-          "Inputs with sequence of tensors are not supported. Unsupported "
-          "input: ",
-          input.name());
-    }
-
     DataType input_data_type;
     TF_RETURN_IF_ERROR(instantiation.GetArgType(input, &input_data_type));
 
@@ -565,8 +547,25 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     inputs.push_back(std::move(input_expansion));
   }
 
-  // Add all function nodes to the function body
+  // Keep names of all nodes in the function body to guarantee that we do not
+  // add an identity with a duplicate name.
+  absl::flat_hash_set<absl::string_view> func_body_nodes;
+
+  // Generate unique output node name: "${out_arg_name}_output_node_${index}".
+  const auto output_node_name = [&func_body_nodes](const OpDef::ArgDef& out,
+                                                   int index) -> string {
+    string name = absl::StrCat(out.name(), "_output_node_", index);
+    int i = 1;
+    while (func_body_nodes.find(name) != func_body_nodes.end()) {
+      name = absl::StrCat(out.name(), "_output_node_", index, "_", i++);
+    }
+    return name;
+  };
+
+  // Add all function nodes to the function body.
   for (const NodeDef& func_def_node : func.node_def()) {
+    func_body_nodes.insert(func_def_node.name());
+
     NodeDef* new_node = function_body.add_node();
     *new_node = func_def_node;
 
@@ -589,8 +588,13 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   std::vector<OutputArgExpansion> outputs;
   outputs.reserve(signature.output_arg_size());
-  // Add function outputs
+
+  // For each function output argument we create an Identity node in the
+  // function body, that reads output tensor from the function body node.
   for (const OpDef::ArgDef& out : signature.output_arg()) {
+    DataType output_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+
     std::vector<string> output_tensors;
     auto ret = func.ret().find(out.name());
     TF_RETURN_IF_ERROR(
@@ -600,13 +604,23 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
             // Otherwise output must be one of the function inputs
             : connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
 
-    DataType output_data_type;
-    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+    absl::InlinedVector<string, 1> output_nodes;
+    for (int i = 0; i < output_tensors.size(); ++i) {
+      const string& output_tensor = output_tensors[i];
+
+      NodeDef* identity = function_body.add_node();
+      identity->set_name(output_node_name(out, i));
+      identity->set_op("Identity");
+      (*identity->mutable_attr())["T"].set_type(output_data_type);
+      identity->add_input(output_tensor);
+
+      output_nodes.push_back(identity->name());
+    }
 
     OutputArgExpansion output{/*output_name=*/out.name(),
                               /*data_type=*/output_data_type,
                               /*is_ref=*/out.is_ref(),
-                              /*output_tensors=*/std::move(output_tensors)};
+                              /*output_nodes=*/std::move(output_nodes)};
     outputs.push_back(std::move(output));
   }
 
@@ -645,7 +659,7 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_index,
                              GrapplerFunctionItem* item) {
   if (!IsConstant(input_const)) {
     return errors::InvalidArgument("Input node ", input_const.name(),
@@ -657,7 +671,7 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   // Find input arg expansion and input placeholder position in it for the
   // given function input position.
   InputArgExpansion* input_arg_expansion = nullptr;
-  int placeholder_idx = input_position;
+  int placeholder_idx = input_index;
 
   for (InputArgExpansion& input : inputs) {
     if (placeholder_idx < input.placeholders.size()) {
@@ -668,14 +682,12 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   }
 
   if (input_arg_expansion == nullptr) {
-    return errors::InvalidArgument(
-        "Input placeholder not found: input_position=", input_position,
-        " function=", item->id);
+    return errors::InvalidArgument("Input placeholder not found: input_index=",
+                                   input_index, " function=", item->id);
   }
 
   // Delete placeholder from input expansion.
   string placeholder_name = input_arg_expansion->placeholders[placeholder_idx];
-  item->input_arg_placeholders_.erase(placeholder_name);
   input_arg_expansion->placeholders.erase(
       input_arg_expansion->placeholders.begin() + placeholder_idx);
 
@@ -699,43 +711,46 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   return Status::OK();
 }
 
-Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
-                           GrapplerFunctionItem* item,
-                           std::vector<std::pair<int, int>>* output_mapping) {
+Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
+                             GrapplerFunctionItem* item,
+                             std::vector<std::pair<int, int>>* output_mapping) {
   DCHECK(output_mapping->empty());
 
-  // Do some sanity checking of the active outputs positions.
-  for (int active_output : active_outputs) {
-    if (active_output < 0 || active_output >= item->output_size()) {
+  // Code below assumes that we do not support tensor list outputs and there is
+  // a 1-to-1 mapping between output tensor and output argument expansion.
+  for (const OutputArgExpansion& out_arg : item->outputs()) {
+    DCHECK(out_arg.output_nodes.size() == 1)
+        << "Output arg expansion must have single output";
+  }
+
+  // Do some sanity checking of the removed outputs positions.
+  for (int remove_output : remove_outputs) {
+    if (remove_output < 0 || remove_output >= item->output_size()) {
       return errors::InvalidArgument(
-          "Active output position is out of bound: active_output=",
-          active_output, " num_output_args=", item->output_size());
+          "Function output index is out of bound: index=", remove_output,
+          " max_output_index=", item->output_size());
     }
   }
 
-  gtl::FlatSet<const OutputArgExpansion*> unused_output_args;
-
-  const auto is_unused_output_arg = [&](const OutputArgExpansion& output) {
-    return unused_output_args.find(&output) != unused_output_args.end();
+  absl::flat_hash_set<const OutputArgExpansion*> remove_output_args;
+  const auto is_remove_output_arg = [&](const OutputArgExpansion& output) {
+    return remove_output_args.find(&output) != remove_output_args.end();
   };
 
   for (int i = 0; i < item->output_size(); ++i) {
     const OutputArgExpansion& output = item->output(i);
-    DCHECK(output.output_tensors.size() == 1)
-        << "Output arg expansion must have single tensor";
-
-    if (active_outputs.find(i) == active_outputs.end()) {
-      VLOG(3) << "Remove unused output: output_name=" << output.output_name
-              << " output_position=" << i;
-      unused_output_args.insert(&output);
-    } else if (!unused_output_args.empty()) {
+    if (remove_outputs.find(i) != remove_outputs.end()) {
+      VLOG(3) << "Remove functions output: output_name=" << output.output_name
+              << "(index = " << i << ")";
+      remove_output_args.insert(&output);
+    } else if (!remove_output_args.empty()) {
       // Add output mapping only if output position changed.
-      output_mapping->push_back({i, i - unused_output_args.size()});
+      output_mapping->push_back({i, i - remove_output_args.size()});
     }
   }
 
   auto& o = item->output_arg_expansions_;
-  o.erase(std::remove_if(o.begin(), o.end(), is_unused_output_arg), o.end());
+  o.erase(std::remove_if(o.begin(), o.end(), is_remove_output_arg), o.end());
 
   return Status::OK();
 }
@@ -747,6 +762,55 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
   func->mutable_signature()->set_description(item.description());
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
+  // Keep track of placeholders that were added to the graph in place of
+  // expanded function input arguments.
+  absl::flat_hash_set<absl::string_view> input_placeholders;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders.insert(placeholder);
+    }
+  }
+
+  // Keep track of identity nodes that were added to the graph in place of
+  // expanded function output arguments.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
+
+  // If the output identity node was not modified by any optimizer, we can
+  // bypass it and returns the function value from its input.
+  absl::flat_hash_map<absl::string_view, string> output_tensors;
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    if (!IsIdentity(func_body_node)) continue;
+
+    const string& node_name = func_body_node.name();
+    if (output_nodes.find(node_name) != output_nodes.end()) {
+      // Grappler optimizers might optimize nodes in the fanin of the output
+      // node, and forward their control dependencies. We can't express control
+      // dependencies in a function signature, so we have to keep the node.
+      if (func_body_node.input_size() == 1) {
+        VLOG(3) << "Bypass function output node: " << node_name << " -> "
+                << func_body_node.input(0);
+        output_tensors.emplace(node_name, func_body_node.input(0));
+      } else {
+        VLOG(3) << "Keep function output node: " << node_name;
+      }
+    }
+  }
+
+  // Return output tensor name (input of the output node) if it's safe to bypass
+  // output node, otherwise returns the output node name.
+  const auto output_tensor =
+      [&output_tensors](const OutputArgExpansion& output_arg) -> const string& {
+    const string& output_node = output_arg.output_nodes[0];
+    const auto is_output_tensor = output_tensors.find(output_node);
+    return is_output_tensor == output_tensors.end() ? output_node
+                                                    : is_output_tensor->second;
+  };
+
   // Build a GrapplerFunctionConnectivity from inputs and new function body.
   GrapplerFunctionConnectivity connectivity;
   TF_RETURN_IF_ERROR(
@@ -754,8 +818,8 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
 
   // Add function input arguments.
   for (const InputArgExpansion& input_arg : item.inputs()) {
-    CHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
-        << "Inputs of tensor sequences are not supported";
+    DCHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
+        << "Inputs of tensor lists are not supported";
 
     OpDef::ArgDef arg_def;
     arg_def.set_name(input_arg.input_name);
@@ -766,8 +830,8 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
 
   // Add function output arguments.
   for (const OutputArgExpansion& output_arg : item.outputs()) {
-    CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
-        << "Outputs of tensor sequences are not supported";
+    DCHECK(output_arg.output_nodes.size() == 1)  // do some sanity checking
+        << "Outputs of tensor lists are not supported";
 
     OpDef::ArgDef arg_def;
     arg_def.set_name(output_arg.output_name);
@@ -775,11 +839,9 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
     arg_def.set_is_ref(output_arg.is_ref);
     *func->mutable_signature()->add_output_arg() = arg_def;
 
-    string ret;
-    for (const string& output_tensor : output_arg.output_tensors) {
-      TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(output_tensor, &ret));
-      (*func->mutable_ret())[output_arg.output_name] = ret;
-    }
+    TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(
+        output_tensor(output_arg),
+        &(*func->mutable_ret())[output_arg.output_name]));
   }
 
   // Copy function definition specific attributes.
@@ -790,12 +852,16 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
   }
 
   // Copy function body nodes to the FunctionDef and update input format
-  for (const NodeDef& func_body_node : item.function_body().node()) {
-    // Do not copy input placeholders
-    if (item.IsInputPlaceholder(func_body_node.name())) continue;
+  for (const NodeDef& func_node : item.function_body().node()) {
+    const string& name = func_node.name();
+
+    // Do not copy input placeholders.
+    if (IsPlaceholder(func_node) && input_placeholders.count(name)) continue;
+    // Do not copy output nodes that we bypassed.
+    if (IsIdentity(func_node) && output_tensors.count(name)) continue;
 
     NodeDef* func_def_node = func->add_node_def();
-    *func_def_node = func_body_node;
+    *func_def_node = func_node;
     TF_RETURN_IF_ERROR(connectivity.AsFunctionDefNode(func_def_node));
   }
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 038cf5f527e0f32cc10e123bb0cab357e5902463..d5a41e74739d67fc2cef0c295efe208edbd6255c 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -18,7 +18,10 @@ limitations under the License.
 
 #include <memory>
 #include <string>
-#include <unordered_map>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -30,12 +33,20 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// Returns a copy of FunctionLibraryDefinition with subset of functions that are
-// reachable from the nodes of the graph.
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph);
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func);
+// WARNING(ezhulenev): Currently we do not support functions with inputs or
+// outputs instantiated into multiple tensors. This can happen if the
+// input/output type is 'T*N' or 'list(type)'. This is enforced by multiple
+// checks across this file and also function_optimizer.cc. InputArgExpansion and
+// OutputArgExpansion already support lists of tensors, but that's pretty much
+// it, all other code is written with assumption that expansions are always of
+// size 1. MakeGrapplerFunctionItem will gracefully fail with Status error.
+//
+// This is a low priority feature, because in practice we don't see a lot (any
+// at all?) functions with such arguments. Tensorflow-Eager always produces
+// functions with plain input/output arguments.
+
+// TODO(ezhulenev): Support inputs and outputs of type 'T*N'.
+// TODO(ezhulenev): Support inputs and outputs of type 'list(type)'.
 
 // Depending on the function instantiation attributes, input argument to the
 // function might be a single tensor, list of tensors of the same type, or a
@@ -44,30 +55,23 @@ FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
 // InputArgExpansion keeps track of the placeholders that were added to the
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
-  // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized inputs?
-  string input_name;                 // name of the function input argument
-  DataType data_type;                // input data type
-  bool is_ref;                       // if true, inputs are required to be refs
-  std::vector<string> placeholders;  // names of placeholder nodes in the
-                                     // function body
+  string input_name;
+  DataType data_type;
+  bool is_ref;
+  absl::InlinedVector<string, 1> placeholders;
 };
 
 // Depending on the function instantiation attributes, output argument is mapped
 // to one or more outputs of one of the function body nodes.
 //
-// OutputArgExpansion keeps mapping from a function output arg to the output
-// tensors of a function body nodes and a resolved output data type
+// OutputArgExpansion keeps track of the Identity nodes that were added to the
+// function body to forward output tensors. Adding these output nodes allows
+// nested function inlining and specialization (see function optimizer).
 struct OutputArgExpansion {
-  // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized outputs?
-  string output_name;                  // name of the function output argument
-  DataType data_type;                  // output data type
-  bool is_ref;                         // if true, outputs are refs
-  std::vector<string> output_tensors;  // names of output tensor from the
-                                       // function body nodes
+  string output_name;
+  DataType data_type;
+  bool is_ref;
+  absl::InlinedVector<string, 1> output_nodes;
 };
 
 // FunctionDef uses different connectivity encoding for the function body nodes,
@@ -81,44 +85,46 @@ class GrapplerFunctionConnectivity {
   void RegisterFunctionBodyOutputs(const string& node_name,
                                    tensorflow::NameRangeMap&& outputs);
 
-  // Expand input encoded in FunctionDef format (name[:output][:position]) into
+  // Expands input encoded in FunctionDef format (name[:output][:position]) into
   // multiple inputs in GraphDef format (name[:position]).
   Status ExpandFunctionDefInput(const string& func_def_input,
                                 std::vector<string>* graph_def_inputs) const;
 
-  // Update Node inputs from FunctionDef to GraphDef format.
+  // Updates Node inputs from FunctionDef to GraphDef format.
   Status ExpandNodeInputs(NodeDef* function_body_node) const;
 
   // When expanding inputs in function def format, single input might be
   // expanded into multiple tensors. When converting back to the function def
   // format from graph def format, it's always a 1-to-1 relationship.
-  // FunctionDef built from GrapplerFunctionItem is always specialized to it's
+  // FunctionDef built from GrapplerFunctionItem is always specialized to its
   // instantiation attributes and length of input args (and node def outputs) is
   // known.
 
-  // Map from GraphDef input format to FunctionDef input format using registered
-  // input arg expansion and function body outputs.
+  // Converts input name from GraphDef format (name[:position]) to the
+  // FunctionDef input format (name[:output][:position]) using registered input
+  // arg expansion and function body outputs.
   Status AsFunctionDefInput(const string& graph_def_input,
                             string* func_def_input) const;
 
-  // Update Node inputs from GraphDef to FunctionDef format.
+  // Updates Node inputs from GraphDef to FunctionDef format.
   Status AsFunctionDefNode(NodeDef* function_body_node) const;
 
  private:
   // Mapping from input name to input arg expansion.
-  std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  absl::flat_hash_map<string, InputArgExpansion> input_arg_expansions_;
   // Mapping from function body node name to output names range map.
-  std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+  absl::flat_hash_map<string, tensorflow::NameRangeMap> function_body_outputs_;
 
+  // For each placeholder added to the function instantiation graph, we keep a
+  // mapping back to the function input argument name and index.
   struct InputArgPlaceholder {
-    string input_name;   // Name of the function input argument.
-    int input_position;  // Index of a tensor in the function input argument
-                         // expansion, it can be greater than `0` if input
-                         // argument is a list of tensors (aka list(type)).
+    string input_name;  // Name of the function input argument.
+    int input_index;    // Index of a tensor in the function input argument
+                        // expansion, it can be greater than `0` if input
+                        // argument is a list of tensors (aka list(type)).
   };
-
   // Mapping from input arg placeholder to the function input tensor.
-  std::unordered_map<string, InputArgPlaceholder> input_arg_placeholders_;
+  absl::flat_hash_map<string, InputArgPlaceholder> input_arg_placeholders_;
 };
 
 // Get Function type attributes using attributes of a node that instantiated
@@ -147,8 +153,6 @@ class GrapplerFunctionItem : public GrapplerItem {
 
   const string& description() const;
 
-  bool IsInputPlaceholder(const string& node_name) const;
-
   const std::vector<InputArgExpansion>& inputs() const;
   const InputArgExpansion& input(int i) const;
   const std::size_t input_size() const;
@@ -171,9 +175,9 @@ class GrapplerFunctionItem : public GrapplerItem {
                                          GrapplerFunctionItem*);
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
-  friend Status RemoveUnusedOutputs(
-      const gtl::FlatSet<int>& active_outputs, GrapplerFunctionItem* item,
-      std::vector<std::pair<int, int>>* output_mapping);
+  friend Status RemoveFunctionOutputs(const absl::flat_hash_set<int>&,
+                                      GrapplerFunctionItem*,
+                                      std::vector<std::pair<int, int>>*);
 
   GrapplerFunctionItem(string func_name, string description,
                        AttrSlice func_attr,
@@ -189,16 +193,14 @@ class GrapplerFunctionItem : public GrapplerItem {
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
 
-  std::set<string> input_arg_placeholders_;
-
-  bool is_stateful_;
+  bool is_stateful_ = false;
 };
 
 // Check if function input/output types are fully defined only at instantiation
-// time (parametrized by it's instantiation node).
+// time (parametrized by its instantiation node).
 bool HasParametrizedType(const FunctionDef& func);
 
-// Check if a function body is parametrized by it's instantiation node. Function
+// Check if a function body is parametrized by its instantiation node. Function
 // body is parametrized, if it has at least one node with a 'placeholder'
 // attribute.
 bool HasParametrizedBody(const FunctionDef& func);
@@ -210,14 +212,14 @@ bool IsParametrized(const FunctionDef& func);
 // caller node. Return error if type can't be resolved.
 Status InstantiationTypeParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, DataType>* type_parameters);
+    absl::flat_hash_map<string, DataType>* type_parameters);
 
 // Resolve function instantiation body parameters (values for the function body
 // attr placeholders) from the attributes of the caller node. Return error if
 // type can't be resolved.
 Status InstantiationBodyParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, AttrValue>* body_parameters);
+    absl::flat_hash_map<string, AttrValue>* body_parameters);
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity. Use function library definition to
@@ -227,18 +229,19 @@ Status RegisterGrapplerFunctionConnectivity(
     GrapplerFunctionConnectivity* connectivity);
 
 // Replace one of the function inputs with a constant.
-Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_index,
                              GrapplerFunctionItem* item);
 
-// Remove function output arguments that do not have any active outputs (output
-// tensor connected to other node inputs or in a fetch set). Active outputs uses
-// GraphDef output position encoding, and multiple active outputs could
-// potentially be connected to the same output argument (in case of tensor list
-// outputs). Add output mapping for all active outputs that changed it's output
-// position (std::pair<old position, new position>).
-Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
-                           GrapplerFunctionItem* item,
-                           std::vector<std::pair<int, int>>* output_mapping);
+// Removes outputs from instantiated grappler function item. Function node
+// outputs use GraphDef output index encoding, and multiple outputs might belong
+// to the same output argument expansion (in case of tensor list outputs). For
+// all active function outputs that changed its output index, this function adds
+// an output mapping (std::pair<old index, new index>).
+Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
+                             GrapplerFunctionItem* item,
+                             std::vector<std::pair<int, int>>* output_mapping);
+
+// TODO(ezhulennev, b/120103818): Add RemoveFunctionInputs.
 
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
@@ -253,7 +256,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 // fully defined (no type or body parametrization).
 // TODO(ezhulenev): Support parametrized functions without fully defined
 // instantiation attributes? Do we ever want to optimize parametrized function
-// without specializing it to it's instantiation attributes (at least types)?
+// without specializing it to its instantiation attributes (at least types)?
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
                                 int graph_def_version,
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 5923850eca65a219fe3c452947751509a2bcf445..772088882835d0223f424f5d73a3587c53440469 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/functions.h"
+
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -77,7 +79,7 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   func_instantiation_attr["B"].set_type(DT_INT32);
   func_instantiation_attr["C"].set_type(DT_DOUBLE);
 
-  std::unordered_map<string, DataType> type_parameters;
+  absl::flat_hash_map<string, DataType> type_parameters;
   TF_EXPECT_OK(InstantiationTypeParameters(
       func, AttrSlice(&func_instantiation_attr), &type_parameters));
 
@@ -86,7 +88,7 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   EXPECT_EQ(DT_INT32, type_parameters["B"]);
   EXPECT_EQ(DT_DOUBLE, type_parameters["C"]);
 
-  std::unordered_map<string, AttrValue> body_parameters;
+  absl::flat_hash_map<string, AttrValue> body_parameters;
   TF_EXPECT_OK(InstantiationBodyParameters(
       func, AttrSlice(&func_instantiation_attr), &body_parameters));
 
@@ -247,15 +249,16 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("XTimesTwo", item.id);
-  EXPECT_EQ(4, item.function_body().node_size());
+  EXPECT_EQ(5, item.function_body().node_size());
 
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ("x", item.input(0).input_name);
-  EXPECT_EQ(std::vector<string>{"x"}, item.input(0).placeholders);
+  ASSERT_EQ(1, item.input(0).placeholders.size());
+  EXPECT_EQ("x", item.input(0).placeholders[0]);
 
   EXPECT_EQ(1, item.output_size());
   EXPECT_EQ("y", item.output(0).output_name);
-  EXPECT_EQ("y", item.output(0).output_tensors[0]);
+  EXPECT_EQ("y_output_node_0", item.output(0).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -277,9 +280,13 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("scale", node.input(1));
+    } else if (node.name() == "y_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
     }
   }
-  EXPECT_EQ(4, count);
+  EXPECT_EQ(5, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
@@ -324,7 +331,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("SubGrad", item.id);
-  EXPECT_EQ(12, item.function_body().node_size());
+  EXPECT_EQ(14, item.function_body().node_size());
 
   ASSERT_EQ(3, item.input_size());
   EXPECT_EQ("x", item.input(0).input_name);
@@ -332,8 +339,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
   EXPECT_EQ("dz", item.input(2).input_name);
 
   ASSERT_EQ(2, item.output_size());
-  EXPECT_EQ("dx", item.output(0).output_tensors[0]);
-  EXPECT_EQ("dy", item.output(1).output_tensors[0]);
+  EXPECT_EQ("dx_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ("dy_output_node_0", item.output(1).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -357,9 +364,17 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("gy", node.input(0));
       EXPECT_EQ("rx:1", node.input(1));
+    } else if (node.name() == "dx_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("dx", node.input(0));
+    } else if (node.name() == "dy_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("dy", node.input(0));
     }
   }
-  EXPECT_EQ(6, count);
+  EXPECT_EQ(8, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
@@ -470,7 +485,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(1, item.output_size());
-  EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
+  EXPECT_EQ("out_output_node_0", item.output(0).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -486,9 +501,13 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("Linear_func", node.input(0));
+    } else if (node.name() == "out_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("Exp", node.input(0));
     }
   }
-  EXPECT_EQ(3, count);
+  EXPECT_EQ(4, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
@@ -515,27 +534,44 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("ForwardInputs", item.id);
-  EXPECT_EQ(5, item.function_body().node_size());
+  EXPECT_EQ(8, item.function_body().node_size());
 
   EXPECT_EQ(3, item.output_size());
-  EXPECT_EQ("in0", item.output(0).output_tensors[0]);
-  EXPECT_EQ("arg2", item.output(1).output_tensors[0]);
-  EXPECT_EQ("arg3", item.output(2).output_tensors[0]);
+  EXPECT_EQ("out0_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ("arg2_output_node_0", item.output(1).output_nodes[0]);
+  EXPECT_EQ("arg3_output_node_0", item.output(2).output_nodes[0]);
 
   int count = 0;
+
+  const auto is_arg_placeholder = [](const string &name) {
+    return name == "in0" || name == "in1" || name == "arg2" || name == "arg3" ||
+           name == "arg4";
+  };
+
   for (const NodeDef &node : item.function_body().node()) {
-    EXPECT_TRUE(node.name() == "in0" || node.name() == "in1" ||
-                node.name() == "arg2" || node.name() == "arg3" ||
-                node.name() == "arg4");
-    count++;
-    EXPECT_EQ("Placeholder", node.op());
-    if (node.name() == "arg3") {
-      EXPECT_EQ(DT_INT32, node.attr().at("dtype").type());
-    } else {
-      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+    if (is_arg_placeholder(node.name()) && node.op() == "Placeholder") {
+      count++;
+      if (node.name() == "arg3") {
+        EXPECT_EQ(DT_INT32, node.attr().at("dtype").type());
+      } else {
+        EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      }
+      continue;
+    }
+
+    EXPECT_EQ("Identity", node.op());
+    ASSERT_EQ(1, node.input_size());
+    EXPECT_TRUE(is_arg_placeholder(node.input(0)));
+
+    if (node.name() == "out0_output_node_0" && ++count) {
+      EXPECT_EQ("in0", node.input(0));
+    } else if (node.name() == "arg2_output_node_0" && ++count) {
+      EXPECT_EQ("arg2", node.input(0));
+    } else if (node.name() == "arg3_output_node_0" && ++count) {
+      EXPECT_EQ("arg3", node.input(0));
     }
   }
-  EXPECT_EQ(5, count);
+  EXPECT_EQ(8, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
@@ -564,16 +600,22 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
 
   EXPECT_EQ(0, item.input_size());
   EXPECT_EQ(1, item.output_size());
-  EXPECT_EQ("o", item.output(0).output_tensors[0]);
+  EXPECT_EQ("o_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ(3, item.function_body().node_size());
 
-  EXPECT_EQ(2, item.function_body().node_size());
   const NodeDef &two = item.function_body().node(0);
   EXPECT_EQ("two", two.name());
   EXPECT_EQ(0, two.input_size());
+
   const NodeDef &cast = item.function_body().node(1);
   EXPECT_EQ("o", cast.name());
   EXPECT_EQ(1, cast.input_size());
   EXPECT_EQ("two", cast.input(0));
+
+  const NodeDef &retval = item.function_body().node(2);
+  EXPECT_EQ("o_output_node_0", retval.name());
+  EXPECT_EQ(1, retval.input_size());
+  EXPECT_EQ("o", retval.input(0));
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
@@ -599,7 +641,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
   EXPECT_EQ(3, item.function_body().node_size());
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ(0, item.output_size());
-  EXPECT_EQ(false, item.allowed_optimizations().prune_ops_with_side_effects);
+  EXPECT_EQ(true, item.optimization_options().is_function_instantiation);
 }
 
 TEST_F(FunctionsTest, MakeFunctionDef) {
@@ -672,7 +714,7 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
   EXPECT_EQ(2, item.input_size());
   EXPECT_EQ(1, item.output_size());
 
-  ASSERT_EQ(3, item.function_body().node_size());
+  ASSERT_EQ(4, item.function_body().node_size());
 
   const NodeDef &input_x = item.function_body().node(0);
   const NodeDef &input_y = item.function_body().node(1);
@@ -746,8 +788,9 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
       {{"z", "output:z:0"}});
 
   GraphDef id_func_body = test::function::GDef(
-      {/* pass input to output through identity */
-       NDef("output", "Identity", {"x"}, {{"T", "float"}})});
+      {/* Read and return input argument through Identity node. */
+       NDef("read_x", "Identity", {"x"}, {{"T", "float"}}),
+       NDef("z_output_node_0", "Identity", {"read_x"}, {{"T", "float"}})});
 
   protobuf::Map<string, AttrValue> func_instantiation_attr;
   func_instantiation_attr["T"].set_type(DT_FLOAT);
@@ -770,15 +813,15 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   // Check that graph body was updated.
   int count = 0;
   for (const NodeDef &node : specialized.node_def()) {
-    if (node.name() == "output" && ++count) {
+    if (node.name() == "read_x" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ("x:0", node.input(0));
     }
   }
   EXPECT_EQ(1, count);
 
-  // And return tensor mapping was updated with a new output name (z->output).
-  EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
+  // And return tensor mapping was updated with a new output name (z->read_x).
+  EXPECT_EQ("read_x:output:0", (*specialized.mutable_ret())["z"]);
 }
 
 TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 576494cad55e22ba8457f30d0ea79b53f6f5de78..1b4b9f9a51af17c4472f0fc34331b75192e3d3ae 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -14,7 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/grappler_test.h"
+
 #include <memory>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -23,6 +27,46 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+void CompareGraphNodes(protobuf::RepeatedPtrField<NodeDef>* want,
+                       protobuf::RepeatedPtrField<NodeDef>* got) {
+  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
+    return n1.name() < n2.name();
+  };
+
+  std::sort(want->begin(), want->end(), comparator);
+  std::sort(got->begin(), got->end(), comparator);
+
+  ASSERT_EQ(want->size(), got->size());
+
+  for (int i = 0; i < want->size(); ++i) {
+    NodeDef& want_node = (*want)[i];
+    NodeDef& got_node = (*got)[i];
+
+    EXPECT_EQ(want_node.op(), got_node.op());
+    EXPECT_EQ(want_node.name(), got_node.name());
+    EXPECT_EQ(want_node.device(), got_node.device());
+    ASSERT_EQ(want_node.input_size(), got_node.input_size());
+
+    // Order of control dependencies doesn't matter, so we sort them first.
+    const auto is_control = [](const string& input) -> bool {
+      return ParseTensorName(input).index() < 0;
+    };
+
+    auto want_inputs = want_node.mutable_input();
+    auto got_inputs = got_node.mutable_input();
+    std::sort(absl::c_find_if(*want_inputs, is_control), want_inputs->end());
+    std::sort(absl::c_find_if(*got_inputs, is_control), got_inputs->end());
+
+    for (int j = 0; j < want_node.input_size(); ++j) {
+      const TensorId want_tensor = ParseTensorName(want_node.input(j));
+      const TensorId got_tensor = ParseTensorName(got_node.input(j));
+      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
+    }
+  }
+}
+}  // namespace
+
 GrapplerTest::GrapplerTest() {
   // Turn off all the automatic optimizations to ensure that we run the graph
   // exactly as it is given to us. This ensures that we can compare the results
@@ -94,34 +138,35 @@ NodeDef* GrapplerTest::AddNode(
 }
 
 void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
-  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
-    return n1.name() < n2.name();
-  };
-  std::sort(want.mutable_node()->begin(), want.mutable_node()->end(),
-            comparator);
-  std::sort(got.mutable_node()->begin(), got.mutable_node()->end(), comparator);
+  CompareGraphNodes(want.mutable_node(), got.mutable_node());
+}
 
-  for (int i = 0; i < want.node_size(); ++i) {
-    std::sort(want.mutable_node(i)->mutable_input()->begin(),
-              want.mutable_node(i)->mutable_input()->end());
-  }
-  for (int i = 0; i < got.node_size(); ++i) {
-    std::sort(got.mutable_node(i)->mutable_input()->begin(),
-              got.mutable_node(i)->mutable_input()->end());
-  }
+void GrapplerTest::CompareFunctions(FunctionDef want, FunctionDef got) const {
+  CompareGraphNodes(want.mutable_node_def(), got.mutable_node_def());
+}
 
-  ASSERT_EQ(want.node_size(), got.node_size());
-  for (int i = 0; i < want.node_size(); ++i) {
-    EXPECT_EQ(want.node(i).op(), got.node(i).op());
-    EXPECT_EQ(want.node(i).name(), got.node(i).name());
-    EXPECT_EQ(want.node(i).device(), got.node(i).device());
+void GrapplerTest::CompareNodes(const NodeDef& want, const NodeDef& got) const {
+  EXPECT_EQ(want.name(), got.name());
+  EXPECT_EQ(want.op(), got.op());
 
-    ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
-    for (int j = 0; j < want.node(i).input_size(); ++j) {
-      const TensorId want_tensor = ParseTensorName(want.node(i).input(j));
-      const TensorId got_tensor = ParseTensorName(got.node(i).input(j));
-      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
-    }
+  std::vector<string> want_inputs(want.input().begin(), want.input().end());
+  std::vector<string> got_inputs(got.input().begin(), got.input().end());
+  EXPECT_EQ(want_inputs, got_inputs);
+
+  const auto attr_name = [](const std::pair<const string, AttrValue>& attr) {
+    return attr.first;
+  };
+
+  std::vector<string> want_attrs;
+  std::vector<string> got_attrs;
+  absl::c_transform(want.attr(), std::back_inserter(want_attrs), attr_name);
+  absl::c_transform(got.attr(), std::back_inserter(got_attrs), attr_name);
+  absl::c_sort(want_attrs);
+  absl::c_sort(got_attrs);
+  EXPECT_EQ(want_attrs, got_attrs);
+
+  for (const string& attr : want_attrs) {
+    EXPECT_TRUE(AreAttrValuesEqual(want.attr().at(attr), got.attr().at(attr)));
   }
 }
 
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index 0cfd740dcbe15e0571bc159858c0ed33c2071cb8..26c1db37405a48a7252f388a3e659b8d07c569ae 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -49,13 +49,32 @@ class GrapplerTest : public ::testing::Test {
                    const std::vector<std::pair<string, AttrValue>>& attributes,
                    GraphDef* graph) const;
 
+  // Checks if two graphs are equal. Both graphs must have the same set of nodes
+  // with the same inputs and attributes. Nodes can be in different order.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failuires to the current test.
   void CompareGraphs(GraphDef want, GraphDef got) const;
 
-  // Check if node 'src' is directly connected to the input($position) of 'dst'.
+  // Checks if two nodes have the same name, op, inputs and attributes.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failuires to the current test.
+  void CompareNodes(const NodeDef& want, const NodeDef& got) const;
+
+  // Checks if two functions are equal. Both functions must have the same set of
+  // nodes with the same inputs and attributes. Nodes can be in different order.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failures to the current test.
+  void CompareFunctions(FunctionDef want, FunctionDef got) const;
+
+  // Checks if node 'src' is directly connected to the input($position) of
+  // 'dst'.
   bool IsNodesDirectlyConnected(const NodeMap& node_map, const string& src,
                                 const string& dst, int position = 0);
 
-  // Count nodes of the given op-type in a graph.
+  // Counts nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
 
   // Get a random tensor with given shape.
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 63ca92c69e1c11a90e7870f1509228d90239fa72..a6d0f5037bb35cbbb909cbb4049153f0d1013c64 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -14,10 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+
 #include <algorithm>
 #include <deque>
 #include <unordered_map>
+
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -25,27 +30,46 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+std::vector<GraphView::Edge> MakeEphemeralEdges(
+    const absl::Span<const TopologicalDependency> extra_dependencies) {
+  std::vector<GraphView::Edge> ephemeral_edges;
+  ephemeral_edges.reserve(extra_dependencies.size());
+  for (const auto& dep : extra_dependencies) {
+    ephemeral_edges.emplace_back(
+        GraphView::OutputPort(dep.from, Graph::kControlSlot),
+        GraphView::InputPort(dep.to, Graph::kControlSlot));
+  }
+  return ephemeral_edges;
+}
+
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::vector<int>* ready_nodes,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies) {
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(graph, extra_dependencies));
+    const GraphDef& graph,
+    const absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<int>* ready_nodes) {
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(
+      graph, MakeEphemeralEdges(extra_dependencies)));
+
+  // Keep track of how many inputs are ready for the given node.
+  std::vector<int> num_ready_inputs(graph.node_size(), 0);
 
-  ready_nodes->reserve(graph_view.num_nodes());
+  // We'll push index of ready nodes to this output vector.
+  ready_nodes->reserve(graph.node_size());
 
   int front = 0;
   int back = 0;
-  std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
-  for (int i = 0; i < graph_view.num_nodes(); i++) {
-    if (graph_view.inputs(i).empty()) {
+
+  for (int i = 0; i < graph.node_size(); i++) {
+    if (graph_view.GetFanin(i).empty()) {
       ready_nodes->push_back(i);
       back++;
     }
     if (IsMerge(graph.node(i))) {
-      for (int input : graph_view.inputs(i)) {
+      for (int input : graph_view.GetFanin(i)) {
         if (IsNextIteration(graph.node(input))) {
           num_ready_inputs[i]++;
         }
@@ -55,9 +79,9 @@ Status ComputeTopologicalOrder(
 
   while (front != back) {
     int ready_node = (*ready_nodes)[front];
-    for (int fanout : graph_view.outputs(ready_node)) {
+    for (int fanout : graph_view.GetFanout(ready_node)) {
       ++num_ready_inputs[fanout];
-      if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
+      if (num_ready_inputs[fanout] == graph_view.GetFanin(fanout).size()) {
         ready_nodes->push_back(fanout);
         ++back;
       }
@@ -72,23 +96,32 @@ Status ComputeTopologicalOrder(
   return Status::OK();
 }
 
+}  // namespace
+
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies) {
+    const GraphDef& graph,
+    const absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<const NodeDef*>* topo_order) {
   std::vector<int> ready_nodes;
   TF_RETURN_IF_ERROR(
-      ComputeTopologicalOrder(graph, &ready_nodes, extra_dependencies));
-  topo_order->reserve(graph.node_size());
-  for (int i = 0; i < ready_nodes.size(); ++i) {
-    (*topo_order)[&graph.node(ready_nodes[i])] = i;
+      ComputeTopologicalOrder(graph, extra_dependencies, &ready_nodes));
+
+  topo_order->reserve(ready_nodes.size());
+  for (int ready_node_idx : ready_nodes) {
+    topo_order->emplace_back(&graph.node(ready_node_idx));
   }
+
   return Status::OK();
 }
 
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<const NodeDef*>* topo_order) {
+  return ComputeTopologicalOrder(graph, {}, topo_order);
+}
+
 Status ReversedTopologicalSort(GraphDef* graph) {
   std::vector<int> ready_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes, nullptr));
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, {}, &ready_nodes));
   std::reverse(ready_nodes.begin(), ready_nodes.end());
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
@@ -96,7 +129,7 @@ Status ReversedTopologicalSort(GraphDef* graph) {
 
 Status TopologicalSort(GraphDef* graph) {
   std::vector<int> ready_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes, nullptr));
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, {}, &ready_nodes));
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index b8cf897a321877bc73946907aa11b8b2c20255e9..dd4208dfff3b28f2b55f71e0cf369b655d6f8c09 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -16,22 +16,40 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-// Compute a topological ordering for the graph nodes.
+// TODO(ezhulenev, b/121379902): We should be consistent with GraphTopologyView
+// and use `GraphView::Edge` to pass extra dependencies.
+struct TopologicalDependency {
+  TopologicalDependency(const NodeDef* from, const NodeDef* to)
+      : from(from), to(to) {}
+  const NodeDef* from;
+  const NodeDef* to;
+};
+
+// Computes a topological ordering for the graph nodes and outputs nodes in the
+// topological order to the `topo_order` output argument.
+//
+// It's possible to pass additional edges that do not exists in a graph, but
+// must be respected when computing graph topological order. Example: Tensorflow
+// runtime allows concurrent execution of dequeue/enqueue ops from the same
+// queue resource, but we might want to enforce ordering between them.
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies);
+    const GraphDef& graph,
+    absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<const NodeDef*>* topo_order);
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<const NodeDef*>* topo_order);
 
-// Sort a graph in topological order.
+// Sorts a graph in topological order.
 Status TopologicalSort(GraphDef* graph);
 
-// Sort a graph in topological order and reverse it.
+// Sorts a graph in topological order and reverse it.
 Status ReversedTopologicalSort(GraphDef* graph);
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index 48b7eb50bd9f2a4867e68291588d2e5c11a0c5c2..3868183c62d0dbdb09a65996b9de79b7a6001ca3 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -14,79 +14,94 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
 class TopologicalSortTest : public ::testing::Test {
  protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "", inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
+  struct NodeConfig {
+    NodeConfig(string name, std::vector<string> inputs)
+        : name(std::move(name)), inputs(std::move(inputs)) {}
+    NodeConfig(string name, string op, std::vector<string> inputs)
+        : name(std::move(name)), op(std::move(op)), inputs(std::move(inputs)) {}
+
+    string name;
+    string op;
+    std::vector<string> inputs;
+  };
+
+  static GraphDef CreateGraph(const std::vector<NodeConfig>& nodes) {
+    GraphDef graph;
+
+    for (const NodeConfig& node : nodes) {
+      NodeDef node_def;
+      node_def.set_name(node.name);
+      node_def.set_op(node.op);
+      for (const string& input : node.inputs) {
+        node_def.add_input(input);
+      }
+      *graph.add_node() = std::move(node_def);
     }
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
+
+    return graph;
   }
 };
 
 TEST_F(TopologicalSortTest, NoLoop) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order, nullptr));
+  GraphDef graph = CreateGraph({
+      {"2", {"5"}},       //
+      {"0", {"5", "4"}},  //
+      {"1", {"4", "3"}},  //
+      {"3", {"2"}},       //
+      {"5", {}},          //
+      {"4", {}}           //
+  });
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
 
   const std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    std::cout << "Node " << node_name << " at order " << topo_order
-              << std::endl;
-    EXPECT_EQ(node_name, order[topo_order]);
+
+  ASSERT_EQ(topo_order.size(), order.size());
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    EXPECT_EQ(node->name(), order[i]);
   }
 
   TF_EXPECT_OK(TopologicalSort(&graph));
-  for (int i = 0; i < order.size(); i++) {
+  for (int i = 0; i < topo_order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 }
 
 TEST_F(TopologicalSortTest, WithLoop) {
-  GraphDef graph;
-  // Create a loop
-  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
-  *graph.add_node() = CreateNode("3", "Switch", {"2"});
-  *graph.add_node() = CreateNode("4", "Identity", {"3"});
-  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
-  *graph.add_node() = CreateNode("1", {});
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order, nullptr));
+  GraphDef graph = CreateGraph({
+      // Graph with a loop.
+      {"2", "Merge", {"1", "5"}},     //
+      {"3", "Switch", {"2"}},         //
+      {"4", "Identity", {"3"}},       //
+      {"5", "NextIteration", {"4"}},  //
+      {"1", {}}                       //
+  });
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
 
   const std::vector<string> order = {"1", "2", "3", "4", "5"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    EXPECT_EQ(node_name, order[topo_order]);
+
+  ASSERT_EQ(topo_order.size(), order.size());
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    EXPECT_EQ(node->name(), order[i]);
   }
 
   TF_EXPECT_OK(TopologicalSort(&graph));
@@ -96,12 +111,13 @@ TEST_F(TopologicalSortTest, WithLoop) {
 }
 
 TEST_F(TopologicalSortTest, WithIllegalLoop) {
-  GraphDef graph;
   // A loop without Merge and NextIteration is illegal and the original node
   // order and graph will be preserved.
-  *graph.add_node() = CreateNode("2", {"1", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("1", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"1", "3"}},  //
+      {"3", {"2"}},       //
+      {"1", {}}           //
+  });
 
   EXPECT_FALSE(TopologicalSort(&graph).ok());
   std::vector<string> order = {"2", "3", "1"};
@@ -111,9 +127,10 @@ TEST_F(TopologicalSortTest, WithIllegalLoop) {
 }
 
 TEST_F(TopologicalSortTest, DuplicatedInputs) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"1", "1"});
-  *graph.add_node() = CreateNode("1", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"1", "1"}},  //
+      {"1", {}}           //
+  });
 
   TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2"};
@@ -123,12 +140,13 @@ TEST_F(TopologicalSortTest, DuplicatedInputs) {
 }
 
 TEST_F(TopologicalSortTest, Idempotent) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("1", {});
-  *graph.add_node() = CreateNode("2", {});
-  *graph.add_node() = CreateNode("3", {"1", "2"});
-  *graph.add_node() = CreateNode("4", {"1", "3"});
-  *graph.add_node() = CreateNode("5", {"2", "3"});
+  GraphDef graph = CreateGraph({
+      {"1", {}},          //
+      {"2", {}},          //
+      {"3", {"1", "2"}},  //
+      {"4", {"1", "3"}},  //
+      {"5", {"2", "3"}}   //
+  });
 
   TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2", "3", "4", "5"};
@@ -136,7 +154,7 @@ TEST_F(TopologicalSortTest, Idempotent) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 
-  // Run topo sort again to verify that it is idenpotent.
+  // Run topo sort again to verify that it is idempotent.
   TF_EXPECT_OK(TopologicalSort(&graph));
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -144,35 +162,81 @@ TEST_F(TopologicalSortTest, Idempotent) {
 }
 
 TEST_F(TopologicalSortTest, ExtraDependencies) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"5"}},       //
+      {"0", {"5", "4"}},  //
+      {"1", {"4", "3"}},  //
+      {"3", {"2"}},       //
+      {"5", {}},          //
+      {"4", {}}           //
+  });
 
   // Add an edge from 4 to 5.
-  std::vector<std::pair<const NodeDef*, const NodeDef*>> extra_dependencies;
-  extra_dependencies.emplace_back(&graph.node(5), &graph.node(4));
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(
-      ComputeTopologicalOrder(graph, &topo_order, &extra_dependencies));
-
-  const std::vector<string> order = {"4", "5", "2", "0", "3", "1"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    EXPECT_EQ(node_name, order[topo_order]);
+  std::vector<TopologicalDependency> extra_dependencies;
+  extra_dependencies.push_back({&graph.node(5), &graph.node(4)});
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, extra_dependencies, &topo_order));
+
+  const std::vector<string> valid_order_1 = {"4", "5", "2", "0", "3", "1"};
+  const std::vector<string> valid_order_2 = {"4", "5", "0", "2", "3", "1"};
+
+  ASSERT_EQ(topo_order.size(), valid_order_1.size());
+
+  std::vector<string> computed_order(6, "");
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    computed_order[i] = node->name();
   }
+  EXPECT_TRUE(computed_order == valid_order_1 ||
+              computed_order == valid_order_2);
 
-  // Add an edge from 0 to 4. This will create a loop
-  extra_dependencies.emplace_back(&graph.node(1), &graph.node(5));
+  // Add an edge from `0` to `4`. This will create a loop.
+  extra_dependencies.push_back({&graph.node(1), &graph.node(5)});
   EXPECT_FALSE(
-      ComputeTopologicalOrder(graph, &topo_order, &extra_dependencies).ok());
+      ComputeTopologicalOrder(graph, extra_dependencies, &topo_order).ok());
+}
+
+static void BM_ComputeTopologicalOrder(int iters, int size) {
+  testing::StopTiming();
+
+  random::PhiloxRandom philox(0x12345);
+  random::SimplePhilox rnd(&philox);
+
+  string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
+
+  GraphDef graph;
+  for (int i = 0; i < size; ++i) {
+    const string name = absl::StrCat(prefix, i);
+    const uint32 num_inputs = rnd.Uniform(std::min(i, 5));
+
+    NodeDef node;
+    node.set_name(name);
+    for (int n = 0; n < num_inputs; ++n) {
+      const uint32 input_node = rnd.Uniform(i);
+      node.add_input(absl::StrCat(prefix, input_node));
+    }
+
+    *graph.add_node() = std::move(node);
+  }
+
+  testing::StartTiming();
+  std::vector<const NodeDef*> topo_order;
+  for (int i = 0; i < iters; i++) {
+    topo_order.clear();
+    Status st = ComputeTopologicalOrder(graph, &topo_order);
+    CHECK(st.ok()) << "Failed to compute topological order";
+  }
+  testing::StopTiming();
 }
+BENCHMARK(BM_ComputeTopologicalOrder)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(25000)
+    ->Arg(50000)
+    ->Arg(100000);
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
index 6952277568676baf5812a20c4c743356eeedd40a..c602e8c0e47723b4e6ad68431e5b08b8314d1c95 100644
--- a/tensorflow/core/grappler/utils/traversal.cc
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -17,89 +17,109 @@ limitations under the License.
 
 #include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 
 namespace tensorflow {
 namespace grappler {
 
 namespace {
 
-template <typename GraphViewType>
-void ReverseDfsInternal(
-    const GraphViewType& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  // Stack of work to do.
-  struct StackElem {
-    const NodeDef* node;
-    bool children_visited;
-    const NodeDef* src;
-  };
-  std::vector<StackElem> stack;
+struct DfsStackElem {
+  DfsStackElem(int node, bool children_visited, int src)
+      : node(node), children_visited(children_visited), src(src) {}
+  explicit DfsStackElem(int node) : DfsStackElem(node, false, -1) {}
 
+  // Index of the node in the graph ∊ [0, num_nodes).
+  int node;
+  // `True` if visited all the input/output nodes (pushed all input/output nodes
+  // to the stack).
+  bool children_visited;
+  // Index of the node in the graph, from which we entered the `node`.
+  int src;
+};
+
+enum class NodeState { kNotVisited, kVisiting, kDone };
+
+}  // namespace
+
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  const absl::Span<const NodeDef* const> from,
+                  const TraversalDirection direction,
+                  const DfsPredicates& predicates,
+                  const DfsCallbacks& callbacks) {
+  std::vector<DfsStackElem> stack;
   stack.reserve(from.size());
+
   for (const NodeDef* node : from) {
-    stack.push_back(StackElem{node, false});
+    const absl::optional<int> node_idx = graph_view.GetNodeIndex(*node);
+    DCHECK(node_idx.has_value()) << "Illegal start node: " << node->name();
+    if (node_idx.has_value()) {
+      stack.emplace_back(node_idx.value());
+    }
   }
 
-  enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
-  absl::flat_hash_map<const NodeDef*, NodeState> node_state;
+  absl::flat_hash_map<int, NodeState> node_state;
   while (!stack.empty()) {
-    StackElem w = stack.back();
+    DfsStackElem w = stack.back();
     stack.pop_back();
 
+    NodeState& state = node_state[w.node];
+    if (state == NodeState::kDone) continue;
+
+    // Skip nodes that we should not enter.
+    if (predicates.enter && !predicates.enter(graph_view.GetNode(w.node))) {
+      state = NodeState::kDone;
+      continue;
+    }
+
+    // We've processed all the children of this node.
     if (w.children_visited) {
-      // We've processed all the children of this node
-      node_state[w.node] = DONE;
-      if (post_order) {
-        post_order(w.node);
+      state = NodeState::kDone;
+      if (callbacks.post_order) {
+        callbacks.post_order(graph_view.GetNode(w.node));
       }
       continue;
     }
 
-    auto& rslt = node_state[w.node];
-    if (rslt == DONE) {
-      continue;
-    } else if (rslt == VISITING) {
-      // Loop detected
-      if (on_back_edge) {
-        on_back_edge(w.src, w.node);
+    // Loop detected.
+    if (state == NodeState::kVisiting) {
+      if (callbacks.on_back_edge) {
+        callbacks.on_back_edge(graph_view.GetNode(w.src),
+                               graph_view.GetNode(w.node));
       }
       continue;
     }
-    rslt = VISITING;
-    if (pre_order) {
-      pre_order(w.node);
+
+    state = NodeState::kVisiting;
+    if (callbacks.pre_order) {
+      callbacks.pre_order(graph_view.GetNode(w.node));
     }
 
     // Enqueue the node again with the children_visited flag set to true.
-    stack.push_back(StackElem{w.node, true, w.src});
+    stack.emplace_back(w.node, true, w.src);
 
-    // Now enqueue the node children.
-    for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
-      stack.push_back(StackElem{fanin.node, false, w.node});
+    // Check if we can continue traversal from the current node.
+    if (predicates.advance && !predicates.advance(graph_view.GetNode(w.node))) {
+      continue;
     }
-  }
-}
-
-}  // namespace
 
-void ReverseDfs(
-    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  ReverseDfsInternal<GraphView>(graph_view, from, pre_order, post_order,
-                                on_back_edge);
+    // Now enqueue the fanin/fanout nodes.
+    if (direction == TraversalDirection::kFollowInputs) {
+      for (const int fanin : graph_view.GetFanin(w.node)) {
+        stack.emplace_back(fanin, false, w.node);
+      }
+    } else {
+      for (const int fanout : graph_view.GetFanout(w.node)) {
+        stack.emplace_back(fanout, false, w.node);
+      }
+    }
+  }
 }
 
-void ReverseDfs(
-    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  ReverseDfsInternal<MutableGraphView>(graph_view, from, pre_order, post_order,
-                                       on_back_edge);
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  const absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsCallbacks& callbacks) {
+  DfsTraversal(graph_view, from, direction, {}, callbacks);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
index 5b7737f97eb1f8ee56efd599d6216dc4e472febd..5c9dada4933ff803c9f53fec44f74104daec11f6 100644
--- a/tensorflow/core/grappler/utils/traversal.h
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -17,29 +17,85 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
 
 #include <functional>
-#include "tensorflow/core/grappler/graph_view.h"
-#include "tensorflow/core/grappler/mutable_graph_view.h"
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
 
 namespace tensorflow {
 namespace grappler {
 
-// Traverse the graph in reverse dfs order, starting from the list of nodes
-// specified in the 'from' argument. The pre_order and post_order functors will
-// be called on each reachable node (including the 'from' nodes) in pre and post
-// order. If loops are found, the on_back_edge functor will be called on the
+enum class TraversalDirection { kFollowInputs, kFollowOutputs };
+
+// Encapsulate DFS callbacks that will be called during the graph traversal.
+//
+// If non-empty, the `pre_order` and `post_order` functors will be called on
+// each reachable node (including the `from` nodes) in pre and post order. If
+// loops are found, the `on_back_edge` functor will be called on the
 // corresponding back edges. Moreover, the pre and post order will assume that
 // these back edges will be cut.
-void ReverseDfs(
-    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
-
-void ReverseDfs(
-    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
+struct DfsCallbacks {
+  DfsCallbacks() = default;
+  DfsCallbacks(std::function<void(const NodeDef*)> pre,
+               std::function<void(const NodeDef*)> post,
+               std::function<void(const NodeDef*, const NodeDef*)> back_edge)
+      : pre_order(std::move(pre)),
+        post_order(std::move(post)),
+        on_back_edge(std::move(back_edge)) {}
+
+  static DfsCallbacks PreOrder(std::function<void(const NodeDef*)> pre) {
+    return DfsCallbacks(std::move(pre), nullptr, nullptr);
+  }
+
+  static DfsCallbacks PostOrder(std::function<void(const NodeDef*)> post) {
+    return DfsCallbacks(nullptr, std::move(post), nullptr);
+  }
+
+  std::function<void(const NodeDef*)> pre_order;
+  std::function<void(const NodeDef*)> post_order;
+  std::function<void(const NodeDef*, const NodeDef*)> on_back_edge;
+};
+
+// Encapsulate DFS predicates for traversing the graph.
+//
+// The `enter` predicate decides if traversal should enter the node, and the
+// `advance` predicate decides if the traversal should follow inputs/outputs
+// from the node.
+//
+// If predicates are empty (default initialized), it's assumed that we can enter
+// into any node and advance from any node respectively.
+struct DfsPredicates {
+  DfsPredicates() = default;
+  DfsPredicates(std::function<bool(const NodeDef*)> enter,
+                std::function<bool(const NodeDef*)> advance)
+      : enter(std::move(enter)), advance(std::move(advance)) {}
+
+  static DfsPredicates Enter(std::function<bool(const NodeDef*)> enter) {
+    return DfsPredicates(std::move(enter), nullptr);
+  }
+
+  static DfsPredicates Advance(std::function<bool(const NodeDef*)> advance) {
+    return DfsPredicates(nullptr, std::move(advance));
+  }
+
+  std::function<bool(const NodeDef*)> enter;
+  std::function<bool(const NodeDef*)> advance;
+};
+
+// Traverse the graph in DFS order in the given direction, starting from the
+// list of nodes specified in the `from` argument. Use `predicates` to decide if
+// traversal should enter/advance to/from the graph node. These predicates also
+// applied to the `from` nodes. Call corresponding callbacks for each visited
+// node.
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsPredicates& predicates,
+                  const DfsCallbacks& callbacks);
+
+// Traverse the graph in DFS order in the given direction, starting from the
+// list of nodes specified in the `from` argument. Call corresponding callbacks
+// for each visited node.
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsCallbacks& callbacks);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
index c040477a08970436cb07f6bb87c30e47b6b72525..7b36d328e938473333bd79044b7e953a2f25e17c 100644
--- a/tensorflow/core/grappler/utils/traversal_test.cc
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -15,101 +15,222 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/traversal.h"
 
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
+
 namespace {
+using ::tensorflow::test::function::NDef;
+
+DfsCallbacks MkCallbacks(std::vector<string>* pre_order,
+                         std::vector<string>* post_order,
+                         std::vector<string>* back_edges) {
+  return {[pre_order](const NodeDef* n) { pre_order->push_back(n->name()); },
+          [post_order](const NodeDef* n) { post_order->push_back(n->name()); },
+          [back_edges](const NodeDef* src, const NodeDef* dst) {
+            back_edges->push_back(absl::StrCat(src->name(), "->", dst->name()));
+          }};
+}
+
+TEST(TraversalTest, OutputsDfsNoLoop) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", op, {"5"}, {}),                        //
+       NDef("0", op, {"5", "4"}, {}),                   //
+       NDef("1", op, {"4", "3"}, {}),                   //
+       NDef("3", op, {"2"}, {}),                        //
+       NDef("5", op, {}, {}),                           //
+       NDef("4", op, {}, {})},                          //
+      /*funcs=*/{});
 
-class TraversalTest : public ::testing::Test {
- protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "", inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
-    }
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
-  }
-};
-
-TEST_F(TraversalTest, ReverseDfsNoLoop) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
+  std::vector<const NodeDef*> start_nodes = {&graph.node(4), &graph.node(5)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"4", "1", "0", "5", "2", "3"};
+  const std::vector<string> expected_post = {"1", "0", "4", "3", "2", "5"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
+}
+
+TEST(TraversalTest, InputsDfsNoLoop) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", op, {"5"}, {}),                        //
+       NDef("0", op, {"5", "4"}, {}),                   //
+       NDef("1", op, {"4", "3"}, {}),                   //
+       NDef("3", op, {"2"}, {}),                        //
+       NDef("5", op, {}, {}),                           //
+       NDef("4", op, {}, {})},                          //
+      /*funcs=*/{});
 
   std::vector<const NodeDef*> start_nodes = {&graph.node(1), &graph.node(2)};
+
   std::vector<string> pre_order;
   std::vector<string> post_order;
-  bool found_back_edge = false;
-  ReverseDfs(
-      GraphView(&graph), start_nodes,
-      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
-      [&found_back_edge](const NodeDef*, const NodeDef*) {
-        found_back_edge = true;
-      });
-
-  // Pre/Post order traversals are non deterministic because a node fanin is an
-  // absl::flat_hash_set with non deterministic traversal order.
-  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
-
-  std::set<ValidTraversal> valid_traversals = {
-      // pre_order                     post_order
-      {{"1", "4", "3", "2", "5", "0"}, {"4", "5", "2", "3", "1", "0"}},
-      {{"1", "3", "2", "5", "4", "0"}, {"5", "2", "3", "4", "1", "0"}}};
-
-  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
-  EXPECT_FALSE(found_back_edge);
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowInputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "3", "2", "5", "0"};
+  const std::vector<string> expected_post = {"4", "5", "2", "3", "1", "0"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
 }
 
-TEST_F(TraversalTest, ReverseDfsWithLoop) {
-  GraphDef graph;
-  // Create a loop
-  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
-  *graph.add_node() = CreateNode("3", "Switch", {"2"});
-  *graph.add_node() = CreateNode("4", "Identity", {"3"});
-  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
-  *graph.add_node() = CreateNode("1", "Enter", {});
-  *graph.add_node() = CreateNode("6", "Exit", {"3"});
+TEST(TraversalTest, InputsDfsWithLoop) {
+  // Graph with a loop.
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", "Merge", {"1", "5"}, {}),              //
+       NDef("3", "Switch", {"2"}, {}),                  //
+       NDef("4", "Identity", {"3"}, {}),                //
+       NDef("5", "NextIteration", {"4"}, {}),           //
+       NDef("1", "Enter", {}, {}),                      //
+       NDef("6", "Exit", {"3"}, {})},                   //
+      /*funcs=*/{});
 
   std::vector<const NodeDef*> start_nodes = {&graph.node(5)};
+
   std::vector<string> pre_order;
   std::vector<string> post_order;
   std::vector<string> back_edges;
-  ReverseDfs(
-      GraphView(&graph), start_nodes,
-      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
-      [&back_edges](const NodeDef* src, const NodeDef* dst) {
-        back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
-      });
-
-  // Pre/Post order traversals are non deterministic because a node fanin is an
-  // absl::flat_hash_set with non deterministic traversal order.
-  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
-
-  std::set<ValidTraversal> valid_traversals = {
-      // pre_order                     post_order
-      {{"6", "3", "2", "4", "5", "1"}, {"5", "4", "1", "2", "3", "6"}},
-      {{"6", "3", "2", "1", "5", "4"}, {"1", "4", "5", "2", "3", "6"}},
-      {{"6", "3", "2", "5", "4", "1"}, {"4", "5", "1", "2", "3", "6"}}};
-
-  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
-  EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowInputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"6", "3", "2", "1", "5", "4"};
+  const std::vector<string> expected_post = {"1", "4", "5", "2", "3", "6"};
+  const std::vector<string> expected_edges = {"4->3"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_EQ(back_edges, expected_edges);
+}
+
+TEST(TraversalTest, OutputDfsWithLoop) {
+  // Graph with a loop.
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", "Merge", {"1", "5"}, {}),              //
+       NDef("3", "Switch", {"2"}, {}),                  //
+       NDef("4", "Identity", {"3"}, {}),                //
+       NDef("5", "NextIteration", {"4"}, {}),           //
+       NDef("1", "Enter", {}, {}),                      //
+       NDef("6", "Exit", {"3"}, {})},                   //
+      /*funcs=*/{});
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"2", "3", "6", "4", "5"};
+  const std::vector<string> expected_post = {"6", "5", "4", "3", "2"};
+  const std::vector<string> expected_edges = {"5->2"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_EQ(back_edges, expected_edges);
+}
+
+TEST(TraversalTest, DfsWithEnterPredicate) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("1", op, {}, {}),                           //       2 -> 3
+       NDef("2", op, {"1"}, {}),                        // 1 -> /      \ -> 6
+       NDef("3", op, {"2"}, {}),                        //      \      /
+       NDef("4", op, {"1"}, {}),                        //       4 -> 5
+       NDef("5", op, {"4"}, {}),                        //
+       NDef("6", op, {"3", "5"}, {})},                  //
+      /*funcs=*/{});
+
+  // Do not enter the nodes '2' and '3'.
+  const auto enter = [](const NodeDef* node) {
+    return node->name() != "2" && node->name() != "3";
+  };
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               DfsPredicates::Enter(enter),
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "5", "6"};
+  const std::vector<string> expected_post = {"6", "5", "4", "1"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
+}
+
+TEST(TraversalTest, DfsWithAdvancePredicate) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("1", op, {}, {}),                           //       2 -> 3
+       NDef("2", op, {"1"}, {}),                        // 1 -> /      \ -> 6
+       NDef("3", op, {"2"}, {}),                        //      \      /
+       NDef("4", op, {"1"}, {}),                        //       4 -> 5
+       NDef("5", op, {"4"}, {}),                        //
+       NDef("6", op, {"3", "5"}, {})},                  //
+      {} /* empty function library*/);
+
+  // Do not advance from the nodes '2' and '3'.
+  const auto advance = [](const NodeDef* node) {
+    return node->name() != "2" && node->name() != "3";
+  };
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               DfsPredicates::Advance(advance),
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "5", "6", "2"};
+  const std::vector<string> expected_post = {"6", "5", "4", "2", "1"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/verifiers/BUILD b/tensorflow/core/grappler/verifiers/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e3e1538b00c5ca446deea5859771286f45736c6d
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/BUILD
@@ -0,0 +1,50 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "graph_verifier",
+    hdrs = [
+        "graph_verifier.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "structure_verifier",
+    srcs = ["structure_verifier.cc"],
+    hdrs = [
+        "structure_verifier.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_verifier",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/utils:topological_sort",
+    ],
+)
+
+tf_cc_test(
+    name = "structure_verifier_test",
+    srcs = ["structure_verifier_test.cc"],
+    deps = [
+        ":structure_verifier",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/core/grappler/verifiers/graph_verifier.h b/tensorflow/core/grappler/verifiers/graph_verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..10fd201eadcfd33709c0e7d2540528ad895b3358
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/graph_verifier.h
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_
+#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_
+
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// An abstract interface for verifying a graph.
+// This will be used to implement specific verifiers to verify that a grappler
+// transformed graph is valid.
+// Some examples of specific verifiers are:
+// 1. A general structural verifier that verifies that the specified graph has
+//    a valid structure that meets the specification of what it means to be
+//      a valid TensorFlow graph.
+// 2. A backend specific verifier that verifies that the specified graph,
+//     generated after a grappler transformation to convert the input TensorFlow
+//     graph to a corresponding backend graph, is a valid graph in the
+//     specification of the backend.
+class GraphVerifier {
+ public:
+  GraphVerifier() {}
+  virtual ~GraphVerifier() {}
+
+  // A name for the verifier.
+  virtual string name() const = 0;
+
+  // Implement an algorithm to verify the specified graph.
+  // The return value is a Status that represents a concatenation of Status of
+  // each verification step.
+  virtual Status Verify(const GraphDef& graph) = 0;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.cc b/tensorflow/core/grappler/verifiers/structure_verifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ab0a2ed5268eb5a9d8f16c9081fbbc04f7b31af3
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.cc
@@ -0,0 +1,45 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/validate.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// TODO(ashwinm): Expand this to add more structural checks.
+Status StructureVerifier::Verify(const GraphDef& graph) {
+  StatusGroup status_group;
+  status_group.Update(tensorflow::graph::ValidateGraphDefAgainstOpRegistry(
+      graph, *OpRegistry::Global()));
+  status_group.Update(tensorflow::graph::VerifyNoDuplicateNodeNames(graph));
+
+  std::vector<const NodeDef*> topo_order;
+  status_group.Update(ComputeTopologicalOrder(graph, &topo_order));
+  return status_group.as_status();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.h b/tensorflow/core/grappler/verifiers/structure_verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab719f1214eebb624d50a814ce437ffe3957304d
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.h
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
+#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Verifies the structure of a graph to ensure it is valid.
+class StructureVerifier : public GraphVerifier {
+ public:
+  StructureVerifier() {}
+  ~StructureVerifier() override {}
+
+  string name() const override { return "structure_verifier"; };
+
+  Status Verify(const GraphDef& graph) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3b1d0646d9b336cd8a70d5b44bf33eed9f8432c
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/strings/match.h"
+#include "tensorflow/cc/ops/parsing_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class StructureVerifierTest : public ::testing::Test {
+ protected:
+  StructureVerifierTest() { verifier_.reset(new StructureVerifier()); }
+  void SetGraph(const string& gdef_ascii) {
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &graph_));
+  }
+  GraphDef graph_;
+  std::unique_ptr<StructureVerifier> verifier_;
+};
+
+Status Scalars(shape_inference::InferenceContext* c) {
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("TestParams").Output("o: float").SetShapeFn(Scalars);
+REGISTER_OP("TestInput")
+    .Output("a: float")
+    .Output("b: float")
+    .SetShapeFn(Scalars);
+REGISTER_OP("TestMul")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(Scalars);
+
+TEST_F(StructureVerifierTest, ValidGraphs) {
+  // With scope, ops gets registered automatically.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  ops::ShapeN b(s.WithOpName("b"), {a, a, a});
+
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  TF_EXPECT_OK(verifier_->Verify(graph));
+
+  // With graphdef directly, relies on REGISTER_OP to register ops
+  SetGraph(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }");
+
+  TF_EXPECT_OK(verifier_->Verify(graph_));
+}
+
+TEST_F(StructureVerifierTest, OpNotRegistered) {
+  SetGraph(
+      "node { name: 'input' op: 'OpNotRegistered' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::NOT_FOUND);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Op type not registered"));
+}
+
+TEST_F(StructureVerifierTest, DuplicateNodeNames) {
+  SetGraph(
+      "node { name: 'A' op: 'TestParams' }"
+      "node { name: 'A' op: 'TestInput' }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::ALREADY_EXISTS);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Node already exists:"));
+}
+
+TEST_F(StructureVerifierTest, GraphWithInvalidCycle) {
+  SetGraph(
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(),
+                        "The graph couldn't be sorted in topological order"));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index c8aa2b32659d5a3ee680595913f6b8fb456f3bee..ffb093b2efe7c80cdc9e20da700dbb0a83acfc30 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -48,7 +48,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_kernel_tests_linkstatic",
-    "tf_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -61,6 +60,7 @@ load(
     "mkl_deps",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_nccl")
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -138,7 +138,11 @@ tf_kernel_library(
         "slice_op.h",
         "strided_slice_op.h",
         "strided_slice_op_impl.h",
-        "strided_slice_op_gpu.cu.cc",
+        "strided_slice_op_gpu_impl.h",
+        "strided_slice_op_gpu_int.cu.cc",
+        "strided_slice_op_gpu_complex.cu.cc",
+        "strided_slice_op_gpu_bool.cu.cc",
+        "strided_slice_op_gpu_number_types.cu.cc",
     ],
     deps = [
         ":bounds_check",
@@ -152,14 +156,65 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "clustering_ops",
+    prefix = "clustering_ops",
+    deps = [
+        "//tensorflow/core:clustering_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "clustering_ops_test",
+    srcs = ["clustering_ops_test.cc"],
+    deps = [
+        ":clustering_ops",
+        "//tensorflow/core:clustering_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "collective_ops",
+    srcs = if_nccl([
+        "collective_nccl_reducer.h",
+        "collective_nccl_reducer.cc",
+    ]),
     prefix = "collective_ops",
     deps = [
         "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+    ] + if_nccl([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core/nccl:nccl_lib",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "collective_nccl_reducer_test",
+    size = "small",
+    srcs = ["collective_nccl_reducer_test.cc"],
+    tags = tf_cuda_tests_tags() + ["no_cuda_on_cpu_tap"],
+    deps = [
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -316,6 +371,59 @@ tf_kernel_library(
     ]),
 )
 
+cc_library(
+    name = "sparse_utils",
+    srcs = [
+        "sparse_utils.cc",
+    ],
+    hdrs = ["sparse_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "sparse_utils_test",
+    srcs = ["sparse_utils_test.cc"],
+    deps = [
+        ":sparse_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "tensor_flag_utils",
+    srcs = [
+        "tensor_flag_utils.cc",
+    ],
+    hdrs = ["tensor_flag_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "tensor_flag_utils_test",
+    srcs = ["tensor_flag_utils_test.cc"],
+    deps = [
+        ":tensor_flag_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
 tf_cuda_library(
     name = "ops_testutil",
     testonly = 1,
@@ -458,12 +566,12 @@ cc_library(
     name = "batch_kernels",
     srcs = ["batch_kernels.cc"],
     deps = [
+        ":concat_lib_hdrs",
+        ":ops_util_hdrs",
+        ":split_lib_hdrs",
         "//tensorflow/core:batch_ops_op_lib",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:concat_lib_hdrs",
-        "//tensorflow/core/kernels:ops_util_hdrs",
-        "//tensorflow/core/kernels:split_lib_hdrs",
         "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
         "//tensorflow/core/kernels/batching_util:shared_batch_scheduler_hdrs",
     ],
@@ -545,13 +653,10 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "bounds_check",
-    hdrs = ["bounds_check.h"],
+    actual = "//tensorflow/core:framework_bounds_check",
     visibility = [":friends"],
-    deps = [
-        "//tensorflow/core:framework_bounds_check",
-    ],
 )
 
 # Private support libraries ---------------------------------------------------
@@ -1012,7 +1117,16 @@ tf_kernel_library(
     hdrs = ["tile_functor.h"],
     gpu_srcs = [
         "tile_functor.h",
-        "tile_functor_gpu.cu.cc",
+        "tile_functor_gpu.h",
+        "tile_functor_gpu_bool.cu.cc",
+        "tile_functor_gpu_complex64.cu.cc",
+        "tile_functor_gpu_complex128.cu.cc",
+        "tile_functor_gpu_double.cu.cc",
+        "tile_functor_gpu_float.cu.cc",
+        "tile_functor_gpu_half.cu.cc",
+        "tile_functor_gpu_int16.cu.cc",
+        "tile_functor_gpu_int32.cu.cc",
+        "tile_functor_gpu_int64.cu.cc",
     ],
     prefix = "tile_ops",
     deps = ARRAY_DEPS,
@@ -1087,13 +1201,13 @@ tf_cc_test(
     size = "small",
     srcs = ["ragged_gather_op_test.cc"],
     deps = [
+        ":ops_testutil",
         ":ragged_gather_op",
         "//tensorflow/core:framework",
         "//tensorflow/core:ragged_array_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
     ],
 )
 
@@ -1110,13 +1224,13 @@ tf_cc_test(
     name = "ragged_range_op_test",
     srcs = ["ragged_range_op_test.cc"],
     deps = [
+        ":ops_testutil",
         ":ragged_range_op",
         "//tensorflow/core:framework",
         "//tensorflow/core:ragged_math_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
     ],
 )
 
@@ -1134,6 +1248,7 @@ tf_cc_test(
     size = "small",
     srcs = ["ragged_tensor_to_sparse_kernel_test.cc"],
     deps = [
+        ":ops_testutil",
         ":ragged_tensor_to_sparse_kernel",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -1141,7 +1256,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
     ],
 )
 
@@ -1150,13 +1264,13 @@ tf_kernel_library(
     srcs = ["cudnn_rnn_ops.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        ":bounds_check_lib",
         ":gpu_util_hdrs",
         "//tensorflow/core:cudnn_rnn_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor",
-        "//tensorflow/core/kernels:bounds_check_lib",
         "//third_party/eigen3",
         "@farmhash_archive//:farmhash",
     ],
@@ -2060,6 +2174,16 @@ tf_kernel_library(
     deps = LOOKUP_DEPS,
 )
 
+cc_library(
+    name = "string_view_variant_wrapper",
+    hdrs = ["string_view_variant_wrapper.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "checkpoint_ops",
     deps = [
@@ -2223,9 +2347,7 @@ tf_kernel_library(
         ":bounds_check",
         ":dense_update_functor",
         ":gather_functor",
-        ":mutex_ops",
         ":scatter_functor",
-        ":state",
         ":training_op_helpers",
         ":variable_ops",
         "//tensorflow/core:core_cpu_lib",
@@ -2247,6 +2369,7 @@ tf_kernel_library(
     ],
     deps = [
         ":concat_lib",
+        ":fill_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:list_ops_op_lib",
@@ -2303,6 +2426,8 @@ tf_kernel_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/stream_executor:stream",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2478,7 +2603,6 @@ tf_kernel_library(
     prefix = "encode_wav_op",
     deps = [
         ":bounds_check",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2490,7 +2614,6 @@ tf_kernel_library(
     name = "decode_wav_op",
     prefix = "decode_wav_op",
     deps = [
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2513,6 +2636,7 @@ tf_cc_tests(
         ":eigen_helpers",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3110,7 +3234,7 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     prefix = "sparse_matmul_op",
-    deps = MATH_DEPS + select({
+    deps = MATH_DEPS + [":eigen_contraction_kernel"] + select({
         ":xsmm": [
             "@libxsmm_archive//:xsmm_avx",
         ],
@@ -3135,6 +3259,7 @@ cc_library(
         ":fft_ops",
         ":histogram_op",
         ":matmul_op",
+        ":nextafter_op",
         ":population_count_op",
         ":reduction_ops",
         ":scan_ops",
@@ -3205,6 +3330,12 @@ tf_kernel_library(
     deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"],
 )
 
+tf_kernel_library(
+    name = "nextafter_op",
+    prefix = "nextafter_op",
+    deps = MATH_DEPS + [":cwise_op"],
+)
+
 tf_kernel_library(
     name = "population_count_op",
     prefix = "population_count_op",
@@ -3264,7 +3395,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "scan_ops",
-    prefix = "scan_ops",
+    srcs = ["scan_ops.cc"],
+    hdrs = ["scan_ops.h"],
+    gpu_srcs = [
+        "scan_ops.h",
+        "scan_ops_gpu.h",
+        "scan_ops_gpu_double.cu.cc",
+        "scan_ops_gpu_float.cu.cc",
+        "scan_ops_gpu_half.cu.cc",
+    ],
     deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -3530,27 +3669,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "shape_op_test",
-    srcs = ["shape_op_test.cc"],
-    deps = [
-        ":array",
-        ":ops_testutil",
-        ":ops_util",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cuda_cc_test(
     name = "sparse_matmul_op_test",
     size = "small",
@@ -3689,7 +3807,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "depthwise_conv_op",
-    prefix = "depthwise_conv_op",
+    srcs = ["depthwise_conv_op.cc"],
+    hdrs = ["depthwise_conv_op.h"],
+    gpu_srcs = [
+        "depthwise_conv_op.h",
+        "depthwise_conv_op_gpu.h",
+        "depthwise_conv_op_gpu_double.cu.cc",
+        "depthwise_conv_op_gpu_float.cu.cc",
+        "depthwise_conv_op_gpu_half.cu.cc",
+    ],
     deps = [
         ":bounds_check",
         ":conv_ops",
@@ -3760,7 +3886,6 @@ NN_DEPS = [
     ":conv_2d",
     ":eigen_contraction_kernel",
     ":ops_util",
-    ":pooling_ops",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -3787,6 +3912,8 @@ tf_kernel_library(
     deps = NN_DEPS + if_cuda([
         ":reduction_ops",
         "@cub_archive//:cub",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/stream_executor/cuda:cuda_stream",
     ]),
 )
 
@@ -3839,7 +3966,21 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "topk_op",
-    prefix = "topk_op",
+    srcs = ["topk_op.cc"],
+    hdrs = ["topk_op.h"],
+    gpu_srcs = [
+        "topk_op.h",
+        "topk_op_gpu.h",
+        "topk_op_gpu_double.cu.cc",
+        "topk_op_gpu_float.cu.cc",
+        "topk_op_gpu_half.cu.cc",
+        "topk_op_gpu_int64.cu.cc",
+        "topk_op_gpu_int32.cu.cc",
+        "topk_op_gpu_int16.cu.cc",
+        "topk_op_gpu_uint16.cu.cc",
+        "topk_op_gpu_int8.cu.cc",
+        "topk_op_gpu_uint8.cu.cc",
+    ],
     deps = NN_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -3939,6 +4080,7 @@ tf_cuda_cc_test(
         ":nn",
         ":ops_testutil",
         ":ops_util",
+        ":pooling_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/core:core_cpu",
@@ -4221,6 +4363,31 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_kernel_library(
+    name = "stateful_random_ops",
+    prefix = "stateful_random_ops",
+    deps = [
+        ":bounds_check",
+        ":dense_update_functor",
+        ":gather_functor",
+        ":mutex_ops",
+        ":random_op",
+        ":resource_variable_ops",
+        ":scatter_functor",
+        ":state",
+        ":training_op_helpers",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:stateful_random_ops_op_lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:variant",
+    ],
+)
+
 tf_kernel_library(
     name = "stateless_random_ops",
     prefix = "stateless_random_ops",
@@ -4438,7 +4605,10 @@ tf_kernel_library(
     deps = SPARSE_DEPS + [
         ":bounds_check",
         "//third_party/eigen3",
-    ],
+    ] + if_cuda([
+        ":reduction_ops",
+        "@cub_archive//:cub",
+    ]),
 )
 
 tf_kernel_library(
@@ -4563,6 +4733,7 @@ cc_library(
     srcs = ["sdca_internal.cc"],
     hdrs = ["sdca_internal.h"],
     deps = [
+        ":eigen_contraction_kernel",
         ":loss_updaters",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -4754,6 +4925,8 @@ tf_cc_test(
     size = "small",
     srcs = ["string_format_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":string_format_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4762,8 +4935,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4796,6 +4967,8 @@ tf_cc_test(
     size = "small",
     srcs = ["regex_replace_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":regex_replace_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4804,8 +4977,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4820,6 +4991,8 @@ tf_cc_test(
     size = "small",
     srcs = ["string_split_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":string_split_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4828,8 +5001,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4850,6 +5021,8 @@ tf_cc_test(
     size = "small",
     srcs = ["substr_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":substr_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4859,8 +5032,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4929,11 +5100,14 @@ tf_kernel_library(
         ":random_op",
         ":random_ops",
         ":stateless_random_ops",
+        "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//third_party/eigen3",
-    ],
+    ] + if_cuda([
+        ":reduction_ops",
+        "@cub_archive//:cub",
+    ]),
 )
 
 tf_cuda_cc_test(
@@ -5090,7 +5264,6 @@ tf_kernel_library(
     prefix = "spectrogram_op",
     deps = [
         ":spectrogram",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5208,7 +5381,6 @@ tf_kernel_library(
     prefix = "mfcc_op",
     deps = [
         ":mfcc",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5258,7 +5430,6 @@ filegroup(
     srcs = [
         "avgpooling_op.h",
         "batch_util.h",
-        "bounds_check.h",
         "cwise_ops.h",
         "cwise_ops_common.h",
         "cwise_ops_gradients.h",
@@ -5299,7 +5470,6 @@ filegroup(
         "assign_op.h",
         "bias_op.cc",
         "bias_op.h",
-        "bounds_check.h",
         "cast_op.cc",
         "cast_op.h",
         "cast_op_impl.h",
@@ -5595,6 +5765,7 @@ filegroup(
         "decode_bmp_op.cc",
         "depthtospace_op.cc",
         "dynamic_stitch_op.cc",
+        "fft_ops.cc",
         "in_topk_op.cc",
         "initializable_lookup_table.cc",
         "logging_ops.cc",
@@ -5787,6 +5958,7 @@ filegroup(
             "mkl_*",
             "xsmm_*",
             "cwise_ops_sycl_common.h",
+            "nextafter_op.cc",
         ] + ANDROID_TEXTUAL_HDRS,
     ),
     visibility = ["//visibility:public"],
@@ -6512,6 +6684,7 @@ cc_library(
     srcs = ["remote_fused_graph_execute_op_test_utils.cc"],
     hdrs = ["remote_fused_graph_execute_op_test_utils.h"],
     deps = [
+        ":cwise_op",
         ":remote_fused_graph_execute_utils",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:ops",
@@ -6519,7 +6692,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:cwise_op",
     ],
 )
 
@@ -6530,6 +6702,7 @@ tf_cc_test(
         "remote_fused_graph_execute_utils_test.cc",
     ],
     deps = [
+        ":cwise_op",
         ":remote_fused_graph_execute_op_test_utils",
         ":remote_fused_graph_execute_utils",
         "//tensorflow/cc:cc_ops",
@@ -6545,7 +6718,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:cwise_op",
     ],
 )
 
@@ -6868,14 +7040,14 @@ tf_kernel_library(
     name = "summary_kernels",
     srcs = ["summary_kernels.cc"],
     deps = [
-        "//tensorflow/contrib/tensorboard/db:schema",
-        "//tensorflow/contrib/tensorboard/db:summary_db_writer",
-        "//tensorflow/contrib/tensorboard/db:summary_file_writer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:summary_ops_op_lib",
         "//tensorflow/core/lib/db:sqlite",
+        "//tensorflow/core/summary:schema",
+        "//tensorflow/core/summary:summary_db_writer",
+        "//tensorflow/core/summary:summary_file_writer",
     ],
 )
 
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 47e10f56dfa682d97b04b78cd0e5f9a536081025..1aef0060b0c35a9cc1b451f4a579c90bc31fbaaf 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -189,11 +189,11 @@ class AdjustContrastOpV2Base : public OpKernel {
                          const ComputeOptions& options) = 0;
 };
 
-template <typename Device>
+template <typename Device, typename T>
 class AdjustContrastOpv2;
 
 template <>
-class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
+class AdjustContrastOpv2<CPUDevice, float> : public AdjustContrastOpV2Base {
  public:
   explicit AdjustContrastOpv2(OpKernelConstruction* context)
       : AdjustContrastOpV2Base(context) {}
@@ -378,23 +378,32 @@ class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_CPU),
-                        AdjustContrastOpv2<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustContrastv2").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustContrastOpv2<CPUDevice, float>);
 
 #if GOOGLE_CUDA
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-template <>
-void AdjustContrastv2<GPUDevice>::operator()(
-    const GPUDevice& d, typename TTypes<float, 4>::ConstTensor input,
-    typename TTypes<float>::ConstScalar contrast_factor,
-    typename TTypes<float, 4>::Tensor output);
-extern template struct AdjustContrastv2<GPUDevice>;
+
+#define DECLARE_GPU_SPEC(T)                                         \
+  template <>                                                       \
+  void AdjustContrastv2<GPUDevice, T>::operator()(                  \
+      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+      typename TTypes<float>::ConstScalar contrast_factor,          \
+      typename TTypes<T, 4>::Tensor output);                        \
+  extern template struct AdjustContrastv2<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
+
+#undef DECLARE_GPU_SPEC
+
 }  // namespace functor
 
-template <>
-class AdjustContrastOpv2<GPUDevice> : public AdjustContrastOpV2Base {
+template <typename T>
+class AdjustContrastOpv2<GPUDevice, T> : public AdjustContrastOpV2Base {
  public:
   explicit AdjustContrastOpv2(OpKernelConstruction* context)
       : AdjustContrastOpV2Base(context) {}
@@ -403,20 +412,27 @@ class AdjustContrastOpv2<GPUDevice> : public AdjustContrastOpV2Base {
                  const ComputeOptions& options) override {
     const int64 shape[4] = {options.batch, options.height, options.width,
                             options.channels};
-    functor::AdjustContrastv2<GPUDevice>()(
-        context->eigen_device<GPUDevice>(),
-        options.input->shaped<float, 4>(shape), options.factor->scalar<float>(),
-        options.output->shaped<float, 4>(shape));
+    functor::AdjustContrastv2<GPUDevice, T>()(
+        context->eigen_device<GPUDevice>(), options.input->shaped<T, 4>(shape),
+        options.factor->scalar<float>(), options.output->shaped<T, 4>(shape));
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_GPU),
-                        AdjustContrastOpv2<GPUDevice>);
+#define REGISTER_GPU(T)                                                   \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("AdjustContrastv2").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      AdjustContrastOpv2<GPUDevice, T>);
+
+REGISTER_GPU(float)
+REGISTER_GPU(Eigen::half)
+
+#undef REGISTER_GPU
+
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 template <>
-class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
+class AdjustContrastOpv2<SYCLDevice, float> : public AdjustContrastOpV2Base {
  public:
   explicit AdjustContrastOpv2(OpKernelConstruction* context)
       : AdjustContrastOpV2Base(context) {}
@@ -431,8 +447,9 @@ class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
         options.output->shaped<float, 4>(shape));
   }
 };
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_SYCL),
-                        AdjustContrastOpv2<SYCLDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustContrastv2").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
+    AdjustContrastOpv2<SYCLDevice, float>);
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/adjust_contrast_op.h b/tensorflow/core/kernels/adjust_contrast_op.h
index f4a53c2ef9ca77eaa634a9a090cc98f93d179806..3e501bccee3315f15cf8009f5e04aa00d706da5c 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.h
+++ b/tensorflow/core/kernels/adjust_contrast_op.h
@@ -87,11 +87,11 @@ struct AdjustContrast {
 };
 
 // Functor used by AdjustContrastOpv2 to do the computations.
-template <typename Device>
+template <typename Device, typename T>
 struct AdjustContrastv2 {
-  void operator()(const Device& d, typename TTypes<float, 4>::ConstTensor input,
+  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
                   typename TTypes<float>::ConstScalar contrast_factor,
-                  typename TTypes<float, 4>::Tensor output) {
+                  typename TTypes<T, 4>::Tensor output) {
     const int batch = input.dimension(0);
     const int height = input.dimension(1);
     const int width = input.dimension(2);
@@ -138,15 +138,19 @@ struct AdjustContrastv2 {
 #endif
     Eigen::Sizes<1, 1, 1, 1> scalar;
     float num_reduced_coeffs = height * width;
-    output.device(d) =
-        (input.shuffle(reduced_dims_first).sum(reduction_axis).eval() /
-         num_reduced_coeffs)
-            .reshape(reshape_dims)
-            .broadcast(broadcast_dims);
+    output.device(d) = (input.template cast<float>()
+                            .shuffle(reduced_dims_first)
+                            .sum(reduction_axis)
+                            .eval() /
+                        num_reduced_coeffs)
+                           .template cast<T>()
+                           .reshape(reshape_dims)
+                           .broadcast(broadcast_dims);
     auto contrast_factor_tensor =
         contrast_factor.reshape(scalar).broadcast(scalar_broadcast);
-    auto adjusted = (input - output) * contrast_factor_tensor;
-    output.device(d) += adjusted;
+    auto adjusted =
+        (input - output).template cast<float>() * contrast_factor_tensor;
+    output.device(d) += adjusted.template cast<T>();
   }
 };
 
diff --git a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
index a451bfe29c76d0710e97cbe2b98a9837332014e5..1a1c2a4e1ee99cffbf5c18d849a97c36767829ee 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
@@ -26,7 +26,8 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 // this is for v2
-template struct functor::AdjustContrastv2<GPUDevice>;
+template struct functor::AdjustContrastv2<GPUDevice, float>;
+template struct functor::AdjustContrastv2<GPUDevice, Eigen::half>;
 
 // these are for v1
 template struct functor::AdjustContrast<GPUDevice, uint8>;
diff --git a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
index 49df5ae296b3e2a213c436d0e4656757c49cb16e..dede7d249da978dc71214c06dd1c542f78db751e 100644
--- a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
+++ b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
@@ -91,11 +91,10 @@ inline __device__ RgbTuple hsv2rgb_cuda(const float h, const float s,
   return tuple;
 }
 
-template <bool AdjustHue, bool AdjustSaturation, bool AdjustV>
+template <bool AdjustHue, bool AdjustSaturation, bool AdjustV, typename T>
 __global__ void adjust_hsv_nhwc(const int64 number_elements,
-                                const float* const __restrict__ input,
-                                float* const output,
-                                const float* const hue_delta,
+                                const T* const __restrict__ input,
+                                T* const output, const float* const hue_delta,
                                 const float* const saturation_scale,
                                 const float* const value_scale) {
   // multiply by 3 since we're dealing with contiguous RGB bytes for each pixel
@@ -111,7 +110,9 @@ __global__ void adjust_hsv_nhwc(const int64 number_elements,
     output[idx + 2] = input[idx + 2];
     return;
   }
-  const HsvTuple hsv = rgb2hsv_cuda(input[idx], input[idx + 1], input[idx + 2]);
+  const HsvTuple hsv = rgb2hsv_cuda(static_cast<float>(input[idx]),
+                                    static_cast<float>(input[idx + 1]),
+                                    static_cast<float>(input[idx + 2]));
   float new_h = hsv.h;
   float new_s = hsv.s;
   float new_v = hsv.v;
@@ -134,9 +135,9 @@ __global__ void adjust_hsv_nhwc(const int64 number_elements,
     new_v = hsv.v * scale;
   }
   const RgbTuple rgb = hsv2rgb_cuda(new_h, new_s, new_v);
-  output[idx] = rgb.r;
-  output[idx + 1] = rgb.g;
-  output[idx + 2] = rgb.b;
+  output[idx] = static_cast<T>(rgb.r);
+  output[idx + 1] = static_cast<T>(rgb.g);
+  output[idx + 2] = static_cast<T>(rgb.b);
 }
 
 }  // namespace internal
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 52dec94305d3c8558013861a44524609ad6eed7a..06de5ea3fb69c811f3057ff6829b22466a31f64a 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -82,7 +82,7 @@ class AdjustHueOpBase : public OpKernel {
   }
 };
 
-template <class Device>
+template <class Device, typename T>
 class AdjustHueOp;
 
 namespace internal {
@@ -196,7 +196,7 @@ static void hv_range_to_rgb(float h, float v_min, float v_max, float* r,
 }  // namespace internal
 
 template <>
-class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
+class AdjustHueOp<CPUDevice, float> : public AdjustHueOpBase {
  public:
   explicit AdjustHueOp(OpKernelConstruction* context)
       : AdjustHueOpBase(context) {}
@@ -245,12 +245,13 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_CPU),
-                        AdjustHueOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHue").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustHueOp<CPUDevice, float>);
 
 #if GOOGLE_CUDA
-template <>
-class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
+template <typename T>
+class AdjustHueOp<GPUDevice, T> : public AdjustHueOpBase {
  public:
   explicit AdjustHueOp(OpKernelConstruction* context)
       : AdjustHueOpBase(context) {}
@@ -265,17 +266,24 @@ class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
     const auto stream = device.stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
     if (number_of_elements > 0) {
-      const float* input_data = input->flat<float>().data();
+      const T* input_data = input->flat<T>().data();
       const float* delta_h = delta->flat<float>().data();
-      float* const output_data = output->flat<float>().data();
-      functor::AdjustHueGPU()(&device, number_of_elements, input_data, delta_h,
-                              output_data);
+      T* const output_data = output->flat<T>().data();
+      functor::AdjustHueGPU<T>()(&device, number_of_elements, input_data,
+                                 delta_h, output_data);
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_GPU),
-                        AdjustHueOp<GPUDevice>);
+#define REGISTER_GPU(T)                                            \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("AdjustHue").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      AdjustHueOp<GPUDevice, T>);
+
+REGISTER_GPU(float)
+REGISTER_GPU(Eigen::half)
+
+#undef REGISTER_GPU
 
 #endif
 
diff --git a/tensorflow/core/kernels/adjust_hue_op.h b/tensorflow/core/kernels/adjust_hue_op.h
index 983a4072bfa2ee5f44a1c5e1e1050ffa5aea5de7..6d6699de3fbcdd4e2b83f0c2a77a36422aa8e24b 100644
--- a/tensorflow/core/kernels/adjust_hue_op.h
+++ b/tensorflow/core/kernels/adjust_hue_op.h
@@ -27,10 +27,11 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
+template <typename T>
 struct AdjustHueGPU {
   void operator()(GPUDevice* device, const int64 number_of_elements,
-                  const float* const input, const float* const delta,
-                  float* const output);
+                  const T* const input, const float* const delta,
+                  T* const output);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
index a4fe5f755cafb6f30a28e87ea7febf0535c68a70..c30085269c07e2bdeae70a8729261596faeb6344 100644
--- a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
@@ -24,19 +24,25 @@ namespace tensorflow {
 
 namespace functor {
 
-void AdjustHueGPU::operator()(GPUDevice* device, const int64 number_of_elements,
-                              const float* const input,
-                              const float* const delta, float* const output) {
+template <typename T>
+void AdjustHueGPU<T>::operator()(GPUDevice* device,
+                                 const int64 number_of_elements,
+                                 const T* const input, const float* const delta,
+                                 T* const output) {
   const auto stream = device->stream();
   const CudaLaunchConfig config =
       GetCudaLaunchConfig(number_of_elements, *device);
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  internal::adjust_hsv_nhwc<true, false, false>
+  internal::adjust_hsv_nhwc<true, false, false, T>
       <<<block_count, threads_per_block, 0, stream>>>(
           number_of_elements, input, output, delta, nullptr, nullptr);
 }
+
+template struct AdjustHueGPU<float>;
+template struct AdjustHueGPU<Eigen::half>;
+
 }  // namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index f0c6ae499d4c209ef1556890e87f63085de7ea75..87d34fcfcc31c9f806754e9b1bc36430938d64c3 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -81,7 +81,7 @@ class AdjustSaturationOpBase : public OpKernel {
   }
 };
 
-template <class Device>
+template <class Device, typename T>
 class AdjustSaturationOp;
 
 namespace internal {
@@ -173,7 +173,7 @@ static void hsv_to_rgb(float h, float s, float v, float* r, float* g,
 }  // namespace internal
 
 template <>
-class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
+class AdjustSaturationOp<CPUDevice, float> : public AdjustSaturationOpBase {
  public:
   explicit AdjustSaturationOp(OpKernelConstruction* context)
       : AdjustSaturationOpBase(context) {}
@@ -211,12 +211,13 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustSaturation").Device(DEVICE_CPU),
-                        AdjustSaturationOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustSaturation").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustSaturationOp<CPUDevice, float>);
 
 #if GOOGLE_CUDA
-template <>
-class AdjustSaturationOp<GPUDevice> : public AdjustSaturationOpBase {
+template <typename T>
+class AdjustSaturationOp<GPUDevice, T> : public AdjustSaturationOpBase {
  public:
   explicit AdjustSaturationOp(OpKernelConstruction* context)
       : AdjustSaturationOpBase(context) {}
@@ -231,17 +232,24 @@ class AdjustSaturationOp<GPUDevice> : public AdjustSaturationOpBase {
     const auto stream = device.stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
     if (number_of_elements > 0) {
-      const float* input_data = input->flat<float>().data();
+      const T* input_data = input->flat<T>().data();
       const float* scale_data = scale->flat<float>().data();
-      float* const output_data = output->flat<float>().data();
-      functor::AdjustSaturationGPU()(&device, number_of_elements, input_data,
-                                     scale_data, output_data);
+      T* const output_data = output->flat<T>().data();
+      functor::AdjustSaturationGPU<T>()(&device, number_of_elements, input_data,
+                                        scale_data, output_data);
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustSaturation").Device(DEVICE_GPU),
-                        AdjustSaturationOp<GPUDevice>);
+#define REGISTER_GPU(T)                                                   \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("AdjustSaturation").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      AdjustSaturationOp<GPUDevice, T>);
+
+REGISTER_GPU(float)
+REGISTER_GPU(Eigen::half)
+
+#undef REGISTER_GPU
 
 #endif
 
diff --git a/tensorflow/core/kernels/adjust_saturation_op.h b/tensorflow/core/kernels/adjust_saturation_op.h
index fd28ba536f2f4e13079a0b7ed9f4097bb10e629e..c21ce4e3608827df08c76d608fb88a5b5b99a3da 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.h
+++ b/tensorflow/core/kernels/adjust_saturation_op.h
@@ -27,10 +27,11 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
+template <typename T>
 struct AdjustSaturationGPU {
   void operator()(GPUDevice* device, const int64 number_of_elements,
-                  const float* const input, const float* const scale,
-                  float* const output);
+                  const T* const input, const float* const scale,
+                  T* const output);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
index 37cfb26a47b01ca15cdb6287243a16490bb34bfb..6c70490d469fa8dbdc425f9e57b42acda14f5a58 100644
--- a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
@@ -24,21 +24,26 @@ namespace tensorflow {
 
 namespace functor {
 
-void AdjustSaturationGPU::operator()(GPUDevice* device,
-                                     const int64 number_of_elements,
-                                     const float* const input,
-                                     const float* const scale,
-                                     float* const output) {
+template <typename T>
+void AdjustSaturationGPU<T>::operator()(GPUDevice* device,
+                                        const int64 number_of_elements,
+                                        const T* const input,
+                                        const float* const scale,
+                                        T* const output) {
   const auto stream = device->stream();
   const CudaLaunchConfig config =
       GetCudaLaunchConfig(number_of_elements, *device);
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  internal::adjust_hsv_nhwc<false, true, false>
+  internal::adjust_hsv_nhwc<false, true, false, T>
       <<<block_count, threads_per_block, 0, stream>>>(
           number_of_elements, input, output, nullptr, scale, nullptr);
 }
+
+template struct AdjustSaturationGPU<float>;
+template struct AdjustSaturationGPU<Eigen::half>;
+
 }  // namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index 150e8fe6379fd2a41778e94df793ba45ef0d309e..edf6d3e61e0dc4297ad330fbe43086fce0607088 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -179,20 +179,7 @@ class AddNOp<Device, Variant> : public OpKernel {
               i, " has shape: ", ctx->input(i).shape().DebugString(), "."));
     }
 
-    TensorShape common_shape;
-    OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(0), &common_shape));
-    // Step 2: access all variants and ensure shapes match.
-    for (int i = 1; i < num; ++i) {
-      TensorShape check_shape;
-      OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(i), &check_shape));
-      OP_REQUIRES(ctx, common_shape == check_shape,
-                  errors::InvalidArgument(
-                      "AddN of Variants of differing shapes; inputs[0] shape: ",
-                      common_shape.DebugString(), ", inputs[", i,
-                      "] shape: ", check_shape.DebugString()));
-    }
-
-    // Step 3: attempt to add using
+    // Step 2: attempt to add using
     //   BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...)
     //   For the output create a default-constructed variant object.
     // TODO(ebrevdo): Perform summation in a tree-structure.
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index c731b64993b3a6cebfb46eca9221ca28b729e845..778f818a61a54ec1aa78b93a8f5b8e61755a341f 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -25,13 +25,13 @@ limitations under the License.
 
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index aa9123582210bdf31993e9d8c58ba90cc02acc5e..d5bd36b4ceaa62f6c2f6928bbea704a0e6d01017 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -300,7 +300,7 @@ class Barrier : public ResourceBase {
     ready_queue_->Unref();
   }
 
-  string DebugString() override { return "A barrier"; }
+  string DebugString() const override { return "A barrier"; }
 
  protected:
   template <typename T>
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 35ddda0ec04da6f3b6f11606ecb019e38698c6d7..5ba461aa9de2a647962c653fb9ca0f199e9110be 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -233,7 +233,7 @@ class BatchResource : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() final { return "BatchResource"; }
+  string DebugString() const final { return "BatchResource"; }
 
   // Ingests data from one invocation of the batch op. The data is enqueued to
   // be combined with others into a batch, asynchronously.
@@ -878,7 +878,7 @@ class UnbatchResource : public ResourceBase {
     timeout_enforcer_ = nullptr;
   }
 
-  string DebugString() final { return "UnbatchResource"; }
+  string DebugString() const final { return "UnbatchResource"; }
 
   Status Compute(OpKernelContext* context, AsyncOpKernel::DoneCallback done) {
     const Tensor& data_t = context->input(0);
@@ -1094,7 +1094,7 @@ class UnbatchGradResource : public ResourceBase {
  public:
   UnbatchGradResource() {}
 
-  string DebugString() final { return "UnbatchGradResource"; }
+  string DebugString() const final { return "UnbatchGradResource"; }
 
   // Flushes the information for one batch, given its context and done
   // callback. Clears all information about it from the available_tensors_.
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 656b6ced6de00933cfe8db7dadd1a56ade212758..bef73b0574fc684f6970e705a3b95ed54e41a369 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -125,6 +125,10 @@ class AdaptiveSharedBatchScheduler
     int max_batch_size = 1000;
     // Maximum number of enqueued (i.e. non-scheduled) batches.
     int max_enqueued_batches = 10;
+    // Amount of time non-full batches must wait before becoming schedulable.
+    // A non-zero value can improve performance by limiting the scheduling of
+    // nearly empty batches.
+    int64 batch_timeout_micros = 0;
   };
 
   using BatchProcessor = std::function<void(std::unique_ptr<Batch<TaskType>>)>;
@@ -267,8 +271,11 @@ class ASBSQueue : public BatchScheduler<TaskType> {
 template <typename TaskType>
 class ASBSBatch : public Batch<TaskType> {
  public:
-  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros)
-      : queue_(queue), creation_time_micros_(creation_time_micros) {}
+  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros,
+            int64 batch_timeout_micros)
+      : queue_(queue),
+        creation_time_micros_(creation_time_micros),
+        schedulable_time_micros_(creation_time_micros + batch_timeout_micros) {}
 
   ~ASBSBatch() override {}
 
@@ -276,9 +283,12 @@ class ASBSBatch : public Batch<TaskType> {
 
   int64 creation_time_micros() const { return creation_time_micros_; }
 
+  int64 schedulable_time_micros() const { return schedulable_time_micros_; }
+
  private:
   ASBSQueue<TaskType>* queue_;
   const int64 creation_time_micros_;
+  const int64 schedulable_time_micros_;
   TF_DISALLOW_COPY_AND_ASSIGN(ASBSBatch);
 };
 }  // namespace internal
@@ -377,7 +387,12 @@ void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
     bool also_schedule_closed_batch) {
   mutex_lock l(mu_);
   batches_.push_back(batch);
-  MaybeScheduleNextBatch();
+  // Maybe schedule this batch once it becomes schedulable.
+  GetEnv()->SchedClosureAfter(
+      batch->schedulable_time_micros() - batch->creation_time_micros(), [this] {
+        mutex_lock l(mu_);
+        MaybeScheduleNextBatch();
+      });
   if (also_schedule_closed_batch) {
     MaybeScheduleClosedBatch();
   }
@@ -400,21 +415,22 @@ void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
           in_flight_batches_limit_ - in_flight_batches_) {
     return;
   }
-  auto best_it = batches_.begin();
-  double best_score =
-      (*best_it)->creation_time_micros() -
-      options_.full_batch_scheduling_boost_micros * (*best_it)->size() /
-          static_cast<double>((*best_it)->queue()->max_task_size());
-  for (auto it = batches_.begin() + 1; it != batches_.end(); it++) {
+  auto best_it = batches_.end();
+  double best_score;
+  int64 now_micros = GetEnv()->NowMicros();
+  for (auto it = batches_.begin(); it != batches_.end(); it++) {
+    if ((*it)->schedulable_time_micros() > now_micros) continue;
     const double score =
         (*it)->creation_time_micros() -
         options_.full_batch_scheduling_boost_micros * (*it)->size() /
             static_cast<double>((*it)->queue()->max_task_size());
-    if (score < best_score) {
+    if (best_it == batches_.end() || score < best_score) {
       best_score = score;
       best_it = it;
     }
   }
+  // No schedulable batches.
+  if (best_it == batches_.end()) return;
   const internal::ASBSBatch<TaskType>* batch = *best_it;
   batches_.erase(best_it);
   // Queue may destroy itself after ReleaseBatch is called.
@@ -552,7 +568,8 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
     if (!current_batch_) {
       num_enqueued_batches_++;
       current_batch_ = new_batch =
-          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros());
+          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros(),
+                                  options_.batch_timeout_micros);
     }
     current_batch_->AddTask(std::move(*task));
     num_enqueued_tasks_++;
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index d4f4b43d63b90c22abbbe82263b09353912010c8..074f64a634aa83509df6e633f1fb0153ef9d6393 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/bias_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if GOOGLE_CUDA
@@ -153,13 +153,13 @@ class BiasOp : public BinaryOp<T> {
               bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
         } break;
         case 5: {
-          Eigen::DSizes<int32, 5> four_dims(1, channel, 1, 1, 1);
+          Eigen::DSizes<int32, 5> five_dims(1, channel, 1, 1, 1);
           Eigen::DSizes<int32, 5> broad_cast_dims(batch, 1, height, width,
                                                   depth);
           const Device& d = context->eigen_device<Device>();
           output->tensor<T, 5>().device(d) =
               input.tensor<T, 5>() +
-              bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
+              bias.tensor<T, 1>().reshape(five_dims).broadcast(broad_cast_dims);
         } break;
         default:
           OP_REQUIRES(context, false,
@@ -269,28 +269,24 @@ class BiasGradOp : public OpKernel {
       output->template flat<T>().setZero();
     } else {
       // Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
-      // TODO(yongtang): Add 3/4/5 dimensional data support for NCHW format.
       if (data_format_ == FORMAT_NCHW) {
-        OP_REQUIRES(context, output_backprop.dims() == 4,
-                    errors::InvalidArgument(
-                        "NCHW format supports only 4D input/output tensor."));
-        Eigen::DSizes<Eigen::Index, 4> four_dims(batch, channel, height, width);
+        Eigen::DSizes<Eigen::Index, 3> three_dims(batch, channel,
+                                                  height * width * depth);
 #ifdef EIGEN_HAS_INDEX_LIST
         using idx0 = Eigen::type2index<0>;
         using idx2 = Eigen::type2index<2>;
-        using idx3 = Eigen::type2index<3>;
-        Eigen::IndexList<idx0, idx2, idx3> reduction_axes;
+        Eigen::IndexList<idx0, idx2> reduction_axes;
 #else
-        Eigen::array<Eigen::Index, 3> reduction_axes = {0, 2, 3};
+        Eigen::array<Eigen::Index, 2> reduction_axes = {0, 2};
 #endif
         output->template flat<T>().device(context->eigen_device<Device>()) =
             output_backprop.flat<T>()
                 .template cast<typename AccumulatorType<T>::type>()
-                .reshape(four_dims)
+                .reshape(three_dims)
                 .sum(reduction_axes)
                 .template cast<T>();  // End of code by intel_tf.
       } else {
-        Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width,
+        Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width * depth,
                                                 channel);
 #ifdef EIGEN_HAS_INDEX_LIST
         Eigen::IndexList<Eigen::type2index<0> > reduction_axis;
@@ -496,21 +492,21 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
 
   void ComputeWithCustomKernel(OpKernelContext* context,
                                const Tensor& output_backprop, int32 batch,
-                               int32 width, int32 height, int32 channel,
-                               Tensor* output) {
+                               int32 width, int32 height, int32 depth,
+                               int32 channel, Tensor* output) {
     BiasGradGPU<T>::compute(context->template eigen_device<Device>(),
                             output_backprop.template flat<T>().data(),
                             output->flat<T>().data(), batch, width, height,
-                            channel, data_format_);
+                            depth, channel, data_format_);
   }
 
   void ComputeWithReduceSum(OpKernelContext* context,
                             const Tensor& output_backprop, int32 batch,
-                            int32 width, int32 height, int32 channel,
-                            Tensor* output) {
+                            int32 width, int32 height, int32 depth,
+                            int32 channel, Tensor* output) {
     if (data_format_ == FORMAT_NCHW) {
       int32 row_count = batch * channel;
-      int32 col_count = height * width;
+      int32 col_count = height * width * depth;
       Tensor temp_grad_outputs;
       // For 'NCHW' format, we perform reduction twice: first HW, then N.
       TensorShape temp_grad_output_shape{row_count, col_count};
@@ -528,7 +524,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
                                      row_count, col_count);
     } else {
       // For 'NHWC', we simply apply reduction once on NHW.
-      int32 row_count = batch * height * width;
+      int32 row_count = batch * height * width * depth;
       int32 col_count = channel;
       BiasGradGPU<T>::DoColReduction(
           context, const_cast<T*>(output->flat<T>().data()),
@@ -561,7 +557,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     int device_id = stream->parent()->device_ordinal();
     DataType dtype = output_backprop.dtype();
     BiasAddParams bias_parameters = {
-        {batch, height * width, channel},
+        {batch, height * width * depth, channel},
         data_format_,
         dtype,
         device_id,
@@ -576,7 +572,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
       stream->InitTimer(&timer);
       stream->ThenStartTimer(&timer);
       ComputeWithCustomKernel(context, output_backprop, batch, width, height,
-                              channel, output);
+                              depth, channel, output);
       stream->ThenStopTimer(&timer);
       uint64 elapsed_microseconds = timer.Microseconds();
       VLOG(1) << "BiasAddGrad " << bias_parameters.ToString()
@@ -589,7 +585,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
       // Try reduction and profile.
       stream->ThenStartTimer(&timer);
       ComputeWithReduceSum(context, output_backprop, batch, width, height,
-                           channel, output);
+                           depth, channel, output);
       stream->ThenStopTimer(&timer);
 
       elapsed_microseconds = timer.Microseconds();
@@ -610,11 +606,11 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     // Choose the best algorithm based on autotune results.
     if (algo_config.get_mode() == BiasAddGradGPUMode::kReduction) {
       ComputeWithReduceSum(context, output_backprop, batch, width, height,
-                           channel, output);
+                           depth, channel, output);
     } else {
       // Default to the customized kernel.
       ComputeWithCustomKernel(context, output_backprop, batch, width, height,
-                              channel, output);
+                              depth, channel, output);
     }
   }
 
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 24fea8a8e6f10cea4f74e743c8aa2c6bfb49313f..006fa1dc712f7c06953f70e278fedaa3504bfcce 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -195,10 +195,10 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
 template <typename T>
 void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                              T* bias_backprop, int32 batch, int32 height,
-                             int32 width, int32 channel,
+                             int32 width, int32 depth, int32 channel,
                              TensorFormat data_format) {
   const int32 bias_size = channel;
-  const int32 image_size = height * width;
+  const int32 image_size = height * width * depth;
   const int32 total_count = batch * bias_size * image_size;
   if (total_count == 0) {
     return;
diff --git a/tensorflow/core/kernels/bias_op_gpu.h b/tensorflow/core/kernels/bias_op_gpu.h
index a0b2ce4f9b34b0b343de3d09374b07d554c57d15..372a403e6872dcfb0c41b0dafe5be045c3388054 100644
--- a/tensorflow/core/kernels/bias_op_gpu.h
+++ b/tensorflow/core/kernels/bias_op_gpu.h
@@ -39,7 +39,7 @@ template <typename T>
 struct BiasGradGPU {
   static void compute(const GPUDevice& device, const T* output_backprop,
                       T* bias_backprop, int32 batch, int32 height, int32 width,
-                      int32 channel, TensorFormat data_format);
+                      int32 depth, int32 channel, TensorFormat data_format);
 
   static void DoRowReduction(OpKernelContext* context, T* output,
                              const T* input, int rows, int cols);
diff --git a/tensorflow/core/kernels/bitcast_op.cc b/tensorflow/core/kernels/bitcast_op.cc
index f602cfa428a555970f35b4057c46641a3ba156dd..02c8808809e10b777d37c08be0ff907eb923c3c7 100644
--- a/tensorflow/core/kernels/bitcast_op.cc
+++ b/tensorflow/core/kernels/bitcast_op.cc
@@ -45,8 +45,7 @@ class BitcastOp : public OpKernel {
                 in_size_ >= out_size_ ||
                     (input_tensor.dims() > 0 &&
                      input_tensor.dim_size(input_tensor.dims() - 1) ==
-                         out_size_ / in_size_) ||
-                    input_tensor.dim_size(input_tensor.dims()) == -1,
+                         out_size_ / in_size_),
                 errors::InvalidArgument(
                     "Cannot bitcast from ", DataTypeString(input_data_type_),
                     " to ", DataTypeString(output_data_type_), ": shape ",
@@ -59,8 +58,9 @@ class BitcastOp : public OpKernel {
     }
     Tensor output_tensor;
 
-    output_tensor.UnsafeCopyFromInternal(input_tensor, output_data_type_,
-                                         adjusted_shape);
+    OP_REQUIRES_OK(context,
+                   output_tensor.BitcastFrom(input_tensor, output_data_type_,
+                                             adjusted_shape));
     context->set_output(0, output_tensor);
   }
 
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 8f2c2dbe8a778353dff5e0b8823ac99de68282df..285cded181cb2014e50f96c957290d642fcb6810 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -31,7 +31,6 @@ tf_kernel_library(
     deps = [
         ":resource_ops",
         ":resources",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -60,7 +59,6 @@ tf_kernel_library(
     srcs = ["resource_ops.cc"],
     deps = [
         ":resources",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
@@ -72,7 +70,6 @@ tf_kernel_library(
     srcs = ["stats_ops.cc"],
     deps = [
         ":tree_helper",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -84,7 +81,6 @@ tf_kernel_library(
     deps = [
         ":resources",
         ":tree_helper",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
@@ -95,7 +91,6 @@ tf_kernel_library(
     name = "quantile_ops",
     srcs = ["quantile_ops.cc"],
     deps = [
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees/quantiles:weighted_quantiles",
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
index 1c31724272ab11a20ac6f72edd87a86105dd643e..965bf2c924c8791578c5f069e40d2d748e5f3978 100644
--- a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
@@ -37,15 +37,15 @@ class BoostedTreesQuantileStreamResource : public ResourceBase {
         epsilon_(epsilon),
         num_streams_(num_streams),
         max_elements_(max_elements) {
-          streams_.reserve(num_streams_);
-          boundaries_.reserve(num_streams_);
-          for (int64 idx = 0; idx < num_streams; ++idx) {
-            streams_.push_back(QuantileStream(epsilon, max_elements));
-            boundaries_.push_back(std::vector<float>());
-          }
-        }
-
-  string DebugString() override { return "QuantileStreamResource"; }
+    streams_.reserve(num_streams_);
+    boundaries_.reserve(num_streams_);
+    for (int64 idx = 0; idx < num_streams; ++idx) {
+      streams_.push_back(QuantileStream(epsilon, max_elements));
+      boundaries_.push_back(std::vector<float>());
+    }
+  }
+
+  string DebugString() const override { return "QuantileStreamResource"; }
 
   tensorflow::mutex* mutex() { return &mu_; }
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 2798722536271380697539dca4d83ca865051da6..42df4848815db7a097a70b4f1713fd42484be438 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -31,7 +31,7 @@ BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
           protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
               &arena_)) {}
 
-string BoostedTreesEnsembleResource::DebugString() {
+string BoostedTreesEnsembleResource::DebugString() const {
   return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
                          "]");
 }
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index f961ed38142709b01ba009a4d8fb3dab2fe757c4..3c7b2df9b08a2b8912c43b2439e28f34a64b38ef 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -48,7 +48,7 @@ class BoostedTreesEnsembleResource : public StampedResource {
  public:
   BoostedTreesEnsembleResource();
 
-  string DebugString() override;
+  string DebugString() const override;
 
   bool InitFromSerialized(const string& serialized, const int64 stamp_token);
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index 3a72567655c09c7091bc917e0af9f20725f38287..5306c77102ebf70cdbcbae847d4386829ee3526b 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -99,9 +99,9 @@ void CastOpBase::Compute(OpKernelContext* ctx) {
   } else {
     Tensor in;
     if (external_src_dtype_ != src_dtype_) {
-      // If the type is a quantized type we need to do an UnsafeCopyFromInternal
-      // since the src_dtype_ is different from external_src_type_.
-      in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
+      // If the type is a quantized type we need to do a bitcast since the
+      // src_dtype_ is different from external_src_type_.
+      OP_REQUIRES_OK(ctx, in.BitcastFrom(inp, src_dtype_, inp.shape()));
     } else {
       in = inp;
     }
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/core/kernels/clustering_ops.cc
similarity index 99%
rename from tensorflow/contrib/factorization/kernels/clustering_ops.cc
rename to tensorflow/core/kernels/clustering_ops.cc
index 025534d540bb82cdb87bb2977d08dfa4f02f1bc8..7e1a1fdcd2d2f9a5d4ea5228497b515ff65e3791 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/core/kernels/clustering_ops.cc
@@ -392,7 +392,7 @@ class NearestNeighborsOp : public OpKernel {
       for (; start < limit; ++start) {
         const int64 start_row = num_points * start / num_units;
         const int64 limit_row = num_points * (start + 1) / num_units;
-        CHECK_LE(limit_row, num_points);
+        DCHECK_LE(limit_row, num_points);
         const int64 num_rows = limit_row - start_row;
         auto points_shard = points.middleRows(start_row, num_rows);
         const Eigen::VectorXf points_half_squared_norm =
@@ -430,7 +430,7 @@ class NearestNeighborsOp : public OpKernel {
       const Eigen::Ref<const Eigen::VectorXf>& centers_half_squared_norm,
       const Eigen::Ref<MatrixXi64RowMajor>& nearest_center_indices,
       const Eigen::Ref<MatrixXfRowMajor>& nearest_center_distances) {
-    CHECK_LE(k, centers.rows());
+    DCHECK_LE(k, centers.rows());
     if (centers.rows() <= kNearestNeighborsCentersMaxBlockSize) {
       FindKNearestCentersOneBlock(k, points, points_half_squared_norm, centers,
                                   centers_half_squared_norm,
@@ -451,7 +451,7 @@ class NearestNeighborsOp : public OpKernel {
       const Eigen::Ref<const Eigen::VectorXf>& centers_half_squared_norm,
       Eigen::Ref<MatrixXi64RowMajor> nearest_center_indices,
       Eigen::Ref<MatrixXfRowMajor> nearest_center_distances) {
-    CHECK_LE(k, centers.rows());
+    DCHECK_LE(k, centers.rows());
     const int64 num_points = points.rows();
     const MatrixXfRowMajor inner_product = points * centers.transpose();
     // Find nearest neighbors.
@@ -500,8 +500,8 @@ class NearestNeighborsOp : public OpKernel {
       Eigen::Ref<MatrixXfRowMajor> nearest_center_distances) {
     const int64 num_points = points.rows();
     const int64 num_centers = centers.rows();
-    CHECK_LE(k, num_centers);
-    CHECK_GT(num_centers, kNearestNeighborsCentersMaxBlockSize);
+    DCHECK_LE(k, num_centers);
+    DCHECK_GT(num_centers, kNearestNeighborsCentersMaxBlockSize);
     // Store nearest neighbors with first block of centers directly into the
     // output matrices.
     int64 out_k = std::min(k, kNearestNeighborsCentersMaxBlockSize);
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops_test.cc b/tensorflow/core/kernels/clustering_ops_test.cc
similarity index 100%
rename from tensorflow/contrib/factorization/kernels/clustering_ops_test.cc
rename to tensorflow/core/kernels/clustering_ops_test.cc
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.cc b/tensorflow/core/kernels/collective_nccl_reducer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..113f1487f0189aa351e5f89e37be116b0ac33ee5
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer.cc
@@ -0,0 +1,204 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+
+#ifdef GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
+
+namespace tensorflow {
+namespace {
+string NcclCollectiveKey(const string& exec_key, int step_id) {
+  return strings::StrCat(exec_key, ":", step_id);
+}
+}  // namespace
+
+NcclReducer::NcclReducer() : col_ctx_(nullptr), col_params_(nullptr) {}
+
+Status NcclReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.type != REDUCTION_COLLECTIVE ||
+      col_params->instance.impl_details.collective_name != "NcclReduce") {
+    return errors::Internal("Unexpected collective type ",
+                            col_params->instance.type, " expected ",
+                            REDUCTION_COLLECTIVE, "; or collective name ",
+                            col_params->instance.impl_details.collective_name,
+                            " expected NcclReduce");
+  } else {
+    return Status::OK();
+  }
+}
+
+Status NcclReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) {
+  col_ctx_ = col_ctx;
+  col_params_ = &col_ctx->col_params;
+  return collective_util::InitializeDeviceAndLocality(
+      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
+      &col_ctx->device_locality);
+}
+
+Status NcclReducer::InitializeInstanceBeforeGroupDiscovery(
+    CollectiveParams* col_params) {
+  if (col_params->default_rank == 0 && col_params->group.num_tasks > 1) {
+    col_params->instance.communicator_key =
+        NcclManager::instance()->GenerateCommunicatorKey();
+  }
+  return Status::OK();
+}
+
+Status ReductionOp(const string& merge_op, ncclRedOp_t* reduction_op) {
+  if (merge_op == "Add") {
+    *reduction_op = ncclSum;
+    return Status::OK();
+  } else if (merge_op == "Mul") {
+    *reduction_op = ncclProd;
+    return Status::OK();
+  } else {
+    return errors::Internal("Expected merge_op to be either Add or Mul, found ",
+                            merge_op);
+  }
+}
+
+void NcclReducer::Run(StatusCallback done) {
+  ncclRedOp_t reduction_op;
+  Status s = ReductionOp(col_params_->merge_op->type_string(), &reduction_op);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+
+  Tensor group_size;
+  Notification group_size_ready;
+  Status group_size_status;
+  if (col_params_->final_op) {
+    // Create an on-device scalar value from group_size_.
+    // TODO(ayushd, tucker): avoid this copy by either reusing across
+    // invocations or providing the scalar to the kernel in host memory.
+    Tensor group_size_val(col_ctx_->output->dtype(), TensorShape({}));
+    switch (col_ctx_->output->dtype()) {
+      case DT_FLOAT:
+        group_size_val.scalar<float>()() = col_params_->group.group_size;
+        break;
+      case DT_DOUBLE:
+        group_size_val.scalar<double>()() = col_params_->group.group_size;
+        break;
+      case DT_INT32:
+        group_size_val.scalar<int32>()() = col_params_->group.group_size;
+        break;
+      case DT_INT64:
+        group_size_val.scalar<int64>()() = col_params_->group.group_size;
+        break;
+      default:
+        done(errors::Internal("Unsupported type ", col_ctx_->output->dtype()));
+        return;
+    }
+    group_size = Tensor(
+        col_ctx_->device->GetAllocator(col_ctx_->op_ctx->input_alloc_attr(0)),
+        col_ctx_->output->dtype(), TensorShape({}));
+    DeviceContext* op_dev_ctx = col_ctx_->op_ctx->op_device_context();
+    // Enqueue copy on gpu stream.
+    op_dev_ctx->CopyCPUTensorToDevice(
+        &group_size_val, col_ctx_->device, &group_size,
+        [&group_size_ready, &group_size_status](const Status& s) {
+          group_size_status = s;
+          group_size_ready.Notify();
+        });
+  } else {
+    group_size_ready.Notify();
+  }
+
+  Notification nccl_done;
+  Status nccl_status;
+  auto* compute_stream = col_ctx_->op_ctx->op_device_context()->stream();
+  auto* gpu_info = col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
+  // `AddToAllReduce` performs consistency checks for the NCCL call and enqueues
+  // the `Participant` struct locally.  When all local participants with this
+  // `nccl_collective_key` have called `AddToAllReduce` and
+  // `SignalMultiNodeReady`, all devices at this worker are ready to process
+  // this NCCL op.
+  //
+  // The `NcclManager` uses a dedicated CUDA stream for NCCL kernels.  At this
+  // point, it synchronizes the NCCL stream with the compute stream, and then
+  // enqueues the NCCL kernel on the NCCL stream.
+  const int num_global_devices = col_params_->group.group_size;
+  const int num_local_devices = col_params_->instance.num_devices_per_task.at(
+      col_params_->instance.task_names[col_params_->default_rank]);
+  const string nccl_collective_key =
+      NcclCollectiveKey(col_ctx_->exec_key, col_ctx_->step_id);
+  auto done_callback = [&nccl_done, &nccl_status](const Status& s) {
+    nccl_status = s;
+    nccl_done.Notify();
+  };
+  auto participant = absl::make_unique<NcclManager::Participant>(
+      compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+      gpu_info->gpu_id, col_ctx_->input, col_ctx_->output,
+      col_params_->default_rank, std::move(done_callback));
+  VLOG(1) << "NcclReducer calling NcclManager::AddToAllReduce num_tasks "
+          << col_params_->group.num_tasks << " current task "
+          << col_params_->instance.task_names[col_params_->default_rank]
+          << " num local devices " << num_local_devices
+          << " num global devices " << num_global_devices;
+  NcclManager::instance()->AddToAllReduce(
+      std::move(participant),
+      {nccl_collective_key, num_local_devices, num_global_devices,
+       col_params_->instance.communicator_key},
+      reduction_op);
+
+  // NOTE(ayushd): We need to synchronize NCCL launches across nodes to prevent
+  // deadlocks.  In the current implementation, we define a deterministic
+  // sequential launch order between potentially concurrent collective instances
+  // by introducing control information during static graph analysis in
+  // graph/collective_order.cc.  This can be either in the form of explicit
+  // control edges or via `wait_for` attribute on the collective op.
+  //
+  // The other end of the design spectrum would have a distinguished node
+  // dynamically signal the next collective to launch to all other participants.
+  // This has higher degree of runtime coordination, but it may be able to
+  // achieve better performance if the (arbitrary) static execution order
+  // assigned in the first approach turns out to not be good from a scheduling
+  // perspective.  e.g. consider a graph in which c1, c2, and c3 are three
+  // concurrent collective instances, and the static ordering assigns c1 -> c2
+  // -> c3.  In practice, it could turn out that c3 is always ready to execute
+  // before c1 or c2.
+  //
+  // `WaitForDependencies` may block if the collective instances on which this
+  // op depends have not yet launched.  When this function returns, this op is
+  // ready to go.
+  col_ctx_->col_exec->WaitForDependencies(*col_params_);
+  NcclManager::instance()->SignalMultiNodeReady(nccl_collective_key);
+  // When all devices at this worker have called `SignalMultiNodeReady`, the
+  // `NcclManager` will enqueue the NCCL kernel on the NCCL stream.  Thus the
+  // implementation of `Launched` keeps track of the number of devices that have
+  // launched.
+  col_ctx_->col_exec->Launched(*col_params_);
+
+  // Wait for nccl op and group_size copy to succeed, then do final_op.
+  group_size_ready.WaitForNotification();
+  nccl_done.WaitForNotification();
+  Status final_status =
+      group_size_status.ok() ? nccl_status : group_size_status;
+  if (final_status.ok() && col_params_->final_op) {
+    final_status = collective_util::ComputeBinOp(
+        col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+        col_params_->final_op.get(), col_ctx_->output, &group_size);
+  }
+  done(final_status);
+}
+
+REGISTER_COLLECTIVE(NcclReduce, NcclReducer);
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.h b/tensorflow/core/kernels/collective_nccl_reducer.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc70b280c5dc9eb9da72667d459ea727945d7e8a
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
+#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
+
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+#ifdef GOOGLE_CUDA
+
+class NcclReducer : public CollectiveImplementationInterface {
+ public:
+  NcclReducer();
+  ~NcclReducer() override = default;
+
+  // No-op for this collective implementation.
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Initializes the device objects and device localities.
+  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
+
+  // Initialize nccl communicator key.
+  Status InitializeInstanceBeforeGroupDiscovery(
+      CollectiveParams* col_params) override;
+
+  // Hands off all reduce to NcclManager.
+  void Run(StatusCallback done) override;
+
+ private:
+  CollectiveContext* col_ctx_;          // Not owned
+  const CollectiveParams* col_params_;  // Not owned
+};
+
+#endif  // GOOGLE_CUDA
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
diff --git a/tensorflow/core/kernels/collective_nccl_reducer_test.cc b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26c92f1f7433e34cf4e3789dcd480f8822147891
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
@@ -0,0 +1,332 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+
+#include <algorithm>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+static constexpr int kStepId = 10;
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node, DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      DEVICE_GPU, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) LOG(FATAL) << status;
+  return k;
+}
+
+std::unique_ptr<OpKernel> GetAdd(DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Add");
+  TF_CHECK_OK(builder.Attr("T", DT_FLOAT)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DT_FLOAT))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device);
+}
+
+std::unique_ptr<OpKernel> GetDiv(DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Div");
+  TF_CHECK_OK(builder.Attr("T", DT_FLOAT)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DT_FLOAT))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device);
+}
+
+class NcclReducerTest : public ::testing::Test {
+ protected:
+  ~NcclReducerTest() override {
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void InitGPUDevices() {
+    std::vector<std::unique_ptr<Device>> all_devices;
+    SessionOptions session_options;
+    session_options.config.mutable_gpu_options()
+        ->set_per_process_gpu_memory_fraction(0.1);
+    session_options.env = Env::Default();
+    Status s = DeviceFactory::GetFactory(DEVICE_GPU)
+                   ->AddDevices(session_options, "", &all_devices);
+    TF_CHECK_OK(s);
+    for (std::unique_ptr<Device>& d : all_devices) {
+      if (d->device_type() == "GPU") {
+        gpus_.emplace_back(std::move(d));
+      }
+    }
+  }
+
+  void Init(int num_ranks) {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
+    InitGPUDevices();
+    std::vector<std::unique_ptr<Device>> local_devices;
+    std::vector<string> device_names;
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      if (rank < gpus_.size()) {
+        local_devices.emplace_back(std::move(gpus_[rank]));
+      }
+    }
+    int num_gpus = local_devices.size();
+    for (const auto& device : local_devices) {
+      device_names.push_back(device->name());
+      VLOG(2) << device->name();
+    }
+    if (!dev_mgr_) dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, /*remote_access=*/nullptr, kStepId, dev_mgr_.get(),
+        /*gpu_ring_order=*/nullptr);
+
+    // Initialize collective params.
+    col_params_.name = "test_nccl_collective_op";
+    const int group_key = 5;
+    col_params_.group.group_key = group_key;
+    col_params_.group.device_type = DEVICE_GPU;
+    col_params_.group.group_size = num_ranks;
+    const int instance_key = 23;
+    col_params_.instance.instance_key = instance_key;
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.data_type = DT_FLOAT;
+    col_params_.instance.impl_details.collective_name = "NcclReduce";
+    const string task_name = "/job:worker/replica:0/task:0";
+    col_params_.instance.num_devices_per_task[task_name] = num_ranks;
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      col_params_.instance.device_names.push_back(
+          device_names[rank % num_gpus]);
+      col_params_.instance.task_names.push_back(task_name);
+    }
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      instances_.push_back(absl::make_unique<DeviceInstance>(
+          rank, col_params_.instance.device_names[rank], this));
+    }
+  }
+
+  void Reduce() {
+    int done = 0;
+    mutex done_mu;
+    condition_variable done_cv;
+    for (const auto& instance : instances_) {
+      DeviceInstance* di = instance.get();
+      SchedClosure([di, &done, &done_mu, &done_cv] {
+        di->DoReduce();
+        mutex_lock l(done_mu);
+        ++done;
+        done_cv.notify_all();
+      });
+    }
+
+    mutex_lock l(done_mu);
+    while (done < instances_.size()) done_cv.wait(l);
+  }
+
+  void RunTest(int num_ranks, int tensor_length) {
+    Init(num_ranks);
+    std::vector<float> expected(tensor_length, 0.0);
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      DeviceInstance* instance = instances_[rank].get();
+      instance->InitTensor(DT_FLOAT, TensorShape({tensor_length}),
+                           [&expected, rank](Tensor* t) {
+                             for (size_t i = 0; i < t->NumElements(); ++i) {
+                               float value = pow(10, rank) * i;
+                               t->flat<float>()(i) = value;
+                               expected[i] += value;
+                             }
+                           });
+    }
+    Reduce();
+    // Confirm that every rank computed the same correct value.
+    for (int i = 0; i < tensor_length; ++i) {
+      expected[i] /= num_ranks;
+    }
+    for (int rank = 0; rank < instances_.size(); ++rank) {
+      TF_ASSERT_OK(instances_[rank]->status_);
+      Tensor* dev_tensor = &instances_[rank]->tensor_;
+      Tensor actual(DT_FLOAT, TensorShape({tensor_length}));
+      Notification note;
+      Device* dev = instances_[rank]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          dev_tensor, /*tensor_name=*/"", dev, &actual,
+          [&note](const Status&) { note.Notify(); });
+      note.WaitForNotification();
+      for (int i = 0; i < tensor_length; ++i) {
+        EXPECT_FLOAT_EQ(expected[i], actual.template flat<float>()(i))
+            << "Mismatch at rank " << rank << " index " << i;
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
+                                                Tensor* input,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_reduce_", reduce_counter_++),
+        "CollectiveReduce");
+    TF_CHECK_OK(
+        builder.Attr("T", params.instance.data_type)
+            .Attr("merge_op", "Add")
+            .Attr("final_op", "Div")
+            .Attr("group_size", params.group.group_size)
+            .Attr("group_key", params.group.group_key)
+            .Attr("instance_key", params.instance.instance_key)
+            .Attr("subdiv_offsets", params.instance.impl_details.subdiv_offsets)
+            .Input(FakeInput(params.instance.data_type))
+            .Finalize(&node_def));
+    return GetKernel(node_def, device);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& device_name, NcclReducerTest* parent)
+        : parent_(parent), device_name_(device_name), rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(device_name_, &device_))
+          << "Could not find device " << device_name_ << " existing devices "
+          << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.default_rank = rank;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      Tensor cpu_tensor(dtype, shape);
+      init_f(&cpu_tensor);
+      VLOG(2) << "cpu_tensor " << cpu_tensor.DebugString();
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      Notification note;
+      dev_info->default_context->CopyCPUTensorToDevice(
+          &cpu_tensor, device_, &tensor_,
+          [&note](const Status&) { note.Notify(); });
+      note.WaitForNotification();
+    }
+
+    void DoReduce() {
+      col_params_.merge_op = GetAdd(device_);
+      col_params_.final_op = GetDiv(device_);
+
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from = 0;
+      op_params.forward_from_array = &forward_from;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          parent_->GetCollectiveReduce(col_params_, &tensor_, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the output
+      // allocation it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(),
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a NcclReducer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      NcclReducer reducer;
+      CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
+                                &ctx, &op_params, col_params_, exec_key,
+                                kStepId, &tensor_, &tensor_);
+      TF_CHECK_OK(reducer.InitializeCollectiveContext(&col_ctx));
+
+      // Run the all-reduce.
+      reducer.Run([this](Status s) { status_ = s; });
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    NcclReducerTest* parent_;
+    string device_name_;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    Status status_;
+  };
+
+  std::vector<std::unique_ptr<tensorflow::Device>> gpus_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  std::unique_ptr<DeviceMgr> dev_mgr_;
+  std::vector<std::unique_ptr<DeviceInstance>> instances_;
+  CollectiveParams col_params_;
+  mutex mu_;
+  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+};
+
+TEST_F(NcclReducerTest, Test2Dev16Len) { RunTest(2, 16); }
+TEST_F(NcclReducerTest, Test4Dev16Len) { RunTest(4, 16); }
+TEST_F(NcclReducerTest, Test8Dev16Len) { RunTest(8, 16); }
+TEST_F(NcclReducerTest, Test8Dev128Len) { RunTest(8, 128); }
+TEST_F(NcclReducerTest, Test8Dev1045991Len) { RunTest(8, 1048576); }
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index 82e2913b64afca2e0fc8c64d1c6e366f3a2d307e..04e37e8edcf535a1d9435f527f877f1eb812dc8e 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -43,16 +43,17 @@ class CollectiveOpKernel : public AsyncOpKernel {
       // Call in a blockable thread because it's not guaranteed that
       // this call cannot block.
       c->env()->SchedClosure([this, c, done, col_exec]() {
-        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
-                                      c->cancellation_manager(),
-                                      [this, c, done](const Status& s) {
-                                        if (s.ok()) {
-                                          ComputeAsync(c, done);
-                                        } else {
-                                          c->SetStatus(s);
-                                          done();
-                                        }
-                                      });
+        col_exec->CompleteParamsAsync(
+            c->device()->name(), &col_params_, c->cancellation_manager(),
+            [this, c, done](const Status& s) {
+              if (s.ok()) {
+                col_params_.instance.impl_details.dependencies = dependencies_;
+                ComputeAsync(c, done);
+              } else {
+                c->SetStatus(s);
+                done();
+              }
+            });
       });
       return false;
     }
@@ -60,6 +61,7 @@ class CollectiveOpKernel : public AsyncOpKernel {
   }
 
   CollectiveParams col_params_;
+  std::vector<int32> dependencies_;
 };
 
 class CollectiveReduceOpKernel : public CollectiveOpKernel {
@@ -87,6 +89,7 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
                     "final_op must be one of {\"Id\", \"Div\"} but got ",
                     final_op_name));
     OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("wait_for", &dependencies_));
 
     const NodeDef& real_node = c->def();
     col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 93e392d3032405ea848bd2f147653c9a5c7a1818..161810d8cf933237f526768bc8f2a86bf37c8ffc 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -115,6 +115,7 @@ void ConcatGPU(
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
+TF_CALL_int32(REGISTER);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index a561d918bd36f711d1b813dfb533ec6d690af8ee..1a9adfa7319e42aad0aaa4134b905685dc222038 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -201,6 +201,7 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
+TF_CALL_int32(REGISTER_GPUCONCAT32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT32);
 TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
@@ -209,6 +210,7 @@ REGISTER_GPUCONCAT32(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
+TF_CALL_int32(REGISTER_GPUCONCAT64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT64);
 TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
@@ -217,6 +219,7 @@ REGISTER_GPUCONCAT64(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
+TF_CALL_int32(REGISTER_GPU32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU32);
 TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
@@ -225,6 +228,7 @@ REGISTER_GPU32(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
+TF_CALL_int32(REGISTER_GPU64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU64);
 TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index ff6298351761c84bedd117e125f53b2166cd104f..72d8b45dd96b912f3d94f4c0f0495c82de53e4d4 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -18,16 +18,16 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 4a5ec6f0fb3c7272dd0684da3ce56e787848dd7d..2618ffbb099cd1619de826f6b0e4e5ae20982197 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -68,7 +68,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
 
   const DataType& dtype() const { return dtype_; }
 
-  string DebugString() override { return "A conditional accumulator"; }
+  string DebugString() const override { return "A conditional accumulator"; }
 
   // SetGlobalStep is a modifier method for current_global_step.
   // It returns an InvalidArgument error if the new_global_step is less than
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 75ca77fad5cfca27eb4b78954ddf8b6d74f8e5e2..5ff428dd312c6935adc56a0dbcdef76b77cb287b 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/constant_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 4bd90d36b8fa12b73e7d7d71404b003091f60361..c0981805bbe8ec102aecbe6e019596f73ecf97e7 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -267,6 +267,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
 REGISTER_GPU_KERNEL(uint64);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -365,6 +366,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -460,6 +462,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -541,6 +544,7 @@ REGISTER_KERNEL_BUILDER(Name("RefNextIteration").Device(DEVICE_CPU),
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 REGISTER_GPU_KERNEL(bool);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 4e3de33e83a34e0ec6a4c4d87f93127ec134c822..0df05ceb0266fba43dc23162a2d92c33b02c7fa2 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -102,6 +102,7 @@ struct LaunchConv2DBackpropFilterOp<CPUDevice, T> {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardFilter<CPUDevice, T>()(
@@ -204,6 +205,15 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
                 errors::InvalidArgument(
                     "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES(context, dilations_.size() == 4,
                 errors::InvalidArgument("Sliding window dilations field must "
@@ -282,7 +292,8 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
     LaunchConv2DBackpropFilterOp<Device, T>()(
         context, false, false, out_backprop, input,
         /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, filter_backprop, data_format_);
+        dims.spatial_dims[1].stride, padding_, /*explicit_paddings=*/{},
+        filter_backprop, data_format_);
   }
 
  private:
@@ -319,6 +330,15 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
                 errors::InvalidArgument(
                     "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES(context, dilations_.size() == 4,
                 errors::InvalidArgument("Sliding window dilations field must "
@@ -587,6 +607,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -626,13 +650,14 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
               dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
-              filter_backprop, data_format_);
+              explicit_paddings_, filter_backprop, data_format_);
   }
 
  private:
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   bool use_cudnn_;
   TensorFormat data_format_;
   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
@@ -646,7 +671,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& out_backprop, const Tensor& input, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* filter_backprop, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* filter_backprop,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -661,35 +687,33 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   TensorShape filter_shape = filter_backprop->shape();
 
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
-                          "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
-                          input.shape(), filter_shape, out_backprop.shape(),
-                          dilations, strides, padding, data_format, &dims));
-
-  // TODO(yangzihao): The padding computations should be done in
-  // GetWindowedOutputSize() functions.
-  const int padding_rows =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
-                                     dims.spatial_dims[0].stride +
-                                 (dims.spatial_dims[0].filter_size - 1) *
-                                     dims.spatial_dims[0].dilation +
-                                 1 - dims.spatial_dims[0].input_size);
-  const int padding_cols =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
-                                     dims.spatial_dims[1].stride +
-                                 (dims.spatial_dims[1].filter_size - 1) *
-                                     dims.spatial_dims[1].dilation +
-                                 1 - dims.spatial_dims[1].input_size);
-
-  // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
-  // calling it when that is true. Remove this check when (if?) cuDNN starts
-  // supporting different padding.
-  bool rows_odd = (padding_rows % 2 != 0);
-  bool cols_odd = (padding_cols % 2 != 0);
+  OP_REQUIRES_OK(
+      ctx, ConvBackpropComputeDimensionsV2(
+               "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
+               input.shape(), filter_shape, out_backprop.shape(), dilations,
+               strides, padding, explicit_paddings, data_format, &dims));
+
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 expected_out_rows, expected_out_cols;
+  // The function is guaranteed to succeed because we checked the output and
+  // padding was valid earlier.
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
+      &padding_bottom));
+  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
+      &padding_right));
+  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);
 
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -711,7 +735,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.spatial_dims[0].filter_size == 1 &&
       dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
-      data_format == FORMAT_NHWC) {
+      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     const uint64 m = dims.in_depth;
     const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size *
                      dims.spatial_dims[1].input_size;
@@ -779,31 +803,43 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     return;
   }
 
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
   Tensor compatible_input;
-  if (rows_odd || cols_odd) {
-    // If a padding dimension is odd, we have one more element on the right
-    // side or the bottom side. This is unsupported in cudnn. Therefore,
-    // we pad that extra element and make it compatible.
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // Pad the input in the same way we did during the forward pass, so that
+    // cuDNN receives the same input during the backward pass function as it did
+    // during the forward pass function.
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows =
+        dims.spatial_dims[0].input_size + padding_rows_diff;
+    const int64 new_in_cols =
+        dims.spatial_dims[1].input_size + padding_cols_diff;
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(
                  DataTypeToEnum<T>::value,
-                 ShapeFromFormat(data_format, dims.batch_size,
-                                 dims.spatial_dims[0].input_size + rows_odd,
-                                 dims.spatial_dims[1].input_size + cols_odd,
-                                 dims.in_depth),
+                 ShapeFromFormat(data_format, dims.batch_size, new_in_rows,
+                                 new_in_cols, dims.in_depth),
                  &compatible_input));
 
     functor::PadInput<GPUDevice, T, int, 4>()(
         ctx->template eigen_device<GPUDevice>(), To32Bit(input.tensor<T, 4>()),
-        {{0, 0}}, {{rows_odd, cols_odd}},
+        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+        {{static_cast<int>(input_pad_bottom),
+          static_cast<int>(input_pad_right)}},
         To32Bit(compatible_input.tensor<T, 4>()), data_format);
   } else {
     compatible_input = input;
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input, data_format, 'H'))
@@ -826,8 +862,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(dims.in_depth / filter_shape.dim_size(2));
 
   // NOTE(zhengxq):
@@ -922,8 +958,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
         dims.spatial_dims[1].dilation}},   // dilation_cols
       {{dims.spatial_dims[0].stride,       // stride_rows
         dims.spatial_dims[1].stride}},     // stride_cols
-      {{padding_rows,                      // padding_rows
-        padding_cols}},                    // padding_cols
+      {{common_padding_rows,               // padding_rows
+        common_padding_cols}},             // padding_cols
       dtype,                               // tensor datatype
       device_id,                           // device_id
   };
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 9f983ed8166d51a720b4ea0ff360a974a7b4fb86..74b97b98648dc5f2a32d4755ac08d731af5549e8 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -106,8 +106,9 @@ struct LaunchConv2DBackpropInputOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* in_backprop,
-                  TensorFormat data_format) {
+                  int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
+                  Tensor* in_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardInput<CPUDevice, T>()(
         d, in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
@@ -220,6 +221,15 @@ class Conv2DFastBackpropInputOp : public OpKernel {
                 errors::InvalidArgument(
                     "Current Eigen and libxsmm implementations do not "
                     "yet support dilation rates larger than 1."));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -286,7 +296,8 @@ class Conv2DFastBackpropInputOp : public OpKernel {
     LaunchConv2DBackpropInputOp<Device, T>()(
         context, false, false, out_backprop, filter,
         /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, in_backprop, data_format_);
+        dims.spatial_dims[1].stride, padding_, /*explicit_paddings=*/{},
+        in_backprop, data_format_);
   }
 
  private:
@@ -336,6 +347,15 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
                 errors::InvalidArgument(
                     "Current libxsmm and customized CPU implementations do "
                     "not yet support dilation rates larger than 1."));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -661,6 +681,16 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    if (!std::is_same<Device, GPUDevice>::value) {
+      OP_REQUIRES(
+          context, padding_ != Padding::EXPLICIT,
+          errors::Unimplemented("Current CPU implementation does not support "
+                                "EXPLICIT padding yet."));
+    }
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -694,13 +724,14 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter,
               dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
-              in_backprop, data_format_);
+              explicit_paddings_, in_backprop, data_format_);
   }
 
  private:
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   bool use_cudnn_;
   TensorFormat data_format_;
   LaunchConv2DBackpropInputOp<Device, T> launcher_;
@@ -714,7 +745,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& out_backprop, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* in_backprop, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* in_backprop,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -731,35 +763,33 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
 
   const TensorShape& filter_shape = filter.shape();
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
-                          "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2,
-                          input_shape, filter_shape, out_backprop.shape(),
-                          dilations, strides, padding, data_format, &dims));
-
-  // TODO(yangzihao): The padding computations should be done in
-  // GetWindowedOutputSize() functions.
-  const int padding_rows =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
-                                     dims.spatial_dims[0].stride +
-                                 (dims.spatial_dims[0].filter_size - 1) *
-                                     dims.spatial_dims[0].dilation +
-                                 1 - dims.spatial_dims[0].input_size);
-  const int padding_cols =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
-                                     dims.spatial_dims[1].stride +
-                                 (dims.spatial_dims[1].filter_size - 1) *
-                                     dims.spatial_dims[1].dilation +
-                                 1 - dims.spatial_dims[1].input_size);
-
-  // TODO(keveman): cuDNN only supports equal padding on both sides, so only
-  // calling it when that is true. Remove this check when (if?) cuDNN starts
-  // supporting different padding.
-  bool rows_odd = (padding_rows % 2 != 0);
-  bool cols_odd = (padding_cols % 2 != 0);
+  OP_REQUIRES_OK(
+      ctx, ConvBackpropComputeDimensionsV2(
+               "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape,
+               filter_shape, out_backprop.shape(), dilations, strides, padding,
+               explicit_paddings, data_format, &dims));
+
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 expected_out_rows, expected_out_cols;
+  // The function is guaranteed to succeed because we checked the output and
+  // padding was valid earlier.
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
+      &padding_bottom));
+  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
+      &padding_right));
+  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);
 
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -779,7 +809,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
   if (dims.spatial_dims[0].filter_size == 1 &&
       dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
-      data_format == FORMAT_NHWC) {
+      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
                      dims.spatial_dims[1].input_size;
@@ -841,22 +871,28 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     return;
   }
 
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
   TensorShape compatible_input_shape;
-  if (rows_odd || cols_odd) {
-    // If a padding dimension is odd, we have one more element on the right
-    // side or the bottom side. This is unsupported in cudnn. Therefore,
-    // we pad that extra element and make it compatible.
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // Pad the input in the same way we did during the forward pass, so that
+    // cuDNN receives the same input during the backward pass function as it did
+    // during the forward pass function.
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows =
+        dims.spatial_dims[0].input_size + padding_rows_diff;
+    const int64 new_in_cols =
+        dims.spatial_dims[1].input_size + padding_cols_diff;
     compatible_input_shape = ShapeFromFormat(
-        data_format, dims.batch_size,
-        dims.spatial_dims[0].input_size + rows_odd,
-        dims.spatial_dims[1].input_size + cols_odd, dims.in_depth);
+        data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth);
   } else {
     compatible_input_shape = input_shape;
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
@@ -879,8 +915,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(dims.in_depth / filter_shape.dim_size(2));
 
   // NOTE(keveman):
@@ -971,8 +1007,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
         dims.spatial_dims[1].dilation}},   // dilation_cols
       {{dims.spatial_dims[0].stride,       // stride_rows
         dims.spatial_dims[1].stride}},     // stride_cols
-      {{padding_rows,                      // padding_rows
-        padding_cols}},                    // padding_cols
+      {{common_padding_rows,               // padding_rows
+        common_padding_cols}},             // padding_cols
       dtype,                               // tensor data type
       device_id,                           // device_id
   };
@@ -1041,7 +1077,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     return;
   }
 
-  if (rows_odd || cols_odd) {
+  if (padding_top != padding_bottom || padding_left != padding_right) {
     Tensor in_backprop_remove_padding;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(
@@ -1053,12 +1089,18 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
                                  GetTensorDim(input_shape, data_format, 'C')),
                  &in_backprop_remove_padding));
 
-    // Remove the padding for odd rows or cols.
+    // Remove the padding that was added to the input shape above.
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
     functor::PadInput<GPUDevice, T, int, 4>()(
         ctx->template eigen_device<GPUDevice>(),
         To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
                     .tensor<T, 4>()),
-        {{0, 0}}, {{-rows_odd, -cols_odd}},
+        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
+        {{static_cast<int>(-input_pad_bottom),
+          static_cast<int>(-input_pad_right)}},
         To32Bit(in_backprop_remove_padding.tensor<T, 4>()), FORMAT_NCHW);
 
     pre_transformed_in_backprop = in_backprop_remove_padding;
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 507720c998d752f7157be5340445693bf8849173..0fd7550830333f749312f5db54d3ffd6ffa22a4a 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -52,24 +52,23 @@ int ConvBackpropDimensions::SpatialPadding(const Padding& padding,
                                        1 - input_size(dim)));
 }
 
-// The V2 version computes windowed output size with arbitrary dilation_rate,
-// while the original version only handles the cases where dilation_rates equal
-// to 1.
-Status ConvBackpropExtractAndVerifyDimensionV2(
+namespace {
+
+Status ConvBackpropExtractAndVerifyDimension(
     StringPiece label, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& output_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, int spatial_dim, int filter_spatial_dim,
-    ConvBackpropSpatialDimension* dim) {
+    Padding padding, int64 padding_before, int64 padding_after, int spatial_dim,
+    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
   dim->input_size = input_shape.dim_size(spatial_dim);
   dim->filter_size = filter_shape.dim_size(filter_spatial_dim);
   dim->output_size = output_shape.dim_size(spatial_dim);
   dim->stride = strides[spatial_dim];
   dim->dilation = dilations[spatial_dim];
-  int64 out_size = 0, pad_size = 0;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(dim->input_size, dim->filter_size,
-                                             dim->dilation, dim->stride,
-                                             padding, &out_size, &pad_size));
+  int64 out_size = 0;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
+      dim->input_size, dim->filter_size, dim->dilation, dim->stride, padding,
+      &out_size, &padding_before, &padding_after));
   if (dim->output_size != out_size) {
     return errors::InvalidArgument(
         label, ": Size of out_backprop doesn't match computed: ", "actual = ",
@@ -79,10 +78,13 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
         " stride: ", dim->stride, " dilation: ", dim->dilation);
   }
 
+  // TODO(reedwm): Correctly handle explicit padding here. The rest of the
+  // fields set on 'dim' are only used in XLA. TensorFlow ops do not yet support
+  // explicit padding for XLA.
   int64 effective_filter_size = (dim->filter_size - 1) * dim->dilation + 1;
   dim->expanded_output_size = (dim->output_size - 1) * dim->stride + 1;
   const auto padded_out_size = dim->input_size + effective_filter_size - 1;
-  dim->pad_before = effective_filter_size - 1 - pad_size;
+  dim->pad_before = effective_filter_size - 1 - padding_before;
   dim->pad_after =
       padded_out_size - dim->expanded_output_size - dim->pad_before;
   VLOG(2) << label << ": expanded_out = " << dim->expanded_output_size
@@ -94,22 +96,14 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
   return Status::OK();
 }
 
-Status ConvBackpropExtractAndVerifyDimension(
-    StringPiece label, const TensorShape& input_shape,
-    const TensorShape& filter_shape, const TensorShape& output_shape,
-    const std::vector<int32>& strides, Padding padding, int spatial_dim,
-    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
-  static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
-  return ConvBackpropExtractAndVerifyDimensionV2(
-      label, input_shape, filter_shape, output_shape, one_dilations, strides,
-      padding, spatial_dim, filter_spatial_dim, dim);
-}
+}  // namespace
 
 Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
+    Padding padding, const std::vector<int64>& explicit_paddings,
+    TensorFormat data_format, ConvBackpropDimensions* dims) {
   // The + 2 in the following line is for the batch and feature dimensions.
   const int num_dims = num_spatial_dims + 2;
   if (input_shape.dims() != num_dims) {
@@ -152,9 +146,15 @@ Status ConvBackpropComputeDimensionsV2(
   dims->spatial_dims.resize(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     int image_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
-    TF_RETURN_IF_ERROR(ConvBackpropExtractAndVerifyDimensionV2(
+    int64 padding_before = -1, padding_after = -1;
+    if (padding == EXPLICIT) {
+      padding_before = explicit_paddings[2 * image_dim];
+      padding_after = explicit_paddings[2 * image_dim + 1];
+    }
+    TF_RETURN_IF_ERROR(ConvBackpropExtractAndVerifyDimension(
         label, input_shape, filter_shape, out_backprop_shape, dilations,
-        strides, padding, image_dim, i, &dims->spatial_dims[i]));
+        strides, padding, padding_before, padding_after, image_dim, i,
+        &dims->spatial_dims[i]));
   }
   return Status::OK();
 }
@@ -169,7 +169,8 @@ Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
   static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_shape, filter_shape, out_backprop_shape,
-      one_dilations, strides, padding, data_format, dims);
+      one_dilations, strides, padding, /*explicit_paddings=*/{}, data_format,
+      dims);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index 9551959463bf1f32010b436671ff7eed1daa9d82..c8e8cf28c55e266575738dfe9ef65d588dd0dd2f 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -176,8 +176,9 @@ struct LaunchConv2DBackpropInputOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* in_backprop,
-                  TensorFormat data_format);
+                  int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
+                  Tensor* in_backprop, TensorFormat data_format);
 };
 
 template <typename Device, typename T>
@@ -186,6 +187,7 @@ struct LaunchConv2DBackpropFilterOp {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 
@@ -195,7 +197,8 @@ struct LaunchConv2DBackpropInputOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -205,6 +208,7 @@ struct LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T> {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
@@ -217,6 +221,8 @@ struct ConvBackpropSpatialDimension {
   int64 output_size;
   int64 stride;
   int64 dilation;
+
+  // The following fields are valid only if the padding is not EXPLICIT.
   int64 expanded_output_size;
 
   // Number of padding elements to be added before/after this dimension of
@@ -248,7 +254,7 @@ struct ConvBackpropDimensions {
 
 // Common code between implementations of Conv?DBackpropInput and
 // Conv?DBackpropFilter. Verifies that the dimensions all match, and computes
-// sizes/padding for the spatial dimensions.
+// sizes/padding for the spatial dimensions. Does not support explicit padding.
 Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
                                      const TensorShape& input_shape,
                                      const TensorShape& filter_shape,
@@ -257,13 +263,15 @@ Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
                                      Padding padding, TensorFormat data_format,
                                      ConvBackpropDimensions* dims);
 
-// The V2 version computes the same outputs with arbitrary dilation rate.
+// The V2 version computes the same outputs with arbitrary dilation rate and
+// supports explicit padding.
 // TODO(b/67112639): Merge V2 versions and the original versions eventually.
 Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims);
+    Padding padding, const std::vector<int64>& explicit_paddings,
+    TensorFormat data_format, ConvBackpropDimensions* dims);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_CONV_GRAD_OPS_H_
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 562a9c8aed5850418aa8acecec35a7860ae99921..ca46da6ba38044b50aa6299b82f9b9cacd87bb4c 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1152,11 +1152,11 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     }
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensionsV2(
-                       "Conv3DBackpropInputOp", /*num_spatial_dims=*/3,
-                       input_shape, filter_shape, out_backprop_shape, dilation_,
-                       stride_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(context, ConvBackpropComputeDimensionsV2(
+                                "Conv3DBackpropInputOp", /*num_spatial_dims=*/3,
+                                input_shape, filter_shape, out_backprop_shape,
+                                dilation_, stride_, padding_,
+                                /*explicit_paddings=*/{}, data_format_, &dims));
 
     Tensor* in_backprop;
     OP_REQUIRES_OK(context,
@@ -1537,11 +1537,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     }
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensionsV2(
-                       "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3,
-                       input_shape, filter_shape, out_backprop_shape, dilation_,
-                       stride_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(
+        context,
+        ConvBackpropComputeDimensionsV2(
+            "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3, input_shape,
+            filter_shape, out_backprop_shape, dilation_, stride_, padding_,
+            /*explicit_paddings=*/{}, data_format_, &dims));
 
     Tensor* filter_backprop;
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index dfba15792dcf5d293d894027b51c56df31a0e520..979c76dc3c99c950ff5d5062e3ee79d448c40fcf 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -28,13 +28,13 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/deep_conv2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -122,7 +122,8 @@ struct LaunchConv2DOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format) {
     if (data_format != FORMAT_NHWC) {
       ctx->SetStatus(
@@ -130,6 +131,11 @@ struct LaunchConv2DOp<CPUDevice, T> {
                                 "NHWC tensor format for now."));
       return;
     }
+    // TODO(reedwm): Enable explicit padding on the CPU.
+    OP_REQUIRES(
+        ctx, padding != Padding::EXPLICIT,
+        errors::Unimplemented("Generic conv implementation does not support "
+                              "EXPLICIT padding yet."));
     const int64 in_depth = GetTensorDim(input, data_format, 'C');
     OP_REQUIRES(ctx, in_depth == filter.dim_size(2),
                 errors::Unimplemented("Generic conv implementation does not "
@@ -274,6 +280,10 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
   TF_RETURN_IF_ERROR(context->GetAttr("dilations", &params->dilations));
   TF_RETURN_IF_ERROR(context->GetAttr("strides", &params->strides));
   TF_RETURN_IF_ERROR(context->GetAttr("padding", &params->padding));
+  if (context->HasAttr("explicit_paddings")) {
+    TF_RETURN_IF_ERROR(
+        context->GetAttr("explicit_paddings", &params->explicit_paddings));
+  }
   string data_format_string;
   TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
   TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
@@ -313,6 +323,10 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
       dilation_h > 0 && dilation_w > 0,
       errors::InvalidArgument("Dilated rates should be larger than 0."));
 
+  TF_RETURN_IF_ERROR(CheckValidPadding(params->padding,
+                                       params->explicit_paddings,
+                                       /*num_dims=*/4, data_format));
+
   return Status::OK();
 }
 
@@ -381,14 +395,22 @@ Status ComputeConv2DDimension(const Conv2DParameters& params,
   const int dilation_cols =
       GetTensorDim(params.dilations, params.data_format, 'W');
 
+  int64 pad_rows_before, pad_rows_after, pad_cols_before, pad_cols_after;
+  if (params.padding == Padding::EXPLICIT) {
+    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'H',
+                             &pad_rows_before, &pad_rows_after);
+    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'W',
+                             &pad_cols_before, &pad_cols_after);
+  }
+
   // Compute windowed output sizes for rows and columns.
-  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(
+  int64 out_rows = 0, out_cols = 0;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
       input_rows, filter_rows, dilation_rows, stride_rows, params.padding,
-      &out_rows, &pad_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(
+      &out_rows, &pad_rows_before, &pad_rows_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
       input_cols, filter_cols, dilation_cols, stride_cols, params.padding,
-      &out_cols, &pad_cols));
+      &out_cols, &pad_cols_before, &pad_cols_after));
 
   dimensions->batch = batch;
   dimensions->input_rows = input_rows;
@@ -404,8 +426,10 @@ Status ComputeConv2DDimension(const Conv2DParameters& params,
   dimensions->dilation_cols = dilation_cols;
   dimensions->out_rows = out_rows;
   dimensions->out_cols = out_cols;
-  dimensions->pad_rows = pad_rows;
-  dimensions->pad_cols = pad_cols;
+  dimensions->pad_rows_before = pad_rows_before;
+  dimensions->pad_rows_after = pad_rows_after;
+  dimensions->pad_cols_before = pad_cols_before;
+  dimensions->pad_cols_after = pad_cols_after;
 
   return Status::OK();
 }
@@ -463,33 +487,35 @@ class Conv2DOp : public BinaryOp<T> {
     }
 
 #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
-    if (LaunchXsmmConvOp<Device, T>::Run(
+    if (params_.padding != EXPLICIT &&
+        LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, dimensions.batch, dimensions.input_rows,
             dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
-            dimensions.filter_cols, dimensions.pad_rows, dimensions.pad_cols,
-            dimensions.out_rows, dimensions.out_cols, dimensions.out_depth,
-            dimensions.dilation_rows, dimensions.dilation_cols,
-            dimensions.stride_rows, dimensions.stride_cols, output,
-            params_.data_format)) {
+            dimensions.filter_cols, dimensions.pad_rows_before,
+            dimensions.pad_cols_before, dimensions.out_rows,
+            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
+            dimensions.dilation_cols, dimensions.stride_rows,
+            dimensions.stride_cols, output, params_.data_format)) {
       return;
     }
 #endif
 
-    if (LaunchDeepConvOp<Device, T>::Run(
+    if (params_.padding != EXPLICIT &&
+        LaunchDeepConvOp<Device, T>::Run(
             context, input, filter, dimensions.batch, dimensions.input_rows,
             dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
-            dimensions.filter_cols, dimensions.pad_rows, dimensions.pad_cols,
-            dimensions.out_rows, dimensions.out_cols, dimensions.out_depth,
-            dimensions.dilation_rows, dimensions.dilation_cols,
-            dimensions.stride_rows, dimensions.stride_cols, output,
-            params_.data_format)) {
+            dimensions.filter_cols, dimensions.pad_rows_before,
+            dimensions.pad_cols_before, dimensions.out_rows,
+            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
+            dimensions.dilation_cols, dimensions.stride_rows,
+            dimensions.stride_cols, output, params_.data_format)) {
       return;
     }
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
               dimensions.dilation_rows, dimensions.dilation_cols,
               dimensions.stride_rows, dimensions.stride_cols, params_.padding,
-              output, params_.data_format);
+              params_.explicit_paddings, output, params_.data_format);
   }
 
  private:
@@ -551,7 +577,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& input_param, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* output, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* output,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -580,7 +607,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   bool is_grouped_convolution = patch_depths != in_depths;
   if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
       row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
-      col_stride == 1 && data_format == FORMAT_NHWC) {
+      col_stride == 1 && data_format == FORMAT_NHWC &&
+      (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = in_batch * in_rows * in_cols;
     const uint64 k = patch_depths;
@@ -634,49 +662,78 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     return;
   }
 
-  int padding_rows = 0;
-  int padding_cols = 0;
   const int64 out_batch = GetTensorDim(*output, data_format, 'N');
   const int64 out_rows = GetTensorDim(*output, data_format, 'H');
   const int64 out_cols = GetTensorDim(*output, data_format, 'W');
   const int64 out_depths = GetTensorDim(*output, data_format, 'C');
-  if (padding == SAME) {
-    // Total padding on rows and cols is
-    // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
-    // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
-    // where (R', C') are output dimensions, (R, C) are input dimensions, S
-    // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
-    // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
-    // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
-    // we pad more on the right and bottom than on the top and left.
-    padding_rows =
-        std::max<int>(0, (out_rows - 1) * row_stride +
-                             (patch_rows - 1) * row_dilation + 1 - in_rows);
-    padding_cols =
-        std::max<int>(0, (out_cols - 1) * col_stride +
-                             (patch_cols - 1) * col_dilation + 1 - in_cols);
-    const bool rows_odd = (padding_rows % 2 != 0);
-    const bool cols_odd = (padding_cols % 2 != 0);
-    if (rows_odd || cols_odd) {
-      Tensor transformed_input;
-      int64 new_in_rows = in_rows + rows_odd;
-      int64 new_in_cols = in_cols + cols_odd;
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(DataTypeToEnum<T>::value,
-                             ShapeFromFormat(data_format, in_batch, new_in_rows,
-                                             new_in_cols, in_depths),
-                             &transformed_input));
-
-      functor::PadInput<GPUDevice, T, int, 4>()(
-          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
-          {{0, 0}}, {{rows_odd, cols_odd}},
-          To32Bit(transformed_input.tensor<T, 4>()), data_format);
-
-      input = transformed_input;
-      in_rows = new_in_rows;
-      in_cols = new_in_cols;
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 out_rows_check, out_cols_check;
+  Status status = GetWindowedOutputSizeVerboseV2(
+      in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
+      &padding_top, &padding_bottom);
+  // The status is guaranteed to be OK because we checked the output and padding
+  // was valid earlier.
+  TF_CHECK_OK(status);
+  DCHECK_EQ(out_rows, out_rows_check);
+  status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
+                                          col_stride, padding, &out_cols_check,
+                                          &padding_left, &padding_right);
+  TF_CHECK_OK(status);
+  DCHECK_EQ(out_cols, out_cols_check);
+
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // cuDNN only supports padding the same amount on the left and right sides,
+    // and on the top and bottom sides. So we manually create a new padded
+    // input tensor such that we can pass it to cuDNN.
+
+    // TODO(reedwm): In some cases, we can avoid an allocation even if the two
+    // padding sides are different. For example, if the input is 2x2, the filter
+    // is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
+    // equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in
+    // such a way would allow us to avoid the allocation.
+    Tensor transformed_input;
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows = in_rows + padding_rows_diff;
+    const int64 new_in_cols = in_cols + padding_cols_diff;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                            DataTypeToEnum<T>::value,
+                            ShapeFromFormat(data_format, in_batch, new_in_rows,
+                                            new_in_cols, in_depths),
+                            &transformed_input));
+
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
+    bool in_bounds =
+        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
+    if (!in_bounds) {
+      ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
+      return;
     }
+    functor::PadInput<GPUDevice, T, int, 4>()(
+        ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+        {{static_cast<int>(input_pad_bottom),
+          static_cast<int>(input_pad_right)}},
+        To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+    input = transformed_input;
+    in_rows = new_in_rows;
+    in_cols = new_in_cols;
   }
 
   if (data_format == FORMAT_NHWC) {
@@ -698,9 +755,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     }
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
@@ -723,8 +780,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       .set_horizontal_dilation_rate(col_dilation)
       .set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(in_depths / patch_depths);
 
   Tensor transformed_filter;
@@ -767,23 +824,23 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
   ConvParameters conv_parameters = {
-      in_batch,          // batch
-      in_depths,         // in_depths
-      {{in_rows,         // in_rows
-        in_cols}},       // in_cols
-      FORMAT_NCHW,       // compute_data_format
-      out_depths,        // out_depths
-      {{patch_rows,      // filter_rows
-        patch_cols,      // filter_cols
-        patch_depths}},  // filter_depths
-      {{row_dilation,    // dilation_rows
-        col_dilation}},  // dilation_cols
-      {{row_stride,      // stride_rows
-        col_stride}},    // stride_cols
-      {{padding_rows,    // padding_rows
-        padding_cols}},  // padding_cols
-      dtype,             // tensor datatype
-      device_id,         // device_id
+      in_batch,                 // batch
+      in_depths,                // in_depths
+      {{in_rows,                // in_rows
+        in_cols}},              // in_cols
+      FORMAT_NCHW,              // compute_data_format
+      out_depths,               // out_depths
+      {{patch_rows,             // filter_rows
+        patch_cols,             // filter_cols
+        patch_depths}},         // filter_depths
+      {{row_dilation,           // dilation_rows
+        col_dilation}},         // dilation_cols
+      {{row_stride,             // stride_rows
+        col_stride}},           // stride_cols
+      {{common_padding_rows,    // padding_rows
+        common_padding_cols}},  // padding_cols
+      dtype,                    // tensor datatype
+      device_id,                // device_id
   };
   AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune &&
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 7ec878e0b2fc6eaae2a89610a9f8491689705f0c..105a4b1b825e304175d62c1723aeb46154b46a96 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -36,7 +36,8 @@ struct LaunchConv2DOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -46,7 +47,8 @@ struct LaunchConv2DOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
@@ -63,7 +65,7 @@ struct Im2ColBufferResource : public ResourceBase {
   // the buffer memory held by this resource.
   mutex mu;
   T* data;
-  string DebugString() { return "Im2ColBufferResource"; }
+  string DebugString() const { return "Im2ColBufferResource"; }
 };
 
 // Convolution parameters specified by Op attributes.
@@ -72,6 +74,7 @@ struct Conv2DParameters {
   std::vector<int32> strides;
   Padding padding;
   TensorFormat data_format;
+  std::vector<int64> explicit_paddings;
 };
 
 // Convolution dimensions inferred from parameters, input and filter tensors.
@@ -94,8 +97,10 @@ struct Conv2DDimensions {
 
   int64 out_rows;
   int64 out_cols;
-  int64 pad_rows;
-  int64 pad_cols;
+  int64 pad_rows_before;
+  int64 pad_rows_after;
+  int64 pad_cols_before;
+  int64 pad_cols_after;
 };
 
 // Initializes and validates Conv2D parameters configured by OpKernel
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index fd0c565677a29177308d0bc577e14cb38adb8d06..9c807c3375bf76dfcce731029b93fbdbf0cd907a 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -41,11 +41,11 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -59,11 +59,11 @@ limitations under the License.
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
-namespace {
 
-using CPUDevice = ::Eigen::ThreadPoolDevice;
-using GPUDevice = ::Eigen::GpuDevice;
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
+namespace {
 // Supported Conv2D fusions. Not all of them supported on all type of devices.
 enum class FusedComputationType {
   // NOTE(ezhulenev): CuDNN `cudnnConvolutionBiasActivationForward` supports
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
index 7be1de29c951dca16085e35587d02eeeec01354f..0542216a23d7a24c33d7600b155ec4dc6a92ae04 100644
--- a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index ae4132bb0acef649eb1c3ee1abd443c288e61370..a4cd67804ed11148e511b0695d82e71df12aa8ad 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -770,7 +770,15 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    // NOTE(intel-tf): When filter_size is equal to the input image size,
+    // conv2d essentially is element-wise multiplication followed by
+    // a full sum reduction, which causes larger numerical error
+    // than usual cases.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    }
   }
 
   void VerifyFusedBatchNormTensorsNear(int depth, int image_width,
@@ -812,7 +820,15 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    // NOTE(intel-tf): When filter_size is equal to the input image size,
+    // conv2d essentially is element-wise multiplication followed by
+    // a full sum reduction, which causes larger numerical error
+    // than usual cases.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
+    }
   }
 
   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
@@ -936,8 +952,8 @@ class FusedConv2DWithBiasOpTest : public FusedConv2DOpTest<T> {};
 template <typename T>
 class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};
 
-TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest);
-TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest);
+TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest);
+TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest);
 
 // -------------------------------------------------------------------------- //
 // Conv2D + BiasAdd + {Relu}                                                  //
@@ -1019,29 +1035,29 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndRelu) {
   this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
 }
 
-REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest,    //
-                           OneByOneConvolution,          //
-                           ImageSizeConvolution,         //
-                           SpatialConvolution,           //
-                           OneByOneConvolutionAndRelu,   //
-                           ImageSizeConvolutionAndRelu,  //
-                           SpatialConvolutionAndRelu);
-
-REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest,  //
-                           OneByOneConvolution,             //
-                           ImageSizeConvolution,            //
-                           SpatialConvolution,              //
-                           OneByOneConvolutionAndRelu,      //
-                           ImageSizeConvolutionAndRelu,     //
-                           SpatialConvolutionAndRelu);
+REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest,    //
+                            OneByOneConvolution,          //
+                            ImageSizeConvolution,         //
+                            SpatialConvolution,           //
+                            OneByOneConvolutionAndRelu,   //
+                            ImageSizeConvolutionAndRelu,  //
+                            SpatialConvolutionAndRelu);
+
+REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest,  //
+                            OneByOneConvolution,             //
+                            ImageSizeConvolution,            //
+                            SpatialConvolution,              //
+                            OneByOneConvolutionAndRelu,      //
+                            ImageSizeConvolutionAndRelu,     //
+                            SpatialConvolutionAndRelu);
 
 using FusedBiasAddDataTypes = ::testing::Types<float, double>;
-INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBiasOpTest,
-                              FusedBiasAddDataTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest,
+                               FusedBiasAddDataTypes);
 
 using FusedBatchNormDataTypes = ::testing::Types<float>;
-INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBatchNormOpTest,
-                              FusedBatchNormDataTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest,
+                               FusedBatchNormDataTypes);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Performance benchmarks for the FusedConv2DWithBiasOp.                      //
@@ -1481,6 +1497,26 @@ BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu,
                                    "3x3 /b 32");
 
 #if GOOGLE_CUDA
+// -------------------------------------------------------------------------- //
+// 1x1 Convolution
+// -------------------------------------------------------------------------- //
+
+BM_Conv2D(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_Conv2D(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_Conv2D(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+// -------------------------------------------------------------------------- //
+// 3x3 Convolution
+// -------------------------------------------------------------------------- //
+
 BM_Conv2D(8, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 8");
 BM_Conv2D(16, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 16");
 BM_Conv2D(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index af0a9fa82ee5778fa9e18cea59cf759fa468224f..05df9e0207e505bfd5b9a3bc9c5b7b2c90a0fa30 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include <string.h>
 #include <map>
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -57,7 +58,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
 #include "tensorflow/core/kernels/image_resizer_state.h"
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 99d01b4db6bac68d890d93ac55bea576f43a5994..838cedd7a4aeeee4b1871bf4c64bbc0c871fdac9 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb375ee4b351e4d15c234f9290ecc8780b096c32..aa68e105addab65cdc3ad468547e6e1273834077 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/ctc_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_loss_calculator.h"
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index a59baaa96fc73cc442287dfb4550bc2f6932956b..39d0a998fdcfe0710af97e404e142955e57a7c2b 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -692,8 +692,8 @@ static inline Status GetrsBatchedImpl(
     SolverFnT solver, CudaSolver* cuda_solver, OpKernelContext* context,
     cublasHandle_t cublas_handle, cublasOperation_t trans, int n, int nrhs,
     const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,
-    const Scalar* const host_b_dev_ptrs[], int ldb,
-    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+    const Scalar* const host_b_dev_ptrs[], int ldb, int* host_lapack_info,
+    int batch_size) {
   mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
@@ -714,7 +714,7 @@ static inline Status GetrsBatchedImpl(
       cublas_handle, trans, n, nrhs,
       reinterpret_cast<const CudaScalar* const*>(dev_a_dev_ptrs.data()), lda,
       dev_pivots, reinterpret_cast<CudaScalar**>(dev_b_dev_ptrs.mutable_data()),
-      ldb, dev_lapack_info->mutable_data(), batch_size));
+      ldb, host_lapack_info, batch_size));
   return Status::OK();
 }
 
@@ -723,13 +723,13 @@ static inline Status GetrsBatchedImpl(
   Status CudaSolver::GetrsBatched(                                             \
       cublasOperation_t trans, int n, int nrhs,                                \
       const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,   \
-      const Scalar* const host_b_dev_ptrs[], int ldb,                          \
-      DeviceLapackInfo* dev_lapack_info, int batch_size) {                     \
+      const Scalar* const host_b_dev_ptrs[], int ldb, int* host_lapack_info,   \
+      int batch_size) {                                                        \
     return GetrsBatchedImpl(reinterpret_cast<getrs_##type_prefix*>(            \
                                 BLAS_SOLVER_FN(getrsBatched, type_prefix)),    \
                             this, context_, cublas_handle_, trans, n, nrhs,    \
                             host_a_dev_ptrs, lda, dev_pivots, host_b_dev_ptrs, \
-                            ldb, dev_lapack_info, batch_size);                 \
+                            ldb, host_lapack_info, batch_size);                \
   }
 
 TF_CALL_LAPACK_TYPES(GETRS_BATCHED_INSTANCE);
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 2c30d036df71f917f7e302141f577a49ed4c5112..1fc344731c28df2e2d4cb9e931accfc0ca4592ed 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -235,13 +235,14 @@ class CudaSolver {
                       int batch_size) TF_MUST_USE_RESULT;
 
   // Batched linear solver using LU factorization from getrfBatched.
-  // See:
+  // Notice that lapack_info is returned on the host, as opposed to
+  // most of the other functions that return it on the device. See:
   // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
   template <typename Scalar>
   Status GetrsBatched(cublasOperation_t trans, int n, int nrhs,
                       const Scalar* const dev_Aarray[], int lda,
                       const int* devIpiv, const Scalar* const dev_Barray[],
-                      int ldb, DeviceLapackInfo* dev_lapack_info,
+                      int ldb, int* host_lapack_info,
                       int batch_size) TF_MUST_USE_RESULT;
 
   // Computes matrix inverses for a batch of small matrices. Uses the outputs
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index d37f5fb9daea21737bb787521385d3090125b6bf..196494cbcf8b7f4f670599241d5bdbb1c29c7cd1 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -743,7 +743,7 @@ Status DoBackward(
     /* forward inputs */
     const Tensor* input, const Tensor* input_h, const Tensor* input_c,
     const Tensor* params,
-    /* forward outptus */
+    /* forward outputs */
     const Tensor* output, const Tensor* output_h, const Tensor* output_c,
     /* backprop inputs */
     const Tensor* output_backprop, const Tensor* output_h_backprop,
diff --git a/tensorflow/core/kernels/cwise_op_gpu_igammas_double.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_igammas_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2bcc7aa8855c47aa164caeb0c6bd82dd10306432
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_igammas_double.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY1(igamma, double);
+DEFINE_BINARY1(igamma_grad_a, double);
+DEFINE_BINARY1(igammac, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_igammas.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_igammas_float.cu.cc
similarity index 88%
rename from tensorflow/core/kernels/cwise_op_gpu_igammas.cu.cc
rename to tensorflow/core/kernels/cwise_op_gpu_igammas_float.cu.cc
index 508a47deda81d6182e2c16e83d54bbfa5c97f3fb..e6412216e93379beba3bceeaa2b165f48bd64d0f 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_igammas.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_igammas_float.cu.cc
@@ -20,9 +20,9 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY2(igamma, float, double);
-DEFINE_BINARY2(igamma_grad_a, float, double);
-DEFINE_BINARY2(igammac, float, double);
+DEFINE_BINARY1(igamma, float);
+DEFINE_BINARY1(igamma_grad_a, float);
+DEFINE_BINARY1(igammac, float);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index a136769b912718a5749273050a2226da3fa9e3cf..bb7d22e4dd4b101ff6d695834b881bda872cda9f 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
-          complex64, int64, complex128);
+REGISTER8(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
+          complex64, int64, complex128, bfloat16);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER3(UnaryOp, SYCL, "Neg", functor::neg, float, double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index dd4e4ea547e7738b76796c0e8d174602645b83df..3b51563ca288413b389f938c9ff9810a71c09fd5 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 #include "tensorflow/core/platform/prefetch.h"
 
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index a22d76717a50e0869d38b77f0ec7f0cc46f8c7ac..b03ee937ca85f82b41dddc5bab9aa0d7764d4b7a 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -22,9 +22,9 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
 namespace internal {
@@ -51,15 +51,12 @@ struct scalar_arg_op<std::complex<double>> {
 };
 #endif
 
+#if EIGEN_HAS_CXX11_MATH == 0
 template <typename T>
 struct scalar_asinh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::asinh(a);
-#else
     return std::asinh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -71,11 +68,7 @@ template <typename T>
 struct scalar_acosh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::acosh(a);
-#else
     return std::acosh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -87,35 +80,14 @@ template <typename T>
 struct scalar_atanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::atanh(a);
-#else
     return std::atanh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
 struct functor_traits<scalar_atanh_op<T>> {
   enum { Cost = 5 * NumTraits<T>::MulCost, PacketAccess = false };
 };
-
-// TODO(rmlarsen): This is a workaround for upstream change
-// https://bitbucket.org/eigen/eigen/commits/f339468d04d0f87caeb6cab9aef568627e9f6ea9
-// that renamed scalar_binary_pow_op to scalar_pow_op and deleted the unary
-// version of the latter. Remove once we upgrade to Eigen 3.3.
-template <typename Scalar, typename Exponent>
-struct scalar_binary_pow_op_google {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op_google)
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a,
-                                             const Exponent& b) const {
-    return numext::pow(a, b);
-  }
-};
-
-template <typename Scalar, typename Exponent>
-struct functor_traits<scalar_binary_pow_op_google<Scalar, Exponent>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
+#endif
 
 template <typename Scalar, typename Exponent>
 struct safe_scalar_binary_pow_op {
@@ -865,7 +837,7 @@ template <typename T>
 struct floor_div_real : base<T, Eigen::internal::google_floor_div_real<T>> {};
 
 template <typename T>
-struct pow : base<T, Eigen::internal::scalar_binary_pow_op_google<T, T>> {};
+struct pow : base<T, Eigen::internal::scalar_pow_op<T, T>> {};
 
 template <typename T>
 struct safe_pow : base<T, Eigen::internal::safe_scalar_binary_pow_op<T, T>> {
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 696d5840e8ce39c1bf210b54b9f28ae83cf232c7..acf7cc289933c2d42644faf63f58ec6af53957c9 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -45,6 +45,7 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 #define BM_UNARY(DEVICE, FUNC, T, TYPE)                              \
   void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) {           \
     const int64 tot = static_cast<int64>(iters) * num;               \
+    testing::UseRealTime();                                          \
     testing::ItemsProcessed(tot);                                    \
     testing::BytesProcessed(tot * sizeof(T));                        \
     test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
@@ -100,6 +101,7 @@ Graph* BinaryScalar(int num, const string& func) {
 #define BM_BINARY_SCALAR(DEVICE, FUNC)                             \
   void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) {         \
     const int64 tot = static_cast<int64>(iters) * num;             \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
@@ -125,6 +127,15 @@ BM_BINARY_SCALAR(gpu, Add);
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
 #endif  // TENSORFLOW_USE_SYCL
+
+BM_BINARY_SCALAR(cpu, DivNoNan);
+#if GOOGLE_CUDA
+BM_BINARY_SCALAR(gpu, DivNoNan);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, DivNoNan);
+#endif  // TENSORFLOW_USE_SYCL
+
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -146,6 +157,7 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
     const int rows = RowsFromArg(arg);                                         \
     const int cols = ColsFromArg(arg);                                         \
     const int64 tot = static_cast<int64>(iters) * rows * cols;                 \
+    testing::UseRealTime();                                                    \
     testing::ItemsProcessed(tot);                                              \
     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
     test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
@@ -197,6 +209,7 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
     const int rows = RowsFromArg(arg);                                         \
     const int cols = ColsFromArg(arg);                                         \
     const int64 tot = static_cast<int64>(iters) * rows * cols * channels;      \
+    testing::UseRealTime();                                                    \
     testing::ItemsProcessed(tot);                                              \
     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
     test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels,         \
@@ -259,6 +272,7 @@ Graph* BcastAdd(int rows, int cols, int dim) {
     const int rows = RowsFromArg(arg);                             \
     const int cols = ColsFromArg(arg);                             \
     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters);  \
@@ -285,6 +299,7 @@ BM_BCAST_ADD_ROW_ALL(sycl);
     const int rows = RowsFromArg(arg);                             \
     const int cols = ColsFromArg(arg);                             \
     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters);  \
@@ -311,6 +326,7 @@ BM_BCAST_ADD_COL_ALL(sycl);
     const int rows = RowsFromArg(arg);                                 \
     const int cols = ColsFromArg(arg);                                 \
     const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::UseRealTime();                                            \
     testing::ItemsProcessed(tot);                                      \
     testing::BytesProcessed(tot * sizeof(float));                      \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters);      \
@@ -338,6 +354,7 @@ BM_BCAST_ADD_CROSS_RC_ALL(sycl);
     const int rows = RowsFromArg(arg);                                 \
     const int cols = ColsFromArg(arg);                                 \
     const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::UseRealTime();                                            \
     testing::ItemsProcessed(tot);                                      \
     testing::BytesProcessed(tot * sizeof(float));                      \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters);      \
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index e2ab77632da4830f63d63c95c6ace5465fb46b9e..3cadb55a21e61f9406eac605e0695ebd7f169012 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -39,6 +39,7 @@ tf_cc_test(
     srcs = ["dataset_utils_test.cc"],
     deps = [
         ":dataset_utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -54,7 +55,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
@@ -76,13 +76,18 @@ tf_cc_test(
     srcs = ["single_threaded_executor_test.cc"],
     deps = [
         ":single_threaded_executor",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -189,7 +194,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -205,6 +209,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -262,6 +267,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:metrics",
     ],
 )
 
@@ -295,6 +301,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -488,6 +495,7 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
@@ -508,8 +516,11 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:metrics",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -560,6 +571,8 @@ tf_kernel_library(
         ":tensor_slice_dataset_op",
         ":window_dataset_op",
         ":zip_dataset_op",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 1f8d2bdbae897e471113375150935b69e47f6d84..f9ce0d9642dce1972bb94a2668b344e5f050d345 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -79,8 +79,8 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Batch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Batch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index f00b38e732a7835896a275d14507e75eade05fa1..343157de6fea3df5fb7ada416f81f95534f76e1c 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -69,8 +69,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new FileIterator({this, strings::StrCat(prefix, "::FileCache")}));
+      return absl::make_unique<FileIterator>(
+          FileIterator::Params{this, strings::StrCat(prefix, "::FileCache")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -325,7 +325,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           }
           filename_ = strings::StrCat(dataset()->filename_, "_", shard_id_);
           lockfile_ = strings::StrCat(filename_, ".lockfile");
-          writer_.reset(new BundleWriter(dataset()->env_, filename_));
+          writer_ = absl::make_unique<BundleWriter>(dataset()->env_, filename_);
           return Status::OK();
         }
 
@@ -385,7 +385,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             // conditions are not met since BundleWriter's constructor creates
             // new temp files which can delete the temp files created by a
             // BundleWriter in another Session.
-            writer_.reset(new BundleWriter(dataset()->env_, filename_));
+            writer_ =
+                absl::make_unique<BundleWriter>(dataset()->env_, filename_);
             lockfile_created_ = true;
             return Status::OK();
           }
@@ -537,12 +538,14 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         // `FileReaderIterator` and seek to the `cur_index`.
         switch (mode_) {
           case Mode::read:
-            iterator_.reset(new FileReaderIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}));
+            iterator_ = absl::make_unique<FileReaderIterator>(
+                FileReaderIterator::Params{dataset(),
+                                           strings::StrCat(prefix(), "Impl")});
             break;
           case Mode::write:
-            iterator_.reset(new FileWriterIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}));
+            iterator_ = absl::make_unique<FileWriterIterator>(
+                FileWriterIterator::Params{dataset(),
+                                           strings::StrCat(prefix(), "Impl")});
         }
       }
 
@@ -573,8 +576,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new MemoryIterator({this, strings::StrCat(prefix, "::MemoryCache")}));
+      return absl::make_unique<MemoryIterator>(MemoryIterator::Params{
+          this, strings::StrCat(prefix, "::MemoryCache")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -614,7 +617,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      public:
       MemoryCache() = default;
 
-      string DebugString() override { return "CacheDataset::MemoryCache"; }
+      string DebugString() const override {
+        return "CacheDataset::MemoryCache";
+      }
 
       // Marks the cache as completed.
       void Complete() {
@@ -931,12 +936,16 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       void InitializeIterator() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         switch (mode_) {
           case Mode::read:
-            iterator_.reset(new MemoryReaderIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}, cache_));
+            iterator_ = absl::make_unique<MemoryReaderIterator>(
+                MemoryReaderIterator::Params{dataset(),
+                                             strings::StrCat(prefix(), "Impl")},
+                cache_);
             break;
           case Mode::write:
-            iterator_.reset(new MemoryWriterIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}, cache_));
+            iterator_ = absl::make_unique<MemoryWriterIterator>(
+                MemoryWriterIterator::Params{dataset(),
+                                             strings::StrCat(prefix(), "Impl")},
+                cache_);
         }
       }
 
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 973b6b06048fb715d9fd32791223cda21751b1c8..99b745b4c45c115b065fced39f7f206c240cf5ed 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -114,8 +113,8 @@ Status CapturedFunction::Create(
   OpInputList inputs;
   TF_RETURN_IF_ERROR(ctx->input_list(argument, &inputs));
   std::vector<Tensor> arguments(inputs.begin(), inputs.end());
-  *out_function = WrapUnique(new CapturedFunction(func, std::move(arguments),
-                                                  use_inter_op_parallelism));
+  *out_function = absl::WrapUnique(new CapturedFunction(
+      func, std::move(arguments), use_inter_op_parallelism));
   return Status::OK();
 }
 
@@ -144,8 +143,10 @@ Status CapturedFunction::Instantiate(
     ret_types.push_back(ret_type);
   }
 
-  instantiated_captured_function->reset(new InstantiatedCapturedFunction(
-      lib, f_handle, std::move(ret_types), *ctx->runner(), this));
+  *instantiated_captured_function =
+      absl::WrapUnique<InstantiatedCapturedFunction>(
+          new InstantiatedCapturedFunction(lib, f_handle, std::move(ret_types),
+                                           *ctx->runner(), this));
   return Status::OK();
 }
 
@@ -422,11 +423,11 @@ void InstantiatedCapturedFunction::RunAsync(
   // (such as queue kernels) that depend on the non-nullness of
   // `OpKernelContext::cancellation_manager()`, but additional effort
   // will be required to plumb it through the `IteratorContext`.
-  CancellationManager* c_mgr = new CancellationManager;
+  CancellationManager* c_mgr = new CancellationManager();
   f_opts.cancellation_manager = c_mgr;
   std::shared_ptr<SimpleStepStatsCollector> stats_collector;
   if (ctx->model() || ctx->stats_aggregator()) {
-    stats_collector = MakeUnique<SimpleStepStatsCollector>();
+    stats_collector = absl::make_unique<SimpleStepStatsCollector>();
   }
   f_opts.stats_collector = stats_collector.get();
 
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 066b2c9aef4faaf23981b207e46c301e99360119..1d7c3a65d5cd5dca5398999073c84af1f2d5c29f 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -63,8 +63,8 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Concatenate")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Concatenate")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 4d92d314d3d207d12310bb744b5601ad922bc570..5a6c8bac08dc0647b8bb5c935bf5e0bec17e9522 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -141,5 +141,95 @@ Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
   return Status::OK();
 }
 
+namespace {
+
+constexpr char kDelimiter[] = "@@";
+
+}  // namespace
+
+VariantTensorDataReader::VariantTensorDataReader(
+    const tensorflow::VariantTensorData* data)
+    : data_(data) {
+  string metadata;
+  data_->get_metadata(&metadata);
+  auto keys = str_util::Split(metadata, kDelimiter, str_util::SkipEmpty());
+  for (size_t i = 0; i < keys.size(); ++i) {
+    map_[keys[i]] = i;
+  }
+}
+
+Status VariantTensorDataReader::ReadScalar(StringPiece key, int64* val) {
+  return ReadScalarInternal(key, val);
+}
+
+Status VariantTensorDataReader::ReadScalar(StringPiece key, string* val) {
+  return ReadScalarInternal(key, val);
+}
+
+Status VariantTensorDataReader::ReadTensor(StringPiece key, Tensor* val) {
+  return ReadTensorInternal(key, val);
+}
+
+bool VariantTensorDataReader::Contains(StringPiece key) {
+  return map_.find(string(key)) != map_.end();
+}
+
+template <typename T>
+Status VariantTensorDataReader::ReadScalarInternal(StringPiece key, T* val) {
+  if (map_.find(string(key)) == map_.end()) {
+    return errors::NotFound(key);
+  }
+  *val = data_->tensors(map_[string(key)]).scalar<T>()();
+  return Status::OK();
+}
+
+Status VariantTensorDataReader::ReadTensorInternal(StringPiece key,
+                                                   Tensor* val) {
+  if (map_.find(string(key)) == map_.end()) {
+    return errors::NotFound(key);
+  }
+  *val = data_->tensors(map_[string(key)]);
+  return Status::OK();
+}
+
+Status VariantTensorDataWriter::WriteScalar(StringPiece key, const int64 val) {
+  return WriteScalarInternal(key, val);
+}
+
+Status VariantTensorDataWriter::WriteScalar(StringPiece key,
+                                            const string& val) {
+  return WriteScalarInternal(key, val);
+}
+
+Status VariantTensorDataWriter::WriteTensor(StringPiece key,
+                                            const Tensor& val) {
+  return WriteTensorInternal(key, val);
+}
+
+Status VariantTensorDataWriter::Flush() {
+  string metadata;
+  for (size_t i = 0; i < keys_.size(); ++i) {
+    strings::StrAppend(&metadata, kDelimiter, keys_[i]);
+  }
+  data_->set_metadata(metadata);
+  return Status::OK();
+}
+
+template <typename T>
+Status VariantTensorDataWriter::WriteScalarInternal(StringPiece key,
+                                                    const T& val) {
+  Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+  val_t.scalar<T>()() = val;
+  return WriteTensorInternal(key, val_t);
+}
+
+Status VariantTensorDataWriter::WriteTensorInternal(StringPiece key,
+                                                    const Tensor& val) {
+  DCHECK_EQ(key.find(kDelimiter), string::npos);
+  keys_.push_back(string(key));
+  *(data_->add_tensors()) = val;
+  return Status::OK();
+}
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 23a3d93ed160c95099a5c8ddb237b4c055a1845c..0754d1d266f83f0b2490e611d69c969d0a8e9a40 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -57,6 +57,47 @@ Status VerifyTypesMatch(const DataTypeVector& expected,
 Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
                               const std::vector<PartialTensorShape>& received);
 
+// Helper class for reading data from a VariantTensorData object.
+class VariantTensorDataReader : public IteratorStateReader {
+ public:
+  explicit VariantTensorDataReader(const VariantTensorData* data);
+
+  // Returns OK iff the initialization was successful.
+  Status ReadScalar(StringPiece key, int64* val) override;
+  Status ReadScalar(StringPiece key, string* val) override;
+  Status ReadTensor(StringPiece key, Tensor* val) override;
+  bool Contains(StringPiece key) override;
+
+ private:
+  template <typename T>
+  Status ReadScalarInternal(StringPiece key, T* val);
+  Status ReadTensorInternal(StringPiece key, Tensor* val);
+
+  std::map<string, size_t> map_;
+  const VariantTensorData* data_;  // Not owned.
+};
+
+// Helper class for writing data to a VariantTensorData object.
+class VariantTensorDataWriter : public IteratorStateWriter {
+ public:
+  // Does not take ownership of data.
+  explicit VariantTensorDataWriter(VariantTensorData* data) : data_(data) {}
+  Status WriteScalar(StringPiece key, const int64 val) override;
+  Status WriteScalar(StringPiece key, const string& val) override;
+  Status WriteTensor(StringPiece key, const Tensor& val) override;
+
+  // Writes the metadata to `data_`.
+  Status Flush();
+
+ private:
+  template <typename T>
+  Status WriteScalarInternal(StringPiece key, const T& val);
+  Status WriteTensorInternal(StringPiece key, const Tensor& val);
+
+  VariantTensorData* data_;
+  std::vector<string> keys_;
+};
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
index 43295b8ebb8f9df2acae8e17162f2d307dd4d9c5..290399b3e257117438298a9c8184e559c03e76fe 100644
--- a/tensorflow/core/kernels/data/dataset_utils_test.cc
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -14,14 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
-
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-TEST(DatasetUtils, ComputeMoveVector) {
+TEST(DatasetUtilsTest, ComputeMoveVector) {
   struct TestCase {
     std::vector<int> indices;
     std::vector<bool> expected;
@@ -41,6 +42,41 @@ TEST(DatasetUtils, ComputeMoveVector) {
   }
 }
 
+TEST(DatasetUtilsTest, VariantTensorDataRoundtrip) {
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(writer.WriteScalar("Int64", 24));
+  Tensor input_tensor(DT_FLOAT, {1});
+  input_tensor.flat<float>()(0) = 2.0f;
+  TF_ASSERT_OK(writer.WriteTensor("Tensor", input_tensor));
+  TF_ASSERT_OK(writer.Flush());
+
+  VariantTensorDataReader reader(&data);
+  int64 val_int64;
+  TF_ASSERT_OK(reader.ReadScalar("Int64", &val_int64));
+  EXPECT_EQ(val_int64, 24);
+  Tensor val_tensor;
+  TF_ASSERT_OK(reader.ReadTensor("Tensor", &val_tensor));
+  EXPECT_EQ(input_tensor.NumElements(), val_tensor.NumElements());
+  EXPECT_EQ(input_tensor.flat<float>()(0), val_tensor.flat<float>()(0));
+}
+
+TEST(DatasetUtilsTest, VariantTensorDataNonExistentKey) {
+  VariantTensorData data;
+  strings::StrAppend(&data.metadata_, "key1", "@@");
+  data.tensors_.push_back(Tensor(DT_INT64, {1}));
+  VariantTensorDataReader reader(&data);
+  int64 val_int64;
+  string val_string;
+  Tensor val_tensor;
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadScalar("NonExistentKey", &val_int64).code());
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadScalar("NonExistentKey", &val_string).code());
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadTensor("NonExistentKey", &val_tensor).code());
+}
+
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 7433303f77671cbf67a6365fb1d552edc7b471e0..4f7c8f156c662568078f57e6104fe06f9f7bfe3d 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -119,11 +119,14 @@ tf_kernel_library(
     name = "map_and_batch_dataset_op",
     srcs = ["map_and_batch_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:metrics",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
         "//tensorflow/core/kernels/data:dataset_utils",
@@ -142,6 +145,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "choose_fastest_dataset_op",
+    srcs = ["choose_fastest_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:dataset",
+    ],
+)
+
 tf_kernel_library(
     name = "non_serializable_dataset_op",
     srcs = ["non_serializable_dataset_op.cc"],
@@ -156,11 +171,13 @@ tf_kernel_library(
     name = "numa_map_and_batch_dataset_op",
     srcs = ["numa_map_and_batch_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
         "@com_google_absl//absl/memory",
@@ -187,6 +204,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core/kernels/data:parallel_map_iterator",
     ],
 )
@@ -293,6 +311,20 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "take_while_dataset_op",
+    srcs = ["take_while_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
 tf_kernel_library(
     name = "to_tf_record_op",
     srcs = ["to_tf_record_op.cc"],
@@ -342,6 +374,7 @@ tf_kernel_library(
     name = "dataset_kernels",
     deps = [
         ":assert_next_dataset_op",
+        ":choose_fastest_dataset_op",
         ":csv_dataset_op",
         ":dense_to_sparse_batch_dataset_op",
         ":directed_interleave_dataset_op",
@@ -365,6 +398,7 @@ tf_kernel_library(
         ":sql_dataset_op",
         ":stats_aggregator_ops",
         ":stats_dataset_ops",
+        ":take_while_dataset_op",
         ":threadpool_dataset_op",
         ":to_tf_record_op",
         ":unbatch_dataset_op",
diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
index 3e87f484b940b336ed68099df7427250a4304207..eb547133609a828f770ff5dc0acc1559f25eb3d2 100644
--- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
@@ -61,8 +61,8 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Assert")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::AssertNext")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc b/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e63208f26a93d2a6b5fc265355226acb71b01bd0
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
@@ -0,0 +1,351 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+static const double kPercentile = 90.0;
+
+class ChooseFastestDatasetOp : public DatasetOpKernel {
+ public:
+  explicit ChooseFastestDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_experiments", &num_experiments_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    OpInputList input_list;
+    OP_REQUIRES_OK(ctx, ctx->input_list("input_datasets", &input_list));
+    OP_REQUIRES(
+        ctx, input_list.size() > 1,
+        errors::InvalidArgument(
+            "ChooseFastestDataset must have at least two input datasets."));
+
+    std::vector<DatasetBase*> inputs;
+    inputs.reserve(input_list.size());
+    for (const auto& tensor : input_list) {
+      DatasetBase* input;
+      OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(tensor, &input));
+      inputs.push_back(input);
+    }
+
+    for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
+      OP_REQUIRES(
+          ctx, inputs[i]->output_dtypes() == output_types_,
+          errors::InvalidArgument(
+              "All inputs to ChooseFastestDataset "
+              "must have the same output types. Input ",
+              i, " has output types: ",
+              DataTypeVectorString(inputs[i]->output_dtypes()),
+              ". Expected: ", DataTypeVectorString(output_types_), "."));
+    }
+
+    // Merge the output shapes of all the input datasets, returning an
+    // error if any of them are incompatible.
+    for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
+      OP_REQUIRES(
+          ctx, inputs[i]->output_shapes().size() == output_shapes_.size(),
+          errors::InvalidArgument(
+              "All inputs to ChooseFastestDataset must have compatible outputs."
+              " Input ",
+              i, " has ", inputs[i]->output_shapes().size(),
+              " components. Expected to have ", output_shapes_.size(),
+              " components."));
+      for (size_t j = 0, num_components = output_shapes_.size();
+           j < num_components; ++j) {
+        PartialTensorShape result;
+        OP_REQUIRES(ctx,
+                    output_shapes_[j]
+                        .MergeWith(inputs[i]->output_shapes().at(j), &result)
+                        .ok(),
+                    errors::InvalidArgument(
+                        "All inputs to ChooseFastestDataset must have "
+                        "compatible output shapes. Component ",
+                        j, " of input ", i,
+                        " has shape: ", inputs[i]->output_shapes().at(j),
+                        ". Expected to be compatible with shape: ",
+                        output_shapes_[j], "."));
+        output_shapes_[j] = std::move(result);
+      }
+    }
+
+    int64 cardinality = inputs[0]->Cardinality();
+    for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
+      if (cardinality == kUnknownCardinality) {
+        cardinality = inputs[i]->Cardinality();
+      } else {
+        OP_REQUIRES(
+            ctx,
+            inputs[i]->Cardinality() == cardinality ||
+                inputs[i]->Cardinality() == kUnknownCardinality,
+            errors::InvalidArgument(
+                "All inputs to ChooseFastestDataset must have compatible "
+                "cardinalities. Input ",
+                i, " has cardinality: ", inputs[i]->Cardinality(),
+                ", while all prior inputs have cardinality: ", cardinality,
+                "."));
+      }
+    }
+    *output = new Dataset(ctx, std::move(inputs), output_types_, output_shapes_,
+                          cardinality, num_experiments_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<DatasetBase*> inputs,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            int64 cardinality, int64 num_experiments)
+        : DatasetBase(DatasetContext(ctx)),
+          inputs_(std::move(inputs)),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          cardinality_(cardinality),
+          num_experiments_(num_experiments) {
+      for (auto input : inputs_) {
+        input->Ref();
+      }
+    }
+
+    ~Dataset() override {
+      for (auto input : inputs_) {
+        input->Unref();
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<ChooseFastestIterator>(
+          ChooseFastestIterator::Params{
+              this, strings::StrCat(prefix, "::ChooseFastest")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ChooseFastestDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return cardinality_; }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      std::vector<Node*> input_nodes;
+      input_nodes.reserve(inputs_.size());
+      for (const auto& input : inputs_) {
+        Node* input_node;
+        TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &input_node));
+        input_nodes.push_back(input_node);
+      }
+      AttrValue num_experiments_attr;
+      b->BuildAttrValue(num_experiments_, &num_experiments_attr);
+      return b->AddDataset(
+          this, {}, {std::make_pair(0, input_nodes)},
+          {std::make_pair("num_experiments", std::move(num_experiments_attr))},
+          output);
+    }
+
+   private:
+    class ChooseFastestIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit ChooseFastestIterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            histograms_(dataset()->inputs_.size()) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(mu_);
+        input_impls_.resize(dataset()->inputs_.size());
+        for (size_t i = 0, num_inputs = dataset()->inputs_.size();
+             i < num_inputs; ++i) {
+          TF_RETURN_IF_ERROR(dataset()->inputs_[i]->MakeIterator(
+              ctx, strings::StrCat(prefix(), "_", i), &input_impls_[i]));
+        }
+        return Status::OK();
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        // The first num_experiments_ iterations, we fire up a thread for
+        // each input that calls its GetNext function and records the time
+        // taken. We only return when all the threads have completed.
+        if (experiment_counter_ < dataset()->num_experiments_) {
+          experiment_counter_++;
+          std::vector<ThreadInfo> threads = StartThreads(ctx);
+          for (const auto& thread : threads) {
+            thread.result->notification.WaitForNotification();
+          }
+
+          *out_tensors = std::move(threads[0].result->out_tensors);
+          *end_of_sequence = threads[0].result->end_of_sequence;
+
+          if (experiment_counter_ == dataset()->num_experiments_) {
+            SelectFastestInputIndex();
+          }
+          return threads[0].result->status;
+        }
+        return input_impls_[fastest_index_]->GetNext(ctx, out_tensors,
+                                                     end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args), /*ratio=*/1);
+      }
+
+      // TODO(rachelim): Save and restore histogram state as well. Currently,
+      // if an iterator is saved and restored, the histograms start recording
+      // from scratch.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impls_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        } else {
+          for (auto& input_impl : input_impls_) {
+            TF_RETURN_IF_ERROR(SaveInput(writer, input_impl));
+          }
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("experiment_counter"),
+                                               experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("fastest_index"), fastest_index_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty"))) {
+          input_impls_.clear();
+        } else {
+          DCHECK_EQ(input_impls_.size(), dataset()->inputs_.size());
+          for (auto& input_impl : input_impls_) {
+            TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl));
+          }
+        }
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("experiment_counter"),
+                                              &experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("fastest_index"), &fastest_index_));
+        return Status::OK();
+      }
+
+     private:
+      struct InvocationResult {
+        Notification notification;
+        Status status;
+        bool end_of_sequence;
+        std::vector<Tensor> out_tensors;
+      };
+
+      struct ThreadInfo {
+        std::unique_ptr<InvocationResult> result;
+        std::unique_ptr<Thread> thread;
+      };
+
+      std::vector<std::unique_ptr<IteratorBase>> input_impls_;
+      // For tracking the time taken for each input's iterations.
+      std::vector<histogram::Histogram> histograms_;
+
+      mutex mu_;
+      int64 experiment_counter_ GUARDED_BY(mu_) = 0;
+      int64 fastest_index_ = -1;
+
+      std::vector<ThreadInfo> StartThreads(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        std::vector<ThreadInfo> threads(dataset()->inputs_.size());
+        for (size_t i = 0, num_inputs = dataset()->inputs_.size();
+             i < num_inputs; ++i) {
+          threads[i].result = absl::make_unique<InvocationResult>();
+          threads[i].thread.reset(ctx->env()->StartThread(
+              {}, strings::StrCat("tf_data_merge_", i),
+              std::bind(&ChooseFastestIterator::RunnerThread, this, ctx,
+                        threads[i].result.get(), i)));
+        }
+        return threads;
+      }
+
+      void RunnerThread(IteratorContext* ctx, InvocationResult* result, int i) {
+        int64 start = Env::Default()->NowNanos();
+        Status s = input_impls_[i]->GetNext(ctx, &result->out_tensors,
+                                            &result->end_of_sequence);
+        histograms_[i].Add(
+            static_cast<double>(Env::Default()->NowNanos() - start));
+
+        result->status = s;
+        result->notification.Notify();
+      }
+
+      // Select the fastest input to use based on the histograms of timings
+      // of the completed threads. The input with the best 90th percentile
+      // iteration time is selected.
+      void SelectFastestInputIndex() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        fastest_index_ = 0;
+
+        double best_percentile = histograms_[0].Percentile(kPercentile);
+        for (size_t i = 1, num_inputs = histograms_.size(); i < num_inputs;
+             ++i) {
+          double percentile = histograms_[i].Percentile(kPercentile);
+          if (percentile <= best_percentile) {
+            best_percentile = percentile;
+            fastest_index_ = i;
+          }
+        }
+      }
+    };  // class Iterator
+
+    const std::vector<DatasetBase*> inputs_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const int64 cardinality_;
+    const int64 num_experiments_;
+  };  // class Dataset
+
+  int64 num_experiments_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};  // class ChooseFastestDatasetOp
+
+// Register the kernel implementation for ChooseFastestDataset.
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalChooseFastestDataset").Device(DEVICE_CPU),
+    ChooseFastestDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
index f6f58fc430b41d05bccdc413c00151130bf7d36d..4435c2a131316be0b3b36fd246e40726fbd8c4bb 100644
--- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-// See docs in ../ops/parsing_ops.cc.
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
@@ -159,8 +157,8 @@ class CSVDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::CSV")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::CSV")});
     }
 
     const DataTypeVector& output_dtypes() const override { return out_type_; }
diff --git a/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
index 97e64dd7444e93660afa6defa31314c909a31c7b..31f081a72773e5d0df6f51781d3f42694a986df5 100644
--- a/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
@@ -96,8 +96,8 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::DenseToSparseBatch")}));
+      return absl::make_unique<Iterator>(typename Iterator::Params{
+          this, strings::StrCat(prefix, "::DenseToSparseBatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
index d8bb696167a7971ac21db4b449508946a0c7f11b..f55718a006436d0b7253607964dd44ab04690884 100644
--- a/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
@@ -93,8 +93,8 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::DirectedInterleave")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::DirectedInterleave")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
index 5f0c01be4bc1ce1aefb53b7331e603724eeecb86..56159593a9c8e789ae47e874825943c54e816c24 100644
--- a/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
@@ -90,8 +90,8 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::GroupByReducer")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::GroupByReducer")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
index 11491e00db88e040b0c858a0b2237edad90f59d6..49122807b28aae48b77c1ead3be1f9e4021730ec 100644
--- a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
@@ -89,8 +89,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::GroupByWindow")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::GroupByWindow")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
index d445d9c8094eec5c9a2bff9c45e2dc28e264d096..5e07bdb32ebac39d8ea0f8b987c77615ccb7028e 100644
--- a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
@@ -45,8 +45,8 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::IgnoreErrors")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::IgnoreErrors")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
index a07eaebdf9d645fba51945d7bd3e79b72b5e5dc2..e75e6e4b80bce5dd286ed297c1d645adcdc37a4b 100644
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -106,7 +105,7 @@ class MaterializedDatasetResource : public ResourceBase {
       const std::vector<PartialTensorShape>& output_shapes)
       : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
 
-  string DebugString() override {
+  string DebugString() const override {
     return "Materialized IndexedDataset resource";
   }
 
@@ -424,7 +423,7 @@ class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
 
     Status MaterializeDataset(
         std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
-      materialized->reset(new Materialized(this));
+      (*materialized) = std::make_shared<Materialized>(this);
       return Status::OK();
     }
 
@@ -441,8 +440,8 @@ class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::IdentityIndexedDataset")});
     }
 
     string DebugString() const override {
diff --git a/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
index 6248eb775e481cc5f6940b5c2131d4c963186af5..cf900f133612e3af5b06a8f599558b3caa5ff47a 100644
--- a/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include <sys/stat.h>
 
 #include "tensorflow/core/framework/dataset.h"
@@ -52,8 +51,8 @@ class LMDBDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::LMDB")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::LMDB")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index ef75c844565874aa32369f3325be5da1075e7323..0ef56915ebe33b0f843fa462ba981c34b6b257f7 100644
--- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -32,14 +33,15 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
+constexpr char kDatasetName[] = "MapAndBatch";
+
 // Maximum number of batch results to buffer.
-const int64 kMaxBatchResults = 16;
+constexpr int64 kMaxBatchResults = 16;
 
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
@@ -127,6 +129,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       };
     }
 
+    if (num_parallel_calls == model::kAutoTune) {
+      metrics::RecordTFDataAutotune(kDatasetName);
+    }
+
     *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
                           drop_remainder, output_types_, output_shapes_,
                           std::move(captured_func), &ctx->eigen_cpu_device(),
@@ -163,8 +169,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::MapAndBatch")},
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::", kDatasetName)},
           map_func_);
     }
 
@@ -259,7 +265,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                                             params.dataset->batch_size_)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
+        key_prefix_ = components.back();
       }
 
       ~Iterator() override {
@@ -397,8 +403,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
@@ -637,18 +644,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               num_calls_++;
             }
           }
-          const std::shared_ptr<StatsAggregator>& stats_aggregator =
-              ctx->stats_aggregator();
+          const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
             mutex_lock l(*mu_);
-            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-            // monitoring code or as histogram at fixed time intervals.
-            stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::active_parallel_calls"),
-                static_cast<float>(num_calls_));
             stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::num_parallel_calls"),
-                static_cast<float>(num_parallel_calls_->value));
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
           for (const auto& call : new_calls) {
             CallFunction(ctx, call.first, call.second);
@@ -803,7 +805,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       int64 waiting_ GUARDED_BY(*mu_) = 0;
       // Identifies the maximum number of batch results to store.
       int64 max_batch_results_ GUARDED_BY(*mu_);
-      string prefix_end_;
+      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
index aa27a13416d093dd19475b97b51ac28489d4d177..381b9691d1434fc5f1f60d694d06f9accfa15829 100644
--- a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
@@ -61,8 +61,8 @@ class MatchingFilesDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::MatchingFiles")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::MatchingFiles")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
index 61811ea14eddc9f40987e12ce6343268da24a503..9ca8e33b946c9415d6908606d8165cccadd57258 100644
--- a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
@@ -53,8 +53,8 @@ class NonSerializableDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::NonSerializable")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::NonSerializable")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
index 2b1aec358cce90cb97723b0497d08294d99839b9..643b6460e8a838e5e9d6f35e789dc0a82e4f7cc5 100644
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -121,8 +120,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::NumaMapAndBatch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::NumaMapAndBatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -321,7 +320,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         workers_.resize(num_workers);
         for (size_t i = 0; i < num_workers; ++i) {
-          workers_[i] = MakeUnique<NumaWorkerBlock>(this);
+          workers_[i] = absl::make_unique<NumaWorkerBlock>(this);
           TF_RETURN_IF_ERROR(
               workers_[i]->manager.Restore(ctx, reader, this, i));
         }
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
index 1c19119d88bbfa60dc892ce580779b2d7c2a74bc..f6d522078dda68d52bd0722613ecdcfdd314faf1 100644
--- a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -113,8 +112,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::ParallelInterleave")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
index ea99a8b32c5a945f30945369ef2ed4f4b6725887..00574057344507fe158d36c210e61f15bf92845e 100644
--- a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
@@ -183,8 +183,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      std::unique_ptr<ParallelMapFunctor> parse_example_functor(
-          new ParseExampleFunctor(this));
+      std::unique_ptr<ParallelMapFunctor> parse_example_functor =
+          absl::make_unique<ParseExampleFunctor>(this);
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParseExample")}, input_,
           std::move(parse_example_functor), num_parallel_calls_, sloppy_,
diff --git a/tensorflow/core/kernels/data/experimental/random_dataset_op.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
index 6d85cd5c450640a0042add2ead26836433166ade..114bb6a856c90f559e2db48ba68b9d249ef75b2e 100644
--- a/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
@@ -56,8 +56,8 @@ class RandomDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Random")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Random")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index 76ab33fe98887dafd69a45e80ee6794d7044384b..55e22c1cac6090574d52a4dd154feae2701c6dab 100644
--- a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -84,8 +84,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Scan")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Scan")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -186,6 +186,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
         Status s = instantiated_captured_func_->Run(ctx, std::move(args),
                                                     &state_and_output);
+        DCHECK(state_and_output.size() <=
+               dataset()->state_types_.size() + output_dtypes().size());
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
diff --git a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
index fe128005faca9bd986e7c85600f7f871ebb97a25..67bb1e160b9b125c28f2fda0cdde30c12b957382 100644
--- a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>
+
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -114,8 +115,8 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::SetStatsAggregator")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
index d2fb8ac4f33b1e844bb39cc70a47ccb15424ace7..9d63690622d5a50b22ac48c85f56f90952ae2dcc 100644
--- a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -55,7 +54,7 @@ class SleepDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
+      return absl::make_unique<Iterator>(
           Iterator::Params{this, strings::StrCat(prefix, "::Sleep")});
     }
 
diff --git a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
index 1ce4fbd3136d7fbd245fbb920ff658c4eae794c6..c5851eaf86b654c62457759c7835d4f274c4b9ee 100644
--- a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include <deque>
 #include <vector>
 
@@ -86,8 +85,8 @@ class SlidingWindowDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Slide")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Slide")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
index c16d8ed02ccdfb01a41ff9206a003f4a8c04a667..84f6fba36d197ffaa991ec7f381bdb15aebcd8d3 100644
--- a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
@@ -91,8 +91,8 @@ class SqlDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Sql")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Sql")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index 894465e1814cf93b02ecbbb053494d4c032fe243..1d1b788b6c12b1f68cea494b1e269869bb57d648 100644
--- a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -108,8 +108,8 @@ class StatsAggregatorHandleOp
  private:
   Status CreateResource(StatsAggregatorResource** ret) override
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    *ret = new StatsAggregatorResource(
-        std::unique_ptr<StatsAggregator>(new StatsAggregatorImpl));
+    *ret =
+        new StatsAggregatorResource(absl::make_unique<StatsAggregatorImpl>());
     return Status::OK();
   }
 
diff --git a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index 1961f25df846e8773bf6b0266d089c9d3bac355b..bf96be4eb005e62d1982adf5662984fe15d59091 100644
--- a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/framework/dataset.h"
@@ -63,8 +62,8 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::LatencyStats")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::LatencyStats")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -173,8 +172,8 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BytesProducedStats")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::BytesProducedStats")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a6f70e504ec09007ac21808b1747e299d2b150d
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
@@ -0,0 +1,250 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  using LoopIteratorPredicate =
+      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
+                           std::vector<Tensor>&, bool*)>;
+
+  explicit TakeWhileDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
+                                                 &captured_func));
+
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+    OP_REQUIRES(
+        ctx, indices.size() <= 1,
+        errors::InvalidArgument("`predicate` has more than one return value."));
+
+    LoopIteratorPredicate loop_pred;
+    if (indices.empty()) {
+      loop_pred = [](IteratorContext* ctx,
+                     InstantiatedCapturedFunction* inst_captured_func,
+                     const std::vector<Tensor>& args, bool* end_of_sequence) {
+        std::vector<Tensor> result;
+        TF_RETURN_IF_ERROR(
+            inst_captured_func->RunWithBorrowedArgs(ctx, args, &result));
+
+        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+            result[0].NumElements() != 1) {
+          return errors::InvalidArgument(
+              "`predicate` must returns a scalar bool tensor.");
+        }
+        *end_of_sequence = !result[0].scalar<bool>()();
+        return Status::OK();
+      };
+    } else {
+      loop_pred = [indices](IteratorContext* ctx,
+                            InstantiatedCapturedFunction* inst_captured_func,
+                            const std::vector<Tensor>& args,
+                            bool* end_of_sequence) {
+        const Tensor& predicate = args[indices[0]];
+        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+          return errors::InvalidArgument(
+              "`predicate` must returns a scalar bool tensor.");
+        }
+        *end_of_sequence = !predicate.scalar<bool>()();
+        return Status::OK();
+      };
+    }
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          std::move(loop_pred));
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func,
+            LoopIteratorPredicate loop_pred)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          func_(func),
+          captured_func_(std::move(captured_func)),
+          loop_pred_(std::move(loop_pred)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::TakeWhile")},
+          loop_pred_);
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "TakeWhileDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return kUnknownCardinality; }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
+
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f_attr;
+      b->BuildAttrValue(func_, &f_attr);
+
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {std::make_pair(0, input_node)},
+          {std::make_pair(1, other_arguments)},
+          {std::make_pair("predicate", f_attr),
+           std::make_pair("Targuments", other_arguments_types_attr)},
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params, LoopIteratorPredicate loop_pred)
+          : DatasetIterator<Dataset>(params), loop_pred_(loop_pred) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        {
+          tf_shared_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+        }
+        if (*end_of_sequence) {
+          mutex_lock l(mu_);
+          input_impl_.reset();
+          return Status::OK();
+        }
+        return loop_pred_(ctx, instantiated_captured_func_.get(), *out_tensors,
+                          end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty")))
+          input_impl_.reset();
+        else
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+      const LoopIteratorPredicate loop_pred_;
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const LoopIteratorPredicate loop_pred_;
+  };
+
+  NameAttrList func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalTakeWhileDataset").Device(DEVICE_CPU),
+                        TakeWhileDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 8ae45ed5c9d9fe199ef392a1430f359172ec5c73..7a16cda0f3dc83d5c00a2006f94bdecde866bfd5 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include <memory>
+
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -51,7 +50,7 @@ class ThreadPoolResource : public ResourceBase {
 
   int32 NumThreads() { return thread_pool_.NumThreads(); }
 
-  string DebugString() override { return "ThreadPoolResource"; }
+  string DebugString() const override { return "ThreadPoolResource"; }
 
  private:
   thread::ThreadPool thread_pool_;
@@ -154,8 +153,8 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::ThreadPool")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::ThreadPool")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -261,8 +260,8 @@ class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::MaxIntraOpParallelism")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::MaxIntraOpParallelism")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -362,7 +361,7 @@ class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           num_threads_(num_threads) {
-      thread_pool_ = MakeUnique<thread::ThreadPool>(
+      thread_pool_ = absl::make_unique<thread::ThreadPool>(
           ctx->env(), ThreadOptions{}, "data_private_threadpool", num_threads,
           /*low_latency_hint=*/false);
       input_->Ref();
@@ -372,8 +371,8 @@ class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::PrivateThreadPool")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::PrivateThreadPool")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
index 7728baf1507c6cec2b44f41561f2ab3d04a80cc8..6cf6198432b68fe241e413b8472a2b69bbae314d 100644
--- a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
+++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -59,18 +58,18 @@ class ToTFRecordOp : public AsyncOpKernel {
       std::unique_ptr<WritableFile> file;
       OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
                            done);
-      std::unique_ptr<io::RecordWriter> writer;
-      writer.reset(new io::RecordWriter(
-          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
-                          compression_type)));
+      std::unique_ptr<io::RecordWriter> writer =
+          absl::make_unique<io::RecordWriter>(
+              file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
+                              compression_type));
 
       DatasetBase* dataset;
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
       IteratorContext::Params params(ctx);
-      std::unique_ptr<FunctionHandleCache> function_handle_cache(
-          new FunctionHandleCache(params.lib));
+      std::unique_ptr<FunctionHandleCache> function_handle_cache =
+          absl::make_unique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
       IteratorContext iter_ctx(std::move(params));
 
diff --git a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
index 2626ec3ed7250b725650a76b8674e0a76ebc638f..cb26fd3e43d7c143cdc716a0ee5a4d172c98149f 100644
--- a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
@@ -58,8 +58,8 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Unbatch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Unbatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
index 23dd9ff612db61829dcbae65eb3566131d032efc..57865c45fc078c663d4ea43c5ee9a6642bdc51b6 100644
--- a/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
@@ -58,8 +58,8 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Unique")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Unique")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
index 784f9872860fee0f929dcf4c529c17fbb15e2bc6..3b9b319ea9442c024ec22fee601085b42614836d 100644
--- a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
@@ -64,8 +64,8 @@ class FilterByLastComponentDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<Iterator>(new Iterator(
-          {this, strings::StrCat(prefix, "::FilterByLastComponent")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::FilterByLastComponent")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 30b2fc5db804e5e79ea548859c7ffb0f0ae2a8aa..483d42c8092356ed9fedb70222c7dc96001874b4 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -109,7 +108,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
+      return absl::make_unique<Iterator>(
           Iterator::Params{this, strings::StrCat(prefix, "::Filter")},
           filter_pred_);
     }
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index efa76ab34bc198fa705ecbb5e8c876b0f5cc3a58..3f01ac556998750d02b299a76c8f81c60262f190 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -67,8 +67,8 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FlatMap")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::FlatMap")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 48697ec6c8f05c438badedbc3234dbb1110c7088..5dff2be39da37e899092c6a764d548b9a4799e22 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/kernels/data/generator_dataset_op.h"
+
 #include <iterator>
 #include <vector>
 
-#include "tensorflow/core/kernels/data/generator_dataset_op.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
@@ -44,8 +44,8 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::Generator")}));
+    return absl::make_unique<Iterator>(
+        Iterator::Params{this, strings::StrCat(prefix, "::Generator")});
   }
 
   const DataTypeVector& output_dtypes() const override { return output_types_; }
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 1a5e6edb5b716a3af66a2989a003440243f8c084..69310bcff23d56414c5f689339b528b326429c9f 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -89,8 +89,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Interleave")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Interleave")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 9f5881563b5db2b6b5a678b777789091756a6e7a..808f834f62dfac78411829951c9c12570c41336e 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_handle_cache.h"
-#include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -38,7 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/public/session_options.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -137,7 +135,7 @@ class IteratorResource : public ResourceBase {
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
     TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
-    std::unique_ptr<State> new_state = MakeUnique<State>(
+    std::unique_ptr<State> new_state = absl::make_unique<State>(
         std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */);
 
     TF_RETURN_IF_ERROR(
@@ -211,7 +209,7 @@ class IteratorResource : public ResourceBase {
     }
 
     new_state->function_handle_cache =
-        MakeUnique<FunctionHandleCache>(new_state->lib);
+        absl::make_unique<FunctionHandleCache>(new_state->lib);
     // Create new iterator.
     std::unique_ptr<IteratorBase> iterator;
     IteratorContext::Params params(ctx);
@@ -231,7 +229,7 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() override { return "Iterator resource"; }
+  string DebugString() const override { return "Iterator resource"; }
 
   const DataTypeVector& output_dtypes() const { return output_dtypes_; }
 
@@ -247,7 +245,7 @@ class IteratorResource : public ResourceBase {
         : flib_def(flib_def),
           pflr(pflr),
           lib(lib),
-          function_handle_cache(MakeUnique<FunctionHandleCache>(lib)),
+          function_handle_cache(absl::make_unique<FunctionHandleCache>(lib)),
           iterator(std::move(iterator)) {}
 
     State(std::shared_ptr<FunctionLibraryDefinition> flib_def,
@@ -278,124 +276,6 @@ class IteratorResource : public ResourceBase {
 
 namespace {
 
-// Helper class for reading data from a VariantTensorData object.
-class VariantTensorDataReader : public IteratorStateReader {
- public:
-  explicit VariantTensorDataReader(const VariantTensorData* data)
-      : data_(data) {
-    PreProcess();
-  }
-
-  // Returns OK iff the initialization was successful, i.e.,
-  // pre-processing did not have errors.
-  Status status() const { return status_; }
-
-  Status ReadScalar(StringPiece key, int64* val) override {
-    return ReadScalarInternal(key, val);
-  }
-
-  Status ReadScalar(StringPiece key, string* val) override {
-    return ReadScalarInternal(key, val);
-  }
-
-  Status ReadTensor(StringPiece key, Tensor* val) override {
-    return ReadTensorInternal(key, val);
-  }
-
-  bool Contains(StringPiece key) override {
-    return map_.find(string(key)) != map_.end();
-  }
-
- private:
-  void PreProcess() {
-    string metadata;
-    data_->get_metadata(&metadata);
-    IteratorStateMetadata proto;
-    if (!proto.ParseFromString(metadata)) {
-      status_ = errors::Internal("Error parsing IteratorStateMetadata.");
-      return;
-    }
-    size_t num_entries = proto.keys_size();
-    CHECK_EQ(num_entries, data_->tensors_size());
-    for (size_t i = 0; i < num_entries; i++) {
-      map_[proto.keys(i)] = i;
-    }
-  }
-
-  template <typename T>
-  Status ReadScalarInternal(StringPiece key, T* val) {
-    if (map_.find(string(key)) == map_.end()) {
-      return errors::NotFound(key);
-    }
-    *val = data_->tensors(map_[string(key)]).scalar<T>()();
-    return Status::OK();
-  }
-
-  Status ReadTensorInternal(StringPiece key, Tensor* val) {
-    if (map_.find(string(key)) == map_.end()) {
-      return errors::NotFound(key);
-    }
-    *val = data_->tensors(map_[string(key)]);
-    return Status::OK();
-  }
-
-  std::map<string, size_t> map_;
-  const VariantTensorData* data_;  // Not owned.
-  Status status_;
-};
-
-// Helper class for writing data to a VariantTensorData object.
-class VariantTensorDataWriter : public IteratorStateWriter {
- public:
-  // Does not take ownership of data.
-  explicit VariantTensorDataWriter(VariantTensorData* data) : data_(data) {}
-
-  Status WriteScalar(StringPiece key, const int64 val) override {
-    return WriteScalarInternal(key, val);
-  }
-
-  Status WriteScalar(StringPiece key, const string& val) override {
-    return WriteScalarInternal(key, val);
-  }
-
-  Status WriteTensor(StringPiece key, const Tensor& val) override {
-    return WriteTensorInternal(key, val);
-  }
-
-  // Writes the metadata to `data_`.
-  Status Flush() {
-    string metadata;
-    if (!metadata_proto_.SerializeToString(&metadata)) {
-      return errors::Internal("Unable to serialize IteratorStateMetadata.");
-    }
-    data_->set_metadata(metadata);
-    return Status::OK();
-  }
-
- private:
-  template <typename T>
-  Status WriteScalarInternal(StringPiece key, const T& val) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    return WriteTensorInternal(key, val_t);
-  }
-
-  Status WriteTensorInternal(StringPiece key, const Tensor& val) {
-    // Write key to the metadata proto. This gets written to `data_`
-    // when `Flush()` is called. We do this lazily to avoid multiple
-    // serialization calls.
-    metadata_proto_.add_keys(string(key));
-
-    // Update tensors.
-    *(data_->add_tensors()) = val;
-    return Status::OK();
-  }
-
-  VariantTensorData* data_;
-  // TODO(srbs): Set the version string.
-  IteratorStateMetadata metadata_proto_;
-};
-
 // Wrapper for encoding/decoding the iterator state stored in a Variant tensor.
 // The get() method returns an IteratorStateReader which can be used
 // to restore iterator state.
@@ -434,7 +314,7 @@ class IteratorStateVariant {
     SerializationContext::Params params;
     params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
     SerializationContext serialization_ctx(params);
-    data_ = MakeUnique<VariantTensorData>();
+    data_ = absl::make_unique<VariantTensorData>();
     data_->set_type_name(TypeName());
     VariantTensorDataWriter writer(data_.get());
     TF_RETURN_IF_ERROR(iterator_resource->Save(&serialization_ctx, &writer));
@@ -448,25 +328,19 @@ class IteratorStateVariant {
       return false;
     }
     std::unique_ptr<VariantTensorData> tensor_data =
-        MakeUnique<VariantTensorData>();
+        absl::make_unique<VariantTensorData>();
     std::swap(*tensor_data, data);
     std::unique_ptr<VariantTensorDataReader> reader =
-        MakeUnique<VariantTensorDataReader>(tensor_data.get());
-    status_ = reader->status();
-    if (!status_.ok()) {
-      return false;
-    }
+        absl::make_unique<VariantTensorDataReader>(tensor_data.get());
     data_ = std::move(tensor_data);
     reader_ = std::move(reader);
     return true;
   }
   IteratorStateReader* get() { return reader_.get(); }
-  Status status() const { return status_; }
   string DebugString() const {
     if (data_) {
-      return strings::StrCat("IteratorStateVariant<",
-                             "data: ", data_->DebugString(),
-                             " status: ", status_.ToString(), ">");
+      return strings::StrCat("IteratorStateVariant<", data_->DebugString(),
+                             ">");
     } else {
       return strings::StrCat("IteratorStateVariant<empty>");
     }
@@ -474,7 +348,6 @@ class IteratorStateVariant {
 
  private:
   std::unique_ptr<IteratorStateReader> reader_;
-  Status status_;
   std::unique_ptr<VariantTensorData> data_;
 };
 
@@ -585,9 +458,9 @@ FunctionLibraryRuntime* IteratorHandleOp::CreatePrivateFLR(
   *device_mgr = absl::make_unique<DeviceMgr>(RenamedDevice::NewRenamedDevice(
       ctx->device()->name(), down_cast<Device*>(ctx->device()),
       false /* owns_underlying */, false /* isolate_session_state */));
-  *flib_def = MakeUnique<FunctionLibraryDefinition>(
+  *flib_def = absl::make_unique<FunctionLibraryDefinition>(
       *ctx->function_library()->GetFunctionLibraryDefinition());
-  *pflr = MakeUnique<ProcessFunctionLibraryRuntime>(
+  *pflr = absl::make_unique<ProcessFunctionLibraryRuntime>(
       device_mgr->get(), ctx->env(), graph_def_version_, flib_def->get(),
       OptimizerOptions{} /* TODO(mrry): OptimizerOptions? */,
       nullptr /* TODO(mrry): ClusterFLR */);
@@ -679,9 +552,10 @@ class ToSingleElementOp : public AsyncOpKernel {
       std::unique_ptr<IteratorBase> iterator;
       IteratorContext::Params params(ctx);
       std::unique_ptr<FunctionHandleCache> function_handle_cache =
-          MakeUnique<FunctionHandleCache>(params.lib);
+          absl::make_unique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
-      std::unique_ptr<ResourceMgr> resource_mgr = MakeUnique<ResourceMgr>();
+      std::unique_ptr<ResourceMgr> resource_mgr =
+          absl::make_unique<ResourceMgr>();
       params.resource_mgr = resource_mgr.get();
       IteratorContext iter_ctx(std::move(params));
 
@@ -769,9 +643,10 @@ class ReduceDatasetOp : public AsyncOpKernel {
 
       IteratorContext::Params params(ctx);
       std::unique_ptr<FunctionHandleCache> function_handle_cache =
-          MakeUnique<FunctionHandleCache>(params.lib);
+          absl::make_unique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
-      std::unique_ptr<ResourceMgr> resource_mgr = MakeUnique<ResourceMgr>();
+      std::unique_ptr<ResourceMgr> resource_mgr =
+          absl::make_unique<ResourceMgr>();
       params.resource_mgr = resource_mgr.get();
       IteratorContext iter_ctx(std::move(params));
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func;
@@ -1277,12 +1152,10 @@ class DeserializeIteratorOp : public OpKernel {
     OP_REQUIRES(ctx, wrapper != nullptr,
                 errors::InvalidArgument(
                     "DeserializeIteratorOp: Unable to parse variant tensor."));
-    OP_REQUIRES_OK(ctx, wrapper->status());
     OP_REQUIRES_OK(ctx, iterator_resource->Restore(ctx, wrapper->get()));
   }
 };
 
-
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU).Priority(2),
                         IteratorHandleOp);
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 02c0199a0c51d8c6ea5a6500fb82324fcb69740d..95f4c1c89150b81a91f191e4e53c2b81c30841c4 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -120,7 +119,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
+      return absl::make_unique<Iterator>(
           Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
     }
 
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index 705b0393de09e7117457370dcf9fcdef37142109..8122048702a6c572486ab8ac36a323f822ab9a0f 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -92,7 +92,7 @@ class MapDefunOp : public AsyncOpKernel {
       // We use a different cancellation manager each time the function is run
       // to avoid the race condition between a function run error and other
       // functions being cancelled as a result.
-      CancellationManager* c_mgr = new CancellationManager;
+      CancellationManager* c_mgr = new CancellationManager();
       CancellationToken token = parent_mgr->get_cancellation_token();
       const bool success = parent_mgr->RegisterCallback(
           token, [c_mgr]() { c_mgr->StartCancel(); });
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 069d61d80d4f00eecdd77356626d7278c0842445..20254234e9da492d5b5faad502e092e15d993a91 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -13,17 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-const int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros;
+constexpr int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros;
 
 class ModelDatasetOp : public UnaryDatasetOpKernel {
  public:
@@ -38,7 +41,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
  private:
   class Dataset : public DatasetBase {
    public:
-    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input)
+    Dataset(OpKernelContext* ctx, const DatasetBase* input)
         : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
     }
@@ -47,8 +50,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Model")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Model")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -76,8 +79,12 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            model_(std::make_shared<model::Model>()) {}
+          : DatasetIterator<Dataset>(params) {
+        auto remove_node_hook = [](std::shared_ptr<model::Node> node) {
+          metrics::RecordTFDataElements(node->name(), node->num_elements());
+        };
+        model_ = std::make_shared<model::Model>(std::move(remove_node_hook));
+      }
 
       ~Iterator() override {
         // Signal the optimize thread to terminate it. We will then join that
@@ -131,7 +138,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       Status EnsureOptimizeThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
-          std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+          std::shared_ptr<IteratorContext> new_ctx =
+              std::make_shared<IteratorContext>(*ctx);
           optimize_thread_.reset(ctx->env()->StartThread(
               {}, "tf_data_model",
               [this, new_ctx]() { OptimizeThread(new_ctx); }));
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index ba2125a66eb98985ebd0ae8f55bfc239997ad6df..167276032b4d7e55f9e777b813fa6a0f4e5becbc 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -59,7 +59,7 @@ class MultiDeviceIterator : public ResourceBase {
     DCHECK(lib_ != nullptr);
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("MultiDeviceIterator for ", devices_.size(),
                            " devices");
   }
@@ -81,9 +81,8 @@ class MultiDeviceIterator : public ResourceBase {
     ++incarnation_id_;
     *incarnation_id = incarnation_id_;
 
-    multi_device_buffer_.reset(
-        new MultiDeviceBuffer(devices_.size(), max_buffer_size, incarnation_id_,
-                              std::move(iterator)));
+    multi_device_buffer_ = absl::make_unique<MultiDeviceBuffer>(
+        devices_.size(), max_buffer_size, incarnation_id_, std::move(iterator));
     return Status::OK();
   }
 
@@ -152,7 +151,7 @@ class MultiDeviceIterator : public ResourceBase {
     void Reset() LOCKS_EXCLUDED(mu_) {
       {
         mutex_lock l(mu_);
-        if (!background_thread_finished_) {
+        if (background_thread_ && !background_thread_finished_) {
           cancelled_ = true;
           // Wake up the background thread.
           for (int i = 0; i < size_; ++i) {
@@ -217,10 +216,11 @@ class MultiDeviceIterator : public ResourceBase {
     void EnsureBackgroundThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!background_thread_) {
-        background_thread_.reset(ctx->env()->StartThread(
+        auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
+        background_thread_ = absl::WrapUnique<Thread>(ctx->env()->StartThread(
             {}, "tf_data_multi_device_iterator",
             std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread,
-                      this, new IteratorContext(*ctx))));
+                      this, std::move(ctx_copy))));
       }
     }
 
@@ -258,12 +258,11 @@ class MultiDeviceIterator : public ResourceBase {
       }
     }
 
-    void BackgroundThread(IteratorContext* ctx) {
+    void BackgroundThread(std::shared_ptr<IteratorContext> ctx) {
       {
         mutex_lock l(mu_);
         background_thread_started_ = true;
       }
-      std::unique_ptr<IteratorContext> cleanup(ctx);
       int shard_to_fetch = 0;
       while (true) {
         HostBufferElement elem;
@@ -284,8 +283,8 @@ class MultiDeviceIterator : public ResourceBase {
           }
         }
 
-        elem.status =
-            host_iterator_->GetNext(ctx, &elem.value, &elem.end_of_sequence);
+        elem.status = host_iterator_->GetNext(ctx.get(), &elem.value,
+                                              &elem.end_of_sequence);
 
         if (elem.status.ok() && elem.end_of_sequence) {
           end_of_iterator = true;
@@ -397,8 +396,8 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
         std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
         OP_REQUIRES_OK(context, context->function_library()->Clone(
                                     &flib_def, &pflr, &lib));
-        std::unique_ptr<FunctionHandleCache> function_handle_cache(
-            new FunctionHandleCache(lib));
+        std::unique_ptr<FunctionHandleCache> function_handle_cache =
+            absl::make_unique<FunctionHandleCache>(lib);
         ResourceMgr* mgr = context->resource_manager();
         OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
 
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 9c50d8050a82397f1578ab3f577ef5ad77f81767..6047dc5f3f46fa20878825417bac1a06aacd7c15 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -40,6 +40,8 @@ namespace tensorflow {
 namespace data {
 namespace {
 
+constexpr char kOptimizerName[] = "tf_data_meta_optimizer";
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class OptimizeDatasetOp : public UnaryDatasetOpKernel {
@@ -97,7 +99,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       // prefix is used to identify checkpoint elements and since the
       // optimization dataset is excluded from the checkpoint, adding a token
       // here would result in invalid checkpoint identifiers.
-      return std::unique_ptr<IteratorBase>(new Iterator({this, prefix}));
+      return absl::make_unique<Iterator>(Iterator::Params{this, prefix});
     }
 
     Status Optimize(OpKernelContext* ctx) {
@@ -127,7 +129,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
           ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
 
       // Create a FunctionHandleCache.
-      function_handle_cache_.reset(new FunctionHandleCache(lib_));
+      function_handle_cache_ = absl::make_unique<FunctionHandleCache>(lib_);
 
       // Some functions may have been modified without having their names
       // changed (for example, nested dataset graphs from FlatMap or
@@ -286,31 +288,6 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
 
       // Create Grappler item.
-      tensorflow::ConfigProto config;
-      RewriterConfig& rewriter_config =
-          *config.mutable_graph_options()->mutable_rewrite_options();
-      for (const string& optimization : optimizations_) {
-        rewriter_config.add_optimizers(optimization);
-      }
-      // If no optimizations were specified, supply a non-existent
-      // optimization to prevent Grappler from applying the default set of
-      // optimizations as some of them do not work out of the box at the
-      // moment (e.g. because we have no cost model for dataset ops).
-      if (optimizations_.empty()) {
-        rewriter_config.add_optimizers("non-existent");
-      } else {
-        // If we apply custom dataset optimizers, explicitly trigger a subset of
-        // standard grappler optimizations to further optimize modified dataset
-        // graphs (e.g. performing constant folding on merged functions,
-        // removing unused graph nodes)
-        // TODO(b/118175421): This should be part of the tf.data optimization
-        // pass manager.
-        // TODO(b/120437209): Apply `constfold` optimization when it is fixed.
-        for (const auto& optimizer :
-             {"pruning", "function", "shape", "arithmetic", "dependency"}) {
-          rewriter_config.add_optimizers(optimizer);
-        }
-      }
       tensorflow::grappler::ItemConfig item_config;
       item_config.apply_optimizations = true;
       std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
@@ -319,13 +296,22 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       std::unordered_map<string, tensorflow::DeviceProperties> device_map;
       tensorflow::grappler::VirtualCluster cluster(device_map);
 
-      // Run optimizer.
-      if (VLOG_IS_ON(2)) {
-        LOG(INFO) << "Performing the following optimizations:";
-        for (const string& optimization : optimizations_) {
-          LOG(INFO) << "  " << optimization;
-        }
+      // Run data optimizer using grappler's meta optimizer.
+      tensorflow::ConfigProto config;
+      RewriterConfig& rewriter_config =
+          *config.mutable_graph_options()->mutable_rewrite_options();
+      rewriter_config.add_optimizers(kOptimizerName);
+      rewriter_config.set_meta_optimizer_iterations(
+          RewriterConfig_NumIterationsType_ONE);
+      auto custom_optimizer = rewriter_config.add_custom_optimizers();
+      custom_optimizer->set_name(kOptimizerName);
+      auto* custom_optimizations_list =
+          (*custom_optimizer->mutable_parameter_map())["optimizers"]
+              .mutable_list();
+      for (const auto& opt : optimizations_) {
+        custom_optimizations_list->add_s(opt);
       }
+
       TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
           *grappler_item, config, ctx->device(), &cluster, graph_def));
 
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index a406f7467fe1a1d221ee1d5bd9b2e858fb0044d3..6590c7ef9b6b9a795387273ebec33f531625d7d0 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -221,12 +221,5 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
                                           OptionalVariant,
                                           OptionalBinaryAdd<CPUDevice>);
 
-Status OptionalShape(const OptionalVariant& x, TensorShape* s) {
-  *s = TensorShape({});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(OptionalVariant, OptionalShape);
-
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h
index ef14e843115da0c37d79c6be13b8064c78c072d5..7089a423d7302decd2e13a6496307e7520e88066 100644
--- a/tensorflow/core/kernels/data/optional_ops.h
+++ b/tensorflow/core/kernels/data/optional_ops.h
@@ -44,8 +44,9 @@ class OptionalVariant {
 
   // Create an `OptionalVariant` with the actual value given by the tuple of
   // tensors in `values`.
-  explicit OptionalVariant(std::vector<Tensor> values)
-      : values_(new std::vector<Tensor>(std::move(values))) {}
+  explicit OptionalVariant(std::vector<Tensor> values) {
+    values_ = std::make_shared<std::vector<Tensor>>(std::move(values));
+  }
 
   OptionalVariant(const OptionalVariant& other) : values_(other.values_) {}
 
@@ -79,7 +80,7 @@ class OptionalVariant {
       return false;
     }
     if (has_value) {
-      values_.reset(new std::vector<Tensor>(data.tensors()));
+      values_ = std::make_shared<std::vector<Tensor>>(data.tensors());
     } else {
       values_.reset();
     }
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 0fff4c53706269538f770889744e21fffcae3601..41ea36263c7e6a8bc0190d84247612e591f9d1b9 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -135,8 +135,8 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::PaddedBatch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::PaddedBatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index fda7ae0cbba492fb2c3841ed7bfb3e9dd3519483..ddd81d4596ee216c1abd6a17ec94d86c3d41e18c 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -14,24 +14,29 @@ limitations under the License.
 ==============================================================================*/
 #include <atomic>
 #include <deque>
+#include <memory>
 #include <utility>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
+constexpr char kDatasetName[] = "ParallelInterleaveV2";
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
@@ -43,12 +48,7 @@ namespace {
 //
 // Furthermore, this class favors modularity over extended functionality. In
 // particular, it refrains from implementing configurable buffering of output
-// elements and prefetching of input iterators, relying on other parts of
-// tf.data to provide this functionality if necessary.
-//
-// The above design choices were made with automated optimizations in mind,
-// isolating the degree of parallelism as the single tunable knob of this
-// implementation.
+// elements and prefetching of input iterators.
 class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
@@ -90,6 +90,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
                                       &captured_func));
 
+    if (num_parallel_calls == model::kAutoTune) {
+      metrics::RecordTFDataAutotune(kDatasetName);
+    }
+
     *output =
         new Dataset(ctx, input, interleave_func_, std::move(captured_func),
                     cycle_length, block_length, num_parallel_calls, sloppy_,
@@ -122,9 +126,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<ParallelInterleaveIterator>(
+      return absl::make_unique<ParallelInterleaveIterator>(
           ParallelInterleaveIterator::Params{
-              this, strings::StrCat(prefix, "::ParallelInterleaveV2")},
+              this, strings::StrCat(prefix, "::", kDatasetName)},
           sloppy_);
     }
 
@@ -194,24 +198,22 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
    private:
     class ParallelInterleaveIterator : public DatasetIterator<Dataset> {
      public:
-      explicit ParallelInterleaveIterator(const Params& params, bool sloppy)
+      ParallelInterleaveIterator(const Params& params, bool sloppy)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
                 params.dataset->num_parallel_calls_, mu_, cond_var_)),
             sloppy_(sloppy),
-            args_list_(params.dataset->cycle_length_),
             current_elements_(params.dataset->cycle_length_),
-            element_in_use_(params.dataset->cycle_length_, false),
-            thread_pool_(new thread::ThreadPool(
+            thread_pool_(absl::make_unique<thread::ThreadPool>(
                 Env::Default(), ThreadOptions(),
                 "data_parallel_interleave_worker_pool",
-                dataset()->cycle_length_ /* num_threads */,
+                port::NumSchedulableCPUs() /* num_threads */,
                 false /* low_latency_hint */)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
+        key_prefix_ = components.back();
       }
 
       ~ParallelInterleaveIterator() override {
@@ -239,27 +241,20 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        std::shared_ptr<InvocationResult> result;
-        do {
-          result.reset();
-          {
-            mutex_lock l(*mu_);
-            EnsureRunnerThreadStarted(ctx);
-            while (ShouldWait(&result)) {
-              RecordStop(ctx);
-              cond_var_->wait(l);
-              RecordStart(ctx);
-            }
-            if (!result) {
-              *end_of_sequence = true;
-              return Status::OK();
-            }
+        std::shared_ptr<Result> result;
+        {
+          mutex_lock l(*mu_);
+          EnsureThreadsStarted(ctx);
+          while (!Consume(&result)) {
+            RecordStop(ctx);
+            cond_var_->wait(l);
+            RecordStart(ctx);
           }
-          RecordStop(ctx);
-          result->notification.WaitForNotification();
-          RecordStart(ctx);
-        } while (result->skip);
-
+        }
+        if (!result) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         if (result->status.ok()) {
           *out_tensors = std::move(result->return_values);
           RecordBufferDequeue(ctx, *out_tensors);
@@ -274,7 +269,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         return model::MakeAsyncInterleaveManyNode(
             std::move(args),
             {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                                  /*max=*/dataset()->cycle_length_)});
+                                  /*max=*/port::NumSchedulableCPUs())});
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
@@ -283,37 +278,22 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         while (num_calls_ > 0) {
           cond_var_->wait(l);
         }
-        CHECK_EQ(num_calls_, 0);
+        DCHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name("invocation_results.size"), invocation_results_.size()));
-        for (size_t i = 0; i < invocation_results_.size(); i++) {
-          std::shared_ptr<InvocationResult> result = invocation_results_[i];
-          TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, result->status));
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("invocation_results[", i, "].size")),
-              result->return_values.size()));
-          for (size_t j = 0; j < result->return_values.size(); j++) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(
-                    strings::StrCat("invocation_results[", i, "][", j, "]")),
-                result->return_values[j]));
-          }
-          if (result->skip) {
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("invocation_results[", i, "].skip")),
-                ""));
-          }
-        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_index"), block_index_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("cycle_index"), cycle_index_));
         if (end_of_input_) {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("end_of_input"), ""));
         }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("element_id_counter"),
+                                               element_id_counter_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("num_open"), num_open_));
         TF_RETURN_IF_ERROR(WriteCurrentElements(writer));
+        TF_RETURN_IF_ERROR(WriteFutureElements(writer));
         return Status::OK();
       }
 
@@ -321,258 +301,393 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                              IteratorStateReader* reader) override {
         mutex_lock l(*mu_);
         TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        int64 invocation_results_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name("invocation_results.size"), &invocation_results_size));
-        for (size_t i = 0; i < invocation_results_size; i++) {
-          std::shared_ptr<InvocationResult> result(new InvocationResult());
-          invocation_results_.push_back(result);
-          TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &result->status));
-          size_t num_return_values;
-          {
-            int64 size;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("invocation_results[", i, "].size")),
-                &size));
-            num_return_values = static_cast<size_t>(size);
-            if (num_return_values != size) {
-              return errors::InvalidArgument(strings::StrCat(
-                  full_name(
-                      strings::StrCat("invocation_results[", i, "].size")),
-                  ": ", size, " is not a valid value of type size_t."));
-            }
-          }
-          result->return_values.reserve(num_return_values);
-          for (size_t j = 0; j < num_return_values; j++) {
-            result->return_values.emplace_back();
-            TF_RETURN_IF_ERROR(
-                reader->ReadTensor(full_name(strings::StrCat(
-                                       "invocation_results[", i, "][", j, "]")),
-                                   &result->return_values.back()));
-          }
-          result->skip = reader->Contains(
-              full_name(strings::StrCat("invocation_results[", i, "].skip")));
-          result->notification.Notify();
-        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("block_index"), &block_index_));
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("cycle_index"), &cycle_index_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("element_id_counter"),
+                                              &element_id_counter_));
         if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("num_open"), &num_open_));
         TF_RETURN_IF_ERROR(ReadCurrentElements(ctx, reader));
+        TF_RETURN_IF_ERROR(ReadFutureElements(ctx, reader));
         return Status::OK();
       }
 
      private:
-      struct InvocationResult {
-        Notification notification;  // used for coordination with the consumer
-        Status status;              // the invocation status
-        std::vector<Tensor> return_values;  // the invocation result values
-        bool skip;  // if set the result should be skipped
+      // Represents the result of fetching an element from a dataset.
+      struct Result {
+        Status status;
+        std::vector<Tensor> return_values;
+        // Indicates whether the result is ready to be consumed.
+        bool is_ready = false;
+      };
+
+      // The interleave transformation repeatedly inputs elements, applies the
+      // user-provided function to transform the input elements to datasets, and
+      // interleaves the elements of these datasets as its output.
+      //
+      // This structure represents an input element and derived state.
+      struct Element {
+        // Unique identifier, needed to support checkpointing.
+        int64 id;
+        // The actual input element.
+        std::vector<Tensor> inputs;
+        // Iterator created from the input element.
+        std::unique_ptr<IteratorBase> iterator;
+        mutex mu;
+        // Buffer for storing the outputs of `iterator`.
+        std::deque<std::shared_ptr<Result>> results GUARDED_BY(mu);
+        // Indicates whether the element is used by a worker thread.
+        bool in_use = false;
       };
 
-      void EnsureRunnerThreadStarted(IteratorContext* ctx)
+      // Advances the position in the interleave cycle to the next cycle
+      // element.
+      void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        block_index_ = 0;
+        cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+      }
+
+      // Advances the position in the interleave cycle by one.
+      void AdvancePosition() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        ++block_index_;
+        if (block_index_ == dataset()->block_length_) {
+          AdvanceToNextInCycle();
+        }
+      }
+
+      // Consumes a result (if available), returning an indication of whether
+      // a result is available. If `true` is returned, `result` either
+      // points to a valid result or is null if end of input has been reached.
+      bool Consume(std::shared_ptr<Result>* result)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (!runner_thread_) {
-          std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "tf_data_parallel_interleave_runner",
-              [this, new_ctx]() { RunnerThread(new_ctx); }));
+        if (!sloppy_) {
+          return ConsumeHelper(result);
+        }
+        // If we are allowed to be sloppy (i.e. return results out of order),
+        // try to find an element in the cycle that has a result available.
+        for (int i = 0; i < dataset()->cycle_length_; ++i) {
+          if (ConsumeHelper(result)) {
+            return true;
+          }
+          AdvanceToNextInCycle();
         }
+        return false;
       }
 
-      // Fetches up to `results.size()` outputs from the cycle element at
-      // position `cycle_index`.
+      bool ConsumeHelper(std::shared_ptr<Result>* result)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        while (true) {
+          std::shared_ptr<Element> element = current_elements_[cycle_index_];
+          if (element) {
+            mutex_lock l(element->mu);
+            if (!element->results.empty()) {
+              if (element->results.front()->is_ready) {
+                // We found a result.
+                std::swap(*result, element->results.front());
+                element->results.pop_front();
+                AdvancePosition();
+                cond_var_->notify_all();
+                return true;
+              } else {
+                // Wait for the result to become ready.
+                return false;
+              }
+            } else if (!element->iterator) {
+              // We reached the end of input for this element. Reset
+              // it and move on to the next cycle element.
+              current_elements_[cycle_index_].reset();
+              AdvanceToNextInCycle();
+              cond_var_->notify_all();
+              continue;
+            } else {
+              // Wait for the iterator to produce a result.
+              return false;
+            }
+          } else {
+            if (!future_elements_.empty() || !end_of_input_) {
+              // Wait for an element to be created.
+              return false;
+            }
+            // No new elements will be created; try to find a
+            // non-empty element in the cycle.
+            for (int i = 0; i < dataset()->cycle_length_; ++i) {
+              AdvanceToNextInCycle();
+              if (current_elements_[cycle_index_]) {
+                break;
+              }
+            }
+            if (current_elements_[cycle_index_]) {
+              continue;
+            }
+            // End of input has been reached.
+            return true;
+          }
+        }
+      }
+
+      // Manages current cycle elements, creating new iterators as needed and
+      // asynchronously fetching results from existing iterators.
       //
-      // If end of input is encountered, the `skip` field of the invocation
-      // result is used to identify results that should be skipped.
-      void FetchOutputs(
-          const std::shared_ptr<IteratorContext>& ctx, int64 cycle_index,
-          const std::vector<std::shared_ptr<InvocationResult>>& results)
-          LOCKS_EXCLUDED(*mu_) {
+      // This method runs in the `current_elements_manager_` background thread.
+      void CurrentElementsManager(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
-        bool end_of_input = false;
-        for (auto& result : results) {
-          if (!end_of_input) {
-            result->status = current_elements_[cycle_index]->GetNext(
-                ctx.get(), &result->return_values, &end_of_input);
+        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
+          const bool has_more_elements =
+              !future_elements_.empty() || !end_of_input_;
+          const int block_length = dataset()->block_length_;
+          bool all_elements_busy = true;
+          for (auto& element : current_elements_) {
+            if (!element) {
+              if (has_more_elements) {
+                all_elements_busy = false;
+                break;
+              }
+            } else {
+              mutex_lock l(element->mu);
+              if (!element->in_use && element->iterator &&
+                  element->results.size() < block_length) {
+                all_elements_busy = false;
+                break;
+              }
+            }
           }
-          if (end_of_input) {
-            result->skip = true;
+          return all_elements_busy || num_calls_ >= num_parallel_calls_->value;
+        };
+        while (true) {
+          mutex_lock l(*mu_);
+
+          // Wait until this thread is cancelled, the end of input has been
+          // reached.
+          while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
+            RecordStop(ctx.get());
+            cond_var_->wait(l);
+            RecordStart(ctx.get());
           }
-          RecordBufferEnqueue(ctx.get(), result->return_values);
-          {
-            mutex_lock l(*mu_);
-            result->notification.Notify();
-            cond_var_->notify_all();
+
+          if (cancelled_ ||
+              (future_elements_.empty() && end_of_input_ && num_open_ == 0)) {
+            return;
           }
-          if (!result->status.ok()) {
+
+          for (int i = 0; i < dataset()->cycle_length_; ++i) {
+            int idx = (cycle_index_ + i) % dataset()->cycle_length_;
+            if (!current_elements_[idx]) {
+              if (!future_elements_.empty()) {
+                current_elements_[idx] = std::move(future_elements_.back());
+                future_elements_.pop_back();
+              } else {
+                current_elements_[idx] = MakeElement(ctx);
+                if (!current_elements_[idx]) {
+                  continue;
+                }
+              }
+            }
+            std::shared_ptr<Element> element = current_elements_[idx];
+            if (!element->in_use && element->iterator) {
+              int64 num_results;
+              {
+                mutex_lock l(element->mu);
+                num_results =
+                    dataset()->block_length_ - element->results.size();
+              }
+              if (num_results > 0) {
+                num_calls_++;
+                element->in_use = true;
+                thread_pool_->Schedule(
+                    std::bind(&ParallelInterleaveIterator::FetchResults, this,
+                              ctx, std::move(element), num_results));
+              }
+            }
+          }
+          const auto& stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            stats_aggregator->AddScalar(
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
+          }
+          cond_var_->notify_all();
+        }
+      }
+
+      void EnsureThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!current_elements_manager_) {
+          auto new_ctx = std::make_shared<IteratorContext>(*ctx);
+          current_elements_manager_ =
+              absl::WrapUnique<Thread>(ctx->env()->StartThread(
+                  {}, "tf_data_parallel_interleave_current",
+                  [this, new_ctx]() { CurrentElementsManager(new_ctx); }));
+        }
+        if (!future_elements_manager_) {
+          auto new_ctx = std::make_shared<IteratorContext>(*ctx);
+          future_elements_manager_ =
+              absl::WrapUnique<Thread>(ctx->env()->StartThread(
+                  {}, "tf_data_parallel_interleave_future",
+                  [this, new_ctx]() { FutureElementsManager(new_ctx); }));
+        }
+      }
+
+      // Fetches up to `dataset()->block_length_` results from `element`.
+      void FetchResults(const std::shared_ptr<IteratorContext>& ctx,
+                        const std::shared_ptr<Element>& element,
+                        int64 num_results) LOCKS_EXCLUDED(*mu_) {
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
+        bool end_of_input = false;
+        for (int64 i = 0; i < num_results; ++i) {
+          auto result = std::make_shared<Result>();
+          result->status = element->iterator->GetNext(
+              ctx.get(), &result->return_values, &end_of_input);
+          if (end_of_input) {
             break;
           }
+          RecordBufferEnqueue(ctx.get(), result->return_values);
+          mutex_lock l(*mu_);
+          mutex_lock l2(element->mu);
+          element->results.push_back(result);
+          result->is_ready = true;
+          cond_var_->notify_all();
         }
 
-        // Release the ownership of the cycle element iterator, closing the
-        // iterator if end of input was encountered.
+        mutex_lock l(*mu_);
+        // Release the ownership of the cycle element iterator.
+        element->in_use = false;
         if (end_of_input) {
-          current_elements_[cycle_index].reset();
+          // Close the iterator if end of input was encountered.
+          element->iterator.reset();
+          element->inputs.clear();
+          --num_open_;
         }
-        mutex_lock l(*mu_);
-        element_in_use_[cycle_index] = false;
-        num_calls_--;
+        --num_calls_;
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
-        }
-        if (end_of_input) {
-          args_list_[cycle_index].clear();
-          num_open_--;
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
 
-      // Method responsible for 1) creating iterators out of input elements, 2)
-      // determining the order in which elements are fetched from the iterators,
-      // and 3) scheduling the fetching of the elements to a threadpool.
+      // Manages futures cycle elements, creating new iterators as needed and
+      // asynchronously fetching results from existing iterators.
       //
-      // This method runs in the `runner_thread` background thread.
-      void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) {
+      // This method runs in the `future_elements_manager_` background thread.
+      void FutureElementsManager(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
         auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
-          return element_in_use_[cycle_index_] ||
-                 num_calls_ >= num_parallel_calls_->value ||
-                 invocation_results_.size() >=
-                     dataset()->cycle_length_ * dataset()->block_length_;
+          // TODO(jsimsa): Autotune the buffer size.
+          return num_calls_ >= num_parallel_calls_->value ||
+                 future_elements_.size() >= 2 * dataset()->cycle_length_;
         };
         while (true) {
           mutex_lock l(*mu_);
+
           // Wait until this thread is cancelled, the end of input has been
           // reached, or the cycle element at the `cycle_index_` position is
-          // not in use and there is space in the `invocation_results_` queue.
-          while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
+          // not in use.
+          while (!cancelled_ && !end_of_input_ && busy()) {
             RecordStop(ctx.get());
             cond_var_->wait(l);
             RecordStart(ctx.get());
           }
 
-          if (cancelled_ || (end_of_input_ && num_open_ == 0)) {
+          if (cancelled_ || end_of_input_) {
             return;
           }
 
-          while ((!end_of_input_ || num_open_ > 0) && !busy()) {
-            if (!current_elements_[cycle_index_]) {
-              // Try to create a new iterator from the next input element.
-              Status status = input_impl_->GetNext(
-                  ctx.get(), &args_list_[cycle_index_], &end_of_input_);
-              if (!status.ok()) {
-                invocation_results_.emplace_back(new InvocationResult());
-                std::shared_ptr<InvocationResult>& result =
-                    invocation_results_.back();
-                result->status.Update(status);
-                result->notification.Notify();
-                break;
-              }
-              if (!end_of_input_) {
-                Status status = MakeIteratorFromInputElement(
-                    ctx.get(), args_list_[cycle_index_], cycle_index_,
-                    *instantiated_captured_func_, prefix(),
-                    &current_elements_[cycle_index_]);
-                if (!status.ok()) {
-                  invocation_results_.emplace_back(new InvocationResult());
-                  std::shared_ptr<InvocationResult>& result =
-                      invocation_results_.back();
-                  result->status.Update(status);
-                  result->notification.Notify();
-                  break;
-                }
-                ++num_open_;
-              }
+          while (!end_of_input_ && !busy()) {
+            std::shared_ptr<Element> element = MakeElement(ctx);
+            if (!element) {
+              break;
             }
-            if (current_elements_[cycle_index_]) {
-              // Pre-allocate invocation results for outputs to be fetched
-              // and then fetch the outputs asynchronously.
-              std::vector<std::shared_ptr<InvocationResult>> results;
-              results.reserve(dataset()->block_length_);
-              for (int i = 0; i < dataset()->block_length_; ++i) {
-                invocation_results_.emplace_back(new InvocationResult());
-                results.push_back(invocation_results_.back());
-              }
-              num_calls_++;
-              element_in_use_[cycle_index_] = true;
-              thread_pool_->Schedule(
-                  std::bind(&ParallelInterleaveIterator::FetchOutputs, this,
-                            ctx, cycle_index_, std::move(results)));
+            future_elements_.push_front(element);
+            if (!element->iterator) {
+              continue;
             }
-            cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+            ++num_calls_;
+            element->in_use = true;
+            thread_pool_->Schedule(
+                std::bind(&ParallelInterleaveIterator::FetchResults, this, ctx,
+                          std::move(element), dataset()->block_length_));
           }
           const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
-            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-            // monitoring code or as histogram at fixed time intervals.
-            stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::active_parallel_calls"),
-                static_cast<float>(num_calls_));
             stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::num_parallel_calls"),
-                static_cast<float>(num_parallel_calls_->value));
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
           cond_var_->notify_all();
         }
       }
 
-      // Determines whether the caller needs to wait for a result. Upon
-      // returning false, `result` will either be NULL if end of input has been
-      // reached or point to the result.
-      bool ShouldWait(std::shared_ptr<InvocationResult>* result)
+      // Creates a new element.
+      std::shared_ptr<Element> MakeElement(
+          const std::shared_ptr<IteratorContext>& ctx)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (sloppy_) {
-          for (auto it = invocation_results_.begin();
-               it != invocation_results_.end(); ++it) {
-            if ((*it)->notification.HasBeenNotified()) {
-              std::swap(*result, *it);
-              invocation_results_.erase(it);
-              cond_var_->notify_all();
-              return false;
-            }
+        auto element = std::make_shared<Element>();
+        element->id = element_id_counter_++;
+        Status status =
+            input_impl_->GetNext(ctx.get(), &element->inputs, &end_of_input_);
+        if (!status.ok()) {
+          auto result = std::make_shared<Result>();
+          result->is_ready = true;
+          result->status = status;
+          mutex_lock l(element->mu);
+          element->results.push_back(std::move(result));
+          return element;
+        }
+        if (!end_of_input_) {
+          Status status = MakeIteratorFromInputElement(
+              ctx.get(), element->inputs, element->id,
+              *instantiated_captured_func_, prefix(), &element->iterator);
+          if (!status.ok()) {
+            auto result = std::make_shared<Result>();
+            result->is_ready = true;
+            result->status = status;
+            mutex_lock l(element->mu);
+            element->results.push_back(std::move(result));
+            return element;
           }
-          return !invocation_results_.empty() ||
-                 (!end_of_input_ || num_open_ > 0);
+          ++num_open_;
         } else {
-          if (!invocation_results_.empty()) {
-            std::swap(*result, invocation_results_.front());
-            invocation_results_.pop_front();
-            cond_var_->notify_all();
-            return false;
-          }
-          return (!end_of_input_ || num_open_ > 0);
+          element.reset();
         }
+        return element;
       }
 
-      Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& key_prefix, size_t idx,
                                const Status& status)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         TF_RETURN_IF_ERROR(writer->WriteScalar(
-            CodeKey(index), static_cast<int64>(status.code())));
+            CodeKey(key_prefix, idx), static_cast<int64>(status.code())));
         if (!status.ok()) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
-                                                 status.error_message()));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              ErrorMessageKey(key_prefix, idx), status.error_message()));
         }
         return Status::OK();
       }
 
-      Status ReadStatusLocked(IteratorStateReader* reader, size_t index,
+      Status ReadStatusLocked(IteratorStateReader* reader,
+                              const string& key_prefix, size_t idx,
                               Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(CodeKey(key_prefix, idx), &code_int));
         error::Code code = static_cast<error::Code>(code_int);
 
         if (code != error::Code::OK) {
           string error_message;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(ErrorMessageKey(index), &error_message));
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              ErrorMessageKey(key_prefix, idx), &error_message));
           *status = Status(code, error_message);
         } else {
           *status = Status::OK();
@@ -580,58 +695,178 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      string CodeKey(size_t index) {
+      string CodeKey(const string& key_prefix, size_t idx) {
         return full_name(
-            strings::StrCat("invocation_results[", index, "].code"));
+            strings::StrCat(key_prefix, ".results[", idx, "].code"));
       }
 
-      string ErrorMessageKey(size_t index) {
+      string ErrorMessageKey(const string& key_prefix, size_t idx) {
         return full_name(
-            strings::StrCat("invocation_results[", index, "].error_message"));
+            strings::StrCat(key_prefix, ".results[", idx, "].error_message"));
+      }
+
+      Status WriteElement(std::shared_ptr<Element> element, int idx,
+                          const string& key_prefix, IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (element->iterator) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, element->iterator));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].id")),
+              element->id));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].inputs.size")),
+              element->inputs.size()));
+          for (int i = 0; i < element->inputs.size(); i++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(
+                    strings::StrCat(key_prefix, "[", idx, "].inputs[", i, "]")),
+                element->inputs[i]));
+          }
+        }
+        mutex_lock l(element->mu);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].results.size")),
+            element->results.size()));
+        for (size_t i = 0; i < element->results.size(); i++) {
+          std::shared_ptr<Result> result = element->results[i];
+          TF_RETURN_IF_ERROR(WriteStatusLocked(
+              writer, strings::StrCat(key_prefix, "[", idx, "]"), i,
+              result->status));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                        "].size")),
+              result->return_values.size()));
+          for (size_t j = 0; j < result->return_values.size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "][", j, "]")),
+                result->return_values[j]));
+          }
+          if (result->is_ready) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "].is_ready")),
+                ""));
+          }
+        }
+        return Status::OK();
       }
 
       Status WriteCurrentElements(IteratorStateWriter* writer)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("current_elements.size"), current_elements_.size()));
         for (int idx = 0; idx < current_elements_.size(); idx++) {
           if (current_elements_[idx]) {
-            TF_RETURN_IF_ERROR(SaveInput(writer, current_elements_[idx]));
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("args_size[", idx, "]")),
-                args_list_[idx].size()));
-            for (int i = 0; i < args_list_[idx].size(); i++) {
-              TF_RETURN_IF_ERROR(writer->WriteTensor(
-                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
-                  args_list_[idx][i]));
-            }
+            TF_RETURN_IF_ERROR(WriteElement(current_elements_[idx], idx,
+                                            "current_elements", writer));
           }
         }
         return Status::OK();
       }
 
+      Status WriteFutureElements(IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("future_elements.size"), future_elements_.size()));
+        for (int idx = 0; idx < future_elements_.size(); idx++) {
+          if (future_elements_[idx]) {
+            TF_RETURN_IF_ERROR(WriteElement(future_elements_[idx], idx,
+                                            "future_elements", writer));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status ReadElement(IteratorContext* ctx, IteratorStateReader* reader,
+                         int idx, const string& key_prefix,
+                         std::shared_ptr<Element>* out)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!reader->Contains(full_name(
+                strings::StrCat(key_prefix, "[", idx, "].results.size")))) {
+          return Status::OK();
+        }
+        auto element = std::make_shared<Element>();
+        mutex_lock l(element->mu);
+        int64 results_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].results.size")),
+            &results_size));
+        element->results.resize(results_size);
+        for (size_t i = 0; i < results_size; i++) {
+          auto result = std::make_shared<Result>();
+          TF_RETURN_IF_ERROR(ReadStatusLocked(
+              reader, strings::StrCat(key_prefix, "[", idx, "]"), i,
+              &result->status));
+          int64 num_return_values;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                        "].size")),
+              &num_return_values));
+          result->return_values.reserve(num_return_values);
+          for (size_t j = 0; j < num_return_values; j++) {
+            result->return_values.emplace_back();
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "][", j, "]")),
+                &result->return_values.back()));
+          }
+          result->is_ready = reader->Contains(full_name(strings::StrCat(
+              key_prefix, "[", idx, "].results[", i, "].is_ready")));
+          element->results[i] = std::move(result);
+        }
+        if (!reader->Contains(full_name(
+                strings::StrCat(key_prefix, "[", idx, "].inputs.size")))) {
+          element->iterator.reset();
+          *out = std::move(element);
+          return Status::OK();
+        }
+        int64 inputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].inputs.size")),
+            &inputs_size));
+        element->inputs.resize(inputs_size);
+        for (int i = 0; i < inputs_size; i++) {
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(
+                  strings::StrCat(key_prefix, "[", idx, "].inputs[", i, "]")),
+              &element->inputs[i]));
+        }
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].id")),
+            &element->id));
+        TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
+            ctx, element->inputs, element->id,
+            *instantiated_captured_func_.get(), prefix(), &element->iterator));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, element->iterator));
+        *out = std::move(element);
+        return Status::OK();
+      }
+
       Status ReadCurrentElements(IteratorContext* ctx,
                                  IteratorStateReader* reader)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_elements.size"), &size));
+        DCHECK_EQ(current_elements_.size(), size);
         for (int idx = 0; idx < current_elements_.size(); idx++) {
-          if (reader->Contains(
-                  full_name(strings::StrCat("args_size[", idx, "]")))) {
-            int64 args_size;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("args_size[", idx, "]")),
-                &args_size));
-            args_list_[idx].resize(args_size);
-            for (int i = 0; i < args_size; i++) {
-              TF_RETURN_IF_ERROR(reader->ReadTensor(
-                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
-                  &args_list_[idx][i]));
-            }
-            TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, args_list_[idx], idx, *instantiated_captured_func_.get(),
-                prefix(), &current_elements_[idx]));
-            TF_RETURN_IF_ERROR(
-                RestoreInput(ctx, reader, current_elements_[idx]));
-          } else {
-            current_elements_[idx].reset();
-          }
+          TF_RETURN_IF_ERROR(ReadElement(ctx, reader, idx, "current_elements",
+                                         &current_elements_[idx]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadFutureElements(IteratorContext* ctx,
+                                IteratorStateReader* reader)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("future_elements.size"), &size));
+        future_elements_.resize(size);
+        for (int idx = 0; idx < future_elements_.size(); idx++) {
+          TF_RETURN_IF_ERROR(ReadElement(ctx, reader, idx, "future_elements",
+                                         &future_elements_[idx]));
         }
         return Status::OK();
       }
@@ -640,12 +875,11 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // the worker threads.
       const std::shared_ptr<mutex> mu_;
 
-      // Used for coordination between the main thread, the runner thread, and
-      // the worker threads. In particular, the runner thread should only
-      // schedule new calls when the number of in-flight calls is less than the
-      // user specified level of parallelism, there are slots available in the
-      // `invocation_results_` buffer, the current cycle element is not in use,
-      // and there are elements left to be fetched.
+      // Used for coordination between the main thread, the manager threads, and
+      // the threadpool threads. In particular, the managers thread should only
+      // schedule new calls into the threadpool when the number of in-flight
+      // calls is less than the user specified level of parallelism and there
+      // are slots available in the element `results` buffer.
       const std::shared_ptr<condition_variable> cond_var_;
 
       // Identifies the maximum number of parallel calls.
@@ -657,24 +891,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // Iterator for input elements.
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(*mu_);
 
-      // Identifies current cycle element.
-      int64 cycle_index_ = 0;
-
-      // Arguments for creating an iterator for cycle elements.
-      std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(*mu_);
-
-      // Iterators for the current cycle elements. Concurrent access is
-      // protected by `element_in_use_`.
-      std::vector<std::unique_ptr<IteratorBase>> current_elements_;
+      // Identifies position in the interleave cycle.
+      int64 block_index_ GUARDED_BY(*mu_) = 0;
+      int64 cycle_index_ GUARDED_BY(*mu_) = 0;
 
-      // Identifies cycle elements that are in use by worker threads.
-      std::vector<bool> element_in_use_ GUARDED_BY(*mu_);
+      // Elements of the current interleave cycle.
+      std::vector<std::shared_ptr<Element>> current_elements_ GUARDED_BY(*mu_);
 
-      // Buffer for storing the invocation results.
-      std::deque<std::shared_ptr<InvocationResult>> invocation_results_
-          GUARDED_BY(*mu_);
+      // Elements to be used in the interleave cycle in the future.
+      std::deque<std::shared_ptr<Element>> future_elements_ GUARDED_BY(*mu_);
 
-      // Identifies whether end of input has been reached.
+      // Identifies whether the global end of input has been reached.
       bool end_of_input_ GUARDED_BY(*mu_) = false;
 
       // Identifies the number of open iterators.
@@ -684,11 +911,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> current_elements_manager_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> future_elements_manager_ GUARDED_BY(*mu_);
+      int64 element_id_counter_ GUARDED_BY(*mu_) = 0;
 
-      // Identifies whether background activity should be cancelled.
+      // Identifies whether background threads should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
-      string prefix_end_;
+      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index c0002c86d87f4ca556476c556f0a5fa0addec26f..34f341d1d12c02c3900cba2741a5cd38f2b73e9c 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -31,6 +32,8 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
+constexpr char kDatasetName[] = "ParallelMap";
+
 class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ParallelMapDatasetOp(OpKernelConstruction* ctx)
@@ -64,6 +67,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     std::vector<int> indices;
     OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
 
+    if (num_parallel_calls == model::kAutoTune) {
+      metrics::RecordTFDataAutotune(kDatasetName);
+    }
+
     *output =
         new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
                     output_shapes_, use_inter_op_parallelism_, sloppy_,
@@ -102,12 +109,13 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         const string& prefix) const override {
       std::unique_ptr<ParallelMapFunctor> parallel_map_functor(nullptr);
       if (indices_.empty()) {
-        parallel_map_functor.reset(new ParallelMapDatasetFunctor(this));
+        parallel_map_functor =
+            absl::make_unique<ParallelMapDatasetFunctor>(this);
       } else {
-        parallel_map_functor.reset(new ShortCircuitFunctor(this));
+        parallel_map_functor = absl::make_unique<ShortCircuitFunctor>(this);
       }
       return NewParallelMapIterator(
-          {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
+          {this, strings::StrCat(prefix, "::", kDatasetName)}, input_,
           std::move(parallel_map_functor), num_parallel_calls_, sloppy_,
           preserve_cardinality_);
     }
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index b62e7059bab42d7ace20c3fe9d681e2c129b926e..be91de12fe74a39919fa68bd12d60d9c9ac04ac2 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -60,7 +59,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
         preserve_cardinality_(params.preserve_cardinality) {
     std::vector<string> components =
         str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
-    prefix_end_ = components.back();
+    key_prefix_ = components.back();
   }
 
   ~ParallelMapIterator() override {
@@ -207,8 +206,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
     const auto& stats_aggregator = ctx->stats_aggregator();
     if (stats_aggregator) {
       stats_aggregator->AddScalar(
-          strings::StrCat(prefix_end_, "::active_parallel_calls"),
-          static_cast<float>(num_calls_));
+          strings::StrCat(key_prefix_, "::thread_utilization"),
+          static_cast<float>(num_calls_) /
+              static_cast<float>(num_parallel_calls_->value));
     }
     RecordBufferEnqueue(ctx.get(), result->return_values);
     result->notification.Notify();
@@ -300,14 +300,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
         }
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
-          // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-          // monitoring code or as histogram at fixed time intervals.
-          stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::num_parallel_calls"),
-              static_cast<float>(num_parallel_calls_->value));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
@@ -403,7 +399,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
       GUARDED_BY(*mu_);
   std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
-  string prefix_end_;
+  string key_prefix_;
 };
 
 }  // namespace
@@ -413,7 +409,7 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBase* input_dataset,
     std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
     int32 num_parallel_calls, bool sloppy, bool preserve_cardinality) {
-  return MakeUnique<ParallelMapIterator>(
+  return absl::make_unique<ParallelMapIterator>(
       params, input_dataset,
       ParallelMapIterator::Params{std::move(parallel_map_functor),
                                   num_parallel_calls, sloppy,
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 08d6de4bf9a654d433e3cb6dddd6ab0cc1435136..f0e835a27c9775aadad107ca1f274275cc44f622 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <deque>
 
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -29,6 +30,8 @@ namespace data {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
+constexpr char kDatasetName[] = "Prefetch";
+
 class PrefetchDatasetOp::Dataset : public DatasetBase {
  public:
   Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
@@ -42,8 +45,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::Prefetch")}));
+    return absl::make_unique<Iterator>(
+        Iterator::Params{this, strings::StrCat(prefix, "::", kDatasetName)});
   }
 
   const DataTypeVector& output_dtypes() const override {
@@ -266,8 +269,9 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!prefetch_thread_) {
-        std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-        prefetch_thread_.reset(ctx->env()->StartThread(
+        std::shared_ptr<IteratorContext> new_ctx =
+            std::make_shared<IteratorContext>(*ctx);
+        prefetch_thread_ = absl::WrapUnique<Thread>(ctx->env()->StartThread(
             {}, "tf_data_prefetch",
             [this, new_ctx]() { PrefetchThread(new_ctx); }));
       }
@@ -391,6 +395,10 @@ void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
               buffer_size >= 0 || buffer_size == PrefetchAutotuner::kAutoTune,
               errors::InvalidArgument("buffer_size must be >= 0"));
 
+  if (buffer_size == PrefetchAutotuner::kAutoTune) {
+    metrics::RecordTFDataAutotune(kDatasetName);
+  }
+
   *output = new Dataset(ctx, input, buffer_size);
 }
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 580702f741814b6bd86cab2d537b3ad49b4f6177..aa14d27d5c3ebec797174d5aecf89dd217fe8f3b 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -53,8 +53,8 @@ class RangeDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Range")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Range")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 971fd2a43685197892ad0fb3cd37e3709cd144c1..789f9c859aab2df61d119b9bb6f6ddd88ce24681 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -91,8 +91,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::TextLine")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::TextLine")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -221,20 +221,20 @@ class TextLineDatasetOp : public DatasetOpKernel {
         // Actually move on to next file.
         TF_RETURN_IF_ERROR(env->NewRandomAccessFile(
             dataset()->filenames_[current_file_index_], &file_));
-        input_stream_.reset(
-            new io::RandomAccessInputStream(file_.get(), false));
+        input_stream_ =
+            absl::make_unique<io::RandomAccessInputStream>(file_.get(), false);
 
         if (dataset()->use_compression_) {
-          zlib_input_stream_.reset(new io::ZlibInputStream(
+          zlib_input_stream_ = absl::make_unique<io::ZlibInputStream>(
               input_stream_.get(), dataset()->options_.input_buffer_size,
-              dataset()->options_.input_buffer_size, dataset()->options_));
-          buffered_input_stream_.reset(new io::BufferedInputStream(
+              dataset()->options_.input_buffer_size, dataset()->options_);
+          buffered_input_stream_ = absl::make_unique<io::BufferedInputStream>(
               zlib_input_stream_.get(), dataset()->options_.input_buffer_size,
-              false));
+              false);
         } else {
-          buffered_input_stream_.reset(new io::BufferedInputStream(
+          buffered_input_stream_ = absl::make_unique<io::BufferedInputStream>(
               input_stream_.get(), dataset()->options_.input_buffer_size,
-              false));
+              false);
         }
         return Status::OK();
       }
@@ -344,11 +344,12 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (compression_type_.empty()) {
-        return std::unique_ptr<IteratorBase>(new UncompressedIterator(
-            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+        return absl::make_unique<UncompressedIterator>(
+            UncompressedIterator::Params{
+                this, strings::StrCat(prefix, "::FixedLengthRecord")});
       } else {
-        return std::unique_ptr<IteratorBase>(new CompressedIterator(
-            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+        return absl::make_unique<CompressedIterator>(CompressedIterator::Params{
+            this, strings::StrCat(prefix, "::FixedLengthRecord")});
       }
     }
 
@@ -452,8 +453,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           }
           TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
               dataset()->filenames_[current_file_index_], &file_));
-          input_buffer_.reset(
-              new io::InputBuffer(file_.get(), dataset()->buffer_size_));
+          input_buffer_ = absl::make_unique<io::InputBuffer>(
+              file_.get(), dataset()->buffer_size_);
           TF_RETURN_IF_ERROR(
               input_buffer_->SkipNBytes(dataset()->header_bytes_));
         } while (true);
@@ -495,8 +496,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           file_pos_limit_ = file_size - dataset()->footer_bytes_;
           TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
               dataset()->filenames_[current_file_index_], &file_));
-          input_buffer_.reset(
-              new io::InputBuffer(file_.get(), dataset()->buffer_size_));
+          input_buffer_ = absl::make_unique<io::InputBuffer>(
+              file_.get(), dataset()->buffer_size_);
           TF_RETURN_IF_ERROR(input_buffer_->Seek(current_pos));
         }
 
@@ -612,13 +613,14 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
                 dataset()->compression_type_ == "ZLIB"
                     ? io::ZlibCompressionOptions::DEFAULT()
                     : io::ZlibCompressionOptions::GZIP();
-            file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
-            buffered_input_stream_.reset(new io::ZlibInputStream(
+            file_stream_ =
+                absl::make_unique<io::RandomAccessInputStream>(file_.get());
+            buffered_input_stream_ = absl::make_unique<io::ZlibInputStream>(
                 file_stream_.get(), dataset()->buffer_size_,
-                dataset()->buffer_size_, zlib_options));
+                dataset()->buffer_size_, zlib_options);
           } else {
-            buffered_input_stream_.reset(new io::BufferedInputStream(
-                file_.get(), dataset()->buffer_size_));
+            buffered_input_stream_ = absl::make_unique<io::BufferedInputStream>(
+                file_.get(), dataset()->buffer_size_);
           }
           TF_RETURN_IF_ERROR(
               buffered_input_stream_->SkipNBytes(dataset()->header_bytes_));
@@ -672,10 +674,11 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               dataset()->compression_type_ == "ZLIB"
                   ? io::ZlibCompressionOptions::DEFAULT()
                   : io::ZlibCompressionOptions::GZIP();
-          file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
-          buffered_input_stream_.reset(new io::ZlibInputStream(
+          file_stream_ =
+              absl::make_unique<io::RandomAccessInputStream>(file_.get());
+          buffered_input_stream_ = absl::make_unique<io::ZlibInputStream>(
               file_stream_.get(), dataset()->buffer_size_,
-              dataset()->buffer_size_, zlib_options));
+              dataset()->buffer_size_, zlib_options);
           lookahead_cache_.clear();
           TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(
               current_pos - dataset()->footer_bytes_));
@@ -763,8 +766,8 @@ class TFRecordDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::TFRecord")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::TFRecord")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -885,8 +888,8 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         const string& next_filename =
             dataset()->filenames_[current_file_index_];
         TF_RETURN_IF_ERROR(env->NewRandomAccessFile(next_filename, &file_));
-        reader_.reset(
-            new io::SequentialRecordReader(file_.get(), dataset()->options_));
+        reader_ = absl::make_unique<io::SequentialRecordReader>(
+            file_.get(), dataset()->options_);
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index 8100f2695b6ee529da252b7b012a7c87ebb0a670..ef507ffdd1de28f78e7112fbb1c2198e9876d922 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -51,14 +51,14 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ < 0) {
-        return std::unique_ptr<IteratorBase>(new ForeverIterator(
-            {this, strings::StrCat(prefix, "::ForeverRepeat")}));
+        return absl::make_unique<ForeverIterator>(ForeverIterator::Params{
+            this, strings::StrCat(prefix, "::ForeverRepeat")});
       } else if (count_ == 0) {
-        return std::unique_ptr<IteratorBase>(new EmptyIterator(
-            {this, strings::StrCat(prefix, "::EmptyRepeat")}));
+        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
+            this, strings::StrCat(prefix, "::EmptyRepeat")});
       } else {
-        return std::unique_ptr<IteratorBase>(new FiniteIterator(
-            {this, strings::StrCat(prefix, "::FiniteRepeat")}));
+        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
+            this, strings::StrCat(prefix, "::FiniteRepeat")});
       }
     }
 
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index db0cc6fa4db2af07b3906e7daaf1ff0e3690dd15..e0c435718ac46ee9af1ce404e2bdfa0ba31c3044 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -80,8 +79,9 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
             num_elements_(0),
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {
-        buffer_.reset(new std::vector<Tensor>[params.dataset->buffer_size_]);
-        slices_.push_back(MakeUnique<Slice>(0, 0));
+        buffer_ = absl::make_unique<std::vector<Tensor>[]>(
+            params.dataset->buffer_size_);
+        slices_.push_back(absl::make_unique<Slice>(0, 0));
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -124,7 +124,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
             }
             epoch_++;
             int64 n = slices_.back()->end;
-            slices_.push_back(MakeUnique<Slice>(n, n));
+            slices_.push_back(absl::make_unique<Slice>(n, n));
             TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
                 ctx, this->prefix(), &input_impl_));
           }
@@ -273,7 +273,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
               reader->ReadScalar(this->full_name("slices_size"), &temp));
           slices_size = static_cast<size_t>(temp);
         }
-        buffer_.reset(new std::vector<Tensor>[this->dataset()->buffer_size_]);
+        buffer_ = absl::make_unique<std::vector<Tensor>[]>(
+            this->dataset()->buffer_size_);
         for (size_t i = 0; i < slices_size; ++i) {
           int64 start;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
@@ -281,7 +282,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           int64 end;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
               this->full_name(strings::StrCat("slices_end_", i)), &end));
-          slices_.push_back(MakeUnique<Slice>(start, end));
+          slices_.push_back(absl::make_unique<Slice>(start, end));
           for (size_t j = start; j < end; ++j) {
             size_t index = j % this->dataset()->buffer_size_;
             int64 list_size;
@@ -399,8 +400,9 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Shuffle")}, seed_,
+          seed2_);
     }
 
    protected:
@@ -412,7 +414,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {}
 
-      string DebugString() override {
+      string DebugString() const override {
         return "ReshufflingDataset::RandomSeedGenerator";
       }
 
@@ -578,9 +580,11 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new ShuffleDatasetBase::Iterator<ShuffleDatasetBase>(
-              {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
+      return absl::make_unique<
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>>(
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>::Params{
+              this, strings::StrCat(prefix, "::Shuffle")},
+          seed_, seed2_);
     }
 
    protected:
@@ -663,10 +667,11 @@ class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new ShuffleDatasetBase::Iterator<ShuffleDatasetBase>(
-              {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
-              seed2_));
+      return absl::make_unique<
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>>(
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>::Params{
+              this, strings::StrCat(prefix, "::ShuffleAndRepeat")},
+          seed_, seed2_);
     }
 
    protected:
diff --git a/tensorflow/core/kernels/data/single_threaded_executor.cc b/tensorflow/core/kernels/data/single_threaded_executor.cc
index 89e3881037666299f093ed7423b62c9741ca5dd9..aab4bfe6403e7c34db44d63dc746fa5385da8c74 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor.cc
@@ -376,8 +376,8 @@ static SingleThreadedExecutorRegistrar registrar;
 Status NewSingleThreadedExecutor(const LocalExecutorParams& params,
                                  std::unique_ptr<const Graph> graph,
                                  Executor** executor) {
-  std::unique_ptr<SingleThreadedExecutorImpl> impl(
-      new SingleThreadedExecutorImpl(params));
+  std::unique_ptr<SingleThreadedExecutorImpl> impl =
+      absl::make_unique<SingleThreadedExecutorImpl>(params);
   TF_RETURN_IF_ERROR(impl->Initialize(*graph));
   *executor = impl.release();
   return Status::OK();
diff --git a/tensorflow/core/kernels/data/single_threaded_executor_test.cc b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
index 7bb51fb8b53d59789f2d1efad04f4ffdf39587e4..df669e53d388957ced8a6863aaa3de4504cec66f 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor_test.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
@@ -139,7 +139,7 @@ Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
 
 TEST_F(ExecutorTest, SimpleAdd) {
   // c = a + b
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   auto in0 = test::graph::Arg(g.get(), 0, DT_FLOAT);
   auto in1 = test::graph::Arg(g.get(), 0, DT_FLOAT);
   auto tmp = test::graph::Add(g.get(), in0, in1);
@@ -163,7 +163,7 @@ TEST_F(ExecutorTest, SelfAdd) {
   //
   // b <- v10
   // All nodes are executed by one thread.
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   auto v = test::graph::Arg(g.get(), 0, DT_FLOAT);
   const int N = 10;
   for (int i = 1; i <= N; ++i) {
@@ -219,7 +219,7 @@ void BuildTree(int N, Graph* g) {
 }
 
 TEST_F(ExecutorTest, RandomTree) {
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   BuildTree(4096, g.get());
   Create(std::move(g));
   FunctionCallFrame call_frame({DT_FLOAT}, {DT_FLOAT});
@@ -231,7 +231,7 @@ TEST_F(ExecutorTest, RandomTree) {
 }
 
 TEST_F(ExecutorTest, OpError) {
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   auto zero = test::graph::Constant(g.get(), V(0.0));
   auto inf = test::graph::Unary(g.get(), "Reciprocal", zero);
   auto check = test::graph::CheckNumerics(g.get(), inf, "message");
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index e321066a715d180f0791c9afdfa947560a0fd9ce..5b85a10edf1f6438feab485a77ad684f0442e67c 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -50,11 +50,11 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ < 0) {
-        return std::unique_ptr<IteratorBase>(
-            new EmptyIterator({this, strings::StrCat(prefix, "::EmptySkip")}));
+        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
+            this, strings::StrCat(prefix, "::EmptySkip")});
       } else {
-        return std::unique_ptr<IteratorBase>(new FiniteIterator(
-            {this, strings::StrCat(prefix, "::FiniteSkip")}));
+        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
+            this, strings::StrCat(prefix, "::FiniteSkip")});
       }
     }
 
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index be105f8170b8fff79c0c60a76a699a6ee6ba13f9..d8d7cd204d0f00a2e25ce9e36d1d6234d8c7b1d1 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -41,8 +41,8 @@ class Dataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::SparseTensorSlice")}));
+    return absl::make_unique<Iterator>(typename Iterator::Params{
+        this, strings::StrCat(prefix, "::SparseTensorSlice")});
   }
 
   const DataTypeVector& output_dtypes() const override { return dtypes_; }
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 0a3d5869534ddad9f7ed295171d8deefc2154107..0dd0c0c80de194c60aa7d268cb40317d722956c4 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -50,11 +50,11 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ == 0) {
-        return std::unique_ptr<IteratorBase>(
-            new EmptyIterator({this, strings::StrCat(prefix, "::EmptyTake")}));
+        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
+            this, strings::StrCat(prefix, "::EmptyTake")});
       } else {
-        return std::unique_ptr<IteratorBase>(new FiniteIterator(
-            {this, strings::StrCat(prefix, "::FiniteTake")}));
+        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
+            this, strings::StrCat(prefix, "::FiniteTake")});
       }
     }
 
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 98c23f23b202dee580fb89f5473f69c61d57c640..a44dbd0d4d436e3eb85adbe9db6dc39bde0419e8 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -50,8 +50,8 @@ class TensorDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FromTensor")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::FromTensor")});
     }
 
     const DataTypeVector& output_dtypes() const override { return dtypes_; }
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 4ba2bde718a6351ff13bc17cf14ae5c60332c6ca..54dcd7eb7d1bd97fa1c58e0d3235670482bafd93 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -71,8 +71,8 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::TensorSlice")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::TensorSlice")});
     }
 
     const DataTypeVector& output_dtypes() const override { return dtypes_; }
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index c295631550aa008ccbf1abee0a91b27d64a6ba35..dc27702f1efb6f53cabe1fdf305e7e715aa51180 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -31,8 +31,8 @@ class WindowDataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::Window")}));
+    return absl::make_unique<Iterator>(
+        Iterator::Params{this, strings::StrCat(prefix, "::Window")});
   }
 
   const DataTypeVector& output_dtypes() const override { return output_types_; }
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index ae13ae5da8d4c093bdb4d6e168584bda234e4502..0b24c1189148a8d2133dc33dcba7c80324620589 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -78,8 +78,8 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Window")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Window")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 1760e63a9e1c6b6262c19baa8354052d7d73fd3c..cdc2969fc20573fa705dc2ba7a44955e6e062fd4 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -62,8 +62,8 @@ class ZipDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Zip")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Zip")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index ae451be7e21a119a309a74c3312eee4b24256248..c75fc94bc1c73ce5435271c7940e0779b1be3127 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -16,13 +16,13 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index da3bdb475e274d73751e22334628e3431023b9e4..ab98cacd1a117022444386b9a718e173d68fa99d 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -633,7 +633,8 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
-                stride_, stride_, padding_, in_backprop, data_format_);
+                stride_, stride_, padding_, /*explicit_paddings=*/{},
+                in_backprop, data_format_);
       return;
     }
 
@@ -1115,7 +1116,8 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
                 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
-                padding_, &reshaped_filter, data_format_);
+                padding_, /*explicit_paddings=*/{}, &reshaped_filter,
+                data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index f0902fdba6921b46fd7a0d0adb16e470ed83f65c..11c2b31633dd2186c729c725c4cda5816447954d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cmath>
 #include <type_traits>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -404,7 +404,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, input,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
-                stride_, stride_, padding_, output, data_format_);
+                stride_, stride_, padding_, /*explicit_paddings=*/{}, output,
+                data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
similarity index 98%
rename from tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
rename to tensorflow/core/kernels/depthwise_conv_op_gpu.h
index e811968d277ba3594341a59e8d6262cac637e602..098853e68430d425143d16ff2e8edbb9877f8e23 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
+
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 
@@ -38,7 +41,7 @@ using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
 // performed using the faster ('Small') variant of the kernel.
-EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
     const DepthwiseArgs& args) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
@@ -51,7 +54,7 @@ EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
 
 // Returns whether depthwise convolution backward filter pass can be performed
 // using the faster ('Small') variant of the kernel.
-EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
     const DepthwiseArgs& args, const int block_height) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
@@ -652,13 +655,12 @@ struct PseudoHalfType<Eigen::half> {
 };
 }  // namespace detail
 
-namespace {
 // Maps to float if T is __half, and to T otherwise.
 template <typename T>
 using PseudoHalfType = typename detail::PseudoHalfType<T>::Type;
 
 // Returns whether the context's GPU supports efficient fp16 math.
-bool HasFastHalfMath(OpKernelContext* ctx) {
+inline bool HasFastHalfMath(OpKernelContext* ctx) {
   int major, minor;
   ctx->op_device_context()
       ->stream()
@@ -669,7 +671,6 @@ bool HasFastHalfMath(OpKernelContext* ctx) {
   // GPUs before sm_53 don't support fp16 math, and sm_61's fp16 math is slow.
   return cuda_arch >= 530 && cuda_arch != 610;
 }
-}  // namespace
 
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
@@ -808,10 +809,6 @@ void LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx,
   }
 }
 
-template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvOp<GpuDevice, double>;
-
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
@@ -1030,10 +1027,6 @@ void LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()(
   }
 }
 
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
-
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
@@ -1803,9 +1796,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
                  ctx, args, out_backprop, input, filter_backprop, data_format));
   }
 }
-
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..073e7cf269844a7b355019493dad3d9287c00bf5
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, double>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b0e15e4766713130e86224dc9f255fe8ecead81
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2db9fa4dff5bf58cb52d44c3c044ba4fc34d6d9f
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 572d04ae2c464d493508d494ba325a33eb92d4c1..95af19c4c4818abced194f7553e8bb79c777a998 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 // See docs in ../ops/data_flow_ops.cc.
 
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/util/util.h"
 
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index e7882acc80e3c2383f3a3c208175d16dd8c092ab..59f687bf9c0247be2528c79d0a1ef3dbb3fb7d35 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -40,11 +40,11 @@ limitations under the License.
 #include "third_party/cub/iterator/constant_input_iterator.cuh"
 #include "third_party/cub/thread/thread_operators.cuh"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/gather_functor_gpu.cu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index f21f2acf2622a56cc3d6f58d259f79788a314dfb..5b8845b675d1264c07f0a6096460ea9edf62a3e4 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/data_flow_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 #ifdef GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 3d8e52ca0e49828b54604f7c5107f5dfd05d6891..4089eec59ee8ccf03679f77d02c1f57d60155a06 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -179,6 +179,9 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
                                                      num_threads);
     }
 
+    // If dimensions do not pass basic sanity checks return immediately.
+    if (kc_ <= 0 || mc_ <= 0 || nc_ <= 0) return;
+
     // If we are using default Eigen gebp kernel there is no need to adjust the
     // block sizes for MKL-DNN.
     if (!UseCustomContractionKernels()) return;
@@ -194,7 +197,8 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
     // We split Kth dimensions in roughly equal slices.
     StorageIndex target_k_slices =
         (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
-    StorageIndex packet_size = 8;
+    StorageIndex packet_size = internal::packet_traits<Scalar>::size;
+    if (packet_size < 8) packet_size = 8;
     StorageIndex target_bk =
         Eigen::divup(k / target_k_slices, packet_size) * packet_size;
     kc_ = (std::min)(k, target_bk);
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 86d8c98ee65aebb2927b338dfb236f470a3a1d39..8b198139400a6d2ce2795f9ef0b5793114a78e0b 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -1683,8 +1683,6 @@ EIGEN_DEVICE_FUNC
     kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
     kernel_dims[1] = kernelFilters;
   }
-  // TODO(yangke): choose() is defined in TensorContraction.h -- consider
-  // moving it to somewhere more "common".
   return choose(
       Cond<internal::traits<Input>::Layout == ColMajor>(),
       kernel.reshape(kernel_dims)
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 22f71d62602cc984c0337f728298f7483c35bed9..920e648972bef4b37e15eb2c6dcee313b7cd26da 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
+
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -1540,22 +1542,188 @@ static void PackRhsHelper(int iters,
     pack_rhs(packed.data() + packed_offset, sub_mapper, depth, cols);
   }
   tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(
+      absl::StrCat("patch: ", patch_rows, "x", patch_cols, " D", patch_depth,
+                   "; num_patches=", num_patches, " patch_size=", patch_size,
+                   " num_inputs=", num_inputs));
+}
+
+static void PackLhsHelper(int iters,
+                          /* Input dimensions: */
+                          int input_depth,
+                          /* Filter (kernel) dimensions: */
+                          int filter_count, int filter_cols, int filter_rows,
+                          /* Block dimensions: */
+                          Index block_rows, Index block_cols) {
+  // Set random seed for benchmark repeatability.
+  srand(12345);
+
+  eigen_assert(block_rows <= filter_count);
+  eigen_assert(block_cols <= input_depth * filter_rows * filter_cols);
+
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StopTiming();
+
+  using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
+
+  // Default Eigen::Tensor layout is column major, so we configure dimensions
+  // starting from the inner most (`filter count` aka `kernel filers`).
+  Dimensions filter_dims(filter_count, filter_rows, filter_cols, input_depth);
+
+  static const int packet_size = Eigen::internal::packet_traits<float>::size;
+
+  // We are going to reshape filter into 2D tensor.
+  using NewDimension = Eigen::DSizes<Index, 2>;
+
+  // Contraction dimensions.
+  using nocontract_t = Eigen::array<Eigen::Index, 1>;
+  using contract_t = Eigen::array<Eigen::Index, 1>;
+
+  // Input to the ReshapeOp. It is the tensorflow TTypes<float>::Tensor
+  // with ColMajor layout, instead of RowMajor. But that doesn't make any
+  // difference, because TensorContraction swaps LHS with RHS for row major
+  // inputs, and contraction mapper always works with column major data.
+  using ArgType = TensorMap<Tensor<float, 4>, Eigen::Aligned>;
+
+  using Evaluator =
+      TensorEvaluator<const TensorReshapingOp<NewDimension, ArgType>,
+                      Eigen::DefaultDevice>;
+
+  using InputMapper = Eigen::internal::TensorContractionInputMapper<
+      float, Index, Eigen::internal::Lhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+  using SubMapper = Eigen::internal::TensorContractionSubMapper<
+      float, Index, Eigen::internal::Lhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+  using PackLhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
+                                                        SubMapper, ColMajor>;
+#else
+  using Traits = typename Eigen::internal::gebp_traits<float, float>;
+  using PackLhsImpl =
+      Eigen::internal::gemm_pack_lhs<float, Eigen::Index, SubMapper,      //
+                                     Traits::mr,                          //
+                                     Traits::LhsProgress,                 //
+                                     typename Traits::LhsPacket4Packing,  //
+                                     ColMajor>;
+#endif
+
+  Eigen::DefaultDevice device;
+
+  // We will reshape kernel into 2D tensor.
+  NewDimension reshape_dims;
+  reshape_dims[0] = filter_count;
+  reshape_dims[1] = input_depth * filter_rows * filter_cols;
+
+  // We are going to contract along the 'in_depth * filter_rows * filter_cols`.
+  nocontract_t nocontract_dim = {0};
+  contract_t contract_dim = {1};
+
+  // These values computed using the algorithm in TensorContraction.h, with
+  // 'nocontract_dim' and 'contract_dim' values specified above.
+  nocontract_t nocontract_strides = {1};
+  contract_t contract_strides = {filter_count};
+  nocontract_t i_strides = {1};
+  contract_t k_strides = {1};
+
+  // We use tensor of the same dimensions to store packed data.
+  Tensor<float, 4> packed(filter_dims);
+
+  // We generate multiple filter tensors, around 512mb in total size to measure
+  // realistic workload when input data in not in L1-L3 cache.
+  size_t input_bytes = filter_dims.TotalSize() * sizeof(float);
+  size_t mem_size_bytes = 1024 * 1024 * 512;
+  size_t num_filters =
+      std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
+
+  std::vector<Tensor<float, 4>> filters;
+  std::vector<Evaluator> evaluators;
+  std::vector<InputMapper> input_mappers;
+
+  for (int i = 0; i < num_filters; ++i) {
+    filters.emplace_back(filter_dims);
+    filters[i].setRandom();
+
+    ArgType tensor_map(filters[i].data(), filter_dims);
+
+    const auto reshape_op =
+        TensorReshapingOp<NewDimension, ArgType>(tensor_map, reshape_dims);
+
+    evaluators.emplace_back(reshape_op, device);
+
+    input_mappers.emplace_back(evaluators[i], nocontract_strides, i_strides,
+                               contract_strides, k_strides);
+  }
+
+  PackLhsImpl pack_lhs;
+
+  const Index packed_total_size = filter_dims.TotalSize();
+
+  // Round up row/col/memory offsets to make them multiple of packet size.
+  const auto round_up = [](const Index idx) {
+    return (idx / packet_size) * packet_size;
+  };
+
+  // Block rows is in the [0, filter_count) range.
+  // Block cols is in the [0, filter_rows * filter_cols * input_depth) range.
+
+  const Index max_row = filter_count;
+  const Index max_col = filter_rows * filter_cols * input_depth;
+
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    int filter_idx =
+        num_filters == 1 ? 1 : internal::random<int>(0, num_filters - 1);
+
+    Index row_offset = round_up(internal::random<Index>(0, max_row - 10));
+    Index col_offset = round_up(internal::random<Index>(0, max_col - 10));
+
+    Index rows = std::min(block_rows, max_row - row_offset);
+    Index cols = std::min(block_cols, max_col - col_offset);
+
+    // Write packed data to random memory location to emulate cold caches.
+    Index packed_offset = round_up(
+        internal::random<Index>(0, packed_total_size - rows * cols - 1));
+
+    SubMapper sub_mapper =
+        input_mappers[filter_idx].getSubMapper(row_offset, col_offset);
 
-  std::ostringstream stringStream;
-  stringStream << "patch: " << patch_rows << "x" << patch_cols << " D"
-               << patch_depth << "; num_patches=" << num_patches
-               << " patch_size=" << patch_size << " num_inputs=" << num_inputs;
-  tensorflow::testing::SetLabel(stringStream.str());
+// NOTE: Eigen gemm_pack_lhs accepts contraction depth (k-th dimension) as a
+// first argument (aka block cols). MKL-DNN pack is generic for lhs and rhs
+// and accepts block rows and cols in the same order for lhs and rhs.
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+    pack_lhs(packed.data() + packed_offset, sub_mapper, rows, cols);
+#else
+    pack_lhs(packed.data() + packed_offset, sub_mapper, cols, rows);
+#endif
+  }
+  tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(absl::StrCat(
+      "filter: count=", filter_count, " dims=", filter_rows, "x", filter_cols,
+      "; input: depth=", input_depth, "; num_filers=", num_filters));
 }
 
 // -------------------------------------------------------------------------- //
-// Macro argumentnames:
+// Pack RHS
+//
+// Macro argument names:
 //    N: batch size
 //    H: height
 //    W: width
 //    C: input channels
 //   FC: filter channles
 //   FH: filter height
+//   FW: filter width
 //   SH: stride in height dimensions
 //   SW: stride in width dimensions
 //   BR: block rows
@@ -1563,16 +1731,16 @@ static void PackRhsHelper(int iters,
 
 #define BM_CONCAT(a, b) a##b
 
-#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)           \
+#define BM_RHS_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)       \
   BM_CONCAT(BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW, \
             _s##SH##x##SW##_B##BR##x##BC)
 
-#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)         \
-  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
-                      BC)(int iters) {                             \
-    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);  \
-  }                                                                \
-  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
+#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)             \
+  static void BM_RHS_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
+                          BC)(int iters) {                             \
+    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);      \
+  }                                                                    \
+  BENCHMARK(BM_RHS_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
 
 // Number of input channel (input depth) it equal to the number of patch
 // channels (patch depth).
@@ -1645,4 +1813,37 @@ BM_PackRhs(/*batch*/ 32,        //
            /*filter*/ 3, 3,     //
            /*stride*/ 2, 2,     //
            /*block*/ 36, 432);
+
+// -------------------------------------------------------------------------- //
+// Pack LHS
+//
+// Macro argument names:
+//    C: input channels
+//   FC: filter channels
+//   FH: filter height
+//   FW: filter width
+//   BR: block rows
+//   BC: block cols
+
+#define BM_LHS_NAME(prefix, C, FC, FH, FW, BR, BC) \
+  BM_CONCAT(BM_##prefix##_##C##_FC##FC##_##FH##x##FW, _B##BR##x##BC)
+
+#define BM_PackLhs(C, FC, FH, FW, BR, BC)                              \
+  static void BM_LHS_NAME(PackLhs, C, FC, FH, FW, BR, BC)(int iters) { \
+    PackLhsHelper(iters, C, FC, FH, FW, BR, BC);                       \
+  }                                                                    \
+  BENCHMARK(BM_LHS_NAME(PackLhs, C, FC, FH, FW, BR, BC))
+
+// Number of input channel (input depth) it equal to the number of patch
+// channels (patch depth).
+
+BM_PackLhs(/*input channels*/ 128,    //
+           /*filter channels*/ 1024,  //
+           /*filter dims*/ 3, 3,      //
+           /*block*/ 256, 56);
+
+BM_PackLhs(/*input channels*/ 128,    //
+           /*filter channels*/ 1024,  //
+           /*filter dims*/ 3, 3,      //
+           /*block*/ 56, 256);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 1a5b0f2b675a85ba2c1dbf0356c3e42b03db22b4..e80404a437523862bfe6b8c2961b11cc00bd4426 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/encode_png_op.cc b/tensorflow/core/kernels/encode_png_op.cc
index 8fcda25e692f9aa550ddbb17a4f5cef8ba570b83..cb9a1660a7d059bebaaadea8cc309f74ab974948 100644
--- a/tensorflow/core/kernels/encode_png_op.cc
+++ b/tensorflow/core/kernels/encode_png_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/png/png_io.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/encode_wav_op.cc b/tensorflow/core/kernels/encode_wav_op.cc
index aed095076b92cdef60e217c610fa4c11eb4717ec..082f9a74ae1e36f22ed206c3049dbfd40ac55a48 100644
--- a/tensorflow/core/kernels/encode_wav_op.cc
+++ b/tensorflow/core/kernels/encode_wav_op.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 // See docs in ../ops/audio_ops.cc
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
 
diff --git a/tensorflow/core/kernels/extract_image_patches_op.cc b/tensorflow/core/kernels/extract_image_patches_op.cc
index 68631d14dbc4af5553e02a7e3d622c3772a95eb5..9306eccf9f018f66cc22a7d88050a20814e46f15 100644
--- a/tensorflow/core/kernels/extract_image_patches_op.cc
+++ b/tensorflow/core/kernels/extract_image_patches_op.cc
@@ -20,11 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/extract_image_patches_op.h"
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/extract_jpeg_shape_op.cc b/tensorflow/core/kernels/extract_jpeg_shape_op.cc
index 60d798af56737c6abb322a971b31ae596ea96ec6..ab424595c1a6e5c26f26aae9dc3768cf2bf15c9b 100644
--- a/tensorflow/core/kernels/extract_jpeg_shape_op.cc
+++ b/tensorflow/core/kernels/extract_jpeg_shape_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/extract_volume_patches_op.cc b/tensorflow/core/kernels/extract_volume_patches_op.cc
index 52cd078a3512bcfae13539f1e95ef66c4adf8a03..8107bca7d18633f45e747b5175eca1e11f2cc6fe 100644
--- a/tensorflow/core/kernels/extract_volume_patches_op.cc
+++ b/tensorflow/core/kernels/extract_volume_patches_op.cc
@@ -26,11 +26,11 @@ when rates are to be added.
 
 #include "tensorflow/core/kernels/extract_volume_patches_op.h"
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h
index 697ee81c39b194e29c03f3583f0aa727778ef316..4d3a7c197125613c662c97044d6964695ab92b0e 100644
--- a/tensorflow/core/kernels/fifo_queue.h
+++ b/tensorflow/core/kernels/fifo_queue.h
@@ -49,7 +49,7 @@ class FIFOQueue : public TypedQueue<std::deque<PersistentTensor> > {
                       CallbackWithTuple callback) override;
   Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 7090417dfdb2d7e433025b1a0f1cdeb5eece10a8..9c4c0487f09dff86efa833475ea685c30b1ac915 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -51,6 +51,11 @@ DEFINE_SETZERO_CPU(uint16);
 DEFINE_SETZERO_CPU(int16);
 DEFINE_SETZERO_CPU(int32);
 DEFINE_SETZERO_CPU(int64);
+DEFINE_SETZERO_CPU(quint8);
+DEFINE_SETZERO_CPU(qint8);
+DEFINE_SETZERO_CPU(quint16);
+DEFINE_SETZERO_CPU(qint16);
+DEFINE_SETZERO_CPU(qint32);
 DEFINE_SETZERO_CPU(complex64);
 DEFINE_SETZERO_CPU(complex128);
 DEFINE_SETZERO_CPU(Variant);
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 050c95cf40d4b29bde66b6b6e72b1b48a7199965..d4c92586897da1ead541a98f5d721a9c18d235b9 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -88,9 +88,16 @@ struct SetZeroFunctor<GPUDevice, T> {
   }
 };
 
+template <>
+void SetZeroFunctor<GPUDevice, Variant>::operator()(
+    const GPUDevice& d, typename TTypes<Variant>::Flat out) {
+  // TODO(b/123028789): Implement this.
+}
+
 #define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
 TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
 TF_CALL_bool(DEFINE_SETZERO_GPU);
+TF_CALL_variant(DEFINE_SETZERO_GPU);
 #undef DEFINE_SETZERO_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index dbd3bb05dbf1a310ea9c5a5b1003474e33825133..48b339508b50c835a7aa86306bf3dca758a819f1 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 #include "tensorflow/core/kernels/conv_2d.h"
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 #endif
 
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 7300f7a4e249dd436fad9c1cdd3463e5bc73cbdc..3c3e9bfa2e0a6f3f94c9c679994021929f9df489 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -8,11 +8,8 @@ cc_library(
     name = "fuzz_session",
     hdrs = ["fuzz_session.h"],
     deps = [
-        "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:scope",
         "//tensorflow/core:core_cpu",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:ops",
         "//tensorflow/core:tensorflow",
     ],
 )
@@ -72,3 +69,7 @@ tf_oss_fuzz_dict("decode_json_example")
 tf_ops_fuzz_target_lib("check_numerics")
 
 tf_ops_fuzz_target_lib("one_hot")
+
+tf_ops_fuzz_target_lib("scatter_nd")
+
+tf_oss_fuzz_corpus("scatter_nd")
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b0e5f8d2990c3cac80fa792ba141c43 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b0e5f8d2990c3cac80fa792ba141c43
new file mode 100644
index 0000000000000000000000000000000000000000..d1239633c843b1b8fd64d232604a3d61e9eb07dc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b0e5f8d2990c3cac80fa792ba141c43 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b61fa3a30dd267828f12d9ea2b2a191 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b61fa3a30dd267828f12d9ea2b2a191
new file mode 100644
index 0000000000000000000000000000000000000000..1bd0905cdd6efab2b8450e6cb03f1d15ffae9993
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b61fa3a30dd267828f12d9ea2b2a191 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/8bc8b7d8beb3483c48158739791e56b0 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/8bc8b7d8beb3483c48158739791e56b0
new file mode 100644
index 0000000000000000000000000000000000000000..65a6d0083ee72a2920014fbe252970bff43ca75d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/8bc8b7d8beb3483c48158739791e56b0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/d2ef31d47578e9de8323bb0e4806f1be b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/d2ef31d47578e9de8323bb0e4806f1be
new file mode 100644
index 0000000000000000000000000000000000000000..c6948b6a25f2c1a4fa6de401aaeb681be9a8dbd2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/d2ef31d47578e9de8323bb0e4806f1be differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/e2791edcf2c8d9f4af3678a75d43a3e4 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/e2791edcf2c8d9f4af3678a75d43a3e4
new file mode 100644
index 0000000000000000000000000000000000000000..0e8a48e21096eb7b4f4642f754c18728e575e396
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/e2791edcf2c8d9f4af3678a75d43a3e4 differ
diff --git a/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dc5e143cdf01ba8dbf6a820b9693dea69b29fb5e
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
@@ -0,0 +1,136 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzScatterNd : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto indices =
+        tensorflow::ops::Placeholder(scope.WithOpName("indices"), DT_INT32);
+    auto updates =
+        tensorflow::ops::Placeholder(scope.WithOpName("updates"), DT_INT32);
+    auto shape =
+        tensorflow::ops::Placeholder(scope.WithOpName("shape"), DT_INT32);
+    (void)tensorflow::ops::ScatterNd(scope.WithOpName("output"), indices,
+                                     updates, shape);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) override {
+    // This op's runtime is heavily determined by the shape of the tensor
+    // arguments and almost not at all by the values of those tensors. Hence,
+    // the fuzzing data here is only used to determine the shape of the
+    // arguments and the output and the data of these tensors is just a constant
+    // value. Furthermore, the shape of the updates_tensor tensor is fully
+    // determined by the contents of the shape_tensor and the shape of the
+    // indices_tensor. Rather than using random values for the
+    // updates_tensor.shape and getting most of the fuzz runs stopped in the
+    // check, it's better to just create a proper update_tensor.
+    if (size < 1) {
+      return;
+    }
+
+    // First element of the data buffer gives the number of dimensions of the
+    // shape tensor.
+    size_t i;
+    size_t data_ix = 0;
+    size_t shape_dims = 1 + (data[data_ix++] % kMaxShapeDims);
+    Tensor shape_tensor(tensorflow::DT_INT32,
+                        TensorShape({static_cast<int64>(shape_dims)}));
+
+    // Check that we have enough elements left for the shape tensor
+    if (data_ix + shape_dims >= size) {
+      return;  // not enough elements, no fuzz
+    }
+
+    // Subsequent elements give the contents of the shape tensor.
+    // To not get out of memory, reduce all dimensions to at most kMaxDim
+    auto flat_shape = shape_tensor.flat<int32>();
+    for (i = 0; i < shape_dims; i++) {
+      flat_shape(i) = data[data_ix++] % kMaxDim;
+    }
+
+    // Next, we have to fill in the indices tensor. Take the next element from
+    // the buffer to represent the rank of this tensor.
+    if (data_ix >= size) {
+      return;
+    }
+    size_t indices_rank = 1 + (data[data_ix++] % kMaxIndicesRank);
+
+    // Now, read the dimensions of the indices_tensor
+    if (data_ix + indices_rank >= size) {
+      return;
+    }
+    std::vector<int64> indices_dims;
+    size_t num_indices = 1;
+    for (i = 0; i < indices_rank; i++) {
+      // Modulo kMaxDim to not request too much memory
+      int64 dim = data[data_ix++] % kMaxDim;
+      num_indices *= dim;
+      indices_dims.push_back(dim);
+    }
+    Tensor indices_tensor(tensorflow::DT_INT32, TensorShape(indices_dims));
+
+    // Rest of the buffer is used to fill in the indices_tensor
+    auto flat_indices = indices_tensor.flat<int32>();
+    for (i = 0; i < num_indices && data_ix < size; i++) {
+      flat_indices(i) = data[data_ix++];
+    }
+    for (; i < num_indices; i++) {
+      flat_indices(i) = 0;  // ensure that indices_tensor has all values
+    }
+
+    // Given the values in the shape_tensor and the dimensions of the
+    // indices_tensor, the shape of updates_tensor is fixed.
+    num_indices = 1;
+    std::vector<int64> updates_dims;
+    for (i = 0; i < indices_rank - 1; i++) {
+      updates_dims.push_back(indices_dims[i]);
+      num_indices *= indices_dims[i];
+    }
+    int64 last = indices_dims[indices_rank - 1];
+    for (i = last; i < shape_dims; i++) {
+      updates_dims.push_back(flat_shape(i));
+      num_indices *= flat_shape(i);
+    }
+    Tensor updates_tensor(tensorflow::DT_INT32, TensorShape(updates_dims));
+
+    // We don't care about the values in the updates_tensor, make them all be 1
+    auto flat_updates = updates_tensor.flat<int32>();
+    for (i = 0; i < num_indices; i++) {
+      flat_updates(i) = 1;
+    }
+
+    RunInputs({{"indices", indices_tensor},
+               {"updates", updates_tensor},
+               {"shape", shape_tensor}});
+  }
+
+ private:
+  const size_t kMaxShapeDims = 5;
+  const size_t kMaxIndicesRank = 3;
+  const size_t kMaxDim = 10;
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzScatterNd);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 10958602b2fe3fd53d4acde8dce2fff0ccb5cd1d..4dbb6a71160e4c4921aec0992624f197f50963ea 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -46,9 +46,9 @@ class FuzzStringSplit : public FuzzSession {
           string(reinterpret_cast<const char*>(data), delim_len);
       input_tensor.scalar<string>()() = string(
           reinterpret_cast<const char*>(data + delim_len), size - delim_len);
-    }
 
-    RunInputs({{"input", input_tensor}, {"delimiter", delimiter_tensor}});
+      RunInputs({{"input", input_tensor}, {"delimiter", delimiter_tensor}});
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
index 969821dbba70907a1d1d26e84cc4887acd604a82..f7e3da804375a6576f479a88593ddb3d457f98f6 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
@@ -50,9 +50,9 @@ class FuzzStringSplitV2 : public FuzzSession {
           string(reinterpret_cast<const char*>(data), sep_len);
       input_tensor.scalar<string>()() = string(
           reinterpret_cast<const char*>(data + sep_len), size - sep_len);
-    }
 
-    RunInputs({{"input", input_tensor}, {"separator", separator_tensor}});
+      RunInputs({{"input", input_tensor}, {"separator", separator_tensor}});
+    }
   }
 
  private:
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 7710cf93d61eeebf25a71d99e92b6b3e9ce237c9..93bdebc00e17abb702236453c220ada1e330c5cb 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index e50b7fe3bf7fb7a32820ec6f95421cb90b506c0a..58867a34bc2361daceb99edd9a6396fe22e5b856 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/kernels/gather_nd_op.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h
index 003badb74da3512124490d054cf78fad75c2404c..77c0d7717ee97c5a5a130e38c89b17d20fc8acc9 100644
--- a/tensorflow/core/kernels/gather_nd_op.h
+++ b/tensorflow/core/kernels/gather_nd_op.h
@@ -18,8 +18,8 @@ limitations under the License.
 // Functor definition for GatherOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
index 1c78de253e702f5e546467bbed0758c24dbe0443..cf9817dc3060be9e9325d04637e89e147ce143c1 100644
--- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -22,10 +22,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_nd_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 5795f68889e2393451c5cfae2fd29f14e8f9adce..b26f0a7528df979041869fa327c3c4d890eb58df 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 87d36f22d719ade68d17c6f4a2e6dc2deeef9e45..a85de34ac262906aa0bbe2adc600505eb76dcedd 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -24,11 +24,18 @@ tf_cc_test(
     deps = [
         ":graph_transferer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:direct_session",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index 1d4fa1a7db11d28268063055143ccfcbc966ec5c..8078c7036a040c937f7d9d47cc259e677b391c03 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -28,12 +28,12 @@ limitations under the License.
 #include <array>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index c37055239c28e0ab243ea30b05b2c8af0905766c..506091f76ec69f1f092b8fe0c67ea46deb851510 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 42fad1d4b053f84a7f5eaae4382f0a090ba628da..1e449ddb2ce8151be8bddf2b02e96e84a8dde773 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -99,13 +99,6 @@ REGISTER_LIST_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
 
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(TensorList, TensorList::kTypeName);
 
-Status TensorListShape(const TensorList& t, TensorShape* s) {
-  *s = TensorShape({});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorListShape);
-
 bool TensorList::Decode(const VariantTensorData& data) {
   // TODO(srbs): Change the signature to Decode(VariantTensorData data) so
   // that we do not have to copy each tensor individually below. This would
@@ -173,6 +166,17 @@ Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
       DataTypeString(t.dtype()));
 }
 
+Status GetElementShapeFromInput(OpKernelContext* c,
+                                const TensorList& tensor_list, int index,
+                                PartialTensorShape* element_shape) {
+  TF_RETURN_IF_ERROR(TensorShapeFromTensor(c->input(index), element_shape));
+  // Check that `element_shape` and `tensor_list.element_shape` are
+  // compatible and store the merged shape in `element_shape`.
+  PartialTensorShape tmp = *element_shape;
+  TF_RETURN_IF_ERROR(tmp.MergeWith(tensor_list.element_shape, element_shape));
+  return Status::OK();
+}
+
 class EmptyTensorList : public OpKernel {
  public:
   explicit EmptyTensorList(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -259,14 +263,21 @@ class TensorListPushBack : public OpKernel {
                                   " max_num_elements: ", l->max_num_elements));
     }
 
-    TensorList output;
-    output = *l;
-    output.tensors.push_back(input);
-    Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.push_back(
+          input);
+    } else {
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      TensorList output;
+      output = *l;
+      output.tensors.push_back(input);
+      result->scalar<Variant>()() = std::move(output);
+    }
   }
 
  private:
@@ -351,63 +362,6 @@ class TensorListElementShape : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("TensorListElementShape").Device(DEVICE_CPU),
                         TensorListElementShape);
 
-#if GOOGLE_CUDA
-
-REGISTER_KERNEL_BUILDER(Name("TensorListElementShape")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("element_shape"),
-                        TensorListElementShape);
-
-#endif  // GOOGLE_CUDA
-
-class TensorListPopBack : public OpKernel {
- public:
-  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
-  }
-
-  ~TensorListPopBack() override {}
-
-  void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-
-    OP_REQUIRES(c, !l->tensors.empty(),
-                errors::InvalidArgument("Trying to pop from an empty list."));
-
-    c->set_output(1, l->tensors.back());
-    TensorList output;
-    output = *l;
-    output.tensors.pop_back();
-    Tensor* result;
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
-  }
-
- private:
-  DataType element_dtype_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_CPU),
-                        TensorListPopBack);
-
-#if GOOGLE_CUDA
-
-REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_GPU),
-                        TensorListPopBack);
-
-#endif  // GOOGLE_CUDA
-
 class TensorListReserve : public OpKernel {
  public:
   explicit TensorListReserve(OpKernelConstruction* c) : OpKernel(c) {
@@ -445,57 +399,62 @@ REGISTER_KERNEL_BUILDER(Name("TensorListReserve")
                         TensorListReserve);
 
 #endif  // GOOGLE_CUDA
-
-class TensorListGetItem : public OpKernel {
+class TensorListResize : public OpKernel {
  public:
-  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
-  }
+  explicit TensorListResize(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
-    OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* input_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, input_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    int32 index = c->input(1).scalar<int32>()();
-    OP_REQUIRES(c, index < l->tensors.size(),
-                errors::InvalidArgument("Trying to access element ", index,
-                                        " in a list with ", l->tensors.size(),
-                                        " elements."));
-    c->set_output(0, l->tensors[index]);
-  }
+    int32 size = c->input(1).scalar<int32>()();
+    OP_REQUIRES(
+        c, size >= 0,
+        errors::InvalidArgument(
+            "TensorListSlice expects size to be non-negative. Got: ", size));
 
- private:
-  DataType element_dtype_;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.resize(
+          size, Tensor(DT_INVALID));
+    } else {
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      TensorList output_list;
+      output_list.element_shape = input_list->element_shape;
+      output_list.element_dtype = input_list->element_dtype;
+      output_list.max_num_elements = input_list->max_num_elements;
+      if (size > input_list->tensors.size()) {
+        output_list.tensors.insert(output_list.tensors.begin(),
+                                   input_list->tensors.begin(),
+                                   input_list->tensors.end());
+        // Add DT_INVALID tensors to the end of the list if the requested size
+        // is larger than the list length.
+        output_list.tensors.resize(size, Tensor(DT_INVALID));
+      } else {
+        output_list.tensors.insert(output_list.tensors.begin(),
+                                   input_list->tensors.begin(),
+                                   input_list->tensors.begin() + size);
+      }
+      result->scalar<Variant>()() = std::move(output_list);
+    }
+  }
 };
 
-REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
-                        TensorListGetItem);
+REGISTER_KERNEL_BUILDER(Name("TensorListResize").Device(DEVICE_CPU),
+                        TensorListResize);
 
 #if GOOGLE_CUDA
 
-#define REGISTER_TENSOR_LIST_GET_ITEM_GPU(T)                      \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("index"),               \
-                          TensorListGetItem);
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-REGISTER_TENSOR_LIST_GET_ITEM_GPU(bfloat16)
-#undef REGISTER_TENSOR_LIST_GET_ITEM_GPU
+REGISTER_KERNEL_BUILDER(
+    Name("TensorListResize").Device(DEVICE_GPU).HostMemory("size"),
+    TensorListResize);
 
 #endif  // GOOGLE_CUDA
 
@@ -528,14 +487,21 @@ class TensorListSetItem : public OpKernel {
                     "list index. Item element shape: ",
                     value.shape().DebugString(),
                     " list shape: ", l->element_shape.DebugString()));
-    TensorList output;
-    output = *l;
-    output.tensors[index] = value;
-    Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors[index] =
+          value;
+    } else {
+      TensorList output;
+      output = *l;
+      output.tensors[index] = value;
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      result->scalar<Variant>()() = std::move(output);
+    }
   }
 
  private:
@@ -557,6 +523,7 @@ REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_complex64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_complex128(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_int32(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_int64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 REGISTER_TENSOR_LIST_SET_ITEM_GPU(bfloat16)
 #undef REGISTER_TENSOR_LIST_SET_ITEM_GPU
@@ -656,23 +623,7 @@ REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
 
 #endif  // GOOGLE_CUDA
 
-#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListPushBackBatch<CPUDevice, T>)
-
-TF_CALL_ALL_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint8);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint8);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint16);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint16);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint32);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
-
-#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU
-
-#define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)                           \
   REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
@@ -684,19 +635,15 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListConcat<CPUDevice, T>)
-
-TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
-REGISTER_TENSOR_LIST_STACK_CPU(quint8);
-REGISTER_TENSOR_LIST_STACK_CPU(qint8);
-REGISTER_TENSOR_LIST_STACK_CPU(quint16);
-REGISTER_TENSOR_LIST_STACK_CPU(qint16);
-REGISTER_TENSOR_LIST_STACK_CPU(qint32);
-REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
-
-#undef REGISTER_TENSOR_LIST_STACK_CPU
-
-#define REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(T)                   \
+                          TensorListConcat<CPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListGetItem<CPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListPopBack<CPUDevice, T>)        \
   REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
@@ -705,20 +652,31 @@ REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
                           TensorListScatter<CPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")             \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListScatter<CPUDevice, T>)        \
   REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListSplit<CPUDevice, T>)
+                          TensorListSplit<CPUDevice, T>)          \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListPushBackBatch<CPUDevice, T>)
+
+TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_OPS_CPU);
+REGISTER_TENSOR_LIST_OPS_CPU(quint8);
+REGISTER_TENSOR_LIST_OPS_CPU(qint8);
+REGISTER_TENSOR_LIST_OPS_CPU(quint16);
+REGISTER_TENSOR_LIST_OPS_CPU(qint16);
+REGISTER_TENSOR_LIST_OPS_CPU(qint32);
+REGISTER_TENSOR_LIST_OPS_CPU(bfloat16);
+REGISTER_TENSOR_LIST_OPS_CPU(Variant);
 
-TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint8);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint16);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint16);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint32);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(bfloat16);
+#undef REGISTER_TENSOR_LIST_OPS_CPU
 
-#undef REGISTER_TENSOR_LIST_FROM_TENSOR_CPU
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)
 
 REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
                                           TensorList,
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 23f552642cac273cf53b25a6d43e1e6ca23ea0cc..525938971291b177c4a5246007295838a0353c26 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -36,47 +36,38 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_TENSOR_LIST_STACK_GPU(T)                         \
+#define REGISTER_TENSOR_LIST_OPS_GPU(T)                           \
   REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
                               .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU),                \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape"),       \
                           TensorListStack<GPUDevice, T>)          \
   REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
-                              .HostMemory("indices"),             \
+                              .HostMemory("indices")              \
+                              .HostMemory("element_shape"),       \
                           TensorListGather<GPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index")                \
+                              .HostMemory("element_shape"),       \
+                          TensorListGetItem<GPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape"),       \
+                          TensorListPopBack<GPUDevice, T>)        \
   REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("lengths"),             \
-                          TensorListConcat<GPUDevice, T>)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
-REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_STACK_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_STACK_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_STACK_GPU);
-REGISTER_TENSOR_LIST_STACK_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_STACK_GPU
-
-#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(T)               \
+                          TensorListConcat<GPUDevice, T>)         \
   REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU),                \
-                          TensorListPushBackBatch<GPUDevice, T>)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU
-
-#define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
+                          TensorListPushBackBatch<GPUDevice, T>)  \
   REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
@@ -88,6 +79,13 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
                               .HostMemory("element_shape")        \
                               .HostMemory("indices"),             \
                           TensorListScatter<GPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")             \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape")        \
+                              .HostMemory("num_elements")         \
+                              .HostMemory("indices"),             \
+                          TensorListScatter<GPUDevice, T>)        \
   REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
@@ -95,14 +93,21 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
                               .HostMemory("lengths"),             \
                           TensorListSplit<GPUDevice, T>)
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_FROM_TENSOR_GPU
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_OPS_GPU);
+REGISTER_TENSOR_LIST_OPS_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_int32(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_OPS_GPU);
+REGISTER_TENSOR_LIST_OPS_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_OPS_GPU
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")
+                            .TypeConstraint<Variant>("element_dtype")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape"),
+                        TensorListPopBack<GPUDevice, Variant>)
 
 REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
                                           TensorList,
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 686679474c40dc922683786cdfe65ffb3fbc03e2..7b3ff078c80a97b83b6895806016b9178f1d46a0 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -65,6 +66,10 @@ struct TensorList {
 
 Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
 
+Status GetElementShapeFromInput(OpKernelContext* c,
+                                const TensorList& tensor_list, int index,
+                                PartialTensorShape* element_shape);
+
 template <typename Device, typename T>
 class TensorListStack : public OpKernel {
  public:
@@ -75,8 +80,6 @@ class TensorListStack : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("num_elements", &num_elements_));
   }
 
-  ~TensorListStack() {}
-
   void Compute(OpKernelContext* c) override {
     const TensorList* tensor_list =
         c->input(0).scalar<Variant>()().get<TensorList>();
@@ -89,13 +92,6 @@ class TensorListStack : public OpKernel {
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    OP_REQUIRES(
-        c,
-        !tensor_list->tensors.empty() ||
-            tensor_list->element_shape.IsFullyDefined(),
-        errors::InvalidArgument("Tried to stack elements of a empty ",
-                                "list with non-fully-defined shape: ",
-                                tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
                   errors::InvalidArgument(
@@ -103,37 +99,66 @@ class TensorListStack : public OpKernel {
                       " elements but got a list with ",
                       tensor_list->tensors.size(), " elements."));
     }
-    // Compute the shape of the output tensor.
-    // If `element_shape` is fully-defined it gets used. It is assumed that all
-    // element tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first element
-    // tensor is used and it is checked that all other tensors have the same
-    // shape.
-    TensorShape resulting_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
-      const Tensor& t = tensor_list->tensors[0];
-      resulting_shape = t.shape();
-      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+    PartialTensorShape partial_element_shape;
+    OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 1,
+                                               &partial_element_shape));
+    OP_REQUIRES(
+        c,
+        partial_element_shape.IsFullyDefined() || !tensor_list->tensors.empty(),
+        errors::InvalidArgument("Tried to stack elements of an empty ",
+                                "list with non-fully-defined element_shape: ",
+                                partial_element_shape.DebugString()));
+
+    // Check that `element_shape` input tensor is compatible with the shapes of
+    // element tensors.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      for (int i = 0; i < tensor_list->tensors.size(); ++i) {
         const Tensor& t = tensor_list->tensors[i];
-        OP_REQUIRES(c, t.shape() == resulting_shape,
-                    errors::InvalidArgument(
-                        "Tried to stack tensors with unequal shapes: ",
-                        resulting_shape.DebugString(), " vs ",
-                        t.shape().DebugString()));
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = partial_element_shape;
+          OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+        }
       }
     }
-    resulting_shape.InsertDim(0, tensor_list->tensors.size());
+
+    // Compute the shape of the output tensor by pre-pending the leading dim to
+    // the element_shape.
+    TensorShape element_shape;
+    OP_REQUIRES(c, partial_element_shape.AsTensorShape(&element_shape),
+                errors::InvalidArgument(
+                    "Tried to stack list which only contains uninitialized ",
+                    "tensors and has a non-fully-defined element_shape: ",
+                    partial_element_shape.DebugString()));
+    TensorShape output_shape = element_shape;
+    output_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
 
     ConstMatrixVector inputs_flat;
     inputs_flat.reserve(tensor_list->tensors.size());
+    Tensor zeros;
     for (const auto& t : tensor_list->tensors) {
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          t.shaped<T, 2>({1, t.NumElements()})));
+      if (t.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            t.shaped<T, 2>({1, t.NumElements()})));
+      } else {
+        if (!zeros.NumElements()) {
+          AllocatorAttributes attr;
+          if (element_dtype_ == DT_VARIANT) {
+            attr.set_on_host(true);
+          }
+          OP_REQUIRES_OK(
+              c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+          functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                               zeros.flat<T>());
+        }
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -151,6 +176,123 @@ class TensorListStack : public OpKernel {
   DataType element_dtype_;
 };
 
+template <typename Device, typename T>
+class TensorListGetItem : public OpKernel {
+ public:
+  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    OP_REQUIRES(
+        c, c->input(0).shape().num_elements() == 1,
+        errors::InvalidArgument("List tensors are supposed to be scalars."));
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+    int32 index = c->input(1).scalar<int32>()();
+    OP_REQUIRES(c, index < l->tensors.size(),
+                errors::InvalidArgument("Trying to access element ", index,
+                                        " in a list with ", l->tensors.size(),
+                                        " elements."));
+    if (l->tensors[index].dtype() != DT_INVALID) {
+      c->set_output(0, l->tensors[index]);
+    } else {
+      PartialTensorShape partial_element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *l, 2, &partial_element_shape));
+      TensorShape element_shape;
+      OP_REQUIRES(
+          c, partial_element_shape.AsTensorShape(&element_shape),
+          errors::InvalidArgument("Trying to read an uninitialized tensor but ",
+                                  "element_shape is not fully defined.",
+                                  partial_element_shape.DebugString()));
+      Tensor* result;
+      AllocatorAttributes attr;
+      if (element_dtype_ == DT_VARIANT) {
+        attr.set_on_host(true);
+      }
+      OP_REQUIRES_OK(c, c->allocate_output(0, element_shape, &result, attr));
+      functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                           result->flat<T>());
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListPopBack : public OpKernel {
+ public:
+  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+
+    OP_REQUIRES(c, !l->tensors.empty(),
+                errors::InvalidArgument("Trying to pop from an empty list."));
+
+    const Tensor& t = l->tensors.back();
+    if (t.dtype() != DT_INVALID) {
+      c->set_output(1, t);
+    } else {
+      PartialTensorShape partial_element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *l, 1, &partial_element_shape));
+      TensorShape element_shape;
+      OP_REQUIRES(
+          c, partial_element_shape.AsTensorShape(&element_shape),
+          errors::InvalidArgument("Trying to read an uninitialized tensor but ",
+                                  "element_shape is not fully defined.",
+                                  partial_element_shape.DebugString()));
+      Tensor* result;
+      AllocatorAttributes attr;
+      if (element_dtype_ == DT_VARIANT) {
+        attr.set_on_host(true);
+      }
+      OP_REQUIRES_OK(c, c->allocate_output(1, element_shape, &result, attr));
+      functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                           result->flat<T>());
+    }
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.pop_back();
+    } else {
+      TensorList output;
+      output = *l;
+      output.tensors.pop_back();
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      result->scalar<Variant>()() = std::move(output);
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
 template <typename Device, typename T>
 class TensorListConcat : public OpKernel {
  public:
@@ -158,10 +300,19 @@ class TensorListConcat : public OpKernel {
       std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>;
   explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+    // TODO(skyewm): the HasAttr check can be removed once the
+    // element_shape_except_first_dim attr has been checked in for 2 weeks
+    // (around 1/14/2019).
+    if (c->HasAttr("element_shape")) {
+      PartialTensorShape element_shape;
+      OP_REQUIRES_OK(c, c->GetAttr("element_shape", &element_shape));
+      if (!element_shape.unknown_rank()) {
+        element_shape_except_first_dim_ = PartialTensorShape(
+            gtl::ArraySlice<int64>(element_shape.dim_sizes()).subspan(1));
+      }
+    }
   }
 
-  ~TensorListConcat() {}
-
   void Compute(OpKernelContext* c) override {
     // Check that the input Variant tensor is indeed a TensorList and has the
     // correct element type.
@@ -178,29 +329,33 @@ class TensorListConcat : public OpKernel {
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
     // If the TensorList is empty, its element_shape must be fully defined
     // except for the first dimension.
-    PartialTensorShape shape_except_first_dim;
-    if (!tensor_list->element_shape.unknown_rank()) {
-      OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
-                  errors::InvalidArgument(
-                      "Concat requires elements to be at least vectors, ",
-                      "found scalars instead."));
-      shape_except_first_dim = PartialTensorShape(
-          gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
-              .subspan(1));
+    if (!element_shape_except_first_dim_.IsFullyDefined()) {
+      if (!tensor_list->element_shape.unknown_rank()) {
+        OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
+                    errors::InvalidArgument(
+                        "Concat requires elements to be at least vectors, ",
+                        "found scalars instead."));
+        PartialTensorShape shape_except_first_dim(
+            gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
+                .subspan(1));
+        PartialTensorShape tmp = element_shape_except_first_dim_;
+        OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim,
+                                        &element_shape_except_first_dim_));
+      }
     }
     OP_REQUIRES(c,
                 !tensor_list->tensors.empty() ||
-                    shape_except_first_dim.IsFullyDefined(),
+                    element_shape_except_first_dim_.IsFullyDefined(),
                 errors::InvalidArgument(
                     "All except the first dimension must be fully defined ",
                     "when concating an empty tensor list. element_shape: ",
                     tensor_list->element_shape.DebugString()));
     // 1. Compute the shape of the output tensor.
-    // If `shape_except_first_dim` is fully-defined we just prepend the leading
-    // dim to it. Otherwise we use the shape of the first element tensor and
-    // check to make sure shapes of all tensors are compatible.
+    // If `element_shape_except_first_dim_` is fully-defined we just prepend the
+    // leading dim to it. Otherwise we use the shape of the first element tensor
+    // and check to make sure shapes of all tensors are compatible.
     TensorShape output_shape;
-    if (!shape_except_first_dim.AsTensorShape(&output_shape)) {
+    if (!element_shape_except_first_dim_.AsTensorShape(&output_shape)) {
       const Tensor& element_tensor = tensor_list->tensors[0];
       OP_REQUIRES(
           c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
@@ -268,6 +423,7 @@ class TensorListConcat : public OpKernel {
 
  private:
   DataType element_dtype_;
+  PartialTensorShape element_shape_except_first_dim_;
 };
 
 template <typename Device, typename T>
@@ -367,47 +523,47 @@ class TensorListGather : public OpKernel {
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
     Tensor indices = c->input(1);
+    PartialTensorShape partial_element_shape;
+    OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 2,
+                                               &partial_element_shape));
     OP_REQUIRES(
-        c,
-        indices.NumElements() > 0 ||
-            tensor_list->element_shape.IsFullyDefined(),
+        c, partial_element_shape.IsFullyDefined() || indices.NumElements() > 0,
         errors::InvalidArgument("Tried to gather 0-elements from "
                                 "a list with non-fully-defined shape: ",
-                                tensor_list->element_shape.DebugString()));
-    // Compute the shape of the output tensor.
-    // If `element_shape` is fully-defined it gets used. It is assumed that all
-    // requested tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first requested
-    // tensor is used and it is checked that all other tensors have the same
-    // shape.
-    TensorShape resulting_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
-      const int i = indices.flat<int32>()(0);
-      OP_REQUIRES(
-          c, i < tensor_list->tensors.size(),
-          errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  tensor_list->tensors.size(), " elements."));
-      const Tensor& t = tensor_list->tensors[i];
-      resulting_shape = t.shape();
-      for (int index = 1; index < indices.NumElements(); ++index) {
+                                partial_element_shape.DebugString()));
+
+    // Check that `element_shape` input tensor is compatible with the shapes of
+    // element tensors.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      for (int index = 0; index < indices.NumElements(); ++index) {
         const int i = indices.flat<int32>()(index);
         const Tensor& t = tensor_list->tensors[i];
-        OP_REQUIRES(c, t.shape() == resulting_shape,
-                    errors::InvalidArgument(
-                        "Tried to gather elements with unequal shapes: ",
-                        resulting_shape.DebugString(), " vs ",
-                        t.shape().DebugString()));
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = partial_element_shape;
+          OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+        }
       }
     }
-    resulting_shape.InsertDim(0, indices.NumElements());
+
+    // Compute the shape of the output tensor by pre-pending the leading dim to
+    // the element_shape.
+    TensorShape element_shape;
+    OP_REQUIRES(
+        c, partial_element_shape.AsTensorShape(&element_shape),
+        errors::InvalidArgument("Tried to gather uninitialized tensors from a ",
+                                "list with non-fully-defined element_shape: ",
+                                partial_element_shape.DebugString()));
+    TensorShape output_shape = element_shape;
+    output_shape.InsertDim(0, indices.NumElements());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(tensor_list->tensors.size());
+    inputs_flat.reserve(indices.NumElements());
+    Tensor zeros;
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
       OP_REQUIRES(
@@ -415,8 +571,24 @@ class TensorListGather : public OpKernel {
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
                                   tensor_list->tensors.size(), " elements."));
       const Tensor& t = tensor_list->tensors[i];
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          t.shaped<T, 2>({1, t.NumElements()})));
+      if (t.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            t.shaped<T, 2>({1, t.NumElements()})));
+      } else {
+        if (!zeros.NumElements()) {
+          AllocatorAttributes attr;
+          if (element_dtype_ == DT_VARIANT) {
+            attr.set_on_host(true);
+          }
+          OP_REQUIRES_OK(
+              c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+          functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                               zeros.flat<T>());
+        }
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -491,6 +663,13 @@ class TensorListScatter : public OpKernel {
     Tensor indices = c->input(1);
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape));
+    // TensorListScatterV2 passes the num_elements input, TensorListScatter does
+    // not.
+    int num_elements = c->num_inputs() >= 4 ? c->input(3).scalar<int>()() : -1;
+    OP_REQUIRES(c, num_elements >= -1,
+                errors::InvalidArgument(
+                    "TensorListScatter expects num_elements >= -1, found: ",
+                    num_elements));
     TensorList output_list;
     const Tensor& input_tensor = c->input(0);
     output_list.element_dtype = input_tensor.dtype();
@@ -505,14 +684,37 @@ class TensorListScatter : public OpKernel {
                     "Specified a list with shape ", element_shape.DebugString(),
                     " from a tensor with shape ", output_shape.DebugString()));
     output_list.element_shape = element_shape;
-    output_list.tensors.reserve(indices.NumElements());
+
+    OP_REQUIRES(c, indices.NumElements() == input_tensor.shape().dim_size(0),
+                errors::InvalidArgument(
+                    "Invalid number of rows in input tensor. Expected: ",
+                    indices.NumElements(),
+                    " Actual: ", input_tensor.shape().dim_size(0)));
+
+    // Validate indices and resize output_list.tensors to fit the highest index.
+    {
+      int highest_index = -1;
+      for (int index = 0; index < indices.NumElements(); ++index) {
+        const int i = indices.flat<int32>()(index);
+        OP_REQUIRES(
+            c, i >= 0,
+            errors::InvalidArgument(
+                "Indices in TensorListScatter must all be non-negative."));
+        OP_REQUIRES(c, num_elements == -1 || i < num_elements,
+                    errors::InvalidArgument(
+                        "TensorListScatter: Trying to scatter at index ", i,
+                        " in list with size ", num_elements));
+        if (i > highest_index) {
+          highest_index = i;
+        }
+      }
+      output_list.tensors.resize(std::max(highest_index + 1, num_elements),
+                                 Tensor(DT_INVALID));
+    }
+
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
-      OP_REQUIRES(c, i < input_tensor.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "Trying to scatter index ", i, " from tensor with ",
-                      input_tensor.shape().dim_size(0), " rows."));
-      Tensor tmp = input_tensor.Slice(i, i + 1);
+      Tensor tmp = input_tensor.Slice(index, index + 1);
       TensorShape tmp_shape = tmp.shape();
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
@@ -525,7 +727,7 @@ class TensorListScatter : public OpKernel {
       // many small ondes.
       aligned.flat<T>().device(c->eigen_device<Device>()) =
           tmp.unaligned_flat<T>();
-      output_list.tensors.push_back(aligned);
+      std::swap(output_list.tensors[i], aligned);
     }
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
@@ -589,8 +791,6 @@ class TensorListPushBackBatch : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
   }
 
-  ~TensorListPushBackBatch() override {}
-
   void Compute(OpKernelContext* c) override {
     const Tensor& input = c->input(1);
     OP_REQUIRES(c, element_dtype_ == input.dtype(),
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 9451247f2684892f4666f77128d5721be9a2baa7..b046401c0ae397682a7e0e780e15c9c9f75a7524 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 #define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/lookup_interface.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/lookup_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/lookup_tables/BUILD b/tensorflow/core/kernels/lookup_tables/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f4a41a5cfc06a74edd1220b345cef2275bf8f90f
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/BUILD
@@ -0,0 +1,60 @@
+# Description:
+#   OpKernels and resource templates for lookup tables.
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+
+cc_library(
+    name = "lookup_table_interface",
+    hdrs = ["lookup_table_interface.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "table_resource_utils",
+    hdrs = ["table_resource_utils.h"],
+    deps = [
+        ":lookup_table_interface",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "table_op_utils",
+    hdrs = ["table_op_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+tf_kernel_library(
+    name = "fingerprint64_map_ops",
+    srcs = [
+        "fingerprint64_map_ops.cc",
+    ],
+    deps = [
+        ":table_op_utils",
+        ":table_resource_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/core/kernels/lookup_tables/fingerprint64_map_ops.cc b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a000828c4b01a932b5fa5c518fb18c3b70af54e4
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_ops.cc
@@ -0,0 +1,144 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/lookup_tables/table_op_utils.h"
+#include "tensorflow/core/kernels/lookup_tables/table_resource_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Map x -> (Fingerprint64(x) % num_oov_buckets) + offset.
+// num_oov_buckets and offset are node attributes provided at construction
+// time.
+template <class HeterogeneousKeyType, class ValueType>
+class Fingerprint64Map final
+    : public LookupTableInterface<HeterogeneousKeyType, ValueType> {
+ public:
+  Fingerprint64Map(int64 num_oov_buckets, int64 offset)
+      : num_oov_buckets_(num_oov_buckets), offset_(offset) {}
+
+  mutex* GetMutex() const override { return nullptr; }
+
+  bool UnsafeInsertOrAssign(const HeterogeneousKeyType& key,
+                            const ValueType& value) override {
+    return true;
+  }
+
+  Status TableUnbatchedInsertStatus() const override {
+    return errors::Unimplemented("Fingerprint64Map does not support inserts.");
+  }
+
+  Status BatchInsertOrAssign(absl::Span<const HeterogeneousKeyType> keys,
+                             absl::Span<const ValueType> values) override {
+    return errors::Unimplemented("Fingerprint64Map does not support inserts.");
+  }
+
+  ValueType UnsafeLookupKey(
+      const HeterogeneousKeyType& key_to_find) const override {
+    // This can cause a downcast.
+    return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                  num_oov_buckets_) +
+           offset_;
+  }
+
+  Status TableUnbatchedLookupStatus() const override { return Status::OK(); }
+
+  Status BatchLookup(absl::Span<const HeterogeneousKeyType> keys,
+                     absl::Span<ValueType> values,
+                     int64 prefetch_lookahead) const override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+    for (size_t i = 0; i < keys.size(); ++i) {
+      values[i] = Fingerprint64Map::UnsafeLookupKey(keys[i]);
+    }
+    return Status::OK();
+  }
+
+  const absl::optional<const ValueType> DefaultValue() const override {
+    return {};
+  }
+
+  void UnsafePrefetchKey(
+      const HeterogeneousKeyType& key_to_find) const override {}
+
+  size_t UnsafeSize() const override { return 0; }
+
+  Status SizeStatus() const override {
+    return errors::Unimplemented(
+        "Fingerprint64Map does not have a concept of size.");
+  }
+
+  bool UnsafeContainsKey(
+      const HeterogeneousKeyType& key_to_find) const override {
+    return true;
+  }
+
+ private:
+  const int64 num_oov_buckets_;
+  const int64 offset_;
+  TF_DISALLOW_COPY_AND_ASSIGN(Fingerprint64Map);
+};
+
+template <typename Fingerprint64Map>
+struct Fingerprint64MapFactory {
+  struct Functor {
+    template <typename ContainerBase>
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    ContainerBase** container) {
+      int64 num_oov_buckets;
+      int64 offset;
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(kernel->def(), "num_oov_buckets", &num_oov_buckets));
+      TF_RETURN_IF_ERROR(GetNodeAttr(kernel->def(), "offset", &offset));
+      *container = new Fingerprint64Map(num_oov_buckets, offset);
+      return Status::OK();
+    }
+  };
+};
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                             \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("Fingerprint64Map")                                                \
+          .Device(DEVICE_CPU)                                                 \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype")                 \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),            \
+      ResourceConstructionOp<                                                 \
+          LookupTableInterface<absl::string_view, table_value_dtype>,         \
+          Fingerprint64MapFactory<Fingerprint64Map<                           \
+              absl::string_view, table_value_dtype>>::Functor>);              \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("Fingerprint64Map")                                                \
+          .Device(DEVICE_CPU)                                                 \
+          .TypeConstraint<string>("heterogeneous_key_dtype")                  \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),            \
+      ResourceConstructionOp<LookupTableInterface<string, table_value_dtype>, \
+                             Fingerprint64MapFactory<Fingerprint64Map<        \
+                                 string, table_value_dtype>>::Functor>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/lookup_table_interface.h b/tensorflow/core/kernels/lookup_tables/lookup_table_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cfe44eda79170c8b140ba6401c95542cbd0b5b5
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/lookup_table_interface.h
@@ -0,0 +1,120 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_LOOKUP_TABLE_INTERFACE_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_LOOKUP_TABLE_INTERFACE_H_
+
+#include <cstddef>
+#include <string>
+
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Interface for key-value pair lookups with support for heterogeneous keys.
+// This class contains two main kinds of methods: methods which operate on
+// a batch of inputs and methods which do not. The latter have the prefix
+// 'Unsafe'. Clients must call the corresponding status methods to determine
+// whether they are safe to call within a code block.
+// Implementations must guarantee thread-safety when GetMutex is used to
+// synchronize method access.
+template <typename HeterogeneousKeyType, typename ValueType>
+class LookupTableInterface : public ResourceBase {
+ public:
+  using heterogeneous_key_type = HeterogeneousKeyType;
+  using value_type = ValueType;
+  using key_type = heterogeneous_key_type;
+
+  // Return value should be used to synchronize read/write access to
+  // all public methods. If null, no synchronization is needed.
+  virtual mutex* GetMutex() const = 0;
+
+  // Insert the KV pair into the underlying table. If a key equivalent to key
+  // already exists in the underlying table, its corresponding value is
+  // overridden. Returns true only if the key was inserted for the first time.
+  // Undefined if TableUnbatchedInsertStatus() != OK.
+  virtual bool UnsafeInsertOrAssign(const HeterogeneousKeyType& key,
+                                    const ValueType& value) = 0;
+
+  // Returns OK if it is safe to call InsertOrAssign.
+  // Once OK is returned, it is safe to call InsertOrAssign for the rest of the
+  // program.
+  virtual Status TableUnbatchedInsertStatus() const TF_MUST_USE_RESULT = 0;
+
+  // Stores each KV pair {keys[i], values[i]} in the underlying map, overriding
+  // pre-existing pairs which have equivalent keys.
+  // keys and values should have the same size.
+  virtual Status BatchInsertOrAssign(
+      absl::Span<const HeterogeneousKeyType> keys,
+      absl::Span<const ValueType> values) = 0;
+
+  // Prefetch key_to_find into implementation defined data caches.
+  // Implementations are free to leave this a no-op.
+  // Undefined if TableUnbatchedLookupStatus() != OK.
+  virtual void UnsafePrefetchKey(
+      const HeterogeneousKeyType& key_to_find) const {}
+
+  // Returns true if and only if the table contains key_to_find.
+  // Undefined if TableUnbatchedLookupStatus() != OK.
+  virtual bool UnsafeContainsKey(
+      const HeterogeneousKeyType& key_to_find) const = 0;
+
+  // Lookup the value for key_to_find. This value must always be well-defined,
+  // even when ContainsKey(key_to_find) == false. When
+  // dv = DefaultValue() != absl::nullopt and ContainsKey(key_to_find) == false,
+  // dv is returned.
+  // Undefined if TableUnbatchedLookupStatus() != OK.
+  virtual ValueType UnsafeLookupKey(
+      const HeterogeneousKeyType& key_to_find) const = 0;
+
+  // Returns OK if it is safe to call PrefetchKey, ContainsKey, and
+  // UnsafeLookupKey.
+  // If OK is returned, it is safe to call these methods until the next
+  // non-const method of this class is called.
+  virtual Status TableUnbatchedLookupStatus() const TF_MUST_USE_RESULT = 0;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status BatchLookup(absl::Span<const HeterogeneousKeyType> keys,
+                             absl::Span<ValueType> values,
+                             int64 prefetch_lookahead) const = 0;
+
+  // Returns the number of elements in the table.
+  // Undefined if SizeStatus() != OK.
+  virtual size_t UnsafeSize() const = 0;
+
+  // Returns OK if the return value of UnsafeSize() is always well-defined.
+  virtual Status SizeStatus() const TF_MUST_USE_RESULT = 0;
+
+  // If non-null value is returned, LookupKey returns that value only for keys
+  // which satisfy ContainsKey(key_to_find) == false.
+  virtual const absl::optional<const ValueType> DefaultValue() const = 0;
+
+  string DebugString() const override { return "A lookup table"; }
+
+  ~LookupTableInterface() override = default;
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_LOOKUP_TABLE_INTERFACE_H_
diff --git a/tensorflow/core/kernels/lookup_tables/table_op_utils.h b/tensorflow/core/kernels/lookup_tables/table_op_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bc18c7dc204fa74b6a0f6d044b7f48cd554a52d
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/table_op_utils.h
@@ -0,0 +1,234 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_OP_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_OP_UTILS_H_
+
+#include <cstddef>
+#include <string>
+#include <type_traits>
+
+#include "absl/base/thread_annotations.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Create resources of type ContainerBase using the static method
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// ContainerBase**)
+// If the resource has already been created it will be looked up.
+template <class ContainerBase, typename Functor>
+class ResourceConstructionOp : public OpKernel {
+ public:
+  explicit ResourceConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx,
+                    this](ContainerBase** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      ContainerBase* container;
+      auto status = Functor::AllocateContainer(ctx, this, &container);
+      if (TF_PREDICT_FALSE(!status.ok())) {
+        container->Unref();
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(container->MemoryUsed());
+      }
+      *ret = container;
+      return Status::OK();
+    };
+
+    ContainerBase* container_base = nullptr;
+    OP_REQUIRES_OK(
+        ctx, cinfo_.resource_manager()->template LookupOrCreate<ContainerBase>(
+                 cinfo_.container(), cinfo_.name(), &container_base, creator));
+    core::ScopedUnref unref_me(container_base);
+
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ContainerBase>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+  }
+
+  ~ResourceConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ContainerBase>(cinfo_.container(),
+                                                cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+    }
+  }
+
+ private:
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ResourceConstructionOp);
+};
+
+// Create resources of type ContainerBase using the static method
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// FallbackTableBaseType*, ContainerBase**)
+// If the resource has already been created it will be looked up.
+// Container must decrease the reference count of the FallbackTableBaseType*
+// constructor argument before its destructor completes.
+template <class ContainerBase, class Functor,
+          class FallbackTableBaseType = ContainerBase>
+class TableWithFallbackConstructionOp : public OpKernel {
+ public:
+  explicit TableWithFallbackConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    if (ctx->num_inputs() == table_int64_args.size()) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "Expected op to have a resource input after the table_int64_args "
+          "input but no such input found."));
+      return;
+    }
+
+    FallbackTableBaseType* fallback_table = nullptr;
+    {
+      const Tensor& table_handle = ctx->input(table_int64_args.size());
+      ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+      OP_REQUIRES_OK(
+          ctx, ctx->resource_manager()->Lookup(handle.container(),
+                                               handle.name(), &fallback_table));
+    }
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx, this, fallback_table](
+                       ContainerBase** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      // container construction logic can't be merged with
+      // ResourceConstructionOp because Container constructor requires an
+      // input which can only be constructed if the resource manager
+      // internal lock is not already held.
+      ContainerBase* container;
+      auto status =
+          Functor::AllocateContainer(ctx, this, fallback_table, &container);
+      if (TF_PREDICT_FALSE(!status.ok())) {
+        container->Unref();
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(container->MemoryUsed());
+      }
+      *ret = container;
+      return Status::OK();
+    };
+
+    ContainerBase* table = nullptr;
+    OP_REQUIRES_OK(
+        ctx, cinfo_.resource_manager()->template LookupOrCreate<ContainerBase>(
+                 cinfo_.container(), cinfo_.name(), &table, creator));
+    core::ScopedUnref unref_me(table);
+
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ContainerBase>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+  }
+
+  ~TableWithFallbackConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ContainerBase>(cinfo_.container(),
+                                                cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+    }
+  }
+
+ private:
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TableWithFallbackConstructionOp);
+};
+
+// Op that returns the size of a container.
+template <class Container>
+class ContainerSizeOp : public OpKernel {
+ public:
+  explicit ContainerSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& container_handle = ctx->input(0);
+    ResourceHandle handle(container_handle.scalar<ResourceHandle>()());
+    Container* container;
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup(
+                            handle.container(), handle.name(), &container));
+    core::ScopedUnref unref_me(container);
+    OP_REQUIRES_OK(ctx, container->SizeStatus());
+
+    Tensor* out;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+
+    auto* mutex = container->GetMutex();
+    if (mutex != nullptr) {
+      tf_shared_lock lock(*mutex);
+      out->scalar<int64>()() = container->UnsafeSize();
+    } else {
+      out->scalar<int64>()() = container->UnsafeSize();
+    }
+  }
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_OP_UTILS_H_
diff --git a/tensorflow/core/kernels/lookup_tables/table_resource_utils.h b/tensorflow/core/kernels/lookup_tables/table_resource_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2065904d61e63ca598584051a1b05b6c99b51b1f
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/table_resource_utils.h
@@ -0,0 +1,101 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_RESOURCE_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_RESOURCE_UTILS_H_
+
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/lookup_tables/lookup_table_interface.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Parent class for tables with support for multithreaded synchronization.
+template <typename HeterogeneousKeyType, typename ValueType>
+class LookupTableWithSynchronization
+    : public LookupTableInterface<HeterogeneousKeyType, ValueType> {
+ public:
+  // By convention, it is assumed that the OpKernel which creates this
+  // resource is bound to an op whose first input is an in64 tensor list with a
+  // first element whose boolean value indicates whether a mutex should be
+  // exported to synchronize access to state. If the tensor list is empty,
+  // the status in ctx is set to InvalidArgument and this object is in an
+  // undefined state.
+  LookupTableWithSynchronization(OpKernelContext* ctx, OpKernel* kernel) {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    if (table_int64_args.size() == 0) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "table_int64_args should not be empty. Set the first element "
+          "to 1 to enable synchronized table use and to 0 otherwise."));
+      return;
+    }
+    if (ctx->input(0).scalar<int64>()() != 0) {
+      mutex_ = absl::make_unique<mutex>();
+    }
+  }
+
+  // Mutex for synchronizing access to unsynchronized methods.
+  mutex* GetMutex() const override { return mutex_.get(); }
+
+ private:
+  // Use this for locking.
+  mutable std::unique_ptr<mutex> mutex_;
+};
+
+// Parent class for tables which can be constructed with arbitrary
+// lookup fallbacks.
+// Since LookupTableInterface::LookupKey assumes that all keys can be mapped
+// to values, LookupTableWithFallbackInterface allows clients to implement
+// two-stage lookups. If the first key lookup fails, clients can choose
+// to perform a fallback lookup using an externally supplied table.
+template <typename HeterogeneousKeyType, typename ValueType,
+          typename FallbackTableRegisteredType =
+              LookupTableInterface<HeterogeneousKeyType, ValueType>>
+class LookupTableWithFallbackInterface
+    : public LookupTableWithSynchronization<HeterogeneousKeyType, ValueType> {
+ public:
+  LookupTableWithFallbackInterface(OpKernelContext* ctx, OpKernel* kernel,
+                                   FallbackTableRegisteredType* fallback_table)
+      : LookupTableWithSynchronization<HeterogeneousKeyType, ValueType>(ctx,
+                                                                        kernel),
+        fallback_table_(fallback_table) {}
+
+  // Clients are required to fail when ctx is set to a not-OK status in
+  // the constructor so this dereference is safe.
+  const FallbackTableRegisteredType& fallback_table() const {
+    return *fallback_table_;
+  }
+
+  ~LookupTableWithFallbackInterface() override {
+    if (fallback_table_ != nullptr) {
+      fallback_table_->Unref();
+    }
+  }
+
+ private:
+  FallbackTableRegisteredType* fallback_table_;
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_TABLE_RESOURCE_UTILS_H_
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index f405ca3c58cfffc8422dcdd65e66c7fd12784519..ba30432e21a12d66c69217bec0c75660a0ae83ec 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index dd89597369bce0dcfd8ae8ad7e2bfc47d8ae2817..27a8696e54647e14eda209c36b7b49c1d171d3bc 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -480,7 +480,7 @@ class StagingMap : public ResourceBase {
     return map_.size();
   }
 
-  string DebugString() override { return "StagingMap"; }
+  string DebugString() const override { return "StagingMap"; }
 };
 
 template <bool Ordered>
diff --git a/tensorflow/core/kernels/matrix_solve_op.cc b/tensorflow/core/kernels/matrix_solve_op.cc
index 169f3dae76d2fb6d0515d22648a9047657af0032..f3919a16aa50694fa5e05eb2cc421f1dd3f378a1 100644
--- a/tensorflow/core/kernels/matrix_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_solve_op.cc
@@ -214,9 +214,12 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
     auto input_copy_ptrs = solver->GetScratchSpace<uint8>(
         sizeof(Scalar*) * batch_size, "input_copt_ptrs",
         /* on_host */ true);
-    if (n / batch_size <= 128) {
-      // For small matrices or large batch sizes, we use the batched
-      // interface from cuBlas.
+    const int kMaxMatrixSizeToBatchSizeRatio = 128;
+    const bool use_batched_solver =
+        n <= kMaxMatrixSizeToBatchSizeRatio * batch_size;
+    if (use_batched_solver) {
+      // For small matrices or large batch sizes, we use the batched interface
+      // from cuBlas.
       const Scalar** input_copy_ptrs_base =
           reinterpret_cast<const Scalar**>(input_copy_ptrs.mutable_data());
       for (int batch = 0; batch < batch_size; ++batch) {
@@ -230,8 +233,8 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
                                &dev_info.back(), batch_size),
           done);
     } else {
-      // For small batch sizes we use the non-batched interface from cuSolver,
-      // which is much faster for large matrices.
+      // For small batch sizes or large matrices, we use the non-batched
+      // interface from cuSolver, which is much faster for large matrices.
       dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
       for (int batch = 0; batch < batch_size; ++batch) {
         OP_REQUIRES_OK_ASYNC(
@@ -279,11 +282,7 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
         /* on_host */ true);
     auto transposed_rhs_reshaped =
         transposed_rhs.template flat_inner_dims<Scalar, 3>();
-    // TODO(rmlarsen): Enable the following branch when I figure
-    // out why it causes a segfault.
-    if (false && n / batch_size <= 128) {
-      dev_info.push_back(
-          solver->GetDeviceLapackInfo(batch_size, "GetrsBatched"));
+    if (use_batched_solver) {
       const Scalar** input_copy_ptrs_base =
           reinterpret_cast<const Scalar**>(input_copy_ptr_array.mutable_data());
       const Scalar** transposed_rhs_ptrs_base =
@@ -293,13 +292,20 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
         input_copy_ptrs_base[batch] = &input_copy_reshaped(batch, 0, 0);
         transposed_rhs_ptrs_base[batch] = &transposed_rhs_reshaped(batch, 0, 0);
       }
+      int host_info = 0;
       OP_REQUIRES_OK_ASYNC(
           context,
           solver->GetrsBatched(adjoint_ ? CUBLAS_OP_C : CUBLAS_OP_T, n, nrhs,
                                input_copy_ptrs_base, n, pivots_mat.data(),
-                               transposed_rhs_ptrs_base, n, &dev_info.back(),
+                               transposed_rhs_ptrs_base, n, &host_info,
                                batch_size),
           done);
+      OP_REQUIRES_ASYNC(
+          context, host_info == 0,
+          errors::InvalidArgument("The ", -host_info,
+                                  "'th argument to cublas*getrsBatched had "
+                                  "an illegal value."),
+          done);
     } else {
       dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrs"));
       for (int batch = 0; batch < batch_size; ++batch) {
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 507fc9983776d2fd54ca66cc70aa7695886b4b5e..d24cb1cc92d59ad100ffec20262fc69888fa770c 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -22,13 +22,13 @@ limitations under the License.
 #include <vector>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/eigen_pooling.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -1134,11 +1135,18 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                 errors::InvalidArgument(
                     "qint8 should be used with data_format NCHW_VECT_C."));
 
+#if CUDNN_VERSION >= 7300
+    if (use_dnn_) {
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
+#else
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
       DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                                stride_, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
+#endif
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 56d0340547a891fe4929bd6a36a72c5e03d1d1e0..f28811ffa4d740e6733b33189a0228bea2428b19 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -390,7 +390,6 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
       0, d.stream()>>>(output_size, bottom_data, height, width, channels,
                        pooled_height, pooled_width, kernel_h, kernel_w,
                        stride_h, stride_w, pad_t, pad_l, top_data);
-  d.synchronize();
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 39e60c9fcef174a4f9e2271600ed847f4e769625..44f2997e182a912476aeab86f1158845b5f1118e 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -54,7 +54,7 @@ class Scratch : public ResourceBase {
 
   uint8_t* buffer() { return scratch_32_aligned_; }
 
-  string DebugString() { return "MetaGemmScratchResource"; }
+  string DebugString() const override { return "MetaGemmScratchResource"; }
 
  private:
   std::unique_ptr<uint8_t> scratch_;
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index f0278caee6b95269b77185d409de67a7441c5ff3..3a5c87485cc8d49f4faf8dd8d2ef0781db2a7f75 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index c1b182be4a4f755bc975563cb3767d7c0079fd7f..47b2a43ed9212f5a58cdaa07b15f8aec44ee7b0f 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -59,17 +59,20 @@ struct MklConvBwdFilterParams {
   memory::dims padding_right;
   padding_kind padding;
 
-  MklConvBwdFilterParams(memory::dims src_dims,
-    memory::dims diff_filter_dims, memory::dims diff_bias_dims,
-    memory::dims diff_dst_dims, memory::dims strides,
-    memory::dims dilations, memory::dims padding_left,
-    memory::dims padding_right, padding_kind padding) :
-      src_dims(src_dims), diff_filter_dims(diff_filter_dims),
-      diff_bias_dims(diff_bias_dims), diff_dst_dims(diff_dst_dims),
-      strides(strides), dilations(dilations),
-      padding_left(padding_left), padding_right(padding_right),
-      padding(padding) {
-  }
+  MklConvBwdFilterParams(memory::dims src_dims, memory::dims diff_filter_dims,
+                         memory::dims diff_bias_dims,
+                         memory::dims diff_dst_dims, memory::dims strides,
+                         memory::dims dilations, memory::dims padding_left,
+                         memory::dims padding_right, padding_kind padding)
+      : src_dims(src_dims),
+        diff_filter_dims(diff_filter_dims),
+        diff_bias_dims(diff_bias_dims),
+        diff_dst_dims(diff_dst_dims),
+        strides(strides),
+        dilations(dilations),
+        padding_left(padding_left),
+        padding_right(padding_right),
+        padding(padding) {}
 };
 
 template <typename T>
@@ -93,7 +96,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
   //   diff_bias_data:   output data buffer of diff_bias
   //   diff_dst_data:    input data buffer of diff_dst
   void Execute(const T* src_data, const T* diff_filter_data,
-      const T* diff_bias_data, const T* diff_dst_data) {
+               const T* diff_bias_data, const T* diff_dst_data) {
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)));
     context_.diff_filter_mem->set_data_handle(
@@ -116,8 +119,8 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
   //   src_data:         input data buffer of src
   //   diff_filter_data: output data buffer of diff_filter
   //   diff_dst_data:    input data buffer of diff_dst
-  void Execute(const T* src_data,
-      const T* diff_filter_data, const T* diff_dst_data) {
+  void Execute(const T* src_data, const T* diff_filter_data,
+               const T* diff_dst_data) {
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)));
     context_.diff_filter_mem->set_data_handle(
@@ -133,9 +136,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
     return;
   }
 
-  memory::format GetSrcMemoryFormat() const {
-    return context_.src_fmt;
-  }
+  memory::format GetSrcMemoryFormat() const { return context_.src_fmt; }
 
   memory::format GetDiffDstMemoryFormat() const {
     return context_.diff_dst_fmt;
@@ -185,37 +186,42 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
     std::shared_ptr<mkldnn::stream> bwd_filter_stream;
     std::vector<mkldnn::primitive> bwd_filter_primitives;
 
-    ConvBwdFilterContext() :
-        src_fmt(memory::format::any),
-        diff_dst_fmt(memory::format::any),
-        diff_filter_fmt(memory::format::any),
-        src_mem(nullptr), diff_filter_mem(nullptr),
-        diff_bias_mem(nullptr), diff_dst_mem(nullptr),
-        bwd_filter_desc(nullptr), fwd_desc(nullptr), fwd_pd(nullptr),
-        src_md(nullptr), diff_filter_md(nullptr),
-        diff_bias_md(nullptr), diff_dst_md(nullptr),
-        bwd_filter_stream(nullptr) {
-    }
+    ConvBwdFilterContext()
+        : src_fmt(memory::format::any),
+          diff_dst_fmt(memory::format::any),
+          diff_filter_fmt(memory::format::any),
+          src_mem(nullptr),
+          diff_filter_mem(nullptr),
+          diff_bias_mem(nullptr),
+          diff_dst_mem(nullptr),
+          bwd_filter_desc(nullptr),
+          fwd_desc(nullptr),
+          fwd_pd(nullptr),
+          src_md(nullptr),
+          diff_filter_md(nullptr),
+          diff_bias_md(nullptr),
+          diff_dst_md(nullptr),
+          bwd_filter_stream(nullptr) {}
   };
 
   // Setup Conv2d backward filter (weights) primitives.
   void Setup(const MklConvBwdFilterParams& convBwdFilterDims) {
     // create memory descriptors for convolution data w/ no specified format
-    context_.src_md.reset(new memory::desc({convBwdFilterDims.src_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.src_md.reset(new memory::desc(
+        {convBwdFilterDims.src_dims}, MklDnnType<T>(), memory::format::any));
 
-    context_.diff_dst_md.reset(new memory::desc(
-        {convBwdFilterDims.diff_dst_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.diff_dst_md.reset(
+        new memory::desc({convBwdFilterDims.diff_dst_dims}, MklDnnType<T>(),
+                         memory::format::any));
 
-    context_.diff_filter_md.reset(new memory::desc(
-        {convBwdFilterDims.diff_filter_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.diff_filter_md.reset(
+        new memory::desc({convBwdFilterDims.diff_filter_dims}, MklDnnType<T>(),
+                         memory::format::any));
 
     if (!convBwdFilterDims.diff_bias_dims.empty())
-      context_.diff_bias_md.reset(new memory::desc(
-          {convBwdFilterDims.diff_bias_dims},
-          MklDnnType<T>(), memory::format::x));
+      context_.diff_bias_md.reset(
+          new memory::desc({convBwdFilterDims.diff_bias_dims}, MklDnnType<T>(),
+                           memory::format::x));
 
     // create a convolution
     if (!convBwdFilterDims.diff_bias_dims.empty()) {
@@ -226,8 +232,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
           convBwdFilterDims.padding_left, convBwdFilterDims.padding_right,
           convBwdFilterDims.padding));
     } else {
-      context_.bwd_filter_desc.reset(
-          new convolution_backward_weights::desc(
+      context_.bwd_filter_desc.reset(new convolution_backward_weights::desc(
           convolution_direct, *context_.src_md, *context_.diff_filter_md,
           *context_.diff_dst_md, convBwdFilterDims.strides,
           convBwdFilterDims.dilations, convBwdFilterDims.padding_left,
@@ -236,18 +241,18 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
 
     // create fwd primitive_desc
     context_.fwd_desc.reset(new convolution_forward::desc(
-        prop_kind::forward, convolution_direct,
-        *context_.src_md, *context_.diff_filter_md, *context_.diff_dst_md,
-        convBwdFilterDims.strides,
-        convBwdFilterDims.dilations, convBwdFilterDims.padding_left,
-        convBwdFilterDims.padding_right, convBwdFilterDims.padding));
+        prop_kind::forward, convolution_direct, *context_.src_md,
+        *context_.diff_filter_md, *context_.diff_dst_md,
+        convBwdFilterDims.strides, convBwdFilterDims.dilations,
+        convBwdFilterDims.padding_left, convBwdFilterDims.padding_right,
+        convBwdFilterDims.padding));
     context_.fwd_pd.reset(new convolution_forward::primitive_desc(
         *context_.fwd_desc, cpu_engine_));
 
     // create backward conv primitive_desc
     context_.bwd_filter_pd.reset(
         new convolution_backward_weights::primitive_desc(
-        *context_.bwd_filter_desc, cpu_engine_, *context_.fwd_pd));
+            *context_.bwd_filter_desc, cpu_engine_, *context_.fwd_pd));
 
     // store the expected memory format
     auto bwd_filter_pd = context_.bwd_filter_pd.get();
@@ -259,25 +264,28 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
         bwd_filter_pd->diff_dst_primitive_desc().desc().data.format);
 
     // create memory primitive based on dummy data
-    context_.src_mem.reset(new memory(
-        bwd_filter_pd->src_primitive_desc(), DummyData));
-    context_.diff_filter_mem.reset(new memory(
-        bwd_filter_pd->diff_weights_primitive_desc(), DummyData));
-    context_.diff_dst_mem.reset(new memory(
-        bwd_filter_pd->diff_dst_primitive_desc(), DummyData));
+    context_.src_mem.reset(
+        new memory(bwd_filter_pd->src_primitive_desc(), DummyData));
+    context_.diff_filter_mem.reset(
+        new memory(bwd_filter_pd->diff_weights_primitive_desc(), DummyData));
+    context_.diff_dst_mem.reset(
+        new memory(bwd_filter_pd->diff_dst_primitive_desc(), DummyData));
 
     // create convolution primitive and add it to net
     if (!convBwdFilterDims.diff_bias_dims.empty()) {
-      context_.diff_bias_mem.reset(new memory(
-          {{{convBwdFilterDims.diff_bias_dims}, MklDnnType<T>(),
-          memory::format::x}, cpu_engine_}, DummyData));
+      context_.diff_bias_mem.reset(
+          new memory({{{convBwdFilterDims.diff_bias_dims},
+                       MklDnnType<T>(),
+                       memory::format::x},
+                      cpu_engine_},
+                     DummyData));
       context_.conv_bwd_filter.reset(new convolution_backward_weights(
           *context_.bwd_filter_pd, *context_.src_mem, *context_.diff_dst_mem,
           *context_.diff_filter_mem, *context_.diff_bias_mem));
     } else {
       context_.conv_bwd_filter.reset(new convolution_backward_weights(
-          *context_.bwd_filter_pd, *context_.src_mem,
-          *context_.diff_dst_mem, *context_.diff_filter_mem));
+          *context_.bwd_filter_pd, *context_.src_mem, *context_.diff_dst_mem,
+          *context_.diff_filter_mem));
     }
 
     context_.bwd_filter_primitives.push_back(*context_.conv_bwd_filter);
@@ -298,13 +306,13 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
       conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
     } else {
       // look into the pool for reusable primitive
-      conv_bwd_filter = dynamic_cast<MklConvBwdFilterPrimitive<T>*> (
-        MklConvBwdFilterPrimitiveFactory<T>::GetInstance().GetConvBwdFilter(
-            convBwdFilterDims));
+      conv_bwd_filter = dynamic_cast<MklConvBwdFilterPrimitive<T>*>(
+          MklConvBwdFilterPrimitiveFactory<T>::GetInstance().GetConvBwdFilter(
+              convBwdFilterDims));
 
-     if (conv_bwd_filter == nullptr) {
-       conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
-       MklConvBwdFilterPrimitiveFactory<T>::GetInstance().SetConvBwdFilter(
+      if (conv_bwd_filter == nullptr) {
+        conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
+        MklConvBwdFilterPrimitiveFactory<T>::GetInstance().SetConvBwdFilter(
             convBwdFilterDims, conv_bwd_filter);
       }
     }
@@ -349,12 +357,12 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-template <typename Device, class T, bool biasEnabled>
+template <typename Device, class T, bool bias_enabled, bool is_depthwise>
 class MklConvCustomBackpropFilterOp
-    : public MklConvBackpropCommonOp<Device, T> {
+    : public MklConvBackpropCommonOp<Device, T, is_depthwise> {
  public:
   explicit MklConvCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConvBackpropCommonOp<Device, T>(context) {}
+      : MklConvBackpropCommonOp<Device, T, is_depthwise>(context) {}
 
   ~MklConvCustomBackpropFilterOp() {}
 
@@ -365,7 +373,7 @@ class MklConvCustomBackpropFilterOp
       MklDnnData<T> diff_filter(&cpu_engine_);  // output
 
       // This flag indicates Conv2D or Conv3D
-      bool isConv2D = (this->strides_.size() == 4);
+      bool is_conv2d = (this->strides_.size() == 4);
 
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
@@ -396,8 +404,8 @@ class MklConvCustomBackpropFilterOp
           diff_dst_tf_shape.num_elements() == 0) {
         MklDnnShape diff_filter_mkl_shape;
         diff_filter_mkl_shape.SetMklTensor(false);
-        TensorShape diff_filter_tf_shape = GetOutputTfShape(
-            src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
+        TensorShape diff_filter_tf_shape =
+            GetOutputTfShape(src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &diff_filter_tensor,
                                   diff_filter_tf_shape, diff_filter_mkl_shape);
@@ -414,20 +422,20 @@ class MklConvCustomBackpropFilterOp
       // By default, all dims are in MKL order. Only dims in TF order
       // are those with prefix tf_order.
       memory::dims diff_dst_dims, fwd_src_dims, fwd_filter_dims;
-      memory::dims padding_left, padding_right, dilations,
-          strides, fwd_dst_dims;
+      memory::dims padding_left, padding_right, dilations, strides,
+          fwd_dst_dims;
       memory::dims fwd_dst_dims_tf_order;
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, this->strides_, this->padding_,
-          this->data_format_, this->dilations_);
+                              this->data_format_, this->dilations_);
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &fwd_src_dims, &fwd_filter_dims,
-          &strides, &dilations, &fwd_dst_dims_tf_order,
-          &fwd_dst_dims, &padding_left, &padding_right);
+          &strides, &dilations, &fwd_dst_dims_tf_order, &fwd_dst_dims,
+          &padding_left, &padding_right, false, is_depthwise);
       if (!context->status().ok()) return;
 
-      auto tf_fmt = isConv2D
+      auto tf_fmt = is_conv2d
                         ? TFDataFormatToMklDnnDataFormat(this->data_format_)
                         : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
 
@@ -439,26 +447,27 @@ class MklConvCustomBackpropFilterOp
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
 
-      auto diff_dst_md = diff_dst_mkl_shape.IsMklTensor()
-                       ? diff_dst_mkl_shape.GetMklLayout()
-                       : memory::desc(diff_dst_dims,
-                           MklDnnType<T>(), tf_fmt);
+      auto diff_dst_md =
+          diff_dst_mkl_shape.IsMklTensor()
+              ? diff_dst_mkl_shape.GetMklLayout()
+              : memory::desc(diff_dst_dims, MklDnnType<T>(), tf_fmt);
 
       memory::dims diff_bias_dims = {};
       int64 depth = 0;
-      if (biasEnabled) {
+      if (bias_enabled) {
         TensorShape obp_tf_shape = GetTfShape(context, 2);
         depth = (this->data_format_ == FORMAT_NCHW)
                     ? obp_tf_shape.dim_size(1)
-                    : obp_tf_shape.dim_size(isConv2D ? 3 : 4);
+                    : obp_tf_shape.dim_size(is_conv2d ? 3 : 4);
         diff_bias_dims = {static_cast<int>(depth)};
       }
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
       MklConvBwdFilterPrimitive<T>* conv_bwd_filter = nullptr;
-      MklConvBwdFilterParams convBwdFilterDims(fwd_src_dims, fwd_filter_dims,
-          diff_bias_dims, diff_dst_dims, strides, dilations, padding_left,
-          padding_right, TFPaddingToMklDnnPadding(this->padding_));
+      MklConvBwdFilterParams convBwdFilterDims(
+          fwd_src_dims, fwd_filter_dims, diff_bias_dims, diff_dst_dims, strides,
+          dilations, padding_left, padding_right,
+          TFPaddingToMklDnnPadding(this->padding_));
 
       // MKL DNN allocates large buffers when a conv gradient filter primtive is
       // created. So we don't cache conv backward primitives when the env
@@ -475,14 +484,38 @@ class MklConvCustomBackpropFilterOp
       MklDnnShape diff_filter_mkl_shape;
       diff_filter_mkl_shape.SetMklTensor(false);
 
-      if (isConv2D) {
-        // Conv2D: output_dims_mkl_order is in OIHW format.
-        TensorShape diff_filter_tf_shape({bwd_output_dims[MklDnnDims::Dim_H],
-                                          bwd_output_dims[MklDnnDims::Dim_W],
-                                          bwd_output_dims[MklDnnDims::Dim_I],
-                                          bwd_output_dims[MklDnnDims::Dim_O]});
-        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
-                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+      if (is_conv2d) {
+        if (!is_depthwise) {
+          // Conv2D: output_dims_mkl_order is in OIHW format.
+          TensorShape diff_filter_tf_shape(
+              {bwd_output_dims[MklDnnDims::Dim_H],
+               bwd_output_dims[MklDnnDims::Dim_W],
+               bwd_output_dims[MklDnnDims::Dim_I],
+               bwd_output_dims[MklDnnDims::Dim_O]});
+          AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                    diff_filter_tf_shape,
+                                    diff_filter_mkl_shape);
+        } else {
+          // Depthwise Conv2d: bwd_output_dims is GOIHW format
+          //                  | TensorFlow       | MKLDNN
+          // ----------------------------------------------------------------
+          // filter_out_depth | depth_multiplier | depth_multiplier *
+          //                  |                  | group_count
+          // ----------------------------------------------------------------
+          // filter_in_depth  | in_depth         | in_depth / group_count
+          // For depthwise convolution, we have group_count == in_depth.
+          // So here G = original I, and I = 1.
+          // And the GOIHW is mkldnn format, here we try to extract the TF
+          // format, TF format is HWIO, as G = original I, so here is HWGO.
+          TensorShape diff_filter_tf_shape(
+              {bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_H],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_W],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_G],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_O]});
+          AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                    diff_filter_tf_shape,
+                                    diff_filter_mkl_shape);
+        }
       } else {
         // Conv3D: output_dims_mkl_order is in OIDHW format.
         TensorShape diff_filter_tf_shape(
@@ -496,38 +529,36 @@ class MklConvCustomBackpropFilterOp
       }
 
       Tensor* diff_bias_tensor = nullptr;
-      if (biasEnabled) {
+      if (bias_enabled) {
         TensorShape diff_bias_shape({depth});
         AllocateBiasGradTensor(context, diff_bias_shape, &diff_bias_tensor);
       }
 
       // check if src and diff_dst need reorder
-      T *src_data = nullptr;
+      T* src_data = nullptr;
       if (fwd_src_md.data.format != conv_bwd_filter->GetSrcMemoryFormat()) {
         src.SetUsrMem(fwd_src_md, &src_tensor);
         src.CheckReorderToOpMem(bwd_filter_pd->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
       } else {
-        src_data = static_cast<T*>(const_cast<T*>(
-            src_tensor.flat<T>().data()));
+        src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
       }
 
-      T *diff_dst_data = nullptr;
+      T* diff_dst_data = nullptr;
       if (diff_dst_md.data.format !=
           conv_bwd_filter->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_filter_pd->diff_dst_primitive_desc());
-        diff_dst_data = static_cast<T*>(
-            diff_dst.GetOpMem().get_data_handle());
+        diff_dst_data = static_cast<T*>(diff_dst.GetOpMem().get_data_handle());
       } else {
-        diff_dst_data = static_cast<T*>(const_cast<T*>(
-            diff_dst_tensor.flat<T>().data()));
+        diff_dst_data =
+            static_cast<T*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
       }
 
       // For backward filter, convert diff_filter back to Tensorflow layout
       // Here we prepare to reorder op memory back to user memory
       bool diff_filter_reorder_required = false;
-      T *diff_filter_data = nullptr;
+      T* diff_filter_data = nullptr;
       if (GetOutputFormat(tf_fmt) !=
           conv_bwd_filter->GetDiffFilterMemoryFormat()) {
         // Allocate diff filter tensor as Tensorflow layout
@@ -535,18 +566,18 @@ class MklConvCustomBackpropFilterOp
                               diff_filter_tensor);
         diff_filter_reorder_required = true;
         diff_filter.PrepareReorderToUserMemIfReq(
-                bwd_filter_pd->diff_weights_primitive_desc());
-        diff_filter_data = static_cast<T*>(
-                            diff_filter.GetOpMem().get_data_handle());
+            bwd_filter_pd->diff_weights_primitive_desc());
+        diff_filter_data =
+            static_cast<T*>(diff_filter.GetOpMem().get_data_handle());
       } else {
-        diff_filter_data = static_cast<T*>(const_cast<T*>(
-                            diff_filter_tensor->flat<T>().data()));
+        diff_filter_data = static_cast<T*>(
+            const_cast<T*>(diff_filter_tensor->flat<T>().data()));
       }
 
       // Execute convolution filter bwd
-      if (biasEnabled) {
-        T* diff_bias_data = static_cast<T*>(const_cast<T*>(
-                         diff_bias_tensor->flat<T>().data()));
+      if (bias_enabled) {
+        T* diff_bias_data =
+            static_cast<T*>(const_cast<T*>(diff_bias_tensor->flat<T>().data()));
         conv_bwd_filter->Execute(src_data, diff_filter_data, diff_bias_data,
                                  diff_dst_data);
       } else {
@@ -598,7 +629,9 @@ class MklConvCustomBackpropFilterOp
     TensorShape filter_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
     CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec<int32>(),
-             &filter_tf_shape).ok(), true);
+                                         &filter_tf_shape)
+                 .ok(),
+             true);
     return filter_tf_shape;
   }
 
@@ -619,10 +652,12 @@ class MklConvCustomBackpropFilterOp
   }
 
   // Output layout is Tensorflow's filter layout
-  //   Conv2D: HWIO;  Conv3D: DHWIO
+  //   Conv2D: HWIO;  Conv3D: DHWIO; Depthwise Conv: HWIGO
   memory::format GetOutputFormat(const memory::format data_format) {
-    return (this->strides_.size() == 4) ? memory::format::hwio
-                                        : memory::format::dhwio;
+    return is_depthwise
+               ? memory::format::hwigo
+               : ((this->strides_.size() == 4) ? memory::format::hwio
+                                               : memory::format::dhwio);
   }
 
   // Allocate output tensor.
@@ -659,32 +694,41 @@ class MklConvCustomBackpropFilterOp
 
     MklDnnShape bias_grad_mkl_shape;
     bias_grad_mkl_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 1, bias_grad_tensor,
-        bias_grad_shape, bias_grad_mkl_shape);
+    AllocateOutputSetMklShape(context, 1, bias_grad_tensor, bias_grad_shape,
+                              bias_grad_mkl_shape);
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")                     \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>); \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")             \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, true>);  \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")       \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklDummyOp<CPUDevice, T>);                           \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropFilterV2")                   \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, false>);        \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, true, false>);         \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklDepthwiseConv2dNativeBackpropFilter")                    \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, true>);         \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);                     \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv3DBackpropFilterV2")                                 \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, false>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 786a30bb10dcf464b5768160714238c0d5730e96..4e955df5fe9e551ec9aadc21b466dc3810784760 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -61,16 +61,18 @@ struct MklConvBwdInputParams {
   memory::dims padding_right;
   padding_kind padding;
 
-  MklConvBwdInputParams(memory::dims diff_src_dims,
-    memory::dims filter_dims, memory::dims diff_dst_dims,
-    memory::dims strides, memory::dims dilations,
-    memory::dims padding_left, memory::dims padding_right,
-    padding_kind padding) :
-      diff_src_dims(diff_src_dims), filter_dims(filter_dims),
-      diff_dst_dims(diff_dst_dims), strides(strides),
-      dilations(dilations), padding_left(padding_left),
-      padding_right(padding_right), padding(padding) {
-  }
+  MklConvBwdInputParams(memory::dims diff_src_dims, memory::dims filter_dims,
+                        memory::dims diff_dst_dims, memory::dims strides,
+                        memory::dims dilations, memory::dims padding_left,
+                        memory::dims padding_right, padding_kind padding)
+      : diff_src_dims(diff_src_dims),
+        filter_dims(filter_dims),
+        diff_dst_dims(diff_dst_dims),
+        strides(strides),
+        dilations(dilations),
+        padding_left(padding_left),
+        padding_right(padding_right),
+        padding(padding) {}
 };
 
 template <typename T>
@@ -93,8 +95,8 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
   //   filter_data:   input data buffer of filter (weights)
   //   diff_dst_data: input data buffer of dst
   // Bias does not matter here
-  void Execute(const T* diff_src_data,
-      const T* filter_data, const T* diff_dst_data) {
+  void Execute(const T* diff_src_data, const T* filter_data,
+               const T* diff_dst_data) {
     context_.diff_src_mem->set_data_handle(
         static_cast<T*>(const_cast<T*>(diff_src_data)));
     context_.filter_mem->set_data_handle(
@@ -111,9 +113,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
     return;
   }
 
-  memory::format GetFilterMemoryFormat() const {
-    return context_.filter_fmt;
-  }
+  memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; }
 
   memory::format GetDiffDstMemoryFormat() const {
     return context_.diff_dst_fmt;
@@ -155,27 +155,33 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
     std::shared_ptr<mkldnn::stream> bwd_input_stream;
     std::vector<mkldnn::primitive> bwd_input_primitives;
 
-    ConvBwdInputContext() :
-        filter_fmt(memory::format::any), diff_dst_fmt(memory::format::any),
-        diff_src_mem(nullptr), filter_mem(nullptr), diff_dst_mem(nullptr),
-        bwd_input_pd(nullptr), conv_bwd_input(nullptr),
-        bwd_input_desc(nullptr), fwd_desc(nullptr), fwd_pd(nullptr),
-        diff_src_md(nullptr), filter_md(nullptr), diff_dst_md(nullptr),
-        bwd_input_stream(nullptr) {
-    }
+    ConvBwdInputContext()
+        : filter_fmt(memory::format::any),
+          diff_dst_fmt(memory::format::any),
+          diff_src_mem(nullptr),
+          filter_mem(nullptr),
+          diff_dst_mem(nullptr),
+          bwd_input_pd(nullptr),
+          conv_bwd_input(nullptr),
+          bwd_input_desc(nullptr),
+          fwd_desc(nullptr),
+          fwd_pd(nullptr),
+          diff_src_md(nullptr),
+          filter_md(nullptr),
+          diff_dst_md(nullptr),
+          bwd_input_stream(nullptr) {}
   };
 
   void Setup(const MklConvBwdInputParams& convBwdInputDims) {
     // create memory descriptors for convolution data w/ no specified format
-    context_.diff_src_md.reset(new memory::desc(
-        {convBwdInputDims.diff_src_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.diff_src_md.reset(
+        new memory::desc({convBwdInputDims.diff_src_dims}, MklDnnType<T>(),
+                         memory::format::any));
     context_.filter_md.reset(new memory::desc(
-        {convBwdInputDims.filter_dims},
-        MklDnnType<T>(), memory::format::any));
-    context_.diff_dst_md.reset(new memory::desc(
-        {convBwdInputDims.diff_dst_dims},
-        MklDnnType<T>(), memory::format::any));
+        {convBwdInputDims.filter_dims}, MklDnnType<T>(), memory::format::any));
+    context_.diff_dst_md.reset(
+        new memory::desc({convBwdInputDims.diff_dst_dims}, MklDnnType<T>(),
+                         memory::format::any));
 
     // create convolution primitives
     context_.bwd_input_desc.reset(new convolution_backward_data::desc(
@@ -184,9 +190,9 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         convBwdInputDims.dilations, convBwdInputDims.padding_left,
         convBwdInputDims.padding_right, convBwdInputDims.padding));
 
-    context_.fwd_desc.reset(new convolution_forward::desc(prop_kind::forward,
-        convolution_direct, *context_.diff_src_md, *context_.filter_md,
-        *context_.diff_dst_md, convBwdInputDims.strides,
+    context_.fwd_desc.reset(new convolution_forward::desc(
+        prop_kind::forward, convolution_direct, *context_.diff_src_md,
+        *context_.filter_md, *context_.diff_dst_md, convBwdInputDims.strides,
         convBwdInputDims.dilations, convBwdInputDims.padding_left,
         convBwdInputDims.padding_right, convBwdInputDims.padding));
 
@@ -194,8 +200,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         *context_.fwd_desc, cpu_engine_));
 
     // create backward conv prim desc
-    context_.bwd_input_pd.reset(
-        new convolution_backward_data::primitive_desc(
+    context_.bwd_input_pd.reset(new convolution_backward_data::primitive_desc(
         *context_.bwd_input_desc, cpu_engine_, *context_.fwd_pd));
 
     // create memory primitive based on dummy data
@@ -207,15 +212,21 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         context_.bwd_input_pd.get()->diff_dst_primitive_desc(), DummyData));
 
     // store the expected memory format
-    context_.filter_fmt = static_cast<memory::format>(
-     context_.bwd_input_pd.get()->weights_primitive_desc().desc().data.format);
-    context_.diff_dst_fmt = static_cast<memory::format>(
-     context_.bwd_input_pd.get()->diff_dst_primitive_desc().desc().data.format);
+    context_.filter_fmt =
+        static_cast<memory::format>(context_.bwd_input_pd.get()
+                                        ->weights_primitive_desc()
+                                        .desc()
+                                        .data.format);
+    context_.diff_dst_fmt =
+        static_cast<memory::format>(context_.bwd_input_pd.get()
+                                        ->diff_dst_primitive_desc()
+                                        .desc()
+                                        .data.format);
 
     // create convolution primitive and add it to net
     context_.conv_bwd_input.reset(new convolution_backward_data(
-        *context_.bwd_input_pd, *context_.diff_dst_mem,
-        *context_.filter_mem, *context_.diff_src_mem));
+        *context_.bwd_input_pd, *context_.diff_dst_mem, *context_.filter_mem,
+        *context_.diff_src_mem));
 
     context_.bwd_input_primitives.push_back(*context_.conv_bwd_input);
   }
@@ -284,11 +295,12 @@ class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-template <typename Device, class T>
-class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
+template <typename Device, class T, bool is_depthwise>
+class MklConvCustomBackpropInputOp
+    : public MklConvBackpropCommonOp<Device, T, is_depthwise> {
  public:
   explicit MklConvCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConvBackpropCommonOp<Device, T>(context) {}
+      : MklConvBackpropCommonOp<Device, T, is_depthwise>(context) {}
 
   ~MklConvCustomBackpropInputOp() {}
 
@@ -298,7 +310,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       MklDnnData<T> diff_dst(&cpu_engine);
 
       // This flag indicate Conv2D or Conv3D
-      bool isConv2D = (this->strides_.size() == 4);
+      bool is_conv2d = (this->strides_.size() == 4);
 
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
@@ -311,8 +323,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       GetMklShape(context, kFilterIdx, &filter_mkl_shape);
       GetMklShape(context, kOutbpropIdx, &diff_dst_mkl_shape);
       // Allow operator-specific sanity checking of shapes.
-      ValidateMklShapes(src_mkl_shape, filter_mkl_shape,
-                        diff_dst_mkl_shape);
+      ValidateMklShapes(src_mkl_shape, filter_mkl_shape, diff_dst_mkl_shape);
 
       // Allow operator-specific generation of shapes.
       // E.g., ConvBackpropFilter gets filter as filter_sizes. It is a
@@ -330,11 +341,11 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
           diff_dst_tf_shape.num_elements() == 0) {
         MklDnnShape diff_src_mkl_shape;
         diff_src_mkl_shape.SetMklTensor(false);
-        TensorShape diff_src_tf_shape = GetOutputTfShape(
-            src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
+        TensorShape diff_src_tf_shape =
+            GetOutputTfShape(src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &diff_src_tensor,
-                       diff_src_tf_shape, diff_src_mkl_shape);
+                                  diff_src_tf_shape, diff_src_mkl_shape);
         CHECK_NOTNULL(diff_src_tensor);
 
         // if output tensor has more than 0 elements, we need to 0 them out.
@@ -353,40 +364,44 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, this->strides_, this->padding_,
-          this->data_format_, this->dilations_);
+                              this->data_format_, this->dilations_);
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &fwd_src_dims, &fwd_filter_dims,
           &strides, &dilations, &fwd_output_dims_tf_order, &fwd_output_dims,
-          &padding_left, &padding_right);
+          &padding_left, &padding_right, false, is_depthwise);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
       // API needs it. For that, we first need to create input, filter
       // and output memory descriptors.
-      auto tf_fmt = isConv2D
+      auto tf_fmt = is_conv2d
                         ? TFDataFormatToMklDnnDataFormat(this->data_format_)
                         : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
 
       // If filter is in MKL layout, then simply grab filter layout;
       // otherwise, construct filter in TF layout.
       // For TF layout, filter is in HWIO format.
-      auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
-                               ? filter_mkl_shape.GetMklLayout()
-                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                              isConv2D ? memory::format::hwio
-                                                       : memory::format::dhwio);
+      auto fwd_filter_md =
+          filter_mkl_shape.IsMklTensor()
+              ? filter_mkl_shape.GetMklLayout()
+              : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                             is_depthwise
+                                 ? memory::hwigo
+                                 : (is_conv2d ? memory::format::hwio
+                                              : memory::format::dhwio));
 
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
-      auto diff_dst_md = diff_dst_mkl_shape.IsMklTensor()
-                       ? diff_dst_mkl_shape.GetMklLayout()
-                       : memory::desc(diff_dst_dims,
-                           MklDnnType<T>(), tf_fmt);
+      auto diff_dst_md =
+          diff_dst_mkl_shape.IsMklTensor()
+              ? diff_dst_mkl_shape.GetMklLayout()
+              : memory::desc(diff_dst_dims, MklDnnType<T>(), tf_fmt);
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
       MklConvBwdInputPrimitive<T>* conv_bwd_input = nullptr;
-      MklConvBwdInputParams convBwdInputDims(fwd_src_dims, fwd_filter_dims,
-          diff_dst_dims, strides, dilations, padding_left, padding_right,
+      MklConvBwdInputParams convBwdInputDims(
+          fwd_src_dims, fwd_filter_dims, diff_dst_dims, strides, dilations,
+          padding_left, padding_right,
           TFPaddingToMklDnnPadding(this->padding_));
 
       // We don't cache those primitves if the env variable
@@ -396,8 +411,8 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       //   1. Legacy CPU without AVX512/AVX2, or
       //   2. 1x1 convolution with stride != 1
       bool do_not_cache = MklPrimitiveFactory<T>::IsPrimitiveMemOptEnabled() &&
-                   (MklPrimitiveFactory<T>::IsLegacyPlatform() ||
-                    IsConv1x1StrideNot1(fwd_filter_dims, strides));
+                          (MklPrimitiveFactory<T>::IsLegacyPlatform() ||
+                           IsConv1x1StrideNot1(fwd_filter_dims, strides));
       conv_bwd_input = MklConvBwdInputPrimitiveFactory<T>::Get(convBwdInputDims,
                                                                do_not_cache);
       auto bwd_input_pd = conv_bwd_input->GetPrimitiveDesc();
@@ -411,14 +426,14 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       diff_src_mkl_shape.SetMklLayout(&diff_src_pd);
       diff_src_mkl_shape.SetElemType(MklDnnType<T>());
       diff_src_mkl_shape.SetTfLayout(bwd_diff_src_dims.size(),
-          bwd_diff_src_dims, bwd_diff_src_format);
+                                     bwd_diff_src_dims, bwd_diff_src_format);
       TensorShape diff_src_tf_shape;
       diff_src_tf_shape.AddDim(diff_src_pd.get_size() / sizeof(T));
-      AllocateOutputSetMklShape(context, 0, &diff_src_tensor,
-          diff_src_tf_shape, diff_src_mkl_shape);
+      AllocateOutputSetMklShape(context, 0, &diff_src_tensor, diff_src_tf_shape,
+                                diff_src_mkl_shape);
 
-      T *diff_src_data = static_cast<T*>(const_cast<T*>(
-          diff_src_tensor->flat<T>().data()));
+      T* diff_src_data =
+          static_cast<T*>(const_cast<T*>(diff_src_tensor->flat<T>().data()));
 
       // check if filter and diff_dst need reorder
       T* filter_data = nullptr;
@@ -428,19 +443,18 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
         filter.CheckReorderToOpMem(bwd_input_pd->weights_primitive_desc());
         filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
       } else {
-        filter_data = static_cast<T*>(const_cast<T*>(
-                       filter_tensor.flat<T>().data()));
+        filter_data =
+            static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
       }
 
       T* diff_dst_data = nullptr;
       if (diff_dst_md.data.format != conv_bwd_input->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_input_pd->diff_dst_primitive_desc());
-        diff_dst_data = static_cast<T*>(
-                         diff_dst.GetOpMem().get_data_handle());
+        diff_dst_data = static_cast<T*>(diff_dst.GetOpMem().get_data_handle());
       } else {
-        diff_dst_data = static_cast<T*>(const_cast<T*>(
-                         diff_dst_tensor.flat<T>().data()));
+        diff_dst_data =
+            static_cast<T*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
       }
 
       // execute convolution input bwd
@@ -543,18 +557,22 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
   }
 };
 
-#define REGISTER_MKL_CPU_KERNELS(T)                                    \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")              \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
-                          MklConvCustomBackpropInputOp<CPUDevice, T>); \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")            \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
-                          MklConvCustomBackpropInputOp<CPUDevice, T>);
-
+#define REGISTER_MKL_CPU_KERNELS(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")                     \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")                   \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklDepthwiseConv2dNativeBackpropInput")      \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, true>);
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
 #undef REGISTER_MKL_CPU_KERNELS
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 029c539277f46704680eb10067ffbef85ddcbc9c..ffaa7d710f3f2f97c5a425bd79877d919a51552c 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -21,13 +21,14 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/mkl_conv_ops.h"
 #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h"
 #include "tensorflow/core/kernels/no_op.h"
@@ -411,7 +412,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 // For now, MKL-ML is default. So making MKL-DNN not a default choice.
 #ifdef INTEL_MKL_ML_ONLY
-template <typename Device, typename T, bool biasEnabled>
+template <typename Device, typename T, bool bias_enabled>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -447,7 +448,7 @@ class MklConvOp : public OpKernel {
     CHECK(!mkl_filter_shape.IsMklTensor())
         << "Conv filter should not be in MKL Layout";
 
-    if (biasEnabled) {
+    if (bias_enabled) {
       const Tensor& bias = MklGetInput(context, 2);
       OP_REQUIRES(context, bias.dims() == 1,
                   errors::InvalidArgument("bias must be 1-dimensional: ",
@@ -595,14 +596,14 @@ class MklConvOp : public OpKernel {
     mkl_context.filter_strides[2] = filter.dim_size(3);  // in_depth
     mkl_context.filter_strides[3] = 1;                   // out_depth
 
-    if (biasEnabled) {
+    if (bias_enabled) {
       const Tensor& bias = MklGetInput(context, 2);
       mkl_context.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
       mkl_context.bias_strides[0] = {1};
     }
 
     // Create Convolution Primitive
-    if (biasEnabled) {
+    if (bias_enabled) {
       CHECK_EQ(
           dnnConvolutionCreateForwardBias_F32(
               &mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect,
@@ -713,7 +714,7 @@ class MklConvOp : public OpKernel {
                                    filter_strides),
                E_SUCCESS);
 
-      if (biasEnabled) {
+      if (bias_enabled) {
         CHECK_EQ(dnnLayoutCreate_F32(&lt_bias, 1, bias_sizes, bias_strides),
                  E_SUCCESS);
       }
@@ -794,7 +795,7 @@ class MklConvOp : public OpKernel {
       conv_res[dnnResourceFilter] =
           (mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
 
-      if (biasEnabled) {
+      if (bias_enabled) {
         const Tensor& bias = MklGetInput(context, 2);
         void* mkl_buf_bias =
             const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
@@ -825,7 +826,7 @@ class MklConvOp : public OpKernel {
       dnnDelete_F32(prim_fwd);
       if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
       dnnLayoutDelete_F32(lt_filter);
-      if (biasEnabled) dnnLayoutDelete_F32(lt_bias);
+      if (bias_enabled) dnnLayoutDelete_F32(lt_bias);
     }
   } MklConv2DOpContext;
 
@@ -851,7 +852,7 @@ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")
 // Base class for convolution forward operations
 template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
           typename Toutput, typename Ttemp_output, typename Tpadding,
-          bool biasEnabled, bool padEnabled>
+          bool bias_enabled, bool pad_enabled, bool is_depthwise>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -930,7 +931,7 @@ class MklConvOp : public OpKernel {
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
       // If pad with conv2d fusion is enabled
-      if (padEnabled) {
+      if (fuse_pad_) {
         PadWithConvFusion(context, padding_left, padding_right);
       }
 
@@ -942,7 +943,7 @@ class MklConvOp : public OpKernel {
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
           &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
-          &padding_right, padEnabled);
+          &padding_right, fuse_pad_, is_depthwise);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -972,18 +973,26 @@ class MklConvOp : public OpKernel {
         return;
       }
 
-      bool isConv2D = (strides_.size() == 4);
-      // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D
-      if (!isConv2D) {
+      bool is_conv2d = (strides_.size() == 4);
+
+      // TODO 3-D support for Depthwise is not there
+      if (is_depthwise) {
+        OP_REQUIRES(context, is_conv2d,
+                    errors::InvalidArgument(
+                        "Only 2D convolution is supported for depthwise."));
+      }
+
+      // TODO(Intel-tf) Add check to make sure pad_enabled is true only for 2D
+      if (!is_conv2d) {
         OP_REQUIRES(
-            context, !padEnabled,
+            context, !fuse_pad_,
             errors::InvalidArgument("Pad+Conv fusion only works for 2D"));
       }
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      auto tf_fmt = isConv2D ? TFDataFormatToMklDnnDataFormat(data_format_)
-                             : TFDataFormatToMklDnn3DDataFormat(data_format_);
+      auto tf_fmt = is_conv2d ? TFDataFormatToMklDnnDataFormat(data_format_)
+                              : TFDataFormatToMklDnn3DDataFormat(data_format_);
 
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
@@ -997,12 +1006,18 @@ class MklConvOp : public OpKernel {
       src.SetUsrMem(src_md, &src_tensor);
 
       // Although filter shape (filter_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (HWIO).
-      auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
-                           ? filter_mkl_shape.GetMklLayout()
-                           : memory::desc(filter_dims, MklDnnType<Tfilter>(),
-                                          isConv2D ? memory::format::hwio
-                                                   : memory::format::dhwio);
+      // the layout is Tensorflow's layout (HWIO) and (HWIGO)for depthwise/group
+      // convolutions
+
+      auto filter_format = is_conv2d ? (is_depthwise ? memory::format::hwigo
+                                                     : memory::format::hwio)
+                                     : memory::format::dhwio;
+
+      DCHECK(!filter_mkl_shape.IsMklTensor());
+      auto filter_md =
+          filter_mkl_shape.IsMklTensor()
+              ? filter_mkl_shape.GetMklLayout()
+              : memory::desc(filter_dims, MklDnnType<Tfilter>(), filter_format);
       filter.SetUsrMem(filter_md, &filter_tensor);
       // MKLDNN dilation starts from 0.
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
@@ -1117,7 +1132,7 @@ class MklConvOp : public OpKernel {
 
   void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left,
                          memory::dims& padding_right) {
-    const Tensor& paddings_tf = MklGetInput(context, 2);
+    const Tensor& paddings_tf = MklGetInput(context, input_index_pad_);
     OP_REQUIRES(context, paddings_tf.dims() == 2,
                 errors::InvalidArgument("paddings must be 2-dimensional: ",
                                         paddings_tf.shape().DebugString()));
@@ -1156,6 +1171,11 @@ class MklConvOp : public OpKernel {
  protected:
   void set_fuse_biasadd(bool fuse_biasadd) { fuse_biasadd_ = fuse_biasadd; }
   void set_fuse_relu(bool fuse_relu) { fuse_relu_ = fuse_relu; }
+  void set_fuse_pad(bool fuse_pad) {
+    fuse_pad_ = fuse_pad;
+    // In PadwithFusedConv OP, pad is the fourth index.
+    input_index_pad_ = 3;
+  }
 
   // This method is for the base class MklConvOp, which handles the
   // floating point implementation of Conv. The quantized conv implementations
@@ -1227,11 +1247,13 @@ class MklConvOp : public OpKernel {
   TensorFormat data_format_;
 
   // Initialize to values the template is instantiated with
-  bool fuse_biasadd_ = biasEnabled;
+  bool fuse_biasadd_ = bias_enabled;
   bool fuse_relu_ = false;
+  bool fuse_pad_ = pad_enabled;
+
+  int input_index_pad_ = 2;
 
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
-  const int kInputIndex_Pad = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
 
@@ -1300,13 +1322,15 @@ class MklConvOp : public OpKernel {
 
 // Base class for fused convolution forward operations
 template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
-          typename Toutput, typename Ttemp_output>
-class MklFusedConvOp : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput,
-                                        Ttemp_output, int32, false, false> {
+          typename Toutput, typename Ttemp_output, typename Tpadding,
+          bool pad_enabled>
+class MklFusedConvOp
+    : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output,
+                       Tpadding, false, false, false> {
  public:
   explicit MklFusedConvOp(OpKernelConstruction* context)
-      : MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output, int32,
-                  false, false>(context) {
+      : MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output,
+                  Tpadding, false, false, false>(context) {
     // Since we came here through the registration of _MklFusedConv2D, get
     // all information from 'fused_ops' and 'num_args'
     std::vector<string> fused_ops;
@@ -1336,6 +1360,10 @@ class MklFusedConvOp : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput,
                   errors::Unimplemented("Fusion is not implemented: [",
                                         str_util::Join(fused_ops, ","), "]"));
     }
+
+    if (pad_enabled) {
+      this->set_fuse_pad(true);
+    }
   }
 
   virtual ~MklFusedConvOp() {}
@@ -1344,10 +1372,10 @@ class MklFusedConvOp : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput,
 // We create new class for each verison of Quantized Convolution and inherit
 // from the FP32 version of the base class
 template <typename Device, typename Tbias, typename Toutput,
-          typename Ttemp_output, bool biasEnabled>
+          typename Ttemp_output, bool bias_enabled>
 class MklQuantizedConv2DOp
     : public MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-                       int32, biasEnabled, false> {
+                       int32, bias_enabled, false, false> {
  public:
   virtual ~MklQuantizedConv2DOp() {
     if (this->input_bias_ != nullptr) {
@@ -1363,16 +1391,16 @@ class MklQuantizedConv2DOp
 
   explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
       : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-                  biasEnabled, false>(context) {}
+                  bias_enabled, false, false>(context) {}
 
   void Compute(OpKernelContext* context) override {
     // Compute int32 output tensor
     MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-              biasEnabled, false>::Compute(context);
+              bias_enabled, false, false>::Compute(context);
 
     // Compute additional outputs: min/max scalars.
     int bias_index_offset;
-    bias_index_offset = biasEnabled ? 1 : 0;
+    bias_index_offset = bias_enabled ? 1 : 0;
 
     const float min_input =
         context->input(2 + bias_index_offset).flat<float>()(0);
@@ -1415,14 +1443,14 @@ class MklQuantizedConv2DOp
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
     MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-              biasEnabled, false>::ExtendConvFwdParams(context, params);
+              bias_enabled, false, false>::ExtendConvFwdParams(context, params);
 
     // When the output type is quint8, the output data id requantized
     // into quint8. A post_op "output_scale" is added to do the conversion.
     if (std::is_same<Toutput, quint8>::value ||
         std::is_same<Toutput, qint8>::value) {
       int bias_index_offset;
-      bias_index_offset = biasEnabled ? 1 : 0;
+      bias_index_offset = bias_enabled ? 1 : 0;
 
       const float min_input =
           context->input(2 + bias_index_offset).flat<float>()(0);
@@ -1463,7 +1491,7 @@ class MklQuantizedConv2DOp
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc>& conv_fwd_pd,
       const Tensor& bias_tensor) override {
     int bias_index_offset;
-    bias_index_offset = biasEnabled ? 1 : 0;
+    bias_index_offset = bias_enabled ? 1 : 0;
 
     const float min_input =
         context->input(2 + bias_index_offset).flat<float>()(0);
@@ -1475,7 +1503,7 @@ class MklQuantizedConv2DOp
         context->input(5 + bias_index_offset).flat<float>()(0);
 
     std::vector<mkldnn::primitive> net;
-    if (biasEnabled) {
+    if (bias_enabled) {
       if (std::is_same<Tbias, qint32>::value) {
         return static_cast<Tbias*>(
             const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
@@ -1510,31 +1538,31 @@ class MklQuantizedConv2DOp
 };
 
 template <typename Device, typename Tbias, typename Toutput,
-          typename Ttemp_output, bool biasEnabled>
+          typename Ttemp_output, bool bias_enabled>
 class MklQuantizedConv2DReluOp
     : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                                  biasEnabled> {
+                                  bias_enabled> {
  public:
   virtual ~MklQuantizedConv2DReluOp() {}
 
   explicit MklQuantizedConv2DReluOp(OpKernelConstruction* context)
-      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
-            context) {}
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                             bias_enabled>(context) {}
 
  protected:
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
     MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                         biasEnabled>::ExtendConvFwdParams(context, params);
+                         bias_enabled>::ExtendConvFwdParams(context, params);
     params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 };
 
 template <typename Device, typename Tbias, typename Toutput,
-          typename Ttemp_output, bool biasEnabled>
+          typename Ttemp_output, bool bias_enabled>
 class MklQuantizedConv2DSumReluOp
     : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                                  biasEnabled> {
+                                  bias_enabled> {
  public:
   virtual ~MklQuantizedConv2DSumReluOp() {
     if (this->summand_ != nullptr) {
@@ -1549,14 +1577,14 @@ class MklQuantizedConv2DSumReluOp
   }
 
   explicit MklQuantizedConv2DSumReluOp(OpKernelConstruction* context)
-      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
-            context) {}
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                             bias_enabled>(context) {}
 
  protected:
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
     MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                         biasEnabled>::ExtendConvFwdParams(context, params);
+                         bias_enabled>::ExtendConvFwdParams(context, params);
     // Calculate the scale (beta in mkldnn api term) for sum
     if (std::is_same<Toutput, quint8>::value) {
       int summand_idx = context->num_inputs() / 2 - 1 - 2;
@@ -1564,7 +1592,7 @@ class MklQuantizedConv2DSumReluOp
       bool summand_condition =
           (summand_type == DT_QINT8) || (summand_type == DT_QUINT8);
       CHECK((summand_condition));
-      int bias_index_offset = biasEnabled ? 1 : 0;
+      int bias_index_offset = bias_enabled ? 1 : 0;
       const float min_freezed_output =
           context->input(6 + bias_index_offset).flat<float>()(0);
       const float max_freezed_output =
@@ -1610,7 +1638,8 @@ class MklQuantizedConv2DSumReluOp
       auto dst_md = summand_mkl_shape.GetMklLayout();
       if (summand_mkl_shape.IsMklTensor()) {
         if (summand_type == DT_QINT8) {
-          summand.UnsafeCopyFromInternal(summand, DT_QUINT8, summand.shape());
+          OP_REQUIRES_OK(context, summand.BitcastFrom(summand, DT_QUINT8,
+                                                      summand.shape()));
           dst_md.data.data_type =
               static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
           summand_mkl_shape.SetMklLayout(&dst_md);
@@ -1627,10 +1656,10 @@ class MklQuantizedConv2DSumReluOp
     }
     // TODO(mdfaijul): Add cleaner code for non-mkl tensor
     MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-              biasEnabled, false>::AllocateOutputTensor(context, conv_prim_desc,
-                                                        output_dims_mkl_order,
-                                                        output_tf_format,
-                                                        output_tensor);
+              bias_enabled, false,
+              false>::AllocateOutputTensor(context, conv_prim_desc,
+                                           output_dims_mkl_order,
+                                           output_tf_format, output_tensor);
     const Tensor& summand = MklGetInput(context, summand_idx);
     if (summand.dtype() != DT_FLOAT)
       TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
@@ -1638,7 +1667,7 @@ class MklQuantizedConv2DSumReluOp
     MklDnnShape summand_mkl_shape;
     GetMklShape(context, summand_idx, &summand_mkl_shape);
     // We need to compute scale for the summand
-    int bias_index_offset = biasEnabled ? 1 : 0;
+    int bias_index_offset = bias_enabled ? 1 : 0;
     const float min_input =
         context->input(2 + bias_index_offset).flat<float>()(0);
     const float max_input =
@@ -1905,13 +1934,13 @@ REGISTER_KERNEL_BUILDER(
                               .TypeConstraint<T>("T")                      \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int32, false, false>);          \
+                                    float, int32, false, false, false>);   \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                       \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int32, true, false>);           \
+                                    float, int32, true, false, false>);    \
   REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")                 \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
@@ -1923,14 +1952,14 @@ REGISTER_KERNEL_BUILDER(
                               .TypeConstraint<int32>("Tpaddings")          \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int32, false, true>);           \
+                                    float, int32, false, true, false>);    \
   REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
                               .TypeConstraint<int64>("Tpaddings")          \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int64, false, true>);           \
+                                    float, int64, false, true, false>);    \
   REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D")                  \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
@@ -1940,13 +1969,45 @@ REGISTER_KERNEL_BUILDER(
 
 TF_CALL_float(REGISTER_MKL_CPU_2D);
 
+#define REGISTER_MKL_CPU_2D_DEPTHWISE(T)                                   \
+  REGISTER_KERNEL_BUILDER(Name("_MklDepthwiseConv2dNative")                \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<float>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, false, true>);
+
+TF_CALL_float(REGISTER_MKL_CPU_2D_DEPTHWISE);
+
+// Note we are registering _MklFusedConv2D.
+// We check the fused_ops attributes to decide if bias is enabled or not.
 #define REGISTER_MKL_CPU_2D_FUSED(T)                                \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedConv2D")                   \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklFusedConv2D")                                       \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<T>("T")                                   \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int32, false>);      \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklPadWithFusedConv2D")                                \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<int32>("Tpaddings")                       \
+          .TypeConstraint<T>("T")                                   \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int32, true>);       \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklPadWithFusedConv2D")                                \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<T>("T")                                   \
+          .TypeConstraint<int64>("Tpaddings")                       \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int64, true>);       \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithFusedConv2D")      \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
+                              .TypeConstraint<int32>("Tpaddings")   \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklFusedConvOp<CPUDevice, T, T, T, T, T>);
-// We check the fused_ops attributes to decide if bias is enabled or not.
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D_FUSED);
 
@@ -1957,7 +2018,7 @@ TF_CALL_float(REGISTER_MKL_CPU_2D_FUSED);
           .Device(DEVICE_CPU)                   \
           .TypeConstraint<T>("T")               \
           .Label(mkl_op_registry::kMklOpLabel), \
-      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false>);
+      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false, false>);
 TF_CALL_float(REGISTER_MKL_CPU_3D);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index e61c20dea9f8c3f8749c302f88a46233dab270b7..b96cc23186558277ac0ff5f2f134e7095fdb8c7a 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -21,13 +21,13 @@ limitations under the License.
 #include <vector>
 
 #include "mkldnn.hpp"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -58,7 +58,7 @@ class MklDnnConvUtil {
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
                  Padding pad, TensorFormat fm,
-                 const std::vector<int32>& dilations)
+                 const std::vector<int32>& dilations, bool is_depthwise = false)
       : context_(context),
         strides_(strides),
         dilations_(dilations),
@@ -185,7 +185,8 @@ class MklDnnConvUtil {
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
   virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape,
                                               const TensorShape& filter_shape,
-                                              memory::dims* filter_dims) {
+                                              memory::dims* filter_dims,
+                                              bool is_depthwise) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == strides_.size(),
@@ -210,20 +211,37 @@ class MklDnnConvUtil {
                       input_depth, " vs ", filter_shape.dim_size(2)));
 
       // TF filter is always in (rows, cols, in_depth, out_depth) order.
-      int filter_rows = static_cast<int>(filter_shape.dim_size(0));
-      int filter_cols = static_cast<int>(filter_shape.dim_size(1));
-      int in_depth = static_cast<int>(filter_shape.dim_size(2));
-      int out_depth = static_cast<int>(filter_shape.dim_size(3));
-
-      // MKL-DNN always needs filter in OIHW format.
+      int filter_rows =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_H));
+      int filter_cols =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_W));
+      int filter_in_depth =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_I));
+      int filter_out_depth =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_O));
+      // MKL-DNN always needs filter in OIHW format for regular convolutions
+      // and GOIHW for grouped/depthwise convolutions,
       // OIHW = (out_depth, in_depth, rows, cols)
-      std::vector<int> mkldnn_sizes(4, -1);
-      mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
-      mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
-      mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
-      mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
-
-      *filter_dims = mkldnn_sizes;
+      // GOIHW = (group, out_depth, in_depth, rows, cols)
+      // Specifically for depthwise G=filter_indepth, O=filter_outdepth, I=1
+      if (is_depthwise) {
+        std::vector<int> mkldnn_sizes(5, -1);
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_G] = filter_in_depth;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_O] = filter_out_depth;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_I] = 1;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_H] = filter_rows;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_W] = filter_cols;
+
+        *filter_dims = mkldnn_sizes;
+      } else {
+        std::vector<int> mkldnn_sizes(4, -1);
+        mkldnn_sizes[MklDnnDims::Dim_O] = filter_out_depth;
+        mkldnn_sizes[MklDnnDims::Dim_I] = filter_in_depth;
+        mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+        mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+        *filter_dims = mkldnn_sizes;
+      }
     } else {  // Conv3D
       OP_REQUIRES(context_, input_depth == filter_shape.dim_size(3),
                   errors::InvalidArgument(
@@ -231,17 +249,22 @@ class MklDnnConvUtil {
                       input_depth, " vs ", filter_shape.dim_size(3)));
 
       // TF filter is always in (planes, rows, cols, in_depth, out_depth) order.
-      int filter_planes = static_cast<int>(filter_shape.dim_size(0));
-      int filter_rows = static_cast<int>(filter_shape.dim_size(1));
-      int filter_cols = static_cast<int>(filter_shape.dim_size(2));
-      int in_depth = static_cast<int>(filter_shape.dim_size(3));
-      int out_depth = static_cast<int>(filter_shape.dim_size(4));
+      int filter_planes =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_P));
+      int filter_rows =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_H));
+      int filter_cols =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_W));
+      int filter_in_depth =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_I));
+      int filter_out_depth =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_O));
 
       // MKL-DNN always needs filter in OIDHW format.
       // OIDHW = (out_depth, in_depth, planes, rows, cols)
       std::vector<int> mkldnn_sizes(5, -1);
-      mkldnn_sizes[MklDnnDims3D::Dim3d_O] = out_depth;
-      mkldnn_sizes[MklDnnDims3D::Dim3d_I] = in_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_O] = filter_out_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_I] = filter_in_depth;
       mkldnn_sizes[MklDnnDims3D::Dim3d_D] = filter_planes;
       mkldnn_sizes[MklDnnDims3D::Dim3d_H] = filter_rows;
       mkldnn_sizes[MklDnnDims3D::Dim3d_W] = filter_cols;
@@ -256,10 +279,12 @@ class MklDnnConvUtil {
   // checks are returned in context's status.
   virtual inline void GetFilterSizeInMklOrder(size_t src_index,
                                               size_t filter_index,
-                                              memory::dims* filter_dims) {
+                                              memory::dims* filter_dims,
+                                              bool is_depthwise) {
     CHECK_NOTNULL(filter_dims);
     GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
-                            GetTfShape(context_, filter_index), filter_dims);
+                            GetTfShape(context_, filter_index), filter_dims,
+                            is_depthwise);
   }
 
   // Calculate Bias size for 2D or 3D Convolution. Function does not
@@ -288,15 +313,16 @@ class MklDnnConvUtil {
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
+      memory::dims* pad_l, memory::dims* pad_r, bool pad_enabled = false,
+      bool is_depthwise = false) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
-    bool isConv2D = (strides_.size() == 4);
+    bool is_conv2d = (strides_.size() == 4);
     int input_planes, input_rows, input_cols;
-    if (isConv2D) {
+    if (is_conv2d) {
       input_rows = GetTensorDim(input_shape, data_format_, 'H');
       input_cols = GetTensorDim(input_shape, data_format_, 'W');
     } else {
@@ -315,18 +341,18 @@ class MklDnnConvUtil {
     //    Third dimension: cols/width.
 
     int filter_planes, filter_rows, filter_cols;
-    if (isConv2D) {
-      filter_rows = filter_shape.dim_size(0);
-      filter_cols = filter_shape.dim_size(1);
+    if (is_conv2d) {
+      filter_rows = filter_shape.dim_size(TF_2DFILTER_DIM_H);
+      filter_cols = filter_shape.dim_size(TF_2DFILTER_DIM_W);
     } else {
-      filter_planes = filter_shape.dim_size(0);
-      filter_rows = filter_shape.dim_size(1);
-      filter_cols = filter_shape.dim_size(2);
+      filter_planes = filter_shape.dim_size(TF_3DFILTER_DIM_P);
+      filter_rows = filter_shape.dim_size(TF_3DFILTER_DIM_H);
+      filter_cols = filter_shape.dim_size(TF_3DFILTER_DIM_W);
     }
 
     int stride_planes, stride_rows, stride_cols;
     int dilation_planes, dilation_rows, dilation_cols;
-    if (isConv2D) {
+    if (is_conv2d) {
       // Conv2D stride is a vector of 2 elements: {s_r, s_c}
       stride_rows = strides[0];
       stride_cols = strides[1];
@@ -344,15 +370,28 @@ class MklDnnConvUtil {
 
     // Output batch is same as input batch.
     int out_batch = GetTensorDim(input_shape, data_format_, 'N');
+    int out_depth;
+
+    // TODO add support for 3-D Depthwise
 
-    // Output depth is same as last dimension for filter.
-    int out_depth = filter_shape.dim_size(isConv2D ? 3 : 4);
+    // Output depth is same as last dimension for filters for regular
+    // convolutions. For depthwise it is in_depth * channel_multiplier.
+    // The channel_multiplier is the last dimension of TF filter for
+    // depthwise convolutions.
+    if (is_depthwise) {
+      out_depth = (filter_shape.dim_size(TF_2DFILTER_DIM_I) *
+                   filter_shape.dim_size(TF_2DFILTER_DIM_O));
+    } else {
+      out_depth = filter_shape.dim_size(
+          is_conv2d ? static_cast<int>(TF_2DFILTER_DIM_O)
+                    : static_cast<int>(TF_3DFILTER_DIM_O));
+    }
 
     int64 out_rows = 0, out_cols = 0, out_planes = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
     int64 pad_D1, pad_D2;
 
-    if (isConv2D) {
+    if (is_conv2d) {
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_rows, filter_rows, dilation_rows, stride_rows,
@@ -373,12 +412,12 @@ class MklDnnConvUtil {
                                    padding_, &out_cols, &pad_left, &pad_right));
     }
 
-    if (isConv2D) {
+    if (is_conv2d) {
       // Conv + pad fusion is enabled only for 2D
-      // If padEnabled, i.e., pad and conv op are fused, then
+      // If pad_enabled, i.e., pad and conv op are fused, then
       // all pads are already passed from pad op through
       // *pad_l and *pad_r
-      if (padEnabled) {
+      if (pad_enabled) {
         pad_top = static_cast<int64>((*pad_l)[0]);
         pad_left = static_cast<int64>((*pad_l)[1]);
         pad_bottom = static_cast<int64>((*pad_r)[0]);
@@ -389,7 +428,7 @@ class MklDnnConvUtil {
         out_cols = out_cols + (pad_left + pad_right) / stride_cols;
       }
       // Handle padding. MKL-DNN uses asymetric padding.
-      // But, if padEnabled, i.e., pad and conv op are fused,
+      // But, if pad_enabled, i.e., pad and conv op are fused,
       // then, *pad_l and *pad_r are already set from pad op.
       // In that case they need not set here.
       else {
@@ -408,14 +447,14 @@ class MklDnnConvUtil {
     //     Conv3D: NDHWC or NCDHW
     // MKL-DNN uses asymetric padding.
     TensorShape out_shape =
-        isConv2D
+        is_conv2d
             ? ShapeFromFormat(data_format_, out_batch, out_rows, out_cols,
                               out_depth)
             : ShapeFromFormat(data_format_, out_batch,
                               {{out_planes, out_rows, out_cols}}, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
-    if (isConv2D) {
+    if (is_conv2d) {
       // For Conv2D, MKL-DNN always needs output in NCHW format.
       std::vector<int> mkldnn_sizes(4, -1);
       mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
@@ -442,7 +481,7 @@ class MklDnnConvUtil {
       size_t src_index, size_t filter_index, const memory::dims& strides,
       const memory::dims& dilations, memory::dims* output_dims_tf_order,
       memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
+      memory::dims* pad_r, bool is_depthwise) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -465,7 +504,8 @@ class MklDnnConvUtil {
 
     GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
                                   dilations, output_dims_tf_order,
-                                  output_dims_mkl_order, pad_l, pad_r);
+                                  output_dims_mkl_order, pad_l, pad_r,
+                                  is_depthwise);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -481,7 +521,8 @@ class MklDnnConvUtil {
       memory::dims* input_dims, memory::dims* filter_dims,
       memory::dims* strides, memory::dims* dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
+      memory::dims* pad_l, memory::dims* pad_r, bool pad_enabled = false,
+      bool is_depthwise = false) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -493,13 +534,14 @@ class MklDnnConvUtil {
 
     GetInputSizeInMklOrder(input_shape, input_dims);
     if (!context_->status().ok()) return;
-    GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims);
+    GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims,
+                            is_depthwise);
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetDilationsInMklOrder(dilations);
     GetOutputAndPadSizeInMklOrder(
         input_shape, filter_shape, *strides, *dilations, output_dims_tf_order,
-        output_dims_mkl_order, pad_l, pad_r, padEnabled);
+        output_dims_mkl_order, pad_l, pad_r, pad_enabled, is_depthwise);
     if (!context_->status().ok()) return;
   }
 };
@@ -508,7 +550,7 @@ class MklDnnConvUtil {
 ///  Common class that implements ConvBackpropFilter and Input
 /////////////////////////////////////////////////////////////////////
 
-template <typename Device, class T>
+template <typename Device, class T, bool is_depthwise>
 class MklConvBackpropCommonOp : public OpKernel {
  public:
   ~MklConvBackpropCommonOp() {}
@@ -521,28 +563,38 @@ class MklConvBackpropCommonOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    const int64 stride_h = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
 
-    if (strides_.size() == 4) {
-      // Check Conv2D dilations
-      OP_REQUIRES(context, dilations_.size() == 4,
-                  errors::InvalidArgument("Sliding window dilations field must "
-                                          "specify 4 dimensions"));
-      int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-      int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-      int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-      int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-      OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
-                  errors::InvalidArgument(
-                      "Current implementation does not yet support "
-                      "dilations in the batch and depth dimensions."));
-      OP_REQUIRES(
-          context, dilation_h > 0 && dilation_w > 0,
-          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    // Depthwise Convolution doesn't have dilation parameter
+    if (!is_depthwise) {
+      OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+      if (strides_.size() == 4) {
+        // Check Conv2D dilations
+        OP_REQUIRES(
+            context, dilations_.size() == 4,
+            errors::InvalidArgument("Sliding window dilations field must "
+                                    "specify 4 dimensions"));
+        int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+        int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+        int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+        int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+        OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                    errors::InvalidArgument(
+                        "Current implementation does not yet support "
+                        "dilations in the batch and depth dimensions."));
+        OP_REQUIRES(
+            context, dilation_h > 0 && dilation_w > 0,
+            errors::InvalidArgument("Dilated rates should be larger than 0."));
+      }
+    } else {
+      // Set dilations as 1 for depthwise conv
+      // for future support to align with Tensorflow
+      dilations_ = {1, 1, 1, 1};
     }
 
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 2ec6c8fa897464be4dba35a5446b8452d12a40d8..1ae42a0d0d74ef7e2e12fe7427cadfc043774c70 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -13,678 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifdef INTEL_MKL
-
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
 using mkldnn::batch_normalization_backward;
 using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
 using mkldnn::use_global_stats;
 using mkldnn::use_scale_shift;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
-// TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklFusedBatchNormOp : public OpKernel {
- public:
-  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    float epsilon;
-    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
-    epsilon_ = T(epsilon);
-    string tensor_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
-    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklFusedBatchNormOpContext mkl_context;
-    const Tensor& input = MklGetInput(context, 0);
-    const Tensor& scale = MklGetInput(context, 1);
-    const Tensor& shift = MklGetInput(context, 2);
-    const Tensor& est_mean = MklGetInput(context, 3);
-    const Tensor& est_variance = MklGetInput(context, 4);
-
-    GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
-    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
-
-    if (!input_in_mkl_format) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
-    }
-    OP_REQUIRES(context, scale.dims() == 1,
-                errors::InvalidArgument("scale must be 1-dimensional",
-                                        scale.shape().DebugString()));
-    OP_REQUIRES(context, shift.dims() == 1,
-                errors::InvalidArgument("offset must be 1-dimensional",
-                                        shift.shape().DebugString()));
-    OP_REQUIRES(context, est_mean.dims() == 1,
-                errors::InvalidArgument("estimated_mean must be 1-dimensional",
-                                        est_mean.shape().DebugString()));
-
-    OP_REQUIRES(
-        context, est_variance.dims() == 1,
-        errors::InvalidArgument("estimated_variance must be 1-dimensional",
-                                est_variance.shape().DebugString()));
-
-    if (is_training_) {
-      OP_REQUIRES(context, est_mean.dim_size(0) == 0,
-                  errors::InvalidArgument("estimated_mean empty for training",
-                                          est_mean.shape().DebugString()));
-      OP_REQUIRES(context, est_variance.dim_size(0) == 0,
-                  errors::InvalidArgument(
-                      "estimated_variance must be empty for training",
-                      est_variance.shape().DebugString()));
-    }
-
-    unsigned int flag_batch_norm =
-        is_training_ ? dnnUseScaleShift
-                     : (dnnUseInputMeanVariance | dnnUseScaleShift);
-
-    mkl_context.MklExtractParams(context, tensor_format_);
-
-    // Create layout only for input data as it is used in Op primitive.
-    mkl_context.MklCreateInputLayout(context);
-
-    // Create Op primitive.
-    CHECK_EQ(dnnBatchNormalizationCreateForward_v2_F32(
-                 &(mkl_context.mkl_prim_batchnorm), nullptr,
-                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
-                 flag_batch_norm),
-             E_SUCCESS);
-
-    // Temporary tensors with buffers for the context inputs, if
-    // conversion to MKL-Op specific layouts are required. It is assumed here
-    // that TF's 1D tensors (scale, shift, est_mean, and est_variance) won't
-    // require any conversion.
-    // Since scale-shift is combined in MKL, a buffer is required.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_scale_shift_buf_tensor;
-    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
-                                        &mkl_tmp_scale_shift_buf_tensor);
-
-    // Output data in MKL layout
-    Tensor* output = nullptr;
-    TensorShape tf_shape_output;
-    MklShape mkl_shape_output;
-    mkl_shape_output.SetMklTensor(true);
-    mkl_shape_output.SetMklLayout(mkl_context.mkl_prim_batchnorm,
-                                  dnnResourceDst);
-    mkl_shape_output.SetTfLayout(mkl_context.mkl_params.in_dim,
-                                 mkl_context.mkl_params.in_sizes,
-                                 mkl_context.mkl_params.in_strides);
-    mkl_shape_output.SetTfDimOrder(mkl_context.mkl_params.in_dim,
-                                   tensor_format_);
-    tf_shape_output.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                               mkl_shape_output.GetMklLayout())) /
-                           sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &output, tf_shape_output,
-                              mkl_shape_output);
-    mkl_context.mkl_res_batchnorm[dnnResourceDst] =
-        static_cast<void*>(output->flat<T>().data());
-
-    // Batch mean in TF layout
-    Tensor* batch_mean = nullptr;
-    MklShape mkl_shape_batch_mean;
-    mkl_shape_batch_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 1, &batch_mean, scale.shape(),
-                              mkl_shape_batch_mean);
-    // Batch variance in TF layout
-    Tensor* batch_variance = nullptr;
-    MklShape mkl_shape_batch_variance;
-    mkl_shape_batch_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 2, &batch_variance, scale.shape(),
-                              mkl_shape_batch_variance);
-    // If training mode, set dnnResourceMean and dnnResourceVariance to
-    // output tensors for batch mean and variance.
-    // Otherwise, set dnnResourceMean and dnnResourceVariance to
-    // estimated mean and variance.
-    if (is_training_)
-      mkl_context.MklSetMeanVariance(*batch_mean, *batch_variance);
-    else
-      mkl_context.MklSetMeanVariance(est_mean, est_variance);
-
-    // Now that all resources are set, it is ready for dnnExecute
-    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm,
-                            mkl_context.mkl_res_batchnorm),
-             E_SUCCESS);
-
-    // Mean and variance (without Bessel's correction) saved for backward
-    // computation to serve as pre-computed mean and variance.
-    Tensor* saved_mean = nullptr;
-    MklShape mkl_shape_saved_mean;
-    mkl_shape_saved_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 3, &saved_mean, scale.shape(),
-                              mkl_shape_saved_mean);
-    std::memcpy(
-        reinterpret_cast<char*>(saved_mean->flat<float>().data()),
-        reinterpret_cast<char*>(mkl_context.mkl_res_batchnorm[dnnResourceMean]),
-        scale.NumElements() * sizeof(float));
-    Tensor* saved_variance = nullptr;
-    MklShape mkl_shape_saved_variance;
-    mkl_shape_saved_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 4, &saved_variance, scale.shape(),
-                              mkl_shape_saved_variance);
-    std::memcpy(reinterpret_cast<char*>(saved_variance->flat<float>().data()),
-                reinterpret_cast<char*>(
-                    mkl_context.mkl_res_batchnorm[dnnResourceVariance]),
-                scale.NumElements() * sizeof(float));
-
-    // Bessel's correction on variance, if training mode is on
-    if (is_training_) {
-      float* p_var = static_cast<float*>(batch_variance->flat<T>().data());
-      auto depth = mkl_context.mkl_params.depth;
-      size_t orig_size = mkl_context.mkl_params.in_sizes[0] *
-                         mkl_context.mkl_params.in_sizes[1] *
-                         mkl_context.mkl_params.in_sizes[3];
-      size_t adjust_size = orig_size - 1;
-      float adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
-      for (int i = 0; i < depth; i++) p_var[i] = adjust_factor * p_var[i];
-    }
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  T epsilon_;
-  TensorFormat tensor_format_;
-  bool is_training_;
-
-  // Structure containing all info for MklOp
-  typedef struct {
-    // Parameters used for input and output layouts
-    struct MklBatchNormParams {
-      // BatchNormOp src and
-      size_t in_dim;
-      size_t in_sizes[4];
-      size_t in_strides[4];
-      size_t depth;  // Batch normalization is done for per channel.
-    } mkl_params;
-
-    MklShape mkl_shape_input_shape;
-
-    // MKL primitive and resources for BatchNormOp
-    dnnPrimitive_t mkl_prim_batchnorm = nullptr;
-    void* mkl_res_batchnorm[dnnResourceNumber];
-
-    // MKL layouts for inputs in the context
-    dnnLayout_t mkl_lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
-      if (mkl_prim_batchnorm != nullptr) dnnDelete_F32(mkl_prim_batchnorm);
-    }
-
-    void MklExtractParams(OpKernelContext* context,
-                          const TensorFormat& tensor_format) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      mkl_params.in_dim = input_in_mkl_format
-                              ? mkl_shape_input_shape.GetDimension()
-                              : input.dims();
-      mkl_params.in_sizes[0] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
-                              : GetTensorDim(input, tensor_format, 'W'));
-      mkl_params.in_sizes[1] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
-                              : GetTensorDim(input, tensor_format, 'H'));
-      mkl_params.in_sizes[2] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
-                              : GetTensorDim(input, tensor_format, 'C'));
-      mkl_params.in_sizes[3] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
-                              : GetTensorDim(input, tensor_format, 'N'));
-      mkl_params.depth = mkl_params.in_sizes[2];
-      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
-                          mkl_params.in_sizes);
-    }
-
-    void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        mkl_lt_input =
-            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dim,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-    }
-    void MklPrepareContextInputs(OpKernelContext* context,
-                                 Tensor* mkl_tmp_input_buf_tensor,
-                                 Tensor* mkl_tmp_scale_shift_buf_tensor) {
-      bool mkl_convert_input;
-      dnnPrimitive_t mkl_prim_convert_input = nullptr;
-      dnnLayout_t mkl_lt_internal_input = nullptr;
-      void* mkl_buf_converted_input = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_input, mkl_prim_batchnorm, dnnResourceSrc),
-               E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_converted_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_converted_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      mkl_res_batchnorm[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
-
-      // scale-shift layout is created from primitive. So no conversion
-      // is needed, however, a buffer has to be allocated.
-      dnnLayout_t mkl_lt_scale_shift = nullptr;
-      void* mkl_buf_scale_shift = nullptr;
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(
-              &mkl_lt_scale_shift, mkl_prim_batchnorm, dnnResourceScaleShift),
-          E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_scale_shift_buf_tensor,
-                     mkl_lt_scale_shift, &mkl_buf_scale_shift);
-      // Fill the scale-shift buffer with data, presumably buffer is 2D array
-      const Tensor& scale = MklGetInput(context, 1);
-      const Tensor& shift = MklGetInput(context, 2);
-      float* buf_scale_shift = static_cast<float*>(mkl_buf_scale_shift);
-      float* buf_scale = const_cast<float*>(
-          static_cast<const float*>(scale.flat<float>().data()));
-      float* buf_shift = const_cast<float*>(
-          static_cast<const float*>(shift.flat<float>().data()));
-      auto depth = mkl_params.depth;
-      for (int i = 0; i < depth; i++) {
-        buf_scale_shift[i] = buf_scale[i];
-        buf_scale_shift[i + depth] = buf_shift[i];
-      }
-      mkl_res_batchnorm[dnnResourceScaleShift] = mkl_buf_scale_shift;
-    }
-
-    inline void MklSetMeanVariance(const Tensor& mean, const Tensor& variance) {
-      mkl_res_batchnorm[dnnResourceMean] = const_cast<void*>(
-          static_cast<const void*>(mean.flat<float>().data()));
-      mkl_res_batchnorm[dnnResourceVariance] = const_cast<void*>(
-          static_cast<const void*>(variance.flat<float>().data()));
-    }
-  } MklFusedBatchNormOpContext;
-};
-
-template <typename Device, typename T>
-class MklFusedBatchNormGradOp : public OpKernel {
- public:
-  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    float epsilon;
-    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
-    epsilon_ = T(epsilon);
-    string tensor_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
-    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
-                errors::InvalidArgument("Invalid data format"));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklFusedBatchNormGradOpContext mkl_context;
-
-    const Tensor& out_backprop = MklGetInput(context, 0);
-    const Tensor& input = MklGetInput(context, 1);
-    const Tensor& scale = MklGetInput(context, 2);
-    const Tensor& saved_mean = MklGetInput(context, 3);
-    const Tensor& saved_var = MklGetInput(context, 4);
-
-    // Here scale, mean, and variance are 1D and considered
-    // those having same layout in MKL and TF
-    GetMklShape(context, 0, &(mkl_context.mkl_shape_out_backprop));
-    GetMklShape(context, 1, &(mkl_context.mkl_shape_input_shape));
-
-    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
-    bool out_backprop_in_mkl_format =
-        mkl_context.mkl_shape_out_backprop.IsMklTensor();
-    if (!out_backprop_in_mkl_format) {
-      OP_REQUIRES(context, out_backprop.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          out_backprop.shape().DebugString()));
-    }
-    if (!input_in_mkl_format) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
-    }
-    OP_REQUIRES(context, scale.dims() == 1,
-                errors::InvalidArgument("scale must be 1-dimensional",
-                                        scale.shape().DebugString()));
-    OP_REQUIRES(context, saved_mean.dims() == 1,
-                errors::InvalidArgument("saved mean must be 1-dimensional",
-                                        saved_mean.shape().DebugString()));
-    OP_REQUIRES(context, saved_var.dims() == 1,
-                errors::InvalidArgument("saved variance must be 1-dimensional",
-                                        saved_var.shape().DebugString()));
-
-    mkl_context.MklExtractParams(context, tensor_format_);
-
-    mkl_context.MklCreateInputLayout(context);
-
-    unsigned int flag_batch_norm_grad = dnnUseScaleShift;
-
-    // Create Backward Op primitive.
-    CHECK_EQ(dnnBatchNormalizationCreateBackward_v2_F32(
-                 &(mkl_context.mkl_prim_batchnorm_bwd), nullptr,
-                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
-                 flag_batch_norm_grad),
-             E_SUCCESS);
-
-    // Temporary tensors and their buffers if conversion is required
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_outbackprop_buf_tensor,
-        mkl_tmp_scaleshift_buf_tensor;
-    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
-                                        &mkl_tmp_outbackprop_buf_tensor,
-                                        &mkl_tmp_scaleshift_buf_tensor);
-
-    // Allocate tensor for grad w.r.t. input(x)
-    Tensor* in_backprop = nullptr;
-    TensorShape tf_shape_in_backprop;
-    MklShape mkl_shape_in_backprop;
-    mkl_shape_in_backprop.SetMklTensor(true);
-    mkl_shape_in_backprop.SetMklLayout(mkl_context.mkl_prim_batchnorm_bwd,
-                                       dnnResourceDiffSrc);
-    mkl_shape_in_backprop.SetTfLayout(mkl_context.mkl_params.in_dims,
-                                      mkl_context.mkl_params.in_sizes,
-                                      mkl_context.mkl_params.in_strides);
-    mkl_shape_in_backprop.SetTfDimOrder(mkl_context.mkl_params.in_dims,
-                                        tensor_format_);
-    tf_shape_in_backprop.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_shape_in_backprop.GetMklLayout())) /
-        sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &in_backprop, tf_shape_in_backprop,
-                              mkl_shape_in_backprop);
-    mkl_context.mkl_res_batchnorm_bwd[dnnResourceDiffSrc] =
-        static_cast<void*>(in_backprop->flat<T>().data());
-
-    // grad_scale and grad_shift are combined together in MKL
-    // So create a single temporary buffer for those.
-    // Also set dnnResourceDiffScaleShift to the temporary buffer
-    Tensor mkl_tmp_grad_scale_shift_buf_tensor;
-    mkl_context.MklPrepareGradScaleShift(context,
-                                         &mkl_tmp_grad_scale_shift_buf_tensor);
-
-    // All dnn resources are set now, ready to execute
-    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm_bwd,
-                            mkl_context.mkl_res_batchnorm_bwd),
-             E_SUCCESS);
-
-    // Now separate out scale and shift grad and copy to individual tensors
-    const TensorShape& tf_shape_scale_shift = scale.shape();
-    // Allocate tensor for grad w.r.t. scale (beta)
-    Tensor* scale_backprop = nullptr;
-    MklShape mkl_shape_scale_backprop;
-    AllocateOutputSetMklShape(context, 1, &scale_backprop, tf_shape_scale_shift,
-                              mkl_shape_scale_backprop);
-
-    // Allocate tensor for grad w.r.t. shift(gamma)
-    Tensor* shift_backprop = nullptr;
-    MklShape mkl_shape_shift_backprop;
-    AllocateOutputSetMklShape(context, 2, &shift_backprop, tf_shape_scale_shift,
-                              mkl_shape_shift_backprop);
-
-    // copy scale and shift grads to tensors
-    float* mkl_buf_scale_shift = const_cast<float*>(static_cast<const float*>(
-        mkl_tmp_grad_scale_shift_buf_tensor.flat<T>().data()));
-    float* tf_buf_scale = const_cast<float*>(
-        static_cast<const float*>(scale_backprop->flat<T>().data()));
-    float* tf_buf_shift = const_cast<float*>(
-        static_cast<const float*>(shift_backprop->flat<T>().data()));
-    auto depth = mkl_context.mkl_params.depth;
-    for (int i = 0; i < depth; i++) {
-      tf_buf_scale[i] = mkl_buf_scale_shift[i];
-      tf_buf_shift[i] = mkl_buf_scale_shift[i + depth];
-    }
-
-    // Two placeholders for estimated_mean and estimated_variance, which are
-    // used for inference and thus not needed here for gradient computation.
-    Tensor* placeholder_1 = nullptr;
-    MklShape mkl_shape_placeholder_1;
-    AllocateOutputSetMklShape(context, 3, &placeholder_1, TensorShape({}),
-                              mkl_shape_placeholder_1);
-    Tensor* placeholder_2 = nullptr;
-    MklShape mkl_shape_placeholder_2;
-    AllocateOutputSetMklShape(context, 4, &placeholder_2, TensorShape({}),
-                              mkl_shape_placeholder_2);
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  T epsilon_;
-  TensorFormat tensor_format_;
-
-  // Structure containing all info for MklOp
-  typedef struct {
-    // Parameters used for input and output layouts
-    struct MklBatchNormParams {
-      // BatchNormOp src and
-      size_t in_dims;
-      size_t in_sizes[4];
-      size_t in_strides[4];
-      size_t depth;  // Batch normalization is done for per channel.
-    } mkl_params;
-
-    MklShape mkl_shape_out_backprop;
-    MklShape mkl_shape_input_shape;
-
-    // MKL primitive and resources for BatchNormOp
-    dnnPrimitive_t mkl_prim_batchnorm_bwd = nullptr;
-    void* mkl_res_batchnorm_bwd[dnnResourceNumber];
-
-    // MKL layouts for inputs in the context
-    dnnLayout_t mkl_lt_out_backprop = nullptr;
-    dnnLayout_t mkl_lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
-      if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_out_backprop);
-
-      dnnDelete_F32(mkl_prim_batchnorm_bwd);
-    }
-
-    void MklExtractParams(OpKernelContext* context,
-                          const TensorFormat& tensor_format) {
-      const Tensor& input = MklGetInput(context, 1);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      mkl_params.in_dims = input_in_mkl_format
-                               ? mkl_shape_input_shape.GetDimension()
-                               : input.dims();
-      mkl_params.in_sizes[0] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
-                              : GetTensorDim(input, tensor_format, 'W'));
-      mkl_params.in_sizes[1] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
-                              : GetTensorDim(input, tensor_format, 'H'));
-      mkl_params.in_sizes[2] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
-                              : GetTensorDim(input, tensor_format, 'C'));
-      mkl_params.in_sizes[3] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
-                              : GetTensorDim(input, tensor_format, 'N'));
-      mkl_params.depth = mkl_params.in_sizes[2];
-      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
-                          mkl_params.in_sizes);
-    }
-
-    void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        mkl_lt_input =
-            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dims,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-
-      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
-      if (out_backprop_in_mkl_format) {
-        mkl_lt_out_backprop =
-            static_cast<dnnLayout_t>(mkl_shape_out_backprop.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_out_backprop, mkl_params.in_dims,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-    }
-
-    void MklPrepareContextInputs(OpKernelContext* context,
-                                 Tensor* mkl_tmp_input_buf_tensor,
-                                 Tensor* mkl_tmp_outbackprop_buf_tensor,
-                                 Tensor* mkl_tmp_scaleshift_buf_tensor) {
-      bool mkl_convert_input;
-      dnnPrimitive_t mkl_prim_convert_input = nullptr;
-      dnnLayout_t mkl_lt_internal_input = nullptr;
-      void* mkl_buf_converted_input = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 1);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(
-              &mkl_lt_internal_input, mkl_prim_batchnorm_bwd, dnnResourceSrc),
-          E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_converted_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_converted_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      mkl_res_batchnorm_bwd[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
-
-      bool mkl_convert_out_backprop;
-      dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
-      dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
-      void* mkl_buf_converted_out_backprop = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& out_backprop = MklGetInput(context, 0);
-      void* mkl_buf_out_backprop = const_cast<void*>(
-          static_cast<const void*>(out_backprop.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      mkl_convert_out_backprop = !dnnLayoutCompare_F32(
-          mkl_lt_internal_out_backprop, mkl_lt_out_backprop);
-      if (mkl_convert_out_backprop) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                                         mkl_lt_out_backprop,
-                                         mkl_lt_internal_out_backprop),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
-                       mkl_lt_internal_out_backprop,
-                       &mkl_buf_converted_out_backprop);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
-                                          mkl_buf_out_backprop,
-                                          mkl_buf_converted_out_backprop),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_out_backprop);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
-      mkl_res_batchnorm_bwd[dnnResourceDiffDst] =
-          (mkl_convert_out_backprop) ? mkl_buf_converted_out_backprop
-                                     : mkl_buf_out_backprop;
-
-      // Set dnnResourceMean and dnnResourceVariance
-      const Tensor& saved_mean = MklGetInput(context, 3);
-      const Tensor& saved_var = MklGetInput(context, 4);
-      void* mkl_buf_saved_mean = const_cast<void*>(
-          static_cast<const void*>(saved_mean.flat<T>().data()));
-      void* mkl_buf_saved_var = const_cast<void*>(
-          static_cast<const void*>(saved_var.flat<T>().data()));
-      mkl_res_batchnorm_bwd[dnnResourceMean] = mkl_buf_saved_mean;
-      mkl_res_batchnorm_bwd[dnnResourceVariance] = mkl_buf_saved_var;
-
-      // Set dnnResourceScaleShift
-      // Note backward Op needs only current values of scale parameters,
-      // shift parameters could be garbage and won't be used
-      const Tensor& scale = MklGetInput(context, 2);
-      dnnLayout_t mkl_lt_scale_shift = nullptr;
-      void* mkl_buf_scale_shift = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_scale_shift,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceScaleShift),
-               E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_scaleshift_buf_tensor, mkl_lt_scale_shift,
-                     &mkl_buf_scale_shift);
-      float* pscale =
-          const_cast<float*>(static_cast<const float*>(scale.flat<T>().data()));
-      float* pscale_shift = static_cast<float*>(mkl_buf_scale_shift);
-      auto depth = mkl_params.depth;
-      for (int i = 0; i < depth; i++) pscale_shift[i] = pscale[i];
-      mkl_res_batchnorm_bwd[dnnResourceScaleShift] = mkl_buf_scale_shift;
-      dnnLayoutDelete_F32(mkl_lt_scale_shift);
-    }
-
-    void MklPrepareGradScaleShift(OpKernelContext* context,
-                                  Tensor* mkl_tmp_grad_scale_shift_buf_tensor) {
-      dnnLayout_t mkl_lt_grad_scaleshift = nullptr;
-      void* mkl_buf_grad_scaleshift = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_grad_scaleshift,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceDiffScaleShift),
-               E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_grad_scale_shift_buf_tensor,
-                     mkl_lt_grad_scaleshift, &mkl_buf_grad_scaleshift);
-      mkl_res_batchnorm_bwd[dnnResourceDiffScaleShift] =
-          mkl_buf_grad_scaleshift;
-      dnnLayoutDelete_F32(mkl_lt_grad_scaleshift);
-    }
-  } MklFusedBatchNormGradOpContext;
-};
-#endif
-
-#ifndef INTEL_MKL_ML_ONLY
-
 struct MklBatchNormFwdParams {
   memory::dims src_dims;
   int depth;
@@ -1765,8 +1112,6 @@ class MklFusedBatchNormGradOp : public OpKernel {
   memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
 };
 
-#endif
-
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc
index 258cca9332b5b86adbf0bbcb285210552729243e..3be3fecdc07d3a811c8cfa95060b3fe6d6f6b96e 100644
--- a/tensorflow/core/kernels/mkl_fused_ops_test.cc
+++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc
@@ -38,8 +38,12 @@ namespace tensorflow {
 static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
 static const TensorShape dummy_shape({8});
 
+using BiasAddGraphRunner =
+    std::function<void(const Tensor& input_data, const Tensor& filter_data,
+                       const Tensor& bias_data, Tensor* out)>;
+
 template <typename T>
-class ConvMklToTF : public OpsTestBase {
+class CommonTestUtilities : public OpsTestBase {
  public:
   void PerformConversion(DataType dtype, const Tensor& tensor,
                          const Tensor& mkl_meta_tensor, Tensor* output) {
@@ -59,6 +63,23 @@ class ConvMklToTF : public OpsTestBase {
     *output = *GetOutput(0);
   }
 
+  // Runs a Tensorflow graph defined by the root scope, and fetches the result
+  // of 'fetch' node into the output Tensor.
+  static void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+                          Tensor* output) {
+    tensorflow::GraphDef graph;
+    TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+    std::unique_ptr<tensorflow::Session> session(
+        tensorflow::NewSession(tensorflow::SessionOptions()));
+    TF_ASSERT_OK(session->Create(graph));
+
+    std::vector<Tensor> unfused_tensors;
+    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
+
+    *output = unfused_tensors[0];
+  }
+
   void ConvertAndCompare(DataType dtype, const Tensor& tensor,
                          const Tensor& mkl_meta_tensor,
                          const Tensor& expected) {
@@ -67,6 +88,35 @@ class ConvMklToTF : public OpsTestBase {
     test::ExpectTensorNear<T>(expected, output, 1e-5);
   }
   void TestBody() {}
+
+  static void VerifyBiasAddTensorsClose(int depth, int image_width,
+                                        int image_height, int image_batch_count,
+                                        int filter_size, int filter_count,
+                                        const BiasAddGraphRunner& run_default,
+                                        const BiasAddGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+
+    const int bias_size = filter_count;
+    Tensor bias(dtype, {bias_size});
+    bias.flat<T>() = bias.flat<T>().setRandom();
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, bias, &conv_2d);
+    run_fused(image, filter, bias, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    test::ExpectClose(conv_2d, fused_conv_2d);
+  }
 };
 
 // Testing MKL's fused convolution ops
@@ -79,27 +129,6 @@ class MklFusedConv2DOpTest : public OpsTestBase {
   static constexpr int kImageHeight = 32;
   static constexpr int kImageBatchCount = 8;
 
-  using BiasAddGraphRunner =
-      std::function<void(const Tensor& input_data, const Tensor& filter_data,
-                         const Tensor& bias_data, Tensor* out)>;
-
-  // Runs a Tensorflow graph defined by the root scope, and fetches the result
-  // of 'fetch' node into the output Tensor.
-  void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
-                   Tensor* output) {
-    tensorflow::GraphDef graph;
-    TF_ASSERT_OK(root.ToGraphDef(&graph));
-
-    std::unique_ptr<tensorflow::Session> session(
-        tensorflow::NewSession(tensorflow::SessionOptions()));
-    TF_ASSERT_OK(session->Create(graph));
-
-    std::vector<Tensor> unfused_tensors;
-    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
-
-    *output = unfused_tensors[0];
-  }
-
   void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
                          const Tensor& bias_data, Tensor* output,
                          int stride = 1) {
@@ -115,7 +144,7 @@ class MklFusedConv2DOpTest : public OpsTestBase {
         root.WithOpName("with_bias"), conv,
         ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
 
-    RunAndFetch(root, "with_bias", output);
+    CommonTestUtilities<T>::RunAndFetch(root, "with_bias", output);
   }
 
   void RunConv2DWithBiasAndRelu(const Tensor& input_data,
@@ -136,7 +165,7 @@ class MklFusedConv2DOpTest : public OpsTestBase {
 
     auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
 
-    RunAndFetch(root, "with_relu", output);
+    CommonTestUtilities<T>::RunAndFetch(root, "with_relu", output);
   }
 
   void RunMklFusedConv2DOp(const Tensor& image, const Tensor& filter,
@@ -149,12 +178,12 @@ class MklFusedConv2DOpTest : public OpsTestBase {
     TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_MklFusedConv2D")
                      .Input(FakeInput(dtype))
                      .Input(FakeInput(dtype))
-                     .Attr("num_args", num_args)
                      .Input(FakeInput(num_args, dtype))
                      .Input(FakeInput(DT_UINT8))
                      .Input(FakeInput(DT_UINT8))
                      .Input(FakeInput(num_args, DT_UINT8))
                      .Attr("T", dtype)
+                     .Attr("num_args", num_args)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", "SAME")
                      .Attr("fused_ops", fused_ops)
@@ -178,40 +207,11 @@ class MklFusedConv2DOpTest : public OpsTestBase {
     // Index 2 will need to be changed if the number of outputs produced
     // by MklConv2D change.
     const Tensor& output_meta_tensor = *GetOutput(2);
-    ConvMklToTF<T> conv_comp;
-    conv_comp.PerformConversion(dtype, output_tensor, output_meta_tensor,
+    CommonTestUtilities<T> test_util;
+    test_util.PerformConversion(dtype, output_tensor, output_meta_tensor,
                                 output);
   }
 
-  void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
-                                int image_batch_count, int filter_size,
-                                int filter_count,
-                                const BiasAddGraphRunner& run_default,
-                                const BiasAddGraphRunner& run_fused) {
-    DataType dtype = DataTypeToEnum<T>::v();
-
-    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
-    image.flat<T>() = image.flat<T>().setRandom();
-
-    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
-    filter.flat<T>() = filter.flat<T>().setRandom();
-
-    const int bias_size = filter_count;
-    Tensor bias(dtype, {bias_size});
-    bias.flat<T>() = bias.flat<T>().setRandom();
-
-    Tensor conv_2d;
-    Tensor fused_conv_2d;
-
-    run_default(image, filter, bias, &conv_2d);
-    run_fused(image, filter, bias, &fused_conv_2d);
-
-    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
-    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
-
-    test::ExpectClose(conv_2d, fused_conv_2d);
-  }
-
   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
   // FusedConv2D.
   void VerifyConv2DWithBias(int filter_size, int filter_count,
@@ -231,9 +231,9 @@ class MklFusedConv2DOpTest : public OpsTestBase {
                               out);
         };
 
-    VerifyBiasAddTensorsNear(depth, image_width, image_height,
-                             image_batch_count, filter_size, filter_count,
-                             run_default, run_fused);
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
   }
 
   // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
@@ -256,9 +256,9 @@ class MklFusedConv2DOpTest : public OpsTestBase {
                               {"BiasAdd", "Relu"}, out);
         };
 
-    VerifyBiasAddTensorsNear(depth, image_width, image_height,
-                             image_batch_count, filter_size, filter_count,
-                             run_default, run_fused);
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
   }
 };
 
@@ -341,8 +341,8 @@ class FusedPadConvOpTest : public OpsTestBase {
     // Compare output to expected results
     const Tensor& first = *GetOutput(0);
     const Tensor& second = *GetOutput(2);
-    ConvMklToTF<T> conv_comp;
-    conv_comp.ConvertAndCompare(dtype, first, second, expected);
+    CommonTestUtilities<T> test_util;
+    test_util.ConvertAndCompare(dtype, first, second, expected);
   }
 };
 
@@ -401,5 +401,226 @@ TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) {
 
   Run<float>(DT_FLOAT, image, filter, padding, expected, "NCHW");
 }
+
+// Testing fusion of pad and fusedconv2d
+template <typename T>
+class MklPadWithFusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 30;
+  static constexpr int kImageHeight = 28;
+  static constexpr int kImageBatchCount = 8;
+
+  // 0: top pad, 1: bottom pad, 2: left pad, 3: right pad
+  int padding_list_[4];
+
+  // Verifies that computing Pad+Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyPadAndConv2DWithBias(int filter_size, int filter_count,
+                                  int depth = kDepth,
+                                  int image_width = kImageWidth,
+                                  int image_height = kImageHeight,
+                                  int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default = [this](const Tensor& input_data,
+                                                  const Tensor& filter_data,
+                                                  const Tensor& bias_data,
+                                                  Tensor* out) {
+      RunMklPadWithFusedConv2DAndBias(input_data, filter_data, bias_data, out);
+    };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data},
+                                     {"BiasAdd"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Pad+Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyPadAndConv2DWithBiasRelu(
+      int filter_size, int filter_count, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
+      int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklPadWithFusedConv2DAndBiasRelu(input_data, filter_data,
+                                              bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data},
+                                     {"BiasAdd", "Relu"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  void RunMklPadWithFusedConv2DAndBias(const Tensor& input_data,
+                                       const Tensor& filter_data,
+                                       const Tensor& bias_data, Tensor* output,
+                                       int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    // FusedConv2D only supports NHWC format so we use NHWC here.
+    auto padding = ops::Const(root.WithOpName("padding"),
+                              {0, 0, padding_list_[0], padding_list_[1],
+                               padding_list_[2], padding_list_[3], 0, 0},
+                              {4, 2});
+    auto pad = ops::Pad(
+        root.WithOpName("pad"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        padding);
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"), pad,
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "VALID");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunMklPadWithFusedConv2DAndBiasRelu(const Tensor& input_data,
+                                           const Tensor& filter_data,
+                                           const Tensor& bias_data,
+                                           Tensor* output, int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    // FusedConv2D only supports NHWC format so we use NHWC here.
+    auto padding = ops::Const(root.WithOpName("padding"),
+                              {0, 0, padding_list_[0], padding_list_[1],
+                               padding_list_[2], padding_list_[3], 0, 0},
+                              {4, 2});
+    auto pad = ops::Pad(
+        root.WithOpName("pad"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        padding);
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"), pad,
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "VALID");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunMklFusedConv2DWithPadOp(const Tensor& image, const Tensor& filter,
+                                  const std::vector<Tensor>& args,
+                                  const std::vector<string>& fused_ops,
+                                  Tensor* output, int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    const int num_args = static_cast<int>(args.size());
+    Tensor padding(DT_INT32, {4, 2});
+    test::FillValues<int32>(
+        &padding, {0, 0, padding_list_[0], padding_list_[1], padding_list_[2],
+                   padding_list_[3], 0, 0});
+
+    TF_EXPECT_OK(NodeDefBuilder("pad_fused_conv_op", "_MklPadWithFusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(num_args, dtype))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(num_args, DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Attr("T", dtype)
+                     .Attr("num_args", num_args)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "VALID")
+                     .Attr("fused_ops", fused_ops)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    AddInputFromArray<int32>(padding.shape(), padding.flat<int32>());
+    // Add MKL meta input for input, filter, pad and agrs.
+    for (int i = 0; i < args.size() + 3; ++i)
+      AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_tensor = *GetOutput(0);
+    // Index 2 will need to be changed if the number of outputs produced
+    // by MklConv2D change.
+    const Tensor& output_meta_tensor = *GetOutput(2);
+    CommonTestUtilities<T> test_util;
+    test_util.PerformConversion(dtype, output_tensor, output_meta_tensor,
+                                output);
+  }
+
+ public:
+  void SetPaddingList(int top, int bottom, int left, int right) {
+    padding_list_[0] = top;
+    padding_list_[1] = bottom;
+    padding_list_[2] = left;
+    padding_list_[3] = right;
+  }
+};
+
+TYPED_TEST_CASE_P(MklPadWithFusedConv2DOpTest);
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasAndRoundPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(2, 2, 1, 1);
+  this->VerifyPadAndConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasAndPartialPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(4, 0, 2, 0);
+  this->VerifyPadAndConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasReluAndRoundPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(2, 2, 1, 1);
+  this->VerifyPadAndConv2DWithBiasRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasReluAndPartialPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(4, 0, 2, 0);
+  this->VerifyPadAndConv2DWithBiasRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(MklPadWithFusedConv2DOpTest,  //
+                           WithBiasAndRoundPad,          //
+                           WithBiasAndPartialPad,        //
+                           WithBiasReluAndRoundPad,      //
+                           WithBiasReluAndPartialPad);
+
+using MklPadWithFusedConv2DDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, MklPadWithFusedConv2DOpTest,
+                              MklPadWithFusedConv2DDataTypes);
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 4d46abb0a4dd232ef13c8b6b0547b0779af1f98f..bc52127b942375c89cea832e3013684687374cb6 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -24,10 +24,10 @@ limitations under the License.
 #include <vector>
 #include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index dc84d3941e78a2232041b2dbcf83bf3545982dee..a8d1dffd4e52c8e9a16a0a82cf8c31be9cb628e9 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <limits>
 #include <vector>
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index d8ab1cd25b9e09e6b25e2b0454567caa3dcea9e0..19585969993d6eaf16b62f7abcf01fdefae3fad4 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,15 +16,14 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
+#include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::algorithm;
 using mkldnn::eltwise_bounded_relu;
@@ -36,16 +35,9 @@ using mkldnn::prop_kind;
 using mkldnn::relu_backward;
 using mkldnn::relu_forward;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_ML_ONLY
-
 template <typename T>
 class MklEltwiseFwdParams {
  public:
@@ -451,335 +443,8 @@ class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-#endif
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-struct MklReluHelpers {
-  static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
-                                     const Tensor& a) {
-    OP_REQUIRES(context, a.IsSameSize(g),
-                errors::InvalidArgument("g and a must be the same size"));
-  }
-  static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
-                               const Tensor& a) {
-    ValidateSameSizeHelper(context, g, a);
-    return context->status().ok();
-  }
-};
-
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklReluOp : public OpKernel {
- public:
-  ~MklReluOp() {}
-
-  explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    MklReluOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
-      const TensorShape& o_shape = input.shape();
-      Tensor* out_tensor = nullptr;
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &out_tensor, o_shape,
-                                mkl_context.output_shape);
-      void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
-      (static_cast<T*>(out_o))[0] =
-          std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
-      return;
-    }
-
-    // Generate size, stride for input if input is in MKL format.
-    if (input_in_mkl_format) {
-      mkl_context.in_dims = mkl_context.input_shape.GetDimension();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
-        mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
-      }
-    } else {
-      mkl_context.in_dims = input.dims();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
-      }
-      mkl_context.in_strides[0] = 1;
-      for (int i = 1; i < mkl_context.in_dims; i++) {
-        mkl_context.in_strides[i] =
-            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-      }
-    }
-
-    float negative_slope = 0.0;
-    mkl_context.MklCreateInputLayouts(context);
-    CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
-                                      mkl_context.lt_input, negative_slope),
-             E_SUCCESS);
-
-    Tensor* output = nullptr;
-
-    if (input_in_mkl_format) {
-      TensorShape tf_shape;
-      mkl_context.output_shape.SetMklTensor(true);
-      mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
-                                            dnnResourceDst);
-      mkl_context.output_shape.SetTfLayout(
-          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                          mkl_context.output_shape.GetMklLayout())) /
-                      sizeof(T));
-      AllocateOutputSetMklShape(context, 0, &output, tf_shape,
-                                mkl_context.output_shape);
-    } else {
-      const TensorShape& o_shape = input.shape();
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &output, o_shape,
-                                mkl_context.output_shape);
-    }
-
-    void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
-
-    mkl_context.relu_res[dnnResourceDst] = user_o;
-    mkl_context.relu_res[dnnResourceSrc] = user_i;
-    CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
-             E_SUCCESS);
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, output_shape;
-    dnnPrimitive_t prim_relu_fwd = nullptr;
-    void* relu_res[dnnResourceNumber];
-    dnnLayout_t lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      dnnDelete_F32(prim_relu_fwd);
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-    }
-  } MklReluOpContext;
-};
-
-template <typename Device, typename T>
-class MklReluGradOp : public OpKernel {
- public:
-  ~MklReluGradOp() {}
-
-  explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override;
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, grad_shape, output_shape;
-    void* relu_res[dnnResourceNumber];
-    dnnPrimitive_t prim_relu_bwd;
-    dnnLayout_t lt_input, lt_grad;
-
-    void MklPrepareReluGradInputs(OpKernelContext* context,
-                                  Tensor* mkl_tmp_input_buf_tensor) {
-      const Tensor& g = MklGetInput(context, 0);
-      const Tensor& a = MklGetInput(context, 1);
-      void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-      void* mkl_buffer_convert = nullptr;
-
-      dnnPrimitive_t cv_input_to_grad = nullptr;
-
-      // if input and grad are not in the same layout,
-      // do a conversion between them.
-      if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
-                       &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert),
-                 E_SUCCESS);
-        relu_res[dnnResourceSrc] = mkl_buffer_convert;
-        dnnDelete_F32(cv_input_to_grad);
-      } else {
-        relu_res[dnnResourceSrc] = buf_input;
-      }
-
-      void* buf_grad = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-      relu_res[dnnResourceDiffDst] = buf_grad;
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      if (!input_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-
-      if (!grad_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
-      }
-    }
-
-    void MklCleanup() {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      dnnDelete_F32(prim_relu_bwd);
-      if (!input_is_mkl) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      if (!grad_is_mkl) {
-        dnnLayoutDelete_F32(lt_grad);
-      }
-    }
-  } MklReluGradOpContext;
-};
-
-template <typename Device, typename T>
-void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
-  MklReluGradOpContext mkl_context;
-  const Tensor& g = MklGetInput(context, 0);
-  const Tensor& a = MklGetInput(context, 1);
-
-  void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-  void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-
-  GetMklShape(context, 0, &mkl_context.grad_shape);
-  GetMklShape(context, 1, &mkl_context.input_shape);
-
-  bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
-  bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
-  if (!input_is_mkl && !grad_is_mkl &&
-      !MklReluHelpers::ValidateSameSize(context, g, a))
-    return;
-  Tensor* output = nullptr;
-
-  if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
-    // handle the scalar case
-    const TensorShape& g_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 0, &output, g_shape,
-                              mkl_context.output_shape);
-
-    void* out_o = static_cast<void*>(output->flat<T>().data());
-    (static_cast<T*>(out_o))[0] =
-        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
-    return;
-  }
-
-  // generate size, stride for input if input/grad is in mkl format.
-  if (grad_is_mkl || input_is_mkl) {
-    const MklShape* tmp_mkl_shape =
-        (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
-
-    mkl_context.in_dims = tmp_mkl_shape->GetDimension();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
-      mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
-    }
-  } else {
-    mkl_context.in_dims = g.dims();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
-    }
-    mkl_context.in_strides[0] = 1;
-    for (int i = 1; i < mkl_context.in_dims; i++) {
-      mkl_context.in_strides[i] =
-          mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-    }
-  }
-
-  mkl_context.MklCreateInputLayouts(context);
-  float negative_slope = 0.0;
-  CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
-                                     mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope),
-           E_SUCCESS);
-  Tensor mkl_tmp_input_buf_tensor;
-  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
-
-  if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are mkl leave it in mkl*/
-    TensorShape tf_shape;
-    mkl_context.output_shape.SetMklTensor(true);
-    mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
-                                          dnnResourceDiffSrc);
-    mkl_context.output_shape.SetTfLayout(
-        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
-    // shape of one that is in mkl layout.
-    if (grad_is_mkl == true) {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-    }
-
-    tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &output, tf_shape,
-                              mkl_context.output_shape);
-  } else {
-    const TensorShape& o_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 0, &output, o_shape,
-                              mkl_context.output_shape);
-  }
-
-  mkl_context.relu_res[dnnResourceDiffSrc] =
-      static_cast<void*>(output->flat<T>().data());
-
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
-           E_SUCCESS);
-  mkl_context.MklCleanup();
-}
-
-#else  // INTEL_MKL_ML_ONLY
-
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
  public:
@@ -1399,8 +1064,6 @@ class MklLeakyReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
   }
 };
 
-#endif
-
 // register dnn kernels for supported operations and supported types
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
   REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
@@ -1415,8 +1078,6 @@ class MklLeakyReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifndef INTEL_MKL_ML_ONLY
-
 // register dnn kernels for supported operations and supported types
 #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
   REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
@@ -1470,8 +1131,6 @@ TF_CALL_float(REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES);
                           MklLeakyReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_LeakyRelu_MKL_SUPPORTED_KERNELS_TYPES);
 
-#endif
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
index 5cc5877cceb19320023423d35a352c5ba3db13e2..62e38694c8fbe97eb09ccfdca3aa608ec89211ac 100644
--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -22,9 +22,10 @@ limitations under the License.
 #include <assert.h>
 #include <stdio.h>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
+#include "tensorflow/core/kernels/reduction_ops_common.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
@@ -67,7 +68,6 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
                                                  noises.size(), Dist());
 
 #if defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::IndexList<Eigen::type2index<2>> kTwo;
     Eigen::IndexList<int, int, int> bsc;
     bsc.set(0, batch_size);
     bsc.set(1, num_samples);
@@ -80,7 +80,6 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
     Eigen::IndexList<Eigen::type2index<1>, int, Eigen::type2index<1>> oso;
     oso.set(1, num_samples);
 #else
-    Eigen::array<int, 1> kTwo{2};
     Eigen::array<int, 3> bsc{batch_size, num_samples, num_classes};
     Eigen::array<int, 3> boc{batch_size, 1, num_classes};
     Eigen::array<int, 3> oso{1, num_samples, 1};
@@ -98,7 +97,14 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
         ((-((To32Bit(noises) + 2e-30f).log())).log());
 
     // Max-reduce along classes for each (batch, sample).
-    To32Bit(maxima).device(d) = To32Bit(scores).reshape(bsc).maximum(kTwo);
+    typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
+    Constants<GPUDevice> constants;
+    cub::Max op;
+    functor::ReduceImpl<float, cub::Max, float*, const float*, ReductionAxes>(
+        /*ctx=*/ctx, /*out=*/maxima.data(), /*in=*/scores.data(), /*in_rank=*/2,
+        /*in_dim0=*/batch_size * num_samples,
+        /*in_dim1=*/num_classes, /*in_dim2=*/1, /*out_rank=*/1,
+        /*reduction_axes=*/constants.kOne, /*Op=*/op);
 
     // Necessary for atomicMax() inside the kernel.
     output.device(d) = output.constant(0LL);
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index ddb7a606c1a7f0264c7c4a9cbb2f97095d9fee01..1603a2aa869e4959713741bfb501798193a63d42 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -45,7 +45,9 @@ class Mutex : public ResourceBase {
     VLOG(2) << "Creating mutex with name " << name << ": " << this;
   }
 
-  string DebugString() override { return strings::StrCat("Mutex ", name_); }
+  string DebugString() const override {
+    return strings::StrCat("Mutex ", name_);
+  }
 
   class LockReleaser {
    public:
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index 313d40c082b3e334a01ba97eaf4449e1940b013a..6665152e3e3c7592cda8e0a09dd75d4b2409d6c4 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -24,7 +24,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:ops_util",
         "@gemmlowp",
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
index 0e820bbb6208ae9c13ac2fb33f67590b9e66ba7e..b218f62ddd9a02026bd654fd76dd2223152da9a8 100644
--- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
 #include "public/gemmlowp.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/nextafter_op.cc b/tensorflow/core/kernels/nextafter_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6166a1053f32c0b0b7fba4ceda69ad3126346f65
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/nextafter_op.h"
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+
+REGISTER2(BinaryOp, CPU, "NextAfter", functor::nextafter, float, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                     \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("NextAfter").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      BinaryOp<SYCLDevice, functor::nextafter<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER2(BinaryOp, GPU, "NextAfter", functor::nextafter, float, double);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nextafter_op.h b/tensorflow/core/kernels/nextafter_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..64374980f2d5aec7c2d5a9011f14280cd6c394ed
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
+#define TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T>
+struct nextafter_op {
+  EIGEN_EMPTY_STRUCT_CTOR(nextafter_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x1,
+                                                           const T& x2) const {
+    return std::nextafter(x1, x2);
+  }
+};
+
+template <typename T>
+struct nextafter : base<T, nextafter_op<T>> {};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
diff --git a/tensorflow/core/kernels/nextafter_op_gpu.cu.cc b/tensorflow/core/kernels/nextafter_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2321c6a882c425f9851cb59a48e5b4c5aed9cb5
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op_gpu.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/nextafter_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+DEFINE_BINARY2(nextafter, float, double);
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 37f615abd97044caa7703837714840b8d451d420..2f4e51272ff30c47c4e23b4894440de37a822b1d 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include <vector>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index 3b9133ed7e2c210aab3488d667f0c2e543207fcf..691430ebaff5a99ccb103c5f5a80263d15f24b6a 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -322,6 +322,7 @@ namespace functor {
 
 TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_int8(DECLARE_GPU_SPECS);
+TF_CALL_uint8(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
@@ -355,6 +356,7 @@ TF_CALL_int8(DECLARE_GPU_SPECS);
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_int8(REGISTER_GPU_KERNEL);
+TF_CALL_uint8(REGISTER_GPU_KERNEL);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
index 00ec44adc284099b3fed644d4742af8d07ae13e1..0cd8ef17ba2be995c719dccb5b3a104f9bd09f68 100644
--- a/tensorflow/core/kernels/pad_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -41,6 +41,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_int8(DEFINE_GPU_SPECS);
+TF_CALL_uint8(DEFINE_GPU_SPECS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index fbecd909beacd88d80384a259345727981b64b6c..5d26265aaafac3e84036ad6668d12df884cda365 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -12,34 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_partition.h"
-#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
-#include "tensorflow/core/util/reffed_status_callback.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/stream_executor/stream.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
-typedef FunctionLibraryRuntime::Handle FHandle;
-
 namespace {
 // A `PartitionedCallOp` asynchronously executes a function, potentially across
 // multiple devices but within a single process. The kernel places and
@@ -77,7 +68,15 @@ class PartitionedCallOp : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
   }
 
-  ~PartitionedCallOp() override {}
+  ~PartitionedCallOp() override {
+    for (const auto& it : handles_) {
+      Status status = it.first->ReleaseHandle(it.second);
+      if (!status.ok()) {
+        LOG(INFO) << "Ignoring error while destructing PartitionedCallOp: "
+                  << status.ToString();
+      }
+    }
+  }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     FunctionLibraryRuntime* lib = ctx->function_library();
@@ -85,9 +84,6 @@ class PartitionedCallOp : public AsyncOpKernel {
                       errors::Internal("No function library is provided."),
                       done);
 
-    OpInputList args;
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
-
     // The function body's graph is placed and partitioned the first time
     // `ComputeAsync` is invoked; every subsequent invocation calls each
     // of the function shards yielded by partitioning.
@@ -97,526 +93,179 @@ class PartitionedCallOp : public AsyncOpKernel {
     // Inputs and outputs are pinned to the local device, for simplicity.
     //
     // TODO(akshayka): Support re-sharding the function on subsequent calls,
-    // via, e.g., virtual device annotations and a list of device names supplied
-    // through an attribute.
+    // via, e.g., virtual device annotations and a list of device names
+    // supplied through an attribute.
     //
     // TODO(akshayka): Add a fastpath for functions that execute on a single
     // device.
+    FunctionLibraryRuntime::Handle handle;
+    // If we are instantiating the function, we can efficiently extract the
+    // inputs while instantiating. Else, we extract them separately below.
+    std::vector<Tensor> inputs;
+    bool inputs_extracted = false;
     {
       mutex_lock l(mu_);
-      if (function_handles_.find(lib) == function_handles_.end()) {
-        // TODO(b/37549631): Because this kernel may correspond to a stateful
-        // op, it may be shared by multiple subgraphs, which in turn may have
-        // different `FunctionLibraryRuntime` objects and therefore different
-        // `FHandle` namespaces. As such, we partition on a per-FLR basis.
-        FunctionLibraryRuntime::InstantiateOptions opts;
-        FHandle handle;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts,
-                             &handle),
-            done);
-        const FunctionBody* fbody = lib->GetFunctionBody(handle);
-        OP_REQUIRES_ASYNC(ctx, fbody != nullptr,
-                          errors::Internal("Could not find handle ", handle),
-                          done);
-        OP_REQUIRES_ASYNC(
-            ctx, args.size() == fbody->arg_nodes.size(),
-            errors::InvalidArgument(
-                "Wrong number of arguments to the op; function expects ",
-                fbody->arg_nodes.size(), " but PartitionedCall received ",
-                args.size()),
-            done);
-        // We need to pass global op_registry as default_registry when creating
-        // graph. So that graph optimization passes can lookup all possible ops
-        // by name.
-        auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
-        FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-        TF_CHECK_OK(graph->AddFunctionLibrary(global_flib.ToProto()));
-        CopyGraph(*fbody->graph, graph.get());
-        OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
-
-        DeviceSet device_set;
-        for (auto d : lib->device_mgr()->ListDevices()) {
-          device_set.AddDevice(d);
-        }
-
-        // The FunctionLibraryRuntime's library cannot be mutated from within
-        // an OpKernel, so functions are instantiated in an overlay library.
-        OP_REQUIRES_ASYNC(
-            ctx, overlay_libs_.find(lib) == overlay_libs_.end(),
-            errors::Internal("Found an overlay library but did not "
-                             "find cached function partitions; "
-                             "this indicates a bug."),
-            done);
-        // We do not need a full function library in the overlay, we just keep a
-        // subset that is reachable from the instantiated function.
-        FunctionLibraryDefinition* overlay_lib = new FunctionLibraryDefinition(
-            grappler::ReachableFunctionLibraryDefinition(
-                *lib->GetFunctionLibraryDefinition(), fbody->fdef));
-        overlay_libs_.emplace(lib, overlay_lib);
-
-        GraphOptimizationPassOptions optimization_options;
-        // TODO(akshayka): Thread SessionOptions (if any) into this kernel, or
-        // make it possible to specify the relevant options via attributes.
-        SessionOptions session_options;
-        session_options.env = ctx->env();
-        optimization_options.session_options = &session_options;
-        optimization_options.graph = &graph;
-        optimization_options.flib_def = overlay_lib;
-        optimization_options.device_set = &device_set;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::PRE_PLACEMENT, optimization_options),
-            done);
-
-        // Make the FunctionLibraryRuntime's device the default device if
-        // nothing else is hard coded. This allows the same function definition
-        // to be specialized to different devices depending on the
-        // PartitionedCallOp's device.
-        Placer placer(graph.get(), &device_set,
-                      nullptr, /* No session options */
-                      lib->device() /* Default device */);
-        OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_PLACEMENT, optimization_options),
-            done);
-
-        Device* cpu_device;
-        OP_REQUIRES_OK_ASYNC(
-            ctx, lib->device_mgr()->LookupDevice("CPU:0", &cpu_device), done);
-
-        // Run grappler passes on the graph. It is possible that these are
-        // optimized by the graph executor already.
-        Status optimized = OptimizeGraph(ctx, fbody->ret_nodes, overlay_lib,
-                                         device_set, cpu_device, &graph);
-        if (!optimized.ok()) {
-          LOG(WARNING) << "Grappler optimization failed. Error: "
-                       << optimized.error_message();
-        }
-
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
-                optimization_options),
-            done);
-
-        std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
-        OP_REQUIRES_OK_ASYNC(
-            ctx, PartitionHelper(device_set, std::move(graph), &subgraphs),
-            done);
-        if (ctx->graph_collector() != nullptr) {
-          for (const auto& pair : subgraphs) {
-            GraphDef def;
-            pair.second->ToGraphDef(&def);
-            ctx->graph_collector()->CollectGraph(def);
-          }
-        }
-        optimization_options.graph = nullptr;
-        optimization_options.device_set = nullptr;
-        optimization_options.partition_graphs = &subgraphs;
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             OptimizationPassRegistry::Global()->RunGrouping(
-                                 OptimizationPassRegistry::POST_PARTITIONING,
-                                 optimization_options),
+      auto it = handles_.find(lib);
+      if (it == handles_.end()) {
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, ctx, &inputs, &handle),
                              done);
+        inputs_extracted = true;
+        handles_[lib] = handle;
+      } else {
+        handle = it->second;
+      }
+    }
 
-        auto handles = tensorflow::MakeUnique<gtl::FlatMap<string, FHandle>>();
-        for (const auto& pair : subgraphs) {
-          // TODO(akshayka): Fail gracefully if the set of devices corresponds
-          // to more than one address space.
-          const string& target = pair.first;
-          const auto& subgraph = pair.second;
-          OP_REQUIRES_OK_ASYNC(
-              ctx, UpdateArgAndRetMetadata(target, subgraph.get()), done);
-          FunctionDef shard;
-          string unique_name = UniquifyFunctionName(overlay_lib, func_.name());
-          OP_REQUIRES_OK_ASYNC(
-              ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
-          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib->AddFunctionDef(shard), done);
-          FunctionLibraryRuntime::InstantiateOptions opts;
-          opts.executor_type = executor_type_;
-          opts.target = target;
-          opts.overlay_lib = overlay_lib;
-          FHandle handle;
-          OP_REQUIRES_OK_ASYNC(
-              ctx,
-              lib->Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
-                               &handle),
-              done);
-          handles->emplace(target, handle);
-        }
-
-        function_handles_.emplace(lib, std::move(handles));
+    if (!inputs_extracted) {
+      OpInputList args;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
+      inputs.reserve(args.size());
+      for (const Tensor& tensor : args) {
+        inputs.push_back(tensor);
       }
     }
-    ExecuteFunctions(lib, ctx, args, std::move(done));
+
+    RunFunction(handle, inputs, lib, ctx, done);
   }
 
  private:
-  typedef std::pair<string, FHandle> DeviceAndFHandle;
-  typedef std::pair<std::vector<int>, std::vector<int>> ArgAndRetIndices;
-  typedef std::pair<std::vector<AllocatorAttributes>,
-                    std::vector<AllocatorAttributes>>
-      ArgAndRetAllocAttrs;
+  Status FillOutputDevices(const FunctionLibraryRuntime& lib,
+                           const Device& cpu_device, AttrSlice attrs,
+                           FunctionLibraryRuntime::InstantiateOptions* opts) {
+    const FunctionLibraryDefinition* flib = lib.GetFunctionLibraryDefinition();
+    const FunctionDef* fdef = flib->Find(func_.name());
+    if (fdef == nullptr) {
+      return errors::NotFound("Failed for find definiton for function \"",
+                              func_.name(), "\"");
+    }
 
-  // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
-  // corresponding resource lives. This ensures that the Placer assigns ops that
-  // access these resources to the appropriate devices.
-  Status PinResourceArgs(Graph* graph, const OpInputList& args) {
-    for (Node* node : graph->op_nodes()) {
-      string node_type = node->type_string();
-      if (node_type == FunctionLibraryDefinition::kArgOp) {
-        const AttrValue* attr_value;
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        TF_RETURN_IF_ERROR(node->attrs().Find("T", &attr_value));
-        DataType dtype = attr_value->type();
-        if (dtype != args[index].dtype()) {
-          return errors::InvalidArgument("For argument ", index, " expected ",
-                                         DataTypeString(dtype), " tensor, got ",
-                                         DataTypeString(args[index].dtype()),
-                                         " instead.");
-        }
-        if (dtype == DT_RESOURCE) {
-          const ResourceHandle& handle = args[index].flat<ResourceHandle>()(0);
-          node->set_assigned_device_name(handle.device());
+    bool is_type_list;
+    for (const OpDef::ArgDef& ret_def : fdef->signature().output_arg()) {
+      DataTypeVector dtypes;
+      TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
+      for (DataType dtype : dtypes) {
+        if (MTypeFromDType(dtype) == HOST_MEMORY) {
+          opts->output_devices.push_back(cpu_device.name());
+        } else {
+          opts->output_devices.push_back(opts->target);
         }
       }
     }
     return Status::OK();
   }
 
-  // Partitions `graph` and populates `subgraphs` with the partitions.
-  Status PartitionHelper(
-      const DeviceSet& device_set, std::unique_ptr<Graph> graph,
-      std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
-    PartitionOptions partition_options;
-    partition_options.node_to_loc = [](const Node* node) {
-      // TODO(akshayka): To better support the distributed case, first split
-      // the graph by worker (e.g,. using the master session's
-      // `SplitByWorker` policy), and then recursively partition the
-      // per-worker shards at the remote worker(s).
-      return node->assigned_device_name();
-    };
-    int64 edge_name_counter = 0;
-    partition_options.new_name = [&edge_name_counter](const string& prefix) {
-      return strings::StrCat(prefix, "/_", ++edge_name_counter);
-    };
-    partition_options.get_incarnation =
-        [&device_set](const string& name) -> int64 {
-      const Device* d = device_set.FindDeviceByName(name);
-      if (d == nullptr) {
-        return PartitionOptions::kIllegalIncarnation;
-      } else {
-        return d->attributes().incarnation();
-      }
-    };
-    partition_options.control_flow_added = false;
-    std::unordered_map<string, GraphDef> partitions;
-    TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
-
-    VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
-            << partitions.size() << " shards.";
-
-    for (const auto& partition : partitions) {
-      std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
-      FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-      TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
-      GraphConstructorOptions opts;
-      opts.allow_internal_ops = true;
-      opts.expect_device_spec = true;
-      const string& device = partition.first;
-      const GraphDef& graph_def = partition.second;
-      TF_RETURN_IF_ERROR(
-          ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
-      subgraphs->emplace(device, std::move(subgraph));
+  Status Instantiate(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                     std::vector<Tensor>* inputs,
+                     FunctionLibraryRuntime::Handle* handle) {
+    // We are going to execute the graph via function library runtime, and
+    // because function execution semantics is slightly different from the
+    // regular tensorlow graph, we need to make sure that Grappler respects it
+    // when doing it's optimization passes (e.g. do not prune stateful and
+    // dataset ops).
+    grappler::GrapplerItem::OptimizationOptions optimization_options;
+    optimization_options.is_function_instantiation = true;
+
+    // Keras graphs expected to be executed with regular graph execution
+    // semantics (it's allowed to prune stateful and dataset ops).
+    if (absl::StrContains(func_.name(), "keras_graph")) {
+      optimization_options.is_function_instantiation = false;
     }
 
-    return Status::OK();
-  }
+    // Wrapped function expects execution semantics to be the same as
+    // `session.run`, so we should prune unreachable stateful and dataset ops.
+    if (absl::StrContains(func_.name(), "wrapped_function")) {
+      optimization_options.is_function_instantiation = false;
+    }
 
-  // Each subgraph produced by partitioning the function body contains a subset
-  // of the original `Arg` and `Retval` nodes. This function performs
-  // bookkeeping to track which `Arg` and `Retval` nodes were placed on a
-  // particular device / subgraph.
-  //
-  // More specifically, this function
-  //  (1) rewrites the indices of the `Arg` and `Retval` nodes placed on a
-  //      particular device,
-  //  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
-  //      device, and
-  //  (3) records which `Arg` and `Retval` nodes live in host memory.
-  Status UpdateArgAndRetMetadata(const string& device, Graph* subgraph) {
-    ArgAndRetIndices indices;
-    std::vector<int>* arg_indices = &indices.first;
-    std::vector<int>* ret_indices = &indices.second;
-    std::vector<std::pair<Node*, int>> arg_nodes;
-    std::vector<std::pair<Node*, int>> ret_nodes;
-    const AttrValue* attr_value;
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.target = lib->device()->name();
+    opts.is_multi_device_function = true;
+    opts.optimize_graph_fn = std::bind(
+        grappler::OptimizeGraph, std::placeholders::_1, std::placeholders::_2,
+        std::placeholders::_3, std::placeholders::_4, config_proto_,
+        optimization_options, std::placeholders::_5);
+    opts.graph_collector = ctx->graph_collector();
+    opts.executor_type = executor_type_;
 
-    // Find the Arg and Retval nodes, along with their corresponding indices
-    // in the original function.
-    for (Node* node : subgraph->op_nodes()) {
-      string node_type = node->type_string();
-      if (node_type == FunctionLibraryDefinition::kArgOp) {
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        arg_indices->push_back(index);
-        arg_nodes.push_back(std::make_pair(node, index));
-      } else if (node_type == FunctionLibraryDefinition::kRetOp) {
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        ret_indices->push_back(index);
-        ret_nodes.push_back(std::make_pair(node, index));
+    OpInputList args;
+    TF_RETURN_IF_ERROR(ctx->input_list("args", &args));
+    Device* cpu_device;
+    TF_RETURN_IF_ERROR(lib->device_mgr()->LookupDevice("CPU:0", &cpu_device));
+
+    inputs->reserve(args.size());
+    for (const Tensor& tensor : args) {
+      inputs->push_back(tensor);
+      DataType dtype = tensor.dtype();
+      if (dtype == DT_RESOURCE) {
+        const ResourceHandle& handle = tensor.flat<ResourceHandle>()(0);
+        opts.input_devices.push_back(handle.device());
+      } else if (MTypeFromDType(dtype) == HOST_MEMORY) {
+        opts.input_devices.push_back(cpu_device->name());
+      } else {
+        opts.input_devices.push_back(opts.target);
       }
     }
 
-    for (int i = 0; i < arg_nodes.size(); ++i) {
-      Node* arg = arg_nodes[i].first;
-      arg->AddAttr("index", i);
-      TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
-      AllocatorAttributes alloc_attr;
-      DataType type = attr_value->type();
-      if (MTypeFromDType(type) == HOST_MEMORY) {
-        alloc_attr.set_on_host(true);
-      }
-      arg_and_ret_alloc_attrs_[device].first.push_back(alloc_attr);
-    }
-    for (int i = 0; i < ret_nodes.size(); ++i) {
-      Node* ret = ret_nodes[i].first;
-      ret->AddAttr("index", i);
-      TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
-      AllocatorAttributes alloc_attr;
-      DataType type = attr_value->type();
-      if (MTypeFromDType(type) == HOST_MEMORY) {
-        alloc_attr.set_on_host(true);
-      }
-      arg_and_ret_alloc_attrs_[device].second.push_back(alloc_attr);
-    }
+    TF_RETURN_IF_ERROR(
+        FillOutputDevices(*lib, *cpu_device, AttrSlice(&func_.attr()), &opts));
 
-    // If this kernel execution corresponds to a StatefulPartitionedCallOp,
-    // `arg_and_ret_indices_` might have been populated by a previous
-    // invocation.
-    if (arg_and_ret_indices_.find(device) == arg_and_ret_indices_.end()) {
-      arg_and_ret_indices_.emplace(device, indices);
-    }
+    TF_RETURN_IF_ERROR(
+        lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts, handle));
     return Status::OK();
   }
 
-  std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
-                                        const OpInputList& arguments) {
-    std::vector<Tensor> args;
-    args.reserve(indices.size());
-    for (int i : indices) {
-      args.push_back(arguments[i]);
-    }
-    return args;
-  }
-
-  void ExecuteFunctions(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
-                        const OpInputList& op_args, DoneCallback done)
-      LOCKS_EXCLUDED(mu_) {
-    const gtl::FlatMap<string, FHandle>* handles;
-    {
-      mutex_lock l(mu_);
-      handles = function_handles_[lib].get();
-    }
-    if (handles->empty()) {
-      // Trivial case where the function body is empty.
-      ctx->SetStatus(Status::OK());
-      done();
-      return;
-    }
-
-    const string& local_device_name = lib->device()->name();
-    FunctionLibraryRuntime::Options opts;
-    opts.step_id = ctx->step_id();
-    opts.step_container = ctx->step_container();
-    opts.cancellation_manager = ctx->cancellation_manager();
-    opts.stats_collector = ctx->stats_collector();
-    // TODO(akshayka): Consider selecting a runner on a per-device basis, i.e.,
-    // using device-specific threadpools when available.
-    opts.runner = ctx->runner();
-    opts.source_device = local_device_name;
-    opts.allow_dead_tensors = true;
+  void RunFunction(FunctionLibraryRuntime::Handle handle,
+                   const std::vector<Tensor>& inputs,
+                   FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                   DoneCallback done) {
+    FunctionLibraryRuntime::Options run_opts;
+    run_opts.step_id = ctx->step_id();
+    run_opts.step_container = ctx->step_container();
+    run_opts.cancellation_manager = ctx->cancellation_manager();
+    run_opts.stats_collector = ctx->stats_collector();
+    run_opts.collective_executor = ctx->collective_executor();
+    // TODO(akshayka): Consider selecting a runner on a per-device basis,
+    // i.e., using device-specific threadpools when available.
+    run_opts.runner = ctx->runner();
+    run_opts.source_device = lib->device()->name();
+    run_opts.allow_dead_tensors = true;
     // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
     // constructed rendezvous to a rendezvous manager.
     Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
-    opts.rendezvous = rendez;
-
-    StatusCallback callback = std::bind(
-        [](Rendezvous* rendez, DoneCallback& done, const Status& status) {
-          rendez->Unref();
-          done();
-        },
-        rendez, std::move(done), std::placeholders::_1);
-    auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
-    for (int i = 0; i < handles->size(); ++i) {
-      refcounted_done->Ref();
-    }
-
-    for (const auto& pair : *handles) {
-      const string& target = pair.first;
-      FHandle handle = pair.second;
-      VLOG(3) << "Running function shard on device " << target;
-      ArgAndRetIndices indices = arg_and_ret_indices_[target];
-      ArgAndRetAllocAttrs alloc_attrs = arg_and_ret_alloc_attrs_[target];
-      const std::vector<int>& arg_indices = indices.first;
-      const std::vector<int>& ret_indices = indices.second;
-      opts.args_alloc_attrs = alloc_attrs.first;
-      opts.rets_alloc_attrs = alloc_attrs.second;
-      if (target == local_device_name) {
-        opts.remote_execution = false;
-        std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
-        std::vector<Tensor>* rets = new std::vector<Tensor>;
-        lib->Run(
-            opts, handle, args, rets,
-            [rets, ret_indices, refcounted_done, ctx](const Status& status) {
-              if (!status.ok()) {
-                VLOG(3) << "Local execution failed: " << status;
-                ctx->SetStatus(status);
-              } else {
-                for (int i = 0; i < rets->size(); ++i) {
-                  ctx->set_output(ret_indices[i], (*rets)[i]);
-                }
-              }
-              delete rets;
-              VLOG(3) << "Finished local execution.";
-              refcounted_done->Unref();
-            });
-      } else {
-        opts.remote_execution = true;
-        std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
-        std::vector<Tensor>* rets = new std::vector<Tensor>;
-        lib->Run(
-            opts, handle, args, rets,
-            [rets, ret_indices, refcounted_done, ctx](const Status& status) {
-              if (!status.ok()) {
-                VLOG(3) << "Remote execution failed: " << status;
-                ctx->SetStatus(status);
-              } else {
-                for (int i = 0; i < rets->size(); ++i) {
-                  ctx->set_output(ret_indices[i], (*rets)[i]);
-                }
-              }
-              delete rets;
-              VLOG(3) << "Finished remote execution.";
-              refcounted_done->Unref();
-            });
-      }
-    }
-    refcounted_done->Unref();
-  }
-
-  string UniquifyFunctionName(const FunctionLibraryDefinition* function_library,
-                              const string& name) {
-    for (;; ++suffix_) {
-      const string candidate = strings::StrCat(name, "_", suffix_);
-      if (function_library->Find(candidate) == nullptr) {
-        return candidate;
-      }
-    }
-  }
-
-  Status OptimizeGraph(OpKernelContext* ctx,
-                       const gtl::InlinedVector<Node*, 4>& ret_nodes,
-                       FunctionLibraryDefinition* flib,
-                       const DeviceSet& device_set, Device* cpu_device,
-                       std::unique_ptr<Graph>* graph) {
-    if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto_)) {
-      return Status::OK();
-    }
-
-    tensorflow::grappler::GrapplerItem item;
-
-    // Add all available devices so that inlined function can be placed.
-    for (const Device* d : device_set.devices()) {
-      Status added_device = item.AddDevice(d->name());
-      if (!added_device.ok()) VLOG(3) << added_device.error_message();
-    }
-
-    // Add fetches so that the graph can be pruned.
-    for (Node* node : ret_nodes) {
-      item.fetch.push_back(node->name());
-    }
-
-    (*graph)->ToGraphDef(&item.graph);
-
-    if (flib) {
-      *item.graph.mutable_library() = flib->ToProto();
-    }
-
-    tensorflow::GraphDef out_graph;
-
-    tensorflow::grappler::VirtualCluster cluster(&device_set);
-
-    // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
-    // proto (which also contain the OptimizerOptions).
-    TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-        item, config_proto_, cpu_device, &cluster, &out_graph));
-
-    std::unique_ptr<Graph> optimized_graph(new Graph(OpRegistry::Global()));
-    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(
-        GraphConstructorOptions(), out_graph, optimized_graph.get()));
-
-    // Copy optimized functions back to the overlay lib.
-    if (flib) {
-      for (const FunctionDef& fdef : out_graph.library().function()) {
-        const string& func_name = fdef.signature().name();
-        if (flib->Contains(func_name)) {
-          TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
-        } else {
-          TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
-        }
-      }
-    }
-
-    *graph = std::move(optimized_graph);
-
-    // The graph conversion sets the requested device names but not the
-    // assigned device names. However, since at this point the graph is
-    // placed TF expects an assigned device name for every node. Therefore
-    // we copy the requested device into the assigned device field.
-    for (Node* node : graph->get()->nodes()) {
-      node->set_assigned_device_name(node->requested_device());
-    }
-
-    return Status::OK();
+    run_opts.rendezvous = rendez;
+
+    std::vector<Tensor>* rets = new std::vector<Tensor>;
+    const string& func_name = func_.name();
+    lib->Run(run_opts, handle, inputs, rets,
+             [rets, rendez, done, ctx, func_name](const Status& status) {
+               if (!status.ok()) {
+                 const string function_and_msg =
+                     strings::StrCat(errors::FormatFunctionForError(func_name),
+                                     " ", status.error_message());
+                 ctx->SetStatus(Status(status.code(), function_and_msg));
+               } else {
+                 for (int i = 0; i < rets->size(); ++i) {
+                   ctx->set_output(i, (*rets)[i]);
+                 }
+               }
+               delete rets;
+               rendez->Unref();
+               done();
+             });
   }
 
   NameAttrList func_;
   ConfigProto config_proto_;
   string executor_type_;
-  // Contains maps from device names to handles of function partitions, keyed by
-  // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
-  // for a stateful op, different invocations of it may use different
-  // FLRs. Different device placements of PartitionedCallOp also use different
-  // FLRs, and we use this to set the "default" device for the function to
-  // PartitionedCallOp's device.)
-  gtl::FlatMap<FunctionLibraryRuntime*,
-               std::unique_ptr<gtl::FlatMap<string, FHandle>>>
-      function_handles_ GUARDED_BY(mu_);
-  // Function partitions are added to overlay libraries.
-  gtl::FlatMap<FunctionLibraryRuntime*,
-               std::unique_ptr<FunctionLibraryDefinition>>
-      overlay_libs_ GUARDED_BY(mu_);
-  // Map from device name to the indices of the arguments and return values
-  // placed on that device. Read-only after the first invocation.
-  gtl::FlatMap<string, ArgAndRetIndices> arg_and_ret_indices_;
-  // Map from device name to alloc attrs for arguments and return values of the
-  // function placed on that device. Read-only after the first invocation.
-  gtl::FlatMap<string, ArgAndRetAllocAttrs> arg_and_ret_alloc_attrs_;
-
   mutex mu_;
-
-  // Used to uniquify function names in `overlay_libs_`.
-  uint32 suffix_ = 0;
+  // Cache the handle per FLR because this kernel may be instantiated for
+  // a stateful op, different invocations of it may use different FLRs.
+  // Different device placements of PartitionedCallOp also use
+  // different FLRs.
+  gtl::FlatMap<FunctionLibraryRuntime*, FunctionLibraryRuntime::Handle> handles_
+      GUARDED_BY(mu_);
 };
+
 REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
                         PartitionedCallOp);
 REGISTER_KERNEL_BUILDER(Name("StatefulPartitionedCall").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index e583f7feb4df9605115cd16aec54d1f3e9bb8b9c..69122f467c8fcf3818ab69f3f96d00b9a6b3c245 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -28,6 +29,20 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+template <typename T>
+struct RawType {
+  using type = T;
+};
+
+template <>
+struct RawType<qint8> {
+  using type = int8;
+};
+
+}  // namespace
+
 PoolParameters::PoolParameters(OpKernelContext* context,
                                const std::vector<int32>& ksize,
                                const std::vector<int32>& stride,
@@ -156,7 +171,10 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
     return;
   }
 
-  /// For now, cudnn does not support NHWC format, so we need to convert it
+  int batch_size = params.tensor_in_batch;
+  int depth = params.depth;
+#if CUDNN_VERSION < 7300
+  /// Earlier versions do not support NHWC format, so we need to convert it
   /// to NCHW before calling cudnn. We need to get rid of this once it is done
   Tensor transformed_input;
   if (data_format == FORMAT_NHWC) {
@@ -181,7 +199,31 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
   } else {
     transformed_output = *tensor_out;
   }
-
+  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
+#else
+  auto& transformed_input = tensor_in;
+  auto& transformed_output = *tensor_out;
+  se::dnn::DataLayout data_layout;
+  switch (data_format) {
+    case FORMAT_NHWC:
+      data_layout = se::dnn::DataLayout::kBatchYXDepth;
+      break;
+    case FORMAT_NCHW:
+      data_layout = se::dnn::DataLayout::kBatchDepthYX;
+      break;
+    case FORMAT_NCHW_VECT_C:
+      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
+      // emulated via NHWC.
+      data_layout = se::dnn::DataLayout::kBatchYXDepth;
+      batch_size *= depth;
+      depth = 4;
+      break;
+    default:
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unsupported format: ",
+                                          ToString(data_format)));
+  }
+#endif
   /// Get ready to call cudnn
   se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
@@ -194,23 +236,27 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
       .set_propagate_nans(propagate_nans);
 
   se::dnn::BatchDescriptor input_desc;
-  input_desc.set_count(params.tensor_in_batch)
+  input_desc.set_count(batch_size)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
-      .set_feature_map_count(params.depth)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_feature_map_count(depth)
+      .set_layout(data_layout);
 
   se::dnn::BatchDescriptor output_desc;
-  output_desc.set_count(params.tensor_in_batch)
+  output_desc.set_count(batch_size)
       .set_height(params.out_height)
       .set_width(params.out_width)
-      .set_feature_map_count(params.depth)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_feature_map_count(depth)
+      .set_layout(data_layout);
+
+  auto input_data =
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         transformed_input.template flat<T>().data()),
+                     transformed_input.template flat<T>().size());
 
-  auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
-                                   transformed_input.template flat<T>().size());
   auto output_data =
-      AsDeviceMemory(transformed_output.template flat<T>().data(),
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         transformed_output.template flat<T>().data()),
                      transformed_output.template flat<T>().size());
 
   auto* stream = context->op_device_context()->stream();
@@ -222,15 +268,17 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                     .ok();
   OP_REQUIRES(context, status,
               errors::Internal("cudnn PoolForward launch failed"));
-
+#if CUDNN_VERSION < 7300
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC
     auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
-    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+    using RT = typename RawType<T>::type;
+    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
         context->eigen_device<Device>(),
-        toConstTensor(transformed_output).template tensor<T, 4>(),
-        tensor_out->tensor<T, 4>());
+        toConstTensor(transformed_output).template tensor<RT, 4>(),
+        tensor_out->tensor<RT, 4>());
   }
+#endif
 }
 
 template <typename T>
@@ -388,6 +436,11 @@ void DnnPoolingGradOp<T>::Compute(
   template class DnnPoolingOp<T>; \
   template class DnnPoolingGradOp<T>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)
+
+#if CUDNN_VERSION >= 7300
+template class DnnPoolingOp<qint8>;
+#endif
+
 #undef DEFINE_DNN_OPS
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h
index 8e69b5b699065a8722f4e19acaf8b57a7e0b64ed..a719c518c3e9206020602e315d0b0e3be474bfd0 100644
--- a/tensorflow/core/kernels/priority_queue.h
+++ b/tensorflow/core/kernels/priority_queue.h
@@ -68,7 +68,7 @@ class PriorityQueue
   Status MatchesPriorityNodeDefTypes(const NodeDef& node_def) const;
   Status MatchesPriorityNodeDefShapes(const NodeDef& node_def) const;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 31e8ce944fef913fd241801f4931fcb4dfd2025c..02b9b022fdcb00b3d9f4f676be579abced5e720e 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -59,7 +59,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<PersistentTensor> > {
                       CallbackWithTuple callback) override;
   Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index e9cf36c62b966f5f91cf7764421f0c1ff6c131fc..ffa41ece49640a7de5c71e7d3b87f42522021d38 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -539,7 +539,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
 
     OP_REQUIRES(
         ctx, success == 0,
-        errors::Internal("CUB reduce error", cudaGetErrorString(success)));
+        errors::Internal("CUB reduce error ", cudaGetErrorString(success)));
   };
 
   reduce(nullptr);  // Get required amount of temp storage.
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 4167b6005194409d780b3698fda688728a50b3cc..8e3c52ba5b5c96846f013f9ef5e465872dc3adf8 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -56,12 +56,12 @@ limitations under the License.
 
 #include "absl/strings/str_join.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/kernels/resource_variable_ops.h"
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 1c4d0bc1ae9934dbfb8718dfa05202b1d7b38edc..aa2434da03f5fd76ad409121382e6ce93a2e65df 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -19,13 +19,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/reverse_op.h"
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index efa30438d922fa070747bb4269451cc54f574887..494a846ff562e505a569de19418d371ea8b4f80c 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
index c0fde8042e816c325475a36129fb71630f0ca7c6..0e68af867bdf753ec70ff9ff2c978d0b95ea5c52 100644
--- a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
+++ b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
@@ -14,11 +14,11 @@ limitations under the License.
 ==============================================================================*/
 // See docs in ../ops/image_ops.cc.
 #include <math.h>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
 
diff --git a/tensorflow/core/kernels/sampling_kernels.h b/tensorflow/core/kernels/sampling_kernels.h
index 4e79d8983194c34cb5b84530df665ccdbc191cdd..a03a2c88db44c350c2fc2bc71ed7cd7db29f5ac6 100644
--- a/tensorflow/core/kernels/sampling_kernels.h
+++ b/tensorflow/core/kernels/sampling_kernels.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_SAMPLING_KERNELS_H_
 
 #include <cmath>
+
 #include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace tensorflow {
@@ -68,14 +69,15 @@ struct LanczosKernelFunc {
   // Pass 1 for Lanczos1 kernel, 3 for Lanczos3 etc.
   explicit LanczosKernelFunc(float _radius) : radius(_radius) {}
   float operator()(float x) const {
+    constexpr float kPI = 3.14159265359;
     x = std::abs(x);
     if (x > radius) return 0.0;
     // Need to special case the limit case of sin(x) / x when x is zero.
     if (x <= 1e-3) {
       return 1.0;
     }
-    return radius * std::sin(M_PI * x) * std::sin(M_PI * x / radius) /
-           (M_PI * M_PI * x * x);
+    return radius * std::sin(kPI * x) * std::sin(kPI * x / radius) /
+           (kPI * kPI * x * x);
   }
   float Radius() const { return radius; }
   const float radius;
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 82546d581a9ea55d7fe0a478c4de0c9afe2ff8ed..8580891fc066828abb1c2cef6d66f71c48090f05 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index 180eb3ca34b4c1fe96bf7088319455185bd06a2c..ed1195c05353389e9c4c465d402d46220a01fad4 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -18,11 +18,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/save_restore_tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/core/kernels/scale_and_translate_op.cc b/tensorflow/core/kernels/scale_and_translate_op.cc
index 149c5526ae8952a5dab69dd11c0386d0bb38835f..34fef536df4d9bb9f80bb749b4071b8f5956c997 100644
--- a/tensorflow/core/kernels/scale_and_translate_op.cc
+++ b/tensorflow/core/kernels/scale_and_translate_op.cc
@@ -20,13 +20,13 @@ limitations under the License.
 
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/sampling_kernels.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc
index 0a6848361a05559e8d1e23318ca66a9dd3ad9a95..ea42fdefb4124b0fb638adea1f91d77f95d456fd 100644
--- a/tensorflow/core/kernels/scan_ops.cc
+++ b/tensorflow/core/kernels/scan_ops.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.h
similarity index 97%
rename from tensorflow/core/kernels/scan_ops_gpu.cu.cc
rename to tensorflow/core/kernels/scan_ops_gpu.h
index ed66c02dc584541ce4d5eb644630b678c1b05916..976b2215405105ece0a5d25c2684aa558b01d8a0 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -290,17 +293,8 @@ struct Scan<GPUDevice, Eigen::internal::ProdReducer<T>, T> {
 };
 
 }  // namespace functor
-
-#define DEFINE(REDUCER, T) template struct functor::Scan<GPUDevice, REDUCER, T>;
-
-#define DEFINE_FOR_ALL_REDUCERS(T)           \
-  DEFINE(Eigen::internal::SumReducer<T>, T); \
-  DEFINE(Eigen::internal::ProdReducer<T>, T);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_FOR_ALL_REDUCERS);
-#undef DEFINE_FOR_ALL_REDUCERS
-#undef DEFINE
-
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
diff --git a/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..adce37e473c4f3f31b29db5b71c4d004da1b6b29
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<GpuDevice, Eigen::internal::SumReducer<double>,
+                              double>;
+template struct functor::Scan<GpuDevice, Eigen::internal::ProdReducer<double>,
+                              double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b72415822d0eebecf8426008266c5bd503b8830c
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<GpuDevice, Eigen::internal::SumReducer<float>,
+                              float>;
+template struct functor::Scan<GpuDevice, Eigen::internal::ProdReducer<float>,
+                              float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9fb528be98efc722df3f8b76adc65ae7fa29cdb
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<
+    GpuDevice, Eigen::internal::SumReducer<Eigen::half>, Eigen::half>;
+template struct functor::Scan<
+    GpuDevice, Eigen::internal::ProdReducer<Eigen::half>, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 2d43bde23feadc33c7081fccd8ad2e44dfe3c2d5..755f8f8dc55ec7dfdf6c56f1ca86e14ec3e3e352 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index b466e572495ae709d0fb05d58d964ee358077558..9c51d4e3a7d9e93f34a4c5957f9acec55ea14937 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -22,11 +22,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
@@ -49,6 +49,19 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
+// Returns true if the three tensors have valid number of elements
+// If shape_input has 0 elements, then we need to have indices and updates with
+// exactly 0 elements too, otherwise we should error. If indices has 0 elements
+// then updates should also have 0 elements, otherwise we should error.
+bool ValidEmptyOutputShape(int64 num_inputs, int64 num_indices,
+                           int64 num_updates) {
+  if (num_indices == 0 && num_updates == 0) {
+    return true;  // regardless of num_inputs ?= 0, covers both cases
+  }
+  // now we want all 3 tensors to have values
+  return (num_inputs != 0 && num_indices != 0 && num_updates != 0);
+}
+
 template <typename Device, typename T, typename Index>
 class ScatterNdOp : public OpKernel {
  public:
@@ -77,12 +90,12 @@ class ScatterNdOp : public OpKernel {
     OP_REQUIRES_OK(c,
                    TensorShapeUtils::MakeShape(vec.data(), vec.size(), &shape));
 
-    OP_REQUIRES(
-        c,
-        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
-                                      updates.shape().num_elements() == 0)),
-        errors::InvalidArgument(
-            "Indices and updates specified for empty output shape"));
+    OP_REQUIRES(c,
+                ValidEmptyOutputShape(shape_input.NumElements(),
+                                      indices.shape().num_elements(),
+                                      updates.shape().num_elements()),
+                errors::InvalidArgument(
+                    "Indices and updates specified for empty output shape"));
 
     const int64 outer_dims = indices.shape().dims() - 1;
 
@@ -148,12 +161,12 @@ class TensorScatterOp : public OpKernel {
 
     TensorShape shape = input.shape();
 
-    OP_REQUIRES(
-        c,
-        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
-                                      updates.shape().num_elements() == 0)),
-        errors::InvalidArgument(
-            "Indices and updates specified for empty output shape"));
+    OP_REQUIRES(c,
+                ValidEmptyOutputShape(shape.num_elements(),
+                                      indices.shape().num_elements(),
+                                      updates.shape().num_elements()),
+                errors::InvalidArgument(
+                    "Indices and updates specified for empty output shape"));
 
     const int64 outer_dims = indices.shape().dims() - 1;
 
@@ -184,7 +197,7 @@ class TensorScatterOp : public OpKernel {
     }
 
     std::unique_ptr<Tensor> forwarded_input = c->forward_input(
-        2, 0, input.dtype(), shape, DEVICE_MEMORY, AllocatorAttributes());
+        0, 0, input.dtype(), shape, DEVICE_MEMORY, AllocatorAttributes());
 
     if (forwarded_input == nullptr) {
       // We were not able to forward the input, so we deep copy the tensor and
@@ -202,6 +215,8 @@ class TensorScatterOp : public OpKernel {
       OP_REQUIRES_OK(c, functor::DoScatterNd<Device, T, Index, op>(
                             c, indices, updates, shape, forwarded_input.get(),
                             false /*allocate*/));
+
+      c->set_output(0, *forwarded_input);
     }
   }
 };
@@ -338,7 +353,9 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdSub",            \
                                     scatter_nd_op::UpdateOp::SUB);        \
   REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
-      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);
+      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);   \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
+      type, dev, "ResourceScatterNdSub", scatter_nd_op::UpdateOp::SUB);
 
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
@@ -546,8 +563,9 @@ Status PrepareAndValidateInputs(const TensorShape& params_shape,
                                    "got shape: ", params_shape.DebugString());
   }
 
-  if (!(params_shape.num_elements() > 0 ||
-        (indices.NumElements() == 0 && updates.NumElements() == 0))) {
+  if (!ValidEmptyOutputShape(params_shape.num_elements(),
+                             indices_shape.num_elements(),
+                             updates_shape.num_elements())) {
     return errors::InvalidArgument(
         "Indices and updates specified for empty output.  indices shape: ",
         indices.shape().DebugString());
diff --git a/tensorflow/core/kernels/scatter_nd_op.h b/tensorflow/core/kernels/scatter_nd_op.h
index 8d04731aae6329dbfd2539ec441a2d1b140f6cd3..eec70ba69e5101068dfdcfde5152ab9ea2088efe 100644
--- a/tensorflow/core/kernels/scatter_nd_op.h
+++ b/tensorflow/core/kernels/scatter_nd_op.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index 472f5a3547aaaf0237a6d3ce51a141519c4d11a4..01e4656eab8b2b067f870253ba9f3223835a461f 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -24,11 +24,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index a8e9b3261cd29191955509f34028660dff862bd7..2bb2c0d91e94b9462af330e806745cfb8317767a 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -26,6 +26,10 @@ limitations under the License.
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace sdca {
 
diff --git a/tensorflow/core/kernels/searchsorted_op.cc b/tensorflow/core/kernels/searchsorted_op.cc
index dc627ac77a51d6da994309687c5694d261908524..06b2d818374fd6a102ec3966e57e3619b4d18289 100644
--- a/tensorflow/core/kernels/searchsorted_op.cc
+++ b/tensorflow/core/kernels/searchsorted_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/searchsorted_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2328fc6afd8e7b7c24351e612ea6b760a2d522c3..6e1a0d57a169b51e184330c984a5c75d332490da 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -22,15 +22,17 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
 #include <vector>
+
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/util.h"
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index 9db0bd4d98bdb9964cb561d96d91782ba3615a7f..21c3b89f548e30cff345a072ca2e11dfe15081b5 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -143,11 +143,12 @@ class LinSpaceOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, TensorShape({num}), &out));
     auto flat = out->flat<T>();
-    if (num == 1) {
-      flat(0) = start;
-    } else {
+    flat(0) = start;
+    if (num > 1) {
       const T step = (stop - start) / (num - 1);
-      for (Tnum i = 0; i < num; ++i) flat(i) = start + step * i;
+      for (Tnum i = 1; i < num - 1; ++i) flat(i) = start + step * i;
+      // Ensure final value == stop; float arithmetic won't guarantee this.
+      flat(num - 1) = stop;
     }
   }
 };
diff --git a/tensorflow/core/kernels/sequence_ops_test.cc b/tensorflow/core/kernels/sequence_ops_test.cc
index 5f0e0a69a890aafa56b43cc55e99f490c100faa7..2247c447500693942ebaeda33eb5cd2baf7d226a 100644
--- a/tensorflow/core/kernels/sequence_ops_test.cc
+++ b/tensorflow/core/kernels/sequence_ops_test.cc
@@ -114,6 +114,27 @@ TEST_F(LinSpaceOpTest, Simple_D32) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(LinSpaceOpTest, Exact_Endpoints) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run. The particular values 0., 1., and 42 are chosen to test that
+  // the last value is not calculated via an intermediate delta as (1./41)*41,
+  // because for IEEE 32-bit floats that returns 0.99999994 != 1.0.
+  AddInputFromArray<float>(TensorShape({}), {0.0});
+  AddInputFromArray<float>(TensorShape({}), {1.0});
+  AddInputFromArray<int32>(TensorShape({}), {42});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output
+  Tensor output = *GetOutput(0);
+  float expected_start = 0.0;
+  float start = output.flat<float>()(0);
+  EXPECT_EQ(expected_start, start) << expected_start << " vs. " << start;
+  float expected_stop = 1.0;
+  float stop = output.flat<float>()(output.NumElements() - 1);
+  EXPECT_EQ(expected_stop, stop) << expected_stop << " vs. " << stop;
+}
+
 TEST_F(LinSpaceOpTest, Single_D64) {
   MakeOp(DT_FLOAT, DT_INT64);
 
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
deleted file mode 100644
index 30cb1e0a7f80f084854073ee061500bbcf0ccade..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <functional>
-#include <memory>
-
-#include "tensorflow/cc/client/client_session.h"
-#include "tensorflow/cc/ops/array_ops.h"
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/ops_testutil.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/abi.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace {
-
-class ShapeOpTest : public OpsTestBase {};
-
-struct NoKnownShape {
-  string TypeName() const { return "NO KNOWN SHAPE"; }
-};
-
-REGISTER_UNARY_VARIANT_DECODE_FUNCTION(NoKnownShape, "NO KNOWN SHAPE");
-
-struct KnownVecSize {
-  KnownVecSize() : shape_value(0) {}
-  explicit KnownVecSize(int value) : shape_value(value) {}
-  string TypeName() const { return "KNOWN VECTOR SIZE TYPE"; }
-  bool Decode(const VariantTensorData& d) {
-    return d.get_metadata(&shape_value);
-  }
-  void Encode(VariantTensorData* d) const { d->set_metadata(shape_value); }
-  int shape_value;
-};
-
-Status GetShapeFromKnownVecSize(const KnownVecSize& ks, TensorShape* s) {
-  *s = TensorShape({ks.shape_value});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, GetShapeFromKnownVecSize);
-
-static void ExpectHasError(const Status& s, StringPiece substr) {
-  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
-      << ">>" << s << "<<, expected substring >>" << substr << "<<";
-}
-
-TEST_F(ShapeOpTest, Simple) {
-  // Ensure the ops run on CPU, as we have no device copy registration
-  // for NoKnownShape and KnownVecSize objects.
-  Scope root = Scope::NewRootScope().WithDevice("/cpu:0");
-
-  // Use a placeholder so the graph optimizer doesn't optimize away
-  // the shape function.
-  auto input = ops::Placeholder(root, DT_VARIANT);
-  auto shape_output = ops::Shape(root, input);
-  auto rank_output = ops::Rank(root, input);
-  auto size_output = ops::Size(root, input);
-
-  TF_ASSERT_OK(root.status());
-
-  ClientSession session(root);
-
-  std::vector<Tensor> outputs;
-
-  {
-    // Test no shape registered.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({}));
-    Variant& v = variant_tensor.scalar<Variant>()();
-    v = NoKnownShape();
-    Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs);
-    EXPECT_FALSE(s.ok());
-    ExpectHasError(
-        s, strings::StrCat(
-               "No unary variant shape function found for Variant type_index: ",
-               port::MaybeAbiDemangle(MakeTypeIndex<NoKnownShape>().name())));
-  }
-
-  {
-    // Test non-scalar variant.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({1}));
-    Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs);
-    EXPECT_FALSE(s.ok());
-    ExpectHasError(s, "Shape of non-unary Variant not supported.");
-  }
-
-  {
-    // Test registered variant.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({}));
-    const int vec_dim_value = -0xdeadbeef;  // must be non-negative.
-    Variant& v = variant_tensor.scalar<Variant>()();
-    v = KnownVecSize(vec_dim_value);
-    TF_EXPECT_OK(session.Run({{input, variant_tensor}},
-                             {shape_output, rank_output, size_output},
-                             &outputs));
-    EXPECT_EQ(outputs[0].dims(), 1);  // shape
-    EXPECT_EQ(vec_dim_value, outputs[0].vec<int32>()(0));
-    EXPECT_EQ(outputs[1].dims(), 0);  // rank
-    EXPECT_EQ(1, outputs[1].scalar<int32>()());
-    EXPECT_EQ(outputs[2].dims(), 0);  // size
-    EXPECT_EQ(vec_dim_value, outputs[0].scalar<int32>()());
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index ab1ce0f9c83025e472c114225265ce9430be93a3..db7357ca70e8050ff5d0d858989f27673af5f49d 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -469,8 +469,7 @@ class EnsureShapeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
 
     if (!expected_shape_.IsCompatibleWith(shape)) {
       ctx->SetStatus(errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 7a50f158af02e698681ef513c2baa2be1e22267f..03b32b88d9b7f4441439fb382bc5f8c47643ae43 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -20,27 +20,18 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
 namespace shape_op_helpers {
-inline Status GetRegularOrVariantShape(OpKernelContext* ctx, int input_index,
-                                       TensorShape* shape) {
-  const Tensor& inp = ctx->input(input_index);
-  if (ctx->input_dtype(0) == DT_VARIANT) {
-    if (inp.dims() != 0) {
-      return errors::InvalidArgument(
-          "Shape of non-unary Variant not supported.");
-    }
-    TF_RETURN_IF_ERROR(GetUnaryVariantShape(inp, shape));
-  } else {
-    *shape = inp.shape();
-  }
+inline Status GetShape(OpKernelContext* ctx, int input_index,
+                       TensorShape* shape) {
+  *shape = ctx->input(input_index).shape();
   return Status::OK();
 }
 }  // namespace shape_op_helpers
@@ -52,8 +43,7 @@ class ShapeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int rank = shape.dims();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out));
@@ -81,8 +71,7 @@ class ShapeNOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       TensorShape shape;
-      OP_REQUIRES_OK(
-          ctx, shape_op_helpers::GetRegularOrVariantShape(ctx, i, &shape));
+      OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, i, &shape));
       const int dims = shape.dims();
       Tensor* out = nullptr;
       OP_REQUIRES_OK(ctx, ctx->allocate_output(i, {dims}, &out));
@@ -110,8 +99,7 @@ class RankOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int rank = shape.dims();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
@@ -128,8 +116,7 @@ class SizeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int64 size = shape.num_elements();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index c8bc1ad3bbb60e147dbb1d8fdf3c988b395ea19d..218698f3fff89166c0440195de25295dfe0028ab 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -57,7 +57,6 @@ struct SoftmaxEigenImpl {
     Eigen::DSizes<int, 2> one_by_class(1, num_classes);
 #else
     Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
-    Eigen::IndexList<Eigen::type2index<1> > depth_dim;
     Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
     batch_by_one.set(0, batch_size);
     Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index f46a84da1e951113382e4d44b44463c2a621ca10..459f20b0ae1cea1769277f4d367829d61e831ca1 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
index 30c57ef287f4c645b198da6ebf6b8554dde4fd12..0a97c6b6a5424c3c75c52add13bfa8021b665e17 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
index e261e42e0d3bf43efc3a1328f07b1362f0870dfd..ea95a882b1f0a7dec7581bd6d0335c4f454d87e1 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_tensors_map_ops.cc b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
index 74fa3a15f06fdb267dc9776ee8a0903f8f6626de..939638b37058bf8294ebc437c6c14dbb696a8aa8 100644
--- a/tensorflow/core/kernels/sparse_tensors_map_ops.cc
+++ b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
@@ -43,7 +43,7 @@ class SparseTensorsMap : public ResourceBase {
  public:
   explicit SparseTensorsMap(const string& name) : name_(name), counter_(0) {}
 
-  string DebugString() override { return "A SparseTensorsMap"; }
+  string DebugString() const override { return "A SparseTensorsMap"; }
 
   typedef struct {
     PersistentTensor indices;
diff --git a/tensorflow/core/kernels/sparse_utils.cc b/tensorflow/core/kernels/sparse_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..198862940d1841675f8d7a0b0ade7160d1dc0582
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sparse_utils.h"
+
+#include <cstddef>
+
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace sparse_utils {
+
+template <typename Tindices>
+Tindices FindNextDenseRowStartIndex(
+    const Tindices sparse_index_begin,
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat) {
+  // Search in the index range [begin, end) of indices_mat.
+  Tindices begin = sparse_index_begin;
+  Tindices end = indices_mat.dimension(0);
+  const Tindices orig_sparse_index_end = end;
+
+  // The first dense row we search.
+  const Tindices orig_dense_index_begin = indices_mat(begin, 0);
+  // Early exit if no next dense row index.
+  if (orig_dense_index_begin == static_cast<int64>(indices_mat(end - 1, 0))) {
+    return orig_sparse_index_end;
+  }
+
+  Tindices increment = 1;
+  while (begin + increment < end &&
+         indices_mat(begin + increment, 0) == orig_dense_index_begin) {
+    increment *= 2;
+  }
+  // Narrow the search space as an optimization.
+  if (begin + increment < end) {
+    end = begin + increment;
+  }
+  begin += increment / 2;
+
+  // Perform a binary search on the interval [begin, end) for
+  // dense_row_index_to_find.
+  const Tindices dense_row_index_to_find = orig_dense_index_begin;
+  while (begin < end) {
+    const Tindices m = begin + (end - begin) / 2;
+    const Tindices m_dense_row_index = static_cast<Tindices>(indices_mat(m, 0));
+    if (m_dense_row_index == dense_row_index_to_find &&
+        (m + 1 == orig_sparse_index_end ||
+         static_cast<Tindices>(indices_mat(m + 1, 0)) !=
+             dense_row_index_to_find)) {
+      return m + 1;
+    } else if (m_dense_row_index <= dense_row_index_to_find) {
+      begin = m + 1;
+    } else {
+      end = m;
+    }
+  }
+
+  // No next dense row index.
+  return orig_sparse_index_end;
+}
+
+template <typename Tindices>
+std::vector<Tindices> GetStartIndicesOfEachDenseRow(
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat,
+    bool* contains_empty_rows) {
+  int64 start_sparse_index_of_cur_dense_row = 0;
+  std::vector<Tindices> segment_indices;
+  const Tindices num_entries_in_sparse_tensor = indices_mat.dimension(0);
+  const Tindices num_dense_rows_in_sparse_tensor =
+      1 + indices_mat(num_entries_in_sparse_tensor - 1, 0) - indices_mat(0, 0);
+  // Reserve an extra slot for the 0 we store in the first entry by convention.
+  segment_indices.reserve(1 + num_dense_rows_in_sparse_tensor);
+  segment_indices.push_back(0);
+  *contains_empty_rows = false;
+  while (true) {
+    const Tindices start_sparse_index_of_next_dense_row =
+        FindNextDenseRowStartIndex<Tindices>(
+            start_sparse_index_of_cur_dense_row, indices_mat);
+    if (start_sparse_index_of_next_dense_row == num_entries_in_sparse_tensor) {
+      segment_indices.push_back(start_sparse_index_of_next_dense_row);
+      break;
+    }
+    // Encode the length of the current dense row as well as the lengths of all
+    // the empty rows until the next dense row,
+    for (Tindices i = 0;
+         i < indices_mat(start_sparse_index_of_next_dense_row, 0) -
+                 indices_mat(start_sparse_index_of_cur_dense_row, 0);
+         ++i) {
+      segment_indices.push_back(start_sparse_index_of_next_dense_row);
+    }
+    // If there is more than one row between the current and next non-empty
+    // rows then those rows are empty.
+    *contains_empty_rows |=
+        indices_mat(start_sparse_index_of_next_dense_row, 0) -
+            indices_mat(start_sparse_index_of_cur_dense_row, 0) >
+        1;
+    start_sparse_index_of_cur_dense_row = start_sparse_index_of_next_dense_row;
+  }
+  return segment_indices;
+}
+
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat) {
+  std::vector<Tindices> out;
+  auto vec = tensor.vec<Tindices>();
+  out.reserve(vec.size() + 1);
+  for (size_t i = 0; i < vec.dimension(0); ++i) {
+    out.push_back(vec(i));
+  }
+  out.push_back(num_nonzero_entries_in_sparse_mat);
+  return out;
+}
+
+template <typename Tindices>
+bool ContainsEmptyRows(const std::vector<Tindices>& row_start_indices) {
+  // Skip checking the lengths of the first and last dense rows since those are
+  // always non-empty.
+  for (size_t i = 2; i < row_start_indices.size() - 1; ++i) {
+    if (row_start_indices.at(i) - row_start_indices.at(i - 1) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+#define REGISTER_SPARSE_UTIL_FUNCTIONS(TypeIndex)                           \
+  template TypeIndex FindNextDenseRowStartIndex<TypeIndex>(                 \
+      const TypeIndex sparse_index_begin,                                   \
+      const TTypes<TypeIndex>::ConstMatrix& indices_mat);                   \
+  template std::vector<TypeIndex> GetStartIndicesOfEachDenseRow<TypeIndex>( \
+      const TTypes<TypeIndex>::ConstMatrix& indices_mat,                    \
+      bool* contains_empty_rows);                                           \
+  template bool ContainsEmptyRows<TypeIndex>(                               \
+      const std::vector<TypeIndex>& row_start_indices);                     \
+  template std::vector<TypeIndex> ParseRowStartIndices<TypeIndex>(          \
+      const tensorflow::Tensor& tensor,                                     \
+      const TypeIndex num_nonzero_entries_in_sparse_mat);
+
+REGISTER_SPARSE_UTIL_FUNCTIONS(int32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(int64);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint8);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint16);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint64);
+
+}  // namespace sparse_utils
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_utils.h b/tensorflow/core/kernels/sparse_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e3c41a49642ebe722b7aeb5adeb6f41cea858b3
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils.h
@@ -0,0 +1,71 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for writing OpKernels for sparse tensors.
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace sparse_utils {
+
+// Find the index i of the first element for which
+// indices_mat(sparse_index_begin, 0) < indices_mat(i, 0).
+// The search is conducted in the open interval
+// [sparse_index_begin, indices_mat.dimension(0)) and when no such i is found,
+// indices_mat.dimension(0) is returned.
+// indices_mat(k, 0) should be non-decreasing over the interval
+// [begin, indices_mat.dimension(0)).
+// Requires 0 <= sparse_index_begin < indices_mat.dimension(0).
+template <typename Tindices>
+Tindices FindNextDenseRowStartIndex(
+    const Tindices sparse_index_begin,
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat);
+
+// Returns the vector v of indices in indices_mat at which new dense matrix
+// rows begin.
+// v.front() = 0, v.back() = indices_mat.dimension(0), and for i > 0,
+// v[i] - v[i-1] is the length of the ith dense row in indices_mat.
+// *contains_empty_rows = true if and only if indices_mat contains empty rows
+// (rows without values) between its first and last row.
+template <typename Tindices>
+std::vector<Tindices> GetStartIndicesOfEachDenseRow(
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat,
+    bool* contains_empty_rows);
+
+// Converts tensor.vec<Tindices> to an std::vector<Tindices> object, appends
+// the value num_nonzero_entries_in_sparse_mat, and returns the result.
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat);
+
+// Returns true if and only if the sparse matrix indices_mat whose row start
+// indices are represented by row_start_indices has empty dense rows
+// (between its first and last dense rows).
+// This function satisfies the identity row_start_indices ==
+// GetStartIndicesOfEachDenseRow(indices_mat, &return_value).
+template <typename Tindices>
+bool ContainsEmptyRows(const std::vector<Tindices>& row_start_indices);
+
+}  // namespace sparse_utils
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
diff --git a/tensorflow/core/kernels/sparse_utils_test.cc b/tensorflow/core/kernels/sparse_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d0adff8860ded4c8b1f49b99ba6eb3a261782aa
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils_test.cc
@@ -0,0 +1,263 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sparse_utils.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::DataType;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::Tensor;
+using tensorflow::TTypes;
+using tensorflow::uint16;
+using tensorflow::uint32;
+using tensorflow::uint64;
+using tensorflow::sparse_utils::ContainsEmptyRows;
+using tensorflow::sparse_utils::FindNextDenseRowStartIndex;
+using tensorflow::sparse_utils::GetStartIndicesOfEachDenseRow;
+using tensorflow::sparse_utils::ParseRowStartIndices;
+
+TEST(SparseUtilsTest, GetStartIndicesOfEachDenseRow) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<int32>(indices_mat,
+                                                     &contains_empty_rows) ==
+                std::vector<int32>({0, 1, 2, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+  {
+    int32 data[] = {0, 0, 1, 0, 1, 0, 4, 0, 4, 0, 4, 0,  6, 0,  7,
+                    0, 7, 0, 7, 0, 7, 0, 8, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 15, 2);
+    // indices_list = {0, 1, 1, 4, 4, 4,  6, 7, 7, 7, 7, 8, 8, 10, 12};
+    bool contains_empty_rows;
+    EXPECT_TRUE(
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat,
+                                             &contains_empty_rows) ==
+        std::vector<int32>({0, 1, 3, 3, 3, 6, 6, 7, 11, 13, 13, 14, 14, 15}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+  {
+    int64 data[] = {3, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 1, 2);
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<int64>(indices_mat,
+                                                     &contains_empty_rows) ==
+                std::vector<int64>({0, 1}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint32 data[] = {3, 0, 3, 0};
+    TTypes<uint32>::ConstMatrix indices_mat(data, 2, 2);
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint32>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint32>({0, 2}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint16 data[] = {0, 0, 0, 0, 0, 0, 1, 0};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 4, 2);
+    // indices_list = {0, 0, 0, 1};
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint16>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint16>({0, 3, 4}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint64 data[] = {0, 0, 0, 0, 0, 0, 3, 0};
+    TTypes<uint64>::ConstMatrix indices_mat(data, 4, 2);
+    bool contains_empty_rows;
+    // indices_list = {0, 0, 0, 3};
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint64>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint64>({0, 3, 3, 3, 4}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+}
+
+TEST(SparseUtilsTest, ParseRowStartIndices) {
+  {
+    Tensor t(DataType::DT_INT32, {1});
+    int indx = 0;
+    for (const int32 v : {0}) {
+      t.flat<int32>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<int32>(t, 1) ==
+                std::vector<int32>({0, 1}));
+  }
+  {
+    Tensor t(DataType::DT_INT64, {1});
+    int indx = 0;
+    for (const int64 v : {0}) {
+      t.flat<int64>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<int64>(t, 2) ==
+                std::vector<int64>({0, 2}));
+  }
+  {
+    Tensor t(DataType::DT_UINT64, {2});
+    int indx = 0;
+    for (const uint64 v : {0, 3}) {
+      t.flat<uint64>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<uint64>(t, 4) ==
+                std::vector<uint64>({0, 3, 4}));
+  }
+  {
+    Tensor t(DataType::DT_UINT16, {2});
+    int indx = 0;
+    for (const uint16 v : {0, 3}) {
+      t.flat<uint16>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<uint16>(t, 4) ==
+                std::vector<uint16>({0, 3, 4}));
+  }
+}
+
+TEST(SparseUtilsTest, ContainsEmptyRows) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 8, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int32 data[] = {1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int32>::ConstMatrix indices_mat(data, 6, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint16 data[] = {1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 6, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint16>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int32 data[] = {0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int32>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint32 data[] = {0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint32>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint32>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 2, 2, 2, 3};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 2, 2, 2, 3};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint64 data[] = {0, 0, 0, 1, 0, 2, 1, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint64>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 1, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+}
+
+TEST(SparseUtilsTest, FindNextDenseRowStartIndex) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    for (int32 i = 0; i < 8; ++i) {
+      EXPECT_EQ(i + 1, FindNextDenseRowStartIndex<int32>(i, indices_mat));
+    }
+  }
+  {
+    uint16 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    for (uint16 i = 0; i < 8; ++i) {
+      EXPECT_EQ(i + 1, FindNextDenseRowStartIndex<uint16>(i, indices_mat));
+    }
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 1, 0, 4, 0, 4, 0, 4, 0,  6, 0,  7,
+                    0, 7, 0, 7, 0, 7, 0, 8, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 15, 2);
+    // indices_list = {0, 1, 1, 4, 4, 4,  6, 7, 7, 7, 7, 8, 8, 10, 12};
+    EXPECT_EQ(3, FindNextDenseRowStartIndex<int64>(static_cast<int64>(1),
+                                                   indices_mat));
+    EXPECT_EQ(3, FindNextDenseRowStartIndex<int64>(static_cast<int64>(2),
+                                                   indices_mat));
+    EXPECT_EQ(6, FindNextDenseRowStartIndex<int64>(static_cast<int64>(3),
+                                                   indices_mat));
+    EXPECT_EQ(6, FindNextDenseRowStartIndex<int64>(static_cast<int64>(4),
+                                                   indices_mat));
+    EXPECT_EQ(14, FindNextDenseRowStartIndex<int64>(static_cast<int64>(13),
+                                                    indices_mat));
+    EXPECT_EQ(15, FindNextDenseRowStartIndex<int64>(static_cast<int64>(14),
+                                                    indices_mat));
+  }
+}
+
+}  // namespace
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index f84ffd53238f7753c1b4562268be9058c6c03e6d..37d4d0661cadc1d86af10c8226e4aae52b4b8c7c 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -90,9 +90,8 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
             context, CheckInvalidLabelIndex<Index>(labels, logits.dim_size(1)));
       }
       functor::SparseXentFunctor<Device, T, Index> functor;
-      functor(context->eigen_device<Device>(), logits.matrix<T>(),
-              labels.vec<Index>(), scratch.vec<T>(), loss_out->vec<T>(),
-              back_out->matrix<T>());
+      functor(context, logits.matrix<T>(), labels.vec<Index>(),
+              scratch.vec<T>(), loss_out->vec<T>(), back_out->matrix<T>());
     }
   }
 };
@@ -102,11 +101,11 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
 namespace functor {
 template <typename T, typename Index>
 struct SparseXentFunctor<CPUDevice, T, Index> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    SparseXentEigenImpl<CPUDevice, T, Index>::Compute(d, logits, labels,
+    SparseXentEigenImpl<CPUDevice, T, Index>::Compute(ctx, logits, labels,
                                                       scratch, loss, backprop);
   }
 };
diff --git a/tensorflow/core/kernels/sparse_xent_op.h b/tensorflow/core/kernels/sparse_xent_op.h
index 6ba7931ab5f923cec2efa44fb44e2b3a91f73ebe..c94597f29709ae649fc5f0fd85b931b9555cdf60 100644
--- a/tensorflow/core/kernels/sparse_xent_op.h
+++ b/tensorflow/core/kernels/sparse_xent_op.h
@@ -18,8 +18,9 @@ limitations under the License.
 // Functor definition for SparseXentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -128,6 +129,26 @@ class SparseXentGradGenerator {
 
 namespace functor {
 
+template <typename Device, typename T>
+struct RowMaxReduction {
+  // Computes the maximum across the rows of logits
+  //
+  // logits: batch_size, num_classes.
+  // maximum: temporary tensor, dims: batch_size, 1
+  static inline void Compute(OpKernelContext* ctx,
+                             typename TTypes<T>::ConstMatrix logits,
+                             typename TTypes<T>::Vec maximum) {
+#if !defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::array<int, 1> along_row;
+    along_row[0] = 1;
+#else
+    Eigen::IndexList<Eigen::type2index<1> > along_row;
+#endif
+    Device d = ctx->eigen_device<Device>();
+    To32Bit(maximum).device(d) = To32Bit(logits).maximum(along_row);
+  }
+};
+
 // Functor used by SparseXentOp to do the computations.
 template <typename Device, typename T, typename Index>
 struct SparseXentFunctor {
@@ -138,7 +159,7 @@ struct SparseXentFunctor {
   // scratch: temporary tensor, dims: batch_size, 1
   // loss: output tensor for the loss, dims: batch_size.
   // backprop: output tensor for the backprop, dims: batch_size, num_classes.
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop);
@@ -149,7 +170,8 @@ struct SparseXentFunctor {
 // specializations for both device types.
 template <typename Device, typename T, typename Index>
 struct SparseXentEigenImpl {
-  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  static void Compute(OpKernelContext* ctx,
+                      typename TTypes<T>::ConstMatrix logits,
                       typename TTypes<Index>::ConstVec labels,
                       typename TTypes<T>::Vec scratch,
                       typename TTypes<T>::Vec loss,
@@ -188,8 +210,9 @@ struct SparseXentEigenImpl {
 #endif
 
     // scratch = max_logits along classes.
-    To32Bit(scratch).device(d) = To32Bit(logits).maximum(along_class);
+    RowMaxReduction<Device, T>::Compute(ctx, logits, scratch);
 
+    Device d = ctx->eigen_device<Device>();
     // backprop = logits - max_logits.
     To32Bit(backprop).device(d) =
         To32Bit(logits) -
diff --git a/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
index d0539660282240bd40495a5078771d1f7a1f3211..5fe15352c3e562eff0fee5dd43fb8625f4c27fa5 100644
--- a/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
@@ -20,22 +20,50 @@ limitations under the License.
 #include "tensorflow/core/kernels/sparse_xent_op.h"
 
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
+#include "tensorflow/core/kernels/reduction_ops_common.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace functor {
+
+// Partial specialization for a GPUDevice, that uses the CUB implementation
+// from reduction_gpu_kernels.cu.h.
+template <typename T>
+struct RowMaxReduction<GPUDevice, T> {
+  // Computes the maximum across the rows of logits
+  //
+  // logits: batch_size, num_classes.
+  // maximum: temporary tensor, dims: batch_size, 1
+  static inline void Compute(OpKernelContext* ctx,
+                             typename TTypes<T>::ConstMatrix logits,
+                             typename TTypes<T>::Vec maximum) {
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+    const int rows = logits.dimension(kBatchDim);
+    const int cols = logits.dimension(kClassDim);
+
+    typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
+    Constants<GPUDevice> constants;
+    cub::Max op;
+    functor::ReduceImpl<T, cub::Max, T*, const T*, ReductionAxes>(
+        ctx, maximum.data(), logits.data(), 2, rows, cols, 1, 1, constants.kOne,
+        op);
+  }
+};
+
 // Partial specialization for a GPUDevice, that uses the Eigen implementation
 // from XentEigenImpl.
-namespace functor {
 template <typename T, typename Index>
 struct SparseXentFunctor<GPUDevice, T, Index> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    SparseXentEigenImpl<GPUDevice, T, Index>::Compute(d, logits, labels,
+    SparseXentEigenImpl<GPUDevice, T, Index>::Compute(ctx, logits, labels,
                                                       scratch, loss, backprop);
   }
 };
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 11db72bfa3c66130783ad67f01c041a5d3d5085a..ed3429ff5cbfc02fd5196db154ce45a72849518c 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 5c19a45fb18abdacb5f89f623f9690b43bdfa1e5..0324ce9babc3fe73e613f1b5552c6e13d643b090 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -24,10 +24,10 @@ limitations under the License.
 #include <numeric>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
index 5c70a2d62d36b94362c6f10473644f2623b77d2a..2af6b4b8148807df9e1f7c0de65f664efe6acc79 100644
--- a/tensorflow/core/kernels/stack.cc
+++ b/tensorflow/core/kernels/stack.cc
@@ -96,7 +96,7 @@ class Stack : public ResourceBase {
 
   DataType ElemType() { return elem_type_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     mutex_lock l(mu_);
     return strings::StrCat("Stack[", stack_name_, "]");
   }
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index c91bdc43cf4636481f141df70f30b1f2d74dc1a2..65174e163c1031d3e480159824f984e4bf83980b 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -132,7 +132,7 @@ class Buffer : public ResourceBase {
     notify_inserters_if_bounded(&lock);
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
@@ -170,7 +170,7 @@ class Buffer : public ResourceBase {
   std::size_t capacity_;
   std::size_t memory_limit_;
   std::size_t current_bytes_;
-  std::mutex mu_;
+  mutable std::mutex mu_;
   std::condition_variable non_empty_cond_var_;
   std::condition_variable full_cond_var_;
   std::deque<Tuple> buf_;
diff --git a/tensorflow/core/kernels/stateful_random_ops.cc b/tensorflow/core/kernels/stateful_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7cbbfd490e6232c6730779418a4d50d4bb81d2fb
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops.cc
@@ -0,0 +1,265 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "absl/strings/str_join.h"
+#include "absl/types/variant.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
+#include "tensorflow/core/kernels/gather_functor.h"
+#include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/resource_variable_ops.h"
+#include "tensorflow/core/kernels/scatter_functor.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/util.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+using random::PhiloxRandom;
+
+namespace {
+
+// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained
+// in b/111604096 and cl/171681867), so I use signed int here. I choose int64
+// instead of int32 because `VarHandleOp` doesn't support int32 on GPU.
+using StateElementType = int64;
+static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64;
+
+using Algorithm = StateElementType;
+static constexpr Algorithm RNG_ALG_PHILOX = 1;
+
+using SkippableRNG = absl::variant<PhiloxRandom>;
+
+// This function is for hiding the implementation detail about the
+// `absl::variant` index of each algorithm.
+Algorithm GetAlgorithm(SkippableRNG const& rng) {
+  auto idx = rng.index();
+  if (idx == 0) {
+    return RNG_ALG_PHILOX;
+  }
+  // unreachable
+  return RNG_ALG_PHILOX;
+}
+
+// Fills a buffer with random numbers sampled from a given distribution.
+template <class Device, class Distribution>
+Status FillRandom(OpKernelContext* ctx, const Device& device,
+                  SkippableRNG const& gen, int64 size, Distribution dist,
+                  typename Distribution::ResultElementType* data) {
+  auto algorithm = GetAlgorithm(gen);
+  if (algorithm == RNG_ALG_PHILOX) {
+    auto philox = absl::get<PhiloxRandom>(gen);
+    functor::FillPhiloxRandom<Device, Distribution>()(ctx, device, philox, data,
+                                                      size, dist);
+    return Status::OK();
+  } else {
+    // return errors::InvalidArgument("Unsupported algorithm id: ", algorithm);
+    return Status::OK();
+  }
+}
+
+// The following two functions use the contract "lower 32 bits for the first
+// uint32, higher 32 bits for the second". Note that this is endian-neutral,
+// unlike a direct memory copy `memcpy(output, &input, 8)`.
+void Int64ToUint32s(int64 input, uint32* output1, uint32* output2) {
+  auto u64 = static_cast<uint64>(input);
+  *output1 = static_cast<uint32>(u64);
+  *output2 = static_cast<uint32>(u64 >> 32);
+}
+
+int64 Uint32sToInt64(uint32 input1, uint32 input2) {
+  auto u64_1 = static_cast<uint64>(input1);
+  auto u64_2 = static_cast<uint64>(input2);
+  return static_cast<int64>(u64_1 | (u64_2 << 32));
+}
+
+void GetPhiloxStateFromTensor(Tensor const& tensor,
+                              PhiloxRandom::ResultType* counter,
+                              PhiloxRandom::Key* key) {
+  auto tensor_flat = tensor.flat<StateElementType>();
+  auto tensor_ptr = tensor_flat.data();
+  // tensor_ptr's index is added by 1 to skip the algorithm tag.
+  Int64ToUint32s(tensor_ptr[1], &(*counter)[0], &(*counter)[1]);
+  Int64ToUint32s(tensor_ptr[2], &(*counter)[2], &(*counter)[3]);
+  Int64ToUint32s(tensor_ptr[3], &(*key)[0], &(*key)[1]);
+}
+
+void WritePhiloxStateToTensor(PhiloxRandom::ResultType const& counter,
+                              PhiloxRandom::Key const& key, Tensor* tensor) {
+  auto tensor_flat = tensor->flat<StateElementType>();
+  auto tensor_ptr = tensor_flat.data();
+  // tensor_ptr's index is added by 1 to skip the algorithm tag.
+  tensor_ptr[1] = Uint32sToInt64(counter[0], counter[1]);
+  tensor_ptr[2] = Uint32sToInt64(counter[2], counter[3]);
+  tensor_ptr[3] = Uint32sToInt64(key[0], key[1]);
+}
+
+// A helper function that does the actual work for
+// 'MakeRNGCopyAndUpdateVariable'.
+template <typename Device>
+Status GetRNGCopyAndUpdateTensor(Tensor* tensor, int64 delta,
+                                 SkippableRNG* rng_copy);
+
+template <>
+Status GetRNGCopyAndUpdateTensor<CPUDevice>(Tensor* tensor, int64 delta,
+                                            SkippableRNG* rng_copy) {
+  // The dtype of `tensor` should be `StateElementType` and the first element
+  // is the algorithm.
+  if (tensor->dims() != 1) {
+    return errors::InvalidArgument(
+        "RNG state must have one and only one dimension, not ", tensor->dims());
+  }
+  auto tensor_flat = tensor->flat<StateElementType>();
+  if (tensor_flat.size() < 1) {
+    return errors::InvalidArgument("Size of tensor must be at least 1");
+  }
+  auto algorithm = tensor_flat.data()[0];
+  if (algorithm == RNG_ALG_PHILOX) {
+    // Delegates to PhiloxRandom to do the actual increasing.
+    static_assert(std::is_same<StateElementType, int64>::value,
+                  "StateElementType must be int64");
+    static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
+                  "PhiloxRandom::ResultElementType must be uint32");
+    auto counter_size = PhiloxRandom::ResultType::kElementCount;
+    auto key_size = PhiloxRandom::Key::kElementCount;
+    auto min_tensor_size = 1 + (counter_size + key_size) / 2;
+    if (tensor_flat.size() < min_tensor_size) {
+      return errors::InvalidArgument(
+          "For Philox algorithm, the size of state"
+          " must be at least ",
+          min_tensor_size, "; got ", tensor_flat.size());
+    }
+    PhiloxRandom::ResultType counter;
+    PhiloxRandom::Key key;
+    GetPhiloxStateFromTensor(*tensor, &counter, &key);
+    PhiloxRandom philox(counter, key);
+    auto old_philox = philox;
+    philox.Skip(delta);  // do the actual increasing
+    WritePhiloxStateToTensor(philox.counter(), philox.key(), tensor);
+    *rng_copy = SkippableRNG(old_philox);
+    return Status::OK();
+  } else {
+    // return errors::InvalidArgument("Unsupported algorithm id: ", algorithm);
+    *rng_copy = SkippableRNG(PhiloxRandom());
+    return Status::OK();
+  }
+}
+
+// Gets a copy of the RNG and updates the variable. The copy can be used to
+// generate upto 'samples' random numbers, and the variable is updated as if
+// 'samples' random numbers have been generated (e.g. if the variable is a
+// counnter, the counter is increased by 'samples').
+template <class Device>
+Status MakeRNGCopyAndUpdateVariable(OpKernelContext* ctx, int input_idx,
+                                    int64 samples, SkippableRNG* rng_copy) {
+  Var* var = nullptr;
+  TF_RETURN_IF_ERROR(
+      LookupResource(ctx, HandleFromInput(ctx, input_idx), &var));
+  core::ScopedUnref s(var);
+  mutex_lock ml(*var->mu());
+  Tensor* var_tensor = var->tensor();
+  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
+    return errors::InvalidArgument("dtype of RNG state variable must be ",
+                                   DataTypeString(STATE_ELEMENT_DTYPE),
+                                   ", not ",
+                                   DataTypeString(var_tensor->dtype()));
+  }
+  TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
+      ctx, var_tensor, var->copy_on_read_mode.load()));
+  TF_RETURN_IF_ERROR(
+      GetRNGCopyAndUpdateTensor<Device>(var_tensor, samples, rng_copy));
+  return Status::OK();
+}
+
+template <typename Device, class Distribution>
+class StatefulRandomOp : public OpKernel {
+ public:
+  using T = typename Distribution::ResultElementType;
+  explicit StatefulRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Assumes that input(0) is an existing resource.
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape_t = ctx->input(1);
+    Tensor* output;
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, MakeShape(shape_t, &shape));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
+    if (shape.num_elements() == 0) return;
+
+    auto output_flat = output->flat<T>();
+    SkippableRNG rng;
+    // Multiplier 256 is the same as in FillPhiloxRandomTask; do not change
+    // it just here.
+    OP_REQUIRES_OK(ctx, MakeRNGCopyAndUpdateVariable<Device>(
+                            ctx, 0, output_flat.size() * 256, &rng));
+    // Fill in the random numbers
+    OP_REQUIRES_OK(ctx, FillRandom(ctx, ctx->eigen_device<Device>(), rng,
+                                   output_flat.size(), Distribution(),
+                                   output_flat.data()));
+  }
+};
+
+}  // namespace
+
+// So far the 'Distribution' type parameter is only used when the algorithm is
+// philox, so 'NormalDistribution<PhiloxRandom, ...>' is fine for now.
+#define REGISTER(DEVICE, TYPE)            \
+  REGISTER_KERNEL_BUILDER(                \
+      Name("StatefulStandardNormal")      \
+          .Device(DEVICE_##DEVICE)        \
+          .HostMemory("resource")         \
+          .HostMemory("shape")            \
+          .TypeConstraint<TYPE>("dtype"), \
+      StatefulRandomOp<DEVICE##Device,    \
+                       random::NormalDistribution<PhiloxRandom, TYPE> >);
+
+#define REGISTER_CPU(TYPE) REGISTER(CPU, TYPE)
+
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_bfloat16(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+
+#undef REGISTER_CPU
+#undef REGISTER
+
+// TODO(wangpeng): Add RNG ops for other distributions.
+// TODO(wangpeng): Add support for GPU and XLA.
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 925f5291a68327c9fd939fd06fc025b58ab436ee..959334abc81d70bc854d2026d9eba99a2a01850d 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/random_op.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 6db68f937def6fb4827b7fc85bff873b651a0002..20bf42ccaa2ec838779c78321d022d6722826bb0 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -27,10 +27,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/strided_slice_op_impl.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
similarity index 71%
rename from tensorflow/core/kernels/bounds_check.h
rename to tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
index ce6ec1012daacf915fee0ee7bb059306058361d5..8c3f8f2ad30a56fb4c03105a20d0a7ebc692ec25 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
-#define TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#if GOOGLE_CUDA
 
-#include "tensorflow/core/framework/bounds_check.h"
+#define EIGEN_USE_GPU
 
-#endif  // TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_bool(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6951924655a8fcd2b3c400b6e1b76f2d8e49270
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_complex64(DEFINE_GPU_KERNELS);
+TF_CALL_complex128(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_impl.h
similarity index 90%
rename from tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
rename to tensorflow/core/kernels/strided_slice_op_gpu_impl.h
index cce1d2fddde7edc0283c524269de9464c2602e25..d70f369ac07a3c605ca90c5ba1e6198525dc1206 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -50,16 +53,8 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::StridedSliceAssign<GPUDevice, T, 6>; \
   template struct functor::StridedSliceAssign<GPUDevice, T, 7>; \
   template struct functor::StridedSliceAssignScalar<GPUDevice, T>;
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
-TF_CALL_complex64(DEFINE_GPU_KERNELS);
-TF_CALL_complex128(DEFINE_GPU_KERNELS);
-TF_CALL_int64(DEFINE_GPU_KERNELS);
-TF_CALL_bool(DEFINE_GPU_KERNELS);
-TF_CALL_int8(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(int32);
-
-#undef DEFINE_GPU_KERNELS
 
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07dd0130adc73512df10bf2e95ce580794262c68
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_int8(DEFINE_GPU_KERNELS);
+TF_CALL_int32(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..149886308cdf4ec8e9e9187db349e51c57e408b8
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index c4205159c380cb0a78085f87deb760bd4a8c9791..d9b62d4c75486d61f28c0cd9bc3b44206a0689a4 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -22,13 +22,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/strided_slice_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/string_view_variant_wrapper.cc b/tensorflow/core/kernels/string_view_variant_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b576eb4a3e63863d666bd325d0276039727e38c5
--- /dev/null
+++ b/tensorflow/core/kernels/string_view_variant_wrapper.cc
@@ -0,0 +1,22 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/string_view_variant_wrapper.h"
+
+namespace tensorflow {
+
+constexpr const char StringViewVariantWrapper::kTypeName[];
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_view_variant_wrapper.h b/tensorflow/core/kernels/string_view_variant_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc4a8e953489500d1967a6899ae9a003edacf0f9
--- /dev/null
+++ b/tensorflow/core/kernels/string_view_variant_wrapper.h
@@ -0,0 +1,69 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
+#define TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
+
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+
+namespace tensorflow {
+
+// A wrapper class for storing an `absl::string_view` instance in a DT_VARIANT
+// tensor.
+class StringViewVariantWrapper {
+ public:
+  static constexpr const char kTypeName[] =
+      "tensorflow::StringViewVariantWrapper";
+
+  using value_type = absl::string_view;
+
+  StringViewVariantWrapper() = default;
+
+  explicit StringViewVariantWrapper(absl::string_view str_view)
+      : str_view_(str_view) {}
+
+  StringViewVariantWrapper(const StringViewVariantWrapper& other)
+      : str_view_(other.str_view_) {}
+
+  const absl::string_view* get() const { return &str_view_; }
+
+  static string TypeName() { return kTypeName; }
+
+  string DebugString() const { return string(str_view_); }
+
+  void Encode(VariantTensorData* data) const {
+    data->add_tensor(string(str_view_));
+  }
+
+  // Decode assumes that the source VariantTensorData will have a longer
+  // lifetime than this StringViewVariantWrapper.
+  bool Decode(const VariantTensorData& data) {
+    if (data.tensors_size() != 1 || data.tensors(0).dtype() != DT_STRING) {
+      return false;
+    }
+    str_view_ = data.tensors(0).scalar<string>()();
+    return true;
+  }
+
+ private:
+  absl::string_view str_view_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 93c427039dd6e0a7984ee58e51479fdff48937bb..77b16b9384de1bfe8956ff7aa89e2bd8fda35d86 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index b287f0cc2f1337cff5705b5a40ba455b837307f9..5e3465d1dd6ce24a82525704f5223b6d9f0ac39f 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/summary/schema.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 #include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index 1f4e3418f4826dee789002d4aa688f8ce14e17d2..1053aa7d53ad5f831f8127036d8156cdde772b70 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -124,7 +124,9 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER)
 struct HistogramResource : public ResourceBase {
   histogram::ThreadSafeHistogram histogram;
 
-  string DebugString() override { return "A histogram summary. Stats ..."; }
+  string DebugString() const override {
+    return "A histogram summary. Stats ...";
+  }
 };
 
 class SummaryMergeOp : public OpKernel {
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 384a63e945306637bcf074d1f3709eea055bffe9..507ab459ca5ee773e7fa3f3c77dc511a55957dd0 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -261,7 +261,7 @@ class TensorArray : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     mutex_lock l(mu_);
     CHECK(!closed_);
     return strings::StrCat("TensorArray[", tensors_.size(), "]");
@@ -376,7 +376,7 @@ class TensorArray : public ResourceBase {
   const DataType dtype_;
   Tensor handle_;
 
-  mutex mu_;
+  mutable mutex mu_;
 
   // Marks that the tensor_array_ has been cleared.
   bool closed_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index aa85f546a81d0e6b8cf41fc23532fd4a11fe42ec..129035638ab0e3d427a3fa55e1de0ded7e07a85c 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include <numeric>  // clang-format off
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/kernels/tensor_array.h"
diff --git a/tensorflow/core/kernels/tensor_flag_utils.cc b/tensorflow/core/kernels/tensor_flag_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6f91927298078168a78144c361f50661c54c096
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils.cc
@@ -0,0 +1,187 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace tensor_flag_utils {
+
+Status ValidateSparseMatrixShardingConfig(const Tensor& config) {
+  if (TensorShapeUtils::IsScalar(config.shape())) {
+    const float scalar_config = config.template scalar<float>()();
+    if (0 < scalar_config && scalar_config <= 1.0) {
+      return Status::OK();
+    }
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat("Expected config to be in range (0, 1] but instead found ",
+                     scalar_config));
+  }
+  if (!TensorShapeUtils::IsMatrix(config.shape())) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrCat("Expected config to be either scalar or matrix "
+                               "but instead found tensor of rank ",
+                               config.dims()));
+  }
+  if (config.dim_size(1) != 3) {
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat(
+            "Expected config matrix to have dim(1) = 3 but instead found ",
+            config.dim_size(1)));
+  }
+
+  auto config_matrix = config.matrix<float>();
+  for (int i = 0; i < config.dim_size(0); ++i) {
+    if (0 > config_matrix(i, 0)) {
+      return errors::InvalidArgument(
+          "First column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 0), " in row ", i);
+    }
+    if (0 > config_matrix(i, 1)) {
+      return errors::InvalidArgument(
+          "Second column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 1), " in row ", i);
+    }
+    if (!(0 < config_matrix(i, 2) && config_matrix(i, 2) <= 1)) {
+      return errors::InvalidArgument(
+          "Last column of fraction_rows_per_thread_config should "
+          "have values in the range (0, 1] but found ",
+          config_matrix(i, 2), " in row ", i);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat,
+    const std::pair<K, K>& key) {
+  const int last_row_index = config_mat.dimension(0) - 1;
+  for (int i = 0; i < last_row_index; ++i) {
+    if (key.first >= config_mat(i, 0) && key.second >= config_mat(i, 1)) {
+      return config_mat(i, 2);
+    }
+  }
+  return config_mat(last_row_index, 2);
+}
+
+Status ValidateScalarQuantityShardingConfig(const Tensor& config) {
+  if (TensorShapeUtils::IsScalar(config.shape())) {
+    const float scalar_config = config.template scalar<float>()();
+    if (0 < scalar_config && scalar_config <= 1.0) {
+      return Status::OK();
+    }
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat("Expected config to be in range (0, 1] but instead found ",
+                     scalar_config));
+  }
+  if (!TensorShapeUtils::IsMatrix(config.shape())) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrCat("Expected config to be either scalar or matrix "
+                               "but instead found tensor of rank ",
+                               config.dims()));
+  }
+  if (config.dim_size(1) != 2) {
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat(
+            "Expected config matrix to have dim(1) = 2 but instead found ",
+            config.dim_size(1)));
+  }
+
+  auto config_matrix = config.matrix<float>();
+  for (int i = 0; i < config.dim_size(0); ++i) {
+    if (0 > config_matrix(i, 0)) {
+      return errors::InvalidArgument(
+          "First column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 0), " in row ", i);
+    }
+    if (!(0 < config_matrix(i, 1) && config_matrix(i, 1) <= 1)) {
+      return errors::InvalidArgument(
+          "Last column of fraction_rows_per_thread_config should "
+          "have values in the range (0, 1] but found ",
+          config_matrix(i, 1), " in row ", i);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat, const K key) {
+  const int last_row_index = config_mat.dimension(0) - 1;
+  for (int i = 0; i < last_row_index; ++i) {
+    if (key >= config_mat(i, 0)) {
+      return config_mat(i, 1);
+    }
+  }
+  return config_mat(last_row_index, 1);
+}
+
+template <typename Tindices>
+Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size) {
+  const Tindices next_multiple_of_bucket_size =
+      (value + bucket_size - 1) / bucket_size * bucket_size;
+  return next_multiple_of_bucket_size - (bucket_size - 1);
+}
+
+template <typename Tindices>
+Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size) {
+  if (bucket_size == 1) {
+    return 1;
+  }
+  return std::pow(bucket_size, std::floor(std::log(bucket_size * (value - 1)) /
+                                          std::log(bucket_size)) -
+                                   1) +
+         1;
+}
+
+#define REGISTER_SPARSE_UTIL_FUNCTIONS(TypeIndex)                           \
+  template float FindConfigValueForKey<float, TypeIndex>(                   \
+      const TTypes<float>::ConstMatrix& config_mat,                         \
+      const std::pair<TypeIndex, TypeIndex>& key);                          \
+  template float FindConfigValueForKey<float, TypeIndex>(                   \
+      const TTypes<float>::ConstMatrix& config_mat, const TypeIndex key);   \
+  template int64 FindConfigValueForKey<int64, TypeIndex>(                   \
+      const TTypes<int64>::ConstMatrix& config_mat, const TypeIndex key);
+
+REGISTER_SPARSE_UTIL_FUNCTIONS(int32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(int64);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint8);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint16);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint64);
+
+template int32 GetLinearBucket(const int32 value, const int32 bucket_size);
+
+template int64 GetLinearBucket(const int64 value, const int64 bucket_size);
+
+template int32 GetPowerBucket(const int32 value, const int32 bucket_size);
+
+template int64 GetPowerBucket(const int64 value, const int64 bucket_size);
+
+}  // namespace tensor_flag_utils
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_flag_utils.h b/tensorflow/core/kernels/tensor_flag_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f406c73a29769db4fa13a1368bf1570277ded928
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils.h
@@ -0,0 +1,78 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for parsing tensors as runtime flags.
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
+
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensor_flag_utils {
+
+// Converts tensor.vec<Tindices> to an std::vector<Tindices> object, appends
+// the value num_nonzero_entries_in_sparse_mat, and returns the result.
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat);
+
+// Returns Status::OK() if and only if config is a float scalar or a matrix with
+// dimensions M x 3. If config is a scalar then config must be in the range
+// [0, 1.0). If confix is a matrix then config must have shape M x 3, all of
+// its entries must be positive, and entries in the last column may not
+// exceed 1.0. If config is a matrix then it may not be empty.
+Status ValidateSparseMatrixShardingConfig(const Tensor& config);
+
+// Returns Status::OK() if and only if config is a float scalar or a non-empty
+// matrix with dimensions M x 2.
+Status ValidateScalarQuantityShardingConfig(const Tensor& config);
+
+// Returns the last entry of the first row in config_mat for which the first
+// two entries are no smaller than the respective entries in key. If no such
+// row exists then returns the last entry in the last row in config_mat.
+// config_mat may not be empty.
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat,
+    const std::pair<K, K>& key);
+
+// Returns the last entry of the first row in config_mat for which the first
+// two entries are no smaller than the respective entries in key. If no such
+// row exists then returns the last entry in the last row in config_mat.
+// config_mat may not be empty.
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat, const K key);
+
+// Returns largest multiple of bucket_size less than value.
+// Expects 1 <= bucket_size <= value.
+template <typename Tindices>
+Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size);
+
+// Returns the largest power of bucket_size less than value.
+// Expects 1 <= bucket_size <= value. If bucket_size = 1, returns 1.
+template <typename Tindices>
+Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size);
+
+}  // namespace tensor_flag_utils
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
diff --git a/tensorflow/core/kernels/tensor_flag_utils_test.cc b/tensorflow/core/kernels/tensor_flag_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23ccc7ad7a16bb9a9cdac4c53f1a3252ae29ed6c
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils_test.cc
@@ -0,0 +1,322 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::DataType;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::Tensor;
+using tensorflow::TTypes;
+using tensorflow::error::INVALID_ARGUMENT;
+using tensorflow::tensor_flag_utils::FindConfigValueForKey;
+using tensorflow::tensor_flag_utils::GetLinearBucket;
+using tensorflow::tensor_flag_utils::GetPowerBucket;
+using tensorflow::tensor_flag_utils::ValidateScalarQuantityShardingConfig;
+using tensorflow::tensor_flag_utils::ValidateSparseMatrixShardingConfig;
+
+TEST(SparseUtilsTest, ValidateSparseMatrixShardingConfig) {
+  // Only a default is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0.7;
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.0;
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Misshapen.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 1});
+    int indx = 0;
+    for (const float v : {60.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 2});
+    int indx = 0;
+    for (const float v : {
+             60.0,
+             50.0,
+         }) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+
+  // Only one key is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 3});
+    int indx = 0;
+    for (const float v : {30.0, 20.0, 1.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Two keys are specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 50.0, 0.41, 30.0, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Out of range.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 40.0, 0.41, 30.0, 20.0, 10.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 40.0, 0.41, 30.0, 20.0, -0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, -40.0, 0.41, 30.0, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = -0.5;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+}
+
+TEST(SparseUtilsTest, ValidateScalarQuantityShardingConfig) {
+  // Only a default is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0.7;
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.0;
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Misshapen.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 1});
+    int indx = 0;
+    for (const float v : {60.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 2});
+    int indx = 0;
+    for (const float v : {
+             60.0,
+             50.0,
+         }) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Two keys are specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 3});
+    int indx = 0;
+    for (const float v : {30.0, 20.0, 1.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Only one key is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+
+  // Out of range.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, 10.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, -0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {-40.0, 0.41, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = -0.5;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+}
+
+TEST(SparseUtils, FindConfigValueForKey) {
+  {
+    float data[] = {60.0, 50.0, 0.41, 30.0, 20.0, 0.1, 0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 3, 3);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 1, 3);
+    auto val = FindConfigValueForKey<float, int64>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {60.0, 50.0, 0.41, 0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 2, 3);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {60.0, 0.41, 50.0, 0.14, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 3, 2);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, 70);
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 60);
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 55);
+    EXPECT_FLOAT_EQ(0.14, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 50);
+    EXPECT_FLOAT_EQ(0.14, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 20);
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 30);
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+}
+
+TEST(SparseUtils, GetLinearBucket) {
+  EXPECT_EQ(11, GetLinearBucket(11, 5));
+  EXPECT_EQ(11, GetLinearBucket(12, 5));
+  EXPECT_EQ(1, GetLinearBucket(4ll, 5ll));
+}
+
+TEST(SparseUtils, GetPowerBucket) {
+  EXPECT_EQ(6, GetPowerBucket(11, 5));
+  EXPECT_EQ(6, GetPowerBucket(12, 5));
+  EXPECT_EQ(1332, GetPowerBucket(1335, 11));
+  EXPECT_EQ(5, GetPowerBucket(5ll, 4ll));
+  EXPECT_EQ(1, GetPowerBucket(4ll, 1ll));
+}
+
+}  // namespace
diff --git a/tensorflow/core/kernels/tensor_forest/BUILD b/tensorflow/core/kernels/tensor_forest/BUILD
index df035506f7698d1d213efad6088e9bfb53d97282..0060410c95787fb69d206b646afd66c31a821f05 100644
--- a/tensorflow/core/kernels/tensor_forest/BUILD
+++ b/tensorflow/core/kernels/tensor_forest/BUILD
@@ -27,7 +27,6 @@ tf_kernel_library(
         ":resources",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensor_forest_ops_op_lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
     ],
 )
@@ -39,7 +38,6 @@ tf_kernel_library(
         ":resources",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensor_forest_ops_op_lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
     ],
 )
diff --git a/tensorflow/core/kernels/tensor_forest/resources.h b/tensorflow/core/kernels/tensor_forest/resources.h
index da258e5017ca8cc9b996d83bcd767e89d61322d7..f0a78f97264336acc9ba293d6547cc0fe10343ee 100644
--- a/tensorflow/core/kernels/tensor_forest/resources.h
+++ b/tensorflow/core/kernels/tensor_forest/resources.h
@@ -34,7 +34,7 @@ class TensorForestTreeResource : public ResourceBase {
  public:
   TensorForestTreeResource();
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("TensorForestTree[size=", get_size(), "]");
   }
 
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
index f8144867014eccf04c892d0ce90a2aa280dfd764..43fd0d20adbf45ff135e46959506d71018fb1858 100644
--- a/tensorflow/core/kernels/tile_functor_cpu.cc
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -57,6 +57,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_uint8(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
@@ -78,6 +79,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_uint8(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.h
similarity index 85%
rename from tensorflow/core/kernels/tile_functor_gpu.cu.cc
rename to tensorflow/core/kernels/tile_functor_gpu.h
index 84a5060fc3cd17c09b905d606dba62bbaa7f1373..0de32e730ed858ccc3dfcbacb65a7cf922aa5ce2 100644
--- a/tensorflow/core/kernels/tile_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -80,28 +83,7 @@ void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
 }
 
 }  // end namespace internal
-
-namespace functor {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-// Register functors used for Tile functor.
-#define DEFINE_TYPE(T)                       \
-  template struct Tile<GPUDevice, T, int32>; \
-  template struct Tile<GPUDevice, T, int64>;
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_half(DEFINE_TYPE);
-TF_CALL_complex64(DEFINE_TYPE);
-TF_CALL_complex128(DEFINE_TYPE);
-
-#undef DEFINE_TYPE
-
-}  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
diff --git a/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7a814c7a2c4de5964deb2eff875235f293cd7b0
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, bool, int32>;
+template struct Tile<GpuDevice, bool, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4dfa4bac1b6a08acc4c8eed18785785b3e4d6071
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, complex128, int32>;
+template struct Tile<GpuDevice, complex128, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..525ede938fd6d31df514ad9f6c049d62f8c25740
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, complex64, int32>;
+template struct Tile<GpuDevice, complex64, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25e024083e3d3ed44af51f1ff1ae2fb1305be526
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, double, int32>;
+template struct Tile<GpuDevice, double, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0f31370e43cdd3e06aadfe6daf0eb988cfd6ce4
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, float, int32>;
+template struct Tile<GpuDevice, float, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c3810a0bc63de50360845e5c56a693ebff56c2e
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, Eigen::half, int32>;
+template struct Tile<GpuDevice, Eigen::half, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2280dcbc82d320586ca262c8c372970a70958f27
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int16, int32>;
+template struct Tile<GpuDevice, int16, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05403badae96d24fde13c1532eb32ab67695d06
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int32, int32>;
+template struct Tile<GpuDevice, int32, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d83c6b3a1c2257b47ab978767713e9d93d22323
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int64, int32>;
+template struct Tile<GpuDevice, int64, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index b9b37612ad569fa8c23f4bb06d641a8c9215383d..2e01fa17630e3b32845dd4828b0907a45e4e42d9 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -136,6 +136,7 @@ class TileOp : public OpKernel {
 
     // Invoke macro using TF_CALL_* so type-filtering for platform applies.
     TF_CALL_bool(HANDLE_TYPE_NAME);
+    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
     TF_CALL_float(HANDLE_TYPE_NAME);
     TF_CALL_double(HANDLE_TYPE_NAME);
     TF_CALL_uint8(HANDLE_TYPE_NAME);
@@ -214,6 +215,7 @@ inline void TileOp<Device, Tmultiples>::HandleCase(
 
 TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
 TF_CALL_float(HANDLE_TYPE_NAME_CPU);
+TF_CALL_bfloat16(HANDLE_TYPE_NAME_CPU);
 TF_CALL_double(HANDLE_TYPE_NAME_CPU);
 TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
 TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index df6a666cd441d9c1306d950bbe0e79bf3dae28d9..8b0c80159a34cb7c61f2efcb9a001c6950be23c2 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -33,6 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
 
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_int16(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
@@ -55,6 +56,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_uint8(DEFINE_TYPE);
 TF_CALL_int16(DEFINE_TYPE);
diff --git a/tensorflow/core/kernels/topk_op_gpu.cu.cc b/tensorflow/core/kernels/topk_op_gpu.h
similarity index 98%
rename from tensorflow/core/kernels/topk_op_gpu.cu.cc
rename to tensorflow/core/kernels/topk_op_gpu.h
index 2fbe1fe7cbb5ad0d90dfcb651fdbb8359c7c1d69..70d6a606647207dd77793299617cef649b9a33fa 100644
--- a/tensorflow/core/kernels/topk_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -410,7 +412,7 @@ struct SegmentOffsetCreator {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
     return idx * num_cols_;
-  };
+  }
 
   int num_cols_;
 };
@@ -561,14 +563,8 @@ struct TopKFunctor<GPUDevice, T> {
 };
 
 }  // end namespace functor
-
-#define INSTANTIATE_TEMPLATE(type) \
-  template struct functor::TopKFunctor<GPUDevice, type>;
-
-TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_TEMPLATE);
-TF_CALL_INTEGRAL_TYPES(INSTANTIATE_TEMPLATE);
-#undef INSTANTIATE_TEMPLATE
-
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
diff --git a/tensorflow/core/kernels/topk_op_gpu_double.cu.cc b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a5a7e71b1b3126335acd75d1061b816046a18b7
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_float.cu.cc b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b69396bb13dc4414e07e742c7ed90b03fc3df51
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_half.cu.cc b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e53586aeca2d00c1d6e6e75fad9538abc8ba1d6a
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5bd310523c98d33cadd6324296468629f0dbec4b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int16>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55b393a0c02b15c4bce08994e1d8a4e82684d97b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int32>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e4a775056310d2e58d8f339bcace213741ef699
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int64>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac73cd170b8fbd956921120ac106b0b1813b1605
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int8>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d5f8ceb06d171c43cf25e59fe47602f4410977f
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, uint16>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc1a8a2c8cca11e52d2b9eb53c269cc78e44b3d1
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, uint8>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index b2239ab5c39fea33fc70b6aaf170d456cd1ba3fe..5594c998dd1f69e597c31b800bde55a8b7f63e53 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 48e392c07073a9adf989fc2171222e966aede0f6..1c0d70c333f8bbef08e9a37e06694ec5ff19b20d 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/transpose_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index c9c2ac1e69c431957b3db60f10e598b102ba9ebe..c071db606485dbf5747c8695e299da69095c4de3 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "unicode/unistr.h"  // TF:icu
 #include "unicode/uset.h"  // TF:icu
 #include "unicode/utypes.h"  // TF:icu
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 3bdcfc90b878479572ad144bc82e9dc6763a4abf..adf84bae49cf7f70577e8b22390527c6b276a170 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 1e1647db5c1c41d6242cab87b0d8a8cf66d32a28..8577ce7bf792d1b724e9e0ea699accb7c2dded09 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index eadea18f760b6109c6c10700285a2a2e54e4b083..00994bbe8e7142f0c8ca7a31aef7f0a540b48824 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -35,7 +35,7 @@ class LegacyVar : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
                            tensor_.shape().DebugString());
   }
@@ -116,7 +116,7 @@ class TemporaryVariableOp : public OpKernel {
     mutex mu;
     Tensor val;
     string name;
-    string DebugString() override { return name; }
+    string DebugString() const override { return name; }
     ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
   };
 
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index 3330442ffd602c7293a4ddc3c675524698364c4e..374257d1766a04feb52fcdb07bae4cfccfc537ed 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -26,13 +26,13 @@ limitations under the License.
 #include <memory>
 #include <numeric>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -137,8 +137,10 @@ class WhereCPUOp : public OpKernel {
     const int input_dims = input.dims();
 
     Tensor num_true;
-    OP_REQUIRES_OK(
-        context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true));
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}),
+                                                   &num_true, attr));
     auto num_true_t = num_true.scalar<int64>();
 
     Status s = functor::NumTrue<CPUDevice, T, int64>::Compute(
@@ -368,6 +370,12 @@ class WhereGPUOp : public AsyncOpKernel {
       Name("Where").Device(DEVICE_GPU).TypeConstraint<T>("T"), WhereGPUOp<T>);
 
 TF_CALL_WHERE_GPU_TYPES(REGISTER_GPU_WHERE_OP);
+REGISTER_KERNEL_BUILDER(Name("Where")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("input")
+                            .HostMemory("index"),
+                        WhereCPUOp<int32>);
 
 #undef REGISTER_GPU_WHERE_OP
 
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
index e63b3ba8cde5e284a8ef7664a4453fef343cdfa2..7297d37ffb8fc19dd924a4396b110b4e87bf795c 100644
--- a/tensorflow/core/kernels/where_op.h
+++ b/tensorflow/core/kernels/where_op.h
@@ -27,7 +27,6 @@ namespace tensorflow {
 #define TF_CALL_WHERE_GPU_TYPES(m) \
   TF_CALL_int8(m);                 \
   TF_CALL_uint8(m);                \
-  TF_CALL_int32(m);                \
   TF_CALL_int64(m);                \
   TF_CALL_float(m);                \
   TF_CALL_double(m);               \
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 2255597651ffa17cb21650dfad28c24f15b36fc9..54b22d230ab02da46016e253cf7b75211df62e26 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -25,9 +25,9 @@ limitations under the License.
 #include "third_party/cub/device/device_select.cuh"
 #include "third_party/cub/iterator/counting_input_iterator.cuh"
 #include "third_party/cub/iterator/transform_input_iterator.cuh"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/where_op.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index d5cbe6c61674b80978ec16d5c00d3747b667e1f5..4815f7c2cc6c4197c4dbd6017213e275d38b105e 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -150,6 +150,10 @@ string FormatColocationNodeForError(const T& names) {
       });
 }
 
+inline string FormatFunctionForError(const string& name) {
+  return strings::StrCat("{{function_node ", name, "}}");
+}
+
 // The CanonicalCode() for non-errors.
 using ::tensorflow::error::OK;
 
diff --git a/tensorflow/core/lib/core/status.cc b/tensorflow/core/lib/core/status.cc
index 7be5b9b51316d5c325e5f7eb4186819d3e1476b8..0b63f66f6da0792b0cdba23ea3e5a4abba5e4bdc 100644
--- a/tensorflow/core/lib/core/status.cc
+++ b/tensorflow/core/lib/core/status.cc
@@ -156,19 +156,26 @@ Status StatusGroup::as_status() const {
     return Status::OK();
   }
 
-  // If there is only one message, or all of the messages are identical, return
-  // the original status.  This reduces verbosity and preserves existing
-  // behavior when possible.
+  // Reduce verbosity when handling duplicate messages. If there is only a
+  // single message, or all messages have similar content, then return the
+  // longest status message.
+  std::vector<Status> sorted_children(children_);
+  std::sort(sorted_children.begin(), sorted_children.end(),
+            [](const Status& a, const Status& b) {
+              return a.error_message().length() > b.error_message().length();
+            });
   bool single_status = true;
-  for (const Status& s : children_) {
-    if (s != children_[0]) {
+  for (const auto& s : sorted_children) {
+    if (s.code() != sorted_children[0].code() ||
+        sorted_children[0].error_message().find(s.error_message()) ==
+            string::npos) {
       single_status = false;
       break;
     }
   }
 
   if (single_status) {
-    return children_[0];
+    return sorted_children[0];
   }
 
   std::vector<string> fmt;
@@ -193,7 +200,7 @@ Status StatusGroup::as_status() const {
          const std::pair<error::Code, int>& b) { return a.second < b.second; });
 
   fmt.push_back(
-      strings::Printf("Combined status information from %lu operations:\n",
+      strings::Printf("Combined status information from %zu operations:\n",
                       num_ok_ + children_.size()));
 
   for (const auto& p : count_vec) {
diff --git a/tensorflow/core/lib/core/status_test.cc b/tensorflow/core/lib/core/status_test.cc
index d3296b4fac451215fa2b13d6713965740966da9a..7c28184080406ee97dbdad01143619323bfe2325 100644
--- a/tensorflow/core/lib/core/status_test.cc
+++ b/tensorflow/core/lib/core/status_test.cc
@@ -141,6 +141,31 @@ TEST(StatusGroup, ContainsChildMessages) {
   LOG(INFO) << d.as_status();
 }
 
+TEST(StatusGroup, ContainsIdenticalMessage) {
+  StatusGroup sg;
+  const Status internal(errors::Internal("Original error"));
+  for (size_t i = 0; i < 10; i++) {
+    sg.Update(internal);
+  }
+  EXPECT_EQ(sg.as_status(), internal);
+}
+
+TEST(StatusGroup, ContainsCommonPrefix) {
+  StatusGroup sg;
+  const Status a(errors::Internal("Original error"));
+  const Status b(errors::Internal("Original error is"));
+  const Status c(errors::Internal("Original error is invalid"));
+  sg.Update(a);
+  sg.Update(c);
+  sg.Update(c);
+  sg.Update(b);
+  sg.Update(c);
+  sg.Update(b);
+  sg.Update(a);
+  sg.Update(b);
+  EXPECT_EQ(sg.as_status(), c);
+}
+
 static void BM_TF_CHECK_OK(int iters) {
   tensorflow::Status s =
       (iters < 0) ? errors::InvalidArgument("Invalid") : Status::OK();
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index 9a5215320f58d10c22872c2837e882bed82f5b52..ce842e97230657b1a933e4bebb7660b7b0033cb4 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -82,9 +82,20 @@ uint8* Decode(const void* srcdata, int datasize,
     return nullptr;
   }
 
+  // Don't request more memory than needed for each frame, preventing OOM
+  int max_frame_width = 0;
+  int max_frame_height = 0;
+  for (int k = 0; k < gif_file->ImageCount; k++) {
+    SavedImage* si = &gif_file->SavedImages[k];
+    if (max_frame_height < si->ImageDesc.Height)
+      max_frame_height = si->ImageDesc.Height;
+    if (max_frame_width < si->ImageDesc.Width)
+      max_frame_width = si->ImageDesc.Width;
+  }
+
   const int num_frames = gif_file->ImageCount;
-  const int width = gif_file->SWidth;
-  const int height = gif_file->SHeight;
+  const int width = max_frame_width;
+  const int height = max_frame_height;
   const int channel = 3;
 
   uint8* const dstdata = allocate_output(num_frames, width, height, channel);
@@ -136,6 +147,14 @@ uint8* Decode(const void* srcdata, int datasize,
         GifByteType color_index =
             this_image->RasterBits[(i - img_desc->Top) * (img_desc->Width) +
                                    (j - img_desc->Left)];
+
+        if (color_index >= color_map->ColorCount) {
+          *error_string = strings::StrCat("found color index ", color_index,
+                                          " outside of color map range ",
+                                          color_map->ColorCount);
+          return nullptr;
+        }
+
         const GifColorType& gif_color = color_map->Colors[color_index];
         p_dst[j * channel + 0] = gif_color.Red;
         p_dst[j * channel + 1] = gif_color.Green;
diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc
index 61d364017cb90933e8e9af7e800db4a6988d8442..89d2d0e8fe8ac652d976477722ed850785a5ba9a 100644
--- a/tensorflow/core/lib/gtl/int_type_test.cc
+++ b/tensorflow/core/lib/gtl/int_type_test.cc
@@ -45,7 +45,7 @@ typedef ::testing::Types<Int8_IT, UInt8_IT, Int16_IT, UInt16_IT, Int32_IT,
                          Int64_IT, UInt64_IT, Long_IT>
     SupportedIntTypes;
 
-TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes);
+TYPED_TEST_SUITE(IntTypeTest, SupportedIntTypes);
 
 TYPED_TEST(IntTypeTest, TestInitialization) {
   constexpr typename TestFixture::T a;
diff --git a/tensorflow/core/lib/gtl/stl_util.h b/tensorflow/core/lib/gtl/stl_util.h
index ffeca4e88a93936ee6a1711afec735d97d04172e..853a290bf6383c679ddc9c00dbce38d18d3d35b6 100644
--- a/tensorflow/core/lib/gtl/stl_util.h
+++ b/tensorflow/core/lib/gtl/stl_util.h
@@ -23,9 +23,12 @@ limitations under the License.
 #include <iterator>
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
+#include "absl/meta/type_traits.h"
+
 namespace tensorflow {
 namespace gtl {
 
@@ -48,16 +51,38 @@ inline const T* vector_as_array(const std::vector<T, Allocator>* v) {
   return v->data();
 }
 
+namespace gtl_internal {
+
+// HasMember is true_type or false_type, depending on whether or not
+// T has a __resize_default_init member. Resize will call the
+// __resize_default_init member if it exists, and will call the resize
+// member otherwise.
+template <typename string_type, typename = void>
+struct ResizeUninitializedTraits {
+  using HasMember = std::false_type;
+  static void Resize(string_type* s, size_t new_size) { s->resize(new_size); }
+};
+
+// __resize_default_init is provided by libc++ >= 8.0 and by Google's internal
+// ::string implementation.
+template <typename string_type>
+struct ResizeUninitializedTraits<
+    string_type, absl::void_t<decltype(std::declval<string_type&>()
+                                           .__resize_default_init(237))> > {
+  using HasMember = std::true_type;
+  static void Resize(string_type* s, size_t new_size) {
+    s->__resize_default_init(new_size);
+  }
+};
+
+}  // namespace gtl_internal
+
 // Like str->resize(new_size), except any new characters added to "*str" as a
 // result of resizing may be left uninitialized, rather than being filled with
 // '0' bytes. Typically used when code is then going to overwrite the backing
-// store of the string with known data. Uses a Google extension to ::string.
+// store of the string with known data.
 inline void STLStringResizeUninitialized(string* s, size_t new_size) {
-#if __google_stl_resize_uninitialized_string
-  s->resize_uninitialized(new_size);
-#else
-  s->resize(new_size);
-#endif
+  gtl_internal::ResizeUninitializedTraits<string>::Resize(s, new_size);
 }
 
 // Calls delete (non-array version) on the SECOND item (pointer) in each pair in
diff --git a/tensorflow/core/lib/io/table_test.cc b/tensorflow/core/lib/io/table_test.cc
index addba92005d98e9778f819ff397cca7c9e35bd8c..d57135be720bb631f5277df71e3d230464de75ec 100644
--- a/tensorflow/core/lib/io/table_test.cc
+++ b/tensorflow/core/lib/io/table_test.cc
@@ -96,6 +96,9 @@ class StringSink : public WritableFile {
 
   Status Close() override { return Status::OK(); }
   Status Flush() override { return Status::OK(); }
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("StringSink does not support Name()");
+  }
   Status Sync() override { return Status::OK(); }
   Status Tell(int64* pos) override {
     *pos = contents_.size();
@@ -120,6 +123,10 @@ class StringSource : public RandomAccessFile {
 
   uint64 Size() const { return contents_.size(); }
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("StringSource does not support Name()");
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     if (offset > contents_.size()) {
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc
index 726d2b1364e79326c077b97bc4145c64e906d25a..aa7bdab03a37013bb0d46f18c5641044cbc40883 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.cc
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc
@@ -197,6 +197,10 @@ Status ZlibOutputBuffer::Flush() {
   return Status::OK();
 }
 
+Status ZlibOutputBuffer::Name(StringPiece* result) const {
+  return file_->Name(result);
+}
+
 Status ZlibOutputBuffer::Sync() {
   TF_RETURN_IF_ERROR(Flush());
   return file_->Sync();
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.h b/tensorflow/core/lib/io/zlib_outputbuffer.h
index f4c4d9cbc36edae15440cf9abd9bec36e03d770b..e3d2aec37eac056a19bd425e6ea35a1f0b1f4b2c 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.h
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/zlib_compression_options.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -77,6 +78,9 @@ class ZlibOutputBuffer : public WritableFile {
   // will fail.
   Status Close() override;
 
+  // Returns the name of the underlying file.
+  Status Name(StringPiece* result) const override;
+
   // Deflates any cached input, writes all output to file and syncs it.
   Status Sync() override;
 
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index f7a359eb5b30804834ec7d5368d91c2074faf8a5..9e7d1e64108c40b6827a3fc2cd3513d148334e6d 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -157,7 +157,8 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   jpeg_calc_output_dimensions(&cinfo);
 
   int64 total_size = static_cast<int64>(cinfo.output_height) *
-                     static_cast<int64>(cinfo.output_width);
+                     static_cast<int64>(cinfo.output_width) *
+                     static_cast<int64>(cinfo.num_components);
   // Some of the internal routines do not gracefully handle ridiculously
   // large images, so fail fast.
   if (cinfo.output_width <= 0 || cinfo.output_height <= 0) {
diff --git a/tensorflow/core/lib/random/philox_random.h b/tensorflow/core/lib/random/philox_random.h
index 058ed95ffb43586b78f8d82e03b5cf420cfb28f2..f4bbc689d477694e426ea6edc889dd22e2101831 100644
--- a/tensorflow/core/lib/random/philox_random.h
+++ b/tensorflow/core/lib/random/philox_random.h
@@ -49,6 +49,7 @@ namespace random {
 template <typename T, int ElementCount>
 class Array {
  public:
+  static const int kElementCount = ElementCount;
   PHILOX_DEVICE_INLINE Array() {
     for (int i = 0; i < ElementCount; ++i) {
       data_[i] = T(0);
@@ -131,6 +132,12 @@ class PhiloxRandom {
   PHILOX_DEVICE_INLINE
   PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
 
+  PHILOX_DEVICE_INLINE
+  ResultType const& counter() const { return counter_; }
+
+  PHILOX_DEVICE_INLINE
+  Key const& key() const { return key_; }
+
   // Skip the specified number of samples of 128-bits in the current stream.
   PHILOX_DEVICE_INLINE
   void Skip(uint64 count) {
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 4be33b2a0cf10a2525f9a93b5d4942b381d92629..a19e1af888405aa1de9e9a4ca519b895c369cfdf 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -20,9 +20,11 @@ cc_library(
     name = "nccl_lib",
     srcs = if_cuda([
         "nccl_manager.cc",
-        "nccl_manager.h",
         "nccl_rewrite.cc",
     ]),
+    hdrs = if_cuda([
+        "nccl_manager.h",
+    ]),
     copts = tf_copts(),
     deps = if_cuda([
         "@local_config_nccl//:nccl",
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index 58bb84ac57ebf658ecf8efb0351e1376b5a42e6f..e65af133891b3acdc74ee4316c5f6e35d236eb32 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -209,7 +209,7 @@ const Scalar NcclManagerTest<Scalar>::max_ =
 
 // Instantiate tests for float and double.
 using TypeList = ::testing::Types<float, double>;
-TYPED_TEST_CASE(NcclManagerTest, TypeList);
+TYPED_TEST_SUITE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
 TYPED_TEST(NcclManagerTest, BasicSumReduction) {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 281e2996ed7c2b07881d5ab564fc31463f8f8607..108f187c981932c420b6bfea709b628832ed719c 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -347,6 +347,16 @@ REGISTER_OP("Pack")
       while (index < rank) dims.push_back(c->Dim(cur, index++));
 
       c->set_output(0, c->MakeShape(dims));
+      for (int i = 0; i < c->num_inputs(); ++i) {
+        auto* shape_and_type = c->input_handle_shapes_and_types(i);
+        if (shape_and_type) {
+          if (!c->RelaxOutputHandleShapesAndMergeTypes(0, *shape_and_type)) {
+            c->set_output_handle_shapes_and_types(
+                0, std::vector<shape_inference::ShapeAndType>({}));
+            break;
+          }
+        }
+      }
       return Status::OK();
     });
 
@@ -1034,6 +1044,12 @@ REGISTER_OP("Fill")
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
       c->set_output(0, out);
+
+      auto* shape_and_type = c->input_handle_shapes_and_types(1);
+      if (shape_and_type) {
+        c->set_output_handle_shapes_and_types(0, *shape_and_type);
+      }
+
       return Status::OK();
     });
 
@@ -1206,27 +1222,13 @@ REGISTER_OP("Identity")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Snapshot")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklIdentity")
@@ -1235,14 +1237,7 @@ REGISTER_OP("_MklIdentity")
     .Output("output: T")
     .Output("mkl_output: uint8")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"Doc( Mkl implementation of IdentityOp
 )Doc");
 #endif
@@ -1626,6 +1621,11 @@ REGISTER_OP("StridedSlice")
       TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(final_shape, &out));
       c->set_output(0, out);
 
+      auto* shape_and_type = c->input_handle_shapes_and_types(0);
+      if (shape_and_type) {
+        c->set_output_handle_shapes_and_types(0, *shape_and_type);
+      }
+
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/clustering_ops.cc b/tensorflow/core/ops/clustering_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..189f00730b4da4a548c8d738ae893ea1e346f3ef
--- /dev/null
+++ b/tensorflow/core/ops/clustering_ops.cc
@@ -0,0 +1,43 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations under
+// the License.
+// ==============================================================================
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("KmeansPlusPlusInitialization")
+    .Input("points: float32")
+    .Input("num_to_sample: int64")
+    .Input("seed: int64")
+    .Input("num_retries_per_sample: int64")
+    .Output("samples: float32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("KMC2ChainInitialization")
+    .Input("distances: float32")
+    .Input("seed: int64")
+    .Output("index: int64")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("NearestNeighbors")
+    .Input("points: float32")
+    .Input("centers: float32")
+    .Input("k: int64")
+    .Output("nearest_center_indices: int64")
+    .Output("nearest_center_distances: float32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
index d6157a69df5cf535a0957df8b7ed6d4f597acd1d..e45a8a9b361183303d98f36aef25991566c6f267 100644
--- a/tensorflow/core/ops/collective_ops.cc
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -28,6 +28,7 @@ REGISTER_OP("CollectiveReduce")
     .Attr("merge_op: {'Min', 'Max', 'Mul', 'Add'}")
     .Attr("final_op: {'Id', 'Div'}")
     .Attr("subdiv_offsets: list(int)")
+    .Attr("wait_for: list(int) = []")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape);
 
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index d2e53f019635ae9a344b6106608b02ae8a309c99..9c1a0b0d9b5c615db284c39d1552c92faa3ed190 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -1298,6 +1298,34 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "AdjustContrastv2"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "contrast_factor"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "AdjustHue"
   input_arg {
@@ -1313,6 +1341,34 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "AdjustHue"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "AdjustSaturation"
   input_arg {
@@ -1328,6 +1384,34 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "AdjustSaturation"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "All"
   input_arg {
@@ -12758,6 +12842,77 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  attr {
+    name: "wait_for"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -13631,6 +13786,88 @@ op {
     }
   }
 }
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "Conv2DBackpropFilter"
   input_arg {
@@ -13848,14 +14085,14 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
@@ -13871,7 +14108,9 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -13893,6 +14132,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -13909,6 +14157,18 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv2DBackpropInput"
@@ -13934,7 +14194,6 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13973,18 +14232,6 @@ op {
       }
     }
   }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
 }
 op {
   name: "Conv2DBackpropInput"
@@ -14012,7 +14259,6 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -14064,54 +14310,17 @@ op {
   }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-}
-op {
-  name: "Conv3D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
   input_arg {
-    name: "filter"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -14123,6 +14332,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -14131,65 +14342,14 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
+    name: "use_cudnn_on_gpu"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
-}
-op {
-  name: "Conv3D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      b: true
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
   attr {
     name: "padding"
     type: "string"
@@ -14204,12 +14364,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -14222,16 +14382,15 @@ op {
         i: 1
         i: 1
         i: 1
-        i: 1
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
@@ -14250,6 +14409,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -14258,58 +14419,14 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  deprecation {
-    version: 10
-  }
-}
-op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
   attr {
     name: "padding"
     type: "string"
@@ -14317,55 +14434,28 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
       }
     }
   }
-  deprecation {
-    version: 10
-  }
-}
-op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
+    name: "data_format"
     type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -14378,26 +14468,18 @@ op {
         i: 1
         i: 1
         i: 1
-        i: 1
       }
     }
   }
-  deprecation {
-    version: 10
-  }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -14432,17 +14514,13 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -14490,17 +14568,13 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -14563,7 +14637,7 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -14611,7 +14685,342 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -21866,6 +22275,83 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDatasetCardinality"
   input_arg {
@@ -23238,6 +23724,42 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalTakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -29871,6 +30393,44 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "KMC2ChainInitialization"
+  input_arg {
+    name: "distances"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+}
+op {
+  name: "KmeansPlusPlusInitialization"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "num_to_sample"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_retries_per_sample"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "samples"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -38392,6 +38952,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NearestNeighbors"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "centers"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_distances"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "Neg"
   input_arg {
@@ -38509,6 +39092,34 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "NextIteration"
   input_arg {
@@ -43892,7 +44503,1331 @@ op {
         type: DT_QUINT8
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBias"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
@@ -43903,108 +45838,24 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
-      }
-    }
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_QINT32
-    }
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-}
-op {
-  name: "QuantizedConv2D"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "Tfilter"
-  }
-  input_arg {
-    name: "min_input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min_filter"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_filter"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "Tinput"
+    name: "Tbias"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
+        type: DT_FLOAT
         type: DT_QINT32
-        type: DT_QINT16
-        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Tfilter"
+    name: "Tsummand"
     type: "type"
     allowed_values {
       list {
@@ -44020,7 +45871,7 @@ op {
     name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -53171,6 +55022,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
@@ -72922,6 +74810,50 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -76560,6 +78492,34 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -76639,6 +78599,10 @@ op {
     name: "indices"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "element_dtype"
@@ -76658,6 +78622,10 @@ op {
     name: "index"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "item"
     type_attr: "element_dtype"
@@ -76684,6 +78652,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "output_handle"
     type: DT_VARIANT
@@ -76764,6 +78736,22 @@ op {
     }
   }
 }
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListScatter"
   input_arg {
@@ -76859,6 +78847,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "tensor"
     type_attr: "element_dtype"
@@ -79827,6 +81819,46 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index f904e2536dfe67facc25335dc3f86b3d45fd116f..316e405188cd8af4ef538a0fc6d8eb1ddbd383a0 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -352,6 +352,16 @@ REGISTER_OP("ExperimentalStatsAggregatorSummary")
     .Output("summary: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalTakeWhileDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("predicate: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalUnbatchDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -453,6 +463,15 @@ REGISTER_OP("ExperimentalLMDBDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalChooseFastestDataset")
+    .Input("input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("N: int >= 2")
+    .Attr("num_experiments: int")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalIdentityIndexedDataset")
     .Input("size: uint64")
     .Output("handle: variant")
diff --git a/tensorflow/core/ops/fingerprint64_map_ops.cc b/tensorflow/core/ops/fingerprint64_map_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91b24b401787f154ce67e1c6c7aaaf2a9f65d475
--- /dev/null
+++ b/tensorflow/core/ops/fingerprint64_map_ops.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("Fingerprint64Map")
+    .Output("table_handle: resource")
+    .Attr("heterogeneous_key_dtype: type")
+    .Attr("table_value_dtype: type = DT_INT64")
+    .Attr("num_oov_buckets: int >= 1")
+    .Attr("offset: int >= 0 = 0")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 5e0bdd888cea1c508a38afe2f40c7c9f17d28269..be440ed728129d6553b017fa537e0585d076c35d 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -170,6 +170,7 @@ REGISTER_OP("While")
     .Attr("cond: func")
     .Attr("body: func")
     .Attr("output_shapes: list(shape) = []")
+    .Attr("parallel_iterations: int = 10")
     .SetIsStateful()
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       std::vector<PartialTensorShape> output_shapes;
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index ee8b1e58d67a18eda6ef9a6fdeb32f2a63cdddbd..0f1555f49cf8dabb8d5fef71d0b14737cc6bc48c 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -408,9 +408,10 @@ REGISTER_OP("AdjustContrast")
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustContrastv2")
-    .Input("images: float")
+    .Input("images: T")
     .Input("contrast_factor: float")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {half, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       // The contrast_factor should be scalar only.
       ShapeHandle unused;
@@ -420,18 +421,20 @@ REGISTER_OP("AdjustContrastv2")
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustHue")
-    .Input("images: float")
+    .Input("images: T")
     .Input("delta: float")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {half, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustSaturation")
-    .Input("images: float")
+    .Input("images: T")
     .Input("scale: float")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {half, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 01ebcd15439d670274d7e2a784ce78c5c1ee44ef..fdaa5a2129e1ee54395b039f958bbcd6f80be7a0 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -51,11 +51,11 @@ REGISTER_OP("TensorListPushBack")
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to push to list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -98,11 +98,11 @@ REGISTER_OP("TensorListPushBackBatch")
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to push to list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -130,6 +130,7 @@ REGISTER_OP("TensorListLength")
 
 REGISTER_OP("TensorListPopBack")
     .Input("input_handle: variant")
+    .Input("element_shape: int32")
     .Output("output_handle: variant")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
@@ -138,11 +139,11 @@ REGISTER_OP("TensorListPopBack")
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       shape_inference::ShapeHandle tensor_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with invalid variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -166,6 +167,7 @@ REGISTER_OP("TensorListPopBack")
 
 REGISTER_OP("TensorListStack")
     .Input("input_handle: variant")
+    .Input("element_shape: int32")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
     .Attr("num_elements: int = -1")
@@ -174,11 +176,11 @@ REGISTER_OP("TensorListStack")
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -212,16 +214,22 @@ REGISTER_OP("TensorListConcat")
     .Output("tensor: element_dtype")
     .Output("lengths: int64")
     .Attr("element_dtype: type")
+    .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       DataType element_dtype;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
-      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+      PartialTensorShape raw_element_shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &raw_element_shape));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(raw_element_shape,
+                                                            &element_shape));
+
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -231,10 +239,10 @@ REGISTER_OP("TensorListConcat")
               DataTypeString(list_shape_type.dtype), " but expected type ",
               DataTypeString(element_dtype));
         }
-        shape_inference::ShapeHandle ignored;
+        shape_inference::ShapeHandle merged;
         TF_RETURN_IF_ERROR(
-            c->Merge(element_shape, list_shape_type.shape, &ignored));
-        element_shape = list_shape_type.shape;
+            c->Merge(element_shape, list_shape_type.shape, &merged));
+        element_shape = merged;
       }
       if (c->RankKnown(element_shape)) {
         shape_inference::ShapeHandle result;
@@ -345,6 +353,7 @@ REGISTER_OP("TensorListReserve")
 REGISTER_OP("TensorListGetItem")
     .Input("input_handle: variant")
     .Input("index: int32")
+    .Input("element_shape: int32")
     .Output("item: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -367,6 +376,24 @@ REGISTER_OP("TensorListGetItem")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListResize")
+    .Input("input_handle: variant")
+    .Input("size: int32")
+    .Output("output_handle: variant")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Check that `size` has scalar shape.
+      shape_inference::ShapeHandle size_shape = c->input(1);
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(size_shape, 0, &unused));
+      c->set_output(0, c->Scalar());
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListSetItem")
     .Input("input_handle: variant")
     .Input("index: int32")
@@ -394,6 +421,7 @@ REGISTER_OP("TensorListSetItem")
 REGISTER_OP("TensorListGather")
     .Input("input_handle: variant")
     .Input("indices: int32")
+    .Input("element_shape: int32")
     .Output("values: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -437,6 +465,26 @@ REGISTER_OP("TensorListScatter")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListScatterV2")
+    .Input("tensor: element_dtype")
+    .Input("indices: int32")
+    .Input("element_shape: shape_type")
+    .Input("num_elements: int32")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          2, &element_shape));
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListConcatLists")
     .Input("input_a: variant")
     .Input("input_b: variant")
@@ -453,15 +501,18 @@ REGISTER_OP("TensorListConcatLists")
 
       auto* handle_data_a = c->input_handle_shapes_and_types(0);
       auto* handle_data_b = c->input_handle_shapes_and_types(1);
-      if (handle_data_a == nullptr && handle_data_b == nullptr) {
+      if ((handle_data_a == nullptr || handle_data_a->empty()) &&
+          (handle_data_b == nullptr || handle_data_b->empty())) {
         c->set_output_handle_shapes_and_types(
             0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       shape_inference::ShapeAndType list_shape_type_a =
-          (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
+          (handle_data_a && !handle_data_a->empty()) ? handle_data_a->at(0)
+                                                     : handle_data_b->at(0);
       const shape_inference::ShapeAndType& list_shape_type_b =
-          (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
+          (handle_data_b && !handle_data_b->empty()) ? handle_data_b->at(0)
+                                                     : handle_data_a->at(0);
       if (list_shape_type_a.dtype != element_dtype) {
         return errors::InvalidArgument("input_a.type != element_dtype: ",
                                        DataTypeString(list_shape_type_a.dtype),
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 6f261dc1b1813ea1e78736725bdf8af66eab2c18..747536a019c378328bbdcfff6535d7361ca52cb8 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -39,7 +39,61 @@ REGISTER_OP("AddN")
                                         " with other shapes.");
       }
       c->set_output(0, cur);
-      return Status::OK();
+
+      DataType dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("T", &dtype));
+
+      if (dtype != DT_VARIANT) {
+        // Exit early if not DT_VARIANT.
+        return Status::OK();
+      } else {
+        // DT_VARIANT shape handle shape inference.  All sizes and dtypes must
+        // be the same; all shapes must be compatible via Merge.
+        std::vector<shape_inference::ShapeAndType> cur_shapes_and_types;
+        auto* shapes_and_types =
+            c->input_handle_shapes_and_types(c->num_inputs() - 1);
+        if (shapes_and_types) {
+          cur_shapes_and_types = *shapes_and_types;
+        }
+
+        for (int i = c->num_inputs() - 2; i >= 0; --i) {
+          auto shapes_and_types_i = c->input_handle_shapes_and_types(i);
+          if (!shapes_and_types && shapes_and_types_i) {
+            // TODO(ebrevdo): Find cases where this happens and fix their shape
+            // inference.  If we are calling AddN on variant types, they should
+            // all have consistent shape_and_type info.
+            shapes_and_types = shapes_and_types_i;
+          } else if (shapes_and_types && shapes_and_types_i) {
+            if (shapes_and_types_i->size() != shapes_and_types->size()) {
+              return errors::InvalidArgument(
+                  "shapes_and_types[", i,
+                  "].size() == ", shapes_and_types_i->size(),
+                  " != shapes_and_types[0].size() == ",
+                  shapes_and_types->size());
+            }
+            for (int j = 0; j < shapes_and_types->size(); ++j) {
+              if (shapes_and_types->at(j).dtype !=
+                  shapes_and_types_i->at(j).dtype) {
+                return errors::InvalidArgument(
+                    "shapes_and_types[", i, "][", j, "].dtype() == ",
+                    DataTypeString(shapes_and_types_i->at(j).dtype),
+                    " != shapes_and_types[0][", j, "].dtype == ",
+                    DataTypeString(shapes_and_types->at(j).dtype));
+              }
+              TF_RETURN_WITH_CONTEXT_IF_ERROR(
+                  c->Merge(shapes_and_types_i->at(j).shape,
+                           cur_shapes_and_types.at(j).shape,
+                           &cur_shapes_and_types.at(j).shape),
+                  "From merging shapes_and_types[", i, "][", j, "].shape with ",
+                  "shapes_and_types[0][", j, "].shape");
+            }
+          }
+        }
+        if (shapes_and_types) {
+          c->set_output_handle_shapes_and_types(0, cur_shapes_and_types);
+        }
+        return Status::OK();
+      }
     });
 
 // --------------------------------------------------------------------------
@@ -1368,7 +1422,14 @@ REGISTER_OP("Conj")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {complex64, complex128, variant} = DT_COMPLEX64")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1688,4 +1749,11 @@ inputs: Must all be the same size and shape.
 
 #endif  // INTEL_MKL
 
+REGISTER_OP("NextAfter")
+    .Attr("T: {float64, float32} = DT_FLOAT")
+    .Input("x1: T")
+    .Input("x2: T")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc
index 658afd99013485ce3c6c16906d3d6f9415ad48f6..f6b132a692758a39363b2b70b29eeff40b644055 100644
--- a/tensorflow/core/ops/mkl_nn_ops.cc
+++ b/tensorflow/core/ops/mkl_nn_ops.cc
@@ -59,6 +59,63 @@ REGISTER_OP("_MklFusedConv2D")
  is expected to create these operators.
 )doc");
 
+REGISTER_OP("__MklDummyPadWithFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
+REGISTER_OP("_MklPadWithFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("paddings: Tpaddings")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_args: num_args * uint8")
+    .Input("mkl_paddings: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
 REGISTER_OP("_MklQuantizedMaxPool")
     .Input("input:         T")
     .Input("min_input:     float")
@@ -634,6 +691,50 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
       return Status::OK();
     });
 
+REGISTER_OP("_MklDepthwiseConv2dNativeBackpropInput")
+    .Input("input_sizes: int32")
+    .Input("filter: T")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklDepthwiseConv2dNativeBackpropFilter")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index c7cd3140be060612d697ba6f0de44077e8799ce6..cbde632c503330794e3e4613c061dfe143ba3eab 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -269,10 +269,11 @@ REGISTER_OP("Conv2D")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
-    .SetShapeFn(shape_inference::Conv2DShape);
+    .SetShapeFn(shape_inference::Conv2DShapeWithExplicitPadding);
 
 REGISTER_OP("Conv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -282,7 +283,8 @@ REGISTER_OP("Conv2DBackpropInput")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -304,7 +306,8 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -1540,6 +1543,22 @@ REGISTER_OP("QuantizedBatchNormWithGlobalNormalization")
     });
 
 #ifdef INTEL_MKL
+REGISTER_OP("_MklDepthwiseConv2dNative")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::DepthwiseConv2DNativeShape);
+
 REGISTER_OP("_MklConv2D")
     .Input("input: T")
     .Input("filter: T")
@@ -2515,6 +2534,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#endif  // INTEL_MKL
 REGISTER_OP("QuantizedConv2DAndRequantize")
     .Input("input: Tinput")
     .Input("filter: Tfilter")
@@ -2851,6 +2871,5 @@ REGISTER_OP("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
       return Status::OK();
     });
 
-#endif  // INTEL_MKL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2c0980e203f81ea885ef773271e740b61fca5921..ef990b4ff2e4be516eade07f9fb956a1c0743fd9 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -486,7 +486,7 @@ op {
   name: "AdjustContrastv2"
   input_arg {
     name: "images"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "contrast_factor"
@@ -494,14 +494,27 @@ op {
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
   name: "AdjustHue"
   input_arg {
     name: "images"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "delta"
@@ -509,14 +522,27 @@ op {
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
   name: "AdjustSaturation"
   input_arg {
     name: "images"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "scale"
@@ -524,7 +550,20 @@ op {
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
@@ -5417,6 +5456,14 @@ op {
     name: "subdiv_offsets"
     type: "list(int)"
   }
+  attr {
+    name: "wait_for"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
   is_stateful: true
 }
 op {
@@ -5876,83 +5923,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-}
-op {
-  name: "Conv2DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
+    name: "explicit_paddings"
     type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
     default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
       }
     }
   }
@@ -5983,216 +5962,311 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-}
-op {
-  name: "Conv3D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-}
-op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-  deprecation {
-    version: 10
-    explanation: "Use Conv3DBackpropFilterV2"
-  }
-}
-op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+    explanation: "Use Conv3DBackpropFilterV2"
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -10361,6 +10435,40 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDatasetCardinality"
   input_arg {
@@ -11383,6 +11491,42 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalTakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -15043,6 +15187,44 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "KMC2ChainInitialization"
+  input_arg {
+    name: "distances"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+}
+op {
+  name: "KmeansPlusPlusInitialization"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "num_to_sample"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_retries_per_sample"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "samples"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -18933,6 +19115,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NearestNeighbors"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "centers"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_distances"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "Neg"
   input_arg {
@@ -18998,6 +19203,34 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "NextIteration"
   input_arg {
@@ -22179,6 +22412,1246 @@ op {
     }
   }
 }
+op {
+  name: "QuantizedConv2DAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBias"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "QuantizedInstanceNorm"
   input_arg {
@@ -26745,6 +28218,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
@@ -34204,6 +35714,50 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -36691,6 +38245,15 @@ op {
     name: "element_dtype"
     type: "type"
   }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
 }
 op {
   name: "TensorListConcatLists"
@@ -36771,6 +38334,10 @@ op {
     name: "indices"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "element_dtype"
@@ -36790,6 +38357,10 @@ op {
     name: "index"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "item"
     type_attr: "element_dtype"
@@ -36816,6 +38387,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "output_handle"
     type: DT_VARIANT
@@ -36896,6 +38471,22 @@ op {
     }
   }
 }
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListScatter"
   input_arg {
@@ -36991,6 +38582,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "tensor"
     type_attr: "element_dtype"
@@ -38801,6 +40396,13 @@ op {
       }
     }
   }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
   is_stateful: true
 }
 op {
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index 65bdde375bf07f8a43d682dd6ff58bc89ef80f68..f54ed52ea295c296e184bd333c79fc7d31d4029c 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -29,21 +29,20 @@ namespace tensorflow {
 
 namespace {
 
-Status ValidateVariableResourceHandle(InferenceContext* c,
-                                      ShapeAndType* shape_and_type) {
+Status ValidateVariableResourceHandle(
+    InferenceContext* c, std::vector<ShapeAndType>* shape_and_type) {
   auto* handle_data = c->input_handle_shapes_and_types(0);
   if (handle_data == nullptr || handle_data->empty()) {
-    shape_and_type->shape = c->UnknownShape();
-    shape_and_type->dtype = DT_INVALID;
+    shape_and_type->emplace_back(c->UnknownShape(), DT_INVALID);
   } else {
-    *shape_and_type = (*handle_data)[0];
+    *shape_and_type = *handle_data;
     DataType value_dtype;
     TF_RETURN_IF_ERROR(c->GetAttr("dtype", &value_dtype));
-    if (shape_and_type->dtype != value_dtype) {
+    if (shape_and_type->at(0).dtype != value_dtype) {
       return errors::InvalidArgument(
           "Trying to read variable with wrong dtype. "
           "Expected ",
-          DataTypeString(shape_and_type->dtype), " got ",
+          DataTypeString(shape_and_type->at(0).dtype), " got ",
           DataTypeString(value_dtype));
     }
   }
@@ -51,9 +50,15 @@ Status ValidateVariableResourceHandle(InferenceContext* c,
 }
 
 Status ReadVariableShapeFn(InferenceContext* c) {
-  ShapeAndType shape_and_type;
+  std::vector<ShapeAndType> shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &shape_and_type));
-  c->set_output(0, shape_and_type.shape);
+  c->set_output(0, shape_and_type[0].shape);
+  if (shape_and_type[0].dtype == DT_VARIANT && shape_and_type.size() > 1) {
+    std::vector<ShapeAndType> variant_shape_and_type;
+    std::copy(shape_and_type.begin() + 1, shape_and_type.end(),
+              std::back_inserter(variant_shape_and_type));
+    c->set_output_handle_shapes_and_types(0, variant_shape_and_type);
+  }
   return Status::OK();
 }
 
@@ -180,13 +185,27 @@ REGISTER_OP("DestroyResourceOp")
     .SetShapeFn(shape_inference::NoOutputs);
 
 Status CreateAssignShapeFn(InferenceContext* c) {
-  ShapeAndType handle_shape_and_type;
+  std::vector<ShapeAndType> handle_shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
 
   ShapeHandle value_shape = c->input(1);
   ShapeHandle unused;
   TF_RETURN_IF_ERROR(
-      c->Merge(handle_shape_and_type.shape, value_shape, &unused));
+      c->Merge(handle_shape_and_type[0].shape, value_shape, &unused));
+
+  if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+      handle_shape_and_type.size() > 1 &&
+      c->input_handle_shapes_and_types(1) != nullptr) {
+    auto* value_handle_shape_and_type = c->input_handle_shapes_and_types(1);
+    if (value_handle_shape_and_type->size() !=
+        handle_shape_and_type.size() - 1) {
+      return errors::InvalidArgument(
+          "Incompatible handle variant shape_and_type size and input "
+          "shape_and_type size: ",
+          handle_shape_and_type.size() - 1, " vs. ",
+          value_handle_shape_and_type->size());
+    }
+  }
   return Status::OK();
 }
 
@@ -240,29 +259,37 @@ REGISTER_OP("ResourceGather")
     .Attr("dtype: type")
     .Attr("Tindices: {int32,int64}")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
+      std::vector<ShapeAndType> handle_shape_and_type;
       TF_RETURN_IF_ERROR(
           ValidateVariableResourceHandle(c, &handle_shape_and_type));
 
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(
-          c->WithRankAtLeast(handle_shape_and_type.shape, 1, &unused));
+          c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused));
       ShapeHandle params_subshape;
       TF_RETURN_IF_ERROR(
-          c->Subshape(handle_shape_and_type.shape, 1, &params_subshape));
+          c->Subshape(handle_shape_and_type[0].shape, 1, &params_subshape));
       ShapeHandle indices_shape = c->input(1);
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out));
       c->set_output(0, out);
+      if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+          !handle_shape_and_type.empty()) {
+        std::vector<ShapeAndType> variant_shape_and_type;
+        std::copy(handle_shape_and_type.begin() + 1,
+                  handle_shape_and_type.end(),
+                  std::back_inserter(variant_shape_and_type));
+        c->set_output_handle_shapes_and_types(0, variant_shape_and_type);
+      }
       return Status::OK();
     });
 
 namespace {
 
 Status ResourceScatterUpdateShape(InferenceContext* c) {
-  ShapeAndType handle_shape_and_type;
+  std::vector<ShapeAndType> handle_shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
-  ShapeHandle var_shape = handle_shape_and_type.shape;
+  ShapeHandle var_shape = handle_shape_and_type[0].shape;
   ShapeHandle indices_shape = c->input(1);
 
   ShapeHandle unused_updates_shape;
@@ -274,6 +301,19 @@ Status ResourceScatterUpdateShape(InferenceContext* c) {
       InferenceContext::Rank(c->input(2)) == 0
           ? Status::OK()
           : c->Merge(c->input(2), concat, &unused_updates_shape));
+  if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+      handle_shape_and_type.size() > 1 &&
+      c->input_handle_shapes_and_types(2) != nullptr) {
+    auto* value_handle_shape_and_type = c->input_handle_shapes_and_types(2);
+    if (value_handle_shape_and_type->size() !=
+        handle_shape_and_type.size() - 1) {
+      return errors::InvalidArgument(
+          "Incompatible handle variant shape_and_type size and input "
+          "shape_and_type size: ",
+          handle_shape_and_type.size() - 1, " vs. ",
+          value_handle_shape_and_type->size());
+    }
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index aa975cb77bafb3b31f0d612d0f662cef0bde06f2..d2bf033461ebdc99889bae5357704205e6172501 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -231,6 +231,15 @@ REGISTER_OP("ResourceScatterNdAdd")
     .Attr("use_locking: bool = true")
     .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
+REGISTER_OP("ResourceScatterNdSub")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
+
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/stateful_random_ops.cc b/tensorflow/core/ops/stateful_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c9277eda557bedf8f7e9e4368892ee46810d768
--- /dev/null
+++ b/tensorflow/core/ops/stateful_random_ops.cc
@@ -0,0 +1,36 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+Status StatefulRandomShape(shape_inference::InferenceContext* c) {
+  shape_inference::ShapeHandle out;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+  c->set_output(0, out);
+  return Status::OK();
+}
+
+REGISTER_OP("StatefulStandardNormal")
+    .Input("resource: resource")
+    .Input("shape: shape_dtype")
+    .Output("output: dtype")
+    .Attr("dtype: {half,bfloat16,float,double} = DT_FLOAT")
+    .Attr("shape_dtype: {int32, int64} = DT_INT64")
+    .SetShapeFn(StatefulRandomShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index fe2e0f5b1f6c1529e6f3d0ab448fb81210fdf89b..d1e5779f023d205bb9595e7dbd322eb9e7e73fe6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -310,6 +310,11 @@ class GcsRandomAccessFile : public RandomAccessFile {
   GcsRandomAccessFile(const string& filename, ReadFn read_fn)
       : filename_(filename), read_fn_(std::move(read_fn)) {}
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   /// The implementation of reads with an LRU block cache. Thread safe.
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
@@ -394,6 +399,10 @@ class GcsWritableFile : public WritableFile {
 
   Status Flush() override { return Sync(); }
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("GCSWritableFile does not support Name()");
+  }
+
   Status Sync() override {
     TF_RETURN_IF_ERROR(CheckWritable());
     if (!sync_needed_) {
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index bf057d876f7e85bf6db8706659c8960ff3bce0e1..dc9eb7796f76aa8eeb5137bd311f4a99940f1388 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -79,6 +79,10 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
 
+  StringPiece filename;
+  TF_EXPECT_OK(file->Name(&filename));
+  EXPECT_EQ(filename, "gs://bucket/random_access.txt");
+
   char scratch[6];
   StringPiece result;
 
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index 9b85cae9b90eabfd303ee465ac90e9121c7285cf..a8657359a3561d84b37a47a2696641e869ed567a 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -95,6 +95,11 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   if (!md) {
     return errors::Internal("Could not get a sha256 encryptor.");
   }
+
+  // EVP_MD_CTX_destroy is renamed to EVP_MD_CTX_free in OpenSSL 1.1.0 but
+  // the old name is still retained as a compatibility macro.
+  // Keep this around until support is dropped for OpenSSL 1.0
+  // https://www.openssl.org/news/cl110.txt
   std::unique_ptr<EVP_MD_CTX, std::function<void(EVP_MD_CTX*)>> md_ctx(
       EVP_MD_CTX_create(), [](EVP_MD_CTX* ptr) { EVP_MD_CTX_destroy(ptr); });
   if (!md_ctx) {
@@ -119,7 +124,6 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   if (EVP_DigestSignFinal(md_ctx.get(), sig.get(), &sig_len) != 1) {
     return errors::Internal("DigestFinal (signature compute) failed.");
   }
-  EVP_MD_CTX_cleanup(md_ctx.get());
   return Base64Encode(StringPiece(reinterpret_cast<char*>(sig.get()), sig_len),
                       signature);
 }
diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc
index 1cd0641cd3a7dd8376a365f243d63cbfc6b177c2..ce3b9d79c8b12c85a47b5ee6a773f9fadccb2127 100644
--- a/tensorflow/core/platform/cloud/oauth_client_test.cc
+++ b/tensorflow/core/platform/cloud/oauth_client_test.cc
@@ -166,7 +166,6 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) {
                 const_cast<unsigned char*>(
                     reinterpret_cast<const unsigned char*>(signature.data())),
                 signature.size()));
-  EVP_MD_CTX_cleanup(md_ctx);
 
   // Free all the crypto-related resources.
   EVP_PKEY_free(key);
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index f72a66669015d901f2d45dd03ccc8e1604f59489..9659edd890efe0a2c84da62b859162a1a3c2229c 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -150,6 +150,10 @@ class RetryingRandomAccessFile : public RandomAccessFile {
                            const RetryConfig& retry_config)
       : base_file_(std::move(base_file)), retry_config_(retry_config) {}
 
+  Status Name(StringPiece* result) const override {
+    return base_file_->Name(result);
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     return RetryingUtils::CallWithRetries(
@@ -187,6 +191,9 @@ class RetryingWritableFile : public WritableFile {
     return RetryingUtils::CallWithRetries(
         [this]() { return base_file_->Flush(); }, retry_config_);
   }
+  Status Name(StringPiece* result) const override {
+    return base_file_->Name(result);
+  }
   Status Sync() override {
     return RetryingUtils::CallWithRetries(
         [this]() { return base_file_->Sync(); }, retry_config_);
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index 2bc9d830aa92fa768f1e113f1f9433184e7ae684..8a0b865499befb1d984babbe9f8e9176625d0321 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -60,6 +60,9 @@ class MockCallSequence {
 class MockRandomAccessFile : public RandomAccessFile {
  public:
   explicit MockRandomAccessFile(const ExpectedCalls& calls) : calls_(calls) {}
+  Status Name(StringPiece* result) const override {
+    return calls_.ConsumeNextCall("Name");
+  }
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     return calls_.ConsumeNextCall("Read");
@@ -77,6 +80,9 @@ class MockWritableFile : public WritableFile {
   }
   Status Close() override { return calls_.ConsumeNextCall("Close"); }
   Status Flush() override { return calls_.ConsumeNextCall("Flush"); }
+  Status Name(StringPiece* result) const override {
+    return calls_.ConsumeNextCall("Name");
+  }
   Status Sync() override { return calls_.ConsumeNextCall("Sync"); }
   Status Tell(int64* position) override {
     return calls_.ConsumeNextCall("Tell");
@@ -177,7 +183,8 @@ class MockFileSystem : public FileSystem {
 
 TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
   // Configure the mock base random access file.
-  ExpectedCalls expected_file_calls({std::make_tuple("Read", Status::OK())});
+  ExpectedCalls expected_file_calls({std::make_tuple("Name", Status::OK()),
+                                     std::make_tuple("Read", Status::OK())});
   std::unique_ptr<RandomAccessFile> base_file(
       new MockRandomAccessFile(expected_file_calls));
 
@@ -196,6 +203,9 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
 
   // Use it and check the results.
   StringPiece result;
+  TF_EXPECT_OK(random_access_file->Name(&result));
+  EXPECT_EQ(result, "");
+
   char scratch[10];
   TF_EXPECT_OK(random_access_file->Read(0, 10, &result, scratch));
 }
@@ -287,7 +297,8 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_NoRetriesForSomeErrors) {
 
 TEST(RetryingFileSystemTest, NewWritableFile_ImmediateSuccess) {
   // Configure the mock base random access file.
-  ExpectedCalls expected_file_calls({std::make_tuple("Sync", Status::OK()),
+  ExpectedCalls expected_file_calls({std::make_tuple("Name", Status::OK()),
+                                     std::make_tuple("Sync", Status::OK()),
                                      std::make_tuple("Close", Status::OK())});
   std::unique_ptr<WritableFile> base_file(
       new MockWritableFile(expected_file_calls));
@@ -305,6 +316,10 @@ TEST(RetryingFileSystemTest, NewWritableFile_ImmediateSuccess) {
   std::unique_ptr<WritableFile> writable_file;
   TF_EXPECT_OK(fs.NewWritableFile("filename.txt", &writable_file));
 
+  StringPiece result;
+  TF_EXPECT_OK(writable_file->Name(&result));
+  EXPECT_EQ(result, "");
+
   // Use it and check the results.
   TF_EXPECT_OK(writable_file->Sync());
 }
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 2efe0c0876e871f6752bb3e7724de4c505102130..38fc453008fcc9b4d59e44591c42ad83df061e70 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -138,8 +138,16 @@ void InfoAboutUnusedCPUFeatures() {
 #endif  // __FMA__
 #endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
+#ifndef INTEL_MKL
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
+#else
+      LOG(INFO) << "This TensorFlow binary is optimized with Intel(R) MKL-DNN "
+                << "to use the following CPU instructions in performance "
+                << "critical operations: " << missing_instructions << std::endl
+                << "To enable them in non-MKL-DNN operations, rebuild "
+                << "TensorFlow with the appropriate compiler flags.";
+#endif
     }
   });
 }
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 769e28902521c64c0020caf08cfd97a948eaac10..e94900840333e12814d420f03e6755b7f3c092d8 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl_ml",
@@ -663,6 +664,8 @@ def tf_additional_cloud_op_deps():
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:api_version_2": [],
+        "//tensorflow:windows_and_api_version_2": [],
         "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
@@ -670,13 +673,15 @@ def tf_additional_cloud_op_deps():
         ],
     })
 
-# TODO(jart, jhseu): Delete when GCP is default on.
+# TODO(jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
     return select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:api_version_2": [],
+        "//tensorflow:windows_and_api_version_2": [],
         "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
@@ -731,6 +736,11 @@ def tf_additional_binary_deps():
             "//tensorflow/stream_executor:cuda_platform",
             "//tensorflow/core/platform/default/build_config:cuda",
         ],
+    ) + if_rocm(
+        [
+            "//tensorflow/stream_executor:rocm_platform",
+            "//tensorflow/core/platform/default/build_config:rocm",
+        ],
     ) + [
         # TODO(allenl): Split these out into their own shared objects (they are
         # here because they are shared between contrib/ op shared objects and
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index ee6936b372acf35c0568331d73615451b2675dd4..6faf5c512f393535d089e327925292167406a374 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -8,6 +8,7 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_rocm")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
@@ -32,16 +33,26 @@ cc_library(
 
 tf_cuda_library(
     name = "stream_executor",
+    cuda_deps = ["//tensorflow/stream_executor/cuda:cuda_activation"],
     deps = [
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/cuda:cuda_platform_id",
+        "//tensorflow/stream_executor/host:host_platform_id",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "//tensorflow/stream_executor/rocm:rocm_platform_id",
     ] + select({
-        "//tensorflow:using_cuda_clang": ["//tensorflow/stream_executor:cuda_platform"],
-        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor:cuda_platform"],
-        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
-        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
+        "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
     }) + select({
-        "@local_config_cuda//cuda:darwin": ["IOKit"],
+        "//tensorflow:using_cuda_clang": ["//tensorflow/stream_executor/cuda:all_runtime"],
+        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
+        "//tensorflow:using_rocm_hipcc": ["//tensorflow/stream_executor/rocm:all_runtime"],
         "//conditions:default": [],
     }),
 )
@@ -49,19 +60,41 @@ tf_cuda_library(
 cc_library(
     name = "stream_executor_cuda",
     deps = [
-        "//tensorflow/stream_executor",
+        ":stream_executor_no_cuda",
+        ":cuda",
     ] + if_static(
-        ["//tensorflow/stream_executor:cuda_platform"],
+        ["//tensorflow/stream_executor/cuda:all_runtime"],
     ) + select({
         "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
     }),
 )
 
+cc_library(
+    name = "stream_executor_rocm",
+    deps = [
+        ":stream_executor_no_cuda",
+        ":rocm",
+    ] + if_static(
+        ["//tensorflow/stream_executor/rocm:all_runtime"],
+    ) + select({
+        "//conditions:default": [],
+    }),
+)
+
 cc_library(
     name = "stream_executor_no_cuda",
     deps = [
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/cuda:cuda_platform_id",
+        "//tensorflow/stream_executor/host:host_platform",
+        "//tensorflow/stream_executor/host:host_platform_id",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "//tensorflow/stream_executor/rocm:rocm_platform_id",
     ],
 )
 
@@ -250,6 +283,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "rocm",
+    data = [],
+    linkopts = select({
+        "//conditions:default": [
+            "-Wl,-rpath,../local_config_rocm/rocm/rocm/lib",
+        ],
+    }),
+    deps = [],
+)
+
 cc_library(
     name = "sycl",
     data = if_ccpp([
@@ -275,8 +319,3 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
-
-alias(
-    name = "logger",
-    actual = "//tensorflow/core:default_logger",
-)
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 37475feebe2047f81ec60aea677cfcb0be73a08b..ab05b25d6822c12d82d14f6d5c4717d77c27f8e5 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -2,11 +2,19 @@
 # The functions in this file might be referred by tensorflow.bzl. They have to
 # be separate to avoid cyclic references.
 
+load("@local_config_remote_execution//:remote_execution.bzl", "gpu_test_tags")
+
 def tf_cuda_tests_tags():
-    return ["requires-gpu", "local", "gpu"]
+    return ["requires-gpu", "gpu"] + gpu_test_tags()
 
 def tf_sycl_tests_tags():
-    return ["requires-gpu", "local", "gpu"]
+    return ["requires-gpu", "gpu"] + gpu_test_tags()
+
+def tf_exec_compatible_with(kwargs):
+    if ("tags" in kwargs and kwargs["tags"] != None and
+        "remote-gpu" in kwargs["tags"]):
+        return ["@org_tensorflow//third_party/toolchains:gpu_test"]
+    return []
 
 def tf_additional_plugin_deps():
     return select({
diff --git a/tensorflow/core/platform/default/distribute.bzl b/tensorflow/core/platform/default/distribute.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..ea8fa8708e48efec42499400807d931a6cd11b10
--- /dev/null
+++ b/tensorflow/core/platform/default/distribute.bzl
@@ -0,0 +1,41 @@
+"""Build rules for tf.distritbute testing."""
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+def distribute_py_test(
+        name,
+        srcs = [],
+        deps = [],
+        tags = [],
+        data = [],
+        main = None,
+        args = [],
+        shard_count = 1,
+        full_precision = False,
+        **kwargs):
+    """Generates py_test targets for CPU and GPU.
+
+    Args:
+        name: test target name to generate suffixed with `test`.
+        srcs: source files for the tests.
+        deps: additional dependencies for the test targets.
+        tags: tags to be assigned to the different test targets.
+        data: data files that need to be associated with the target files.
+        main: optional main script.
+        args: arguments to the tests.
+        shard_count: number of shards to split the tests across.
+        **kwargs: extra keyword arguments to the test.
+    """
+
+    _ignore = (full_precision)
+    cuda_py_test(
+        name = name,
+        srcs = srcs,
+        data = data,
+        main = main,
+        additional_deps = deps,
+        shard_count = shard_count,
+        tags = tags,
+        args = args,
+        **kwargs
+    )
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index bb8735ed32505294eff75620006694a4eda80bcc..99dd6de14164e1f9abd915348ce288ed3238a650 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -240,8 +240,7 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
     if (TF_PREDICT_FALSE(v2 < 0)) {                                       \
       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
-    const size_t uval = (size_t)((unsigned)v1);                           \
-    return name##Impl<size_t, size_t>(uval, v2, exprtext);                \
+    return name##Impl<size_t, size_t>(v1, v2, exprtext);                  \
   }                                                                       \
   inline string* name##Impl(const int v1, const size_t v2,                \
                             const char* exprtext) {                       \
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
index 2708d6ebda41c01edd881e733b985e237aa3242a..aeef2d9b882c0a3e2624db2dd194345a373bfe0c 100644
--- a/tensorflow/core/platform/default/protobuf.h
+++ b/tensorflow/core/platform/default/protobuf.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "google/protobuf/descriptor.h"
 #include "google/protobuf/descriptor.pb.h"
 #include "google/protobuf/dynamic_message.h"
+#include "google/protobuf/io/tokenizer.h"
 #include "google/protobuf/text_format.h"
 #include "google/protobuf/util/json_util.h"
 #include "google/protobuf/util/type_resolver_util.h"
diff --git a/tensorflow/core/platform/default/stacktrace.h b/tensorflow/core/platform/default/stacktrace.h
index c8e297fa8d8c1ee48b060e6e2c7ee89eb0d23b39..b64bc15971037f204a40513cbf74cc7c944e08f2 100644
--- a/tensorflow/core/platform/default/stacktrace.h
+++ b/tensorflow/core/platform/default/stacktrace.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_STACKTRACE_H_
 
 #include "tensorflow/core/platform/platform.h"
-#if !defined(IS_MOBILE_PLATFORM) && defined(PLATFORM_POSIX) && \
-    (defined(__clang__) || defined(__GNUC__))
+#if !defined(IS_MOBILE_PLATFORM) && !defined(PLATFORM_WINDOWS) && \
+    defined(PLATFORM_POSIX) && (defined(__clang__) || defined(__GNUC__))
 #define TF_GENERATE_BACKTRACE
 #endif
 
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index afc4201e5382194b02b8b0f5cdebfc90688c9f00..59768bf92ae9e854f684623ec15c83a70839312d 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -29,6 +29,9 @@ limitations under the License.
 #include "tensorflow/core/platform/windows/wide_char.h"
 #define PATH_MAX MAX_PATH
 #else
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
 #include <unistd.h>
 #endif
 
@@ -314,7 +317,31 @@ string Env::GetExecutablePath() {
   string file_path = WideCharToUtf8(wc_file_path);
   std::copy(file_path.begin(), file_path.end(), exe_path);
 #else
-  CHECK_NE(-1, readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  char buf[PATH_MAX] = {0};
+  int path_length = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
+  CHECK_NE(-1, path_length);
+
+  if (strstr(buf, "python") != nullptr) {
+    // Discard the path of the python binary, and any flags.
+    int fd = open("/proc/self/cmdline", O_RDONLY);
+    int cmd_length = read(fd, buf, PATH_MAX - 1);
+    CHECK_NE(-1, cmd_length);
+    int token_pos = 0;
+    for (bool token_is_first_or_flag = true; token_is_first_or_flag;) {
+      // Get token length, including null
+      int token_len = strlen(&buf[token_pos]) + 1;
+      token_is_first_or_flag = false;
+      // Check if we can skip without overshooting
+      if (token_pos + token_len < cmd_length) {
+        token_pos += token_len;
+        token_is_first_or_flag = (buf[token_pos] == '-');  // token is a flag
+      }
+    }
+    snprintf(exe_path, sizeof(exe_path), "%s", &buf[token_pos]);
+  } else {
+    snprintf(exe_path, sizeof(exe_path), "%s", buf);
+  }
+
 #endif
   // Make sure it's null-terminated:
   exe_path[sizeof(exe_path) - 1] = 0;
@@ -338,22 +365,10 @@ bool Env::LocalTempFilename(string* filename) {
 }
 
 bool Env::CreateUniqueFileName(string* prefix, const string& suffix) {
-#ifdef __APPLE__
-  uint64_t tid64;
-  pthread_threadid_np(nullptr, &tid64);
-  int32 tid = static_cast<int32>(tid64);
-  int32 pid = static_cast<int32>(getpid());
-#elif defined(__FreeBSD__)
-  // Has to be casted to long first, else this error appears:
-  // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
-  // is not allowed
-  int32 tid = static_cast<int32>(static_cast<int64>(pthread_self()));
-  int32 pid = static_cast<int32>(getpid());
-#elif defined(PLATFORM_WINDOWS)
-  int32 tid = static_cast<int32>(GetCurrentThreadId());
+  int32 tid = GetCurrentThreadId();
+#ifdef PLATFORM_WINDOWS
   int32 pid = static_cast<int32>(GetCurrentProcessId());
 #else
-  int32 tid = static_cast<int32>(pthread_self());
   int32 pid = static_cast<int32>(getpid());
 #endif
   uint64 now_microsec = NowMicros();
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 1b5382841574e6b8843079ae9cb359c5c9b475d0..280076e098d5fdd121bf095d79be5353c0e2b57f 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -271,6 +271,15 @@ class Env {
                               const string& name,
                               std::function<void()> fn) TF_MUST_USE_RESULT = 0;
 
+  // Returns the thread id of calling thread.
+  // Posix: Returns pthread id which is only guaranteed to be unique within a
+  //        process.
+  // Windows: Returns thread id which is unique.
+  virtual int32 GetCurrentThreadId() = 0;
+
+  // Copies current thread name to "name". Returns true if success.
+  virtual bool GetCurrentThreadName(string* name) = 0;
+
   // \brief Schedules the given closure on a thread-pool.
   //
   // NOTE(mrry): This closure may block.
@@ -360,6 +369,10 @@ class EnvWrapper : public Env {
                       std::function<void()> fn) override {
     return target_->StartThread(thread_options, name, fn);
   }
+  int32 GetCurrentThreadId() override { return target_->GetCurrentThreadId(); }
+  bool GetCurrentThreadName(string* name) override {
+    return target_->GetCurrentThreadName(name);
+  }
   void SchedClosure(std::function<void()> closure) override {
     target_->SchedClosure(closure);
   }
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 2e1d4a263f643da6bf9d0600ffc2cb4469ca8d70..5a5d133c96c85384bc496ec1cb0b4fb099dc4d59 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -392,4 +392,18 @@ TEST_F(DefaultEnvTest, CreateUniqueFileName) {
   EXPECT_TRUE(str_util::EndsWith(filename, suffix));
 }
 
+TEST_F(DefaultEnvTest, GetThreadInformation) {
+  Env* env = Env::Default();
+
+  EXPECT_NE(env->GetCurrentThreadId(), 0);
+  string thread_name;
+  bool res = env->GetCurrentThreadName(&thread_name);
+#if defined(PLATFORM_WINDOWS) || defined(__ANDROID__)
+  EXPECT_FALSE(res);
+#else
+  EXPECT_TRUE(res);
+  EXPECT_GT(thread_name.size(), 0);
+#endif
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/env_time.h b/tensorflow/core/platform/env_time.h
index b4756ed209cf7f945a2cf4f1bea7271dded7518a..c12b6ba6fb86e7bda394b85fa449c8176c817054 100644
--- a/tensorflow/core/platform/env_time.h
+++ b/tensorflow/core/platform/env_time.h
@@ -25,6 +25,7 @@ namespace tensorflow {
 /// access timer related operations.
 class EnvTime {
  public:
+  static constexpr uint64 kMicrosToPicos = 1000ULL * 1000ULL;
   static constexpr uint64 kMicrosToNanos = 1000ULL;
   static constexpr uint64 kMillisToMicros = 1000ULL;
   static constexpr uint64 kMillisToNanos = 1000ULL * 1000ULL;
diff --git a/tensorflow/core/platform/fake_python_env_test.cc b/tensorflow/core/platform/fake_python_env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b521db3c054bff0e324a3b0571e0af7f47c269c4
--- /dev/null
+++ b/tensorflow/core/platform/fake_python_env_test.cc
@@ -0,0 +1,65 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file has "python" in its name. Thus, it should trigger the python
+// specific code paths.
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <iostream>
+#include <string>
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+int myargc;
+char** myargv;
+
+char kMagicBazelDirSubstring[] = ".runfiles/org_tensorflow";
+char kPythonFile[] =
+    "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py";
+
+namespace tensorflow {
+
+TEST(FakePythonEnvTest, GetExecutablePath) {
+  // See if argc is greater than 1 and first arg is kPythonFile
+  // If not, rerun the executable with proper args.
+  if (myargc <= 1 || strstr(myargv[1], kMagicBazelDirSubstring) == nullptr) {
+    const char* filename = myargv[0];
+    char* new_argv[] = {
+        myargv[0],
+        kPythonFile,
+        nullptr,
+    };
+
+    execv(filename, new_argv);
+  }
+
+  Env* env = Env::Default();
+  // We depend on the file/executable name to include python and fool the
+  // library to think this is running under the python interpreter.
+  string path = env->GetExecutablePath();
+  EXPECT_TRUE(strstr(path.c_str(), kMagicBazelDirSubstring) != nullptr);
+}
+
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  myargc = argc;
+  myargv = argv;
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 48ffa66358495686332da0eb29b22236bb87e454..a3a2b6c7f3c96f6ccfc8d3b4f81a4be123758bd4 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -234,6 +234,14 @@ class RandomAccessFile {
   RandomAccessFile() {}
   virtual ~RandomAccessFile();
 
+  /// \brief Returns the name of the file.
+  ///
+  /// This is an optional operation that may not be implemented by every
+  /// filesystem.
+  virtual Status Name(StringPiece* result) const {
+    return errors::Unimplemented("This filesystem does not support Name()");
+  }
+
   /// \brief Reads up to `n` bytes from the file starting at `offset`.
   ///
   /// `scratch[0..n-1]` may be written by this routine.  Sets `*result`
@@ -297,6 +305,14 @@ class WritableFile {
   /// persisted, depending on the implementation.
   virtual Status Flush() = 0;
 
+  // \brief Returns the name of the file.
+  ///
+  /// This is an optional operation that may not be implemented by every
+  /// filesystem.
+  virtual Status Name(StringPiece* result) const {
+    return errors::Unimplemented("This filesystem does not support Name()");
+  }
+
   /// \brief Syncs contents of file to filesystem.
   ///
   /// This waits for confirmation from the filesystem that the contents
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index ad4ed5cdd8d1c626b4522652629a5263b1e2ec6b..2cf1036cc898ca8afefcb01d622a41240ec7ca56 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -205,6 +205,11 @@ class HDFSRandomAccessFile : public RandomAccessFile {
     }
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Status s;
@@ -310,6 +315,11 @@ class HDFSWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Sync() override {
     if (hdfs_->hdfsHSync(fs_, file_) != 0) {
       return IOError(filename_, errno);
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index d29667944ab5350cd10f1468535bc442ac24e603..b9e8f28739891868f11aa21ec7c48e93afe2b1c5 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -75,9 +75,12 @@ TEST_F(HadoopFileSystemTest, RandomAccessFile) {
   std::unique_ptr<RandomAccessFile> reader;
   TF_EXPECT_OK(hdfs.NewRandomAccessFile(fname, &reader));
 
+  StringPiece result;
+  TF_EXPECT_OK(reader->Name(&result));
+  EXPECT_EQ(result, fname);
+
   string got;
   got.resize(content.size());
-  StringPiece result;
   TF_EXPECT_OK(
       reader->Read(0, content.size(), &result, gtl::string_as_array(&got)));
   EXPECT_EQ(content.size(), result.size());
@@ -94,6 +97,9 @@ TEST_F(HadoopFileSystemTest, WritableFile) {
   std::unique_ptr<WritableFile> writer;
   const string fname = TmpDir("WritableFile");
   TF_EXPECT_OK(hdfs.NewWritableFile(fname, &writer));
+  StringPiece result;
+  TF_EXPECT_OK(writer->Name(&result));
+  EXPECT_EQ(result, fname);
   TF_EXPECT_OK(writer->Append("content1,"));
   int64 pos;
   TF_EXPECT_OK(writer->Tell(&pos));
diff --git a/tensorflow/core/platform/default/logger.cc b/tensorflow/core/platform/logger.cc
similarity index 72%
rename from tensorflow/core/platform/default/logger.cc
rename to tensorflow/core/platform/logger.cc
index 54b1a1a67ca7da65aa6897e6461ebe9b54fb4767..f5a961e4d318529ca00846d4e6647b20a5232568 100644
--- a/tensorflow/core/platform/default/logger.cc
+++ b/tensorflow/core/platform/logger.cc
@@ -18,17 +18,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
 
-Logger* Logger::Singleton() {
-  class DefaultLogger : public Logger {
-   private:
-    void DoLogProto(google::protobuf::Any* proto) override {
-      VLOG(2) << proto->ShortDebugString();
-    }
-    void DoFlush() override {}
-  };
-  static Logger* instance = new DefaultLogger();
-  return instance;
-}
+class DefaultLogger : public Logger {
+ private:
+  void DoLogProto(google::protobuf::Any* proto) override {
+    VLOG(2) << proto->ShortDebugString();
+  }
+  void DoFlush() override {}
+};
+
+}  // namespace
+
+Logger::FactoryFunc Logger::singleton_factory_ = []() -> Logger* {
+  return new DefaultLogger();
+};
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/logger.h b/tensorflow/core/platform/logger.h
index 5d304bea63a7c78e4a90d78ea2be4ce01caa802d..f0bfef4f2d9ebce091917cd85cf6e5e903ab52a4 100644
--- a/tensorflow/core/platform/logger.h
+++ b/tensorflow/core/platform/logger.h
@@ -26,7 +26,22 @@ namespace tensorflow {
 // log anything to a non-local place, e.g. a database.
 class Logger {
  public:
-  static Logger* Singleton();
+  // The singleton is supposed to be used in the following steps:
+  // * At program start time, REGISTER_MOUDLE_INITIALIZER calls
+  //   SetSingletonFactory.
+  // * At some point in the program execution, Singleton() is called for the
+  //   first time, initializing the logger.
+  // * Succeeding calls to Singleton() return the initiailized logger.
+  using FactoryFunc = Logger* (*)();
+
+  static void SetSingletonFactory(FactoryFunc factory) {
+    singleton_factory_ = factory;
+  }
+
+  static Logger* Singleton() {
+    static Logger* instance = singleton_factory_();
+    return instance;
+  }
 
   virtual ~Logger() = default;
 
@@ -44,6 +59,8 @@ class Logger {
  private:
   virtual void DoLogProto(google::protobuf::Any* proto) = 0;
   virtual void DoFlush() = 0;
+
+  static FactoryFunc singleton_factory_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 0a939aef25236dc33e2be8ec1d76f9ea0075e350..4e9373616e343d5e82077801b7658a846d89f5d3 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -86,6 +86,35 @@ class PosixEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  int32 GetCurrentThreadId() {
+#ifdef __APPLE__
+    uint64_t tid64;
+    pthread_threadid_np(nullptr, &tid64);
+    return static_cast<int32>(tid64);
+#elif defined(__FreeBSD__)
+    // Has to be casted to long first, else this error appears:
+    // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
+    // is not allowed
+    return static_cast<int32>(static_cast<int64>(pthread_self()));
+#else
+    return static_cast<int32>(pthread_self());
+#endif
+  }
+
+  bool GetCurrentThreadName(string* name) {
+#ifdef __ANDROID__
+    return false;
+#else
+    char buf[100];
+    int res = pthread_getname_np(pthread_self(), buf, static_cast<size_t>(100));
+    if (res != 0) {
+      return false;
+    }
+    *name = buf;
+    return true;
+#endif
+  }
+
   void SchedClosure(std::function<void()> closure) override {
     // TODO(b/27290852): Spawning a new thread here is wasteful, but
     // needed to deal with the fact that many `closure` functions are
@@ -121,13 +150,25 @@ class PosixEnv : public Env {
 
   string GetRunfilesDir() override {
     string bin_path = this->GetExecutablePath();
-    string runfiles_path = bin_path + ".runfiles/org_tensorflow";
+    string runfiles_suffix = ".runfiles/org_tensorflow";
+    std::size_t pos = bin_path.find(runfiles_suffix);
+
+    // Sometimes (when executing under python) bin_path returns the full path to
+    // the python scripts under runfiles. Get the substring.
+    if (pos != std::string::npos) {
+      return bin_path.substr(0, pos + runfiles_suffix.length());
+    }
+
+    // See if we have the executable path. if executable.runfiles exists, return
+    // that folder.
+    string runfiles_path = bin_path + runfiles_suffix;
     Status s = this->IsDirectory(runfiles_path);
     if (s.ok()) {
       return runfiles_path;
-    } else {
-      return bin_path.substr(0, bin_path.find_last_of("/\\"));
     }
+
+    // If nothing can be found, return something close.
+    return bin_path.substr(0, bin_path.find_last_of("/\\"));
   }
 
  private:
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 2f59940ef311c9d468986125415e343d302d7f2c..003ab170fe8db2980bb9c7ad79bf90b523e36b76 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -52,6 +52,11 @@ class PosixRandomAccessFile : public RandomAccessFile {
       : filename_(fname), fd_(fd) {}
   ~PosixRandomAccessFile() override { close(fd_); }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Status s;
@@ -115,6 +120,11 @@ class PosixWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Sync() override {
     Status s;
     if (fflush(file_) != 0) {
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index 41184b6fd9ed12c0164f06e2c92816b2c99a03f7..7bc4d80db5b0ab31540f5c95d91ad29239458bce 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -14,7 +14,7 @@ load(
 )
 
 tf_cc_binary(
-    name = "s3_file_system.so",
+    name = "libs3_file_system_shared.so",
     srcs = [
         "aws_crypto.cc",
         "aws_crypto.h",
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index e0b8e377453393429a3e5853b1aa2ce871334bff..0ff65fb6b38a9e64cc9c0778c483922c0e7d6bfc 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -170,6 +170,10 @@ class S3RandomAccessFile : public RandomAccessFile {
                      std::shared_ptr<Aws::S3::S3Client> s3_client)
       : bucket_(bucket), object_(object), s3_client_(s3_client) {}
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("S3RandomAccessFile does not support Name()");
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Aws::S3::Model::GetObjectRequest getObjectRequest;
@@ -235,6 +239,10 @@ class S3WritableFile : public WritableFile {
 
   Status Flush() override { return Sync(); }
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("S3WritableFile does not support Name()");
+  }
+
   Status Sync() override {
     if (!outfile_) {
       return errors::FailedPrecondition(
diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc
index 592626bfa17e691d1b10ddce5c7f0f31ed825861..5573b2fc93f8b28777e78ad50d423ecb57409821 100644
--- a/tensorflow/core/platform/setround.cc
+++ b/tensorflow/core/platform/setround.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/setround.h"
 
+#include <cfenv>  // NOLINT
+
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index 0a590b3d40c0dbf007feee07fc93be4838924679..437e8a1c95632af71c3f2db2c4b35cfb48849b8a 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -18,11 +18,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/platform.h"
 
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/dso_loader.h"
-#else
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.h"
@@ -31,6 +26,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/stream_executor_no_cuda.h b/tensorflow/core/platform/stream_executor_no_cuda.h
index 50a5e732c0ec222d3ee2329a57fc6ea9ac4b233c..129ee6c7a7503b680e90ccc68e39a3c838bb0e65 100644
--- a/tensorflow/core/platform/stream_executor_no_cuda.h
+++ b/tensorflow/core/platform/stream_executor_no_cuda.h
@@ -18,11 +18,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/platform.h"
 
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/dso_loader.h"
-#else
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.h"
@@ -31,6 +26,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 77ce2026d9d2cdda7ef1ea0ad6bb71050a6467af..e0e3dda7055b5cbe8f0e08be4a251232b8005fd2 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -84,6 +84,12 @@ class WindowsEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  int32 GetCurrentThreadId() override {
+    return static_cast<int32>(::GetCurrentThreadId());
+  }
+
+  bool GetCurrentThreadName(string* name) override { return false; }
+
   static VOID CALLBACK SchedClosureCallback(PTP_CALLBACK_INSTANCE Instance,
                                             PVOID Context, PTP_WORK Work) {
     CloseThreadpoolWork(Work);
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 993b9906b1c072cb48c816855fb2fc1498ae3f40..8580c3a3efb6807c3d96650f6809a8b9b54b0e89 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -112,6 +112,11 @@ class WindowsRandomAccessFile : public RandomAccessFile {
     }
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Status s;
@@ -189,6 +194,11 @@ class WindowsWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Sync() override { return Flush(); }
 };
 
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
index 7f2cd3f698c860f16cd7b027b5ff7c8e24338cf0..38a8e0285118aa664f22ba866edd59bf8ffdbcde 100644
--- a/tensorflow/core/profiler/g3doc/options.md
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -54,10 +54,10 @@ cpu_micros: This is the cpu times.
 
 ### Memory
 
-Tensor memory are usually ref-counted. The memory is released when there is
-no more reference to it. It will be difficult to track the release of memory.
+Tensor memory are usually ref-counted. The memory is released when there is no
+more reference to it. It will be difficult to track the release of memory.
 Currently, profiler only tracks the allocation of memory. As a result, the
-accumulated memory request is uaually larger than the peak memory of the overall
+accumulated memory request is usually larger than the peak memory of the overall
 model.
 
 It's recommended to generate timeline to see the allocator memory usage over
diff --git a/tensorflow/core/profiler/g3doc/profile_memory.md b/tensorflow/core/profiler/g3doc/profile_memory.md
index 6eda5abdd973ece435855b0952a5edd4a86b8217..03229e497f3bc150c6258c27d87c3be621ef7065 100644
--- a/tensorflow/core/profiler/g3doc/profile_memory.md
+++ b/tensorflow/core/profiler/g3doc/profile_memory.md
@@ -14,10 +14,7 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 ******************************************************
 ```
 
-<left>
 ![Timeline](graph_timeline.png)
-</left>
-
 
 ```python
 # You can also visualize the memory information through other methods.
@@ -77,4 +74,4 @@ _TFProfRoot (--/74148.60MB)
                   seq2seq_attention_model.py:320:_add_train_op:tf.summary.scalar... (0B/64B)
                 seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0B/25216.74MB)
                   seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0B/21542.55MB)
-```
\ No newline at end of file
+```
diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0320ae1a6812e5ae5e08c2ac6817863c0bf1ddce
--- /dev/null
+++ b/tensorflow/core/profiler/lib/BUILD
@@ -0,0 +1,42 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "eager_profiler",
+    srcs = [
+        "eager_profiler.cc",
+    ],
+    hdrs = [
+        "eager_profiler.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/contrib/tpu/profiler:trace_events_proto_cc",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu_lib",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core:session_options",
+            "//tensorflow/core:device_tracer",
+        ],
+    }),
+)
diff --git a/tensorflow/core/profiler/lib/eager_profiler.cc b/tensorflow/core/profiler/lib/eager_profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9293e7aba9504af37034e8481ca6d42d17cc91a6
--- /dev/null
+++ b/tensorflow/core/profiler/lib/eager_profiler.cc
@@ -0,0 +1,162 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/lib/eager_profiler.h"
+#include <string>
+#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/platform/device_tracer.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
+                                    tpu::Trace* trace,
+                                    const uint64 profile_start_time_micros) {
+  auto trace_devices = trace->mutable_devices();
+  // TODO(fishx): use a lighter representation instead of GraphDef to insert
+  // python information into trace event.
+
+  for (size_t device_id = 0;
+       device_id < run_metadata->step_stats().dev_stats_size(); ++device_id) {
+    // Create device
+    auto* device_stats =
+        run_metadata->mutable_step_stats()->mutable_dev_stats(device_id);
+    tensorflow::tpu::Device device;
+    device.set_name(device_stats->device());
+    device.set_device_id(device_id);
+    tensorflow::tpu::Resource resource;
+    resource.set_name("0");
+    resource.set_resource_id(0);
+    (*device.mutable_resources())[0] = resource;
+    (*trace_devices)[device_id] = device;
+
+    // Emit events.
+    for (auto node :
+         run_metadata->step_stats().dev_stats(device_id).node_stats()) {
+      auto* event = trace->add_trace_events();
+      auto* args = event->mutable_args();
+      event->set_device_id(device_id);
+      event->set_resource_id(0);
+      event->set_name(node.node_name());
+      event->set_timestamp_ps(
+          (node.all_start_micros() - profile_start_time_micros) *
+          EnvTime::kMicrosToPicos);
+      event->set_duration_ps(node.all_end_rel_micros() *
+                             EnvTime::kMicrosToPicos);
+      (*args)["label"] = node.timeline_label();
+    }
+  }
+
+  // TODO(fishx): Convert allocation data as well.
+}
+
+}  // namespace
+
+/*static*/ std::unique_ptr<EagerProfiler> EagerProfiler::Create(
+    EagerContext* const context) {
+  return absl::WrapUnique(new EagerProfiler(context));
+}
+
+void EagerProfiler::BeforeClearRunMetadata() {
+  mutex_lock l(mutex_);
+  run_metadata_.MergeFrom(*context_->RunMetadataProto());
+}
+
+Status EagerProfiler::Status() {
+  mutex_lock l(mutex_);
+  return status_;
+}
+
+Status EagerProfiler::SerializeToString(string* content) {
+  mutex_lock l(mutex_);
+  if (!status_.ok()) return status_;
+  Stop();
+
+  // Get profiling data from device tracer
+  if (device_tracer_ != nullptr) {
+    std::unique_ptr<StepStatsCollector> step_stats_collector(
+        new StepStatsCollector(run_metadata_.mutable_step_stats()));
+    tensorflow::Status s = device_tracer_->Collect(step_stats_collector.get());
+    if (!s.ok()) {
+      device_tracer_.reset(nullptr);
+      LOG(WARNING) << "Failed to collect data from device tracer. "
+                   << s.error_message();
+    }
+    step_stats_collector->Finalize();
+  }
+
+  tpu::Trace trace;
+
+  ConvertRunMetadataToTraceEvent(&run_metadata_, &trace, start_time_micros_);
+
+  trace.SerializeToString(content);
+  return Status::OK();
+}
+
+EagerProfiler::EagerProfiler(EagerContext* const context)
+    : context_(context),
+      start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+  LOG(INFO) << "Eager Profiler started.";
+
+  status_ = context_->RegisterRunMetadataListener(this);
+  if (!status_.ok()) {
+    context_ = nullptr;
+    LOG(WARNING)
+        << "Eager Profiler failed to start. Another profiler is running.";
+    return;
+  }
+
+  // TODO(fishx): Allow user disable device tracer.
+  device_tracer_ = CreateDeviceTracer();
+  if (!device_tracer_) {
+    LOG(WARNING) << "Continue profiling without device tracer. "
+                 << "Failed to create device tracer.";
+    return;
+  }
+  class Status s = device_tracer_->Start();
+  if (!s.ok()) {
+    device_tracer_.reset(nullptr);
+    LOG(WARNING) << "Continue profiling without device tracer. "
+                 << s.error_message();
+  }
+}
+
+EagerProfiler::~EagerProfiler() { Stop(); }
+
+void EagerProfiler::Stop() {
+  if (context_ != nullptr) {
+    context_->ClearRunMetadataListener();
+    run_metadata_.MergeFrom(*context_->RunMetadataProto());
+    context_ = nullptr;
+    if (device_tracer_ != nullptr) {
+      tensorflow::Status s = device_tracer_->Stop();
+      if (!s.ok()) {
+        device_tracer_.reset(nullptr);
+        LOG(WARNING) << "Failed to stop device tracer. " << s.error_message();
+      }
+    }
+    LOG(INFO) << "Eager Profiler ended with status:" << status_;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/eager_profiler.h b/tensorflow/core/profiler/lib/eager_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cdb76ff36f751206f0cc814a75f2a5e31266890
--- /dev/null
+++ b/tensorflow/core/profiler/lib/eager_profiler.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/device_tracer.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+// A profiler which will start profiling when creating the object and will stop
+// when either the object is destroyed or SerializedToString is called. It will
+// profile all operations run under the given EagerContext.
+// Multiple instances of it can be created, but at most one of them will profile
+// for each EagerContext. Status() will return OK only for the instance that is
+// profiling.
+// Thread-safety: TFE_Profiler is thread-safe.
+class EagerProfiler : RunMetadataListener {
+ public:
+  // Creates and EagerProfiler and starts profiling.
+  static std::unique_ptr<EagerProfiler> Create(EagerContext* const context);
+
+  // Deletes an exsiting Profiler and enables starting a new one.
+  ~EagerProfiler() override;
+
+  void BeforeClearRunMetadata() override LOCKS_EXCLUDED(mutex_)
+      EXCLUSIVE_LOCKS_REQUIRED(context_->MetadataMu());
+  tensorflow::Status Status() LOCKS_EXCLUDED(mutex_);
+
+  tensorflow::Status SerializeToString(string* content) LOCKS_EXCLUDED(mutex_);
+
+ private:
+  // Constructs an instance of the class and starts profiling
+  explicit EagerProfiler(EagerContext* const context);
+
+  // Profiler is neither copyable or movable.
+  EagerProfiler(const EagerProfiler&) = delete;
+  EagerProfiler& operator=(const EagerProfiler&) = delete;
+
+  void Stop() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  RunMetadata run_metadata_ GUARDED_BY(mutex_);
+  tensorflow::Status status_ GUARDED_BY(mutex_);
+  std::unique_ptr<DeviceTracer> device_tracer_ GUARDED_BY(mutex_);
+  EagerContext* context_ GUARDED_BY(mutex_);
+  const uint64 start_time_micros_;
+  mutex mutex_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..83ec75dfb4175595bf402969e9832f68e867dcfd
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/BUILD
@@ -0,0 +1,35 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+
+tf_cuda_library(
+    name = "profiler_service_impl",
+    srcs = ["profiler_service_impl.cc"],
+    hdrs = ["profiler_service_impl.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:grpc++",
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_proto_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler/lib:eager_profiler",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_library(
+    name = "profiler_server",
+    srcs = ["profiler_server.cc"],
+    hdrs = ["profiler_server.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":profiler_service_impl",
+        "//tensorflow:grpc++",
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_proto_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/common_runtime/eager:context",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/profiler/rpc/profiler_server.cc b/tensorflow/core/profiler/rpc/profiler_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08affffb2d592f137cd06e142124da26ddf9c954
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_server.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/rpc/profiler_server.h"
+#include <memory>
+#include <utility>
+#include "grpcpp/grpcpp.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+
+std::unique_ptr<Thread> StartProfilerServer(EagerContext* const eager_context,
+                                            int32 port) {
+  return WrapUnique(eager_context->TFEnv()->StartThread(
+      {}, "profiler server", [eager_context, port]() {
+        string server_address = strings::StrCat("0.0.0.0:", port);
+        std::unique_ptr<TPUProfiler::Service> service =
+            CreateProfilerService(eager_context);
+        ::grpc::ServerBuilder builder;
+        builder.AddListeningPort(server_address,
+                                 ::grpc::InsecureServerCredentials());
+        builder.RegisterService(service.get());
+        std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+        LOG(INFO) << "Profiling Server listening on " << server_address;
+        server->Wait();
+      }));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_server.h b/tensorflow/core/profiler/rpc/profiler_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebefdd6cb08289e8a43b0f1525f3176657530095
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_server.h
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+
+namespace tensorflow {
+std::unique_ptr<Thread> StartProfilerServer(EagerContext* const eager_context,
+                                            int32 port);
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bde2ff2f5f5537f655f7a2ce1380e786bd4a8628
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
+#include "grpcpp/support/status.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/profiler/lib/eager_profiler.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+// TODO(fishx): Rename TPUProfiler to something more generic.
+class ProfilerServiceImpl : public TPUProfiler::Service {
+ public:
+  explicit ProfilerServiceImpl(EagerContext* const eager_context)
+      : eager_context_(eager_context) {}
+  ~ProfilerServiceImpl() override {}
+
+  ::grpc::Status Monitor(::grpc::ServerContext* ctx, const MonitorRequest* req,
+                         MonitorResponse* response) override {
+    return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "unimplemented.");
+  }
+
+  ::grpc::Status Profile(::grpc::ServerContext* ctx, const ProfileRequest* req,
+                         ProfileResponse* response) override {
+    LOG(INFO) << "Received a profile request.";
+    std::unique_ptr<EagerProfiler> profiler =
+        EagerProfiler::Create(eager_context_);
+    if (!profiler->Status().ok()) {
+      return ::grpc::Status(::grpc::StatusCode::INTERNAL,
+                            profiler->Status().error_message());
+    }
+
+    Env* env = eager_context_->TFEnv();
+    for (size_t i = 0; i < req->duration_ms(); ++i) {
+      env->SleepForMicroseconds(1000);
+      if (ctx->IsCancelled()) {
+        return ::grpc::Status::CANCELLED;
+      }
+    }
+
+    Status s = profiler->SerializeToString(response->mutable_encoded_trace());
+    if (!s.ok()) {
+      return ::grpc::Status(::grpc::StatusCode::INTERNAL, s.error_message());
+    }
+
+    return ::grpc::Status::OK;
+  }
+
+ private:
+  EagerContext* const eager_context_;
+};
+}  // namespace
+
+std::unique_ptr<TPUProfiler::Service> CreateProfilerService(
+    EagerContext* const eager_context) {
+  return MakeUnique<ProfilerServiceImpl>(eager_context);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.h b/tensorflow/core/profiler/rpc/profiler_service_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..79dc767eb10ea2f755ebd0263afb0d8447025b1d
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.h
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
+
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/server_context.h"
+#include "grpcpp/support/status.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/profiler/lib/eager_profiler.h"
+
+namespace tensorflow {
+std::unique_ptr<TPUProfiler::Service> CreateProfilerService(
+    EagerContext* const eager_context);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/profiler/tfprof_options.cc b/tensorflow/core/profiler/tfprof_options.cc
index 9e5ef0a0a31600e12e76cb8f5f3e5a1c6f62a3d5..faca22c425b91553f67e7ffdfda14a044295b17f 100644
--- a/tensorflow/core/profiler/tfprof_options.cc
+++ b/tensorflow/core/profiler/tfprof_options.cc
@@ -96,7 +96,7 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
   for (const string& kv_str : kv_split) {
     const std::vector<string> kv =
         str_util::Split(kv_str, "=", str_util::SkipEmpty());
-    if (kv.size() != 2) {
+    if (kv.size() < 2) {
       return tensorflow::Status(
           tensorflow::error::INVALID_ARGUMENT,
           "Visualize format: -output timeline:key=value,key=value,...");
@@ -107,7 +107,8 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
           strings::Printf("Unrecognized options %s for output_type: %s\n",
                           kv[0].c_str(), output_type->c_str()));
     }
-    (*output_options)[kv[0]] = kv[1];
+    const std::vector<string> kv_without_key(kv.begin() + 1, kv.end());
+    (*output_options)[kv[0]] = str_util::Join(kv_without_key, "=");
   }
 
   for (const string& opt : required_options) {
diff --git a/tensorflow/core/protobuf/checkpointable_object_graph.proto b/tensorflow/core/protobuf/checkpointable_object_graph.proto
index 651f692f6d7b6d677b480a007f9ffe5c814beec3..f2956404b5e0d384f8fcec391ac0ac6c8b583a5e 100644
--- a/tensorflow/core/protobuf/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/checkpointable_object_graph.proto
@@ -30,6 +30,10 @@ message CheckpointableObjectGraph {
       string full_name = 2;
       // The generated name of the Tensor in the checkpoint.
       string checkpoint_key = 3;
+      // Whether checkpoints should be considered as matching even without this
+      // value restored. Used for non-critical values which don't affect the
+      // TensorFlow graph, such as layer configurations.
+      bool optional_restore = 4;
     }
 
     message SlotVariableReference {
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index b3dc5dccc02737202f9f5ced78471f332efd2eba..a2cc1bc9353bf434438ec9d21ff3995e0806f1d0 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -425,6 +425,10 @@ message ConfigProto {
     // use NUMA affinity where applicable.  One consequence will be the
     // existence of as many CPU devices as there are available NUMA nodes.
     bool use_numa_affinity = 5;
+
+    // If true, make collective op execution order sequential and deterministic
+    // for potentially concurrent collective instances.
+    bool collective_deterministic_sequential_execution = 6;
   };
 
   Experimental experimental = 16;
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 515d673828e3792ac6f4268fd55b58e43aab509b..17e76c4d03d3b57fbb2b452252c4dd8b9e6d99bc 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -5,9 +5,10 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+// add go_package externally with copybara
 
 import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/protobuf/verifier_config.proto";
 
 message AutoParallelOptions {
   bool enable = 1;
@@ -166,4 +167,11 @@ message RewriterConfig {
 
   // list of CustomGraphOptimizers to apply.
   repeated CustomGraphOptimizer custom_optimizers = 200;
+
+  // VerifierConfig specifying the verifiers to be run after every optimizer.
+  VerifierConfig inter_optimizer_verifier_config = 300;
+
+  // VerifierConfig specifying the verifiers to be run at the end, after all
+  // optimizers have run.
+  VerifierConfig post_optimization_verifier_config = 301;
 }
diff --git a/tensorflow/core/protobuf/verifier_config.proto b/tensorflow/core/protobuf/verifier_config.proto
new file mode 100644
index 0000000000000000000000000000000000000000..207f0f2a974cbc58413490380edf3795c7206aba
--- /dev/null
+++ b/tensorflow/core/protobuf/verifier_config.proto
@@ -0,0 +1,26 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "VerifierConfigProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+// add go_package externally with copybara
+
+// The config for graph verifiers.
+message VerifierConfig {
+  enum Toggle {
+    DEFAULT = 0;
+    ON = 1;
+    OFF = 2;
+  }
+
+  // Deadline for completion of all verification i.e. all the Toggle ON
+  // verifiers must complete execution within this time.
+  int64 verification_timeout_in_ms = 1;
+
+  // Perform structural validation on a tensorflow graph. Default is OFF.
+  Toggle structure_verifier = 2;
+
+  // Next tag: 3
+}
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 74058c846530bc2b4577d18034d02ed002d8983f..4284dd119edf3167915942c6458827ebb7191ad5 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -535,6 +535,7 @@ message CompleteInstanceRequest {
 message CompleteInstanceResponse {
   int32 instance_key = 1;
   int32 source_rank = 2;
+  bytes communicator_key = 3;
 }
 
 // Request for next agreed-upon step_id for the specified graph_keys.
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/core/summary/BUILD
similarity index 98%
rename from tensorflow/contrib/tensorboard/db/BUILD
rename to tensorflow/core/summary/BUILD
index 6507546ee9f81108add181a9c83064c9860005e2..a89175cdb1db2ff1184d8da26bc180d578faaf69 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/core/summary/BUILD
@@ -1,5 +1,5 @@
 # Description:
-#   TensorBoard database code.
+#   C++ implementation code for the summary writing APIs.
 
 package(default_visibility = ["//tensorflow:internal"])
 
diff --git a/tensorflow/contrib/tensorboard/db/loader.cc b/tensorflow/core/summary/loader.cc
similarity index 97%
rename from tensorflow/contrib/tensorboard/db/loader.cc
rename to tensorflow/core/summary/loader.cc
index 6439328022329cbc56d767e787ec9d6797045768..68535feacfae6d8c9edf6b0725fe4d4c8d63bf60 100644
--- a/tensorflow/contrib/tensorboard/db/loader.cc
+++ b/tensorflow/core/summary/loader.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <iostream>
 #include <vector>
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/schema.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/platform/init_main.h"
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/core/summary/schema.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/schema.cc
rename to tensorflow/core/summary/schema.cc
index 3c7bc87e4a2dbeadef2b9589d58c845204049123..822e2fa3bfdaf2be5f03704fc83d39f0e00369d3 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/core/summary/schema.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/contrib/tensorboard/db/schema.h b/tensorflow/core/summary/schema.h
similarity index 87%
rename from tensorflow/contrib/tensorboard/db/schema.h
rename to tensorflow/core/summary/schema.h
index 3da450422523dbe4304446869a38d43981d76eb5..6305f8eabd7cacb9dca8922b694e92ca4596d777 100644
--- a/tensorflow/contrib/tensorboard/db/schema.h
+++ b/tensorflow/core/summary/schema.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
+#define TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/db/sqlite.h"
@@ -30,4 +30,4 @@ Status SetupTensorboardSqliteDb(Sqlite* db);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
diff --git a/tensorflow/contrib/tensorboard/db/schema_test.cc b/tensorflow/core/summary/schema_test.cc
similarity index 95%
rename from tensorflow/contrib/tensorboard/db/schema_test.cc
rename to tensorflow/core/summary/schema_test.cc
index 4d3f2880bd02682ad00a90760f2a4478f1e6b2a2..fa21b45b62cca2b116010de87a2dc2bae5cbe866 100644
--- a/tensorflow/contrib/tensorboard/db/schema_test.cc
+++ b/tensorflow/core/summary/schema_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 
 #include <memory>
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.cc b/tensorflow/core/summary/summary_converter.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_converter.cc
rename to tensorflow/core/summary/summary_converter.cc
index 93c1183072b4d791843e740f970234ba52857463..e6e34e9602fa8cc3ed91d773d1d4cbec0d0c5232 100644
--- a/tensorflow/contrib/tensorboard/db/summary_converter.cc
+++ b/tensorflow/core/summary/summary_converter.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/summary.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.h b/tensorflow/core/summary/summary_converter.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_converter.h
rename to tensorflow/core/summary/summary_converter.h
index 329c7f9f2f9fe25cdff8d5ac2e52c25362f624c2..dc005d2604ff1687e765341ebdb9e86c62c78f3a 100644
--- a/tensorflow/contrib/tensorboard/db/summary_converter.h
+++ b/tensorflow/core/summary/summary_converter.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -35,4 +35,4 @@ Status AddTensorAsAudioToSummary(const Tensor& tensor, const string& tag,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/core/summary/summary_db_writer.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer.cc
rename to tensorflow/core/summary/summary_db_writer.cc
index cfdc884277a025aa11995d329389f3748b17490c..b203d439ccf82b36b3d0e1bdd958fdcfac87f4b0 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/core/summary/summary_db_writer.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 
 #include <deque>
 
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -972,7 +972,7 @@ class SummaryDbWriter : public SummaryWriterInterface {
     return MigrateEvent(std::move(e));
   }
 
-  string DebugString() override { return "SummaryDbWriter"; }
+  string DebugString() const override { return "SummaryDbWriter"; }
 
  private:
   Status Write(int64 step, const Tensor& t, const string& tag,
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.h b/tensorflow/core/summary/summary_db_writer.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer.h
rename to tensorflow/core/summary/summary_db_writer.h
index 746da1533b157bf7b2be5c85ada8b61ba224cc3e..5669afe7f67e1019d3d62d45ea99a64f1a31c82e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.h
+++ b/tensorflow/core/summary/summary_db_writer.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
 
 #include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -39,4 +39,4 @@ Status CreateSummaryDbWriter(Sqlite* db, const string& experiment_name,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/core/summary/summary_db_writer_test.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
rename to tensorflow/core/summary/summary_db_writer_test.cc
index 2e8d4109dd624ab66d774668ad04def9a7d3cdf2..c4e9ddea2c51673c94273900b0407517b6533f3d 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/core/summary/summary_db_writer_test.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/core/summary/summary_file_writer.cc
similarity index 97%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer.cc
rename to tensorflow/core/summary/summary_file_writer.cc
index 22b6f09d0cd88068f7bedabe7687920420a3028f..711a7d3d1007090259f34652f10cf43a4d0c5f0a 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/core/summary/summary_file_writer.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -148,7 +148,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
     return Status::OK();
   }
 
-  string DebugString() override { return "SummaryFileWriter"; }
+  string DebugString() const override { return "SummaryFileWriter"; }
 
  private:
   double GetWallTime() {
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.h b/tensorflow/core/summary/summary_file_writer.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer.h
rename to tensorflow/core/summary/summary_file_writer.h
index 73b0a5542beabdc460c32156dd44aacc5f08610a..7d964516da3ceecdc4cdedae000ba873ec92e1e9 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.h
+++ b/tensorflow/core/summary/summary_file_writer.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
 
 #include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -40,4 +40,4 @@ Status CreateSummaryFileWriter(int max_queue, int flush_millis,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/core/summary/summary_file_writer_test.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
rename to tensorflow/core/summary/summary_file_writer_test.cc
index ffbfb9533e887e54b0f5bdfde11dadce21073a94..d3b19c3abdb8b773e22472c5987d91852fc6ac8e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/core/summary/summary_file_writer_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/vacuum.cc b/tensorflow/core/summary/vacuum.cc
similarity index 100%
rename from tensorflow/contrib/tensorboard/db/vacuum.cc
rename to tensorflow/core/summary/vacuum.cc
diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc
index 523d37ecc244b3634545ea82385b377c871569c8..d275e076f865f809192e6f3aea652434d5654bb3 100644
--- a/tensorflow/core/util/dump_graph.cc
+++ b/tensorflow/core/util/dump_graph.cc
@@ -84,6 +84,10 @@ string WriteTextProtoToUniqueFile(Env* env, const string& name,
     dir = getenv("TF_DUMP_GRAPH_PREFIX");
   }
   if (!dir) {
+    LOG(WARNING)
+        << "Failed to dump " << name << " because dump location is not "
+        << " specified through either TF_DUMP_GRAPH_PREFIX environment "
+        << "variable or function argument.";
     return "(TF_DUMP_GRAPH_PREFIX not specified)";
   }
   Status status = env->RecursivelyCreateDir(dir);
diff --git a/tensorflow/core/util/event.proto b/tensorflow/core/util/event.proto
index 9ce85be551191dee754f34ec531e65f3eac056b7..2d3ae62777358ee371c60fe9b04d27d140c6f414 100644
--- a/tensorflow/core/util/event.proto
+++ b/tensorflow/core/util/event.proto
@@ -95,7 +95,7 @@ enum WorkerHealth {
 // signal is received.
 enum WorkerShutdownMode {
   DEFAULT = 0;
-  SHUTDOWN_IMMEDIATELY = 1;
+  NOT_CONFIGURED = 1;
   WAIT_FOR_COORDINATOR = 2;
 }
 
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index d3439cbc9385184da830f70e53acb27eff570ba1..b1773a25171916b6da0b3e0b86129ee25c32b1b6 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -56,6 +56,11 @@ class RandomAccessFileFromMemmapped : public RandomAccessFile {
 
   ~RandomAccessFileFromMemmapped() override = default;
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented(
+        "RandomAccessFileFromMemmapped does not support Name()");
+  }
+
   Status Read(uint64 offset, size_t to_read, StringPiece* result,
               char* scratch) const override {
     if (offset >= length_) {
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index b04d52b48fa4b19146d0988be1e59d05169a5f70..499a24b3dc1d7e74e05ca1030a123c862bb7d0ac 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -140,8 +140,8 @@ typedef enum {
   MKL_GROUP_FILTER_DIM_W = 4
 } MklDnnFilterGroupDims;
 
+// Enum used to templatize MklOp kernel implementations
 
-// Enum used to templatize MklOp kernel implementations
 // that support both fp32 and int8 versions.
 enum class MklQuantization {
   QUANTIZED_VERSION,
diff --git a/tensorflow/core/util/padding.cc b/tensorflow/core/util/padding.cc
index 117de5ee4bdd61af148ad7f1e620e940cb38216a..9e7fb8489e8e37b94ebecd53fde0568c68879c92 100644
--- a/tensorflow/core/util/padding.cc
+++ b/tensorflow/core/util/padding.cc
@@ -29,12 +29,55 @@ Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
     *value = SAME;
   } else if (str_value == "VALID") {
     *value = VALID;
+  } else if (str_value == "EXPLICIT") {
+    *value = EXPLICIT;
   } else {
     return errors::NotFound(str_value, " is not an allowed padding type");
   }
   return Status::OK();
 }
 
+Status CheckValidPadding(Padding padding_type,
+                         const std::vector<int64>& explicit_paddings,
+                         int num_dims, TensorFormat data_format) {
+  if (padding_type == Padding::EXPLICIT) {
+    if (explicit_paddings.size() != 2 * num_dims) {
+      return errors::InvalidArgument(
+          "explicit_paddings attribute must contain ", 2 * num_dims,
+          " values, but got: ", explicit_paddings.size());
+    }
+    for (int64 padding_value : explicit_paddings) {
+      if (padding_value < 0) {
+        return errors::InvalidArgument(
+            "All elements of explicit_paddings must be nonnegative");
+      }
+    }
+    const int32 batch_index = GetTensorBatchDimIndex(num_dims, data_format);
+    const int32 depth_index = GetTensorFeatureDimIndex(num_dims, data_format);
+    if (explicit_paddings[2 * batch_index] != 0 ||
+        explicit_paddings[2 * batch_index + 1] != 0 ||
+        explicit_paddings[2 * depth_index] != 0 ||
+        explicit_paddings[2 * depth_index + 1] != 0) {
+      return errors::InvalidArgument(
+          "Nonzero explicit padding in the batch or depth dimensions is not "
+          "supported");
+    }
+  } else if (!explicit_paddings.empty()) {
+    return errors::InvalidArgument(
+        "explicit_paddings attribute must be empty if the padding attribute is "
+        "not EXPLICIT");
+  }
+  return Status::OK();
+}
+
 string GetPaddingAttrString() { return "padding: {'SAME', 'VALID'}"; }
 
+string GetPaddingAttrStringWithExplicit() {
+  return "padding: {'SAME', 'VALID', 'EXPLICIT'}";
+}
+
+string GetExplicitPaddingsAttrString() {
+  return "explicit_paddings: list(int) = []";
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/util/padding.h b/tensorflow/core/util/padding.h
index 76f9b4dd9a99e7b4e152ca0c06b9323acf84b13d..a1dd1c0bd9556935f233609683a79452f3692e06 100644
--- a/tensorflow/core/util/padding.h
+++ b/tensorflow/core/util/padding.h
@@ -20,8 +20,10 @@ limitations under the License.
 // kernels.
 
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 
@@ -34,16 +36,29 @@ class NodeDef;
 //   VALID: No padding is carried out.
 //   SAME: The pad value is computed so that the output will have the same
 //         dimensions as the input.
+//   EXPLICIT: The user specifies the pad values in the explicit_padding
+//             attribute.
 // The padded area is zero-filled.
 enum Padding {
-  VALID = 1,  // No padding.
-  SAME = 2,   // Input and output layers have the same size.
+  VALID = 1,     // No padding.
+  SAME = 2,      // Input and output layers have the same size.
+  EXPLICIT = 3,  // Padding is explicitly specified
 };
 
+// Returns an error if the padding attributes are invalid.
+Status CheckValidPadding(Padding padding_type,
+                         const std::vector<int64>& explicit_paddings,
+                         int num_dims, TensorFormat data_format);
+
 // Return the string containing the list of valid padding types, that can be
 // used as an Attr() in REGISTER_OP.
 string GetPaddingAttrString();
 
+// Like GetPaddingAttrString(), but also includes EXPLICIT.
+string GetPaddingAttrStringWithExplicit();
+
+string GetExplicitPaddingsAttrString();
+
 // Specialization to parse an attribute directly into a Padding enum.
 Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
                    Padding* value);
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index f88ad2faaff344832d65b04357c3d8c2665ebad5..1cdde34562a7616827850fde830373350138687d 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/prefetch.h"
 
 namespace tensorflow {
 
@@ -132,6 +133,15 @@ class PresizedCuckooMap {
            FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
   }
 
+  // Prefetch memory associated with the key k into cache levels specified by
+  // hint.
+  template <port::PrefetchHint hint = port::PREFETCH_HINT_T0>
+  void PrefetchKey(const key_type k) const {
+    const uint64 tk = key_transform(k);
+    port::prefetch<hint>(&buckets_[fast_map_to_buckets(tk)].keys);
+    port::prefetch<hint>(&buckets_[fast_map_to_buckets(h2(tk))].keys);
+  }
+
   int64 MemoryUsed() const {
     return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue);
   }
diff --git a/tensorflow/core/util/presized_cuckoo_map_test.cc b/tensorflow/core/util/presized_cuckoo_map_test.cc
index f2be1e8a2fffdd9b61839809667a858a512751d2..f2c7904b00452487ceef4a8f8a870af548e1af03 100644
--- a/tensorflow/core/util/presized_cuckoo_map_test.cc
+++ b/tensorflow/core/util/presized_cuckoo_map_test.cc
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/presized_cuckoo_map.h"
 #include <array>
+
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/util/presized_cuckoo_map.h"
 
 namespace tensorflow {
 namespace {
@@ -50,6 +51,14 @@ TEST(PresizedCuckooMapTest, Basic) {
   EXPECT_EQ(out, 2);
 }
 
+TEST(PresizedCuckooMapTest, Prefetch) {
+  PresizedCuckooMap<int64> pscm(2);
+  EXPECT_TRUE(pscm.InsertUnique(1, 2));
+  // Works for both present and absent keys.
+  pscm.PrefetchKey(1);
+  pscm.PrefetchKey(2);
+}
+
 TEST(PresizedCuckooMapTest, TooManyItems) {
   static constexpr int kTableSize = 1000;
   PresizedCuckooMap<int> pscm(kTableSize);
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
index 7e549c77647529934bc6cebef1f2996af47428bb..b990f0a74918454fcdf8dff44006ef2e6a5602e1 100644
--- a/tensorflow/core/util/proto/BUILD
+++ b/tensorflow/core/util/proto/BUILD
@@ -68,5 +68,20 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:platform_base",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "proto_utils_test",
+    srcs = ["proto_utils_test.cc"],
+    deps = [
+        ":proto_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
index 8dde14dffcdc5ffe4d64360f3af40521efe29bf8..188830cc1f4b58da975bf69baddb2b51d6b17e50 100644
--- a/tensorflow/core/util/proto/decode.h
+++ b/tensorflow/core/util/proto/decode.h
@@ -91,7 +91,7 @@ inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
 // the 64 bit version instead of copying the code.
 inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
                                           uint32* value) {
-  uint64 tmp;
+  uint64 tmp = 0;
   const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
   *value = tmp & 0xffffffff;
   return buf;
@@ -106,7 +106,7 @@ const uint8* ReadFromArray(const uint8* buf, TensorType* value);
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT32>(
     const uint8* buf, int64* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int64>(temp);
@@ -116,7 +116,7 @@ inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT32>(
 template <>
 inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
     const uint8* buf, int32* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int32>(temp);
@@ -126,7 +126,7 @@ inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
     const uint8* buf, int64* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WrapUnsignedAsSigned64(temp);
@@ -136,7 +136,7 @@ inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
 template <>
 inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT32>(
     const uint8* buf, uint64* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = temp;
@@ -160,7 +160,7 @@ inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT64>(
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT32>(
     const uint8* buf, int64* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode32(temp);
@@ -170,7 +170,7 @@ inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT32>(
 template <>
 inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
     const uint8* buf, int32* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode32(temp);
@@ -180,7 +180,7 @@ inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT64>(
     const uint8* buf, int64* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode64(temp);
@@ -280,7 +280,7 @@ inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
 template <>
 inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
     const uint8* buf, bool* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = temp != 0;
@@ -290,7 +290,7 @@ inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
 template <>
 inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
     const uint8* buf, int* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int>(temp);
diff --git a/tensorflow/core/util/proto/proto_utils.cc b/tensorflow/core/util/proto/proto_utils.cc
index 201f05a129b03bca8867a53a43886690de638579..f1064141390faba9f3d08a0a62c5459b3434e464 100644
--- a/tensorflow/core/util/proto/proto_utils.cc
+++ b/tensorflow/core/util/proto/proto_utils.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/util/proto/proto_utils.h"
+
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
-#include "tensorflow/core/util/proto/proto_utils.h"
-
 namespace tensorflow {
 namespace proto_utils {
 
@@ -66,5 +69,49 @@ bool IsCompatibleType(FieldDescriptor::Type field_type, DataType dtype) {
   }
 }
 
+Status ParseTextFormatFromString(absl::string_view input,
+                                 protobuf::Message* output) {
+  DCHECK(output != nullptr) << "output must be non NULL";
+  // When checks are disabled, instead log the error and return an error status.
+  if (output == nullptr) {
+    LOG(ERROR) << "output must be non NULL";
+    return Status(error::INVALID_ARGUMENT, "output must be non NULL");
+  }
+  string err;
+  StringErrorCollector err_collector(&err, /*one-indexing=*/true);
+  protobuf::TextFormat::Parser parser;
+  parser.RecordErrorsTo(&err_collector);
+  if (!parser.ParseFromString(string(input), output)) {
+    return Status(error::INVALID_ARGUMENT, err);
+  }
+  return Status::OK();
+}
+
+StringErrorCollector::StringErrorCollector(string* error_text)
+    : StringErrorCollector(error_text, false) {}
+
+StringErrorCollector::StringErrorCollector(string* error_text,
+                                           bool one_indexing)
+    : error_text_(error_text), index_offset_(one_indexing ? 1 : 0) {
+  DCHECK(error_text_ != nullptr) << "error_text must be non NULL";
+  // When checks are disabled, just log and then ignore added errors/warnings.
+  if (error_text_ == nullptr) {
+    LOG(ERROR) << "error_text must be non NULL";
+  }
+}
+
+void StringErrorCollector::AddError(int line, int column,
+                                    const string& message) {
+  if (error_text_ != nullptr) {
+    absl::SubstituteAndAppend(error_text_, "$0($1): $2\n", line + index_offset_,
+                              column + index_offset_, message);
+  }
+}
+
+void StringErrorCollector::AddWarning(int line, int column,
+                                      const string& message) {
+  AddError(line, column, message);
+}
+
 }  // namespace proto_utils
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index d5e0b9006c08be349d5466c52944d5b056b9a49b..9451e317a13dec9b0c96096d9a7144263efc600f 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
@@ -27,6 +29,35 @@ using tensorflow::protobuf::FieldDescriptor;
 // Returns true if the proto field type can be converted to the tensor dtype.
 bool IsCompatibleType(FieldDescriptor::Type field_type, DataType dtype);
 
+// Parses a text-formatted protobuf from a string into the given Message* output
+// and returns status OK if valid, or INVALID_ARGUMENT with an accompanying
+// parser error message if the text format is invalid.
+Status ParseTextFormatFromString(absl::string_view input,
+                                 protobuf::Message* output);
+
+class StringErrorCollector : public protobuf::io::ErrorCollector {
+ public:
+  // String error_text is unowned and must remain valid during the use of
+  // StringErrorCollector.
+  explicit StringErrorCollector(string* error_text);
+  // If one_indexing is set to true, all line and column numbers will be
+  // increased by one for cases when provided indices are 0-indexed and
+  // 1-indexed error messages are desired
+  StringErrorCollector(string* error_text, bool one_indexing);
+  StringErrorCollector(const StringErrorCollector&) = delete;
+  StringErrorCollector& operator=(const StringErrorCollector&) = delete;
+
+  // Implementation of protobuf::io::ErrorCollector::AddError.
+  void AddError(int line, int column, const string& message) override;
+
+  // Implementation of protobuf::io::ErrorCollector::AddWarning.
+  void AddWarning(int line, int column, const string& message) override;
+
+ private:
+  string* const error_text_;
+  const int index_offset_;
+};
+
 }  // namespace proto_utils
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/proto/proto_utils_test.cc b/tensorflow/core/util/proto/proto_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f6312a718511d19c82d8f0f2f1f6dba495e0cb7
--- /dev/null
+++ b/tensorflow/core/util/proto/proto_utils_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/proto/proto_utils.h"
+
+#include <gmock/gmock.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+using proto_utils::ParseTextFormatFromString;
+using proto_utils::StringErrorCollector;
+using ::testing::ContainsRegex;
+
+TEST(ParseTextFormatFromStringTest, Success) {
+  protobuf::DescriptorProto output;
+  TF_ASSERT_OK(ParseTextFormatFromString("name: \"foo\"", &output));
+  EXPECT_EQ(output.name(), "foo");
+}
+
+TEST(ParseTextFormatFromStringTest, ErrorOnInvalidSyntax) {
+  protobuf::DescriptorProto output;
+  Status status = ParseTextFormatFromString("name: foo", &output);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_THAT(status.error_message(), ContainsRegex("foo"));
+  EXPECT_FALSE(output.has_name());
+}
+
+TEST(ParseTextFormatFromStringTest, ErrorOnUnknownFieldName) {
+  protobuf::DescriptorProto output;
+  Status status = ParseTextFormatFromString("badname: \"foo\"", &output);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_THAT(status.error_message(), ContainsRegex("badname"));
+  EXPECT_FALSE(output.has_name());
+}
+
+TEST(ParseTextFormatFromStringTest, DiesOnNullOutputPointer) {
+#ifndef NDEBUG
+  ASSERT_DEATH(ParseTextFormatFromString("foo", nullptr).IgnoreError(),
+               "output.*non NULL");
+#else
+  // Under NDEBUG we don't die but should still return an error status.
+  Status status = ParseTextFormatFromString("foo", nullptr);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_THAT(status.error_message(), ContainsRegex("output.*non NULL"));
+#endif
+}
+
+TEST(StringErrorCollectorTest, AppendsError) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddError(1, 2, "foo");
+  EXPECT_EQ("1(2): foo\n", err);
+}
+
+TEST(StringErrorCollectorTest, AppendsWarning) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddWarning(1, 2, "foo");
+  EXPECT_EQ("1(2): foo\n", err);
+}
+
+TEST(StringErrorCollectorTest, AppendsMultipleError) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddError(1, 2, "foo");
+  collector.AddError(3, 4, "bar");
+  EXPECT_EQ("1(2): foo\n3(4): bar\n", err);
+}
+
+TEST(StringErrorCollectorTest, AppendsMultipleWarning) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddWarning(1, 2, "foo");
+  collector.AddWarning(3, 4, "bar");
+  EXPECT_EQ("1(2): foo\n3(4): bar\n", err);
+}
+
+TEST(StringErrorCollectorTest, OffsetWorks) {
+  string err;
+  StringErrorCollector collector(&err, true);
+  collector.AddError(1, 2, "foo");
+  collector.AddWarning(3, 4, "bar");
+  EXPECT_EQ("2(3): foo\n4(5): bar\n", err);
+}
+
+TEST(StringErrorCollectorTest, DiesOnNullErrorText) {
+#ifndef NDEBUG
+  ASSERT_DEATH(StringErrorCollector(nullptr), "error_text.*non NULL");
+#else
+  // Under NDEBUG we don't die and instead AddError/AddWarning just do nothing.
+  StringErrorCollector collector(nullptr);
+  collector.AddError(1, 2, "foo");
+  collector.AddWarning(3, 4, "bar");
+#endif
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/sparse/dim_comparator.h b/tensorflow/core/util/sparse/dim_comparator.h
index 0782e7e1a8af19a7936bde267c0905dc5f7d00e7..498df7a021df3e65557d96dc25577e9e24e911a6 100644
--- a/tensorflow/core/util/sparse/dim_comparator.h
+++ b/tensorflow/core/util/sparse/dim_comparator.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_SPARSE_DIM_COMPARATOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 89c163aa5133fafc23b01c7153ac40d32efcaaf6..cd168f6597347a9a19cd4486ef437302c5ab735d 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -22,11 +22,11 @@ limitations under the License.
 
 #include "absl/base/macros.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index e191737bb2c8eb85518e51b3a06884a7983a392e..5005ee08a4bf3292097820983ad85a8b56377a82 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <stdlib.h>
 
+#include <algorithm>
 #include <cmath>
 #include <limits>
 #include <map>
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index 55688e580848e42bdd453a270a530a5423fb3aec..9db35da6a2e230bd7e83f1e008a40f6d1c0ea946 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/core/util/strided_slice_op.h"
 
 #include <array>
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index a296fb447e252e62809aeb17d9d00cf35ad15fc9..643e14e0b56bb152b5ca135cd4b813108b8eab16 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -408,18 +408,24 @@ inline int32 GetTensorDimIndex(TensorFormat format, char dimension) {
   return GetTensorDimIndex<2>(format, dimension);
 }
 
+inline int32 GetTensorDimIndex(TensorFormat format, char dimension,
+                               int num_total_dims) {
+  int32 index = (GetTensorSpatialDims(num_total_dims, format) == 3)
+                    ? GetTensorDimIndex<3>(format, dimension)
+                    : GetTensorDimIndex<2>(format, dimension);
+  CHECK(index >= 0 && index < num_total_dims)  // Crash OK.
+      << "Invalid index from the dimension: " << index << ", " << format << ", "
+      << dimension;
+  return index;
+}
+
 // Return the element from 'dimension_attributes' that corresponds to the
 // specified 'dimension' according to 'tensor_format'.
 template <typename T>
 T GetTensorDim(gtl::ArraySlice<T> dimension_attributes,
                TensorFormat tensor_format, char dimension) {
   int index =
-      (GetTensorSpatialDims(dimension_attributes.size(), tensor_format) == 3)
-          ? GetTensorDimIndex<3>(tensor_format, dimension)
-          : GetTensorDimIndex<2>(tensor_format, dimension);
-  CHECK(index >= 0 && index < dimension_attributes.size())
-      << "Invalid index from the dimension: " << index << ", " << tensor_format
-      << ", " << dimension;
+      GetTensorDimIndex(tensor_format, dimension, dimension_attributes.size());
   return dimension_attributes[index];
 }
 
@@ -476,6 +482,15 @@ inline int64 GetFilterDim(const Tensor& tensor,
   return GetFilterDim(tensor.shape(), filter_tensor_format, dimension);
 }
 
+inline void GetExplicitPaddingForDim(
+    const std::vector<int64>& explicit_paddings, TensorFormat tensor_format,
+    char dimension, int64* padding_before, int64* padding_after) {
+  int index =
+      GetTensorDimIndex(tensor_format, dimension, explicit_paddings.size() / 2);
+  *padding_before = explicit_paddings[2 * index];
+  *padding_after = explicit_paddings[2 * index + 1];
+}
+
 // Return the string that specifies the data format for convnet operations.
 string GetConvnetDataFormatAttrString();
 string GetConvnet3dDataFormatAttrString();
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index 2b39b3683f260b840b36e7f991b0d0c8e19aa18b..a4d6f204cd94f751c39ef71b23b512ccc35aa3b6 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -10,6 +10,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_tests_tags")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_exec_compatible_with")
 
 exports_files(["LICENSE"])
 
@@ -118,6 +119,7 @@ py_test(
     name = "cuda_op_test",
     size = "small",
     srcs = ["cuda_op_test.py"],
+    exec_compatible_with = tf_exec_compatible_with({"tags": tf_cuda_tests_tags()}),
     srcs_version = "PY2AND3",
     tags = tf_cuda_tests_tags() + ["notap"],
     deps = [
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index d6ec1f393bab82a45f0c1032670b5abed42bf6d3..a22d55e5af7630d5660a59970244357897aa1aa3 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -28,17 +28,8 @@ sh_test(
     size = "large",
     srcs = ["examples_test.sh"],
     data = [
-        ":boston",
-        ":iris",
         ":iris_custom_decay_dnn",
         ":iris_custom_model",
-        ":iris_run_config",
-        ":random_forest_mnist",
-        ":resnet",
-        ":text_classification",
-        ":text_classification_character_cnn",
-        ":text_classification_character_rnn",
-        ":text_classification_cnn",
     ],
     tags = [
         "manual",
diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f40c6d9d28f677d9beafa83d81927224049ac6df
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -0,0 +1,54 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_binary(
+    name = "export_text_rnn_model",
+    srcs = ["export_text_rnn_model.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_text_rnn_model",
+    srcs = ["use_text_rnn_model.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_rnn_cell",
+    srcs = ["export_rnn_cell.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_rnn_cell",
+    srcs = ["use_rnn_cell.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "saved_model_test",
+    srcs = [
+        "saved_model_test.py",
+    ],
+    data = [
+        ":export_rnn_cell",
+        ":export_text_rnn_model",
+        ":use_rnn_cell",
+        ":use_text_rnn_model",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1ae38b8139ba17f06d58593527e5463bec94c31
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
@@ -0,0 +1,63 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Export an RNN cell in SavedModel format."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+
+import tensorflow as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+def main(argv):
+  del argv
+
+  root = tf.train.Checkpoint()
+  # Create a cell and attach to our checkpointable.
+  root.rnn_cell = tf.keras.layers.LSTMCell(units=10, recurrent_initializer=None)
+
+  # Wrap the rnn_cell.__call__ function and assign to next_state.
+  root.next_state = tf.function(root.rnn_cell.__call__, autograph=False)
+
+  # Wrap the rnn_cell.get_initial_function using a decorator and assign to an
+  # attribute with the same name.
+  @tf.function(input_signature=[tf.TensorSpec([None, None], tf.float32)])
+  def get_initial_state(tensor):
+    return root.rnn_cell.get_initial_state(tensor, None, None)
+
+  root.get_initial_state = get_initial_state
+
+  # Construct an initial_state, then call next_state explicitly to trigger a
+  # trace for serialization (we need an explicit call, because next_state has
+  # not been annotated with an input_signature).
+  initial_state = root.get_initial_state(
+      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
+  root.next_state(
+      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
+      initial_state)
+
+  tf.saved_model.save(root, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py b/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..32bdb284f21ed235410aad7ad077f03c52d2ab93
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py
@@ -0,0 +1,193 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Text RNN model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+class TextRnnModel(tf.train.Checkpoint):
+  """Text RNN model.
+
+  A full generative text RNN model that can train and decode sentences from a
+  starting word.
+  """
+
+  def __init__(self, vocab, emb_dim, buckets, state_size):
+    super(TextRnnModel, self).__init__()
+    self._buckets = buckets
+    self._lstm_cell = tf.keras.layers.LSTMCell(units=state_size)
+    self._rnn_layer = tf.keras.layers.RNN(
+        self._lstm_cell, return_sequences=True)
+    self._embeddings = tf.Variable(tf.random.uniform(shape=[buckets, emb_dim]))
+    self._logit_layer = tf.keras.layers.Dense(buckets)
+    self._set_up_vocab(vocab)
+
+  def _tokenize(self, sentences):
+    # Perform a minimalistic text preprocessing by removing punctuation and
+    # splitting on spaces.
+    normalized_sentences = tf.strings.regex_replace(
+        input=sentences, pattern=r"\pP", rewrite="")
+    sparse_tokens = tf.string_split(normalized_sentences, " ")
+
+    # Deal with a corner case: there is one empty sentence.
+    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
+    # Deal with a corner case: all sentences are empty.
+    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
+
+    return (sparse_tokens.indices, sparse_tokens.values,
+            sparse_tokens.dense_shape)
+
+  def _set_up_vocab(self, vocab_tokens):
+    # TODO(vbardiovsky): Currently there is no real vocabulary, because
+    # saved_model serialization does not support trackable resources. Add a real
+    # vocabulary when it does.
+    vocab_list = ["UNK"] * self._buckets
+    for vocab_token in vocab_tokens:
+      index = self._words_to_indices(vocab_token).numpy()
+      vocab_list[index] = vocab_token
+    # This is a variable representing an inverse index.
+    self._vocab_tensor = tf.Variable(vocab_list)
+
+  def _indices_to_words(self, indices):
+    return tf.gather(self._vocab_tensor, indices)
+
+  def _words_to_indices(self, words):
+    return tf.strings.to_hash_bucket(words, self._buckets)
+
+  @tf.function(input_signature=[tf.TensorSpec([None], tf.dtypes.string)])
+  def train(self, sentences):
+    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
+    tokens_sparse = tf.sparse.SparseTensor(
+        indices=token_ids, values=token_values, dense_shape=token_dense_shape)
+    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")
+
+    sparse_lookup_ids = tf.sparse.SparseTensor(
+        indices=tokens_sparse.indices,
+        values=self._words_to_indices(tokens_sparse.values),
+        dense_shape=tokens_sparse.dense_shape)
+    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)
+
+    # Targets are the next word for each word of the sentence.
+    tokens_ids_seq = lookup_ids[:, 0:-1]
+    tokens_ids_target = lookup_ids[:, 1:]
+
+    tokens_prefix = tokens[:, 0:-1]
+
+    # Mask determining which positions we care about for a loss: all positions
+    # that have a valid non-terminal token.
+    mask = tf.logical_and(
+        tf.logical_not(tf.equal(tokens_prefix, "")),
+        tf.logical_not(tf.equal(tokens_prefix, "<E>")))
+
+    input_mask = tf.cast(mask, tf.int32)
+
+    with tf.GradientTape() as t:
+      sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
+                                                   tokens_ids_seq)
+
+      lstm_initial_state = self._lstm_cell.get_initial_state(
+          sentence_embeddings)
+
+      lstm_output = self._rnn_layer(
+          inputs=sentence_embeddings, initial_state=lstm_initial_state)
+
+      # Stack LSTM outputs into a batch instead of a 2D array.
+      lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])
+
+      logits = self._logit_layer(lstm_output)
+
+      targets = tf.reshape(tokens_ids_target, [-1])
+      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)
+
+      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
+          labels=targets, logits=logits)
+
+      # Final loss is the mean loss for all token losses.
+      final_loss = tf.math.divide(
+          tf.reduce_sum(tf.multiply(losses, weights)),
+          tf.reduce_sum(weights),
+          name="final_loss")
+
+    watched = t.watched_variables()
+    gradients = t.gradient(final_loss, watched)
+
+    for w, g in zip(watched, gradients):
+      w.assign_sub(g)
+
+    return final_loss
+
+  @tf.function
+  def decode_greedy(self, sequence_length, first_word):
+    initial_state = self._lstm_cell.get_initial_state(
+        dtype=tf.float32, batch_size=1)
+
+    sequence = [first_word]
+    current_word = first_word
+    current_id = tf.expand_dims(self._words_to_indices(current_word), 0)
+    current_state = initial_state
+
+    for _ in range(sequence_length):
+      token_embeddings = tf.nn.embedding_lookup(self._embeddings, current_id)
+      lstm_outputs, current_state = self._lstm_cell(token_embeddings,
+                                                    current_state)
+      lstm_outputs = tf.reshape(lstm_outputs, [-1, self._lstm_cell.output_size])
+      logits = self._logit_layer(lstm_outputs)
+      softmax = tf.nn.softmax(logits)
+
+      next_ids = tf.math.argmax(softmax, axis=1)
+      next_words = self._indices_to_words(next_ids)[0]
+
+      current_id = next_ids
+      current_word = next_words
+      sequence.append(current_word)
+
+    return sequence
+
+
+def main(argv):
+  del argv
+
+  sentences = ["<S> hello there <E>", "<S> how are you doing today <E>"]
+  vocab = [
+      "<S>", "<E>", "hello", "there", "how", "are", "you", "doing", "today"
+  ]
+
+  module = TextRnnModel(vocab=vocab, emb_dim=10, buckets=100, state_size=128)
+
+  for _ in range(100):
+    _ = module.train(tf.constant(sentences))
+
+  # We have to call this function explicitly if we want it exported, because it
+  # has no input_signature in the @tf.function decorator.
+  decoded = module.decode_greedy(
+      sequence_length=10, first_word=tf.constant("<S>"))
+  _ = [d.numpy() for d in decoded]
+
+  tf.saved_model.save(module, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..770e809ffcd0241aec3c62339265e3e6c13f3e3c
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
@@ -0,0 +1,64 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SavedModel integration tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+
+import tensorflow as tf
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
+
+
+class SavedModelTest(tf.test.TestCase):
+
+  def assertCommandSucceeded(self, binary, **flags):
+    command_parts = [binary]
+    for flag_key, flag_value in flags.items():
+      command_parts.append("--%s=%s" % (flag_key, flag_value))
+
+    logging.info("Running: %s" % command_parts)
+    subprocess.check_call(
+        command_parts, env=dict(os.environ, TF2_BEHAVIOR="enabled"))
+
+  @test_util.run_v2_only
+  def test_text_rnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_text_rnn_model")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile("use_text_rnn_model")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_rnn_cell(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_rnn_cell")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile("use_rnn_cell")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+if __name__ == "__main__":
+  # tf.enable_v2_behavior()
+  tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py
new file mode 100644
index 0000000000000000000000000000000000000000..798033517c0d30030b3876943c726e729d43e53d
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use an RNN cell stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+
+import tensorflow as tf
+# TODO(vbardiovsky): Remove when load is available.
+from tensorflow.python.saved_model.load import load
+
+tf.saved_model.load = load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def main(argv):
+  del argv
+  cell = tf.saved_model.load(FLAGS.model_dir)
+
+  initial_state = cell.get_initial_state(
+      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
+
+  cell.next_state(
+      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
+      initial_state)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py b/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f75c49a327ecf5fed2a4e5ca0957a64f58613b2
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py
@@ -0,0 +1,50 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use RNN model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+# TODO(vbardiovsky): Remove when load is available.
+from tensorflow.python.saved_model.load import load
+
+tf.saved_model.load = load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def main(argv):
+  del argv
+
+  sentences = [
+      "<S> sentence <E>", "<S> second sentence <E>", "<S> third sentence<E>"
+  ]
+
+  model = tf.saved_model.load(FLAGS.model_dir)
+  model.train(tf.constant(sentences))
+  decoded = model.decode_greedy(
+      sequence_length=10, first_word=tf.constant("<S>"))
+  _ = [d.numpy() for d in decoded]
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/speech_commands/label_wav.py b/tensorflow/examples/speech_commands/label_wav.py
index 0017aec3a54bdcd2ddaec6a1012d629f83564827..eb8323454c23c07d5b536bbdfec30d690767a0fd 100644
--- a/tensorflow/examples/speech_commands/label_wav.py
+++ b/tensorflow/examples/speech_commands/label_wav.py
@@ -45,7 +45,7 @@ FLAGS = None
 
 def load_graph(filename):
   """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
+  with tf.gfile.GFile(filename, 'rb') as f:
     graph_def = tf.GraphDef()
     graph_def.ParseFromString(f.read())
     tf.import_graph_def(graph_def, name='')
diff --git a/tensorflow/examples/speech_commands/label_wav_dir.py b/tensorflow/examples/speech_commands/label_wav_dir.py
index a34db512dda86be138e07a4ffaa1963fe00a5cea..2e1890c3e864b153a4e01badf08b5b55b4377ab6 100644
--- a/tensorflow/examples/speech_commands/label_wav_dir.py
+++ b/tensorflow/examples/speech_commands/label_wav_dir.py
@@ -46,7 +46,7 @@ FLAGS = None
 
 def load_graph(filename):
   """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
+  with tf.gfile.GFile(filename, 'rb') as f:
     graph_def = tf.GraphDef()
     graph_def.ParseFromString(f.read())
     tf.import_graph_def(graph_def, name='')
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 77889effc8e61210445d87976e4bbfbed2c62440..805ec203b489e51ef25149d9c8a2b1085461e543 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
 import collections
 import math
 import os
-import sys
-import argparse
 import random
+import sys
 from tempfile import gettempdir
 import zipfile
 
@@ -34,320 +34,324 @@ import tensorflow as tf
 
 from tensorflow.contrib.tensorboard.plugins import projector
 
-# Give a folder path as an argument with '--log_dir' to save
-# TensorBoard summaries. Default is a log folder in current directory.
-current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    '--log_dir',
-    type=str,
-    default=os.path.join(current_path, 'log'),
-    help='The log directory for TensorBoard summaries.')
-FLAGS, unparsed = parser.parse_known_args()
-
-# Create the directory for TensorBoard variables if there is not.
-if not os.path.exists(FLAGS.log_dir):
-  os.makedirs(FLAGS.log_dir)
-
-# Step 1: Download the data.
-url = 'http://mattmahoney.net/dc/'
-
-
-# pylint: disable=redefined-outer-name
-def maybe_download(filename, expected_bytes):
-  """Download a file if not present, and make sure it's the right size."""
-  local_filename = os.path.join(gettempdir(), filename)
-  if not os.path.exists(local_filename):
-    local_filename, _ = urllib.request.urlretrieve(url + filename,
-                                                   local_filename)
-  statinfo = os.stat(local_filename)
-  if statinfo.st_size == expected_bytes:
-    print('Found and verified', filename)
-  else:
-    print(statinfo.st_size)
-    raise Exception('Failed to verify ' + local_filename +
-                    '. Can you get to it with a browser?')
-  return local_filename
-
-
-filename = maybe_download('text8.zip', 31344016)
-
-
-# Read the data into a list of strings.
-def read_data(filename):
-  """Extract the first file enclosed in a zip file as a list of words."""
-  with zipfile.ZipFile(filename) as f:
-    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
-  return data
-
-
-vocabulary = read_data(filename)
-print('Data size', len(vocabulary))
-
-# Step 2: Build the dictionary and replace rare words with UNK token.
-vocabulary_size = 50000
-
-
-def build_dataset(words, n_words):
-  """Process raw inputs into a dataset."""
-  count = [('UNK', -1)]
-  count.extend(collections.Counter(words).most_common(n_words - 1))
-  dictionary = dict()
-  for word, _ in count:
-    dictionary[word] = len(dictionary)
-  data = list()
-  unk_count = 0
-  for word in words:
-    index = dictionary.get(word, 0)
-    if index == 0:  # dictionary['UNK']
-      unk_count += 1
-    data.append(index)
-  count[0][1] = unk_count
-  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
-  return data, count, dictionary, reversed_dictionary
-
-
-# Filling 4 global variables:
-# data - list of codes (integers from 0 to vocabulary_size-1).
-#   This is the original text but words are replaced by their codes
-# count - map of words(strings) to count of occurrences
-# dictionary - map of words(strings) to their codes(integers)
-# reverse_dictionary - maps codes(integers) to words(strings)
-data, count, dictionary, reverse_dictionary = build_dataset(
-    vocabulary, vocabulary_size)
-del vocabulary  # Hint to reduce memory.
-print('Most common words (+UNK)', count[:5])
-print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
-
 data_index = 0
 
 
-# Step 3: Function to generate a training batch for the skip-gram model.
-def generate_batch(batch_size, num_skips, skip_window):
-  global data_index
-  assert batch_size % num_skips == 0
-  assert num_skips <= 2 * skip_window
-  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
-  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
-  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
-  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
-  if data_index + span > len(data):
-    data_index = 0
-  buffer.extend(data[data_index:data_index + span])
-  data_index += span
-  for i in range(batch_size // num_skips):
-    context_words = [w for w in range(span) if w != skip_window]
-    words_to_use = random.sample(context_words, num_skips)
-    for j, context_word in enumerate(words_to_use):
-      batch[i * num_skips + j] = buffer[skip_window]
-      labels[i * num_skips + j, 0] = buffer[context_word]
-    if data_index == len(data):
-      buffer.extend(data[0:span])
-      data_index = span
+def word2vec_basic(log_dir):
+  """Example of building, training and visualizing a word2vec model."""
+  # Create the directory for TensorBoard variables if there is not.
+  if not os.path.exists(log_dir):
+    os.makedirs(log_dir)
+
+  # Step 1: Download the data.
+  url = 'http://mattmahoney.net/dc/'
+
+  # pylint: disable=redefined-outer-name
+  def maybe_download(filename, expected_bytes):
+    """Download a file if not present, and make sure it's the right size."""
+    local_filename = os.path.join(gettempdir(), filename)
+    if not os.path.exists(local_filename):
+      local_filename, _ = urllib.request.urlretrieve(url + filename,
+                                                     local_filename)
+    statinfo = os.stat(local_filename)
+    if statinfo.st_size == expected_bytes:
+      print('Found and verified', filename)
     else:
-      buffer.append(data[data_index])
-      data_index += 1
-  # Backtrack a little bit to avoid skipping words in the end of a batch
-  data_index = (data_index + len(data) - span) % len(data)
-  return batch, labels
-
-
-batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
-for i in range(8):
-  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
-        reverse_dictionary[labels[i, 0]])
-
-# Step 4: Build and train a skip-gram model.
-
-batch_size = 128
-embedding_size = 128  # Dimension of the embedding vector.
-skip_window = 1  # How many words to consider left and right.
-num_skips = 2  # How many times to reuse an input to generate a label.
-num_sampled = 64  # Number of negative examples to sample.
-
-# We pick a random validation set to sample nearest neighbors. Here we limit the
-# validation samples to the words that have a low numeric ID, which by
-# construction are also the most frequent. These 3 variables are used only for
-# displaying model accuracy, they don't affect calculation.
-valid_size = 16  # Random set of words to evaluate similarity on.
-valid_window = 100  # Only pick dev samples in the head of the distribution.
-valid_examples = np.random.choice(valid_window, valid_size, replace=False)
-
-graph = tf.Graph()
-
-with graph.as_default():
-
-  # Input data.
-  with tf.name_scope('inputs'):
-    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
-
-  # Ops and variables pinned to the CPU because of missing GPU implementation
-  with tf.device('/cpu:0'):
-    # Look up embeddings for inputs.
-    with tf.name_scope('embeddings'):
-      embeddings = tf.Variable(
-          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
-      embed = tf.nn.embedding_lookup(embeddings, train_inputs)
-
-    # Construct the variables for the NCE loss
-    with tf.name_scope('weights'):
-      nce_weights = tf.Variable(
-          tf.truncated_normal(
-              [vocabulary_size, embedding_size],
-              stddev=1.0 / math.sqrt(embedding_size)))
-    with tf.name_scope('biases'):
-      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
-
-  # Compute the average NCE loss for the batch.
-  # tf.nce_loss automatically draws a new sample of the negative labels each
-  # time we evaluate the loss.
-  # Explanation of the meaning of NCE loss:
-  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
-  with tf.name_scope('loss'):
-    loss = tf.reduce_mean(
-        tf.nn.nce_loss(
-            weights=nce_weights,
-            biases=nce_biases,
-            labels=train_labels,
-            inputs=embed,
-            num_sampled=num_sampled,
-            num_classes=vocabulary_size))
-
-  # Add the loss value as a scalar to summary.
-  tf.summary.scalar('loss', loss)
-
-  # Construct the SGD optimizer using a learning rate of 1.0.
-  with tf.name_scope('optimizer'):
-    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
-
-  # Compute the cosine similarity between minibatch examples and all embeddings.
-  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
-  normalized_embeddings = embeddings / norm
-  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
-                                            valid_dataset)
-  similarity = tf.matmul(
-      valid_embeddings, normalized_embeddings, transpose_b=True)
-
-  # Merge all summaries.
-  merged = tf.summary.merge_all()
-
-  # Add variable initializer.
-  init = tf.global_variables_initializer()
-
-  # Create a saver.
-  saver = tf.train.Saver()
-
-# Step 5: Begin training.
-num_steps = 100001
-
-with tf.Session(graph=graph) as session:
-  # Open a writer to write summaries.
-  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
-
-  # We must initialize all variables before we use them.
-  init.run()
-  print('Initialized')
-
-  average_loss = 0
-  for step in xrange(num_steps):
-    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
-                                                skip_window)
-    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
-
-    # Define metadata variable.
-    run_metadata = tf.RunMetadata()
-
-    # We perform one update step by evaluating the optimizer op (including it
-    # in the list of returned values for session.run()
-    # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
-    # Feed metadata variable to session for visualizing the graph in TensorBoard.
-    _, summary, loss_val = session.run(
-        [optimizer, merged, loss],
-        feed_dict=feed_dict,
-        run_metadata=run_metadata)
-    average_loss += loss_val
-
-    # Add returned summaries to writer in each step.
-    writer.add_summary(summary, step)
-    # Add metadata to visualize the graph for the last run.
-    if step == (num_steps - 1):
-      writer.add_run_metadata(run_metadata, 'step%d' % step)
-
-    if step % 2000 == 0:
-      if step > 0:
-        average_loss /= 2000
-      # The average loss is an estimate of the loss over the last 2000 batches.
-      print('Average loss at step ', step, ': ', average_loss)
-      average_loss = 0
-
-    # Note that this is expensive (~20% slowdown if computed every 500 steps)
-    if step % 10000 == 0:
-      sim = similarity.eval()
-      for i in xrange(valid_size):
-        valid_word = reverse_dictionary[valid_examples[i]]
-        top_k = 8  # number of nearest neighbors
-        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
-        log_str = 'Nearest to %s:' % valid_word
-        for k in xrange(top_k):
-          close_word = reverse_dictionary[nearest[k]]
-          log_str = '%s %s,' % (log_str, close_word)
-        print(log_str)
-  final_embeddings = normalized_embeddings.eval()
-
-  # Write corresponding labels for the embeddings.
-  with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
-    for i in xrange(vocabulary_size):
-      f.write(reverse_dictionary[i] + '\n')
-
-  # Save the model for checkpoints.
-  saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))
-
-  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
-  config = projector.ProjectorConfig()
-  embedding_conf = config.embeddings.add()
-  embedding_conf.tensor_name = embeddings.name
-  embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
-  projector.visualize_embeddings(writer, config)
-
-writer.close()
-
-# Step 6: Visualize the embeddings.
-
-
-# pylint: disable=missing-docstring
-# Function to draw visualization of distance between embeddings.
-def plot_with_labels(low_dim_embs, labels, filename):
-  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
-  plt.figure(figsize=(18, 18))  # in inches
-  for i, label in enumerate(labels):
-    x, y = low_dim_embs[i, :]
-    plt.scatter(x, y)
-    plt.annotate(
-        label,
-        xy=(x, y),
-        xytext=(5, 2),
-        textcoords='offset points',
-        ha='right',
-        va='bottom')
-
-  plt.savefig(filename)
-
-
-try:
-  # pylint: disable=g-import-not-at-top
-  from sklearn.manifold import TSNE
-  import matplotlib.pyplot as plt
-
-  tsne = TSNE(
-      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
-  plot_only = 500
-  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
-  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
-  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
-
-except ImportError as ex:
-  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
-  print(ex)
+      print(statinfo.st_size)
+      raise Exception('Failed to verify ' + local_filename +
+                      '. Can you get to it with a browser?')
+    return local_filename
+
+  filename = maybe_download('text8.zip', 31344016)
+
+  # Read the data into a list of strings.
+  def read_data(filename):
+    """Extract the first file enclosed in a zip file as a list of words."""
+    with zipfile.ZipFile(filename) as f:
+      data = tf.compat.as_str(f.read(f.namelist()[0])).split()
+    return data
+
+  vocabulary = read_data(filename)
+  print('Data size', len(vocabulary))
+
+  # Step 2: Build the dictionary and replace rare words with UNK token.
+  vocabulary_size = 50000
+
+  def build_dataset(words, n_words):
+    """Process raw inputs into a dataset."""
+    count = [['UNK', -1]]
+    count.extend(collections.Counter(words).most_common(n_words - 1))
+    dictionary = dict()
+    for word, _ in count:
+      dictionary[word] = len(dictionary)
+    data = list()
+    unk_count = 0
+    for word in words:
+      index = dictionary.get(word, 0)
+      if index == 0:  # dictionary['UNK']
+        unk_count += 1
+      data.append(index)
+    count[0][1] = unk_count
+    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
+    return data, count, dictionary, reversed_dictionary
+
+  # Filling 4 global variables:
+  # data - list of codes (integers from 0 to vocabulary_size-1).
+  #   This is the original text but words are replaced by their codes
+  # count - map of words(strings) to count of occurrences
+  # dictionary - map of words(strings) to their codes(integers)
+  # reverse_dictionary - maps codes(integers) to words(strings)
+  data, count, unused_dictionary, reverse_dictionary = build_dataset(
+      vocabulary, vocabulary_size)
+  del vocabulary  # Hint to reduce memory.
+  print('Most common words (+UNK)', count[:5])
+  print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
+
+  # Step 3: Function to generate a training batch for the skip-gram model.
+  def generate_batch(batch_size, num_skips, skip_window):
+    global data_index
+    assert batch_size % num_skips == 0
+    assert num_skips <= 2 * skip_window
+    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
+    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
+    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
+    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
+    if data_index + span > len(data):
+      data_index = 0
+    buffer.extend(data[data_index:data_index + span])
+    data_index += span
+    for i in range(batch_size // num_skips):
+      context_words = [w for w in range(span) if w != skip_window]
+      words_to_use = random.sample(context_words, num_skips)
+      for j, context_word in enumerate(words_to_use):
+        batch[i * num_skips + j] = buffer[skip_window]
+        labels[i * num_skips + j, 0] = buffer[context_word]
+      if data_index == len(data):
+        buffer.extend(data[0:span])
+        data_index = span
+      else:
+        buffer.append(data[data_index])
+        data_index += 1
+    # Backtrack a little bit to avoid skipping words in the end of a batch
+    data_index = (data_index + len(data) - span) % len(data)
+    return batch, labels
+
+  batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
+  for i in range(8):
+    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
+          reverse_dictionary[labels[i, 0]])
+
+  # Step 4: Build and train a skip-gram model.
+
+  batch_size = 128
+  embedding_size = 128  # Dimension of the embedding vector.
+  skip_window = 1  # How many words to consider left and right.
+  num_skips = 2  # How many times to reuse an input to generate a label.
+  num_sampled = 64  # Number of negative examples to sample.
+
+  # We pick a random validation set to sample nearest neighbors. Here we limit
+  # the validation samples to the words that have a low numeric ID, which by
+  # construction are also the most frequent. These 3 variables are used only for
+  # displaying model accuracy, they don't affect calculation.
+  valid_size = 16  # Random set of words to evaluate similarity on.
+  valid_window = 100  # Only pick dev samples in the head of the distribution.
+  valid_examples = np.random.choice(valid_window, valid_size, replace=False)
+
+  graph = tf.Graph()
+
+  with graph.as_default():
+
+    # Input data.
+    with tf.name_scope('inputs'):
+      train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
+      train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
+      valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
+
+    # Ops and variables pinned to the CPU because of missing GPU implementation
+    with tf.device('/cpu:0'):
+      # Look up embeddings for inputs.
+      with tf.name_scope('embeddings'):
+        embeddings = tf.Variable(
+            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
+        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
+
+      # Construct the variables for the NCE loss
+      with tf.name_scope('weights'):
+        nce_weights = tf.Variable(
+            tf.truncated_normal([vocabulary_size, embedding_size],
+                                stddev=1.0 / math.sqrt(embedding_size)))
+      with tf.name_scope('biases'):
+        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
+
+    # Compute the average NCE loss for the batch.
+    # tf.nce_loss automatically draws a new sample of the negative labels each
+    # time we evaluate the loss.
+    # Explanation of the meaning of NCE loss:
+    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
+    with tf.name_scope('loss'):
+      loss = tf.reduce_mean(
+          tf.nn.nce_loss(
+              weights=nce_weights,
+              biases=nce_biases,
+              labels=train_labels,
+              inputs=embed,
+              num_sampled=num_sampled,
+              num_classes=vocabulary_size))
+
+    # Add the loss value as a scalar to summary.
+    tf.summary.scalar('loss', loss)
+
+    # Construct the SGD optimizer using a learning rate of 1.0.
+    with tf.name_scope('optimizer'):
+      optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
+
+    # Compute the cosine similarity between minibatch examples and all
+    # embeddings.
+    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
+    normalized_embeddings = embeddings / norm
+    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
+                                              valid_dataset)
+    similarity = tf.matmul(
+        valid_embeddings, normalized_embeddings, transpose_b=True)
+
+    # Merge all summaries.
+    merged = tf.summary.merge_all()
+
+    # Add variable initializer.
+    init = tf.global_variables_initializer()
+
+    # Create a saver.
+    saver = tf.train.Saver()
+
+  # Step 5: Begin training.
+  num_steps = 100001
+
+  with tf.Session(graph=graph) as session:
+    # Open a writer to write summaries.
+    writer = tf.summary.FileWriter(log_dir, session.graph)
+
+    # We must initialize all variables before we use them.
+    init.run()
+    print('Initialized')
+
+    average_loss = 0
+    for step in xrange(num_steps):
+      batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
+                                                  skip_window)
+      feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
+
+      # Define metadata variable.
+      run_metadata = tf.RunMetadata()
+
+      # We perform one update step by evaluating the optimizer op (including it
+      # in the list of returned values for session.run()
+      # Also, evaluate the merged op to get all summaries from the returned
+      # "summary" variable. Feed metadata variable to session for visualizing
+      # the graph in TensorBoard.
+      _, summary, loss_val = session.run([optimizer, merged, loss],
+                                         feed_dict=feed_dict,
+                                         run_metadata=run_metadata)
+      average_loss += loss_val
+
+      # Add returned summaries to writer in each step.
+      writer.add_summary(summary, step)
+      # Add metadata to visualize the graph for the last run.
+      if step == (num_steps - 1):
+        writer.add_run_metadata(run_metadata, 'step%d' % step)
+
+      if step % 2000 == 0:
+        if step > 0:
+          average_loss /= 2000
+        # The average loss is an estimate of the loss over the last 2000
+        # batches.
+        print('Average loss at step ', step, ': ', average_loss)
+        average_loss = 0
+
+      # Note that this is expensive (~20% slowdown if computed every 500 steps)
+      if step % 10000 == 0:
+        sim = similarity.eval()
+        for i in xrange(valid_size):
+          valid_word = reverse_dictionary[valid_examples[i]]
+          top_k = 8  # number of nearest neighbors
+          nearest = (-sim[i, :]).argsort()[1:top_k + 1]
+          log_str = 'Nearest to %s:' % valid_word
+          for k in xrange(top_k):
+            close_word = reverse_dictionary[nearest[k]]
+            log_str = '%s %s,' % (log_str, close_word)
+          print(log_str)
+    final_embeddings = normalized_embeddings.eval()
+
+    # Write corresponding labels for the embeddings.
+    with open(log_dir + '/metadata.tsv', 'w') as f:
+      for i in xrange(vocabulary_size):
+        f.write(reverse_dictionary[i] + '\n')
+
+    # Save the model for checkpoints.
+    saver.save(session, os.path.join(log_dir, 'model.ckpt'))
+
+    # Create a configuration for visualizing embeddings with the labels in
+    # TensorBoard.
+    config = projector.ProjectorConfig()
+    embedding_conf = config.embeddings.add()
+    embedding_conf.tensor_name = embeddings.name
+    embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
+    projector.visualize_embeddings(writer, config)
+
+  writer.close()
+
+  # Step 6: Visualize the embeddings.
+
+  # pylint: disable=missing-docstring
+  # Function to draw visualization of distance between embeddings.
+  def plot_with_labels(low_dim_embs, labels, filename):
+    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
+    plt.figure(figsize=(18, 18))  # in inches
+    for i, label in enumerate(labels):
+      x, y = low_dim_embs[i, :]
+      plt.scatter(x, y)
+      plt.annotate(
+          label,
+          xy=(x, y),
+          xytext=(5, 2),
+          textcoords='offset points',
+          ha='right',
+          va='bottom')
+
+    plt.savefig(filename)
+
+  try:
+    # pylint: disable=g-import-not-at-top
+    from sklearn.manifold import TSNE
+    import matplotlib.pyplot as plt
+
+    tsne = TSNE(
+        perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
+    plot_only = 500
+    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
+    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
+    plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(),
+                                                        'tsne.png'))
+
+  except ImportError as ex:
+    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
+    print(ex)
+
+
+# All functionality is run after tf.app.run() (b/122547914). This could be split
+# up but the methods are laid sequentially with their usage for clarity.
+def main(unused_argv):
+  # Give a folder path as an argument with '--log_dir' to save
+  # TensorBoard summaries. Default is a log folder in current directory.
+  current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--log_dir',
+      type=str,
+      default=os.path.join(current_path, 'log'),
+      help='The log directory for TensorBoard summaries.')
+  flags, unused_flags = parser.parse_known_args()
+  word2vec_basic(flags.log_dir)
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/tensorflow/go/attrs.go b/tensorflow/go/attrs.go
index f86c5737bc79f1e349e442669615598949ecd333..ed1a1f0b5419f7f76c6aa8ccb657e16480e85780 100644
--- a/tensorflow/go/attrs.go
+++ b/tensorflow/go/attrs.go
@@ -170,7 +170,8 @@ func listAttribute(op *Operation, cname *C.char, meta C.TF_AttrMetadata) (interf
 			}
 			// A []C.int64_t slice backed by C memory.
 			// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-			slice := (*[1 << 30]C.int64_t)(unsafe.Pointer(dim))[:numDim:numDim]
+			// Using [1<<27] instead of [1<<30] so it works on 32-bit architecture
+			slice := (*[1 << 27]C.int64_t)(unsafe.Pointer(dim))[:numDim:numDim]
 			list[i] = makeCShape(slice)
 		}
 		return list, nil
diff --git a/tensorflow/go/genop/internal/api_def_map.go b/tensorflow/go/genop/internal/api_def_map.go
index 8600452b476dee49292cbffe630026cf6077e22b..0bbd88b61c345906a13944aa3c7ad7b0582fffae 100644
--- a/tensorflow/go/genop/internal/api_def_map.go
+++ b/tensorflow/go/genop/internal/api_def_map.go
@@ -31,7 +31,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // Encapsulates a collection of API definitions.
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index fb8163121850cee36e1fcc652ca258b1fe2d42ff..1c05715a1a2f50b857c78e8c192d6c865b70e6c7 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -47,7 +47,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // GenerateFunctionsForRegisteredOps writes a Go source code file to w
diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go
index d20d22e0c1502f92ade7ef5aa40985dce73b7552..acce6dea67c2e93309df70dd5009ad0dc086c523 100644
--- a/tensorflow/go/genop/internal/genop_test.go
+++ b/tensorflow/go/genop/internal/genop_test.go
@@ -22,7 +22,7 @@ import (
 	"testing"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // Creates an ApiDef based on opdef and applies overrides
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 52742716f187c572c22fc6f18a12c9afd66c55c7..c95c8c90b53699d8f4e5e3030839466006dc416e 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -327,6 +327,192 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 	return op.Output(0)
 }
 
+// Subtracts sparse `updates` from an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by subtracting sparse `updates` from the
+// passed in `tensor`.
+// This operation is very similar to `tf.scatter_nd_sub`, except that the updates
+// are subtracted from an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_sub is to subtract individual elements
+// from a tensor by index. For example, say we want to insert 4 scattered elements
+// in a rank-1 tensor with 8 elements.
+//
+// In Python, this scatter subtract operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, -10, 1, -9, -8, 1, 1, -11]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates subtracted according to the indices.
+func TensorScatterSub(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterSub",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Scatter `updates` into an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by applying sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd`, except that the updates are
+// scattered onto an existing tensor (as opposed to a zero-tensor). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// If `indices` contains duplicates, then their updates are accumulated (summed).
+//
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates -- because
+// of some numerical approximation issues, numbers summed in different order
+// may yield different results.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_update(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 11, 1, 10, 9, 1, 1, 12]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_update(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func TensorScatterUpdate(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterUpdate",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Scatter `updates` into a new tensor according to `indices`.
 //
 // Creates a new tensor by applying sparse `updates` to individual values or
@@ -334,6 +520,10 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 // the given `shape` according to indices.  This operator is the inverse of the
 // `tf.gather_nd` operator which extracts values or slices from a given tensor.
 //
+// This operation is similar to tensor_scatter_add, except that the tensor is
+// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
+// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
+//
 // If `indices` contains duplicates, then their updates are accumulated (summed).
 //
 // **WARNING**: The order in which updates are applied is nondeterministic, so the
@@ -464,6 +654,15 @@ func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 }
 
 // QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
+//
+// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+// used when rounding float values to their quantized equivalents. The following
+// rounding modes are currently supported:
+//
+// *   HALF_TO_EVEN: this is the default round_mode.
+// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+//     rounds up to -7.
+//
 // If not specified, defaults to "HALF_TO_EVEN"
 func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
@@ -523,7 +722,7 @@ func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr
 //
 // output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 //
-// The above round function uses half to even rounding.
+// The above round function rounds the value based on the given round_mode.
 //
 //
 // Arguments:
@@ -3422,11 +3621,11 @@ func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 // bucketized values for a single feature.
 //
 // Arguments:
-//	float_values: float; List of Rank 2 Tensor each containing float values for a single feature.
+//	float_values: float; List of Rank 1 Tensor each containing float values for a single feature.
 //	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single
 // feature.
 //
-// Returns int; List of Rank 2 Tensors each containing the bucketized values for a single feature.
+// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
 func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -3497,15 +3696,16 @@ func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resou
 
 // Makes the summary of quantiles for the batch.
 //
-// An op that takes a list of tensors and outputs the quantile summaries for each tensor.
+// An op that takes a list of tensors (one tensor per feature) and outputs the
+// quantile summaries for each tensor.
 //
 // Arguments:
-//	float_values: float; List of Rank 2 Tensors each containing values for a single feature.
+//	float_values: float; List of Rank 1 Tensors each containing values for a single feature.
 //	example_weights: float; Rank 1 Tensor with weights per instance.
 //	epsilon: float; The required maximum approximation error.
 //
-// Returns float; List of Rank 2 Tensors each containing the quantile summary (value, weight,
-// min_rank, max_rank) of a single feature.
+// Returns float; List of Rank 2 Tensors each containing the quantile summary
+// (value, weight, min_rank, max_rank) of a single feature.
 func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -3806,6 +4006,70 @@ func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTrees
 	return op.Output(0)
 }
 
+// Output the logits for the given input data
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//	dense_features: Rank 2 dense features tensor.
+//	logits_dimension: Scalar, dimension of the logits.
+//
+// Returns The logits predictions from the tree for each instance in the batch.
+func TensorForestTreePredict(scope *Scope, tree_handle tf.Output, dense_features tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreePredict",
+		Input: []tf.Input{
+			tree_handle, dense_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Get the number of nodes in a tree
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//
+// Returns The size of the tree.
+func TensorForestTreeSize(scope *Scope, tree_handle tf.Output) (tree_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a tree resource and returns a handle to it.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be created.
+//	tree_config: Serialized proto string of the boosted_trees.Tree.
+//
+// Returns the created operation.
+func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestCreateTreeVariable",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
 type ComputeAccidentalHitsAttr func(optionalAttr)
 
@@ -4262,12 +4526,26 @@ func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_
 	return op.Output(0)
 }
 
+// CollectiveReduceAttr is an optional argument to CollectiveReduce.
+type CollectiveReduceAttr func(optionalAttr)
+
+// CollectiveReduceWaitFor sets the optional wait_for attribute to value.
+// If not specified, defaults to <>
+func CollectiveReduceWaitFor(value []int64) CollectiveReduceAttr {
+	return func(m optionalAttr) {
+		m["wait_for"] = value
+	}
+}
+
 // Mutually reduces multiple tensors of identical type and shape.
-func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64, optional ...CollectiveReduceAttr) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "CollectiveReduce",
 		Input: []tf.Input{
@@ -4829,6 +5107,119 @@ func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.
 	return weights, biases
 }
 
+// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3.
+type CudnnRNNBackpropV3Attr func(optionalAttr)
+
+// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNNV3.
+//
+// Compute the backprop of both data and weights in a RNN. Takes an extra
+//     "sequence_lengths" input than CudnnRNNBackprop.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackpropV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
 // CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
 type CudnnRNNBackpropV2Attr func(optionalAttr)
 
@@ -6196,65 +6587,57 @@ func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Opera
 	return scope.AddOperation(opspec)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+// Delete the tensor specified by its handle in the session.
 //
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			a, b, x,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Identity",
+		Type: "GetSessionHandle",
 		Input: []tf.Input{
-			input,
+			value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// Deprecated. Use TensorArraySizeV3
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan2",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			y, x,
+			handle, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -6450,6 +6833,35 @@ func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Selects the k nearest centers for each point.
+//
+// Rows of points are assumed to be input points. Rows of centers are assumed to be
+// the list of candidate centers. For each point, the k centers that have least L2
+// distance to it are computed.
+//
+// Arguments:
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
+//	k: Number of nearest centers to return for each point. If k is larger than m, then
+// only m centers are returned.
+//
+// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+// corresponding center in nearest_center_indices.
+func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NearestNeighbors",
+		Input: []tf.Input{
+			points, centers, k,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Returns x * y element-wise.
 //
 // *NOTE*: `Multiply` supports broadcasting. More about broadcasting
@@ -6678,6 +7090,34 @@ func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a dataset that passes a sliding window over `input_dataset`.
+//
+// Arguments:
+//
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//
+//
+func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalSlidingWindowDataset",
+		Input: []tf.Input{
+			input_dataset, window_size, window_shift, window_stride,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns which elements of x are finite.
 //
 // @compatibility(numpy)
@@ -7053,6 +7493,19 @@ func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	}
 }
 
+// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
 // Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
@@ -7212,6 +7665,33 @@ func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf
 	return op.Output(0)
 }
 
+// Returns the index of a data point that should be added to the seed set.
+//
+// Entries in distances are assumed to be squared distances of candidate points to
+// the already sampled centers in the seed set. The op constructs one Markov chain
+// of the k-MC^2 algorithm and returns the index of one candidate point to be added
+// as an additional cluster center.
+//
+// Arguments:
+//	distances: Vector with squared distances to the closest previously sampled cluster center
+// for each candidate point.
+//	seed: Scalar. Seed for initializing the random number generator.
+//
+// Returns Scalar with the index of the sampled point.
+func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "KMC2ChainInitialization",
+		Input: []tf.Input{
+			distances, seed,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes hyperbolic sine of x element-wise.
 func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -7332,6 +7812,47 @@ func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Get the value of the tensor specified by its handle.
+//
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
+//
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "GetSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the sqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MatrixInverseAttr is an optional argument to MatrixInverse.
 type MatrixInverseAttr func(optionalAttr)
 
@@ -7548,55 +8069,28 @@ func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
+// Creates a dataset that batches input elements into a SparseTensor.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of x AND y element-wise.
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//
+func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "ExperimentalDenseToSparseBatchDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, batch_size, row_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -7684,7 +8178,7 @@ func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 // Arguments:
 //	input: The text to be processed.
 //	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	rewrite: The rewrite to be applied to the matched expression.
 //
 // Returns The text after applying pattern and rewrite.
 func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
@@ -7706,6 +8200,60 @@ func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.O
 	return op.Output(0)
 }
 
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
+
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Computes the complex absolute value of a tensor.
+//
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ComplexAbs",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of x AND y element-wise.
+//
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Quantized Batch normalization.
 //
 // This op is deprecated and will be removed in the future. Prefer
@@ -8382,6 +8930,26 @@ func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// Checks whether a tree has been initialized.
+//
+// Arguments:
+//	tree_handle: Handle to the tree.
+//
+// Returns Whether the tree is initialized.
+func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeIsInitializedOp",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Gets next element for the provided shard number.
 //
 // Arguments:
@@ -8947,77 +9515,6 @@ func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
-//
-// For example:
-//
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
-//
-// This is typically used by gradient computations for a concat operation.
-//
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
-//
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
-}
-
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
-//
-// is the lower incomplete Gamma function.
-//
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igamma",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthToSpaceAttr is an optional argument to DepthToSpace.
 type DepthToSpaceAttr func(optionalAttr)
 
@@ -9564,6 +10061,113 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
+//
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
+//
+// In detail, with the default NHWC format,
+//
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
 // The input `SparseTensor` is represented via the tuple of inputs
@@ -9883,6 +10487,37 @@ func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
+// Selects num_to_sample rows of input using the KMeans++ criterion.
+//
+// Rows of points are assumed to be input points. One row is selected at random.
+// Subsequent rows are sampled with probability proportional to the squared L2
+// distance from the nearest row selected thus far till num_to_sample rows have
+// been sampled.
+//
+// Arguments:
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	num_to_sample: Scalar. The number of rows to sample. This value must not be larger than n.
+//	seed: Scalar. Seed for initializing the random number generator.
+//	num_retries_per_sample: Scalar. For each row that is sampled, this parameter
+// specifies the number of additional points to draw from the current
+// distribution before selecting the best. If a negative value is specified, a
+// heuristic is used to sample O(log(num_to_sample)) additional points.
+//
+// Returns Matrix of shape (num_to_sample, d). The sampled rows.
+func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample tf.Output, seed tf.Output, num_retries_per_sample tf.Output) (samples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "KmeansPlusPlusInitialization",
+		Input: []tf.Input{
+			points, num_to_sample, seed, num_retries_per_sample,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Shuffle dimensions of x according to a permutation.
 //
 // The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
@@ -10844,6 +11479,61 @@ func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Outp
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
+
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceMax",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the Gauss error function of `x` element-wise.
 func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -10917,7 +11607,6 @@ func OneHotAxis(value int64) OneHotAttr {
 // =========
 //
 // Suppose that
-//
 // ```
 //   indices = [0, 2, -1, 1]
 //   depth = 3
@@ -10927,16 +11616,15 @@ func OneHotAxis(value int64) OneHotAttr {
 // ```
 //
 // Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
+// ```
+// output =
+//   [5.0 0.0 0.0]  // one_hot(0)
+//   [0.0 0.0 5.0]  // one_hot(2)
+//   [0.0 0.0 0.0]  // one_hot(-1)
+//   [0.0 5.0 0.0]  // one_hot(1)
+// ```
 //
 // Suppose that
-//
 // ```
 //   indices = [0, 2, -1, 1]
 //   depth = 3
@@ -10946,19 +11634,19 @@ func OneHotAxis(value int64) OneHotAttr {
 // ```
 //
 // Then output is `[3 x 4]`:
+// ```
+// output =
+//   [0.0 3.0 3.0 3.0]
+//   [3.0 3.0 3.0 0.0]
+//   [3.0 3.0 3.0 3.0]
+//   [3.0 0.0 3.0 3.0]
+// //  ^                one_hot(0)
+// //      ^            one_hot(2)
+// //          ^        one_hot(-1)
+// //              ^    one_hot(1)
+// ```
 //
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
 // Suppose that
-//
 // ```
 //   indices = [[0, 2], [1, -1]]
 //   depth = 3
@@ -10968,15 +11656,16 @@ func OneHotAxis(value int64) OneHotAttr {
 // ```
 //
 // Then output is `[2 x 2 x 3]`:
-//
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// ```
+// output =
+//   [
+//     [1.0, 0.0, 0.0]  // one_hot(0)
+//     [0.0, 0.0, 1.0]  // one_hot(2)
+//   ][
+//     [0.0, 1.0, 0.0]  // one_hot(1)
+//     [0.0, 0.0, 0.0]  // one_hot(-1)
+//   ]
+// ```
 //
 // Arguments:
 //	indices: A tensor of indices.
@@ -11004,223 +11693,6 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
-//
-//     values.shape = input.shape[:-1]
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
-//
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NthElement",
-		Input: []tf.Input{
-			input, n,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
-//
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
-//   }
-//   out_arg {
-//     name: "output"
-//     description: <<END
-// Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-//
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseExample",
-		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // CudnnRNNAttr is an optional argument to CudnnRNN.
 type CudnnRNNAttr func(optionalAttr)
 
@@ -11534,8 +12006,8 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 // Arguments:
 //	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
 // `params` RaggedTensor input.
-//	params_dense_values: The `inner_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to inner_values, so dense_values is the
+//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to flat_values, so dense_values is the
 // deprecated name.
 //	indices: Indices in the outermost dimension of `params` of the values that should be
 // gathered.
@@ -11544,7 +12016,7 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 // `indices.shape.ndims + params.ragged_rank - 1`.
 //
 // Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `inner_values` for the returned RaggedTensor.
+// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
 func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -11617,6 +12089,47 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Converts a `RaggedTensor` into a `SparseTensor` with the same values.
 //
 // input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
@@ -11625,7 +12138,7 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 //
 // Arguments:
 //	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
+//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
 //
 // Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
 func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
@@ -11768,217 +12281,6 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
-//
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
-//
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
-		Input: []tf.Input{
-			image_size, bounding_boxes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // Computes sigmoid of `x` element-wise.
 //
 // Specifically, `y = 1 / (1 + exp(-x))`.
@@ -12285,6 +12587,261 @@ func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.
 	return scope.AddOperation(opspec)
 }
 
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exp",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse sets the optional reverse attribute to value.
+//
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Finds values of the `n`-th order statistic for the last dimension.
+//
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
+//
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+//
+//     values.shape = input.shape[:-1]
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "NthElement",
+		Input: []tf.Input{
+			input, n,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
+//
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentMax",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseExample",
+		Input: []tf.Input{
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
+type StatefulStandardNormalAttr func(optionalAttr)
+
+// StatefulStandardNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	shape: The shape of the output tensor.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulStandardNormal",
+		Input: []tf.Input{
+			resource, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Locks a mutex resource.  The output is the lock.  So long as the lock tensor
 //
 // is alive, any other request to use `MutexLock` with this mutex will wait.
@@ -12558,47 +13115,6 @@ func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Adds sparse updates to the variable referenced by `resource`.
 //
 // This operation computes
@@ -12882,141 +13398,6 @@ func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_itera
 	return op.Output(0)
 }
 
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
-//
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
-
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
-//
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
-	return func(m optionalAttr) {
-		m["replace_global"] = value
-	}
-}
-
-// Replaces the match of pattern in input with rewrite.
-//
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
-//
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
-//
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EluGrad",
-		Input: []tf.Input{
-			gradients, outputs,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that contains `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
-//
-//
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TakeDataset",
-		Input: []tf.Input{
-			input_dataset, count,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Reads the value of a variable.
 //
 // The tensor returned by this operation is immutable.
@@ -13087,9 +13468,7 @@ func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 	}
 }
 
-// Adds sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
+// Applies sparse addition to individual values or slices in a Variable.
 //
 // `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
@@ -13103,24 +13482,24 @@ func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 // `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
 // ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 // ```
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that addition would look like this:
 //
 // ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_add(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// add = tf.scatter_nd_add(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(add)
 // ```
 //
 // The resulting update to ref would look like this:
 //
-//     [1, 12, 3, 14, 14, 6, 7, 20]
+//     [1, 13, 3, 14, 14, 6, 7, 20]
 //
 // See `tf.scatter_nd` for more details about how to make updates to
 // slices.
@@ -13206,6 +13585,65 @@ func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Deprecated. Use TensorArraySplitV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReshape",
+		Input: []tf.Input{
+			input_indices, input_shape, new_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Bucketizes 'input' based on 'boundaries'.
 //
 // For example, if the inputs are
@@ -13244,63 +13682,6 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-//
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
-//
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs deterministic pseudorandom random integers from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[minval, maxval)`.
-//
-// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
-//
-// Returns Random values with specified shape.
-func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniformInt",
-		Input: []tf.Input{
-			shape, seed, minval, maxval,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
 type StatelessTruncatedNormalAttr func(optionalAttr)
 
@@ -13482,6 +13863,91 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
+type UnicodeDecodeAttr func(optionalAttr)
+
+// UnicodeDecodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints for
+// each input string begin and end within the `char_values` tensor.
+// In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
+//
+// Arguments:
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeDecode",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
 // (1) Broadcasts the dense side to have the same shape as the sparse side, if
@@ -13530,6 +13996,84 @@ func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
+type UnicodeEncodeAttr func(optionalAttr)
+
+// UnicodeEncodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD (U+65533).
+// If not specified, defaults to 65533
+func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// Encode a tensor of ints into unicode strings.
+//
+// Returns a vector of strings, where `output[i]` is constructed by encoding the
+// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+// using `output_encoding`.
+//
+// ---
+//
+// Example:
+//
+// ```
+// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+// input_splits = [0, 5, 10]
+// output_encoding = 'UTF-8'
+//
+// output = ['Hello', 'World']
+// ```
+//
+// Arguments:
+//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
+//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
+// In particular, `output[i]` is constructed by encoding the codepoints in the
+// slice `input_values[input_splits[i]:input_splits[i+1]]`.
+//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+// "UTF-16-BE", and "UTF-32-BE"`.
+//
+// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeEncode",
+		Input: []tf.Input{
+			input_values, input_splits,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the number of tensors in the input tensor list.
 //
 // input_handle: the input list
@@ -14852,6 +15396,117 @@ func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Ou
 	return op.Output(0)
 }
 
+// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
+type CudnnRNNV3Attr func(optionalAttr)
+
+// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
+//
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
 // Applies softmax to a batched N-D `SparseTensor`.
 //
 // The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
@@ -15143,28 +15798,47 @@ func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the shape of a tensor.
+// Update '*var' according to the RMSProp algorithm.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// For example:
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15173,59 +15847,166 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the power of one value to another.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			input,
+			image_size, bounding_boxes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // LRNAttr is an optional argument to LRN.
@@ -15418,7 +16199,7 @@ func TensorListStackNumElements(value int64) TensorListStackAttr {
 // tensor: the gathered result
 // num_elements: optional. If not -1, the number of elements in the list.
 //
-func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15429,7 +16210,7 @@ func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.Data
 	opspec := tf.OpSpec{
 		Type: "TensorListStack",
 		Input: []tf.Input{
-			input_handle,
+			input_handle, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -16460,7 +17241,7 @@ func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (o
 // tensor: the withdrawn last element of the list
 // element_dtype: the type of elements in the list
 // element_shape: the shape of the output tensor
-func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16468,7 +17249,7 @@ func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.Da
 	opspec := tf.OpSpec{
 		Type: "TensorListPopBack",
 		Input: []tf.Input{
-			input_handle,
+			input_handle, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -16682,27 +17463,6 @@ func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
-		Input: []tf.Input{
-			value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
 type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
 
@@ -16870,6 +17630,23 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalLatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
 type SparseTensorDenseMatMulAttr func(optionalAttr)
 
@@ -16988,6 +17765,90 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 	return scope.AddOperation(opspec)
 }
 
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // SerializeManySparseAttr is an optional argument to SerializeManySparse.
 type SerializeManySparseAttr func(optionalAttr)
 
@@ -18206,45 +19067,94 @@ func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
+type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
+
+// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+}
+
+// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+// Similarly, the character start byte offsets are returned using a single vector
+// `char_to_byte_starts`, with strings expanded in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints and start offsets for
+// each input string begin and end within the `char_values` and
+// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
+// character in `char_values` starts.
+func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "UnicodeDecodeWithOffsets",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Returns x - y element-wise.
@@ -18265,47 +19175,6 @@ func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Use RandomPoissonV2 instead.
-//
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
-		Input: []tf.Input{
-			shape, rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
 // *NOTE*: `Maximum` supports broadcasting. More about broadcasting
@@ -18658,6 +19527,141 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucket",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
+
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+//
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces the match of pattern in input with rewrite.
+//
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expression.
+//
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StaticRegexReplace",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for the exponential linear (Elu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IgammaGradA",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
+//
+//
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TakeDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // The gradient operator for the SparseAdd op.
 //
 // The SparseAdd op calculates A + B, where A, B, and the sum are all represented
@@ -19241,6 +20245,42 @@ func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...Fix
 	return op.Output(0)
 }
 
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+//
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan2",
+		Input: []tf.Input{
+			y, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AudioSummaryAttr is an optional argument to AudioSummary.
 type AudioSummaryAttr func(optionalAttr)
 
@@ -19527,65 +20567,6 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
-//
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReshape",
-		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Deprecated. Use TensorArraySplitV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Reorders a SparseTensor into the canonical, row-major ordering.
 //
 // Note that by convention, all sparse ops preserve the canonical ordering along
@@ -19768,6 +20749,176 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+//
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
+//
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceGather",
+		Input: []tf.Input{
+			resource, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Delete the TensorArray from its resource container.
+//
+// This enables the user to close and release the resource in the middle
+// of a step/run.
+//
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV3",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
+
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
 // This Op does not require `a_indices` be sorted in standard lexicographic order.
@@ -20313,20 +21464,57 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 	return op.Output(0)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+// Returns the element-wise sum of a list of tensors.
 //
-// The Hurwitz zeta function is defined as:
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
 //
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			x, q,
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs deterministic pseudorandom random integers from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+//
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniformInt",
+		Input: []tf.Input{
+			shape, seed, minval, maxval,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -20339,9 +21527,9 @@ func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 // inner-most dimension of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
+// Returns A complex tensor of the same shape as `input`. The inner-most
 //   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
 // @compatibility(numpy)
@@ -20367,9 +21555,9 @@ func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 // 2 dimensions of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
 //   dimensions of `input` are replaced with their 2D Fourier transform.
 //
 // @compatibility(numpy)
@@ -20395,9 +21583,9 @@ func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 // inner-most 2 dimensions of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
 //   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
 // @compatibility(numpy)
@@ -20417,27 +21605,6 @@ func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncateMod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Inverse 2D real-valued fast Fourier transform.
 //
 // Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
@@ -20737,6 +21904,44 @@ func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_
 	return op.Output(0)
 }
 
+// TensorListConcatAttr is an optional argument to TensorListConcat.
+type TensorListConcatAttr func(optionalAttr)
+
+// TensorListConcatElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Concats all tensors in the list along the 0th dimension.
+//
+// Requires that all tensors have the same shape except the first dimension.
+//
+// input_handle: The input list.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListConcat",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Returns the set of files matching one or more glob patterns.
 //
 // Note that this routine only supports wildcard characters in the
@@ -21028,63 +22233,6 @@ func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x < y) element-wise.
-//
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Less",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
-//
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
-		Input: []tf.Input{
-			features, max_value, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
@@ -21110,176 +22258,6 @@ func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output,
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
-//
-// Arguments:
-//
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
-		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
-
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
-//
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
-		Input: []tf.Input{
-			logits, num_samples, seed,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceGather",
-		Input: []tf.Input{
-			resource, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Saves the input tensors to disk.
 //
 // The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
@@ -21948,6 +22926,61 @@ func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output,
 	return op.Output(0), op.Output(1)
 }
 
+// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
+type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
+
+// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$vhat_t := max{vhat_{t-1}, v_t}$$
+// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	vhat: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdamWithAmsgrad",
+		Input: []tf.Input{
+			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
 type MapUnstageNoKeyAttr func(optionalAttr)
 
@@ -22186,6 +23219,69 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
+type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Assigns a new value to a variable.
 //
 // Any ReadVariableOp with a control dependency on this op is guaranteed to return
@@ -22349,6 +23445,93 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
+// Adds sparse `updates` to an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by adding sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd_add`, except that the updates
+// are added onto an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterAdd",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sign and the log of the absolute value of the determinant of
 //
 // one or more square matrices.
@@ -22444,25 +23627,6 @@ func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_uppe
 	return op.Output(0)
 }
 
-// Delete the tensor specified by its handle in the session.
-//
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Subtracts a value from the current value of a variable.
 //
 // Any ReadVariableOp with a control dependency on this op is guaranteed to
@@ -22592,6 +23756,84 @@ func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//
+//	num_threads: Identifies the number of threads to use for the private threadpool.
+//
+//
+func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalPrivateThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, num_threads,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
+type ExperimentalParseExampleDatasetAttr func(optionalAttr)
+
+// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
+// If not specified, defaults to false
+func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
+	return func(m optionalAttr) {
+		m["sloppy"] = value
+	}
+}
+
+// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
+//
+// Arguments:
+//
+//
+//	dense_defaults: A dict mapping string keys to `Tensor`s.
+// The keys of the dict must match the dense_keys of the feature.
+//	sparse_keys: A list of string keys in the examples features.
+// The results for these keys will be returned as `SparseTensor` objects.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples features associated with dense values.
+//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+// and `tf.string` (`BytesList`) are supported.
+//	dense_shapes: List of tuples with the same length as `dense_keys`.
+// The shape of the data for each dense feature referenced by `dense_keys`.
+// Required for any input tensors identified by `dense_keys`.  Must be
+// either fully defined, or may contain an unknown first dimension.
+// An unknown first dimension means the feature is treated as having
+// a variable number of blocks, and the output shape along this dimension
+// is considered unknown at graph build time.  Padding is applied for
+// minibatch elements smaller than the maximum number of blocks for the
+// given feature along this dimension.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalParseExampleDataset",
+		Input: []tf.Input{
+			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
 type SdcaOptimizerAttr func(optionalAttr)
 
@@ -23155,6 +24397,377 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes offsets of concat inputs within its output.
+//
+// For example:
+//
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
+//
+// This is typically used by gradient computations for a concat operation.
+//
+// Arguments:
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
+//
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatOffset",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(shape),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
+}
+
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
+//
+// The lower regularized incomplete Gamma function is defined as:
+//
+//
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igamma",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Zeta",
+		Input: []tf.Input{
+			x, q,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the cardinality of `input_dataset`.
+//
+// Returns the cardinality of `input_dataset`.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to return cardinality for.
+//
+// Returns The cardinality of `input_dataset`. Named constants are used to represent
+// infinite and unknown cardinality.
+func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDatasetCardinality",
+		Input: []tf.Input{
+			input_dataset,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that executes a SQL query and emits rows of the result set.
+//
+// Arguments:
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
+//
+//
+func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalSqlDataset",
+		Input: []tf.Input{
+			driver_name, data_source_name, query,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+//
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Betainc",
+		Input: []tf.Input{
+			a, b, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
+//
+// This operation returns a 1-D integer tensor representing the shape of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Shape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes fingerprints of the input strings.
+//
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+//
+// Arguments:
+//
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedReluX",
+		Input: []tf.Input{
+			features, max_value, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the truth value of (x < y) element-wise.
+//
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Less",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
+
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
+//
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomPoisson",
+		Input: []tf.Input{
+			shape, rate,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the truth value of (x >= y) element-wise.
 //
 // *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
@@ -23336,6 +24949,26 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
+// Serializes the tree handle to a proto
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be serialized.
+//
+// Returns Serialied proto string of the tree resource.
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSerialize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseMatMulAttr is an optional argument to SparseMatMul.
 type SparseMatMulAttr func(optionalAttr)
 
@@ -26491,6 +28124,28 @@ func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
+//
+// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesQuantileStreamResourceDeserialize",
+		Input: []tf.Input{
+			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Inverse 3D fast Fourier transform.
 //
 // Computes the inverse 3-dimensional discrete Fourier transform over the
@@ -26618,6 +28273,96 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues zero or more tuples of one or more tensors in the given queue.
+//
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
+//
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
+//
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueEnqueueManyV2",
+		Input: []tf.Input{
+			handle, tf.OutputList(components),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// PrintV2Attr is an optional argument to PrintV2.
+type PrintV2Attr func(optionalAttr)
+
+// PrintV2OutputStream sets the optional output_stream attribute to value.
+//
+// value: A string specifying the output stream or logging level to print to.
+// If not specified, defaults to "stderr"
+func PrintV2OutputStream(value string) PrintV2Attr {
+	return func(m optionalAttr) {
+		m["output_stream"] = value
+	}
+}
+
+// Prints a string scalar.
+//
+// Prints a string scalar to the desired output_stream.
+//
+// Arguments:
+//	input: The string scalar to print.
+//
+// Returns the created operation.
+func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PrintV2",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
 // Arguments:
@@ -27018,6 +28763,29 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 	return op.Output(0)
 }
 
+// Splits a tensor into a list.
+//
+// list[i] corresponds to lengths[i] tensors from the input tensor.
+// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+//
+// tensor: The input tensor.
+// element_shape: A shape compatible with that of elements in the tensor.
+// lengths: Vector of sizes of the 0th dimension of tensors in the list.
+// output_handle: The list.
+func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSplit",
+		Input: []tf.Input{
+			tensor, element_shape, lengths,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPoolAttr is an optional argument to AvgPool.
 type AvgPoolAttr func(optionalAttr)
 
@@ -27122,7 +28890,7 @@ func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.
 // item: the element at that position
 //
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27130,7 +28898,7 @@ func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, el
 	opspec := tf.OpSpec{
 		Type: "TensorListGetItem",
 		Input: []tf.Input{
-			input_handle, index,
+			input_handle, index, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -27138,6 +28906,26 @@ func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, el
 	return op.Output(0)
 }
 
+// Resizes the list.
+//
+//
+// input_handle: the input list
+// size: size of the output list
+//
+func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListResize",
+		Input: []tf.Input{
+			input_handle, size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns a diagonal tensor with a given diagonal values.
 //
 // Given a `diagonal`, this operation returns a tensor with the `diagonal` and
@@ -27262,7 +29050,7 @@ func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, it
 // input_handle: The input tensor list.
 // indices: The indices used to index into the list.
 // values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27270,7 +29058,7 @@ func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, e
 	opspec := tf.OpSpec{
 		Type: "TensorListGather",
 		Input: []tf.Input{
-			input_handle, indices,
+			input_handle, indices, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -27328,7 +29116,7 @@ func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, elemen
 //	limits: The limits of each range.
 //	deltas: The deltas of each range.
 //
-// Returns The `row_splits` for the returned `RaggedTensor`.The `inner_values` for the returned `RaggedTensor`.
+// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
 func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -27707,6 +29495,66 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
+// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
+type ResourceApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+//
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum - lr * grad
+// var += accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyKerasMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Returns a serialized GraphDef representing `input_dataset`.
 //
 // Returns a graph representation for `input_dataset`.
@@ -27729,6 +29577,162 @@ func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
 	return op.Output(0)
 }
 
+// LuAttr is an optional argument to Lu.
+type LuAttr func(optionalAttr)
+
+// LuOutputIdxType sets the optional output_idx_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LuOutputIdxType(value tf.DataType) LuAttr {
+	return func(m optionalAttr) {
+		m["output_idx_type"] = value
+	}
+}
+
+// Computes the LU decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be invertible.
+//
+// The output consists of two tensors LU and P containing the LU decomposition
+// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+// upper triangular factors.
+//
+// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+// entries correspond to the upper triangular part, including the diagonal, of LU.
+//
+// P represents a permutation matrix encoded as a list of indices each between `0`
+// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+// P, then the L, U and P satisfies P_mat * input = L * U.
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+// size `[M, M]`.
+//
+// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+// lower triangular factor `L` with unit diagonal, and whose upper triangular part
+// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+// `[..., M]`.
+// @compatibility(scipy)
+// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+// packed into a single tensor, the permutation is applied to `input` instead of
+// the right hand side and the permutation `P` is returned as a list of indices
+// instead of a permutation matrix.
+// @end_compatibility
+func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Lu",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Deprecated. Use TensorArrayCloseV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+//
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
+
+// EncodeBase64Pad sets the optional pad attribute to value.
+//
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
+	return func(m optionalAttr) {
+		m["pad"] = value
+	}
+}
+
+// Encode strings into web-safe base64 format.
+//
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeBase64",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A dataset that creates window datasets from the input dataset.
+//
+// Arguments:
+//
+//	size: A scalar representing the number of elements to accumulate in a window.
+//	shift: A scalar representing the steps moving the sliding window forward in one
+// iteration. It must be positive.
+//	stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
+// smaller than desired.
+//
+//
+func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "WindowDataset",
+		Input: []tf.Input{
+			input_dataset, size, shift, stride, drop_remainder,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the matrix square root of one or more square matrices:
 //
 // matmul(sqrtm(A), sqrtm(A)) = A
@@ -27838,96 +29842,6 @@ func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PrintV2Attr is an optional argument to PrintV2.
-type PrintV2Attr func(optionalAttr)
-
-// PrintV2OutputStream sets the optional output_stream attribute to value.
-//
-// value: A string specifying the output stream or logging level to print to.
-// If not specified, defaults to "stderr"
-func PrintV2OutputStream(value string) PrintV2Attr {
-	return func(m optionalAttr) {
-		m["output_stream"] = value
-	}
-}
-
-// Prints a string scalar.
-//
-// Prints a string scalar to the desired output_stream.
-//
-// Arguments:
-//	input: The string scalar to print.
-//
-// Returns the created operation.
-func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "PrintV2",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
-
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Enqueues zero or more tuples of one or more tensors in the given queue.
-//
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
-//
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
-		Input: []tf.Input{
-			handle, tf.OutputList(components),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Computes the product along segments of a tensor.
 //
 // Read
@@ -28987,90 +30901,6 @@ func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Outpu
 	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the Adam algorithm.
-//
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
-		Input: []tf.Input{
-			value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
 type ResizeBicubicGradAttr func(optionalAttr)
 
@@ -29948,6 +31778,43 @@ func Iterator(scope *Scope, shared_name string, container string, output_types [
 	return op.Output(0)
 }
 
+// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
+type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+
+// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a TensorForestTreeResource
+func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeResourceHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
 type CropAndResizeGradImageAttr func(optionalAttr)
 
@@ -30312,6 +32179,29 @@ func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Outpu
 	return op.Output(0)
 }
 
+// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
+//
+// This operation returns the same result as the C++ std::nextafter function.
+//
+// It can also return a subnormal number.
+//
+// @compatibility(cpp)
+// Equivalent to C++ std::nextafter function.
+// @end_compatibility
+func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NextAfter",
+		Input: []tf.Input{
+			x1, x2,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the gradient for the inverse of `x` wrt its input.
 //
 // Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
@@ -30461,6 +32351,71 @@ func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_strea
 	return scope.AddOperation(opspec)
 }
 
+// Creates a Dataset that returns pseudorandom numbers.
+//
+// Arguments:
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//
+//
+func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalRandomDataset",
+		Input: []tf.Input{
+			seed, seed2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// A dataset that splits the elements of its input into multiple elements.
+func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUnbatchDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that overrides the maximum intra-op parallelism.
+//
+// Arguments:
+//
+//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+//
+//
+func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalMaxIntraOpParallelismDataset",
+		Input: []tf.Input{
+			input_dataset, max_intra_op_parallelism,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StringSplitV2Attr is an optional argument to StringSplitV2.
 type StringSplitV2Attr func(optionalAttr)
 
@@ -30823,6 +32778,83 @@ func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.
 	return scope.AddOperation(opspec)
 }
 
+// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
+type ResourceScatterNdSubAttr func(optionalAttr)
+
+// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse subtraction to individual values or slices in a Variable.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
+//
+// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+// with 8 elements. In Python, that subtraction would look like this:
+//
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// sub = tf.scatter_nd_sub(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(sub)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, -9, 3, -6, -4, 6, 7, -4]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdSub",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
 type TensorArrayConcatV2Attr func(optionalAttr)
 
@@ -31032,6 +33064,43 @@ func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
+// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
+type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
+
+// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a statistics manager resource.
+func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // A container for an iterator resource.
 //
 // Returns A handle to the iterator that can be passed to a "MakeIterator" or
@@ -31157,6 +33226,21 @@ func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int
 	return op.Output(0)
 }
 
+// Produces a summary of any statistics recorded by the given statistics manager.
+func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
 // This operation may be executed multiple times. Each execution will reset the
@@ -31488,6 +33572,26 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 	return op.Output(0)
 }
 
+// Deserializes a proto into the tree handle
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be restored.
+//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
+//
+// Returns the created operation.
+func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeDeserialize",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Constructs an Optional variant from a tuple of tensors.
 func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
@@ -31705,9 +33809,9 @@ func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []
 // dimension of `input`.
 //
 // Arguments:
-//	input: A complex64 tensor.
+//	input: A complex tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
+// Returns A complex tensor of the same shape as `input`. The inner-most
 //   dimension of `input` is replaced with its 1D Fourier transform.
 //
 // @compatibility(numpy)
@@ -31769,6 +33873,219 @@ func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Applies sparse addition to `input` using individual values or slices
+//
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
+//
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScatterNdNonAliasingAdd",
+		Input: []tf.Input{
+			input, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
+//
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
+//
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Creates a MultiDeviceIterator resource.
+//
+// Arguments:
+//	devices: A list of devices the iterator works across.
+//	shared_name: If non-empty, this resource will be shared under the given name
+// across multiple sessions.
+//	container: If non-empty, this resource is placed in the given container.
+// Otherwise, a default container is used.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+//
+// Returns Handle to the resource created.
+func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "MultiDeviceIterator",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Performs a padding as a preprocess during a convolution.
 //
 // Similar to FusedResizeAndPadConv2d, this op allows for an optimized
@@ -31820,6 +34137,19 @@ func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	}
 }
 
+// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
 // Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
@@ -32300,124 +34630,54 @@ func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_t
 	return components
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
-
-// EncodeBase64Pad sets the optional pad attribute to value.
-//
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
-	return func(m optionalAttr) {
-		m["pad"] = value
-	}
-}
-
-// Encode strings into web-safe base64 format.
+// Forwards the value of an available tensor from `inputs` to `output`.
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	inputs: The input tensors, exactly one of which will become available.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "Merge",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// A dataset that creates window datasets from the input dataset.
+// Writes the given dataset to the given file using the TFRecord format.
 //
 // Arguments:
-//
-//	size: A scalar representing the number of elements to accumulate in a window.
-//	shift: A scalar representing the steps moving the sliding window forward in one
-// iteration. It must be positive.
-//	stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
-// smaller than desired.
-//
-//
-func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "WindowDataset",
-		Input: []tf.Input{
-			input_dataset, size, shift, stride, drop_remainder,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayCloseV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
 //
 // Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "ExperimentalDatasetToTFRecord",
 		Input: []tf.Input{
-			handle,
+			input_dataset, filename, compression_type,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
-//
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
-//
-// Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
-//
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Merge",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // QueueCloseV2Attr is an optional argument to QueueCloseV2.
 type QueueCloseV2Attr func(optionalAttr)
 
@@ -32754,6 +35014,23 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalBytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StackPushV2Attr is an optional argument to StackPushV2.
 type StackPushV2Attr func(optionalAttr)
 
@@ -33195,61 +35472,6 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0)
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // AsStringAttr is an optional argument to AsString.
 type AsStringAttr func(optionalAttr)
 
@@ -33346,327 +35568,3 @@ func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, val
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
-//
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
-//
-// The resulting value `output` would look like this:
-//
-//     [1, 13, 3, 14, 14, 6, 7, 20]
-//
-// See `tf.scatter_nd` for more details about how to make updates to slices.
-//
-// Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
-//
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
-		Input: []tf.Input{
-			input, indices, updates,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
-//
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
-//
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
-//
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Creates a MultiDeviceIterator resource.
-//
-// Arguments:
-//	devices: A list of devices the iterator works across.
-//	shared_name: If non-empty, this resource will be shared under the given name
-// across multiple sessions.
-//	container: If non-empty, this resource is placed in the given container.
-// Otherwise, a default container is used.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Handle to the resource created.
-func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIterator",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArraySizeV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
-
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
-//
-// In detail, with the default NHWC format,
-//
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
-// Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
-//
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv2D",
-		Input: []tf.Input{
-			input, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/go/tensor_handle.go b/tensorflow/go/tensor_handle.go
index 3b06773dd161f3531c805cd12dc7d59265348a74..09192eccefe13cc4573e69dfac85c8aa169dd6fc 100644
--- a/tensorflow/go/tensor_handle.go
+++ b/tensorflow/go/tensor_handle.go
@@ -123,13 +123,22 @@ func (th *TensorHandle) DeviceName() (string, error) {
 // BackingDeviceName returns the name of the device in whose memory the tensor
 // handle resides. This function will block till the operation that produces
 // `h` has completed.
+//
+// WARNING: The implementation currently returns the same as DeviceName().
+// After TensoFlow 1.13's C library is released, this implementation will
+// be updated to return what the documentation says!
 func (th *TensorHandle) BackingDeviceName() (string, error) {
+	// TODO(ashankar): Restore after TensorFlow 1.13 is released.
+	// See https://github.com/tensorflow/tensorflow/issues/23257#issuecomment-433751410
+	return th.DeviceName()
+	/*
 	status := newStatus()
 	name := C.TFE_TensorHandleBackingDeviceName(th.c, status.c)
 	if err := status.Err(); err != nil {
 		return "", err
 	}
 	return C.GoString(name), nil
+	*/
 }
 
 // ToTensor returns the Tensor referenced by th. It may block if this tensor is
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 10808e162ee4cc679430c0573e5bff8322ad6fff..af5503f2ad308fffb03d2ebd5964eec273896c72 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -295,6 +295,19 @@ tf_java_test(
     ],
 )
 
+tf_java_test(
+    name = "GeneratedOperationsTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.core.GeneratedOperationsTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 tf_java_test(
     name = "GradientsTest",
     size = "small",
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 951e8bdd0dd8aae46a361a8ffcff276579433641..4206f6f9fc8ed029d1a7d9b044dd079ec523de31 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -20,13 +20,13 @@
 Releases built from release branches are available on Maven Central.
 Additionally, every day binaries are built from the `master` branch on GitHub:
 
-- [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
-- [Sourc JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
-- JNI:
-  - [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
-  - [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
-  - [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
-  - Windows: (No nightly builds available yet)
+-   [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
+-   [Source JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
+-   JNI:
+    -   [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
+    -   [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
+    -   [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
+    -   Windows: (No nightly builds available yet)
 
 ## Building from source
 
diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc
index 0d9e0883af262ee1f262a5e1308cb9df8763488d..cf4bb03dadec421411300100880f9129d7da47be 100644
--- a/tensorflow/java/src/gen/cc/op_gen_main.cc
+++ b/tensorflow/java/src/gen/cc/op_gen_main.cc
@@ -35,7 +35,7 @@ const char kUsageHeader[] =
     "graph.\n\n"
     "Operation wrappers are generated under the path specified by the "
     "'--output_dir' argument. This path can be absolute or relative to the\n"
-    "current working directory and will be created if it does not exists.\n\n"
+    "current working directory and will be created if it does not exist.\n\n"
     "Note that the operations will not be available through the "
     "'org.tensorflow.op.Ops' API until the generated classes are compiled\n"
     "using an appropriate annotation processor.\n\n"
diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc
index 4f5a491d259a1381976d21c777bc0871ada1b916..4024efedefd41fb90b215a9d5227d6028331cdaa 100644
--- a/tensorflow/java/src/gen/cc/op_specs.cc
+++ b/tensorflow/java/src/gen/cc/op_specs.cc
@@ -91,11 +91,6 @@ class TypeResolver {
 
 Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) {
   *iterable_out = false;
-  if (!arg_def.number_attr().empty()) {
-    // when number_attr is set, argument has to be a list of tensors
-    *iterable_out = true;
-    visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int()));
-  }
   Type type = Type::Wildcard();
   if (arg_def.type() != DataType::DT_INVALID) {
     type = Type::ForDataType(arg_def.type());
@@ -122,6 +117,11 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) {
     LOG(FATAL) << "Cannot resolve data type of argument \"" << arg_def.name()
                << "\" in operation \"" << op_def_.name() << "\"";
   }
+  if (!arg_def.number_attr().empty()) {
+    // when number_attr is set, argument has to be a list of tensors
+    *iterable_out = true;
+    visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int()));
+  }
   return type;
 }
 
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index 1b7bcdab35f45142aefdc9e9635b398090e60b17..df1426ad75143d720f1d5bd3cf4ce44d30cb226e 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -340,7 +340,7 @@ public final class OperatorProcessor extends AbstractProcessor {
                     + "{@link $T @Operator} is exposed\n"
                     + "by this API or one of its subgroup.\n<p>Example usage:\n<pre>{@code\n"
                     + "try (Graph g = new Graph()) {\n"
-                    + "  Ops ops = new Ops(g);\n"
+                    + "  Ops ops = Ops.create(g);\n"
                     + "  // Operations are typed classes with convenience\n"
                     + "  // builders in Ops.\n"
                     + "  Constant three = ops.constant(3);\n"
diff --git a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
index 49594e6b47b9295d164a1823386b0981776e66f4..e653373f8569d9e84a8e524fd0f7439d7747104f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
@@ -84,7 +84,7 @@ public class SavedModelBundle implements AutoCloseable {
    * <p>This method is a shorthand for:
    *
    * <pre>{@code
-   * SavedModelBundler.loader().withTags(tags).load();
+   * SavedModelBundle.loader().withTags(tags).load();
    * }</pre>
    *
    * @param exportDir the directory path containing a saved model.
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
index 3782240edb4008cc71c55cf48cba8f5873b71018..38f466c57416eac96a09cd1dfe8558fcb8e3606f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
@@ -25,11 +25,11 @@ import java.lang.annotation.Target;
  * Annotation used by classes to make TensorFlow operations conveniently accessible via {@code
  * org.tensorflow.op.Ops}.
  *
- * <p>An annotation processor (TODO: not yet implemented) builds the {@code Ops} class by
- * aggregating all classes annotated as {@code @Operator}s. Each annotated class <b>must</b> have at
- * least one public static factory method named {@code create} that accepts a {@link
- * org.tensorflow.op.Scope} as its first argument. The processor then adds a convenience method in
- * the {@code Ops} class. For example:
+ * <p>An annotation processor ({@code org.tensorflow.processor.OperatorProcessor}) builds the
+ * {@code Ops} class by aggregating all classes annotated as {@code @Operator}s. Each annotated
+ * class <b>must</b> have at least one public static factory method named {@code create} that
+ * accepts a {@link org.tensorflow.op.Scope} as its first argument. The processor then adds a
+ * convenience method in the {@code Ops} class. For example:
  *
  * <pre>{@code
  * @Operator
@@ -45,7 +45,7 @@ import java.lang.annotation.Target;
  * <pre>{@code
  * import org.tensorflow.op.Ops;
  * ...
- * Ops ops = new Ops(graph);
+ * Ops ops = Ops.create(graph);
  * ...
  * ops.myOp(operand);
  * // and has exactly the same effect as calling
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..49c4ff639ecd36763c65e0143d60ab2590aa008b
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java
@@ -0,0 +1,60 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op.core;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.Graph;
+import org.tensorflow.Operand;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+import org.tensorflow.op.Ops;
+
+@RunWith(JUnit4.class)
+public final class GeneratedOperationsTest {
+
+  @Test
+  public void tensorInputTensorOutput() {
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Ops ops = Ops.create(g);
+      Operand<Integer> x = ops.math().add(ops.constant(1), ops.constant(2));
+      try (Tensor<Integer> result = sess.runner().fetch(x).run().get(0).expect(Integer.class)) {
+        assertEquals(3, result.intValue());
+      }
+    }
+  }
+
+  @Test
+  public void testListInputTensorOutput() {
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Ops ops = Ops.create(g);
+      ArrayList<Operand<Integer>> inputs = new ArrayList<Operand<Integer>>();
+      inputs.add(ops.constant(1));
+      inputs.add(ops.constant(2));
+      inputs.add(ops.constant(3));
+      Operand<Integer> x = ops.math().addN(inputs);
+      try (Tensor<Integer> result = sess.runner().fetch(x).run().get(0).expect(Integer.class)) {
+        assertEquals(6, result.intValue());
+      }
+    }
+  }
+}
diff --git a/tensorflow/js/ops/ts_op_gen_test.cc b/tensorflow/js/ops/ts_op_gen_test.cc
index 03241689b5fe2c18f1131e9400c51b88298f143a..1c51dd030f52bc2d248f9a98f17f9d656a34065d 100644
--- a/tensorflow/js/ops/ts_op_gen_test.cc
+++ b/tensorflow/js/ops/ts_op_gen_test.cc
@@ -112,22 +112,15 @@ import {createTensorsTypeOpAttr, nodeBackend} from './op_utils';
 }
 
 TEST(TsOpGenTest, InputSingleAndList) {
-  const string api_def = R"(
-op {
-  name: "Foo"
-  input_arg {
-    name: "images"
-    type_attr: "T"
-    number_attr: "N"
-  }
-}
-)";
+  const string api_def = R"pb(
+    op { graph_op_name: "Foo" arg_order: "dim" arg_order: "images" }
+  )pb";
 
   string ts_file_text;
   GenerateTsOpFileText("", api_def, &ts_file_text);
 
   const string expected = R"(
-export function Foo(images: tfc.Tensor[], dim: tfc.Tensor): tfc.Tensor {
+export function Foo(dim: tfc.Tensor, images: tfc.Tensor[]): tfc.Tensor {
 )";
   ExpectContainsStr(ts_file_text, expected);
 }
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index 8fca01624cfa2c21cd428e63ed1eadf7b853f107..2cc661f2badfe1dcadb2c6f002b50bcbc9c5f2ec 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -60,16 +60,13 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "arena_planner_test",
     size = "small",
     srcs = ["arena_planner_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":arena_planner",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:tflite_portable_logging",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
@@ -196,7 +193,7 @@ cc_library(
         "//tensorflow/lite/core/api:api",
         "//tensorflow/lite/kernels:eigen_support",
         "//tensorflow/lite/kernels:gemm_support",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
         "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/schema:schema_fbs",
     ] + select({
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index c17eddf47bc86c9537364117c302df38e390c8da..bfac073d8702e55008903f4d6a5ba6a976d36c1b 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -228,10 +228,12 @@ def generated_test_models():
         "arg_min_max",
         "avg_pool",
         "batch_to_space_nd",
+        "ceil",
         "concat",
         "constant",
         "control_dep",
         "conv",
+        "conv2d_transpose",
         "conv_with_shared_weights",
         "conv_to_depthwiseconv_with_shared_weights",
         "depthwiseconv",
@@ -252,6 +254,7 @@ def generated_test_models():
         "greater_equal",
         "sum",
         "l2norm",
+        "l2norm_shared_epsilon",
         "l2_pool",
         "leaky_relu",
         "less",
@@ -262,7 +265,7 @@ def generated_test_models():
         "logical_and",
         "logical_or",
         "logical_xor",
-        "lstm",
+        #"lstm", TODO(b/122889684): Resolve toco structured line parsing in oss.
         "max_pool",
         "maximum",
         "mean",
@@ -288,6 +291,7 @@ def generated_test_models():
         "relu6",
         "reshape",
         "resize_bilinear",
+        "reverse_v2",
         "rsqrt",
         "shape",
         "sigmoid",
@@ -311,6 +315,7 @@ def generated_test_models():
         "topk",
         "transpose",
         "transpose_conv",
+        "unique",
         "unpack",
         "unroll_batch_matmul",
         "where",
@@ -324,6 +329,7 @@ def generated_test_models_failing(conversion_mode):
     if conversion_mode == "toco-flex":
         return [
             "lstm",  # TODO(b/117510976): Restore when lstm flex conversion works.
+            "unroll_batch_matmul",  # TODO(b/123030774): Fails in 1.13 tests.
         ]
 
     return []
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index f97d3ac4bf0b27cdd9b1f5ab7258a12036c29179..b6ffb826f597c9c883a55a88de2d528c59e7051f 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -128,6 +128,9 @@ typedef enum {
   kTfLiteBuiltinMirrorPad = 100,
   kTfLiteBuiltinAbs = 101,
   kTfLiteBuiltinSplitV = 102,
+  kTfLiteBuiltinUnique = 103,
+  kTfLiteBuiltinCeil = 104,
+  kTfLiteBuiltinReverseV2 = 105,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index 91c04a5f1fb5bb1a15bd1da074a1276a3d8e7793..680addfa2faa7afd3c7032e147f3be2c600c7668 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -12,7 +12,6 @@ cc_library(
         "c_api_internal.h",
     ],
     visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
 )
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 58e7221bc6e5a9d062127e30d0007838563db76e..332c2db14511af18a8e3d99fc93891ce92d1792a 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -25,6 +25,11 @@ extern "C" {
 
 // TODO(aselle): Consider using "if this then that" for testing.
 
+// Useful placeholder to put in otherwise empty structs to avoid size warnings.
+typedef struct {
+  char dummy;
+} EmptyStructPlaceholder;
+
 // IMPORTANT: All new members of structs must be added at the end to ensure
 // backwards compatibility.
 
@@ -152,9 +157,11 @@ typedef struct {
 } TfLiteAddParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteSpaceToBatchNDParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteBatchToSpaceNDParams;
 
 typedef struct {
@@ -230,9 +237,11 @@ typedef struct {
 } TfLiteResizeNearestNeighborParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLitePadParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLitePadV2Params;
 
 typedef struct {
@@ -272,6 +281,7 @@ typedef struct {
 } TfLiteGatherParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteTransposeParams;
 
 typedef struct {
@@ -351,6 +361,10 @@ typedef struct {
   float alpha;
 } TfLiteLeakyReluParams;
 
+typedef struct {
+  TfLiteType index_out_type;
+} TfLiteUniqueParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index 2923dbad4ef285c497ca2c84d86168954fe8ec99..f20ee23bd81eb87c25a1a7f61cce59df7ae6678e 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -70,6 +70,20 @@ TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) {
 
 void TfLiteIntArrayFree(TfLiteIntArray* a) { free(a); }
 
+int TfLiteFloatArrayGetSizeInBytes(int size) {
+  static TfLiteFloatArray dummy;
+  return sizeof(dummy) + sizeof(dummy.data[0]) * size;
+}
+
+TfLiteFloatArray* TfLiteFloatArrayCreate(int size) {
+  TfLiteFloatArray* ret =
+      (TfLiteFloatArray*)malloc(TfLiteFloatArrayGetSizeInBytes(size));
+  ret->size = size;
+  return ret;
+}
+
+void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); }
+
 void TfLiteTensorDataFree(TfLiteTensor* t) {
   if (t->allocation_type == kTfLiteDynamic && t->data.raw) {
     free(t->data.raw);
@@ -77,10 +91,30 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
   t->data.raw = NULL;
 }
 
+void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
+  if (quantization->type == kTfLiteAffineQuantization) {
+    TfLiteAffineQuantization* q_params =
+        (TfLiteAffineQuantization*)(quantization->params);
+    if (q_params->scale) {
+      TfLiteFloatArrayFree(q_params->scale);
+      q_params->scale = NULL;
+    }
+    if (q_params->zero_point) {
+      TfLiteIntArrayFree(q_params->zero_point);
+      q_params->zero_point = NULL;
+    }
+    free(q_params);
+  }
+  quantization->params = NULL;
+  quantization->type = kTfLiteNoQuantization;
+}
+
 void TfLiteTensorFree(TfLiteTensor* t) {
   TfLiteTensorDataFree(t);
   if (t->dims) TfLiteIntArrayFree(t->dims);
   t->dims = NULL;
+
+  TfLiteQuantizationFree(&t->quantization);
 }
 
 void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
@@ -98,6 +132,9 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
   tensor->allocation_type = allocation_type;
   tensor->allocation = allocation;
   tensor->is_variable = is_variable;
+
+  tensor->quantization.type = kTfLiteNoQuantization;
+  tensor->quantization.params = NULL;
 }
 
 void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index 1b1bc6db8fbe87cd8a7c98f2bbac9211411fea0e..83e2be690762be3e2cacf02ea8311b76dc1731c4 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -98,8 +98,32 @@ int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]);
 // You are expected to free memory with TfLiteIntArrayFree
 TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src);
 
-// Free memory of array `v`.
-void TfLiteIntArrayFree(TfLiteIntArray* v);
+// Free memory of array `a`.
+void TfLiteIntArrayFree(TfLiteIntArray* a);
+
+// Fixed size list of floats. Used for per-channel quantization.
+typedef struct {
+  int size;
+// gcc 6.1+ have a bug where flexible members aren't properly handled
+// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
+    __GNUC_MINOR__ >= 1
+  float data[0];
+#else
+  float data[];
+#endif
+} TfLiteFloatArray;
+
+// Given the size (number of elements) in a TfLiteFloatArray, calculate its size
+// in bytes.
+int TfLiteFloatArrayGetSizeInBytes(int size);
+
+// Create a array of a given `size` (uninitialized entries).
+// This returns a pointer, that you must free using TfLiteFloatArrayFree().
+TfLiteFloatArray* TfLiteFloatArrayCreate(int size);
+
+// Free memory of array `a`.
+void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 
 // Since we must not depend on any libraries, define a minimal subset of
 // error macros while avoiding names that have pre-conceived meanings like
@@ -185,14 +209,48 @@ typedef enum {
 // Return the name of a given type, for error reporting purposes.
 const char* TfLiteTypeGetName(TfLiteType type);
 
+// SupportedQuantizationTypes.
+typedef enum {
+  // No quantization.
+  kTfLiteNoQuantization = 0,
+  // Affine quantization (with support for per-channel quantization).
+  // Corresponds to TfLiteAffineQuantization.
+  kTfLiteAffineQuantization = 1,
+} TfLiteQuantizationType;
+
+// Structure specifying the quantization used by the tensor, if-any.
+typedef struct {
+  // The type of quantization held by params.
+  TfLiteQuantizationType type;
+  // Holds a reference to one of the quantization param structures specified
+  // below.
+  void* params;
+} TfLiteQuantization;
+
+// Legacy. Will be deprecated in favor of TfLiteAffineQuantization.
+// If per-layer quantization is specified this field will still be populated in
+// addition to TfLiteAffineQuantization.
 // Parameters for asymmetric quantization. Quantized values can be converted
 // back to float using:
-//    real_value = scale * (quantized_value - zero_point);
+//     real_value = scale * (quantized_value - zero_point)
 typedef struct {
   float scale;
   int32_t zero_point;
 } TfLiteQuantizationParams;
 
+// Parameters for asymmetric quantization across a dimension (i.e per output
+// channel quantization).
+// quantized_dimension specifies which dimension the scales and zero_points
+// correspond to.
+// For a particular value in quantized_dimension, quantized values can be
+// converted back to float using:
+//     real_value = scale * (quantized_value - zero_point)
+typedef struct {
+  TfLiteFloatArray* scale;
+  TfLiteIntArray* zero_point;
+  int32_t quantized_dimension;
+} TfLiteAffineQuantization;
+
 // A union of pointers that points to memory for a given tensor.
 typedef union {
   int* i32;
@@ -274,12 +332,18 @@ typedef struct {
 
   // True if the tensor is a variable.
   bool is_variable;
+
+  // Quantization information. Replaces params field above.
+  TfLiteQuantization quantization;
 } TfLiteTensor;
 
-// Free data memory of tensor `t`;
+// Free data memory of tensor `t`.
 void TfLiteTensorDataFree(TfLiteTensor* t);
 
-// Free memory of tensor `t`;
+// Free quantization data.
+void TfLiteQuantizationFree(TfLiteQuantization* quantization);
+
+// Free memory of tensor `t`.
 void TfLiteTensorFree(TfLiteTensor* t);
 
 // Set all of a tensor's fields (and free any previously allocated data).
diff --git a/tensorflow/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc
index acf0dfc5be8e233b642ccea42f72cbf6af2d4c5d..d01cf63a3e059d05a300accc5a26dd4d411f326a 100644
--- a/tensorflow/lite/c/c_api_internal_test.cc
+++ b/tensorflow/lite/c/c_api_internal_test.cc
@@ -65,6 +65,13 @@ TEST(IntArray, TestIntArrayEqual) {
   TfLiteIntArrayFree(d);
 }
 
+TEST(FloatArray, TestFloatArrayCreate) {
+  TfLiteFloatArray* a = TfLiteFloatArrayCreate(0);
+  TfLiteFloatArray* b = TfLiteFloatArrayCreate(3);
+  TfLiteFloatArrayFree(a);
+  TfLiteFloatArrayFree(b);
+}
+
 TEST(Types, TestTypeNames) {
   auto type_name = [](TfLiteType t) {
     return std::string(TfLiteTypeGetName(t));
@@ -81,6 +88,20 @@ TEST(Types, TestTypeNames) {
   EXPECT_EQ(type_name(kTfLiteString), "STRING");
 }
 
+TEST(Quantization, TestQuantizationFree) {
+  TfLiteTensor t;
+  // Set these values, otherwise TfLiteTensorFree has uninitialized values.
+  t.allocation_type = kTfLiteArenaRw;
+  t.dims = nullptr;
+  t.quantization.type = kTfLiteAffineQuantization;
+  auto* params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  params->scale = TfLiteFloatArrayCreate(3);
+  params->zero_point = TfLiteIntArrayCreate(3);
+  t.quantization.params = reinterpret_cast<void*>(params);
+  TfLiteTensorFree(&t);
+}
+
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD
index 6a43b0322d17041a5ae4a0527376d1465a539b1d..db6b4a2d18ecd894fa3b8a0bf646ca9f8c6b6511 100644
--- a/tensorflow/lite/core/api/BUILD
+++ b/tensorflow/lite/core/api/BUILD
@@ -51,6 +51,7 @@ cc_test(
     srcs = ["flatbuffer_conversions_test.cc"],
     deps = [
         ":api",
+        "//tensorflow/lite:string",
         "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index e73c4ce023d7ecde7f8422cf3e2709f45b35b621..3a74b1e6c9db0457ac96d643607a8dcdd2709e0b 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <cstdlib>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -26,22 +28,27 @@ namespace {
 // Copies the contents from the flatbuffer int vector `flatbuffer` into the
 // int array `buffer`. `flat_vector` and `buffer` represent the same
 // configuration operation for a given operation.
-void FlatBufferIntVectorToArray(int max_size_of_buffer,
-                                const flatbuffers::Vector<int32_t>* flat_vector,
-                                int* buffer, ErrorReporter* error_reporter) {
+TfLiteStatus FlatBufferIntVectorToArray(
+    int max_size_of_buffer, const flatbuffers::Vector<int32_t>* flat_vector,
+    int* buffer, ErrorReporter* error_reporter, const char* op_name) {
   if (!flat_vector) {
-    error_reporter->Report("Input array not provided for operation.\n");
+    error_reporter->Report("Input array not provided for operation '%s'.\n",
+                           op_name);
+    return kTfLiteError;
   } else {
     int num_dimensions = flat_vector->Length();
     if (num_dimensions > max_size_of_buffer / sizeof(int)) {
       error_reporter->Report(
-          "Found too many dimensions in the operation's input array.\n");
+          "Found too many dimensions in the input array of operation '%s'.\n",
+          op_name);
+      return kTfLiteError;
     } else {
       for (int i = 0; i < num_dimensions; ++i) {
         buffer[i] = flat_vector->Get(i);
       }
     }
   }
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -450,8 +457,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = allocator->AllocatePOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
         auto* new_shape = schema_params->new_shape();
-        FlatBufferIntVectorToArray(sizeof(params->shape), new_shape,
-                                   params->shape, error_reporter);
+        TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray(
+            sizeof(params->shape), new_shape, params->shape, error_reporter,
+            "reshape"));
         params->num_dimensions = new_shape->Length();
       }
       *builtin_data = reinterpret_cast<void*>(params);
@@ -519,8 +527,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = allocator->AllocatePOD<TfLiteSqueezeParams>();
       if (auto* schema_params = op->builtin_options_as_SqueezeOptions()) {
         const auto& squeeze_dims = schema_params->squeeze_dims();
-        FlatBufferIntVectorToArray(sizeof(params->squeeze_dims), squeeze_dims,
-                                   params->squeeze_dims, error_reporter);
+        TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray(
+            sizeof(params->squeeze_dims), squeeze_dims, params->squeeze_dims,
+            error_reporter, "squeeze"));
         params->num_squeeze_dims = squeeze_dims->Length();
       }
       *builtin_data = reinterpret_cast<void*>(params);
@@ -651,6 +660,18 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIQUE: {
+      TfLiteUniqueParams* params = allocator->AllocatePOD<TfLiteUniqueParams>();
+      auto* unique_params = op->builtin_options_as_UniqueOptions();
+      if (unique_params != nullptr) {
+        params->index_out_type =
+            unique_params->idx_out_type() == tflite::TensorType_INT64
+                ? TfLiteType::kTfLiteInt64
+                : TfLiteType::kTfLiteInt32;
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_ABS:
@@ -665,6 +686,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_EQUAL:
     case BuiltinOperator_EXP:
     case BuiltinOperator_EXPAND_DIMS:
+    case BuiltinOperator_CEIL:
     case BuiltinOperator_FLOOR:
     case BuiltinOperator_GREATER:
     case BuiltinOperator_GREATER_EQUAL:
@@ -704,6 +726,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_FLOOR_MOD:
     case BuiltinOperator_RANGE:
     case BuiltinOperator_SQUARED_DIFFERENCE:
+    case BuiltinOperator_REVERSE_V2:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
index 4d1d1b21fda106b3196ff43421996f45ab83af4f..4a5de48302c1e840c524335ee549c74a162e107e 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include <cstring>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace {
@@ -33,6 +35,8 @@ class MockErrorReporter : public ErrorReporter {
   char* GetBuffer() { return buffer_; }
   int GetBufferSize() { return buffer_size_; }
 
+  string GetAsString() const { return string(buffer_, buffer_size_); }
+
  private:
   static constexpr int kBufferSize = 256;
   char buffer_[kBufferSize];
@@ -60,25 +64,56 @@ class MockDataAllocator : public BuiltinDataAllocator {
 
 }  // namespace
 
-TEST(FlatbufferConversions, TestParseOpDataConv) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
-  MockDataAllocator mock_allocator;
-
-  flatbuffers::FlatBufferBuilder builder;
-  flatbuffers::Offset<void> conv_options =
-      CreateConv2DOptions(builder, Padding_SAME, 1, 2,
-                          ActivationFunctionType_RELU, 3, 4)
-          .Union();
-  flatbuffers::Offset<Operator> conv_offset = CreateOperatorDirect(
-      builder, 0, nullptr, nullptr, BuiltinOptions_Conv2DOptions, conv_options,
-      nullptr, CustomOptionsFormat_FLEXBUFFERS, nullptr);
-  builder.Finish(conv_offset);
-  void* conv_pointer = builder.GetBufferPointer();
-  const Operator* conv_op = flatbuffers::GetRoot<Operator>(conv_pointer);
+class FlatbufferConversionsTest : public ::testing::Test {
+ public:
+  const Operator* BuildTestOperator(BuiltinOptions op_type,
+                                    flatbuffers::Offset<void> options) {
+    flatbuffers::Offset<Operator> offset =
+        CreateOperatorDirect(builder_, 0, nullptr, nullptr, op_type, options,
+                             nullptr, CustomOptionsFormat_FLEXBUFFERS, nullptr);
+    builder_.Finish(offset);
+    void* pointer = builder_.GetBufferPointer();
+    return flatbuffers::GetRoot<Operator>(pointer);
+  }
+
+ protected:
+  MockErrorReporter mock_reporter_;
+  MockDataAllocator mock_allocator_;
+  flatbuffers::FlatBufferBuilder builder_;
+};
+
+TEST_F(FlatbufferConversionsTest, ParseBadSqueeze) {
+  const Operator* op = BuildTestOperator(
+      BuiltinOptions_SqueezeOptions, CreateSqueezeOptions(builder_).Union());
+  void* output_data = nullptr;
+  EXPECT_NE(kTfLiteOk, ParseOpData(op, BuiltinOperator_SQUEEZE, &mock_reporter_,
+                                   &mock_allocator_, &output_data));
+  EXPECT_THAT(mock_reporter_.GetAsString(),
+              ::testing::ContainsRegex(
+                  "Input array not provided for operation 'squeeze'"));
+}
+
+TEST_F(FlatbufferConversionsTest, ParseBadReshape) {
+  const Operator* op = BuildTestOperator(
+      BuiltinOptions_ReshapeOptions, CreateSqueezeOptions(builder_).Union());
+  void* output_data = nullptr;
+  EXPECT_NE(kTfLiteOk, ParseOpData(op, BuiltinOperator_RESHAPE, &mock_reporter_,
+                                   &mock_allocator_, &output_data));
+  EXPECT_THAT(mock_reporter_.GetAsString(),
+              ::testing::ContainsRegex(
+                  "Input array not provided for operation 'reshape'"));
+}
+
+TEST_F(FlatbufferConversionsTest, TestParseOpDataConv) {
+  const Operator* conv_op =
+      BuildTestOperator(BuiltinOptions_Conv2DOptions,
+                        CreateConv2DOptions(builder_, Padding_SAME, 1, 2,
+                                            ActivationFunctionType_RELU, 3, 4)
+                            .Union());
   void* output_data = nullptr;
-  EXPECT_EQ(kTfLiteOk, ParseOpData(conv_op, BuiltinOperator_CONV_2D, reporter,
-                                   &mock_allocator, &output_data));
+  EXPECT_EQ(kTfLiteOk,
+            ParseOpData(conv_op, BuiltinOperator_CONV_2D, &mock_reporter_,
+                        &mock_allocator_, &output_data));
   EXPECT_NE(nullptr, output_data);
   TfLiteConvParams* params = reinterpret_cast<TfLiteConvParams*>(output_data);
   EXPECT_EQ(kTfLitePaddingSame, params->padding);
@@ -89,30 +124,20 @@ TEST(FlatbufferConversions, TestParseOpDataConv) {
   EXPECT_EQ(4, params->dilation_height_factor);
 }
 
-TEST(FlatbufferConversions, TestParseOpDataCustom) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
-  MockDataAllocator mock_allocator;
-
-  flatbuffers::FlatBufferBuilder builder;
-  flatbuffers::Offset<void> null_options;
-  flatbuffers::Offset<Operator> custom_offset = CreateOperatorDirect(
-      builder, 0, nullptr, nullptr, BuiltinOptions_NONE, null_options, nullptr,
-      CustomOptionsFormat_FLEXBUFFERS, nullptr);
-  builder.Finish(custom_offset);
-  void* custom_pointer = builder.GetBufferPointer();
-  const Operator* custom_op = flatbuffers::GetRoot<Operator>(custom_pointer);
+TEST_F(FlatbufferConversionsTest, TestParseOpDataCustom) {
+  const Operator* custom_op =
+      BuildTestOperator(BuiltinOptions_NONE, flatbuffers::Offset<void>());
   void* output_data = nullptr;
-  EXPECT_EQ(kTfLiteOk, ParseOpData(custom_op, BuiltinOperator_CUSTOM, reporter,
-                                   &mock_allocator, &output_data));
+  EXPECT_EQ(kTfLiteOk,
+            ParseOpData(custom_op, BuiltinOperator_CUSTOM, &mock_reporter_,
+                        &mock_allocator_, &output_data));
   EXPECT_EQ(nullptr, output_data);
 }
 
-TEST(FlatbufferConversions, TestConvertTensorType) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
+TEST_F(FlatbufferConversionsTest, TestConvertTensorType) {
   TfLiteType type;
-  EXPECT_EQ(kTfLiteOk, ConvertTensorType(TensorType_FLOAT32, &type, reporter));
+  EXPECT_EQ(kTfLiteOk,
+            ConvertTensorType(TensorType_FLOAT32, &type, &mock_reporter_));
   EXPECT_EQ(kTfLiteFloat32, type);
 }
 
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 763ab5657524b60ee245442dc2dc00ddffa03eb9..ab83456229ba1853b4a7d2d7757943794f29530e 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -72,6 +72,34 @@ bool HasDynamicTensor(const TfLiteContext& context,
   return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
 }
 
+// Gets the legacy TfLiteQuantizationParams from the current TfLiteQuantization.
+TfLiteQuantizationParams GetLegacyQuantization(
+    const TfLiteQuantization& quantization) {
+  TfLiteQuantizationParams legacy_quantization;
+  legacy_quantization.scale = 0;
+  legacy_quantization.zero_point = 0;
+
+  // If the quantization type isn't affine, return the empty
+  // legacy_quantization.
+  if (quantization.type != kTfLiteAffineQuantization) {
+    return legacy_quantization;
+  }
+
+  auto* affine_quantization =
+      reinterpret_cast<TfLiteAffineQuantization*>(quantization.params);
+  if (!affine_quantization || !affine_quantization->scale ||
+      !affine_quantization->zero_point ||
+      affine_quantization->scale->size != 1 ||
+      affine_quantization->zero_point->size != 1) {
+    return legacy_quantization;
+  }
+
+  // We know its per-layer quantization now.
+  legacy_quantization.scale = affine_quantization->scale->data[0];
+  legacy_quantization.zero_point = affine_quantization->zero_point->data[0];
+  return legacy_quantization;
+}
+
 }  // namespace
 
 // A trivial implementation of GraphInfo around the Interpreter.
@@ -363,6 +391,12 @@ TfLiteStatus Subgraph::SetVariables(std::vector<int> variables) {
   return kTfLiteOk;
 }
 
+void Subgraph::SetCancellationFunction(void* data,
+                                       bool (*check_cancelled_func)(void*)) {
+  cancellation_data_ = data;
+  check_cancelled_func_ = check_cancelled_func;
+}
+
 TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
                                           int length) {
   // Making sure kOptionalTensor is not re-defined to something other than -1.
@@ -664,15 +698,21 @@ TfLiteStatus Subgraph::Invoke() {
       TfLiteTensor* tensor = &tensors_[tensor_index];
       if (tensor->delegate && tensor->delegate != node.delegate &&
           tensor->data_is_stale) {
-        EnsureTensorDataIsReadable(tensor_index);
+        TF_LITE_ENSURE_STATUS(EnsureTensorDataIsReadable(tensor_index));
       }
     }
 
+    if (check_cancelled_func_ != nullptr &&
+        check_cancelled_func_(cancellation_data_)) {
+      ReportError("Client requested cancel during Invoke()");
+      return kTfLiteError;
+    }
+
     EnsureTensorsVectorCapacity();
     tensor_resized_since_op_invoke_ = false;
     if (OpInvoke(registration, &node) == kTfLiteError) {
-      status = ReportOpError(context_, node, registration, node_index,
-                             "failed to invoke");
+      return ReportOpError(context_, node, registration, node_index,
+                           "failed to invoke");
     }
 
     // Force execution prep for downstream ops if the latest op triggered the
@@ -767,7 +807,7 @@ TfLiteStatus Subgraph::GetNodeAndRegistration(
 
 TfLiteStatus Subgraph::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    const int* dims, TfLiteQuantization quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
@@ -792,16 +832,22 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly(
       EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
     // Fast path which does not invalidate the invokable property.
     TfLiteTensorDataFree(&tensor);
+    TfLiteQuantizationFree(&tensor.quantization);
     tensor.data.raw = const_cast<char*>(buffer);
     if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
-    tensor.params = quantization;
+    tensor.params = GetLegacyQuantization(quantization);
+    tensor.quantization = quantization;
     tensor.allocation_type = kTfLiteMmapRo;
     tensor.allocation = allocation;
   } else {
     state_ = kStateUninvokable;
     TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                      quantization, const_cast<char*>(buffer), bytes,
-                      kTfLiteMmapRo, allocation, false, &tensor);
+                      GetLegacyQuantization(quantization),
+                      const_cast<char*>(buffer), bytes, kTfLiteMmapRo,
+                      allocation, false, &tensor);
+    // TODO(suharshs): Update TfLiteTensorReset to include the new quantization
+    // if there are other required callers.
+    tensor.quantization = quantization;
   }
   return kTfLiteOk;
 }
@@ -812,7 +858,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly(
 // to Interpreter.
 TfLiteStatus Subgraph::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+    const int* dims, TfLiteQuantization quantization, bool is_variable) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
         "SetTensorParametersReadWrite is disallowed when graph is immutable.");
@@ -842,10 +888,14 @@ TfLiteStatus Subgraph::SetTensorParametersReadWrite(
     allocation_type = kTfLiteArenaRwPersistent;
   }
 
+  TfLiteTensor& tensor = context_->tensors[tensor_index];
   TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                    quantization,
+                    GetLegacyQuantization(quantization),
                     /*buffer=*/nullptr, required_bytes, allocation_type,
-                    nullptr, is_variable, &context_->tensors[tensor_index]);
+                    nullptr, is_variable, &tensor);
+  // TODO(suharshs): Update TfLiteTensorReset to include the new quantization
+  // if there are other required callers.
+  tensor.quantization = quantization;
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
index 979226b86477cabc0d2a2640f5c89f29ebf1e4ee..5ca2977e0c3e8d60ecb0ea352ceb6649d865214d 100644
--- a/tensorflow/lite/core/subgraph.h
+++ b/tensorflow/lite/core/subgraph.h
@@ -79,18 +79,22 @@ class Subgraph {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
-  TfLiteStatus SetTensorParametersReadOnly(
-      int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization,
-      const char* buffer, size_t bytes, const Allocation* allocation);
+  TfLiteStatus SetTensorParametersReadOnly(int tensor_index, TfLiteType type,
+                                           const char* name, const size_t rank,
+                                           const int* dims,
+                                           TfLiteQuantization quantization,
+                                           const char* buffer, size_t bytes,
+                                           const Allocation* allocation);
 
   // Set description of inputs/outputs/data/fptrs for node `node_index`.
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
-  TfLiteStatus SetTensorParametersReadWrite(
-      int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization, bool is_variable);
+  TfLiteStatus SetTensorParametersReadWrite(int tensor_index, TfLiteType type,
+                                            const char* name, const size_t rank,
+                                            const int* dims,
+                                            TfLiteQuantization quantization,
+                                            bool is_variable);
 
   // WARNING: Experimental interface, subject to change
   // Overrides execution plan. This bounds checks indices sent in.
@@ -208,6 +212,15 @@ class Subgraph {
     return context_->allow_fp32_relax_to_fp16;
   }
 
+  // Sets the cancellation function pointer in order to cancel a request in the
+  // middle of a call to Invoke(). The interpreter queries this function during
+  // inference, between op invocations; when it returns true, the interpreter
+  // will abort execution and return `kTfLiteError`. The `data` parameter
+  // contains any data used by the cancellation function, and if non-null,
+  // remains owned by the caller.
+  // WARNING: This is an experimental API and subject to change.
+  void SetCancellationFunction(void* data, bool (*check_cancelled_func)(void*));
+
   // Ensure the data in `tensor.data` is readable. In case delegate is used,
   // it might require to copy the data from delegate buffer to raw memory.
   // WARNING: This is an experimental API and subject to change.
@@ -502,6 +515,15 @@ class Subgraph {
   // public function).
   // The value is invalid before `PrepareOpStartingAt` is called.
   bool has_dynamic_tensors_ = true;
+
+  // Reference to cancellation function that can cancel a request in the middle
+  // of a call to Invoke(). When this function returns True, a kTfLiteError is
+  // thrown by Invoke().
+  bool (*check_cancelled_func_)(void*) = nullptr;
+
+  // Reference to data used by the cancellation function in
+  // `check_cancelled_func_`.
+  void* cancellation_data_ = nullptr;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc
index 9e8472f1e7d2c3e0f5e73f3e5ce98bae7f15063f..accaf3045246b35705085bd5324e5b33ec8ea12a 100644
--- a/tensorflow/lite/delegates/flex/buffer_map_test.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc
@@ -44,6 +44,7 @@ UniqueTfLiteTensor MakeLiteTensor(const std::vector<int>& shape,
   tensor->dims = ConvertVectorToTfLiteIntArray(shape);
   tensor->data.raw = nullptr;
   tensor->is_variable = false;
+  memset(&tensor->quantization, 0, sizeof(TfLiteQuantization));
   TfLiteTensorRealloc(data.size() * sizeof(T), tensor.get());
   memcpy(tensor->data.raw, data.data(), data.size() * sizeof(T));
   return tensor;
@@ -62,6 +63,7 @@ UniqueTfLiteTensor MakeLiteTensor<string>(const std::vector<int>& shape,
   tensor->dims = ConvertVectorToTfLiteIntArray(shape);
   tensor->data.raw = nullptr;
   tensor->is_variable = false;
+  memset(&tensor->quantization, 0, sizeof(TfLiteQuantization));
   TfLiteTensorRealloc(data.size() * sizeof(string), tensor.get());
 
   DynamicBuffer b;
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index 4e66921146159e48f48030a2ef11aa8933349bcf..ceb9918f6fa7ccfbb4d27a0bf921987faecc1c12 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -51,102 +51,317 @@ namespace tflite {
 namespace flex {
 namespace kernel {
 
-// Controls the lifetime of tensor handles in a vector.
-class VectorOfHandles {
+struct OpNode;
+
+// Represents the origin of a given tensor as a reference to the output
+// of an upstream node.
+struct TensorSource {
+  OpNode* node;
+  int node_output_index;
+};
+
+// A list of inputs of a given node of the TensorFlow/Eager graph.
+class OpInputs {
  public:
-  explicit VectorOfHandles(int num_elements) : vector_(num_elements, nullptr) {}
+  explicit OpInputs(const TfLiteIntArray* indexes) {
+    for (int index : TfLiteIntArrayView(indexes)) {
+      inputs_.push_back(index);
+    }
+    forwardable_.resize(inputs_.size());
+  }
+  ~OpInputs() {}
+
+  int Size() const { return inputs_.size(); }
+
+  int TfLiteIndex(int i) const { return inputs_[i]; }
+
+  // Given a map relating tensors to the node that originates them, populate a
+  // list of sources for the tensors in this class.
+  void InitializeTensorSources(
+      const std::map<int, TensorSource>& tflite_tensor_sources) {
+    sources_.clear();
+    for (int i : inputs_) {
+      auto it = tflite_tensor_sources.find(i);
+      if (it == tflite_tensor_sources.end()) {
+        sources_.push_back({nullptr, 0});
+      } else {
+        sources_.push_back(it->second);
+      }
+    }
+  }
 
-  ~VectorOfHandles() {
-    for (auto* handle : vector_) {
-      if (handle) handle->Unref();
+  void SetForwardable(int i, bool v) { forwardable_[i] = v; }
+
+  bool IsForwardable(int i) const { return forwardable_[i]; }
+
+  TensorSource GetTensorSource(int i) const { return sources_[i]; }
+
+ private:
+  std::vector<int> inputs_;
+  std::vector<TensorSource> sources_;
+
+  // List of tensors that can be used by TF in its forwarding optimization.
+  // Doing so allows an input tensor to be modified and used as the output
+  // tensor. The delegate takes care of not holding any references to tensors
+  // in this list while Eager is executing the corresponding op.
+  std::vector<int> forwardable_;
+};
+
+// A list of outputs of a given node of the TensorFlow/Eager graph, along with
+// the actual outputs of the EagerOperation.
+class OpOutputs {
+ public:
+  explicit OpOutputs(const TfLiteIntArray* indexes) {
+    for (int index : TfLiteIntArrayView(indexes)) {
+      outputs_.push_back(index);
+    }
+    vector_.resize(outputs_.size());
+  }
+  ~OpOutputs() { ResetTensorHandles(); }
+
+  // Stores information about which of the tensors in this class are also
+  // outputs of the sugbraph.
+  void InitializeGraphOutputs(const std::set<int>& subgraph_outputs) {
+    subgraph_outputs_.clear();
+    for (int i : outputs_) {
+      subgraph_outputs_.push_back(subgraph_outputs.count(i) > 0);
     }
   }
 
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* GetVector() {
-    return &vector_;
+  // Returns true if the tensor given by index 'i' is an output of the entire
+  // subgraph.
+  bool IsSubgraphOutput(int i) const { return subgraph_outputs_[i]; }
+
+  // Returns a handle to a given tensor and, optionally, remove it from the
+  // internal vector.
+  tensorflow::TensorHandle* GetHandle(int i, bool remove) {
+    auto* handle = vector_[i];
+    if (!remove) {
+      handle->Ref();
+    } else {
+      // Don't increase the ref-count. Instead, simply take it out of the
+      // vector.
+      vector_[i] = nullptr;
+    }
+    return handle;
+  }
+
+  int Size() const { return outputs_.size(); }
+
+  int TfLiteIndex(int i) const { return outputs_[i]; }
+
+  // Carefully unreference all the handles in the eager output vector.
+  void ResetTensorHandles() {
+    for (int i = 0; i < vector_.size(); ++i) {
+      if (vector_[i]) {
+        vector_[i]->Unref();
+        vector_[i] = nullptr;
+      }
+    }
   }
 
-  tensorflow::TensorHandle* GetHandle(int index) { return vector_[index]; }
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>*
+  GetTensorHandles() {
+    return &vector_;
+  }
 
  private:
+  std::vector<int> outputs_;
+  std::vector<bool> subgraph_outputs_;
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> vector_;
 };
 
-// Executes the TensorFlow op given by 'op_name', with the attributes specified
-// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
-tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
-                                 BufferMap* buffer_map, const string& op_name,
-                                 const tensorflow::NodeDef& nodedef,
-                                 const std::vector<int>& inputs,
-                                 const std::vector<int>& outputs) {
-  const tensorflow::AttrTypeMap* attr_types;
-  bool is_function = false;
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types, &is_function),
-      " (while processing attributes of '", op_name, "')");
-  if (is_function) {
-    return tensorflow::errors::NotFound(
-        "Operation '", op_name,
-        "' is not registered.  (while processing attributes of '", op_name,
-        "')");
-  }
-  tensorflow::EagerOperation op(eager_context, op_name.c_str(),
-                                /*is_function=*/false, attr_types);
-  for (const auto& attr : nodedef.attr()) {
-    op.MutableAttrs()->Set(attr.first, attr.second);
+// A single node within the larger 'op'. Note that this kernel executes many
+// TensorFlow ops within a single TF Lite op.
+class OpNode {
+ public:
+  OpNode(const TfLiteIntArray* inputs, const TfLiteIntArray* outputs)
+      : inputs_(inputs), outputs_(outputs) {}
+  ~OpNode() {
+    if (op_) ClearEagerInputs();
   }
 
-  for (int input_index : inputs) {
-    if (!buffer_map->HasTensor(input_index)) {
+  const string& name() const { return name_; }
+  void set_name(const string& name) { name_ = name; }
+
+  int index() const { return index_; }
+  void set_index(int index) { index_ = index; }
+
+  const tensorflow::NodeDef& nodedef() const { return nodedef_; }
+
+  const OpInputs& inputs() const { return inputs_; }
+  OpInputs* mutable_inputs() { return &inputs_; }
+
+  const OpOutputs& outputs() const { return outputs_; }
+  OpOutputs* mutable_outputs() { return &outputs_; }
+
+  int NumInputs() const { return inputs_.Size(); }
+  int NumOutputs() const { return outputs_.Size(); }
+
+  tensorflow::EagerOperation* op() { return op_.get(); }
+
+  tensorflow::Status InitializeNodeDef(const void* custom_initial_data,
+                                       int custom_initial_data_size) {
+    if (!custom_initial_data) {
       return tensorflow::errors::Internal(
-          "Cannot read from invalid tensor index ", input_index);
+          "Cannot convert empty data into a valid NodeDef");
     }
-    auto* handle = new tensorflow::TensorHandle(
-        buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
-    op.AddInput(handle);
-    handle->Unref();
+    // The flexbuffer contains a vector where the first elements is the
+    // op name and the second is a serialized NodeDef.
+    const flexbuffers::Vector& v =
+        flexbuffers::GetRoot(
+            reinterpret_cast<const uint8_t*>(custom_initial_data),
+            custom_initial_data_size)
+            .AsVector();
+
+    name_ = v[0].AsString().str();
+    if (!nodedef_.ParseFromString(v[1].AsString().str())) {
+      nodedef_.Clear();
+      return tensorflow::errors::Internal(
+          "Failed to parse data into a valid NodeDef");
+    }
+
+    // Fill NodeDef with defaults if it's a valid op.
+    const tensorflow::OpRegistrationData* op_reg_data;
+    TF_RETURN_IF_ERROR(
+        tensorflow::OpRegistry::Global()->LookUp(nodedef_.op(), &op_reg_data));
+    AddDefaultsToNodeDef(op_reg_data->op_def, &nodedef_);
+
+    return tensorflow::Status::OK();
   }
 
-  int num_retvals = outputs.size();
-  VectorOfHandles retvals(num_retvals);
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      EagerExecute(&op, retvals.GetVector(), &num_retvals),
-      " (while executing '", op_name, "' via Eager)");
+  // Build thew new EagerOperation. In case of error, the returned 'op' is
+  // guaranteed to be 'nullptr'.
+  tensorflow::Status BuildEagerOp(tensorflow::EagerContext* eager_context) {
+    op_.reset();
+
+    const tensorflow::AttrTypeMap* attr_types;
+    bool is_function = false;
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(
+        tensorflow::AttrTypeMapForOp(name_.c_str(), &attr_types, &is_function),
+        " (while processing attributes of '", name_, "')");
+    if (is_function) {
+      return tensorflow::errors::NotFound(
+          "Operation '", name_,
+          "' is not registered.  (while processing attributes of '", name_,
+          "')");
+    }
 
-  if (num_retvals != outputs.size()) {
-    return tensorflow::errors::Internal(
-        "Unexpected number of outputs from EagerExecute");
+    op_.reset(new tensorflow::EagerOperation(eager_context, name_.c_str(),
+                                             /*is_function=*/false,
+                                             attr_types));
+
+    op_->MutableAttrs()->NumInputs(inputs_.Size());
+    for (const auto& attr : nodedef_.attr()) {
+      op_->MutableAttrs()->Set(attr.first, attr.second);
+    }
+
+    // Precalculating a cache key saves about 10% of inference time for very
+    // small models.
+    tensorflow::Device* device = op_->Device();
+    op_->MutableAttrs()->CacheKey(device == nullptr ? "unspecified"
+                                                    : device->name());
+
+    return tensorflow::Status::OK();
   }
 
-  for (int i = 0; i < num_retvals; ++i) {
-    const tensorflow::Tensor* tensor = nullptr;
-    TF_RETURN_IF_ERROR(retvals.GetHandle(i)->Tensor(&tensor));
-    buffer_map->SetFromTensorFlow(outputs[i], *tensor);
+  void ClearEagerInputs() {
+    for (tensorflow::TensorHandle* h : *op_->MutableInputs()) {
+      if (h) h->Unref();
+    }
+    op_->MutableInputs()->clear();
   }
 
-  return tensorflow::Status::OK();
-}
+  tensorflow::Status BuildEagerInputs(const BufferMap* buffer_map) {
+    for (int i = 0; i < inputs_.Size(); ++i) {
+      int input_index = inputs_.TfLiteIndex(i);
+      TensorSource s = inputs_.GetTensorSource(i);
+      if (!s.node) {
+        // This input is not produced by this Eager subgraph (it could be a TF
+        // Lite native buffer, or could be produced by a separater subgraph). We
+        // need to fetch it from the delegate's buffer_map.
+        if (!buffer_map->HasTensor(input_index)) {
+          return tensorflow::errors::Internal(
+              "Cannot read from invalid tensor index ", input_index);
+        }
+        auto* handle = new tensorflow::TensorHandle(
+            buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
+        op_->MutableInputs()->push_back(handle);
+      } else {
+        // If this is a forwardable tensor, we will remove it from the previous
+        // op's list, giving TF the opportunity to reuse its buffer.
+        bool unref_handle = inputs_.IsForwardable(i);
+        auto* handle =
+            s.node->outputs_.GetHandle(s.node_output_index, unref_handle);
+        op_->MutableInputs()->push_back(handle);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
+
+  tensorflow::Status PersistEagerOutputs(BufferMap* buffer_map) {
+    auto* handles = outputs_.GetTensorHandles();
+    for (int i = 0; i < outputs_.Size(); ++i) {
+      if (outputs_.IsSubgraphOutput(i)) {
+        const tensorflow::Tensor* tensor = nullptr;
+        TF_RETURN_IF_ERROR(handles->at(i)->Tensor(&tensor));
+        buffer_map->SetFromTensorFlow(outputs_.TfLiteIndex(i), *tensor);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
+
+ private:
+  OpNode(const OpNode&) = delete;
+  OpNode& operator=(const OpNode&) = delete;
 
-// A single node within the larger 'op'. Note that this kernel executes many
-// TensorFlow ops within a single TF Lite op.
-struct OpNode {
   // The name of the TensorFlow op to execute.
-  string name;
+  string name_;
   // Index of this node into TF Lite's operator list.
-  int index;
+  int index_;
   // The corresponding NodeDef, containing the attributes for the op.
-  tensorflow::NodeDef nodedef;
+  tensorflow::NodeDef nodedef_;
   // List of inputs, as TF Lite tensor indices.
-  std::vector<int> inputs;
+  OpInputs inputs_;
   // List of outputs, as TF Lite tensor indices.
-  std::vector<int> outputs;
+  OpOutputs outputs_;
+
+  std::unique_ptr<tensorflow::EagerOperation> op_;
 };
 
-// The Larger 'op', which contains all the nodes in a supported subgraph.
+// Executes the TensorFlow op given by 'op_name', with the attributes specified
+// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
+tensorflow::Status ExecuteFlexOp(TfLiteContext* context, BufferMap* buffer_map,
+                                 OpNode* node_data) {
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(node_data->BuildEagerInputs(buffer_map),
+                                  " (while executing '", node_data->name(),
+                                  "' via Eager)");
+
+  node_data->mutable_outputs()->ResetTensorHandles();
+  int num_retvals = node_data->NumOutputs();
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      EagerExecute(node_data->op(),
+                   node_data->mutable_outputs()->GetTensorHandles(),
+                   &num_retvals),
+      " (while executing '", node_data->name(), "' via Eager)");
+
+  if (num_retvals != node_data->NumOutputs()) {
+    return tensorflow::errors::Internal(
+        "Unexpected number of outputs from EagerExecute");
+  }
+
+  TF_RETURN_IF_ERROR(node_data->PersistEagerOutputs(buffer_map));
+
+  node_data->ClearEagerInputs();
+
+  return tensorflow::Status::OK();
+}
+
+// The larger 'op', which contains all the nodes in a supported subgraph.
 struct OpData {
   tensorflow::EagerContext* eager_context;
   BufferMap* buffer_map;
-  std::vector<OpNode> nodes;
+  std::vector<std::unique_ptr<OpNode>> nodes;
   std::vector<int> subgraph_inputs;
   std::vector<int> subgraph_outputs;
 };
@@ -166,8 +381,10 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
                             ->GetBufferMap(context);
 
   CHECK(params->output_tensors);
+  std::set<int> output_set;
   for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
     op_data->subgraph_outputs.push_back(tensor_index);
+    output_set.insert(tensor_index);
   }
 
   CHECK(params->input_tensors);
@@ -175,49 +392,55 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
     op_data->subgraph_inputs.push_back(tensor_index);
   }
 
+  op_data->nodes.reserve(params->nodes_to_replace->size);
+
   CHECK(params->nodes_to_replace);
+  tensorflow::Status status;
   for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
     TfLiteNode* node;
     TfLiteRegistration* reg;
     context->GetNodeAndRegistration(context, node_index, &node, &reg);
 
-    op_data->nodes.push_back(OpNode());
-    OpNode& node_data = op_data->nodes.back();
-
-    node_data.index = node_index;
-    node_data.name = "";
-    if (node->custom_initial_data) {
-      // The flexbuffer contains a vector where the first elements is the
-      // op name and the second is a serialized NodeDef.
-      const flexbuffers::Vector& v =
-          flexbuffers::GetRoot(
-              reinterpret_cast<const uint8_t*>(node->custom_initial_data),
-              node->custom_initial_data_size)
-              .AsVector();
-
-      node_data.name = v[0].AsString().str();
-      if (!node_data.nodedef.ParseFromString(v[1].AsString().str())) {
-        // We will just leave the nodedef empty and error out in Eval().
-        node_data.nodedef.Clear();
-      }
-    }
+    op_data->nodes.emplace_back(new OpNode(node->inputs, node->outputs));
+    OpNode& node_data = *op_data->nodes.back();
 
-    // Fill NodeDef with defaults if it's a valid op.
-    const tensorflow::OpRegistrationData* op_reg_data;
-    auto tf_status = tensorflow::OpRegistry::Global()->LookUp(
-        node_data.nodedef.op(), &op_reg_data);
-    if (tf_status.ok()) {
-      AddDefaultsToNodeDef(op_reg_data->op_def, &node_data.nodedef);
-    }
+    node_data.set_index(node_index);
+    node_data.set_name("");
 
-    for (auto input_index : TfLiteIntArrayView(node->inputs)) {
-      node_data.inputs.push_back(input_index);
-    }
-    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
-      node_data.outputs.push_back(output_index);
+    status = node_data.InitializeNodeDef(node->custom_initial_data,
+                                         node->custom_initial_data_size);
+    if (!status.ok()) break;
+    status = node_data.BuildEagerOp(op_data->eager_context);
+    if (!status.ok()) break;
+  }
+
+  if (ConvertStatus(context, status) != kTfLiteOk) {
+    // We can't return an error from this function but ConvertStatus will
+    // report them and we will stop processing in Prepare() if anything went
+    // wrong.
+    return op_data;
+  }
+
+  // Given a TfLite tensor index, return the OpNode that produces it,
+  // along with it index into that OpNodes list of outputs.
+  std::map<int, TensorSource> tflite_tensor_sources;
+
+  // Find out how each tensor is produced. This does not account for
+  // tensors that are not produce by eager ops.
+  for (auto& node_data : op_data->nodes) {
+    node_data->mutable_outputs()->InitializeGraphOutputs(output_set);
+    for (int i = 0; i < node_data->outputs().Size(); ++i) {
+      int output_index = node_data->outputs().TfLiteIndex(i);
+      tflite_tensor_sources[output_index] = TensorSource{node_data.get(), i};
     }
   }
 
+  // For each node, resolve the inputs, so we can keep pointers to the nodes
+  // that produces them.
+  for (auto& node_data : op_data->nodes) {
+    node_data->mutable_inputs()->InitializeTensorSources(tflite_tensor_sources);
+  }
+
   return op_data;
 }
 
@@ -233,6 +456,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       "device has not been registered, presumably because some symbols from "
       "tensorflow/core:core_cpu_impl were not linked into the binary.");
 
+  // We will keep track of the number of references to each tensor in the
+  // graph, so we can make them "forwardable" if there is only one reference.
+  std::map<int, int> tensor_ref_count;
+
   // Whenever we find a constant tensor, insert it in the buffer map.
   BufferMap* buffer_map = op_data->buffer_map;
   for (auto tensor_index : op_data->subgraph_inputs) {
@@ -242,21 +469,49 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         buffer_map->SetFromTfLite(tensor_index, tensor);
       }
     }
+
+    // Input tensors should never be forwarded so we increment their ref counts
+    // twice: once for this graph and another for the possibility of them being
+    // used by another subgraph, or being an output of the full graph.
+    tensor_ref_count[tensor_index] += 2;
   }
 
   // All output tensors are allocated by TensorFlow/Eager, so we
   // mark them as kTfLiteDynamic.
   for (auto tensor_index : op_data->subgraph_outputs) {
     SetTensorToDynamic(&context->tensors[tensor_index]);
+    ++tensor_ref_count[tensor_index];
+  }
+
+  for (const auto& node_data : op_data->nodes) {
+    if (node_data->nodedef().op().empty()) {
+      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
+                           node_data->name().c_str());
+      return kTfLiteError;
+    }
+    TF_LITE_ENSURE(context, node_data->op());
+
+    for (int i = 0; i < node_data->inputs().Size(); ++i) {
+      ++tensor_ref_count[node_data->inputs().TfLiteIndex(i)];
+    }
+  }
+
+  // All tensors that are referenced exactly once are marked as "forwardable",
+  // meaning that we will allow TensorFlow to reuse its buffer as the output of
+  // an op.
+  for (auto& node_data : op_data->nodes) {
+    for (int i = 0; i < node_data->inputs().Size(); ++i) {
+      bool f = (tensor_ref_count[node_data->inputs().TfLiteIndex(i)] == 1);
+      node_data->mutable_inputs()->SetForwardable(i, f);
+    }
   }
 
   return kTfLiteOk;
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+  auto* op_data = reinterpret_cast<OpData*>(node->user_data);
   BufferMap* buffer_map = op_data->buffer_map;
-  tensorflow::EagerContext* eager_context = op_data->eager_context;
 
   // Insert a tensor in the buffer map for all inputs that are not constant.
   // Constants were handled in Prepare() already.
@@ -273,18 +528,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 
   // Execute the TensorFlow Ops sequentially.
-  for (const auto& node_data : op_data->nodes) {
+  for (auto& node_data : op_data->nodes) {
     SCOPED_TAGGED_OPERATOR_PROFILE(
         reinterpret_cast<profiling::Profiler*>(context->profiler),
-        node_data.name.c_str(), node_data.index);
-    if (node_data.nodedef.op().empty()) {
-      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
-                           node_data.name.c_str());
-      return kTfLiteError;
-    }
-    auto status =
-        ExecuteFlexOp(eager_context, buffer_map, node_data.name,
-                      node_data.nodedef, node_data.inputs, node_data.outputs);
+        node_data->name().c_str(), node_data->index());
+
+    auto status = ExecuteFlexOp(context, buffer_map, node_data.get());
     TF_LITE_ENSURE_OK(context, ConvertStatus(context, status));
   }
 
diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc
index cc5c8b32a015b710ac55b1466d0ff27c128f64d5..5b3a6d164707a805f05765764b13d2d01eac967f 100644
--- a/tensorflow/lite/delegates/flex/kernel_test.cc
+++ b/tensorflow/lite/delegates/flex/kernel_test.cc
@@ -25,6 +25,7 @@ namespace {
 
 using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
                             const std::vector<int>& supported_nodes) {
@@ -36,13 +37,38 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
   return kTfLiteOk;
 }
 
+// There is no easy way to pass a parameter into the TfLiteDelegate's
+// 'prepare' function, so we keep a global map for testing purpused.
+// To avoid collisions use: GetPrepareFunction<__LINE__>().
+std::map<int, std::vector<int>>* GetGlobalOpLists() {
+  static auto* op_list = new std::map<int, std::vector<int>>;
+  return op_list;
+}
+
 class KernelTest : public testing::FlexModelTest {
  public:
+  static constexpr int kOnes = 1;  // This is the index of a tensor of 1's.
+  static constexpr int kTwos = 2;  // This is the index of a tensor of 2's.
+  static constexpr int kMaxTensors = 30;
+
+  static void SetUpTestSuite() { GetGlobalOpLists()->clear(); }
+
   KernelTest() {
     CHECK(delegate_data_.Prepare(tensorflow::SessionOptions{}).ok());
     interpreter_.reset(new Interpreter(&error_reporter_));
   }
 
+  typedef TfLiteStatus (*PrepareFunction)(TfLiteContext* context,
+                                          TfLiteDelegate* delegate);
+
+  template <int KEY>
+  PrepareFunction GetPrepareFunction() {
+    GetGlobalOpLists()->insert({KEY, tf_ops_});
+    return [](TfLiteContext* context, TfLiteDelegate* delegate) {
+      return GenericPrepare(context, delegate, GetGlobalOpLists()->at(KEY));
+    };
+  }
+
   template <typename T>
   void ConfigureDelegate(T prepare_function) {
     delegate_.data_ = &delegate_data_;
@@ -54,9 +80,13 @@ class KernelTest : public testing::FlexModelTest {
                                         TfLiteBufferHandle buffer_handle,
                                         TfLiteTensor* output) {
       auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
-      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
-                                           ->GetTensor(buffer_handle)
-                                           .tensor_data();
+      auto* buffer_map = delegate_data->GetBufferMap(context);
+      if (!buffer_map->HasTensor(buffer_handle)) {
+        context->ReportError(context, "Tensor '%d' not found", buffer_handle);
+        return kTfLiteError;
+      }
+      tensorflow::StringPiece values =
+          buffer_map->GetTensor(buffer_handle).tensor_data();
       memcpy(output->data.raw, values.data(), values.size());
       return kTfLiteOk;
     };
@@ -114,12 +144,9 @@ TEST_F(KernelTest, BadTensorFlowOp) {
     return GenericPrepare(context, delegate, {0});
   });
 
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
+  ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk);
   ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("while processing attributes of 'NonExistentOp'"));
+              ContainsRegex("Op type not registered 'NonExistentOp'"));
 }
 
 TEST_F(KernelTest, BadNumberOfOutputs) {
@@ -166,10 +193,7 @@ TEST_F(KernelTest, WrongSetOfNodes) {
     return GenericPrepare(context, delegate, {0, 1});
   });
 
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
+  ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk);
   ASSERT_THAT(error_reporter().error_messages(),
               ContainsRegex("Invalid NodeDef in Flex op"));
 }
@@ -228,7 +252,7 @@ TEST_F(KernelTest, SplitGraph) {
   AddTfOp(testing::kAdd, {9, 16}, {17});  // => 16
 
   ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    // All ops by #3 are TF ops, handled by the delegate. However, because #4
+    // All ops but #3 are TF ops, handled by the delegate. However, because #4
     // depends on the non-TF op, two subgraphs are necessary:
     //    TF subgraph 1: 0, 1, 2, 6, 7, 8, 9
     //    TF Lite Op: 3
@@ -263,6 +287,132 @@ TEST_F(KernelTest, SplitGraph) {
   ASSERT_THAT(GetValues(17), ElementsAre(18.0f));
 }
 
+class MultipleSubgraphsTest : public KernelTest {
+ public:
+  static constexpr int kInput = 0;
+
+  void PrepareInterpreter(PrepareFunction prepare,
+                          const std::vector<float>& input) {
+    ConfigureDelegate(prepare);
+
+    SetShape(kOnes, {3});
+    SetValues(kOnes, {1.0f, 1.0f, 1.0f});
+    SetShape(kTwos, {3});
+    SetValues(kTwos, {2.0f, 2.0f, 2.0f});
+
+    SetValues(kInput, input);
+  }
+
+  std::vector<float> Apply(const std::vector<float>& input,
+                           std::function<float(float)> function) {
+    std::vector<float> result;
+    for (float f : input) {
+      result.push_back(function(f));
+    }
+    return result;
+  }
+};
+
+TEST_F(MultipleSubgraphsTest, ForwardabilityIsLocal) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be forwarded, so we build a small first graph
+  // to produce tensor #10. Here #10 is forwardable, because it is only
+  // used once, as an output.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10, which is not forwardable here
+  // since it is used by more than one op. The existing code will forward the
+  // tensor anyway, because it was deemed to be forwardable by the previous
+  // subgraph.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {11, 10}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (4 * in + 4) * (in + 1);
+              })));
+}
+
+// Subgraphs should not remove input tensors from the buffer_map, since
+// they could be necessary for downstream graphs.
+TEST_F(MultipleSubgraphsTest, DoNotRemoveInputTensors) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be removed, so we build a small first graph
+  // to produce tensor #10. We make sure it is used by more than one
+  // op, so it is not forwardable here.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+  AddTfOp(testing::kAdd, {10, kOnes}, {15});
+  AddTfOp(testing::kAdd, {10, kOnes}, {16});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10. The existing code will remove
+  // from the buffer_map all tensors that are not outputs, so #10 will
+  // disappear. Note that we are using #10 in two ops, so it is not forwardable
+  // either.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {10, 11}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (4 * in + 4) * (in + 1);
+              })));
+}
+
+// A tensor is deemed forwardable but it happens to be the input to
+// more than one subgraph. It should not be forwarded, otherwise its
+// contents will be overwritten.
+TEST_F(MultipleSubgraphsTest, DoNotForwardInputTensors) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be forwarded, so we build a small first graph
+  // to produce tensor #10.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10 and will think it is forwardable
+  // because it is used by a single op. However, the subgraph doesn't have
+  // enough information to make that judgment, as the input tensor could be
+  // used by another graph further downstream. The existing code will forward
+  // the tensor and remove it from the buffer_map, causing a failure later.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {11, 4}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (5 * in + 5) * (in + 1);
+              })));
+}
+
 }  // namespace
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc
index aa24675a7b1beab8632435debc8dd1fc04f347e7..a67aeef231b497de2b4749b2ce2fdd5edd5c6129 100644
--- a/tensorflow/lite/delegates/flex/test_util.cc
+++ b/tensorflow/lite/delegates/flex/test_util.cc
@@ -90,6 +90,8 @@ void FlexModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
 
 void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
                                    const std::vector<int>& outputs) {
+  ++next_op_index_;
+
   static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
   reg.builtin_code = BuiltinOperator_MUL;
   reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
@@ -114,6 +116,9 @@ void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
 
 void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
                             const std::vector<int>& outputs) {
+  tf_ops_.push_back(next_op_index_);
+  ++next_op_index_;
+
   auto attr = [](const string& key, const string& value) {
     return " attr{ key: '" + key + "' value {" + value + "}}";
   };
diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h
index 2cc2dc30e92586535687187105057d41ab5c0350..1913a406e8388af30ff5ca88f18f03fb75d46c49 100644
--- a/tensorflow/lite/delegates/flex/test_util.h
+++ b/tensorflow/lite/delegates/flex/test_util.h
@@ -103,6 +103,7 @@ class FlexModelTest : public ::testing::Test {
  protected:
   std::unique_ptr<Interpreter> interpreter_;
   TestErrorReporter error_reporter_;
+  std::vector<int> tf_ops_;
 
  private:
   // Helper method to add a TensorFlow op. tflite_names needs to start with
@@ -112,6 +113,8 @@ class FlexModelTest : public ::testing::Test {
                const std::vector<int>& outputs);
 
   std::vector<std::vector<uint8_t>> flexbuffers_;
+
+  int next_op_index_ = 0;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index fd954ba222627ab0457711b87baf9c3f7573e129..99cd6d3f859e7645d57f455d5ee06689b4e6c094 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -3,6 +3,7 @@ package(default_visibility = [
 ])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -15,7 +16,7 @@ cc_library(
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:kernel_util",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
     ],
 )
 
@@ -23,7 +24,11 @@ tf_cc_test(
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
-    tags = ["no_oss"],
+    tags = [
+        # TODO(b/122987564): Enable on Android after resolving API 27 failures.
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":nnapi_delegate",
         "//tensorflow/lite:framework",
@@ -32,3 +37,5 @@ tf_cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 7908bbf1641fcf07408b9380fb1587768d9f233c..d5d319432f9af93f771e8129ae2ef9cbc9559749 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
+#include <cstring>
 #include <iostream>
 #include <memory>
 #include <vector>
@@ -24,11 +25,13 @@ limitations under the License.
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 #ifdef __ANDROID__
-#include <sys/mman.h>
 #include <sys/system_properties.h>
+#endif
+#if defined __ANDROID__ || defined __unix__
+#include <sys/mman.h>
 #include <unistd.h>
 #endif
 
@@ -37,7 +40,7 @@ namespace {
 
 // TODO(b/80621585): Consider printing error string, but don't for now to
 // minimize binary size.
-#define CHECK_NN(context, code)                                               \
+#define RETURN_TFLITE_ERROR_IF_NN_ERROR(context, code)                        \
   do {                                                                        \
     const auto _code = (code);                                                \
     if (_code != ANEURALNETWORKS_NO_ERROR) {                                  \
@@ -48,65 +51,47 @@ namespace {
   } while (0)
 
 namespace {
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return std::numeric_limits<int32_t>::max();
-      }
-    }
-    return atoi(sdkVersion);
-  }
-#endif  // __ANDROID__
-  return 0;
-}
-
 constexpr int32_t kMinSdkVersionForNNAPI = 27;
 constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
-static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
-
 }  // namespace
 
 // RAII NN API Model Destructor for use with std::unique_ptr
 struct NNFreeModel {
   void operator()(ANeuralNetworksModel* model) {
-    ANeuralNetworksModel_free(model);
+    NnApiImplementation()->ANeuralNetworksModel_free(model);
   }
 };
 // RAII NN API Compilation Destructor for use with std::unique_ptr
 struct NNFreeCompilation {
   void operator()(ANeuralNetworksCompilation* model) {
-    ANeuralNetworksCompilation_free(model);
+    NnApiImplementation()->ANeuralNetworksCompilation_free(model);
   }
 };
 
 // Manage NNAPI shared memory handle
 class NNMemory {
  public:
-  NNMemory(const char* name, size_t size) {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
+  NNMemory(const NnApi* nnapi, const char* name, size_t size) {
+    nnapi_ = nnapi;
     byte_size_ = size;
-    fd_ = ASharedMemory_create(name, size);
+    fd_ = nnapi_->ASharedMemory_create(name, size);
     data_ptr_ = reinterpret_cast<uint8_t*>(
         mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
-    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
-                                       &nn_memory_handle_);
-#endif
+    nnapi_->ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE,
+                                               fd_, 0, &nn_memory_handle_);
   }
+#else
+  NNMemory(const NnApi* /*nnapi*/, const char* /*name*/, size_t /*size*/) {}
+#endif
 
   ~NNMemory() {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
     if (data_ptr_) {
       munmap(data_ptr_, byte_size_);
     }
     if (nn_memory_handle_) {
-      ANeuralNetworksMemory_free(nn_memory_handle_);
+      nnapi_->ANeuralNetworksMemory_free(nn_memory_handle_);
     }
     if (fd_ > 0) close(fd_);
 #endif
@@ -116,7 +101,8 @@ class NNMemory {
   uint8_t* get_data_ptr() { return data_ptr_; }
 
  private:
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
+  const NnApi* nnapi_;
   int fd_ = 0;
   size_t byte_size_ = 0;
 #endif
@@ -166,9 +152,10 @@ class OperandMapping {
 // operands for both tensors and parameters, and TFLite separates the two.
 class NNAPIOpBuilder {
  public:
-  NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping,
-                 ANeuralNetworksModel* nn_model)
-      : context_(context),
+  NNAPIOpBuilder(const NnApi* nnapi, TfLiteContext* context,
+                 OperandMapping* tensor_mapping, ANeuralNetworksModel* nn_model)
+      : nnapi_(nnapi),
+        context_(context),
         operand_mapping_(tensor_mapping),
         nn_model_(nn_model) {}
 
@@ -223,8 +210,9 @@ class NNAPIOpBuilder {
         .type = ANEURALNETWORKS_TENSOR_FLOAT32,
         .dimensionCount = dimension_count,
         .dimensions = dims.data()};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     int ann_operand = operand_mapping_->add_new_non_tensor_operand();
     augmented_outputs_.push_back(ann_operand);
     return kTfLiteOk;
@@ -240,8 +228,9 @@ class NNAPIOpBuilder {
         static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
         tensor->params.zero_point};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     augmented_outputs_.push_back(ann_index);
 
     *ann_tensor_index_out = ann_index;
@@ -297,14 +286,16 @@ class NNAPIOpBuilder {
     ANeuralNetworksOperandType operand_type{
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
 
     if (tensor->allocation_type == kTfLiteMmapRo) {
       // TODO(b/80630405): Use NNAPIAllocation.
-      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                             nn_model_, ann_tensor_index, tensor->data.raw,
-                             tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_,
+          nnapi_->ANeuralNetworksModel_setOperandValue(
+              nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes));
     }
 
     *ann_tensor_index_out = ann_tensor_index;
@@ -314,12 +305,13 @@ class NNAPIOpBuilder {
   // Finish emitting the op (of type `type`) into the NN API.
   TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
     // Actually add a NN API operation
-    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
-                           nn_model_, type,
-                           static_cast<uint32_t>(augmented_inputs_.size()),
-                           augmented_inputs_.data(),
-                           static_cast<uint32_t>(augmented_outputs_.size()),
-                           augmented_outputs_.data()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperation(
+            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
+            augmented_inputs_.data(),
+            static_cast<uint32_t>(augmented_outputs_.size()),
+            augmented_outputs_.data()));
     augmented_inputs_.clear();
     augmented_outputs_.clear();
     return kTfLiteOk;
@@ -329,11 +321,13 @@ class NNAPIOpBuilder {
   template <typename T>
   TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
     ANeuralNetworksOperandType operand_type{.type = nn_type};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                           nn_model_, ann_operand, &value, sizeof(T)));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_operand, &value, sizeof(T)));
     augmented_inputs_.push_back(ann_operand);
     return kTfLiteOk;
   }
@@ -343,16 +337,20 @@ class NNAPIOpBuilder {
                                 int32_t nn_type) {
     ANeuralNetworksOperandType operand_type{
         .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
     int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_,
-             ANeuralNetworksModel_setOperandValue(
-                 nn_model_, ann_operand, values, sizeof(T) * num_values));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_operand, values, sizeof(T) * num_values));
     augmented_inputs_.push_back(ann_operand);
     return kTfLiteOk;
   }
 
+  // Access to NNAPI.
+  const NnApi* const nnapi_;
+
   // TfLiteContext for error handling.
   TfLiteContext* const context_;
 
@@ -388,7 +386,7 @@ ANeuralNetworksOperationType BasicMappingFn(
 // The kernel that represents the node sub set of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
-  NNAPIDelegateKernel() = default;
+  NNAPIDelegateKernel() { nnapi_ = NnApiImplementation(); }
 
   typedef ANeuralNetworksOperationType (*MappingFn)(
       const NNAPIOpMappingArgs& mapping_args);
@@ -397,7 +395,7 @@ class NNAPIDelegateKernel {
   // when called. You can use this function to see if a node is supported
   // (i.e. that MappingFn is not nullptr).
   static MappingFn Map(TfLiteContext* context, int builtin_code, int version,
-                       TfLiteNode* node) {
+                       int android_sdk_version, TfLiteNode* node) {
     switch (builtin_code) {
       case kTfLiteBuiltinAdd:
         if (version == 1) {
@@ -516,7 +514,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinSqueeze:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteSqueezeParams*>(
@@ -632,7 +630,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinSub:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float sub.
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -645,7 +643,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinDiv:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float div.
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -658,7 +656,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinPad:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             node->inputs->size == 2 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI does not support specifying the padding value.
@@ -668,12 +666,12 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinSpaceToBatchNd:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return BasicMappingFn<ANEURALNETWORKS_SPACE_TO_BATCH_ND>;
         }
         break;
       case kTfLiteBuiltinStridedSlice:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(
@@ -691,7 +689,7 @@ class NNAPIDelegateKernel {
         // dimensions.
         // TODO(b/110888333): Support dynamically-sized tensors in delegates.
         if ((version == 1) &&
-            (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
+            (android_sdk_version >= kMinSdkVersionForNNAPI11) &&
             (node->inputs->size > 1) &&
             (context->tensors[node->inputs->data[1]].allocation_type ==
              kTfLiteMmapRo)) {
@@ -789,7 +787,7 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinMean:
         // NNAPI does not support generating a scalar as output for MEAN.
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 &&
             context->tensors[node->outputs->data[0]].dims->size > 0) {
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -833,7 +831,8 @@ class NNAPIDelegateKernel {
 
     if (!nn_model_) {
       ANeuralNetworksModel* model;
-      CHECK_NN(context, ANeuralNetworksModel_create(&model));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksModel_create(&model));
       nn_model_.reset(model);
 
       TF_LITE_ENSURE_STATUS(
@@ -842,9 +841,11 @@ class NNAPIDelegateKernel {
 
     if (!nn_compilation_) {
       ANeuralNetworksCompilation* compilation;
-      CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(),
-                                                          &compilation));
-      CHECK_NN(context, ANeuralNetworksCompilation_finish(compilation));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksCompilation_create(nn_model_.get(),
+                                                             &compilation));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksCompilation_finish(compilation));
       nn_compilation_.reset(compilation);
     }
     return kTfLiteOk;
@@ -852,8 +853,9 @@ class NNAPIDelegateKernel {
 
   TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
     ANeuralNetworksExecution* execution = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(),
-                                                      &execution));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi_->ANeuralNetworksExecution_create(nn_compilation_.get(),
+                                                         &execution));
 
     // Set the input tensor buffers. Note: we access tflite tensors using
     // absolute indices but NN api indices inputs by relative indices.
@@ -871,10 +873,11 @@ class NNAPIDelegateKernel {
         // copy data to pre-allocated shared memory.
         memcpy(nn_input_memory_->get_data_ptr() + input_offset,
                tensor->data.raw, tensor->bytes);
-        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
-                              execution, relative_input_index, nullptr,
-                              nn_input_memory_->get_handle(), input_offset,
-                              tensor->bytes));
+        RETURN_TFLITE_ERROR_IF_NN_ERROR(
+            context,
+            nnapi_->ANeuralNetworksExecution_setInputFromMemory(
+                execution, relative_input_index, nullptr,
+                nn_input_memory_->get_handle(), input_offset, tensor->bytes));
         input_offset += tensor->bytes;
         relative_input_index++;
       }
@@ -885,10 +888,11 @@ class NNAPIDelegateKernel {
     size_t output_offset = 0;
     for (auto output_index : TfLiteIntArrayView(node->outputs)) {
       TfLiteTensor* tensor = &context->tensors[output_index];
-      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
-                            execution, relative_output_index, nullptr,
-                            nn_output_memory_->get_handle(), output_offset,
-                            tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          nnapi_->ANeuralNetworksExecution_setOutputFromMemory(
+              execution, relative_output_index, nullptr,
+              nn_output_memory_->get_handle(), output_offset, tensor->bytes));
       output_offset += tensor->bytes;
       relative_output_index++;
     }
@@ -901,17 +905,21 @@ class NNAPIDelegateKernel {
       // Here we are using a deep copy for state_in tensors so that we are not
       // reading and writing into the same buffer during a invocation.
       // TODO(110369471): using double shared buffer to minimize the copies.
-      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
-                            execution, relative_output_index, nullptr,
-                            tensor->data.raw, tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksExecution_setOutput(
+                       execution, relative_output_index, nullptr,
+                       tensor->data.raw, tensor->bytes));
       relative_output_index++;
     }
     // Invoke ANN in blocking fashion.
     ANeuralNetworksEvent* event = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event));
-    CHECK_NN(context, ANeuralNetworksEvent_wait(event));
-    ANeuralNetworksEvent_free(event);
-    ANeuralNetworksExecution_free(execution);
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context,
+        nnapi_->ANeuralNetworksExecution_startCompute(execution, &event));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(context,
+                                    nnapi_->ANeuralNetworksEvent_wait(event));
+    nnapi_->ANeuralNetworksEvent_free(event);
+    nnapi_->ANeuralNetworksExecution_free(execution);
 
     // copy results from shared memory to the destination.
     output_offset = 0;
@@ -926,6 +934,8 @@ class NNAPIDelegateKernel {
   }
 
  private:
+  // Access to NNApi.
+  const NnApi* nnapi_;
   // ANN API state.
   std::unique_ptr<ANeuralNetworksModel, NNFreeModel> nn_model_;
   std::unique_ptr<ANeuralNetworksCompilation, NNFreeCompilation>
@@ -946,7 +956,7 @@ class NNAPIDelegateKernel {
     // The operand builder allows creating a single op. We create it at this
     // reduced power position rather than in the for loop to avoid reallocating
     // the vectors.
-    NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get());
+    NNAPIOpBuilder builder(nnapi_, context, &operand_mapping_, nn_model_.get());
     // Add Tensors
     // allocate outside to avoid realloc
     for (auto node_index : nodes_) {
@@ -969,9 +979,10 @@ class NNAPIDelegateKernel {
         }
       }
       // Get op type and operands
-      int nn_op_type = Map(context, reg->builtin_code, reg->version, node)(
-          {context, &builder, node, &model_state_outputs_,
-           &model_state_tfl_inputs_});
+      int nn_op_type = Map(
+          context, reg->builtin_code, reg->version, nnapi_->android_sdk_version,
+          node)({context, &builder, node, &model_state_outputs_,
+                 &model_state_tfl_inputs_});
       // Map outputs to NN API tensor indices.
       for (auto output_index : TfLiteIntArrayView(node->outputs)) {
         TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
@@ -1016,24 +1027,28 @@ class NNAPIDelegateKernel {
     }
 
     // Tell ANN to declare inputs/outputs
-    CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs(
-                          nn_model_.get(), inputs.size(), inputs.data(),
-                          outputs.size(), outputs.data()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi_->ANeuralNetworksModel_identifyInputsAndOutputs(
+                     nn_model_.get(), inputs.size(), inputs.data(),
+                     outputs.size(), outputs.data()));
 
     // Set relaxed computation mode for fp32 if possible.
-    if (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-      CHECK_NN(context,
-               ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-                   nn_model_.get(), context->allow_fp32_relax_to_fp16));
+    if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI11) {
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+              nn_model_.get(), context->allow_fp32_relax_to_fp16));
     }
 
     // Finalize the model
-    CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi_->ANeuralNetworksModel_finish(nn_model_.get()));
 
     // Create shared memory pool for inputs and outputs.
-    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
+    nn_input_memory_.reset(
+        new NNMemory(nnapi_, "input_pool", total_input_byte_size));
     nn_output_memory_.reset(
-        new NNMemory("output_pool", total_output_byte_size));
+        new NNMemory(nnapi_, "output_pool", total_output_byte_size));
 
     return kTfLiteOk;
   }
@@ -1049,7 +1064,9 @@ TfLiteDelegate* NnApiDelegate() {
       .Prepare = [](TfLiteContext* context,
                     TfLiteDelegate* delegate) -> TfLiteStatus {
         // Do not check nodes_ if NN API is unavailable.
-        if (kAndroidSdkVersion < kMinSdkVersionForNNAPI || !NNAPIExists()) {
+        const NnApi* nnapi = NnApiImplementation();
+        if (nnapi->android_sdk_version < kMinSdkVersionForNNAPI ||
+            !nnapi->nnapi_exists) {
           return kTfLiteOk;
         }
 
@@ -1062,6 +1079,7 @@ TfLiteDelegate* NnApiDelegate() {
         TfLiteIntArray* plan;
         TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
 
+        int android_sdk_version = NnApiImplementation()->android_sdk_version;
         // Check for every node if it is supported
         // TODO(b/80625235): Fix this to do more careful checking of versioning.
         for (int node_index : TfLiteIntArrayView(plan)) {
@@ -1070,7 +1088,8 @@ TfLiteDelegate* NnApiDelegate() {
           TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
               context, node_index, &node, &registration));
           if (NNAPIDelegateKernel::Map(context, registration->builtin_code,
-                                       registration->version, node)) {
+                                       registration->version,
+                                       android_sdk_version, node)) {
             supported_nodes.push_back(node_index);
           }
         }
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index ca48af0c95211e644fc7e2a1a1472a2f1b46ad35..5da052eb42275d684bfbf83e7b52227ccbb97a06 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -27,6 +27,16 @@ using ::testing::ElementsAreArray;
 // TODO(b/110368244): figure out how to share the existing tests in kernels/ but
 // with the delegation on. Also, add more unit tests to improve code coverage.
 
+// This matcher uses 1 as maximum tolerance.
+MATCHER(QuantizedNear, "") {
+  const int diff = abs(std::get<0>(arg) - std::get<1>(arg));
+  if (diff > 1) {
+    *result_listener << "Quantized values can be at most off by one: " << diff;
+    return false;
+  }
+  return true;
+}
+
 class SingleOpModelWithNNAPI : public SingleOpModel {
  public:
   SingleOpModelWithNNAPI() {
@@ -585,14 +595,14 @@ class ReshapeOpModel : public SingleOpModelWithNNAPI {
   ReshapeOpModel(std::initializer_list<int> input_shape,
                  std::initializer_list<int> new_shape) {
     input_ = AddInput(TensorType_FLOAT32);
-    new_shape_ = AddInput(TensorType_INT32);
+    new_shape_ = AddConstInput<int>(TensorType_INT32, new_shape,
+                                    {static_cast<int>(new_shape.size())});
     output_ = AddOutput(TensorType_FLOAT32);
     SetBuiltinOp(
         BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
         CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
             .Union());
     BuildInterpreter({input_shape, {static_cast<int>(new_shape.size())}});
-    PopulateTensor<int>(new_shape_, new_shape);
   }
 
   void SetInput(std::initializer_list<float> data) {
@@ -1326,7 +1336,8 @@ TEST(NNAPIDelegate, LogisticQuantized) {
                   },
                   kQuantizedTolerance)));
   EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+              testing::Pointwise(QuantizedNear(),
+                                 {128, 1, 227, 251, 244, 32, 255, 188}));
 }
 
 #if 0
@@ -1576,14 +1587,17 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
                       std::initializer_list<int> begin_shape,
+                      std::initializer_list<int> begin_data,
                       std::initializer_list<int> end_shape,
-                      std::initializer_list<int> strides_shape, int begin_mask,
+                      std::initializer_list<int> end_data,
+                      std::initializer_list<int> strides_shape,
+                      std::initializer_list<int> strides_data, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
     input_ = AddInput(tensor_input_type);
-    begin_ = AddInput(TensorType_INT32);
-    end_ = AddInput(TensorType_INT32);
-    strides_ = AddInput(TensorType_INT32);
+    begin_ = AddConstInput(TensorType_INT32, begin_data, begin_shape);
+    end_ = AddConstInput(TensorType_INT32, end_data, end_shape);
+    strides_ = AddConstInput(TensorType_INT32, strides_data, strides_shape);
     output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
@@ -1596,15 +1610,6 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
   void SetInput(std::initializer_list<input_type> data) {
     PopulateTensor<input_type>(input_, data);
   }
-  void SetBegin(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(begin_, data);
-  }
-  void SetEnd(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(end_, data);
-  }
-  void SetStrides(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(strides_, data);
-  }
 
   std::vector<input_type> GetOutput() {
     return ExtractVector<input_type>(output_);
@@ -1619,39 +1624,47 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
   int output_;
 };
 
-TEST(NNAPIDelegate, StridedSliceIn2D) {
-  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
-  m.SetInput({1, 2, 3, 4, 5, 6});
-  m.SetBegin({1, 0});
-  m.SetEnd({2, 2});
-  m.SetStrides({1, 1});
+TEST(StridedSliceOpTest, In1D) {
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, {3}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
 }
 
-TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxis_NegativeSlice) {
-  // This is equivalent to tf.range(4)[:, tf.newaxis][-2, -1].
-  StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
-  m.SetInput({0, 1, 2, 3});
-  m.SetBegin({-2, -1});
-  m.SetEnd({-1, 0});
-  m.SetStrides({1, 1});
+TEST(StridedSliceOpTest, In1D_BeginMask) {
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, {3}, {1}, {1}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
 
+TEST(StridedSliceOpTest, In2D_Stride2) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {0, 0}, {2}, {2, 3}, {2}, {2, 2}, 0, 0,
+                          0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
 }
 
-TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxisMask) {
-  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+TEST(StridedSliceOpTest, In2D_EndMask) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {1, 0}, {2}, {2, 2}, {2}, {1, 1}, 0, 2,
+                          0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
-  m.SetBegin({0, 0});
-  m.SetEnd({1, 1});
-  m.SetStrides({1, 1});
   m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {0, 0, 0}, {3}, {2, 3, 1}, {3},
+                          {1, 1, 1}, 0, 0, 0, 0, 4);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11}));
 }
 
 static float rnn_input[] = {
@@ -1990,7 +2003,9 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
     input_ = AddInput(TensorType_FLOAT32);
     weights_feature_ = AddInput(weights_feature_type);
     weights_time_ = AddInput(weights_time_type);
-    bias_ = AddNullInput();
+    // TODO(b/121383394) : figure out why optional bias causes TFLite segfault
+    // when using NNAPI delegate.
+    bias_ = AddInput(TensorType_FLOAT32);
     const int num_filters = units * rank;
     activation_state_ = AddInput(
         TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}},
@@ -2006,6 +2021,8 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
         {units_},                             // bias tensor
         {batches, memory_size * num_filters}  // activation_state tensor
     });
+    // TODO(b/121383394) : remove once the optional bias bug is fixed.
+    PopulateTensor(bias_, std::vector<float>(units_));
   }
 
   // Populates the weights_feature tensor.
@@ -2048,12 +2065,16 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
 class SVDFOpModel : public BaseSVDFOpModel {
  public:
   using BaseSVDFOpModel::BaseSVDFOpModel;
+};
 
+class SVDFOpTest : public ::testing::Test {
+ protected:
   void VerifyGoldens(float golden_input[], float golden_output[],
-                     int golden_size, float tolerance = 1e-5) {
-    const int svdf_num_batches = num_batches();
-    const int svdf_input_size = input_size();
-    const int svdf_num_units = num_units();
+                     int golden_size, BaseSVDFOpModel* svdf,
+                     float tolerance = 1e-5) {
+    const int svdf_num_batches = svdf->num_batches();
+    const int svdf_input_size = svdf->input_size();
+    const int svdf_num_units = svdf->num_units();
     const int input_sequence_size =
         golden_size / sizeof(float) / (svdf_input_size * svdf_num_batches);
     // Going over each input batch, setting the input tensor, invoking the SVDF
@@ -2062,9 +2083,9 @@ class SVDFOpModel : public BaseSVDFOpModel {
       float* batch_start =
           golden_input + i * svdf_input_size * svdf_num_batches;
       float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
-      SetInput(0, batch_start, batch_end);
+      svdf->SetInput(0, batch_start, batch_end);
 
-      Invoke();
+      svdf->Invoke();
 
       const float* golden_start =
           golden_output + i * svdf_num_units * svdf_num_batches;
@@ -2073,13 +2094,13 @@ class SVDFOpModel : public BaseSVDFOpModel {
       std::vector<float> expected;
       expected.insert(expected.end(), golden_start, golden_end);
 
-      EXPECT_THAT(GetOutput(),
+      EXPECT_THAT(svdf->GetOutput(),
                   ElementsAreArray(ArrayFloatNear(expected, tolerance)));
     }
   }
 };
 
-TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) {
+TEST_F(SVDFOpTest, BlackBoxTestRank1) {
   SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
                    /*memory_size=*/10, /*rank=*/1);
   svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
@@ -2099,10 +2120,11 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) {
        -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
        -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
 
-  svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input));
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input),
+                &svdf);
 }
 
-TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) {
+TEST_F(SVDFOpTest, BlackBoxTestRank2) {
   SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
                    /*memory_size=*/10, /*rank=*/2);
   svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
@@ -2137,7 +2159,8 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) {
        0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
        0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
 
-  svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input));
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input),
+                &svdf);
 }
 
 class LSTMOpModel : public SingleOpModelWithNNAPI {
@@ -2223,71 +2246,69 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::initializer_list<float> f) {
+  void SetInputToInputWeights(std::vector<float> f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
+  void SetInputToForgetWeights(std::vector<float> f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
+  void SetInputToCellWeights(std::vector<float> f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
+  void SetInputToOutputWeights(std::vector<float> f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToInputWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+  void SetRecurrentToForgetWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+  void SetRecurrentToCellWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToOutputWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
+  void SetCellToInputWeights(std::vector<float> f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
+  void SetCellToForgetWeights(std::vector<float> f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
+  void SetCellToOutputWeights(std::vector<float> f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputGateBias(std::initializer_list<float> f) {
+  void SetInputGateBias(std::vector<float> f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::initializer_list<float> f) {
+  void SetForgetGateBias(std::vector<float> f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::initializer_list<float> f) {
-    PopulateTensor(cell_bias_, f);
-  }
+  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
 
-  void SetOutputGateBias(std::initializer_list<float> f) {
+  void SetOutputGateBias(std::vector<float> f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
+  void SetProjectionWeights(std::vector<float> f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::initializer_list<float> f) {
+  void SetProjectionBias(std::vector<float> f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -2342,22 +2363,22 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
 class BaseLstmTest : public ::testing::Test {
  protected:
   // Weights of the LSTM model. Some are optional.
-  std::initializer_list<float> input_to_input_weights_;
-  std::initializer_list<float> input_to_cell_weights_;
-  std::initializer_list<float> input_to_forget_weights_;
-  std::initializer_list<float> input_to_output_weights_;
-  std::initializer_list<float> input_gate_bias_;
-  std::initializer_list<float> cell_gate_bias_;
-  std::initializer_list<float> forget_gate_bias_;
-  std::initializer_list<float> output_gate_bias_;
-  std::initializer_list<float> recurrent_to_input_weights_;
-  std::initializer_list<float> recurrent_to_cell_weights_;
-  std::initializer_list<float> recurrent_to_forget_weights_;
-  std::initializer_list<float> recurrent_to_output_weights_;
-  std::initializer_list<float> cell_to_input_weights_;
-  std::initializer_list<float> cell_to_forget_weights_;
-  std::initializer_list<float> cell_to_output_weights_;
-  std::initializer_list<float> projection_weights_;
+  std::vector<float> input_to_input_weights_;
+  std::vector<float> input_to_cell_weights_;
+  std::vector<float> input_to_forget_weights_;
+  std::vector<float> input_to_output_weights_;
+  std::vector<float> input_gate_bias_;
+  std::vector<float> cell_gate_bias_;
+  std::vector<float> forget_gate_bias_;
+  std::vector<float> output_gate_bias_;
+  std::vector<float> recurrent_to_input_weights_;
+  std::vector<float> recurrent_to_cell_weights_;
+  std::vector<float> recurrent_to_forget_weights_;
+  std::vector<float> recurrent_to_output_weights_;
+  std::vector<float> cell_to_input_weights_;
+  std::vector<float> cell_to_forget_weights_;
+  std::vector<float> cell_to_output_weights_;
+  std::vector<float> projection_weights_;
 
   // LSTM input is stored as num_batch x num_inputs vector.
   std::vector<std::vector<float>> lstm_input_;
diff --git a/tensorflow/lite/examples/android/app/build.gradle b/tensorflow/lite/examples/android/app/build.gradle
index e5f5c7efd13b396161218294905857df479e5c3b..b372afae190ded84947b45655018a78633715c16 100644
--- a/tensorflow/lite/examples/android/app/build.gradle
+++ b/tensorflow/lite/examples/android/app/build.gradle
@@ -2,7 +2,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion '26.0.2'
+    buildToolsVersion '27.0.3'
     defaultConfig {
         applicationId "org.tensorflow.lite.demo"
         minSdkVersion 15
@@ -45,6 +45,6 @@ project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
 apply from: "download-models.gradle"
 
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
 }
diff --git a/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml b/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
index bc9574d646b7661de8ac9b745bd53cbba1eb9f31..e63c5d267efa827bf5b6feabbfa401c5d2319517 100644
--- a/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
+++ b/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
@@ -24,10 +24,6 @@
     <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
     <uses-permission android:name="android.permission.RECORD_AUDIO" />
 
-    <uses-sdk
-        android:minSdkVersion="21"
-        android:targetSdkVersion="23" />
-
     <application android:allowBackup="true"
         android:debuggable="true"
         android:label="@string/app_name"
diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle
index 7c79358e45937e5f1cb061fc24e7de603b964885..7c038ddd46418b6498251068a284e8ffcdeda96f 100644
--- a/tensorflow/lite/examples/android/build.gradle
+++ b/tensorflow/lite/examples/android/build.gradle
@@ -6,7 +6,7 @@ buildscript {
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:3.0.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index 4d5ea40cd05696f6853e7aee5f601a42a8947c90..4f6fcaa96c4b917b79dacc5180594c1458ef18ff 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -346,7 +346,15 @@ void ProcessInputWithQuantizedModel(
   NSLog(@"Time: %.4lf, avg: %.4lf, count: %d", end - start, total_latency / total_count,
         total_count);
 
-  const int output_size = 1000;
+  // read output size from the output sensor
+  const int output_tensor_index = interpreter->outputs()[0];
+  TfLiteTensor* output_tensor = interpreter->tensor(output_tensor_index);
+  TfLiteIntArray* output_dims = output_tensor->dims;
+  if (output_dims->size != 2 || output_dims->data[0] != 1) {
+    LOG(FATAL) << "Output of the model is in invalid format.";
+  }
+  const int output_size = output_dims->data[1];
+
   const int kNumResults = 5;
   const float kThreshold = 0.1f;
 
diff --git a/tensorflow/lite/examples/ios/camera/data/labels.txt b/tensorflow/lite/examples/ios/camera/data/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..572eccf90087c1c19874e40b950c1610f59cc9c2
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/data/labels.txt
@@ -0,0 +1,1001 @@
+dummy
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/lite/examples/ios/download_models.sh b/tensorflow/lite/examples/ios/download_models.sh
index 4828617d95e94c1b6ad811e04d3b94b659bd8f74..a450aba042e9975e1282453160f841b4ff55e0b9 100755
--- a/tensorflow/lite/examples/ios/download_models.sh
+++ b/tensorflow/lite/examples/ios/download_models.sh
@@ -17,42 +17,31 @@
 set -ex
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
-QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
+FLOAT_MODEL_URL="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz"
+QUANTIZED_MODEL_URL="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
 DOWNLOADS_DIR=$(mktemp -d)
 
-cd $SCRIPT_DIR
+cd "$SCRIPT_DIR"
 
 download_and_extract() {
-  local usage="Usage: download_and_extract URL DIR"
-  local url="${1:?${usage}}"
-  local dir="${2:?${usage}}"
+  local url="$1"
+  local dir="$2"
   echo "downloading ${url}" >&2
   mkdir -p "${dir}"
   tempdir=$(mktemp -d)
-  tempdir2=$(mktemp -d)
 
-  curl -L ${url} > ${tempdir}/zipped.zip
-  unzip ${tempdir}/zipped.zip -d ${tempdir2}
-
-  # If the zip file contains nested directories, extract the files from the
-  # inner directory.
-  if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
-    # unzip has no strip components, so unzip to a temp dir, and move the
-    # files we want from the tempdir to destination.
-    cp -R ${tempdir2}/*/* ${dir}/
-  else
-    cp -R ${tempdir2}/* ${dir}/
-  fi
-  rm -rf ${tempdir2} ${tempdir}
+  curl -L ${url} > ${tempdir}/archive.tgz
+  cd ${dir}
+  tar zxvf ${tempdir}/archive.tgz
+  rm -rf ${tempdir}
 }
 
-download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
-download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
-
-file ${DOWNLOADS_DIR}/models
+download_and_extract "${FLOAT_MODEL_URL}" "${DOWNLOADS_DIR}/float_model"
+download_and_extract "${QUANTIZED_MODEL_URL}" "${DOWNLOADS_DIR}/quantized_model"
 
-cp ${DOWNLOADS_DIR}/models/models/* simple/data/
-cp ${DOWNLOADS_DIR}/models/models/* camera/data/
-cp "${DOWNLOADS_DIR}/quantized_models/mobilenet_quant_v1_224.tflite" \
+cd "$SCRIPT_DIR"
+cp "${DOWNLOADS_DIR}/float_model/mobilenet_v1_1.0_224.tflite" "simple/data/mobilenet_v1_1.0_224.tflite"
+cp "${DOWNLOADS_DIR}/float_model/mobilenet_v1_1.0_224.tflite" "camera/data/mobilenet_v1_1.0_224.tflite"
+cp "${DOWNLOADS_DIR}/quantized_model/mobilenet_v1_1.0_224_quant.tflite" \
    'camera/data/mobilenet_quant_v1_224.tflite'
+echo "Done"
diff --git a/tensorflow/lite/examples/ios/simple/data/labels.txt b/tensorflow/lite/examples/ios/simple/data/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..572eccf90087c1c19874e40b950c1610f59cc9c2
--- /dev/null
+++ b/tensorflow/lite/examples/ios/simple/data/labels.txt
@@ -0,0 +1,1001 @@
+dummy
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/lite/examples/label_image/label_image.md b/tensorflow/lite/examples/label_image/label_image.md
index fd9f49918b4494eab845da7716a350ad6246f532..178f5b9d3012206571b6fcf8af1d2416df9a42e5 100644
--- a/tensorflow/lite/examples/label_image/label_image.md
+++ b/tensorflow/lite/examples/label_image/label_image.md
@@ -40,7 +40,7 @@ To run it. Prepare `./mobilenet_quant_v1_224.tflite`, `./grace_hopper.bmp`, and
 
 Run it:
 ```
-> ./label_image                                        
+> ./label_image
 Loaded model ./mobilenet_quant_v1_224.tflite
 resolved reporter
 invoked
@@ -51,9 +51,9 @@ average time: 100.986 ms
 0.0235294: 514 cornet
 0.0196078: 835 suit
 ```
-Run `interpreter->Invoker()` 100 times:
+Run `interpreter->Invoke()` 100 times:
 ```
-> ./label_image   -c 100                               
+> ./label_image   -c 100
 Loaded model ./mobilenet_quant_v1_224.tflite
 resolved reporter
 invoked
diff --git a/tensorflow/lite/experimental/c/BUILD b/tensorflow/lite/experimental/c/BUILD
index cde53e283830aca9c7990e3d8c4901f997621bc2..2f0f4327b7028feaa19719352c0d9c67f51b9f39 100644
--- a/tensorflow/lite/experimental/c/BUILD
+++ b/tensorflow/lite/experimental/c/BUILD
@@ -66,7 +66,6 @@ cc_library(
         ":c_api_internal",
         "//tensorflow/lite:context",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/kernels:builtin_ops",
     ],
 )
@@ -94,7 +93,6 @@ cc_test(
     deps = [
         ":c_api",
         "//tensorflow/lite:context",
-        "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD
index 0c351ee4eccee515ed34ec5e8607914f7064ffbf..a4950d2dad7c701cb738ba85a091face568b297f 100644
--- a/tensorflow/lite/experimental/examples/lstm/BUILD
+++ b/tensorflow/lite/experimental/examples/lstm/BUILD
@@ -17,6 +17,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "tflite_rnn",
+    srcs = ["tflite_rnn.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "unidirectional_sequence_lstm_test",
     size = "large",
@@ -38,3 +51,69 @@ py_test(
         "@six_archive//:six",
     ],
 )
+
+py_test(
+    name = "unidirectional_sequence_rnn_test",
+    size = "large",
+    srcs = ["unidirectional_sequence_rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":tflite_rnn",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "bidirectional_sequence_lstm_test",
+    size = "large",
+    srcs = ["bidirectional_sequence_lstm_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":tflite_lstm",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "large",
+    srcs = ["bidirectional_sequence_rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":tflite_rnn",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f4beda59f5fa1dcf6e1dd2dcf0f1463ca1604a
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    tf.reset_default_graph()
+    # Import MNIST dataset
+    self.mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
+
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Lstm Units.
+    self.num_units = 16
+
+  def buildLstmLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        TFLiteLSTMCell(
+            self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
+        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        TFLiteLSTMCell(
+            self.num_units // 2,
+            use_peepholes=True,
+            num_proj=8,
+            forget_bias=0,
+            name="rnn3"),
+        TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
+    ])
+
+  def buildModel(self, fw_lstm_layer, bw_lstm_layer):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units * 2, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    lstm_input = tf.unstack(x, self.time_steps, 1)
+    outputs, _, _ = tf.nn.static_bidirectional_rnn(
+        fw_lstm_layer, bw_lstm_layer, lstm_input, dtype="float32")
+
+    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
+    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, fw_lstm_layer, bw_lstm_layer, sess, saver):
+    model_dir = tempfile.mkdtemp()
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(fw_lstm_layer, bw_lstm_layer)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    # It is important to keep all the ophint output nodes.
+    hinted_outputs_nodes = find_all_hinted_output_nodes(sess)
+    hinted_outputs_nodes.append(output_class.op.name)
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, hinted_outputs_nodes)
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    tflite = tf.lite.toco_convert(
+        curr, [tflite_input], [outputs], allow_custom_ops=False)
+
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    try:
+      interpreter.allocate_tensors()
+    except ValueError:
+      assert False
+
+    input_index = (interpreter.get_input_details()[0]["index"])
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = (interpreter.get_output_details()[0]["index"])
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
+                                                  self.buildLstmLayer())
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(), self.buildLstmLayer(), sess, saver)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d049c7834e8a782936160b92844585877ef1b022
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
@@ -0,0 +1,187 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow import flags
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.tflite_rnn import TfLiteRNNCell
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+FLAGS = flags.FLAGS
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(BidirectionalSequenceRnnTest, self).__init__(*args, **kwargs)
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Rnn Units.
+    self.num_units = 16
+
+  def setUp(self):
+    super(BidirectionalSequenceRnnTest, self).setUp()
+    # Import MNIST dataset
+    data_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    self.mnist = input_data.read_data_sets(data_dir, one_hot=True)
+
+  def buildRnnLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        TfLiteRNNCell(self.num_units, name="rnn1"),
+        TfLiteRNNCell(self.num_units, name="rnn2")
+    ])
+
+  def buildModel(self, fw_rnn_layer, bw_rnn_layer):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units * 2, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    rnn_input = tf.unstack(x, self.time_steps, 1)
+    outputs, _, _ = tf.nn.static_bidirectional_rnn(
+        fw_rnn_layer, bw_rnn_layer, rnn_input, dtype="float32")
+
+    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
+    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, fw_rnn_layer, bw_rnn_layer, sess, saver):
+    model_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(fw_rnn_layer, bw_rnn_layer)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    # It is important to keep all the ophint output nodes.
+    hinted_outputs_nodes = find_all_hinted_output_nodes(sess)
+    hinted_outputs_nodes.append(output_class.op.name)
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, hinted_outputs_nodes)
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    tflite = tf.lite.toco_convert(
+        curr, [tflite_input], [outputs], allow_custom_ops=False)
+
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    interpreter.allocate_tensors()
+
+    input_index = interpreter.get_input_details()[0]["index"]
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = interpreter.get_output_details()[0]["index"]
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildRnnLayer(),
+                                                  self.buildRnnLayer())
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), self.buildRnnLayer(), sess, saver)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py b/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
index 2fe8ebf9e99f8b0e592e83c2e473dd2f8395c6c0..461345060badbad0fc65b37466436b1a1eb424a4 100644
--- a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
+++ b/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
@@ -184,7 +184,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
       var = self.add_variable(
           name, shape=shape, initializer=initializer, partitioner=partitioner)
       return self._tflite_wrapper.add_input(
-          var, name="name", index_override=index)
+          var, name=name, index_override=index)
 
     weight_initializer = self._initializer
     if self.dtype is None:
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py b/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4aad18367e6c8bf9669e928dff8d7c9376043b7
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TfLite BasicRnnCell wrapper.
+
+TODO(renjieliu): Find a better home for this one.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import itertools
+
+from tensorflow.lite.python import lite
+from tensorflow.python.keras import activations
+from tensorflow.python.layers import base as base_layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn_cell_impl
+
+
+class TfLiteRNNCell(rnn_cell_impl.LayerRNNCell):
+  """The most basic RNN cell.
+
+  This is used only for TfLite, it provides hints and it also makes the
+  variables in the desired for the tflite ops.
+  """
+
+  def __init__(self,
+               num_units,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initializes the parameters for an RNN cell.
+
+    Args:
+      num_units: int, The number of units in the RNN cell.
+      activation: Nonlinearity to use.  Default: `tanh`. It could also be string
+        that is within Keras activation function names.
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope. Raises an error if not `True` and the existing scope
+        already has the given variables.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.
+      **kwargs: Dict, keyword named properties for common layer attributes, like
+        `trainable` etc when constructing the cell from configs of get_config().
+
+    Raises:
+      ValueError: If the existing scope already has the given variables.
+    """
+    super(TfLiteRNNCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be Rank-2.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceRnn")
+    self._num_units = num_units
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
+
+  @property
+  def state_size(self):
+    return self._num_units
+
+  @property
+  def output_size(self):
+    return self._num_units
+
+  def build(self, inputs_shape):
+    """Builds the RNN cell.
+
+    Args:
+      inputs_shape: Rnn input tensor shape.
+
+    Raises:
+      ValueError: If last dimension of the input shape is not known.
+    """
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       (inputs_shape,))
+
+    input_depth = inputs_shape[-1]
+
+    def add_variable_wrapped(name, shape, initializer, index):
+      var = self.add_variable(name, shape=shape, initializer=initializer)
+      return self._tflite_wrapper.add_input(
+          var, name=name, index_override=index)
+
+    self._input_weights = add_variable_wrapped(
+        "input_weights", [self._num_units, input_depth], None, 1)
+    self._recurrent_weights = add_variable_wrapped(
+        "recurrent_weights", [self._num_units, self._num_units], None, 2)
+    self._bias = add_variable_wrapped(
+        "bias",
+        shape=[self._num_units],
+        initializer=init_ops.zeros_initializer(dtype=self.dtype),
+        index=3)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
+    inputs = self._tflite_wrapper.add_input(
+        inputs, tag="input", name="input", aggregate="stack", index_override=0)
+    state = self._tflite_wrapper.add_input(
+        state,
+        tag="hidden_state",
+        name="hidden_state",
+        aggregate="first",
+        index_override=4)
+    weights = array_ops.transpose(
+        array_ops.concat([self._input_weights, self._recurrent_weights], 1))
+    gate_inputs = math_ops.matmul(array_ops.concat([inputs, state], 1), weights)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
+    output = self._activation(gate_inputs)
+    output = self._tflite_wrapper.add_output(
+        output,
+        tag="output",
+        name="output",
+        index_override=1,
+        aggregate="stack")
+    return output, output
+
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(TfLiteRNNCell, self).get_config()
+    return dict(itertools.chain(base_config.items(), config.items()))
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f9e2dd9498f03665b52e423db43ce38d5401eb1
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
@@ -0,0 +1,195 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow import flags
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.tflite_rnn import TfLiteRNNCell
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+FLAGS = flags.FLAGS
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(UnidirectionalSequenceRnnTest, self).__init__(*args, **kwargs)
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Rnn Units.
+    self.num_units = 16
+
+  def setUp(self):
+    super(UnidirectionalSequenceRnnTest, self).setUp()
+    # Import MNIST dataset
+    data_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    self.mnist = input_data.read_data_sets(data_dir, one_hot=True)
+
+  def buildRnnLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        TfLiteRNNCell(self.num_units, name="rnn1"),
+        TfLiteRNNCell(self.num_units, name="rnn2")
+    ])
+
+  def buildModel(self, rnn_layer):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    # x is shaped [batch_size,time_steps,num_inputs]
+    rnn_input = tf.unstack(x, self.time_steps, 1)
+    outputs, _ = tf.nn.static_rnn(rnn_layer, rnn_input, dtype="float32")
+
+    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
+    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    sess.run(tf.global_variables_initializer())
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, rnn_layer, sess, saver):
+    """Saves and restores the model to mimic the most common use case.
+
+    Args:
+      rnn_layer: The rnn layer either a single rnn cell or a multi rnn cell.
+      sess: Old session.
+      saver: saver created by tf.train.Saver()
+
+    Returns:
+      A tuple containing:
+
+      - Input tensor of the restored model.
+      - Prediction tensor of the restored model.
+      - Output tensor, which is the softwmax result of the prediction tensor.
+      - new session of the restored model.
+
+    """
+    model_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(rnn_layer)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, [output_class.op.name])
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    tflite = tf.lite.toco_convert(
+        curr, [tflite_input], [outputs], allow_custom_ops=False)
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+    interpreter.allocate_tensors()
+
+    input_index = interpreter.get_input_details()[0]["index"]
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = interpreter.get_output_details()[0]["index"]
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildRnnLayer())
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), sess, saver)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD
index dd314545cb6488ea2a76494df39b4b69e92eca33..5d2337f2e225de71689d1fbe8b2d945c0f88a4a1 100644
--- a/tensorflow/lite/experimental/kernels/BUILD
+++ b/tensorflow/lite/experimental/kernels/BUILD
@@ -50,21 +50,13 @@ cc_library(
     }),
     deps = [
         ":ctc_utils",
-        "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/kernels:gemm_support",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:op_macros",
-        "//tensorflow/lite/kernels/internal:kernel_utils",
         "//tensorflow/lite/kernels/internal:optimized",
         "//tensorflow/lite/kernels/internal:optimized_base",
-        "//tensorflow/lite/kernels/internal:quantization_util",
-        "//tensorflow/lite/kernels/internal:reference_base",
         "//tensorflow/lite/kernels/internal:tensor",
-        "//tensorflow/lite/kernels/internal:tensor_utils",
         "@flatbuffers",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/BUILD b/tensorflow/lite/experimental/micro/BUILD
index e11159868e11a09e1b10d59da274cd08ee472593..b16b8b49f8a46a43475cc08807570e84a160aed4 100644
--- a/tensorflow/lite/experimental/micro/BUILD
+++ b/tensorflow/lite/experimental/micro/BUILD
@@ -12,6 +12,8 @@ load(
 cc_library(
     name = "micro_framework",
     srcs = [
+        "debug_log.cc",
+        "debug_log_numbers.cc",
         "micro_error_reporter.cc",
         "micro_interpreter.cc",
         "micro_mutable_op_resolver.cc",
@@ -19,13 +21,14 @@ cc_library(
     ],
     hdrs = [
         "compatibility.h",
+        "debug_log.h",
+        "debug_log_numbers.h",
         "micro_error_reporter.h",
         "micro_interpreter.h",
         "micro_mutable_op_resolver.h",
         "simple_tensor_allocator.h",
     ],
     deps = [
-        "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/schema:schema_fbs",
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
index 673daed74c41a1880e6f8803258033cce8d333ca..30070bf0088e287ffed46f844e18ed9eda8caaa3 100644
--- a/tensorflow/lite/experimental/micro/README.md
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -1,46 +1,90 @@
 # TensorFlow Lite for Microcontrollers
 
-This an experimental port of TensorFlow Lite aimed at micro controllers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems. The core runtime fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword detection model, takes up a total of 22KB.
-
-The design goals are for the framework to be:
-
-- **Readable**: We want embedded software engineers to be able to understand what's required to run ML inference without having to study research papers. We've tried to keep the code base small, modular, and have reference implementations of all operations to help with this.
-
-- **Easy to modify**: We know that there are a lot of different platforms and requirements in the embedded world, and we don't expect to cover all of them in one framework. Instead, we're hoping that it can be a good starting point for developers to build on top of to meet their own needs. For example, we tried to make it easy to replace the implementations of key computational operators that are often crucial for performance, without having to touch the data flow and other runtime code. We want it to make more sense to use our workflow to handle things like model import and less-important operations, and customize the parts that matter, rather than having to reimplement everything in your own engine.
-
-- **Well-tested**: If you're modifying code, you need to know if your changes are correct. Having an easy way to test lets you develop much faster. To help there, we've written tests for all the components, and we've made sure that the tests can be run on almost any platform, with no dependencies apart from the ability to log text to a debug console somewhere. We also provide an easy way to run all the tests on-device as part of an automated test framework, and we use qemu/Renode emulation so that tests can be run even without physical devices present.
-
-- **Easy to integrate**: We want to be as open a system as possible, and use the best code available for each platform. To do that, we're going to rely on projects like [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to handle as much performance-critical code as possible. We know that there are an increasing number of options to accelerate neural networks on microcontrollers, so we're aiming to be a good host for deploying those hardware technologies too.
-
-- **Compatible**: We're using the same file schema, interpreter API, and kernel interface as regular TensorFlow Lite, so we leverage the large existing set of tools, documentation, and examples for the project. The biggest barrier to deploying ML models is getting them from a training environment into a form that's easy to run inference on, so we see reusing this rich ecosystem as being crucial to being easily usable. We also hope to integrate this experimental work back into the main codebase in the future.
-
-To meet those goals, we've made some tradeoffs:
-
-- **Simple C++**: To help with readability, our code is written in a modern version of C++, but we generally treat it as a "better C", rather relying on more complex features such as template meta-programming. As mentioned earlier, we avoid any use of dynamic memory allocation (new/delete) or the standard C/C++ libraries, so we believe this should still be fairly portable. It does mean that some older devices with C-only toolchains won't be supported, but we're hoping that the reference operator implementations (which are simple C-like functions) can still be useful in those cases. The interfaces are also designed to be C-only, so it should be possible to integrate the resulting library with pure C projects.
-
-- **Interpreted**: Code generation is a popular pattern for embedded code, because it gives standalone code that's easy to modify and step through, but we've chosen to go with an interpreted approach. In our internal microcontroller work we've found that using an extremely stripped-down interpreter with almost no dependencies gives us a lot of the same advantages, but is easier to maintain. For example, when new updates come out for the underlying library, you can just merge your local modifications in a single step, rather than having to regenerate new code and then patch in any changes you subsequently made. The coarse granularity of the interpreted primitives means that each operation call typically takes hundreds of thousands of instruction cycles at least, so we don't see noticeable performance gains from avoiding what's essentially a single switch statement at the interpreter level to call each operation. We're still working on improving the packaging though, for example we're considering having the ability to snapshot all the source files and headers used for a particular model, being able to compile the code and data together as a library, and then access it through a minimal set of C interface calls which hide the underlying complexity.
-
-- **Flatbuffers**: We represent our models using [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs), with the difference that we always keep it in read-only program memory (typically flash) rather than relying on having a file system to read it from. This is a good fit because flatbuffer's serialized format is designed to be mapped into memory without requiring any extra memory allocations or modifications to access it. All of the functions to read model values work directly on the serialized bytes, and large sections of data like weights are directly accessible as sequential C-style arrays of their data type, with no strides or unpacking needed. We do get a lot of value from using flatbuffers, but there is a cost in complexity. The flat buffer library code is all inline [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h), but it isn't straightforward to inspect their implementations, and the model data structures aren't easy to comprehend from the debugger. The header for the schema itself also has to be periodically updated when new information is added to the file format, though we try to handle that transparently for most developers by checking in a pre-generated version.
-
-- **Code Duplication**: Some of the code in this prototype largely duplicates the logic in other parts of the TensorFlow Lite code base, for example the operator wrappers. We've tried to keep share as much as we can between the two interpreters, but there are some assumptions built into the original runtime that make this difficult. We'll be working on modularizing the main interpreter so that we can move to an entirely shared system.
-
-This initial preview release is designed to get early feedback, and is not intended to be a final product. It only includes enough operations to run a simple keyword recognition model, and the implementations are not optimized. We're hoping this will be a good way to get feedback and collaborate to improve the framework.
-
-## Getting Started
-
-Building requires a Linux or OS X machine.
-
- - Open a terminal
- - Download the TensorFlow source with `git clone https://github.com/tensorflow`
- - Enter the source root directory by running `cd tensorflow`
- - Download the dependencies by running `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes
- - Build and test the library with `make -f tensorflow/lite/experimental/micro/tools/make/Makefile test`
+This an experimental port of TensorFlow Lite aimed at micro controllers and
+other devices with only kilobytes of memory. It doesn't require any operating
+system support, any standard C or C++ libraries, or dynamic memory allocation,
+so it's designed to be portable even to 'bare metal' systems. The core runtime
+fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword
+detection model, takes up a total of 22KB.
+
+## Table of Contents
+
+-   [Getting Started](#getting-started)
+    *   [Getting Started with Portable Reference Code](#getting-started-with-portable-reference-code)
+    *   [Building Portable Reference Code using Make](#building-portable-reference-code-using-make)
+    *   [Building for the "Blue Pill" STM32F103 using Make](#building-for-the-blue-pill-stm32f103-using-make)
+    *   [Building for "Hifive1" SiFive FE310 development board using Make](#building-for-hifive1-sifive-fe310-development-board-using-make)
+    *   [Building for Ambiq Micro Apollo3Blue EVB using Make](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+        *   [Additional Apollo3 Instructions](#additional-apollo3-instructions)
+-   [Goals](#goals)
+-   [Generating Project Files](#generating-project-#files)
+-   [How to Port TensorFlow Lite Micro to a New Platform](#how-to-port-tensorflow-lite-micro-to-a-new-platform)
+    *   [Requirements](#requirements)
+    *   [Getting Started](getting-started)
+    *   [Troubleshooting](#troubleshooting)
+    *   [Optimizing for your Platform](#optimizing-for-your-platform)
+    *   [Code Module Organization](#code-module-organization)
+    *   [Working with Generated Projects](#working-with-generated-projects)
+    *   [Supporting a Platform with Makefiles](#supporting-a-platform-with-makefiles)
+    *   [Supporting a Platform with Emulation Testing](#supporting-a-platform-with-emulation-testing)
+    *   [Implementing More Optimizations](#implementing-more-optimizations)
+
+# Getting Started
+
+One of the challenges of embedded software development is that there are a lot
+of different architectures, devices, operating systems, and build systems. We
+aim to support as many of the popular combinations as we can, and make it as
+easy as possible to add support for others.
+
+If you're a product developer, we have build instructions or pre-generated
+project files that you can download for the following platforms:
+
+| Device | Mbed | Keil | Make/GCC
+-------- | ---- | ---- | --------
+[STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html) | [Download](https://drive.google.com/open?id=1OtgVkytQBrEYIpJPsE8F6GUKHPBS3Xeb) | - | [Download](https://drive.google.com/open?id=1u46mTtAMZ7Y1aD-He1u3R8AE4ZyEpnOl) | -
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) | - | - | [Instructions](#building-for-the-blue-pill-stm32f103-using-make)
+[Ambiq Micro Apollo3Blue EVB using Make](https://ambiqmicro.com/apollo-ultra-low-power-mcus/) | - | - | [Instructions](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+[Generic Keil uVision Projects](http://www2.keil.com/mdk5/uvision/) | - | [Download](https://drive.google.com/open?id=1Lw9rsdquNKObozClLPoE5CTJLuhfh5mV) | -
+
+If your device is not yet supported, it may not be too hard to add support. You
+can learn about that process
+[here](#how-to-port-tensorflow-lite-micro-to-a-new-platform). We're looking
+forward to getting your help expanding this table!
+
+## Getting Started with Portable Reference Code
+
+If you don't have a particular microcontroller platform in mind yet, or just
+want to try out the code before beginning porting, the easiest way to begin is
+by
+[downloading the platform-agnostic reference code](https://drive.google.com/open?id=1cawEQAkqquK_SO4crReDYqf_v7yAwOY8).
+You'll see a series of folders inside the archive, with each one containing just
+the source files you need to build one binary. There is a simple Makefile for
+each folder, but you should be able to load the files into almost any IDE and
+build them. There's also a [Visual Studio Code](https://code.visualstudio.com/) project file already set up, so
+you can easily explore the code in a cross-platform IDE.
+
+## Building Portable Reference Code using Make
+
+It's easy to build portable reference code directly from GitHub using make if
+you're on a Linux or OS X machine.
+
+-   Open a terminal
+-   Download the TensorFlow source with `git clone
+    https://github.com/tensorflow/tensorflow.git`
+-   Enter the source root directory by running `cd tensorflow`
+-   Download the dependencies by running
+    `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`.
+    This may take a few minutes
+-   Build and test the library with `make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile test`
 
 You should see a series of compilation steps, followed by `~~~ALL TESTS
 PASSED~~~` for the various tests of the code that it will run. If there's an
 error, you should get an informative message from make about what went wrong.
 
-These tests are all built as simple binaries with few dependencies, so you can run them manually. For example, here's how to run the depthwise convolution test, and its output:
+These tests are all built as simple binaries with few dependencies, so you can
+run them manually. For example, here's how to run the depthwise convolution
+test, and its output:
 
 ```
 tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test
@@ -53,7 +97,9 @@ Testing SimpleTestReluQuantized
 ~ALL TESTS PASSED~~~
 ```
 
-Looking at the [depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc) code, you'll see a sequence that looks like this:
+Looking at the
+[depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc)
+code, you'll see a sequence that looks like this:
 
 ```
 ...
@@ -74,19 +120,41 @@ output, and the test harness that runs the binary during the make process knows
 that everything ran correctly. If there's an error, the lack of the expected
 string lets the harness know that the test failed.
 
-So, why are we running tests in this complicated way? So far, we've been building binaries that run locally on the Mac OS or Linux machine you're building on, but this approach becomes important when we're targeting simple micro controller devices.
-
-## Building for the "Blue Pill" STM32F103
-
-The goal of this library is to enable machine learning on resource-constrained micro controllers and DSPs, and as part of that we've targeted the ["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so it's a good device to ensure we can run efficiently on small chips.
-
-It's fairly easy to [buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill), but even if you don't have an actual device, the [Renode project](https://renode.io/) makes it easy to run a faithful emulation on your desktop machine. You'll need [Docker](https://www.docker.com/) installed, but once you have that set up, try running the following command:
-
-`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test`
-
-You should see a similar set of outputs as you did in the previous section, with the addition of some extra Docker logging messages. These are because we're using Docker to run the Renode micro controller emulation tool, and the tests themselves are being run on a simulated STM32F103 device. The communication channels between an embedded device and the host are quite limited, so the test harness looks at the output of the debug log to see if tests have passed, just as it did in the previous section. This makes it a very flexible way to run cross-platform tests, even when a platform has no operating system facilities, as long as it can output debugging text logs.
-
-To understand what's happening here, try running the same depthwise convolution test, but through the emulated device test harness, with the following command:
+So, why are we running tests in this complicated way? So far, we've been
+building binaries that run locally on the Mac OS or Linux machine you're
+building on, but this approach becomes important when we're targeting simple
+micro controller devices.
+
+## Building for the "Blue Pill" STM32F103 using Make
+
+The goal of this library is to enable machine learning on resource-constrained
+micro controllers and DSPs, and as part of that we've targeted the
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib)
+as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so
+it's a good device to ensure we can run efficiently on small chips.
+
+It's fairly easy to
+[buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill),
+but even if you don't have an actual device, the
+[Renode project](https://renode.io/) makes it easy to run a faithful emulation
+on your desktop machine. You'll need [Docker](https://www.docker.com/)
+installed, but once you have that set up, try running the following command:
+
+`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill
+test`
+
+You should see a similar set of outputs as you did in the previous section, with
+the addition of some extra Docker logging messages. These are because we're
+using Docker to run the Renode micro controller emulation tool, and the tests
+themselves are being run on a simulated STM32F103 device. The communication
+channels between an embedded device and the host are quite limited, so the test
+harness looks at the output of the debug log to see if tests have passed, just
+as it did in the previous section. This makes it a very flexible way to run
+cross-platform tests, even when a platform has no operating system facilities,
+as long as it can output debugging text logs.
+
+To understand what's happening here, try running the same depthwise convolution
+test, but through the emulated device test harness, with the following command:
 
 ```
 tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh \
@@ -115,7 +183,7 @@ LOGS:
 03:27:32.4834 [DEBUG] cpu.uartSemihosting: [+0.18ms host +0s virt 0s virt from start]   Testing SimpleTestReluQuantized
 03:27:32.4838 [DEBUG] cpu.uartSemihosting: [+0.4ms host +0s virt 0s virt from start]   4/4 tests passed
 03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+41µs host +0s virt 0s virt from start]   ~~~ALL TESTS PASSED~~~
-03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]   
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]
 ...
 tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test: PASS
 ```
@@ -126,3 +194,682 @@ debug logs here, along with the magic string `~~~ALL TESTS PASSED~~~`. This is
 the exact same code as before, just compiled and run on the STM32F103 rather
 than your desktop. We hope that the simplicity of this testing approach will
 help make adding support for new platforms as easy as possible.
+
+## Building for "Hifive1" SiFive FE310 development board
+
+We've targeted the
+["HiFive1" Arduino-compatible development board](https://www.sifive.com/boards/hifive1)
+as a test platform for RISC-V MCU.
+
+Similar to Blue Pill setup, you will need Docker installed. The binary can be
+executed on either HiFive1 board or emulated using
+[Renode project](https://renode.io/) on your desktop machine.
+
+The following instructions builds and transfers the source files to the Docker
+`docker build -t riscv_build \ -f
+{PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
+\ {PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/`
+
+You should see output that looks something like this:
+
+```
+Sending build context to Docker daemon  28.16kB
+Step 1/4 : FROM antmicro/renode:latest
+ ---> 19c08590e817
+Step 2/4 : LABEL maintainer="Pete Warden <petewarden@google.com>"
+ ---> Using cache
+ ---> 5a7770d3d3f5
+Step 3/4 : RUN apt-get update
+ ---> Using cache
+ ---> b807ab77eeb1
+Step 4/4 : RUN apt-get install -y curl git unzip make g++
+ ---> Using cache
+ ---> 8da1b2aa2438
+Successfully built 8da1b2aa2438
+Successfully tagged riscv_build:latest
+```
+
+Building micro_speech_test binary
+
+-   Lauch the Docker that we just created using: `docker run -it-v
+    /tmp/copybara_out:/workspace riscv_build:latest bash`
+-   Enter the source root directory by running `cd /workspace`
+-   Download the dependencies by running
+    `./tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`.
+    This may take a few minutes.
+-   Set the path to RISC-V tools: `export
+    PATH=${PATH}:/workspace/tensorflow/lite/experimental/micro/tools/make/downloads/riscv_toolchain/bin/`
+-   Build the binary: `make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=riscv32_mcu`
+
+Lauching Renode to test the binary, currently this set up is not automated.
+
+-   Execute the binary on Renode: `renode -P 5000 --disable-xwt -e 's
+    @/workspace/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc'`
+
+You should see the following log with the magic string `~~~ALL TEST PASSED~~~`:
+
+```
+02:25:22.2059 [DEBUG] uart0: [+17.25s host +80ms virt 80ms virt from start] core freq at 0 Hz
+02:25:22.2065 [DEBUG] uart0: [+0.61ms host +0s virt 80ms virt from start]   Testing TestInvoke
+02:25:22.4243 [DEBUG] uart0: [+0.22s host +0.2s virt 0.28s virt from start]   Ran successfully
+02:25:22.4244 [DEBUG] uart0: [+42µs host +0s virt 0.28s virt from start]
+02:25:22.4245 [DEBUG] uart0: [+0.15ms host +0s virt 0.28s virt from start]   1/1 tests passed
+02:25:22.4247 [DEBUG] uart0: [+62µs host +0s virt 0.28s virt from start]   ~~~ALL TESTS PASSED~~~
+02:25:22.4251 [DEBUG] uart0: [+8µs host +0s virt 0.28s virt from start]
+02:25:22.4252 [DEBUG] uart0: [+0.39ms host +0s virt 0.28s virt from start]
+02:25:22.4253 [DEBUG] uart0: [+0.16ms host +0s virt 0.28s virt from start]   Progam has exited with code:0x00000000
+```
+
+## Building for Ambiq Micro Apollo3Blue EVB using Make
+
+Follow these steps to get the pushbutton yes/no example working on Apollo 3:
+
+1.  Make sure to run the "Getting Started" section before performing the
+    following steps
+2.  Download Apollo3-SDK-2018.08.13 and place in
+    `tensorflow/lite/experimental/micro/tools/make/downloads`. This is not yet
+    publicly released, but you can contact ashah@ambiqmicro.com to request a
+    copy.
+3.  Compile the project with the following command: make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
+    pushbutton_cmsis_speech_test_bin
+4.  Install [Segger JLink tools](https://www.segger.com/downloads/jlink/)
+5.  Connect the Apollo3 EVB (with mic shield in slot 3 of Microbus Shield board)
+    to the computer and power it on.
+6.  Start the GDB server in a new terminal with the following command:
+    JLinkGDBServer -select USB -device AMA3B1KK-KBR -endian little -if SWD
+    -speed 1000 -noir -noLocalhostOnly
+    1.  The command has run successfully if you see the message "Waiting for GDB
+        connection"
+7.  Back in the original terminal, run the program via the debugger
+    1.  Navigate to
+        tensorflow/lite/experimental/micro/examples/micro_speech/apollo3
+    2.  Start gdb by entering the following command: arm-none-eabi-gdb
+    3.  Run the command script by entering the following command: source
+        pushbutton_cmsis_scores.cmd. This script does the following:
+        1.  Load the binary created in step 6
+        2.  Set a breakpoint after inference scores have been computed
+        3.  Tell the debugger what variables should be printed out at this
+            breakpoint
+        4.  Begin program execution
+        5.  Press Ctrl+c to exit
+    4.  Press BTN2. An LED will flash for 1 second. Speak your utterance during
+        this one second
+    5.  The debugger will print out four numbers. They are the probabilites for
+        1) no speech, 2) unknown speech, 3) yes, 4) no
+    6.  The EVB LEDs will indicate detection.
+        1.  LED0 (rightmost LED) - ON when capturing 1sec of audio
+        2.  LED1 - ON when detecting silence
+        3.  LED2 - ON when detecting UNKNOWN utterance
+        4.  LED3 - ON when detecting YES utterance
+        5.  LED4 (leftmost LED) - ON when detecting NO utterance
+
+### Additional Apollo3 Instructions
+
+To flash a part with JFlash Lite, do the following: 
+
+1. At the command line: JFlashLiteExe 
+2. Device = AMA3B1KK-KBR 
+3. Interface = SWD at 1000 kHz 
+4. Data file = `tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin`
+5. Prog Addr = 0x0000C000
+
+## Goals
+
+The design goals are for the framework to be:
+
+-   **Readable**: We want embedded software engineers to be able to understand
+    what's required to run ML inference without having to study research papers.
+    We've tried to keep the code base small, modular, and have reference
+    implementations of all operations to help with this.
+
+-   **Easy to modify**: We know that there are a lot of different platforms and
+    requirements in the embedded world, and we don't expect to cover all of them
+    in one framework. Instead, we're hoping that it can be a good starting point
+    for developers to build on top of to meet their own needs. For example, we
+    tried to make it easy to replace the implementations of key computational
+    operators that are often crucial for performance, without having to touch
+    the data flow and other runtime code. We want it to make more sense to use
+    our workflow to handle things like model import and less-important
+    operations, and customize the parts that matter, rather than having to
+    reimplement everything in your own engine.
+
+-   **Well-tested**: If you're modifying code, you need to know if your changes
+    are correct. Having an easy way to test lets you develop much faster. To
+    help there, we've written tests for all the components, and we've made sure
+    that the tests can be run on almost any platform, with no dependencies apart
+    from the ability to log text to a debug console somewhere. We also provide
+    an easy way to run all the tests on-device as part of an automated test
+    framework, and we use qemu/Renode emulation so that tests can be run even
+    without physical devices present.
+
+-   **Easy to integrate**: We want to be as open a system as possible, and use
+    the best code available for each platform. To do that, we're going to rely
+    on projects like
+    [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html),
+    [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to
+    handle as much performance-critical code as possible. We know that there are
+    an increasing number of options to accelerate neural networks on
+    microcontrollers, so we're aiming to be a good host for deploying those
+    hardware technologies too.
+
+-   **Compatible**: We're using the same file schema, interpreter API, and
+    kernel interface as regular TensorFlow Lite, so we leverage the large
+    existing set of tools, documentation, and examples for the project. The
+    biggest barrier to deploying ML models is getting them from a training
+    environment into a form that's easy to run inference on, so we see reusing
+    this rich ecosystem as being crucial to being easily usable. We also hope to
+    integrate this experimental work back into the main codebase in the future.
+
+To meet those goals, we've made some tradeoffs:
+
+-   **Simple C++**: To help with readability, our code is written in a modern
+    version of C++, but we generally treat it as a "better C", rather relying on
+    more complex features such as template meta-programming. As mentioned
+    earlier, we avoid any use of dynamic memory allocation (new/delete) or the
+    standard C/C++ libraries, so we believe this should still be fairly
+    portable. It does mean that some older devices with C-only toolchains won't
+    be supported, but we're hoping that the reference operator implementations
+    (which are simple C-like functions) can still be useful in those cases. The
+    interfaces are also designed to be C-only, so it should be possible to
+    integrate the resulting library with pure C projects.
+
+-   **Interpreted**: Code generation is a popular pattern for embedded code,
+    because it gives standalone code that's easy to modify and step through, but
+    we've chosen to go with an interpreted approach. In our internal
+    microcontroller work we've found that using an extremely stripped-down
+    interpreter with almost no dependencies gives us a lot of the same
+    advantages, but is easier to maintain. For example, when new updates come
+    out for the underlying library, you can just merge your local modifications
+    in a single step, rather than having to regenerate new code and then patch
+    in any changes you subsequently made. The coarse granularity of the
+    interpreted primitives means that each operation call typically takes
+    hundreds of thousands of instruction cycles at least, so we don't see
+    noticeable performance gains from avoiding what's essentially a single
+    switch statement at the interpreter level to call each operation. We're
+    still working on improving the packaging though, for example we're
+    considering having the ability to snapshot all the source files and headers
+    used for a particular model, being able to compile the code and data
+    together as a library, and then access it through a minimal set of C
+    interface calls which hide the underlying complexity.
+
+-   **Flatbuffers**: We represent our models using
+    [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs),
+    with the difference that we always keep it in read-only program memory
+    (typically flash) rather than relying on having a file system to read it
+    from. This is a good fit because flatbuffer's serialized format is designed
+    to be mapped into memory without requiring any extra memory allocations or
+    modifications to access it. All of the functions to read model values work
+    directly on the serialized bytes, and large sections of data like weights
+    are directly accessible as sequential C-style arrays of their data type,
+    with no strides or unpacking needed. We do get a lot of value from using
+    flatbuffers, but there is a cost in complexity. The flat buffer library code
+    is all inline
+    [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h),
+    but it isn't straightforward to inspect their implementations, and the model
+    data structures aren't easy to comprehend from the debugger. The header for
+    the schema itself also has to be periodically updated when new information
+    is added to the file format, though we try to handle that transparently for
+    most developers by checking in a pre-generated version.
+
+-   **Code Duplication**: Some of the code in this prototype largely duplicates
+    the logic in other parts of the TensorFlow Lite code base, for example the
+    operator wrappers. We've tried to keep share as much as we can between the
+    two interpreters, but there are some assumptions built into the original
+    runtime that make this difficult. We'll be working on modularizing the main
+    interpreter so that we can move to an entirely shared system.
+
+This initial preview release is designed to get early feedback, and is not
+intended to be a final product. It only includes enough operations to run a
+simple keyword recognition model, and the implementations are not optimized.
+We're hoping this will be a good way to get feedback and collaborate to improve
+the framework.
+
+## Generating Project Files
+
+It's not always easy or convenient to use a makefile-based build process,
+especially if you're working on a product that uses a different IDE for the rest
+of its code. To address that, it's possible to generate standalone project
+folders for various popular build systems. These projects are self-contained,
+with only the headers and source files needed by a particular binary, and
+include project files to make loading them into an IDE easy. These can be
+auto-generated for any target you can compile using the main Make system, using
+a command like this (making sure you've run `download_dependencies.sh` first):
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_micro_speech_mbed_project
+```
+
+This will create a folder in
+`tensorflow/lite/experimental/micro/tools/make/gen/mbed_cortex-m4/prj/micro_speech_main_test/mbed`
+that contains the source and header files, some Mbed configuration files, and a
+README. You should then be able to copy this directory to another machine, and
+use it just like any other Mbed project. There's more information about project
+files [below](#working-with-generated-projects).
+
+## How to Port TensorFlow Lite Micro to a New Platform
+
+Are you a hardware or operating system provider looking to run machine learning
+on your platform? We're keen to help, and we've had experience helping other
+teams do the same thing, so here are our recommendations.
+
+### Requirements
+
+Since the core neural network operations are pure arithmetic, and don't require
+any I/O or other system-specific functionality, the code doesn't have to have
+many dependencies. We've tried to enforce this, so that it's as easy as possible
+to get TensorFlow Lite Micro running even on 'bare metal' systems without an OS.
+Here are the core requirements that a platform needs to run the framework:
+
+-   C/C++ compiler capable of C++11 compatibility. This is probably the most
+    restrictive of the requirements, since C++11 is not as widely adopted in the
+    embedded world as it is elsewhere. We made the decision to require it since
+    one of the main goals of TFL Micro is to share as much code as possible with
+    the wider TensorFlow codebase, and since that relies on C++11 features, we
+    need compatibility to achieve it. We only use a small, sane, subset of C++
+    though, so don't worry about having to deal with template metaprogramming or
+    similar challenges!
+
+-   Debug logging. The core network operations don't need any I/O functions, but
+    to be able to run tests and tell if they've worked as expected, the
+    framework needs some way to write out a string to some kind of debug
+    console. This will vary from system to system, for example on Linux it could
+    just be `fprintf(stderr, debug_string)` whereas an embedded device might
+    write the string out to a specified UART. As long as there's some mechanism
+    for outputting debug strings, you should be able to use TFL Micro on that
+    platform.
+
+-   Math library. The C standard `libm.a` library is needed to handle some of
+    the mathematical operations used to calculate neural network results.
+
+-   Global variable initialization. We do use a pattern of relying on global
+    variables being set before `main()` is run in some places, so you'll need to
+    make sure your compiler toolchain
+
+And that's it! You may be wondering about some other common requirements that
+are needed by a lot of non-embedded software, so here's a brief list of things
+that aren't necessary to get started with TFL Micro on a new platform:
+
+-   Operating system. Since the only platform-specific function we need is
+    `DebugLog()`, there's no requirement for any kind of Posix or similar
+    functionality around files, processes, or threads.
+
+-   C or C++ standard libraries. The framework tries to avoid relying on any
+    standard library functions that require linker-time support. This includes
+    things like string functions, but still allows us to use headers like
+    `stdtypes.h` which typically just define constants and typedefs.
+    Unfortunately this distinction isn't officially defined by any standard, so
+    it's possible that different toolchains may decide to require linked code
+    even for the subset we use, but in practice we've found it's usually a
+    pretty obvious decision and stable over platforms and toolchains.
+
+-   Dynamic memory allocation. All the TFL Micro code avoids dynamic memory
+    allocation, instead relying on local variables on the stack in most cases,
+    or global variables for a few situations. These are all fixed-size, which
+    can mean some compile-time configuration to ensure there's enough space for
+    particular networks, but does avoid any need for a heap and the
+    implementation of `malloc\new` on a platform.
+
+-   Floating point. Eight-bit integer arithmetic is enough for inference on many
+    networks, so if a model sticks to these kind of quantized operations, no
+    floating point instructions should be required or executed by the framework.
+
+### Getting Started
+
+We recommend that you start trying to compile and run one of the simplest tests
+in the framework as your first step. The full TensorFlow codebase can seem
+overwhelming to work with at first, so instead you can begin with a collection
+of self-contained project folders that only include the source files needed for
+a particular test or executable. You can find a set of pre-generated projects
+[here](https://drive.google.com/open?id=1cawEQAkqquK_SO4crReDYqf_v7yAwOY8).
+
+As mentioned above, the one function you will need to implement for a completely
+new platform is debug logging. If your device is just a variation on an existing
+platform you may be able to reuse code that's already been written. To
+understand what's available, begin with the default reference implementation at
+[tensorflow/lite/experimental/micro/debug_log.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/debug_log.cc]),
+which uses fprintf and stderr. If your platform has this level of support for
+the C standard library in its toolchain, then you can just reuse this.
+Otherwise, you'll need to do some research into how your platform and device can
+communicate logging statements to the outside world. As another example, take a
+look at
+[the Mbed version of `DebugLog()`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/mbed/debug_log.cc),
+which creates a UART object and uses it to output strings to the host's console
+if it's connected.
+
+Begin by navigating to the micro_error_reporter_test folder in the pregenerated
+projects you downloaded. Inside here, you'll see a set of folders containing all
+the source code you need. If you look through them, you should find a total of
+around 60 C or C++ files that compiled together will create the test executable.
+There's an example makefile in the directory that lists all of the source files
+and include paths for the headers. If you're building on a Linux or MacOS host
+system, you may just be able to reuse that same makefile to cross-compile for
+your system, as long as you swap out the `CC` and `CXX` variables from their
+defaults, to point to your cross compiler instead (for example
+`arm-none-eabi-gcc` or `riscv64-unknown-elf-gcc`). Otherwise, set up a project
+in the build system you are using. It should hopefully be fairly
+straightforward, since all of the source files in the folder need to be
+compiled, so on many IDEs you can just drag the whole lot in. Then you need to
+make sure that C++11 compatibility is turned on, and that the right include
+paths (as mentioned in the makefile) have been added.
+
+You'll see the default `DebugLog()` implementation in
+'tensorflow/lite/experimental/micro/debug_log.cc' inside the
+micro_error_reporter_test folder. Modify that file to add the right
+implementation for your platform, and then you should be able to build the set
+of files into an executable. Transfer that executable to your target device (for
+example by flashing it), and then try running it. You should see output that
+looks something like this:
+
+```
+Number: 42
+Badly-formed format string
+Another  badly-formed  format string
+~~ALL TESTS PASSED~~~
+```
+
+If not, you'll need to debug what went wrong, but hopefully with this small
+starting project it should be manageable.
+
+### Troubleshooting
+
+When we've been porting to new platforms, it's often been hard to figure out
+some of the fundamentals like linker settings and other toolchain setup flags.
+If you are having trouble, see if you can find a simple example program for your
+platform, like one that just blinks an LED. If you're able to build and run that
+successfully, then start to swap in parts of the TF Lite Micro codebase to that
+working project, taking it a step at a time and ensuring it's still working
+after every change. For example, a first step might be to paste in your
+`DebugLog()` implementation and call `DebugLog("Hello World!")` from the main
+function.
+
+Another common problem on embedded platforms is the stack size being too small.
+Mbed defaults to 4KB for the main thread's stack, which is too small for most
+models since TensorFlow Lite allocates buffers and other data structures that
+require more memory. The exact size will depend on which model you're running,
+but try increasing it if you are running into strange corruption issues that
+might be related to stack overwriting.
+
+### Optimizing for your Platform
+
+The default reference implementations in TensorFlow Lite Micro are written to be
+portable and easy to understand, not fast, so you'll want to replace performance
+critical parts of the code with versions specifically tailored to your
+architecture. The framework has been designed with this in mind, and we hope the
+combination of small modules and many tests makes it as straightforward as
+possible to swap in your own code a piece at a time, ensuring you have a working
+version at every step. To write specialized implementations for a platform, it's
+useful to understand how optional components are handled inside the build
+system.
+
+### Code Module Organization
+
+We have adopted a system of small modules with platform-specific implementations
+to help with portability. Every module is just a standard `.h` header file
+containing the interface (either functions or a class), with an accompanying
+reference implementation in a `.cc` with the same name. The source file
+implements all of the code that's declared in the header. If you have a
+specialized implementation, you can create a folder in the same directory as the
+header and reference source, name it after your platform, and put your
+implementation in a `.cc` file inside that folder. We've already seen one
+example of this, where the Mbed and Bluepill versions of `DebugLog()` are inside
+[mbed](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/mbed)
+and
+[bluepill](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/bluepill)
+folders, children of the
+[same directory](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+where the stdio-based
+[`debug_log.cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/debug_log.cc)
+reference implementation is found.
+
+The advantage of this approach is that we can automatically pick specialized
+implementations based on the current build target, without having to manually
+edit build files for every new platform. It allows incremental optimizations
+from a always-working foundation, without cluttering the reference
+implementations with a lot of variants.
+
+To see why we're doing this, it's worth looking at the alternatives. TensorFlow
+Lite has traditionally used preprocessor macros to separate out some
+platform-specific code within particular files, for example:
+
+```
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+```
+
+There’s also a tradition in gemmlowp of using file suffixes to indicate
+platform-specific versions of particular headers, with kernel_neon.h being
+included by kernel.h if `USE_NEON` is defined. As a third variation, kernels are
+separated out using a directory structure, with
+tensorflow/lite/kernels/internal/reference containing portable implementations,
+and tensorflow/lite/kernels/internal/optimized holding versions optimized for
+NEON on Arm platforms.
+
+These approaches are hard to extend to multiple platforms. Using macros means
+that platform-specific code is scattered throughout files in a hard-to-find way,
+and can make following the control flow difficult since you need to understand
+the macro state to trace it. For example, I temporarily introduced a bug that
+disabled NEON optimizations for some kernels when I removed
+tensorflow/lite/kernels/internal/common.h from their includes, without realizing
+it was where USE_NEON was defined!
+
+It’s also tough to port to different build systems, since figuring out the right
+combination of macros to use can be hard, especially since some of them are
+automatically defined by the compiler, and others are only set by build scripts,
+often across multiple rules.
+
+The approach we are using extends the file system approach that we use for
+kernel implementations, but with some specific conventions:
+
+-   For each module in TensorFlow Lite, there will be a parent directory that
+    contains tests, interface headers used by other modules, and portable
+    implementations of each part.
+-   Portable means that the code doesn’t include code from any libraries except
+    flatbuffers, or other TF Lite modules. You can include a limited subset of
+    standard C or C++ headers, but you can’t use any functions that require
+    linking against those libraries, including fprintf, etc. You can link
+    against functions in the standard math library, in <math.h>.
+-   Specialized implementations are held inside subfolders of the parent
+    directory, named after the platform or library that they depend on. So, for
+    example if you had my_module/foo.cc, a version that used RISC-V extensions
+    would live in my_module/riscv/foo.cc. If you had a version that used the
+    CMSIS library, it should be in my_module/cmsis/foo.cc.
+-   These specialized implementations should completely replace the top-level
+    implementations. If this involves too much code duplication, the top-level
+    implementation should be split into smaller files, so only the
+    platform-specific code needs to be replaced.
+-   There is a convention about how build systems pick the right implementation
+    file. There will be an ordered list of 'tags' defining the preferred
+    implementations, and to generate the right list of source files, each module
+    will be examined in turn. If a subfolder with a tag’s name contains a .cc
+    file with the same base name as one in the parent folder, then it will
+    replace the parent folder’s version in the list of build files. If there are
+    multiple subfolders with matching tags and file names, then the tag that’s
+    latest in the ordered list will be chosen. This allows us to express “I’d
+    like generically-optimized fixed point if it’s available, but I’d prefer
+    something using the CMSIS library” using the list 'fixed_point cmsis'. These
+    tags are passed in as `TAGS="<foo>"` on the command line when you use the
+    main Makefile to build.
+-   There is an implicit “reference” tag at the start of every list, so that
+    it’s possible to support directory structures like the current
+    tensorflow/kernels/internal where portable implementations are held in a
+    “reference” folder that’s a sibling to the NEON-optimized folder.
+-   The headers for each unit in a module should remain platform-agnostic, and
+    be the same for all implementations. Private headers inside a sub-folder can
+    be used as needed, but shouldn’t be referred to by any portable code at the
+    top level.
+-   Tests should be at the parent level, with no platform-specific code.
+-   No platform-specific macros or #ifdef’s should be used in any portable code.
+
+The implementation of these rules is handled inside the Makefile, with a
+[`specialize` function](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc#L42)
+that takes a list of reference source file paths as an input, and returns the
+equivalent list with specialized versions of those files swapped in if they
+exist.
+
+### Working with Generated Projects
+
+So far, I've recommended that you use the standalone generated projects for your
+system. You might be wondering why you're not just checking out the full
+[TensorFlow codebase from GitHub](https://github.com/tensorflow/tensorflow/)?
+The main reason is that there is a lot more diversity of architectures, IDEs,
+support libraries, and operating systems in the embedded world. Many of the
+toolchains require their own copy of source files, or a list of sources to be
+written to a project file. When a developer working on TensorFlow adds a new
+source file or changes its location, we can't expect her to update multiple
+different project files, many of which she may not have the right software to
+verify the change was correct. That means we have to rely on a central listing
+of source files (which in our case is held in the makefile), and then call a
+tool to generate other project files from those. We could ask embedded
+developers to do this process themselves after downloading the main source, but
+running the makefile requires a Linux system which may not be available, takes
+time, and involves downloading a lot of dependencies. That is why we've opted to
+make regular snapshots of the results of generating these projects for popular
+IDEs and platforms, so that embedded developers have a fast and friendly way to
+start using TensorFlow Lite for Microcontrollers.
+
+This does have the disadvantage that you're no longer working directly on the
+main repository, instead you have a copy that's outside of source control. We've
+tried to make the copy as similar to the main repo as possible, for example by
+keeping the paths of all source files the same, and ensuring that there are no
+changes between the copied files and the originals, but it still makes it
+tougher to sync as the main repository is updated. There are also multiple
+copies of the source tree, one for each target, so any change you make to one
+copy has to be manually propagated across all the other projects you care about.
+This doesn't matter so much if you're just using the projects as they are to
+build products, but if you want to support a new platform and have the changes
+reflected in the main code base, you'll have to do some extra work.
+
+As an example, think about the `DebugLog()` implementation we discussed adding
+for a new platform earlier. At this point, you have a new version of
+`debug_log.cc` that does what's required, but how can you share that with the
+wider community? The first step is to pick a tag name for your platform. This
+can either be the operating system (for example 'mbed'), the name of a device
+('bluepill'), or some other text that describes it. This should be a short
+string with no spaces or special characters. Log in or create an account on
+GitHub, fork the full
+[TensorFlow codebase](https://github.com/tensorflow/tensorflow/) using the
+'Fork' button on the top left, and then grab your fork by using a command like
+`git clone https://github.com/<your user name>/tensorflow`.
+
+You'll either need Linux, MacOS, or Windows with something like CygWin installed
+to run the next steps, since they involve building a makefile. Run the following
+commands from a terminal, inside the root of the source folder:
+
+```
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile generate_projects
+```
+
+This will take a few minutes, since it has to download some large toolchains for
+the dependencies. Once it has finished, you should see some folders created
+inside a path like
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/`. The exact
+path depends on your host operating system, but you should be able to figure it
+out from all the copy commands. These folders contain the generated project and
+source files, with
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/keil`
+containing the Keil uVision targets,
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/mbed` with
+the Mbed versions, and so on.
+
+If you've got this far, you've successfully set up the project generation flow.
+Now you need to add your specialized implementation of `DebugLog()`. Start by
+creating a folder inside `tensorflow/lite/experimental/micro/` named after the
+tag you picked earlier. Put your `debug_log.cc` file inside this folder, and
+then run this command, with '<your tag>' replaced by the actual folder name:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS="<your tag>" generate_projects
+```
+
+If your tag name actually refers to a whole target architecture, then you'll use
+TARGET or TARGET_ARCH instead. For example, here's how a simple RISC-V set of
+projects is generated:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET="riscv32_mcu" generate_projects
+```
+
+The way it works is the same as TAGS though, it just looks for specialized
+implementations with the same containing folder name.
+
+If you look inside the projects that have been created, you should see that the
+default `DebugLog()` implementation is no longer present at
+`tensorflow/lite/experimental/micro/debug_log.cc`, and instead
+`tensorflow/lite/experimental/micro/<your tag>/debug_log.cc` is being used. Copy
+over the generated project files and try building them in your own IDE. If
+everything works, then you're ready to submit your change.
+
+To do this, run something like:
+
+```
+git add tensorflow/lite/experimental/micro/<your tag>/debug_log.cc
+git commit -a -m "Added DebugLog() support for <your platform>"
+git push origin master
+```
+
+Then go back to https://github.com/<your account>/tensorflow, and choose "New
+Pull Request" near the top. You should then be able to go through the standard
+TensorFlow PR process to get your change added to the main repository, and
+available to the rest of the community!
+
+### Supporting a Platform with Makefiles
+
+The changes you've made so far will enable other developers using the generated
+projects to use your platform, but TensorFlow's continuous integration process
+uses makefiles to build frequently and ensure changes haven't broken the build
+process for different systems. If you are able to convert your build procedure
+into something that can be expressed by a makefile, then we can integrate your
+platform into our CI builds and make sure it continues to work.
+
+Fully describing how to do this is beyond the scope of this documentation, but
+the biggest needs are:
+
+-   A command-line compiler that can be called for every source file.
+-   A list of the arguments to pass into the compiler to build and link all
+    files.
+-   The correct linker map files and startup assembler to ensure `main()` gets
+    called.
+
+### Supporting a Platform with Emulation Testing
+
+Integrating your platform into the makefile process should help us make sure
+that it continues to build, but it doesn't guarantee that the results of the
+build process will run correctly. Running tests is something we require to be
+able to say that TensorFlow officially supports a platform, since otherwise we
+can't guarantee that users will have a good experience when they try using it.
+Since physically maintaining a full set of all supported hardware devices isn't
+feasible, we rely on software emulation to run these tests. A good example is
+our
+[STM32F4 'Bluepill' support](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh),
+which uses [Docker](https://www.docker.com/) and [Renode](https://renode.io/) to
+run built binaries in an emulator. You can use whatever technologies you want,
+the only requirements are that they capture the debug log output of the tests
+being run in the emulator, and parse them for the string that indicates the test
+was successful. These scripts need to run on Ubuntu 18.04, in a bash
+environment, though Docker is available if you need to install extra software or
+have other dependencies.
+
+### Implementing More Optimizations
+
+Clearly, getting debug logging support is only the beginning of the work you'll
+need to do on a particular platform. It's very likely that you'll want to
+optimize the core deep learning operations that take up the most time when
+running models you care about. The good news is that the process for providing
+optimized implementations is the same as the one you just went through to
+provide your own logging. You'll need to identify parts of the code that are
+bottlenecks, and then add specialized implementations in their own folders.
+These don't need to be platform specific, they can also be broken out by which
+library they rely on for example. [Here's where we do that for the CMSIS
+implementation of integer fast-fourier
+transforms](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc).
+This more complex case shows that you can also add helper source files alongside
+the main implementation, as long as you
+[mention them in the platform-specific makefile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc).
+You can also do things like update the list of libraries that need to be linked
+in, or add include paths to required headers.
diff --git a/tensorflow/lite/experimental/micro/bluepill/debug_log.cc b/tensorflow/lite/experimental/micro/bluepill/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4812a918498ee2ab52e114bce9ca0cf3919b2254
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/bluepill/debug_log.cc
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+// For Arm Cortex-M devices, calling SYS_WRITE0 will output the zero-terminated
+// string pointed to by R1 to any debug console that's attached to the system.
+extern "C" void DebugLog(const char* s) {
+  asm("mov r0, #0x04\n"  // SYS_WRITE0
+      "mov r1, %[str]\n"
+      "bkpt #0xAB\n"
+      :
+      : [ str ] "r"(s)
+      : "r0", "r1");
+}
diff --git a/tensorflow/lite/experimental/micro/debug_log.cc b/tensorflow/lite/experimental/micro/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d4ca44d76b73020848e9757c230d7bf69ff5aaa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the DebugLog() function that's required for a
+// platform to support the TensorFlow Lite for Microcontrollers library. This is
+// the only function that's absolutely required to be available on a target
+// device, since it's used for communicating test results back to the host so
+// that we can verify the implementation is working correctly.
+// It's designed to be as easy as possible to supply an implementation though.
+// On platforms that have a POSIX stack or C library, it can be written as a
+// single call to `fprintf(stderr, "%s", s)` to output a string to the error
+// stream of the console, but if there's no OS or C library available, there's
+// almost always an equivalent way to write out a string to some serial
+// interface that can be used instead. For example on Arm M-series MCUs, calling
+// the `bkpt #0xAB` assembler instruction will output the string in r1 to
+// whatever debug serial connection is available. If you're running mbed, you
+// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling
+// `pc.printf("%s", s)`.
+// To add an equivalent function for your own platform, create your own
+// implementation file, and place it in a subfolder with named after the OS
+// you're targeting. For example, see the Cortex M bare metal version in
+// tensorflow/lite/experimental/micro/bluepill/debug_log.cc or the mbed one on
+// tensorflow/lite/experimental/micro/mbed/debug_log.cc.
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include <cstdio>
+
+extern "C" void DebugLog(const char* s) { fprintf(stderr, "%s", s); }
diff --git a/tensorflow/lite/experimental/micro/debug_log.h b/tensorflow/lite/experimental/micro/debug_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0e395c3760e2e0c57b50c38c05737dfecb7e680
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log.h
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
+
+// This function should be implemented by each target platform, and provide a
+// way for strings to be output to some text stream. For more information, see
+// tensorflow/lite/experimental/micro/debug_log.cc.
+extern "C" void DebugLog(const char* s);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
diff --git a/tensorflow/lite/experimental/micro/debug_log_numbers.cc b/tensorflow/lite/experimental/micro/debug_log_numbers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e86730674859d5560e5ec6b243e40c95f88bf4f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log_numbers.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements debug logging for numbers by converting them into strings and then
+// calling the main DebugLog(char*) function. These are separated into a
+// different file so that platforms can just implement the string output version
+// of DebugLog() and then get the numerical variations without requiring any
+// more code.
+
+#include "tensorflow/lite/experimental/micro/debug_log_numbers.h"
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+namespace {
+
+// All input buffers to the number conversion functions must be this long.
+static const int kFastToBufferSize = 48;
+
+// Reverses a zero-terminated string in-place.
+char* ReverseStringInPlace(char* start, char* end) {
+  char* p1 = start;
+  char* p2 = end - 1;
+  while (p1 < p2) {
+    char tmp = *p1;
+    *p1++ = *p2;
+    *p2-- = tmp;
+  }
+  return start;
+}
+
+// Appends a string to a string, in-place. You need to pass in the maximum
+// string length as the second argument.
+char* StrCatStr(char* main, int main_max_length, const char* to_append) {
+  char* current = main;
+  while (*current != 0) {
+    ++current;
+  }
+  char* current_end = main + (main_max_length - 1);
+  while ((*to_append != 0) && (current < current_end)) {
+    *current = *to_append;
+    ++current;
+    ++to_append;
+  }
+  *current = 0;
+  return current;
+}
+
+// Populates the provided buffer with an ASCII representation of the number.
+char* FastUInt32ToBufferLeft(uint32_t i, char* buffer, int base) {
+  char* start = buffer;
+  do {
+    int32_t digit = i % base;
+    char character;
+    if (digit < 10) {
+      character = '0' + digit;
+    } else {
+      character = 'a' + (digit - 10);
+    }
+    *buffer++ = character;
+    i /= base;
+  } while (i > 0);
+  *buffer = 0;
+  ReverseStringInPlace(start, buffer);
+  return buffer;
+}
+
+// Populates the provided buffer with an ASCII representation of the number.
+char* FastInt32ToBufferLeft(int32_t i, char* buffer) {
+  uint32_t u = i;
+  if (i < 0) {
+    *buffer++ = '-';
+    u = -u;
+  }
+  return FastUInt32ToBufferLeft(u, buffer, 10);
+}
+
+// Converts a number to a string and appends it to another.
+char* StrCatInt32(char* main, int main_max_length, int32_t number) {
+  char number_string[kFastToBufferSize];
+  FastInt32ToBufferLeft(number, number_string);
+  return StrCatStr(main, main_max_length, number_string);
+}
+
+// Converts a number to a string and appends it to another.
+char* StrCatUInt32(char* main, int main_max_length, uint32_t number, int base) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(number, number_string, base);
+  return StrCatStr(main, main_max_length, number_string);
+}
+
+// Populates the provided buffer with ASCII representation of the float number.
+// Avoids the use of any floating point instructions (since these aren't
+// supported on many microcontrollers) and as a consequence prints values with
+// power-of-two exponents.
+char* FastFloatToBufferLeft(float f, char* buffer) {
+  char* current = buffer;
+  char* current_end = buffer + (kFastToBufferSize - 1);
+  // Access the bit fields of the floating point value to avoid requiring any
+  // float instructions. These constants are derived from IEEE 754.
+  const uint32_t sign_mask = 0x80000000;
+  const uint32_t exponent_mask = 0x7f800000;
+  const int32_t exponent_shift = 23;
+  const int32_t exponent_bias = 127;
+  const uint32_t fraction_mask = 0x007fffff;
+  const uint32_t u = *reinterpret_cast<uint32_t*>(&f);
+  const int32_t exponent =
+      ((u & exponent_mask) >> exponent_shift) - exponent_bias;
+  const uint32_t fraction = (u & fraction_mask);
+  // Expect ~0x2B1B9D3 for fraction.
+  if (u & sign_mask) {
+    *current = '-';
+    current += 1;
+  }
+  *current = 0;
+  // These are special cases for infinities and not-a-numbers.
+  if (exponent == 128) {
+    if (fraction == 0) {
+      current = StrCatStr(current, (current_end - current), "Inf");
+      return current;
+    } else {
+      current = StrCatStr(current, (current_end - current), "NaN");
+      return current;
+    }
+  }
+  // 0x007fffff (8388607) represents 0.99... for the fraction, so to print the
+  // correct decimal digits we need to scale our value before passing it to the
+  // conversion function. This scale should be 10000000/8388608 = 1.1920928955.
+  // We can approximate this using multiply-adds and right-shifts using the
+  // values in this array. The 1. portion of the number string is printed out
+  // in a fixed way before the fraction, below.
+  const int32_t scale_shifts_size = 13;
+  const int8_t scale_shifts[13] = {3,  4,  8,  11, 13, 14, 17,
+                                   18, 19, 20, 21, 22, 23};
+  uint32_t scaled_fraction = fraction;
+  for (int i = 0; i < scale_shifts_size; ++i) {
+    scaled_fraction += (fraction >> scale_shifts[i]);
+  }
+  *current = '1';
+  current += 1;
+  *current = '.';
+  current += 1;
+  *current = 0;
+  current = StrCatUInt32(current, (current_end - current), scaled_fraction, 10);
+  current = StrCatStr(current, (current_end - current), "*2^");
+  current = StrCatInt32(current, (current_end - current), exponent);
+  return current;
+}
+
+}  // namespace
+
+extern "C" void DebugLogInt32(int32_t i) {
+  char number_string[kFastToBufferSize];
+  FastInt32ToBufferLeft(i, number_string);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogUInt32(uint32_t i) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(i, number_string, 10);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogHex(uint32_t i) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(i, number_string, 16);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogFloat(float i) {
+  char number_string[kFastToBufferSize];
+  FastFloatToBufferLeft(i, number_string);
+  DebugLog(number_string);
+}
diff --git a/tensorflow/lite/experimental/micro/debug_log_numbers.h b/tensorflow/lite/experimental/micro/debug_log_numbers.h
new file mode 100644
index 0000000000000000000000000000000000000000..d889e751730495e2d1bf6232e7b9c2cbb76c9667
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log_numbers.h
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
+
+#include <cstdint>
+
+// Output numbers to the debug logging stream.
+extern "C" {
+void DebugLogInt32(int32_t i);
+void DebugLogUInt32(uint32_t i);
+void DebugLogHex(uint32_t i);
+void DebugLogFloat(float i);
+}
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore b/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d8dd7532abcc65af52e9db03c516274e3d674dc1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore
@@ -0,0 +1 @@
+*.wav
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 799b2e5a5dd097c6e017f574449d339992f7c41b..51ba2976a2eeca01a280fc8e9abb14289bf22adc 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -176,7 +176,6 @@ cc_library(
         ":audio_provider",
         ":model_settings",
         ":preprocessor_reference",
-        ":timer",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
@@ -191,7 +190,6 @@ tflite_micro_cc_test(
         ":audio_provider",
         ":feature_provider",
         ":model_settings",
-        ":timer",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -199,22 +197,30 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "timer",
+    name = "recognize_commands",
     srcs = [
-        "timer.cc",
+        "recognize_commands.cc",
     ],
     hdrs = [
-        "timer.h",
+        "recognize_commands.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "timer_test",
+    name = "recognize_commands_test",
     srcs = [
-        "timer_test.cc",
+        "recognize_commands_test.cc",
+    ],
+    tags = [
+        "no_oss",  # TODO(122853023): Resolve issues and re-enable.
     ],
     deps = [
-        ":timer",
+        ":recognize_commands",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -229,15 +235,12 @@ cc_binary(
     deps = [
         ":audio_provider",
         ":feature_provider",
-        ":features_test_data",
         ":model_settings",
-        ":preprocessor_reference",
-        ":timer",
+        ":recognize_commands",
         ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
-        "//tensorflow/lite/experimental/micro/kernels:micro_ops",
         "//tensorflow/lite/schema:schema_fbs",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3d560510ad140ff0bba84ebcf790a0fda90e72fa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
@@ -0,0 +1,43 @@
+# Settings for targets that use the CMSIS library.
+ifneq ($(filter CMSIS,$(ALL_TAGS)),)
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/
+
+  CMSIS_PREPROCESSOR_SRCS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc \
+
+  CMSIS_PREPROCESSOR_HDRS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h \
+    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.h
+
+  PREPROCESSOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  PREPROCESSOR_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  FEATURE_PROVIDER_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  FEATURE_PROVIDER_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  MICRO_SPEECH_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  MICRO_SPEECH_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  THIRD_PARTY_CC_SRCS += \
+    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
+    third_party/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
+    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
+    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
+    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
+    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
+
+  THIRD_PARTY_CC_HDRS += \
+    third_party/cmsis/CMSIS/DSP/Include/arm_common_tables.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h
+
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..65aec34a1f7991fad33a61a12eddd414577c666d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md
@@ -0,0 +1,23 @@
+# Description of files
+
+*   **create_constants.py**: Python file used to create hanning.cc, hanning.h,
+    sin_1k.cc, and sin_1k.h
+*   **hanning.cc**: Precomputed
+    [Hann window](https://en.wikipedia.org/wiki/Hann_function) for use in the
+    preprocessor. This file is created in ../create_constants.py
+*   **hanning.h**: Header file fro hanning.cc
+*   **preprocessor.cc**: CMSIS version of the preprocessor
+*   **sin_1k.cc**: A 1 kHZ sinusoid used for comparing the CMSIS preprocessor
+    with the Micro-Lite fixed_point preprocessor
+*   **sin_1k.h**: Header file for sin_1k.cc
+
+# Description of externally downloaded files in ../CMSIS_ext
+
+*   **arm_cmplx_mag_squared_q10p6.c**: Modified version of the ARM CMSIS
+    function
+    [arm_cmplx_mag_squared.c](http://arm-software.github.io/CMSIS_5/DSP/html/group__cmplx__mag__squared.html#ga45537f576102d960d467eb722b8431f2).
+    The modification is that we have changed the amount of right-shift to make
+    sure our data is in the correct range. We redistribute because the original
+    content was created with the Apache 2.0 license.
+*   **arm_cmplx_mag_squared_q10p6.h**: Header file for
+    arm_cmplx_mag_squared_q10p6.c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py
new file mode 100755
index 0000000000000000000000000000000000000000..daf7e3cde89a0380cbbcae6ddc88859c8e87ffb9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Outputs tables used for fast calculations at runtime."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# import soundfile as sf
+import numpy as np
+
+
+def to_cc(x, varname, directory='', scale_factor=1):
+  """Writes table values to a C++ source file."""
+  x = (x / np.max(np.abs(x))) * 32768 * scale_factor
+  x[x > 32767] = 32767
+  x[x < -32768] = -32768
+  x = x.astype(int)
+  x = [str(v) if i % 10 != 0 else '\n    ' + str(v) for i, v in enumerate(x)]
+
+  cmsis_path = 'tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS'
+  xstr = '#include "{}/{}.h"\n\n'.format(cmsis_path, varname)
+  xstr += 'const int g_{}_size = {};\n'.format(varname, len(x))
+  xstr += 'const int16_t g_{}[{}] = {{{}}};\n'.format(varname, len(x),
+                                                      ', '.join(x))
+
+  with open(directory + varname + '.cc', 'w') as f:
+    f.write(xstr)
+
+
+def to_h(_, varname, directory=''):
+  """Writes a header file for the table values."""
+  tf_prepend = 'TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_'
+  xstr = '#ifndef {}{}_H_\n'.format(tf_prepend, varname.upper())
+  xstr += '#define {}{}_H_\n\n'.format(tf_prepend, varname.upper())
+  xstr += '#include <cstdint>\n\n'
+  xstr += 'extern const int g_{}_size;\n'.format(varname)
+  xstr += 'extern const int16_t g_{}[];\n\n'.format(varname)
+  xstr += '#endif'
+
+  with open(directory + varname + '.h', 'w') as f:
+    f.write(xstr)
+
+
+# x = sf.read('yes_f2e59fea_nohash_1.wav')[0]
+# to_cc(x, 'yes_waveform')
+# to_h(x, 'yes_waveform')
+#
+# x = sf.read('no_f9643d42_nohash_4.wav')[0]
+# to_cc(x, 'no_waveform')
+# to_h(x, 'no_waveform')
+
+# 30ms of data @ 16 kHz = 480 samples
+hann = np.hanning(int(16000 * 0.03))  # Window 30ms of data
+to_cc(hann, 'hanning', directory='./')
+to_h(hann, 'hanning', directory='./')
+
+t = np.arange(16000. * 0.03) / 16000.
+sin1k = np.sin(
+    2 * np.pi * 1000 *
+    t)  # Factor of 10 because micro preprocessing overflows otherwise
+to_cc(sin1k, 'sin_1k', directory='./', scale_factor=0.1)
+to_h(sin1k, 'sin_1k', directory='./')
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6a11ce52c6b41a9f6fcbfc5a31bf7e0da8361cf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
+
+const int g_hanning_size = 480;
+const int16_t g_hanning[480] = {
+    0,     1,     5,     12,    22,    35,    50,    69,    90,    114,   140,
+    170,   202,   237,   275,   316,   359,   405,   454,   506,   560,   617,
+    677,   740,   805,   873,   943,   1016,  1092,  1171,  1252,  1336,  1422,
+    1511,  1602,  1696,  1793,  1892,  1993,  2097,  2204,  2312,  2424,  2537,
+    2653,  2772,  2893,  3016,  3141,  3269,  3399,  3531,  3665,  3802,  3941,
+    4082,  4225,  4370,  4517,  4666,  4817,  4971,  5126,  5283,  5442,  5603,
+    5765,  5930,  6096,  6265,  6435,  6606,  6779,  6954,  7131,  7309,  7489,
+    7670,  7853,  8037,  8223,  8410,  8598,  8788,  8979,  9171,  9365,  9560,
+    9756,  9953,  10151, 10350, 10551, 10752, 10954, 11157, 11362, 11567, 11772,
+    11979, 12186, 12395, 12603, 12813, 13023, 13233, 13445, 13656, 13868, 14081,
+    14294, 14507, 14721, 14935, 15149, 15363, 15578, 15793, 16008, 16222, 16437,
+    16652, 16867, 17082, 17297, 17511, 17725, 17939, 18153, 18367, 18580, 18793,
+    19005, 19217, 19428, 19639, 19850, 20059, 20269, 20477, 20685, 20892, 21098,
+    21303, 21508, 21712, 21914, 22116, 22317, 22517, 22716, 22913, 23110, 23305,
+    23499, 23692, 23884, 24075, 24264, 24451, 24638, 24823, 25006, 25188, 25369,
+    25548, 25725, 25901, 26075, 26247, 26418, 26587, 26754, 26920, 27083, 27245,
+    27405, 27563, 27719, 27874, 28026, 28176, 28324, 28470, 28614, 28756, 28896,
+    29034, 29169, 29303, 29434, 29563, 29689, 29813, 29935, 30055, 30172, 30287,
+    30400, 30510, 30617, 30723, 30825, 30926, 31023, 31119, 31211, 31301, 31389,
+    31474, 31556, 31636, 31713, 31788, 31860, 31929, 31996, 32059, 32121, 32179,
+    32235, 32288, 32338, 32386, 32430, 32472, 32512, 32548, 32582, 32613, 32641,
+    32666, 32689, 32708, 32725, 32739, 32751, 32759, 32765, 32767, 32767, 32765,
+    32759, 32751, 32739, 32725, 32708, 32689, 32666, 32641, 32613, 32582, 32548,
+    32512, 32472, 32430, 32386, 32338, 32288, 32235, 32179, 32121, 32059, 31996,
+    31929, 31860, 31788, 31713, 31636, 31556, 31474, 31389, 31301, 31211, 31119,
+    31023, 30926, 30825, 30723, 30617, 30510, 30400, 30287, 30172, 30055, 29935,
+    29813, 29689, 29563, 29434, 29303, 29169, 29034, 28896, 28756, 28614, 28470,
+    28324, 28176, 28026, 27874, 27719, 27563, 27405, 27245, 27083, 26920, 26754,
+    26587, 26418, 26247, 26075, 25901, 25725, 25548, 25369, 25188, 25006, 24823,
+    24638, 24451, 24264, 24075, 23884, 23692, 23499, 23305, 23110, 22913, 22716,
+    22517, 22317, 22116, 21914, 21712, 21508, 21303, 21098, 20892, 20685, 20477,
+    20269, 20059, 19850, 19639, 19428, 19217, 19005, 18793, 18580, 18367, 18153,
+    17939, 17725, 17511, 17297, 17082, 16867, 16652, 16437, 16222, 16008, 15793,
+    15578, 15363, 15149, 14935, 14721, 14507, 14294, 14081, 13868, 13656, 13445,
+    13233, 13023, 12813, 12603, 12395, 12186, 11979, 11772, 11567, 11362, 11157,
+    10954, 10752, 10551, 10350, 10151, 9953,  9756,  9560,  9365,  9171,  8979,
+    8788,  8598,  8410,  8223,  8037,  7853,  7670,  7489,  7309,  7131,  6954,
+    6779,  6606,  6435,  6265,  6096,  5930,  5765,  5603,  5442,  5283,  5126,
+    4971,  4817,  4666,  4517,  4370,  4225,  4082,  3941,  3802,  3665,  3531,
+    3399,  3269,  3141,  3016,  2893,  2772,  2653,  2537,  2424,  2312,  2204,
+    2097,  1993,  1892,  1793,  1696,  1602,  1511,  1422,  1336,  1252,  1171,
+    1092,  1016,  943,   873,   805,   740,   677,   617,   560,   506,   454,
+    405,   359,   316,   275,   237,   202,   170,   140,   114,   90,    69,
+    50,    35,    22,    12,    5,     1,     0};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
similarity index 54%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
index 162952844a832ebd0b0273d13a929fec6fa22892..e7d9c5c85866988469f96a444c503863bc2bef4c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
@@ -13,19 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_HANNING_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_HANNING_H_
 
 #include <cstdint>
 
-// Returns the time in milliseconds. There's no contract about what time zero
-// represents, the accuracy, or the granularity of the result. Subsequent calls
-// will generally not return a lower value, but even that's not guaranteed if
-// there's an overflow  wraparound.
-// The reference implementation of this function just returns a constantly
-// incrementing value for each call, since it would need a non-portable platform
-// call to access time information. For real applications, you'll need to write
-// your own platform-specific implementation.
-int32_t TimeInMilliseconds();
+extern const int g_hanning_size;
+extern const int16_t g_hanning[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c6978b5edef635af58873bf537a251fa4510ef4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+extern "C" {
+#define ARM_MATH_CM4
+#define IFFT_FLAG_R 0
+#define BIT_REVERSE_FLAG 1
+#define FFT_SIZE 512
+#define FFT_SIZE_DIV2 256
+#include <arm_math.h>
+#include "arm_cmplx_mag_squared_q10p6.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
+}
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+
+void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output);
+
+q15_t bufA[FFT_SIZE];
+q15_t bufB[FFT_SIZE];
+arm_rfft_instance_q15 S_arm_fft;
+arm_status arm_math_status;
+
+namespace {
+// These constants allow us to allocate fixed-sized arrays on the stack for our
+// working memory.
+constexpr int kInputSize = 512;
+constexpr int kAverageWindowSize = 6;
+constexpr int kOutputSize =
+    ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+}  // namespace
+
+TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                        const int16_t* input, int input_size, int output_size,
+                        uint8_t* output) {
+  if (input_size > kInputSize) {
+    error_reporter->Report("Input size %d larger than %d", input_size,
+                           kInputSize);
+    return kTfLiteError;
+  }
+  if (output_size != kOutputSize) {
+    error_reporter->Report("Requested output size %d doesn't match %d",
+                           output_size, kOutputSize);
+    return kTfLiteError;
+  }
+
+  // 30ms at 16 kHz = 480 samples
+  // We want to pad the rest of the 512-sample buffer with zeros
+  arm_mult_q15((q15_t*)input, g_hanning, bufB, 480);
+  int i;
+  for (i = 480; i < 512; i++) {
+    bufB[i] = 0;
+  }
+
+  // Should move init code outside of Preprocess() function
+  arm_math_status =
+      arm_rfft_init_q15(&S_arm_fft, FFT_SIZE, IFFT_FLAG_R, BIT_REVERSE_FLAG);
+  arm_rfft_q15(&S_arm_fft, bufB, bufA);
+
+  // The rfft function packs data as follows:
+  // {real[0], real[N/2], real[1], imag[1], ..., real[N/2-1], imag[N/2-1]}
+  // Below we pack as follows:
+  // {real[0], 0, real[1], imag[1], ..., real[N/2-1], imag[N/2-1, real[N/2], 0}
+  bufA[FFT_SIZE_DIV2] = bufA[1];
+  bufA[FFT_SIZE_DIV2 + 1] = 0;
+  bufA[1] = 0;
+  arm_cmplx_mag_squared_q10p6(bufA, bufB, FFT_SIZE_DIV2 + 1);
+
+  quantize(bufA, bufB, output);
+
+  return kTfLiteOk;
+}
+
+void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output) {
+  int i;
+  for (i = 0; i < 42; i++) {
+    arm_mean_q15(bufB + 6 * i, 6, bufA + i);
+  }
+  arm_mean_q15(bufB + 252, 5, bufA + 42);
+
+  for (i = 0; i < 43; i++) {
+    output[i] = (uint8_t)(bufA[i] >> 5);
+  }
+}
+
+TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
+                             const int16_t* input, uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45e9f798ef04cf40268cf379f24ecbfa904be9b5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h"
+
+const int g_sin_1k_size = 480;
+const int16_t g_sin_1k[480] = {
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,
+    3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317,
+    -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253,
+    -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,
+    3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253,
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,
+    3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317,
+    -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253,
+    -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,
+    3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253,
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h
new file mode 100644
index 0000000000000000000000000000000000000000..653a6f583013dc03d0601cfd97a85b15db2c6677
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIN_1K_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIN_1K_H_
+
+#include <cstdint>
+
+extern const int g_sin_1k_size;
+extern const int16_t g_sin_1k[];
+
+#endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
index 0e42329cade2e4b49b8000412c593f9a442af4ca..49aace3d7d05ba1d7010d3d834c66dc13e488c96 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
@@ -1,153 +1,106 @@
 
-# Tests loading and running a speech model.
 MICRO_SPEECH_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
-ALL_SRCS += $(MICRO_SPEECH_TEST_SRCS)
-MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
-MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-ALL_BINARIES += $(MICRO_SPEECH_TEST_BINARY)
-$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
-micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
-test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
-	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-# Source files that are used by multiple preprocessor tests.
-PREPROCESSOR_TEST_SHARED_SRCS := \
+
+MICRO_SPEECH_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
+
+PREPROCESSOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
 
-# Test the float reference code for feature generation.
-PREPROCESSOR_REFERENCE_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SHARED_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
-ALL_SRCS += $(PREPROCESSOR_REFERENCE_TEST_SRCS)
-PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
-PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
-ALL_BINARIES += $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
-test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-# Test the fixed point reference code for feature generation.
-PREPROCESSOR_FIXED_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SHARED_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
-ALL_SRCS += $(PREPROCESSOR_FIXED_TEST_SRCS)
-PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
-PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
-ALL_BINARIES += $(PREPROCESSOR_FIXED_TEST_BINARY)
-$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
-preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
-test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+PREPROCESSOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
 
-# Tests the audio provider module.
 AUDIO_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
-ALL_SRCS += $(AUDIO_PROVIDER_TEST_SRCS)
-AUDIO_PROVIDER_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(AUDIO_PROVIDER_TEST_SRCS))))
-AUDIO_PROVIDER_TEST_BINARY := $(BINDIR)audio_provider_test
-ALL_BINARIES += $(AUDIO_PROVIDER_TEST_BINARY)
-$(AUDIO_PROVIDER_TEST_BINARY): $(AUDIO_PROVIDER_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(AUDIO_PROVIDER_TEST_BINARY) $(AUDIO_PROVIDER_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-audio_provider_test: $(AUDIO_PROVIDER_TEST_BINARY)
-audio_provider_test_bin: $(AUDIO_PROVIDER_TEST_BINARY).bin
-test_audio_provider: $(AUDIO_PROVIDER_TEST_BINARY)
-	$(TEST_SCRIPT) $(AUDIO_PROVIDER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
 
-# Tests the feature provider module.
+AUDIO_PROVIDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+
 FEATURE_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
-ALL_SRCS += $(FEATURE_PROVIDER_TEST_SRCS)
-FEATURE_PROVIDER_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(FEATURE_PROVIDER_TEST_SRCS))))
-FEATURE_PROVIDER_TEST_BINARY := $(BINDIR)feature_provider_test
-ALL_BINARIES += $(FEATURE_PROVIDER_TEST_BINARY)
-$(FEATURE_PROVIDER_TEST_BINARY): $(FEATURE_PROVIDER_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(FEATURE_PROVIDER_TEST_BINARY) $(FEATURE_PROVIDER_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-feature_provider_test: $(FEATURE_PROVIDER_TEST_BINARY)
-feature_provider_test_bin: $(FEATURE_PROVIDER_TEST_BINARY).bin
-test_feature_provider: $(FEATURE_PROVIDER_TEST_BINARY)
-	$(TEST_SCRIPT) $(FEATURE_PROVIDER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-# Tests the timer module.
-TIMER_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
-ALL_SRCS += $(TIMER_TEST_SRCS)
-TIMER_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TIMER_TEST_SRCS))))
-TIMER_TEST_BINARY := $(BINDIR)timer_test
-ALL_BINARIES += $(TIMER_TEST_BINARY)
-$(TIMER_TEST_BINARY): $(TIMER_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(TIMER_TEST_BINARY) $(TIMER_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-timer_test: $(TIMER_TEST_BINARY)
-timer_test_bin: $(TIMER_TEST_BINARY).bin
-test_timer: $(TIMER_TEST_BINARY)
-	$(TEST_SCRIPT) $(TIMER_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
 
-# Builds a standalone speech command recognizer binary.
+FEATURE_PROVIDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+
+RECOGNIZE_COMMANDS_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+
+RECOGNIZE_COMMANDS_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+
 MICRO_SPEECH_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
-ALL_SRCS += $(MICRO_SPEECH_SRCS)
-MICRO_SPEECH_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_SRCS))))
-MICRO_SPEECH_BINARY := $(BINDIR)micro_speech
-ALL_BINARIES += $(MICRO_SPEECH_BINARY)
-$(MICRO_SPEECH_BINARY): $(MICRO_SPEECH_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_BINARY) $(MICRO_SPEECH_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-micro_speech: $(MICRO_SPEECH_BINARY)
-micro_speech_bin: $(MICRO_SPEECH_BINARY).bin
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+
+MICRO_SPEECH_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+
+# Find any platform-specific rules for this example.
+include $(wildcard tensorflow/lite/experimental/micro/examples/micro_speech/*/Makefile.inc)
+
+# Tests loading and running a speech model.
+$(eval $(call microlite_test,micro_speech_test,\
+$(MICRO_SPEECH_TEST_SRCS),$(MICRO_SPEECH_TEST_HDRS)))
+
+# Test the code for feature generation.
+$(eval $(call microlite_test,preprocessor_test,\
+$(PREPROCESSOR_TEST_SRCS), $(PREPROCESSOR_TEST_HDRS)))
+
+# Tests the audio provider module.
+$(eval $(call microlite_test,audio_provider_test,\
+$(AUDIO_PROVIDER_TEST_SRCS),$(AUDIO_PROVIDER_TEST_HDRS)))
+
+# Tests the feature provider module.
+$(eval $(call microlite_test,feature_provider_test,\
+$(FEATURE_PROVIDER_TEST_SRCS),$(FEATURE_PROVIDER_TEST_HDRS)))
+
+# Tests the feature provider module.
+$(eval $(call microlite_test,recognize_commands_test,\
+$(RECOGNIZE_COMMANDS_TEST_SRCS),$(RECOGNIZE_COMMANDS_TEST_HDRS)))
+
+# Builds a standalone speech command recognizer binary.
+$(eval $(call microlite_test,micro_speech,\
+$(MICRO_SPEECH_SRCS),$(MICRO_SPEECH_HDRS)))
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cb8d4d02c418e5d8c903c69729e8e1b3ee44a8bf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore
@@ -0,0 +1,4 @@
+captured_data.txt
+captured_data.wav
+cmsis_*.txt
+micro_*.txt
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..0aa362be0038f8757387a6311021e183dc19dabd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
@@ -0,0 +1,100 @@
+# Settings for apollo3 evb platforms.
+ifeq ($(TARGET), apollo3evb)
+
+  PUSHBUTTON_MICRO_SPEECH_TEST_SRCS := \
+    $(AP3_MICRO_DIR)/../preprocessor.cc \
+    $(AP3_MICRO_DIR)/pushbutton_main.c \
+    $(AP3_MICRO_DIR)/pushbutton_test.cc \
+    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
+    $(APOLLO3_SDK)/devices/am_devices_led.c
+  ALL_SRCS += $(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS)
+  PUSHBUTTON_MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS))))
+  PUSHBUTTON_MICRO_SPEECH_TEST_BINARY := $(BINDIR)pushbutton_micro_speech_test
+  $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY): $(PUSHBUTTON_MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY) $(PUSHBUTTON_MICRO_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  pushbutton_micro_speech_test: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY)
+  pushbutton_micro_speech_test_bin: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY).bin
+  test_pushbutton_micro_speech: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS := \
+    $(AP3_MICRO_DIR)/pushbutton_main.c \
+    $(AP3_MICRO_DIR)/pushbutton_test.cc \
+    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
+    $(CMSIS_DIR)/preprocessor.cc \
+    $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
+    $(CMSIS_DIR)/hanning.c \
+    $(APOLLO3_SDK)/devices/am_devices_led.c \
+    $(CMSIS_SRCS)
+  ALL_SRCS += $(PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS)
+  PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS))) \
+    arm_bitreversal2.o)
+  PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY := $(BINDIR)pushbutton_cmsis_speech_test
+  $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY): $(PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY) $(PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  pushbutton_cmsis_speech_test: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY)
+  pushbutton_cmsis_speech_test_bin: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY).bin
+  test_pushbutton_cmsis_speech: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_1K_SRCS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
+
+  PREPROCESSOR_1K_MICRO_TEST_SRCS := \
+    $(PREPROCESSOR_1K_SRCS) \
+    $(AP3_MICRO_DIR)/../fixed_point/preprocessor.cc \
+    $(AP3_EXT_MICRO_DIR)/system_apollo3.c \
+    $(AP3_MICRO_DIR)/_main.c
+  ALL_SRCS += $(PREPROCESSOR_1K_MICRO_TEST_SRCS)
+  PREPROCESSOR_1K_MICRO_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_1K_MICRO_TEST_SRCS))))
+  PREPROCESSOR_1K_MICRO_TEST_BINARY := $(BINDIR)preprocessor_1k_micro_test
+  $(PREPROCESSOR_1K_MICRO_TEST_BINARY): $(PREPROCESSOR_1K_MICRO_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_1K_MICRO_TEST_BINARY) $(PREPROCESSOR_1K_MICRO_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  preprocessor_1k_micro_test: $(PREPROCESSOR_1K_MICRO_TEST_BINARY)
+  preprocessor_1k_micro_test_bin: $(PREPROCESSOR_1K_MICRO_TEST_BINARY).bin
+  test_preprocessor_1k_micro: $(PREPROCESSOR_1K_MICRO_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_1K_MICRO_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_1K_CMSIS_TEST_SRCS := \
+    $(PREPROCESSOR_1K_SRCS) \
+    $(CMSIS_DIR)/preprocessor.cc \
+    $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
+    $(CMSIS_DIR)/hanning.c \
+    $(AP3_EXT_MICRO_DIR)/system_apollo3.c \
+    $(AP3_MICRO_DIR)/_main.c \
+    $(CMSIS_SRCS)
+  ALL_SRCS += $(PREPROCESSOR_1K_CMSIS_TEST_SRCS)
+  PREPROCESSOR_1K_CMSIS_TEST_BINARY := $(BINDIR)preprocessor_1k_cmsis_test
+  PREPROCESSOR_1K_CMSIS_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_1K_CMSIS_TEST_SRCS)))\
+    arm_bitreversal2.o)
+  $(PREPROCESSOR_1K_CMSIS_TEST_BINARY): $(PREPROCESSOR_1K_CMSIS_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_1K_CMSIS_TEST_BINARY) $(PREPROCESSOR_1K_CMSIS_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  preprocessor_1k_cmsis_test: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY)
+  preprocessor_1k_cmsis_test_bin: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY).bin
+  test_preprocessor_1k_cmsis: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_1K_CMSIS_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_TEST_SRCS += \
+    $(AP3_MICRO_DIR)/_main.c 
+
+  $(OBJDIR)arm_bitreversal2.o:
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.S -o $(OBJDIR)arm_bitreversal2.o
+
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10be9f136a9088d1ad098d685791ae357e8a9c22
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md
@@ -0,0 +1,129 @@
+# Description of Apollo3 Makefile targets
+
+*   **pushbutton_cmsis_speech_test_bin**:
+    *   When users press BTN2 on the Apollo3 EVK, 1 second of audio is captured.
+    *   Then the audio is sent to the CMSIS version of the preprocessor and into
+        the neural net
+    *   To print out the neural net's inference scores, run GDB and source
+        pushbutton\_cmsis\_scores.cmd
+    *   To save the captured audio to a text file (captured\_data.txt), run GDB
+        and source pushbutton\_cmsis\_voice.cmd
+    *   Setup python
+        *   sudo apt install python-pip
+        *   sudo apt install python-tk
+        *   pip install numpy
+        *   pip install matplotlib
+        *   pip install pysoundfile
+        *   python captured_data_to_wav.py
+    *   captured\_data.txt can be turned into a \*.wav file using
+        captured\_data\_to\_wav.py by executing "python
+        captured\_data\_to\_wav.py"
+*   **preprocessor_1k_cmsis_test_bin**:
+    *   Sends a 1 kHz sine wave to the CMSIS fixed\_point version of the
+        preprocessor
+    *   **This test should be compiled with the -O0 option.** Otherwise, the
+        breakpoints will not be reached
+        *   In
+            tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+            change "-O3" to "-O0" on line 47
+        *   **DO NOT FORGET TO REVERT CHANGE AFTER EXPERIMENT**
+        *   In future, enhance scripts to handle automatically, NOT manually!
+    *   Clean project by running "make -f
+        tensorflow/lite/experimental/micro/tools/make/Makefile clean"
+    *   Compile BIN by running "make -f
+        tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
+        preprocessor_1k_cmsis_test_bin"
+    *   Run with the preprocessor\_1k\_cmsis\_test.cmd GDB command file
+    *   Produces four text files corresponding to outputs from the CMSIS
+        fixed\_point version of this algorithm:
+        *   cmsis_windowed_input.txt: the sinusoid after multiplying elementwise
+            with a Hann window
+        *   cmsis_dft.txt: the DFT of the windowed sinusoid
+        *   cmsis_power.txt: the magnitude squared of the DFT
+        *   cmsis_power_avg.txt: the 6-bin average of the magnitude squared of
+            the DFT
+    *   Run both verisons of the 1KHz pre-processor test and then compare.
+        *   These files can be plotted with "python compare\_1k.py"
+    *   Also prints out the number of cycles the code took to execute (using the
+        DWT->CYCCNT register)
+*   **preprocessor_1k_micro_test_bin**
+    *   Sends a 1 kHz sine wave to the Micro-Lite fixed\_point version of the
+        preprocessor
+    *   **This test should be compiled with the -O0 option.** Otherwise, the
+        breakpoints will not be reached
+    *   Run with the preprocessor\_1k\_micro\_test.cmd GDB command file
+    *   Produces four text files corresponding to outputs from the Micro-Lite
+        version of this algorithm:
+        *   micro_windowed_input.txt: the sinusoid after multiplying elementwise
+            with a Hann window
+        *   micro_dft.txt: the DFT of the windowed sinusoid
+        *   micro_power.txt: the magnitude squared of the DFT
+        *   micro_power_avg.txt: the 6-bin average of the magnitude squared of
+            the DFT
+    *   Run both verisons of the 1KHz pre-processor test and then compare.
+        *   These files can be plotted with "python compare\_1k.py"
+    *   Also prints out the number of cycles the code took to execute (using the
+        DWT->CYCCNT register)
+
+# Description of files
+
+*   **.gitignore**: Git should ignore \*.txt and \*.wav files that result from
+    experiments run in this directory
+*   **captured\_data\_to\_wav.py**: Python script that parses a text file
+    containing data dumped from GDB (specifically the verilog format) and
+    creates a \*.wav file using
+    [PySoundFile](https://pysoundfile.readthedocs.io/en/0.9.0/).
+*   **compare\_1k.py**: This script compares the intermediate variables and
+    final outputs of the micro-lite fixed-point preprocessor function and the
+    CMSIS version of this function. The stimulus provided to each preprocessor
+    is the same: a 1 kHz sinusoid.
+*   **get\_yesno\_data.cmd**: A GDB command file that runs preprocessor_test
+    (where TARGET=apollo3evb) and dumps the calculated data for the "yes" and
+    "no" input wavfeorms to text files
+*   **\_main.c**: Point of entry for the micro_speech test
+*   **preprocessor_1k.cc**: A version of preprocessor.cc where a 1 kHz sinusoid
+    is provided as input to the preprocessor
+*   **preprocessor_1k_cmsis_test.cmd**: GDB command file for the CMSIS
+    preprocessor 1 kHz test
+*   **preprocessor_1k_micro_test.cmd**: GDB command file for the Micro-Lite
+    preprocessor 1 kHz test
+*   **preprocessor_test.cmd**: GDB command file for the preprocessor test
+*   **pushbutton_cmsis_scores.cmd**: GDB command file that runs
+    pushbutton_cmsis_speech_test_bin. It adds a breakpoint immediately after the
+    scores are reported and prints out each score. Then it continues code
+    execution.
+*   **pushbutton_cmsis_voice.cmd**: GDB command file that runs
+    pushbutton_cmsis_speech_test_bin. Dumps the recorded 1 second of audio to
+    captured_data.txt, which can then be processed by the python file
+    captured_data_to_wav.py.
+*   **pushbutton_main.c**: Source file containing program point of entry
+    \_main() for the pushbutton\_\* tests. Contains Interrupt Service Routines
+    for PDM data capture and pushbuttons. Calls the main() function of
+    pushbutton_test.cc
+*   **pushbutton_test.cc**: Source file containing main() function for the
+    pushbutton\_\* tests. main() calls the preprocessor function and the neural
+    net inference function.
+
+# Description of externally downloaded files in ../apollo3_ext
+
+*   **apollo3.h**: Apollo 3 version of the
+    [CMSIS Device Header File (device.h)](https://www.keil.com/pack/doc/CMSIS/Core/html/device_h_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+*   **system_apollo3.c**: Apollo 3 version of the
+    [CMSIS System Configuration File system\_\<device\>.c](https://www.keil.com/pack/doc/CMSIS/Core/html/system_c_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+*   **system_apollo3.h**: Apollo 3 version of the
+    [CMSIS System Configuration File system\_\<device\>.h](https://www.keil.com/pack/doc/CMSIS/Core/html/system_c_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+
+# FFT scaling
+
+See https://github.com/ARM-software/CMSIS_5/issues/220
+
+> And as @xizhizhang pointed, I think there may be an error on the internal
+> downscaling, or at least on the documentation. It looks like during the fft
+> computation, the downscaling factor reach 2**-9 for a 512 rfft operation,
+> being the output in Q10.22, instead the documented 2**-8 and Q9.23.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..b49d5c50ffc936fd34115cc9150829b47a1e3ab5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c
@@ -0,0 +1,117 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include "am_bsp.h"
+#include "am_mcu_apollo.h"  // Defines AM_CMSIS_REGS
+#include "am_util.h"
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf("%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf("0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf("%f", i); }
+
+int _main(void) {
+  am_util_id_t sIdDevice;
+  uint32_t ui32StrBuf;
+
+  //
+  // Set the clock frequency.
+  //
+  am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+  //
+  // Set the default cache configuration
+  //
+  am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+  am_hal_cachectrl_enable();
+
+  //
+  // Configure the board for low power operation.
+  //
+  am_bsp_low_power_init();
+
+  //
+  // Initialize the printf interface for UART output
+  //
+  am_bsp_uart_printf_enable();
+
+  //
+  // Print the banner.
+  //
+  am_util_stdio_terminal_clear();
+  am_util_stdio_printf("Hello World!\n\n");
+
+  //
+  // Print the device info.
+  //
+  am_util_id_device(&sIdDevice);
+  am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
+  am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
+
+  am_util_stdio_printf("Qualified: %s\n",
+                       sIdDevice.sMcuCtrlDevice.ui32Qualified ? "Yes" : "No");
+
+  am_util_stdio_printf(
+      "Device Info:\n"
+      "\tPart number: 0x%08X\n"
+      "\tChip ID0:    0x%08X\n"
+      "\tChip ID1:    0x%08X\n"
+      "\tRevision:    0x%08X (Rev%c%c)\n",
+      sIdDevice.sMcuCtrlDevice.ui32ChipPN, sIdDevice.sMcuCtrlDevice.ui32ChipID0,
+      sIdDevice.sMcuCtrlDevice.ui32ChipID1,
+      sIdDevice.sMcuCtrlDevice.ui32ChipRev, sIdDevice.ui8ChipRevMaj,
+      sIdDevice.ui8ChipRevMin);
+
+  //
+  // If not a multiple of 1024 bytes, append a plus sign to the KB.
+  //
+  ui32StrBuf = (sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024) ? '+' : 0;
+  am_util_stdio_printf(
+      "\tFlash size:  %7d (%d KB%s)\n", sIdDevice.sMcuCtrlDevice.ui32FlashSize,
+      sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024, &ui32StrBuf);
+
+  ui32StrBuf = (sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024) ? '+' : 0;
+  am_util_stdio_printf(
+      "\tSRAM size:   %7d (%d KB%s)\n\n", sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
+      sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024, &ui32StrBuf);
+
+  //
+  // Print the compiler version.
+  //
+  am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
+#ifdef AM_PART_APOLLO3
+  am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
+  am_util_stdio_printf("HAL SDK version: %d.%d.%d\n", g_ui32HALversion.s.Major,
+                       g_ui32HALversion.s.Minor, g_ui32HALversion.s.Revision);
+  am_util_stdio_printf("HAL compiled with %s-style registers\n",
+                       g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
+
+  am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice,
+                       &ui32StrBuf);
+  am_hal_security_info_t secInfo;
+  char sINFO[32];
+  uint32_t ui32Status;
+#endif  // AM_PART_APOLLO3
+  main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
new file mode 100644
index 0000000000000000000000000000000000000000..10a05b6dcf1bbd5c779f7ee7bdf4d01ebde76017
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
@@ -0,0 +1,46 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts values pulled from the microcontroller into audio files."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import struct
+# import matplotlib.pyplot as plt
+import numpy as np
+import soundfile as sf
+
+
+def new_data_to_array(fn):
+  vals = []
+  with open(fn) as f:
+    for n, line in enumerate(f):
+      if n is not 0:
+        vals.extend([int(v, 16) for v in line.split()])
+  b = ''.join(map(chr, vals))
+  y = struct.unpack('<' + 'h' * int(len(b) / 2), b)
+
+  return y
+
+
+data = 'captured_data.txt'
+values = np.array(new_data_to_array(data)).astype(float)
+
+# plt.plot(values, 'o-')
+# plt.show(block=False)
+
+wav = values / np.max(np.abs(values))
+sf.write('captured_data.wav', wav, 16000)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..52352bad94a1e5627a9ca35d07a5082b6d79e6a6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
@@ -0,0 +1,167 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Debugging script for checking calculation values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import struct
+import matplotlib.pyplot as plt
+import numpy as np
+
+# import soundfile as sf
+
+
+def new_data_to_array(fn, datatype='int16'):
+  """Converts file information to an in-memory array."""
+  vals = []
+  with open(fn) as f:
+    for n, line in enumerate(f):
+      if n is not 0:
+        vals.extend([int(v, 16) for v in line.split()])
+  b = ''.join(map(chr, vals))
+
+  if datatype == 'int8':
+    typestr = 'b'
+    arraylen = int(len(b))
+  elif datatype == 'int16':
+    typestr = 'h'
+    arraylen = int(len(b) // 2)
+  elif datatype == 'int32':
+    typestr = 'i'
+    arraylen = int(len(b) // 4)
+  if datatype == 'uint8':
+    typestr = 'B'
+    arraylen = int(len(b))
+  elif datatype == 'uint16':
+    typestr = 'H'
+    arraylen = int(len(b) // 2)
+  elif datatype == 'uint32':
+    typestr = 'I'
+    arraylen = int(len(b) // 4)
+
+  y = np.array(struct.unpack('<' + typestr * arraylen, b))
+
+  return y
+
+
+# x is the fixed-point input in Qm.n format
+def to_float(x, n):
+  return x.astype(float) * 2**(-n)
+
+
+micro_windowed_input = new_data_to_array(
+    'micro_windowed_input.txt', datatype='int32')
+cmsis_windowed_input = new_data_to_array(
+    'cmsis_windowed_input.txt', datatype='int16')
+
+micro_dft = new_data_to_array('micro_dft.txt', datatype='int32')
+cmsis_dft = new_data_to_array('cmsis_dft.txt', datatype='int16')
+py_dft = np.fft.rfft(to_float(cmsis_windowed_input, 15), n=512)
+py_result = np.empty((2 * py_dft.size), dtype=np.float)
+py_result[0::2] = np.real(py_dft)
+py_result[1::2] = np.imag(py_dft)
+
+micro_power = new_data_to_array('micro_power.txt', datatype='int32')
+cmsis_power = new_data_to_array('cmsis_power.txt', datatype='int16')
+py_power = np.square(np.abs(py_dft))
+
+micro_power_avg = new_data_to_array('micro_power_avg.txt', datatype='uint8')
+cmsis_power_avg = new_data_to_array('cmsis_power_avg.txt', datatype='uint8')
+
+plt.figure(1)
+plt.subplot(311)
+plt.plot(micro_windowed_input, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_windowed_input, label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_windowed_input, 30), label='Micro to float')
+plt.plot(to_float(cmsis_windowed_input, 15), label='CMSIS to float')
+plt.legend()
+
+plt.figure(2)
+plt.subplot(311)
+plt.plot(micro_dft, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_dft, label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_dft, 22), label='Micro to float')
+# CMSIS result has 6 fractionanl bits (not 7) due to documentation error (see
+# README.md)
+plt.plot(to_float(cmsis_dft, 6), label='CMSIS to float')
+plt.plot(py_result, label='Python result')
+plt.legend()
+
+plt.figure(3)
+plt.subplot(311)
+plt.plot(micro_power, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_power[0:256], label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_power, 22), label='Micro to float')
+plt.plot(to_float(cmsis_power[0:256], 6), label='CMSIS to float')
+plt.plot(py_power, label='Python result')
+plt.legend()
+
+plt.figure(4)
+plt.plot(micro_power_avg, label='Micro fixed')
+plt.plot(cmsis_power_avg, label='CMSIS fixed')
+plt.legend()
+plt.show()
+
+# t = np.arange(16000.*0.03)/16000.
+# # Factor of 10 because micro preprocessing overflows otherwise
+# sin1k = 0.1*np.sin(2*np.pi*1000*t)
+#
+# plt.figure(1)
+# plt.subplot(511)
+# plt.plot(sin1k)
+# plt.title('Input sine')
+#
+# plt.subplot(512)
+# plt.plot(to_float(micro_windowed_input, 30), label='Micro-Lite')
+# plt.plot(to_float(cmsis_windowed_input, 15), label='CMSIS')
+# plt.title('Windowed sine')
+# plt.legend(loc='center right')
+#
+# plt.subplot(513)
+# plt.plot(to_float(micro_dft, 22), label='Micro-Lite')
+# plt.plot(to_float(cmsis_dft, 6), label='CMSIS')
+# plt.title('FFT')
+# plt.legend(loc='center')
+#
+# plt.subplot(514)
+# plt.plot(to_float(micro_power, 22), label='Micro-Lite')
+# plt.plot(to_float(cmsis_power[0:256], 6), label='CMSIS')
+# plt.title('|FFT|^2')
+# plt.legend(loc='center right')
+#
+# plt.subplot(515)
+# plt.plot(micro_power_avg, label='Micro-Lite')
+# plt.plot(cmsis_power_avg, label='CMSIS')
+# plt.title('Averaged |FFT|^2')
+# plt.legend(loc='center right')
+#
+# plt.tight_layout(pad=0, w_pad=0.2, h_pad=0.2)
+#
+# plt.show()
+#
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc
new file mode 100644
index 0000000000000000000000000000000000000000..007772e77a53b43607be90e6b8b9243b00c79546
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file preprocessor.cc
+ */
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+extern "C" {
+#include "apollo3.h"
+#include "system_apollo3.h"
+}
+
+#define output_data_size 43
+int count;
+
+extern TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                               const int16_t* input, int input_size,
+                               int output_size, uint8_t* output);
+
+TF_LITE_MICRO_TESTS_BEGIN
+CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+// DWT->LAR = 0xC5ACCE55;
+DWT->CYCCNT = 0;
+DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+
+TF_LITE_MICRO_TEST(TestPreprocessor) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t calculated_data[output_data_size];
+  TfLiteStatus yes_status = Preprocess(error_reporter, g_sin_1k, g_sin_1k_size,
+                                       output_data_size, calculated_data);
+  count = DWT->CYCCNT;
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..6988057f37fc8ecfa89bf8e4d87b665be540cb2e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd
@@ -0,0 +1,37 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Needs to be compiled with -O0
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_cmsis_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_cmsis_test
+monitor reset
+break preprocessor.cc:68
+commands
+dump verilog value cmsis_windowed_input.txt bufB
+c
+end
+break preprocessor.cc:76
+commands
+dump verilog value cmsis_dft.txt bufA
+c
+end
+break preprocessor.cc:81
+commands
+dump verilog value cmsis_power.txt bufB
+c
+end
+break preprocessor.cc:83
+commands
+dump verilog memory cmsis_power_avg.txt output output+42
+c
+end
+break preprocessor_1k.cc:50
+commands
+print count
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..dc9cd4f0a41b20a50d487da8c68fa93b35439e38
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd
@@ -0,0 +1,25 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Needs to be run when compiled with -O0
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_micro_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_micro_test
+monitor reset
+break preprocessor.cc:211
+commands
+dump verilog value micro_windowed_input.txt fixed_input
+dump verilog value micro_dft.txt fourier_values
+dump verilog value micro_power.txt power_spectrum
+dump verilog memory micro_power_avg.txt output output+42
+c
+end
+break preprocessor_1k.cc:50
+commands
+print count
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..bd2048e80ae3dffc5b6650d730c96b617a1379f9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd
@@ -0,0 +1,11 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../gen/apollo3evb_cortex-m4/bin/preprocessor_test
+target remote localhost:2331
+load ../../gen/apollo3evb_cortex-m4/bin/preprocessor_test
+monitor reset
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..ace278ff9a2e20f51590dd9fd5d66b84e65c023b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd
@@ -0,0 +1,26 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+monitor reset
+break pushbutton_main.c:307
+commands
+printf "Silence score: %d\n", g_silence_score
+printf "Unknown score: %d\n", g_unknown_score
+printf "Yes score: %d\n", g_yes_score
+printf "No score: %d\n", g_no_score
+printf "g_scores[0]: %d\n", g_scores[0]
+printf "g_scores[1]: %d\n", g_scores[1]
+printf "g_scores[2]: %d\n", g_scores[2]
+printf "g_scores[3]: %d\n", g_scores[3]
+printf "max_score: %d\n", max_score
+printf "max_score_index: %d\n", max_score_index
+c
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..5dea48e62aba123b54a19c02847236cf28fc2a38
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+monitor reset
+break pushbutton_main.c:296
+commands
+dump verilog value captured_data.txt captured_data
+c
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..afee38343b3fac81de945dcd01b53ad35e8be270
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
@@ -0,0 +1,322 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file _main.c */
+
+#include <stdint.h>
+#include "am_bsp.h"
+#include "am_mcu_apollo.h"  // Defines AM_CMSIS_REGS
+#include "am_util.h"
+
+#define ARM_MATH_CM4
+#include <arm_math.h>
+
+//*****************************************************************************
+// Parameters
+//
+// Total number of bytes transferred = 320*50*2 = 32000
+//*****************************************************************************
+
+#define FRAME_SIZE 320  // Capture one 320-sample (20-ms) frame at a time
+#define NUM_FRAMES 50   // Number of frames in 1 second
+
+//*****************************************************************************
+// GLOBALS
+//*****************************************************************************
+
+volatile int16_t g_numFramesCaptured = 0;
+volatile bool g_bPDMDataReady = false;
+int16_t
+    captured_data[FRAME_SIZE * NUM_FRAMES];  // Location of 1-second data buffer
+extern uint8_t g_silence_score;
+extern uint8_t g_unknown_score;
+extern uint8_t g_yes_score;
+extern uint8_t g_no_score;
+q7_t g_scores[4] = {0};
+
+//*****************************************************************************
+// The entry point for the application.
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf("%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf("0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf("%f", i); }
+
+//*****************************************************************************
+// PDM configuration information.
+//*****************************************************************************
+void* PDMHandle;
+
+am_hal_pdm_config_t g_sPdmConfig = {
+    .eClkDivider = AM_HAL_PDM_MCLKDIV_1,
+    .eLeftGain = AM_HAL_PDM_GAIN_P225DB,
+    .eRightGain = AM_HAL_PDM_GAIN_P225DB,
+    .ui32DecimationRate =
+        48,  // OSR = 1500/16 = 96 = 2*SINCRATE --> SINC_RATE = 48
+    .bHighPassEnable = 0,
+    .ui32HighPassCutoff = 0xB,
+    .ePDMClkSpeed = AM_HAL_PDM_CLK_1_5MHZ,
+    .bInvertI2SBCLK = 0,
+    .ePDMClkSource = AM_HAL_PDM_INTERNAL_CLK,
+    .bPDMSampleDelay = 0,
+    .bDataPacking = 1,
+    .ePCMChannels = AM_HAL_PDM_CHANNEL_RIGHT,
+    .bLRSwap = 0,
+};
+
+//*****************************************************************************
+// BUTTON0 pin configuration settings.
+//*****************************************************************************
+const am_hal_gpio_pincfg_t g_deepsleep_button0 = {
+    .uFuncSel = 3,
+    .eIntDir = AM_HAL_GPIO_PIN_INTDIR_LO2HI,
+    .eGPInput = AM_HAL_GPIO_PIN_INPUT_ENABLE,
+};
+
+//*****************************************************************************
+// PDM initialization.
+//*****************************************************************************
+void pdm_init(void) {
+  //
+  // Initialize, power-up, and configure the PDM.
+  //
+  am_hal_pdm_initialize(0, &PDMHandle);
+  am_hal_pdm_power_control(PDMHandle, AM_HAL_PDM_POWER_ON, false);
+  am_hal_pdm_configure(PDMHandle, &g_sPdmConfig);
+  am_hal_pdm_enable(PDMHandle);
+
+  //
+  // Configure the necessary pins.
+  //
+  am_hal_gpio_pincfg_t sPinCfg = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // ARPIT 181019
+  // sPinCfg.uFuncSel = AM_HAL_PIN_10_PDMCLK;
+  // am_hal_gpio_pinconfig(10, sPinCfg);
+  sPinCfg.uFuncSel = AM_HAL_PIN_12_PDMCLK;
+  am_hal_gpio_pinconfig(12, sPinCfg);
+
+  sPinCfg.uFuncSel = AM_HAL_PIN_11_PDMDATA;
+  am_hal_gpio_pinconfig(11, sPinCfg);
+
+  // am_hal_gpio_state_write(14, AM_HAL_GPIO_OUTPUT_CLEAR);
+  // am_hal_gpio_pinconfig(14, g_AM_HAL_GPIO_OUTPUT);
+
+  //
+  // Configure and enable PDM interrupts (set up to trigger on DMA
+  // completion).
+  //
+  am_hal_pdm_interrupt_enable(PDMHandle,
+                              (AM_HAL_PDM_INT_DERR | AM_HAL_PDM_INT_DCMP |
+                               AM_HAL_PDM_INT_UNDFL | AM_HAL_PDM_INT_OVF));
+
+#if AM_CMSIS_REGS
+  NVIC_EnableIRQ(PDM_IRQn);
+#else
+  am_hal_interrupt_enable(AM_HAL_INTERRUPT_PDM);
+#endif
+}
+
+//*****************************************************************************
+//
+// Start a transaction to get some number of bytes from the PDM interface.
+//
+//*****************************************************************************
+void pdm_data_get(void) {
+  //
+  // Configure DMA and target address.
+  //
+  am_hal_pdm_transfer_t sTransfer;
+  sTransfer.ui32TargetAddr =
+      (uint32_t)(&captured_data[FRAME_SIZE * g_numFramesCaptured]);
+  sTransfer.ui32TotalCount = 2 * FRAME_SIZE;  // Each sample is 2 bytes
+
+  //
+  // Start the data transfer.
+  //
+  am_hal_pdm_dma_start(PDMHandle, &sTransfer);
+}
+
+//*****************************************************************************
+//
+// PDM interrupt handler.
+//
+//*****************************************************************************
+void am_pdm_isr(void) {
+  uint32_t ui32Status;
+  //
+  // Read the interrupt status.
+  //
+  am_hal_pdm_interrupt_status_get(PDMHandle, &ui32Status, true);
+  am_hal_pdm_interrupt_clear(PDMHandle, ui32Status);
+
+  //
+  // Once our DMA transaction completes, send a flag to the main routine
+  //
+  if (ui32Status & AM_HAL_PDM_INT_DCMP) g_bPDMDataReady = true;
+}
+
+//*****************************************************************************
+// GPIO ISR
+// Will enable the PDM, set number of frames transferred to 0, and turn on LED
+//*****************************************************************************
+void am_gpio_isr(void) {
+  //
+  // Delay for debounce.
+  //
+  am_util_delay_ms(200);
+
+  //
+  // Clear the GPIO Interrupt (write to clear).
+  //
+  am_hal_gpio_interrupt_clear(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  // Start audio transfer
+  am_hal_pdm_fifo_flush(PDMHandle);
+  pdm_data_get();
+  am_hal_pdm_enable(PDMHandle);
+
+  //
+  // Turn on LED 0
+  //
+  am_devices_led_on(am_bsp_psLEDs, 0);
+}
+
+int _main(void) {
+  am_util_id_t sIdDevice;
+  uint32_t ui32StrBuf;
+
+  //
+  // Set the clock frequency.
+  //
+  am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+  //
+  // Set the default cache configuration
+  //
+  am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+  am_hal_cachectrl_enable();
+
+  //
+  // Configure the board for low power operation.
+  //
+  am_bsp_low_power_init();
+
+#if defined(AM_BSP_NUM_BUTTONS) && defined(AM_BSP_NUM_LEDS)
+  //
+  // Configure the button pin.
+  //
+  am_hal_gpio_pinconfig(AM_BSP_GPIO_BUTTON0, g_deepsleep_button0);
+
+  //
+  // Clear the GPIO Interrupt (write to clear).
+  //
+  am_hal_gpio_interrupt_clear(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  //
+  // Enable the GPIO/button interrupt.
+  //
+  am_hal_gpio_interrupt_enable(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  //
+  // Configure the LEDs.
+  //
+  am_devices_led_array_init(am_bsp_psLEDs, AM_BSP_NUM_LEDS);
+
+  //
+  // Turn the LEDs off
+  //
+  for (int ix = 0; ix < AM_BSP_NUM_LEDS; ix++) {
+    am_devices_led_off(am_bsp_psLEDs, ix);
+  }
+
+//    am_devices_led_on(am_bsp_psLEDs, 1);
+#endif  // defined(AM_BSP_NUM_BUTTONS)  &&  defined(AM_BSP_NUM_LEDS)
+
+#if AM_CMSIS_REGS
+  NVIC_EnableIRQ(GPIO_IRQn);
+#else   // AM_CMSIS_REGS
+  am_hal_interrupt_enable(AM_HAL_INTERRUPT_GPIO);
+#endif  // AM_CMSIS_REGS
+
+  //
+  // Enable interrupts to the core.
+  //
+  am_hal_interrupt_master_enable();
+
+  // Turn on PDM
+  pdm_init();
+
+  //
+  // Initialize the printf interface for UART output
+  //
+  am_bsp_uart_printf_enable();
+
+  //
+  // Print the banner.
+  //
+  am_util_stdio_terminal_clear();
+  am_util_stdio_printf("Starting streaming test\n\n");
+
+  // Score variables
+  q7_t max_score = 0;
+  uint32_t max_score_index = 0;
+
+  while (1) {
+    am_hal_interrupt_master_disable();
+
+    if (g_bPDMDataReady) {
+      g_bPDMDataReady = false;
+      g_numFramesCaptured++;
+
+      if (g_numFramesCaptured < NUM_FRAMES) {
+        pdm_data_get();  // Start converting the next set of PCM samples.
+      }
+
+      else {
+        g_numFramesCaptured = 0;
+        // am_hal_pdm_disable(PDMHandle);
+        am_devices_led_off(am_bsp_psLEDs, 0);
+
+        main(0, NULL);
+
+        g_scores[0] = (q7_t)g_silence_score - 128;
+        g_scores[1] = (q7_t)g_unknown_score - 128;
+        g_scores[2] = (q7_t)g_yes_score - 128;
+        g_scores[3] = (q7_t)g_no_score - 128;
+
+        am_devices_led_off(
+            am_bsp_psLEDs,
+            max_score_index + 1);  // Turn off LED for previous max score
+        arm_max_q7(g_scores, 4, &max_score, &max_score_index);
+        am_devices_led_on(
+            am_bsp_psLEDs,
+            max_score_index + 1);  // Turn on LED for new max score
+      }
+    }
+
+    //
+    // Go to Deep Sleep.
+    //
+    am_hal_sysctrl_sleep(AM_HAL_SYSCTRL_SLEEP_DEEP);
+
+    am_hal_interrupt_master_enable();
+  }
+
+  // main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95043f857b34b953c91a762bc1a54e9489431bff
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file
+ * micro_speech_test.cc */
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+extern int16_t captured_data[16000];
+uint8_t g_silence_score = 0;
+uint8_t g_unknown_score = 0;
+uint8_t g_yes_score = 0;
+uint8_t g_no_score = 0;
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestPreprocessor) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t preprocessed_data[43 * 49];
+  TfLiteStatus preprocess_1sec_status =
+      Preprocess_1sec(error_reporter, captured_data, preprocessed_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, preprocess_1sec_status);
+
+  // Map the model into a usable data structure. This doesn't involve any
+  // copying or parsing, it's a very lightweight operation.
+  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+  }
+
+  // This pulls in all the operation implementations we need.
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  // Create an area of memory to use for input, output, and intermediate arrays.
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  // Build an interpreter to run the model with.
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+
+  // Get information about the memory area to use for the model's input.
+  TfLiteTensor* input = interpreter.input(0);
+
+  // Make sure the input has the properties we expect.
+  TF_LITE_MICRO_EXPECT_NE(nullptr, input);
+  TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
+
+  // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
+  // into the memory area used for the input.
+  for (int i = 0; i < input->bytes; ++i) {
+    input->data.uint8[i] = preprocessed_data[i];
+  }
+
+  // Run the model on this input and make sure it succeeds.
+  TfLiteStatus invoke_status = interpreter.Invoke();
+  if (invoke_status != kTfLiteOk) {
+    error_reporter->Report("Invoke failed\n");
+  }
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
+
+  // Get the output from the model, and make sure it's the expected size and
+  // type.
+  TfLiteTensor* output = interpreter.output(0);
+  TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type);
+
+  // There are four possible classes in the output, each with a score.
+  const int kSilenceIndex = 0;
+  const int kUnknownIndex = 1;
+  const int kYesIndex = 2;
+  const int kNoIndex = 3;
+
+  // Make sure that the expected "Yes" score is higher than the other classes.
+  g_silence_score = output->data.uint8[kSilenceIndex];
+  g_unknown_score = output->data.uint8[kUnknownIndex];
+  g_yes_score = output->data.uint8[kYesIndex];
+  g_no_score = output->data.uint8[kNoIndex];
+
+  error_reporter->Report("Ran successfully\n");
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
index c0365d56901b503628b323a2fe09a4fa0de9165e..52db18e6868371afc0b7cd39f6f41d0d60b91689 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 namespace {
 int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
 }  // namespace
 
 TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
@@ -31,3 +32,8 @@ TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
   *audio_samples = g_dummy_audio_data;
   return kTfLiteOk;
 }
+
+int32_t LatestAudioTimestamp() {
+  g_latest_audio_timestamp += 100;
+  return g_latest_audio_timestamp;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
index 7e2442a5e83ee1f809f82587c816adb01dc09e5e..b69067364198d7285d3f2bfc34208168effacb35 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
@@ -33,4 +33,14 @@ TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
                              int start_ms, int duration_ms,
                              int* audio_samples_size, int16_t** audio_samples);
 
+// Returns the time that audio data was last captured in milliseconds. There's
+// no contract about what time zero represents, the accuracy, or the granularity
+// of the result. Subsequent calls will generally not return a lower value, but
+// even that's not guaranteed if there's an overflow  wraparound.
+// The reference implementation of this function just returns a constantly
+// incrementing value for each call, since it would need a non-portable platform
+// call to access time information. For real applications, you'll need to write
+// your own platform-specific implementation.
+int32_t LatestAudioTimestamp();
+
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
index 5f7c7605f0feb3fd3179a0edd5e51574b867ce68..85fbbb80a6c5b330230c1d1d0186de795edc4754 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -41,4 +44,27 @@ TF_LITE_MICRO_TEST(TestAudioProvider) {
   }
 }
 
+TF_LITE_MICRO_TEST(TestTimer) {
+  // Make sure that the technically-undefined overflow behavior we rely on below
+  // works on this platform. It's still not guaranteed, but at least this is a
+  // sanity check.  Turn off when running with ASan, as it will complain about
+  // the following undefined behavior.
+#ifndef ADDRESS_SANITIZER
+  int32_t overflow_value = std::numeric_limits<int32_t>::max();
+  overflow_value += 1;
+  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
+#endif
+
+  const int32_t first_time = LatestAudioTimestamp();
+  const int32_t second_time = LatestAudioTimestamp();
+
+  // It's possible that the timer may have wrapped around from +BIG_NUM to
+  // -BIG_NUM between the first and second calls, since we're storing
+  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
+  // would have taken more than 2^31 milliseconds though, so look at the
+  // difference and rely on integer overflow to ensure it's accurate.
+  const int32_t time_delta = (second_time - first_time);
+  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
+}
+
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..5585ed7269b71d279f1dd22cb9dd04120e7dd37f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc
@@ -0,0 +1,7 @@
+# Settings for the Discovery STM32F746NG board.
+ifneq ($(filter disco_f746ng,$(ALL_TAGS)),)
+  MBED_PROJECT_FILES += \
+    AUDIO_DISCO_F746NG.lib \
+    BSP_DISCO_F746NG.lib \
+    SDRAM_DISCO_F746NG.lib
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06647d0c536564c26d72cb73396ca36efb3aeb25
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+#include "AUDIO_DISCO_F746NG.h"
+#include "SDRAM_DISCO_F746NG.h"
+#include "mbed.h"  // NOLINT
+
+namespace {
+
+bool g_is_audio_initialized = false;
+constexpr int kAudioCaptureBufferSize = kAudioSampleFrequency * 0.5;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize];
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+
+// For a full example of how to access audio on the STM32F746NG board, see
+// https://os.mbed.com/teams/ST/code/DISCO-F746NG_AUDIO_demo/
+AUDIO_DISCO_F746NG g_audio_device;
+SDRAM_DISCO_F746NG g_sdram_device;
+
+typedef enum {
+  BUFFER_OFFSET_NONE = 0,
+  BUFFER_OFFSET_HALF = 1,
+  BUFFER_OFFSET_FULL = 2,
+} BUFFER_StateTypeDef;
+
+#define AUDIO_BLOCK_SIZE ((uint32_t)2048)
+#define AUDIO_BUFFER_IN SDRAM_DEVICE_ADDR /* In SDRAM */
+#define AUDIO_BUFFER_OUT \
+  (SDRAM_DEVICE_ADDR + (AUDIO_BLOCK_SIZE * 2)) /* In SDRAM */
+__IO uint32_t g_audio_rec_buffer_state = BUFFER_OFFSET_NONE;
+
+uint8_t SetSysClock_PLL_HSE_200MHz() {
+  RCC_ClkInitTypeDef RCC_ClkInitStruct;
+  RCC_OscInitTypeDef RCC_OscInitStruct;
+
+  // Enable power clock
+  __PWR_CLK_ENABLE();
+
+  // Enable HSE oscillator and activate PLL with HSE as source
+  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSE;
+  RCC_OscInitStruct.HSEState = RCC_HSE_ON; /* External xtal on OSC_IN/OSC_OUT */
+
+  // Warning: this configuration is for a 25 MHz xtal clock only
+  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
+  RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE;
+  RCC_OscInitStruct.PLL.PLLM = 25;   // VCO input clock = 1 MHz (25 MHz / 25)
+  RCC_OscInitStruct.PLL.PLLN = 400;  // VCO output clock = 400 MHz (1 MHz * 400)
+  RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2;  // PLLCLK = 200 MHz (400 MHz / 2)
+  RCC_OscInitStruct.PLL.PLLQ = 8;  // USB clock = 50 MHz (400 MHz / 8)
+
+  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK) {
+    return 0;  // FAIL
+  }
+
+  // Activate the OverDrive to reach the 216 MHz Frequency
+  if (HAL_PWREx_EnableOverDrive() != HAL_OK) {
+    return 0;  // FAIL
+  }
+
+  // Select PLL as system clock source and configure the HCLK, PCLK1 and PCLK2
+  // clocks dividers
+  RCC_ClkInitStruct.ClockType = (RCC_CLOCKTYPE_SYSCLK | RCC_CLOCKTYPE_HCLK |
+                                 RCC_CLOCKTYPE_PCLK1 | RCC_CLOCKTYPE_PCLK2);
+  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;  // 200 MHz
+  RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;         // 200 MHz
+  RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV4;          //  50 MHz
+  RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV2;          // 100 MHz
+
+  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_7) != HAL_OK) {
+    return 0;  // FAIL
+  }
+  HAL_RCC_MCOConfig(RCC_MCO1, RCC_MCO1SOURCE_HSE, RCC_MCODIV_4);
+  return 1;  // OK
+}
+
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  SetSysClock_PLL_HSE_200MHz();
+
+  // Initialize SDRAM buffers.
+  memset((uint16_t*)AUDIO_BUFFER_IN, 0, AUDIO_BLOCK_SIZE * 2);
+  memset((uint16_t*)AUDIO_BUFFER_OUT, 0, AUDIO_BLOCK_SIZE * 2);
+  g_audio_rec_buffer_state = BUFFER_OFFSET_NONE;
+
+  // Start Recording.
+  g_audio_device.IN_Record((uint16_t*)AUDIO_BUFFER_IN, AUDIO_BLOCK_SIZE);
+
+  // Also play results out to headphone jack.
+  g_audio_device.OUT_SetAudioFrameSlot(CODEC_AUDIOFRAME_SLOT_02);
+  g_audio_device.OUT_Play((uint16_t*)AUDIO_BUFFER_OUT, AUDIO_BLOCK_SIZE * 2);
+
+  return kTfLiteOk;
+}
+
+void CaptureSamples(const int16_t* sample_data) {
+  const int sample_size = AUDIO_BLOCK_SIZE / (sizeof(int16_t) * 2);
+  const int32_t time_in_ms =
+      g_latest_audio_timestamp + (sample_size / (kAudioSampleFrequency / 1000));
+
+  const int32_t start_sample_offset =
+      g_latest_audio_timestamp * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < sample_size; ++i) {
+    const int capture_index =
+        (start_sample_offset + i) % kAudioCaptureBufferSize;
+    g_audio_capture_buffer[capture_index] =
+        (sample_data[(i * 2) + 0] / 2) + (sample_data[(i * 2) + 1] / 2);
+  }
+  // This is how we let the outside world know that new audio data has arrived.
+  g_latest_audio_timestamp = time_in_ms;
+}
+
+}  // namespace
+
+// These callbacks need to be linkable symbols, because they override weak
+// default versions.
+void BSP_AUDIO_IN_TransferComplete_CallBack(void) {
+  g_audio_rec_buffer_state = BUFFER_OFFSET_FULL;
+  /* Copy recorded 1st half block */
+  memcpy((uint16_t*)(AUDIO_BUFFER_OUT), (uint16_t*)(AUDIO_BUFFER_IN),
+         AUDIO_BLOCK_SIZE);
+  CaptureSamples(reinterpret_cast<int16_t*>(AUDIO_BUFFER_IN));
+  return;
+}
+
+// Another weak symbol override.
+void BSP_AUDIO_IN_HalfTransfer_CallBack(void) {
+  g_audio_rec_buffer_state = BUFFER_OFFSET_HALF;
+  /* Copy recorded 2nd half block */
+  memcpy((uint16_t*)(AUDIO_BUFFER_OUT + (AUDIO_BLOCK_SIZE)),
+         (uint16_t*)(AUDIO_BUFFER_IN + (AUDIO_BLOCK_SIZE)), AUDIO_BLOCK_SIZE);
+  CaptureSamples(
+      reinterpret_cast<int16_t*>(AUDIO_BUFFER_IN + AUDIO_BLOCK_SIZE));
+  return;
+}
+
+// Main entry point for getting audio data.
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    g_is_audio_initialized = true;
+  }
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    g_audio_output_buffer[i] = g_audio_capture_buffer[capture_index];
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
similarity index 81%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
index 6c96a61ab517487413e875dc7369bddb1c9a0d9a..a8f0fe4bd50c3b6d16a426adc461ea125cbc9859 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
 
-int32_t TimeInMilliseconds() {
-  static int current_time = 0;
-  current_time += 100;
-  return current_time;
+namespace {
+int32_t g_current_time = 0;
 }
+
+void SetTimeInMilliseconds(int32_t time) { g_current_time = time; }
+
+int32_t TimeInMilliseconds() { return g_current_time; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
index c4c52ac0ff3696a05192465f8ac911b5d6a83925..7f9ece41dd3f013ae328ffd1bdc98f197855a131 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -18,20 +18,11 @@ limitations under the License.
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
-
-namespace {
-// Stores the timestamp for the previous fetch of audio data, so that we can
-// avoid recalculating all the features from scratch if some earlier timeslices
-// are still present.
-int32_t g_last_time_in_ms = 0;
-// Make sure we don't try to use cached information if this is the first call
-// into the provider.
-bool g_is_first_run = true;
-}  // namespace
 
 FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
-    : feature_size_(feature_size), feature_data_(feature_data) {
+    : feature_size_(feature_size),
+      feature_data_(feature_data),
+      is_first_run_(true) {
   // Initialize the feature data to default values.
   for (int n = 0; n < feature_size_; ++n) {
     feature_data_[n] = 0;
@@ -41,24 +32,23 @@ FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
 FeatureProvider::~FeatureProvider() {}
 
 TfLiteStatus FeatureProvider::PopulateFeatureData(
-    tflite::ErrorReporter* error_reporter, int* how_many_new_slices) {
+    tflite::ErrorReporter* error_reporter, int32_t last_time_in_ms,
+    int32_t time_in_ms, int* how_many_new_slices) {
   if (feature_size_ != kFeatureElementCount) {
     error_reporter->Report("Requested feature_data_ size %d doesn't match %d",
                            feature_size_, kFeatureElementCount);
     return kTfLiteError;
   }
 
-  const int32_t time_in_ms = TimeInMilliseconds();
   // Quantize the time into steps as long as each window stride, so we can
   // figure out which audio data we need to fetch.
-  const int last_step = (g_last_time_in_ms / kFeatureSliceStrideMs);
+  const int last_step = (last_time_in_ms / kFeatureSliceStrideMs);
   const int current_step = (time_in_ms / kFeatureSliceStrideMs);
-  g_last_time_in_ms = time_in_ms;
 
   int slices_needed = current_step - last_step;
   // If this is the first call, make sure we don't use any cached information.
-  if (g_is_first_run) {
-    g_is_first_run = false;
+  if (is_first_run_) {
+    is_first_run_ = false;
     slices_needed = kFeatureSliceCount;
   }
   if (slices_needed > kFeatureSliceCount) {
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
index a86c56ebf053a8807e38c42c6a7088c146a31b9e..ee3a480e947eced06e30ac089433f44e18d6adc3 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
@@ -38,11 +38,15 @@ class FeatureProvider {
   // Fills the feature data with information from audio inputs, and returns how
   // many feature slices were updated.
   TfLiteStatus PopulateFeatureData(tflite::ErrorReporter* error_reporter,
+                                   int32_t last_time_in_ms, int32_t time_in_ms,
                                    int* how_many_new_slices);
 
  private:
   int feature_size_;
   uint8_t* feature_data_;
+  // Make sure we don't try to use cached information if this is the first call
+  // into the provider.
+  bool is_first_run_;
 };
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
index 1e52aec8d2741678a0f79f643bb7dcf42c848a58..556cbfe799bd9adf2df8f584a4f10b4a1c834bd4 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -30,7 +30,8 @@ TF_LITE_MICRO_TEST(TestFeatureProvider) {
 
   int how_many_new_slices = 0;
   TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
-      error_reporter, &how_many_new_slices);
+      error_reporter, /* last_time_in_ms= */ 0, /* time_in_ms= */ 10000,
+      &how_many_new_slices);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
   TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
 }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
index 1890c25cf2b44c96c549757b31f88255d4a9ee09..3a9a5a4df1bf8239950dd2c79a1048706004e1f5 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -68,16 +70,21 @@ int main(int argc, char* argv[]) {
   FeatureProvider feature_provider(kFeatureElementCount,
                                    model_input->data.uint8);
 
+  RecognizeCommands recognizer(error_reporter);
+
+  int32_t previous_time = 0;
   // Keep reading and analysing audio data in an infinite loop.
   while (true) {
     // Fetch the spectrogram for the current time.
+    const int32_t current_time = LatestAudioTimestamp();
     int how_many_new_slices = 0;
     TfLiteStatus feature_status = feature_provider.PopulateFeatureData(
-        error_reporter, &how_many_new_slices);
+        error_reporter, previous_time, current_time, &how_many_new_slices);
     if (feature_status != kTfLiteOk) {
       error_reporter->Report("Feature generation failed");
       return 1;
     }
+    previous_time = current_time;
     // If no new audio samples have been received since last time, don't bother
     // running the network model.
     if (how_many_new_slices == 0) {
@@ -105,7 +112,19 @@ int main(int argc, char* argv[]) {
       }
     }
 
-    error_reporter->Report("Heard %s", kCategoryLabels[top_category_index]);
+    const char* found_command = nullptr;
+    uint8_t score = 0;
+    bool is_new_command = false;
+    TfLiteStatus process_status = recognizer.ProcessLatestResults(
+        output, current_time, &found_command, &score, &is_new_command);
+    if (process_status != kTfLiteOk) {
+      error_reporter->Report(
+          "RecognizeCommands::ProcessLatestResults() failed");
+      return 1;
+    }
+    if (is_new_command) {
+      error_reporter->Report("Heard %s (%d)", found_command, score);
+    }
   }
 
   return 0;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
index 1d8f3123a57bc5b807d39151adaf64f29d2f5f95..f48252d14d251673f0070e63dfa4169ca3a89025 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
@@ -23,6 +23,7 @@ limitations under the License.
 // frequency information. This has to be a power of two, and since we're dealing
 // with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
 constexpr int kMaxAudioSampleSize = 512;
+constexpr int kAudioSampleFrequency = 16000;
 
 // All of these values are derived from the values used during model training,
 // if you change your model you'll need to update these constants.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..8f8b33a9fa2afca902ef5fbcfa7f641b5cc58028
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc
@@ -0,0 +1,8 @@
+# Settings for Mac OS platforms.
+ifeq ($(TARGET), osx)
+  LINKER_FLAGS := \
+    -framework Foundation \
+    -framework AudioToolbox
+
+  MICROLITE_LIBS += $(LINKER_FLAGS)
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..892757e799f3832db725424163e613bea35ab9e7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+namespace {
+
+constexpr int kNumberRecordBuffers = 3;
+bool g_is_audio_initialized = false;
+constexpr int kAudioCaptureBufferSize = kAudioSampleFrequency * 0.5;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize];
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+
+// Checks for MacOS errors, prints information and returns a TF Lite version.
+#define RETURN_IF_OS_ERROR(error, error_reporter)                       \
+  do {                                                                  \
+    if (error != noErr) {                                               \
+      error_reporter->Report("Error: %s:%d (%d)\n", __FILE__, __LINE__, \
+                             error);                                    \
+      return kTfLiteError;                                              \
+    }                                                                   \
+  } while (0);
+
+// Called when an audio input buffer has been filled.
+void OnAudioBufferFilledCallback(
+    void* user_data, AudioQueueRef queue, AudioQueueBufferRef buffer,
+    const AudioTimeStamp* start_time, UInt32 num_packets,
+    const AudioStreamPacketDescription* packet_description) {
+  const int sample_size = buffer->mAudioDataByteSize / sizeof(float);
+  const int64_t sample_offset = start_time->mSampleTime;
+  const int32_t time_in_ms =
+      (sample_offset + sample_size) / (kAudioSampleFrequency / 1000);
+  const float* float_samples = static_cast<const float*>(buffer->mAudioData);
+  for (int i = 0; i < sample_size; ++i) {
+    const int capture_index = (sample_offset + i) % kAudioCaptureBufferSize;
+    g_audio_capture_buffer[capture_index] = float_samples[i] * ((1 << 15) - 1);
+  }
+  // This is how we let the outside world know that new audio data has arrived.
+  g_latest_audio_timestamp = time_in_ms;
+  AudioQueueEnqueueBuffer(queue, buffer, 0, nullptr);
+}
+
+// Set up everything we need to capture audio samples from the default recording
+// device on MacOS.
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  // Set up the format of the audio - single channel, 32-bit float at 16KHz.
+  AudioStreamBasicDescription recordFormat = {0};
+  recordFormat.mSampleRate = kAudioSampleFrequency;
+  recordFormat.mFormatID = kAudioFormatLinearPCM;
+  recordFormat.mFormatFlags =
+      kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked;
+  recordFormat.mBitsPerChannel = 8 * sizeof(float);
+  recordFormat.mChannelsPerFrame = 1;
+  recordFormat.mBytesPerFrame = sizeof(float) * recordFormat.mChannelsPerFrame;
+  recordFormat.mFramesPerPacket = 1;
+  recordFormat.mBytesPerPacket =
+      recordFormat.mBytesPerFrame * recordFormat.mFramesPerPacket;
+  recordFormat.mReserved = 0;
+
+  UInt32 propSize = sizeof(recordFormat);
+  RETURN_IF_OS_ERROR(AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 0,
+                                            NULL, &propSize, &recordFormat),
+                     error_reporter);
+
+  // Create a recording queue.
+  AudioQueueRef queue;
+  RETURN_IF_OS_ERROR(
+      AudioQueueNewInput(&recordFormat, OnAudioBufferFilledCallback,
+                         error_reporter, nullptr, nullptr, 0, &queue),
+      error_reporter);
+
+  // Set up the buffers we'll need.
+  int buffer_bytes = 512 * sizeof(float);
+  for (int i = 0; i < kNumberRecordBuffers; ++i) {
+    AudioQueueBufferRef buffer;
+    RETURN_IF_OS_ERROR(AudioQueueAllocateBuffer(queue, buffer_bytes, &buffer),
+                       error_reporter);
+    RETURN_IF_OS_ERROR(AudioQueueEnqueueBuffer(queue, buffer, 0, nullptr),
+                       error_reporter);
+  }
+
+  // Start capturing audio.
+  RETURN_IF_OS_ERROR(AudioQueueStart(queue, nullptr), error_reporter);
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+      g_audio_output_buffer[i] = 0;
+    }
+    g_is_audio_initialized = true;
+  }
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    g_audio_output_buffer[i] = g_audio_capture_buffer[capture_index];
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
index f4a7f801cc6251b82339509f691fd64012fbe390..f8858aad72f3c141d20077ffa927e30bd9492987 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
@@ -32,6 +32,9 @@ limitations under the License.
 
 namespace {
 
+// Needed because some platforms don't have M_PI defined.
+constexpr float kPi = 3.14159265358979323846f;
+
 // Performs a discrete Fourier transform on the real inputs. This corresponds to
 // rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
 // and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
@@ -48,11 +51,11 @@ void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
   for (int i = 0; i < time_series_size / 2; ++i) {
     float real = 0;
     for (int j = 0; j < time_series_size; ++j) {
-      real += time_series[j] * cos(j * i * M_PI * 2 / time_series_size);
+      real += time_series[j] * cos(j * i * kPi * 2 / time_series_size);
     }
     float imaginary = 0;
     for (int j = 0; j < time_series_size; ++j) {
-      imaginary -= time_series[j] * sin(j * i * M_PI * 2 / time_series_size);
+      imaginary -= time_series[j] * sin(j * i * kPi * 2 / time_series_size);
     }
     fourier_output[(i * 2) + 0] = real;
     fourier_output[(i * 2) + 1] = imaginary;
@@ -63,7 +66,7 @@ void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
 // of the current sample window are weighted more heavily than those at the end.
 void CalculatePeriodicHann(int window_length, float* window_function) {
   for (int i = 0; i < window_length; ++i) {
-    window_function[i] = 0.5 - 0.5 * cos((2 * M_PI * i) / window_length);
+    window_function[i] = 0.5 - 0.5 * cos((2 * kPi * i) / window_length);
   }
 }
 
@@ -143,3 +146,12 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
   }
   return kTfLiteOk;
 }
+
+TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
+                             const int16_t* input, uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
index adff790d6cc527578dbfb9dc481c99c1021b92db..d710beeceea6a7b6fb7fca748e5795f602276e32 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
@@ -28,4 +28,7 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
                         const int16_t* input, int input_size, int output_size,
                         uint8_t* output);
 
+TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
+                             const int16_t* input, uint8_t* output);
+
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9366dc71e0d76d087a3dad9b9c4c206a0749e235
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
@@ -0,0 +1,139 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
+
+#include <limits>
+
+RecognizeCommands::RecognizeCommands(tflite::ErrorReporter* error_reporter,
+                                     int32_t average_window_duration_ms,
+                                     uint8_t detection_threshold,
+                                     int32_t suppression_ms,
+                                     int32_t minimum_count)
+    : error_reporter_(error_reporter),
+      average_window_duration_ms_(average_window_duration_ms),
+      detection_threshold_(detection_threshold),
+      suppression_ms_(suppression_ms),
+      minimum_count_(minimum_count),
+      previous_results_(error_reporter) {
+  previous_top_label_ = "_silence_";
+  previous_top_label_time_ = 0;
+}
+
+TfLiteStatus RecognizeCommands::ProcessLatestResults(
+    const TfLiteTensor* latest_results, const int32_t current_time_ms,
+    const char** found_command, uint8_t* score, bool* is_new_command) {
+  if ((latest_results->dims->size != 2) ||
+      (latest_results->dims->data[0] != 1) ||
+      (latest_results->dims->data[1] != kCategoryCount)) {
+    error_reporter_->Report(
+        "The results for recognition should contain %d elements, but there are "
+        "%d in an %d-dimensional shape",
+        kCategoryCount, latest_results->dims->data[1],
+        latest_results->dims->size);
+    return kTfLiteError;
+  }
+
+  if (latest_results->type != kTfLiteUInt8) {
+    error_reporter_->Report(
+        "The results for recognition should be uint8 elements, but are %d",
+        latest_results->type);
+    return kTfLiteError;
+  }
+
+  if ((!previous_results_.empty()) &&
+      (current_time_ms < previous_results_.front().time_)) {
+    error_reporter_->Report(
+        "Results must be fed in increasing time order, but received a "
+        "timestamp of %d that was earlier than the previous one of %d",
+        current_time_ms, previous_results_.front().time_);
+    return kTfLiteError;
+  }
+
+  // Add the latest results to the head of the queue.
+  previous_results_.push_back({current_time_ms, latest_results->data.uint8});
+
+  // Prune any earlier results that are too old for the averaging window.
+  const int64_t time_limit = current_time_ms - average_window_duration_ms_;
+  while ((!previous_results_.empty()) &&
+         previous_results_.front().time_ < time_limit) {
+    previous_results_.pop_front();
+  }
+
+  // If there are too few results, assume the result will be unreliable and
+  // bail.
+  const int64_t how_many_results = previous_results_.size();
+  const int64_t earliest_time = previous_results_.front().time_;
+  const int64_t samples_duration = current_time_ms - earliest_time;
+  if ((how_many_results < minimum_count_) ||
+      (samples_duration < (average_window_duration_ms_ / 4))) {
+    *found_command = previous_top_label_;
+    *score = 0;
+    *is_new_command = false;
+    return kTfLiteOk;
+  }
+
+  // Calculate the average score across all the results in the window.
+  int32_t average_scores[kCategoryCount];
+  for (int offset = 0; offset < previous_results_.size(); ++offset) {
+    PreviousResultsQueue::Result previous_result =
+        previous_results_.from_front(offset);
+    const uint8_t* scores = previous_result.scores_;
+    for (int i = 0; i < kCategoryCount; ++i) {
+      if (offset == 0) {
+        average_scores[i] = scores[i];
+      } else {
+        average_scores[i] += scores[i];
+      }
+    }
+  }
+  for (int i = 0; i < kCategoryCount; ++i) {
+    average_scores[i] /= how_many_results;
+  }
+
+  // Find the current highest scoring category.
+  int current_top_index = 0;
+  int32_t current_top_score = 0;
+  for (int i = 0; i < kCategoryCount; ++i) {
+    if (average_scores[i] > current_top_score) {
+      current_top_score = average_scores[i];
+      current_top_index = i;
+    }
+  }
+  const char* current_top_label = kCategoryLabels[current_top_index];
+
+  // If we've recently had another label trigger, assume one that occurs too
+  // soon afterwards is a bad result.
+  int64_t time_since_last_top;
+  if ((previous_top_label_ == kCategoryLabels[0]) ||
+      (previous_top_label_time_ == std::numeric_limits<int32_t>::min())) {
+    time_since_last_top = std::numeric_limits<int32_t>::max();
+  } else {
+    time_since_last_top = current_time_ms - previous_top_label_time_;
+  }
+  if ((current_top_score > detection_threshold_) &&
+      (current_top_label != previous_top_label_) &&
+      (time_since_last_top > suppression_ms_)) {
+    previous_top_label_ = current_top_label;
+    previous_top_label_time_ = current_time_ms;
+    *is_new_command = true;
+  } else {
+    *is_new_command = false;
+  }
+  *found_command = current_top_label;
+  *score = current_top_score;
+
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
new file mode 100644
index 0000000000000000000000000000000000000000..adefffe850076821dd1e0bf683fdd2180d6999ea
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
@@ -0,0 +1,158 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Partial implementation of std::dequeue, just providing the functionality
+// that's needed to keep a record of previous neural network results over a
+// short time period, so they can be averaged together to produce a more
+// accurate overall prediction. This doesn't use any dynamic memory allocation
+// so it's a better fit for microcontroller applications, but this does mean
+// there are hard limits on the number of results it can store.
+class PreviousResultsQueue {
+ public:
+  PreviousResultsQueue(tflite::ErrorReporter* error_reporter)
+      : error_reporter_(error_reporter), front_index_(0), size_(0) {}
+
+  // Data structure that holds an inference result, and the time when it
+  // was recorded.
+  struct Result {
+    Result() : time_(0), scores_() {}
+    Result(int32_t time, uint8_t* scores) : time_(time) {
+      for (int i = 0; i < kCategoryCount; ++i) {
+        scores_[i] = scores[i];
+      }
+    }
+    int32_t time_;
+    uint8_t scores_[kCategoryCount];
+  };
+
+  int size() { return size_; }
+  bool empty() { return size_ == 0; }
+  Result& front() { return results_[front_index_]; }
+  Result& back() {
+    int back_index = front_index_ + (size_ - 1);
+    if (back_index >= kMaxResults) {
+      back_index -= kMaxResults;
+    }
+    return results_[back_index];
+  }
+
+  void push_back(const Result& entry) {
+    if (size() >= kMaxResults) {
+      error_reporter_->Report(
+          "Couldn't push_back latest result, too many already!");
+      return;
+    }
+    size_ += 1;
+    back() = entry;
+  }
+
+  Result pop_front() {
+    if (size() <= 0) {
+      error_reporter_->Report("Couldn't pop_front result, none present!");
+      return Result();
+    }
+    Result result = front();
+    front_index_ += 1;
+    if (front_index_ >= kMaxResults) {
+      front_index_ = 0;
+    }
+    size_ -= 1;
+    return result;
+  }
+
+  // Most of the functions are duplicates of dequeue containers, but this
+  // is a helper that makes it easy to iterate through the contents of the
+  // queue.
+  Result& from_front(int offset) {
+    if ((offset < 0) || (offset >= size_)) {
+      error_reporter_->Report("Attempt to read beyond the end of the queue!");
+      offset = size_ - 1;
+    }
+    int index = front_index_ + offset;
+    if (index >= kMaxResults) {
+      index -= kMaxResults;
+    }
+    return results_[index];
+  }
+
+ private:
+  tflite::ErrorReporter* error_reporter_;
+  static constexpr int kMaxResults = 50;
+  Result results_[kMaxResults];
+
+  int front_index_;
+  int size_;
+};
+
+// This class is designed to apply a very primitive decoding model on top of the
+// instantaneous results from running an audio recognition model on a single
+// window of samples. It applies smoothing over time so that noisy individual
+// label scores are averaged, increasing the confidence that apparent matches
+// are real.
+// To use it, you should create a class object with the configuration you
+// want, and then feed results from running a TensorFlow model into the
+// processing method. The timestamp for each subsequent call should be
+// increasing from the previous, since the class is designed to process a stream
+// of data over time.
+class RecognizeCommands {
+ public:
+  // labels should be a list of the strings associated with each one-hot score.
+  // The window duration controls the smoothing. Longer durations will give a
+  // higher confidence that the results are correct, but may miss some commands.
+  // The detection threshold has a similar effect, with high values increasing
+  // the precision at the cost of recall. The minimum count controls how many
+  // results need to be in the averaging window before it's seen as a reliable
+  // average. This prevents erroneous results when the averaging window is
+  // initially being populated for example. The suppression argument disables
+  // further recognitions for a set time after one has been triggered, which can
+  // help reduce spurious recognitions.
+  explicit RecognizeCommands(tflite::ErrorReporter* error_reporter,
+                             int32_t average_window_duration_ms = 1000,
+                             uint8_t detection_threshold = 51,
+                             int32_t suppression_ms = 500,
+                             int32_t minimum_count = 3);
+
+  // Call this with the results of running a model on sample data.
+  TfLiteStatus ProcessLatestResults(const TfLiteTensor* latest_results,
+                                    const int32_t current_time_ms,
+                                    const char** found_command, uint8_t* score,
+                                    bool* is_new_command);
+
+ private:
+  // Configuration
+  tflite::ErrorReporter* error_reporter_;
+  int32_t average_window_duration_ms_;
+  uint8_t detection_threshold_;
+  int32_t suppression_ms_;
+  int32_t minimum_count_;
+
+  // Working variables
+  PreviousResultsQueue previous_results_;
+  int previous_results_head_;
+  int previous_results_tail_;
+  const char* previous_top_label_;
+  int32_t previous_top_label_time_;
+};
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0cc73f10b3dadfdf06cb0f2935140b792635add
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PreviousResultsQueueBasic) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  PreviousResultsQueue queue(error_reporter);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+
+  uint8_t scores_a[4] = {0, 0, 0, 1};
+  queue.push_back({0, scores_a});
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.back().time_);
+
+  uint8_t scores_b[4] = {0, 0, 1, 0};
+  queue.push_back({1, scores_b});
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_);
+
+  PreviousResultsQueue::Result pop_result = queue.pop_front();
+  TF_LITE_MICRO_EXPECT_EQ(0, pop_result.time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_);
+
+  uint8_t scores_c[4] = {0, 1, 0, 0};
+  queue.push_back({2, scores_c});
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.back().time_);
+}
+
+TF_LITE_MICRO_TEST(PreviousResultsQueuePushPop) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  PreviousResultsQueue queue(error_reporter);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+
+  for (int i = 0; i < 123; ++i) {
+    uint8_t scores[4] = {0, 0, 0, 1};
+    queue.push_back({i, scores});
+    TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+    TF_LITE_MICRO_EXPECT_EQ(i, queue.front().time_);
+    TF_LITE_MICRO_EXPECT_EQ(i, queue.back().time_);
+
+    PreviousResultsQueue::Result pop_result = queue.pop_front();
+    TF_LITE_MICRO_EXPECT_EQ(i, pop_result.time_);
+    TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+  }
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBasic) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {255, 0, 0, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor yes_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  bool has_found_new_command = false;
+  const char* new_command;
+  for (int i = 0; i < 10; ++i) {
+    const char* found_command;
+    uint8_t score;
+    bool is_new_command;
+    int32_t current_time_ms = 0 + (i * 100);
+    TF_LITE_MICRO_EXPECT_EQ(
+        kTfLiteOk, recognize_commands.ProcessLatestResults(
+                       &yes_results, current_time_ms, &found_command, &score,
+                       &is_new_command));
+    if (is_new_command) {
+      TF_LITE_MICRO_EXPECT(!has_found_new_command);
+      has_found_new_command = true;
+      new_command = found_command;
+    }
+  }
+  TF_LITE_MICRO_EXPECT(has_found_new_command);
+  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+
+  TfLiteTensor no_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+  has_found_new_command = false;
+  new_command = "";
+  uint8_t score;
+  for (int i = 0; i < 10; ++i) {
+    const char* found_command;
+    bool is_new_command;
+    int32_t current_time_ms = 1000 + (i * 100);
+    TF_LITE_MICRO_EXPECT_EQ(
+        kTfLiteOk, recognize_commands.ProcessLatestResults(
+                       &no_results, current_time_ms, &found_command, &score,
+                       &is_new_command));
+    if (is_new_command) {
+      TF_LITE_MICRO_EXPECT(!has_found_new_command);
+      has_found_new_command = true;
+      new_command = found_command;
+    }
+  }
+  TF_LITE_MICRO_EXPECT(has_found_new_command);
+  TF_LITE_MICRO_EXPECT_EQ(231, score);
+  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputLength) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor bad_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 3}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &bad_results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputTimes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 100, &found_command, &score, &is_new_command));
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestTooFewInputs) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 100, &found_command, &score, &is_new_command));
+  TF_LITE_MICRO_EXPECT_EQ(0, score);
+  TF_LITE_MICRO_EXPECT_EQ(false, is_new_command);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
deleted file mode 100644
index 0487a12b25fc17208f1d9ab2b51538102f7ec914..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
-
-#include <limits>
-
-#include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
-#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(TestTimer) {
-  // Make sure that the technically-undefined overflow behavior we rely on below
-  // works on this platform. It's still not guaranteed, but at least this is a
-  // sanity check.  Turn off when running with ASan, as it will complain about
-  // the following undefined behavior.
-#ifndef ADDRESS_SANITIZER
-  int32_t overflow_value = std::numeric_limits<int32_t>::max();
-  overflow_value += 1;
-  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
-#endif
-
-  const int32_t first_time = TimeInMilliseconds();
-  const int32_t second_time = TimeInMilliseconds();
-
-  // It's possible that the timer may have wrapped around from +BIG_NUM to
-  // -BIG_NUM between the first and second calls, since we're storing
-  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
-  // would have taken more than 2^31 milliseconds though, so look at the
-  // difference and rely on integer overflow to ensure it's accurate.
-  const int32_t time_delta = (second_time - first_time);
-  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD
index a54fd41760d58f2023e6b7b2aac72ac5f5e95ae3..e2d3164d4c3828bbd067e068fdbf0f6ba3babc7f 100644
--- a/tensorflow/lite/experimental/micro/kernels/BUILD
+++ b/tensorflow/lite/experimental/micro/kernels/BUILD
@@ -22,7 +22,6 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:op_macros",
         "//tensorflow/lite/kernels:padding",
@@ -43,27 +42,10 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         ":micro_ops",
-        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
-cc_library(
-    name = "test_utils",
-    srcs = [
-    ],
-    hdrs = [
-        "test_utils.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api",
-        "//tensorflow/lite/experimental/micro:micro_framework",
-        "//tensorflow/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
 tflite_micro_cc_test(
     name = "depthwise_conv_test",
     srcs = [
@@ -71,7 +53,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -85,7 +66,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -99,7 +79,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
index f70437a4b943e6e71547e010a0fea9ab551194db..05ba8798c0dc34eab5c563489cf9fc928325d00f 100644
--- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
index 300f8aaf78ad38a2cd4a7c715cf63315a0b2e751..c2e1446848db68a4be42eab282da34e38999670f 100644
--- a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
index 7253b3be8ce20ff6d30ca725060da606c416c8e1..8933b6c0ed090b175c5d42282dc0ec6f22142206 100644
--- a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/mbed/debug_log.cc b/tensorflow/lite/experimental/micro/mbed/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4a4a5a8429bb7867c225a97696c28eb5ad8d3b7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/mbed/debug_log.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include <mbed.h>
+
+// On mbed platforms, we set up a serial port and write to it for debug logging.
+extern "C" void DebugLog(const char* s) {
+  static Serial pc(USBTX, USBRX);
+  pc.printf("%s", s);
+}
diff --git a/tensorflow/lite/experimental/micro/micro_error_reporter.h b/tensorflow/lite/experimental/micro/micro_error_reporter.h
index 0ab853ec2ac915a8eb3da87eb8b86f2ecec697c7..6c18367c95fc9f07eb67b90a0e736b64271d9291 100644
--- a/tensorflow/lite/experimental/micro/micro_error_reporter.h
+++ b/tensorflow/lite/experimental/micro/micro_error_reporter.h
@@ -17,26 +17,8 @@ limitations under the License.
 
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/experimental/micro/compatibility.h"
-
-#ifdef TF_LITE_MCU_DEBUG_LOG
-// These functions should be supplied by the micro target library
-extern "C" {
-#include <stdint.h>
-void DebugLog(const char* s);
-void DebugLogInt32(int32_t i);
-void DebugLogUInt32(uint32_t i);
-void DebugLogHex(uint32_t i);
-void DebugLogFloat(float i);
-}
-#else  // TF_LITE_MCU_DEBUG_LOG
-#include <cstdint>
-#include <cstdio>
-static void inline DebugLog(const char* s) { fprintf(stderr, "%s", s); }
-static void inline DebugLogInt32(int32_t i) { fprintf(stderr, "%d", i); }
-static void inline DebugLogUInt32(uint32_t i) { fprintf(stderr, "%d", i); }
-static void inline DebugLogHex(uint32_t i) { fprintf(stderr, "0x%8x", i); }
-static void inline DebugLogFloat(float i) { fprintf(stderr, "%f", i); }
-#endif  // TF_LITE_MCU_DEBUG_LOG
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+#include "tensorflow/lite/experimental/micro/debug_log_numbers.h"
 
 namespace tflite {
 
diff --git a/tensorflow/lite/experimental/micro/riscv32_mcu/README.md b/tensorflow/lite/experimental/micro/riscv32_mcu/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5477d7ae951cbd8c47312f51acdea16d87f5f910
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/riscv32_mcu/README.md
@@ -0,0 +1,7 @@
+# RISC-V MCU
+
+This folder contains TFLite kernel operations optimized for RISC-V micro
+controllers.
+
+It is designed to be portable even to 'bare metal', so it follows the same
+design goals as the micro experimental port.
diff --git a/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc b/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1c2df866e9f8e4c99aabcc7fe73e4879b079b42
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
@@ -0,0 +1,18 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// TODO(b/121324430): Add test for DebugLog fuctions
+// TODO(b/121275099): Remove dependency on debug_log once the platform supports
+// printf
+
+#include <stdio.h>
+
+extern "C" void DebugLog(const char* s) { puts(s); }
diff --git a/tensorflow/lite/experimental/micro/testing/BUILD b/tensorflow/lite/experimental/micro/testing/BUILD
index 5a31a709ca3f0205b8764528d6e8f2c0fe0f93d0..1623df5b8650a34aa900cb6d362e444bc640fc8e 100644
--- a/tensorflow/lite/experimental/micro/testing/BUILD
+++ b/tensorflow/lite/experimental/micro/testing/BUILD
@@ -10,8 +10,10 @@ cc_library(
     name = "micro_test",
     hdrs = [
         "micro_test.h",
+        "test_utils.h",
     ],
     deps = [
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv b/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
new file mode 100644
index 0000000000000000000000000000000000000000..4f7ac555e6f89c1d209dc6a4d62786d357db91ed
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# This docker configuration file lets you emulate a Hifive1 board
+# on an x86 desktop or laptop, which can be useful for debugging and
+# automated testing.
+FROM antmicro/renode:latest
+
+LABEL maintainer="Pete Warden <petewarden@google.com>"
+
+RUN apt-get update
+RUN apt-get install -y curl git unzip make g++
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc b/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc
new file mode 100644
index 0000000000000000000000000000000000000000..c84ce5091c778fc3226ad4a7dbb0230d38037438
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc
@@ -0,0 +1,20 @@
+:name: SiFive-FE310
+:description: This script runs Zephyr RTOS shell sample on SiFive-FE310 platform.
+
+$name?="SiFive-FE310"
+
+using sysbus
+mach create $name
+machine LoadPlatformDescription @platforms/cpus/sifive-fe310.repl
+
+$bin?=@/workspace/tensorflow/lite/experimental/micro/tools/make/gen/riscv32_mcu_riscv32_mcu/bin/micro_speech_test
+
+showAnalyzer uart0 Antmicro.Renode.Analyzers.LoggingUartAnalyzer
+logFile @/tmp/renode_riscv_log.txt
+
+sysbus LoadELF $bin
+
+sysbus Tag <0x10008000 4> "PRCI_HFROSCCFG" 0xFFFFFFFF
+sysbus Tag <0x10008008 4> "PRCI_PLLCFG" 0xFFFFFFFF
+
+cpu PerformanceInMips 320
diff --git a/tensorflow/lite/experimental/micro/kernels/test_utils.h b/tensorflow/lite/experimental/micro/testing/test_utils.h
similarity index 91%
rename from tensorflow/lite/experimental/micro/kernels/test_utils.h
rename to tensorflow/lite/experimental/micro/testing/test_utils.h
index 95f2d8a9d217a1b1f23c0198ddce5156e1c6cb36..e37eaf46e0815087cdc48c6aa23353f6f1cf9d7f 100644
--- a/tensorflow/lite/experimental/micro/kernels/test_utils.h
+++ b/tensorflow/lite/experimental/micro/testing/test_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
 
 #include <cstdarg>
 #include <initializer_list>
@@ -21,8 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/core/api/error_reporter.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
@@ -164,7 +163,20 @@ inline TfLiteTensor CreateQuantized32Tensor(std::initializer_list<int32_t> data,
   return CreateQuantized32Tensor(data.begin(), dims, name, min, max);
 }
 
+// Do a simple string comparison for testing purposes, without requiring the
+// standard C library.
+inline int TestStrcmp(const char* a, const char* b) {
+  if ((a == nullptr) || (b == nullptr)) {
+    return -1;
+  }
+  while ((*a != 0) && (*a == *b)) {
+    a++;
+    b++;
+  }
+  return *(const unsigned char*)a - *(const unsigned char*)b;
+}
+
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
diff --git a/tensorflow/lite/experimental/micro/tools/make/.gitignore b/tensorflow/lite/experimental/micro/tools/make/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..752f078fb56ca734056d694d0528943a82a8ef3e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/.gitignore
@@ -0,0 +1,2 @@
+downloads
+gen
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 20307e2b211f451997216f760c218b4daae6a201..8be57cc18d511840787d4adf65a6719f910cf43f 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -1,5 +1,9 @@
+
 MAKEFILE_DIR := tensorflow/lite/experimental/micro/tools/make
 
+# Pull in some convenience functions.
+include $(MAKEFILE_DIR)/helper_functions.inc
+
 # Try to figure out the host system
 HOST_OS :=
 ifeq ($(OS),Windows_NT)
@@ -21,10 +25,16 @@ HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32
 TARGET := $(HOST_OS)
 TARGET_ARCH := $(HOST_ARCH)
 
+# Specify TAGS on the command line to add a particular set of specialized
+# implementations, for example TAGS="CMSIS disco_f746ng" to target a Discovery
+# STM32F746NG board, using the CMSIS library's implementations where possible.
+ALL_TAGS := $(TAGS) $(TARGET)
+
 INCLUDES := \
 -I. \
 -I$(MAKEFILE_DIR)/../../../../../ \
 -I$(MAKEFILE_DIR)/../../../../../../ \
+-I$(MAKEFILE_DIR)/../../../../../../../ \
 -I$(MAKEFILE_DIR)/downloads/ \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
@@ -56,6 +66,9 @@ MICROLITE_TEST_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*test.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc)
 
+MICROLITE_TEST_HDRS := \
+$(wildcard tensorflow/lite/experimental/micro/testing/*.h)
+
 MICROLITE_CC_BASE_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*.cc) \
@@ -66,6 +79,51 @@ tensorflow/lite/core/api/op_resolver.cc \
 tensorflow/lite/kernels/kernel_util.cc \
 tensorflow/lite/kernels/internal/quantization_util.cc
 MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
+MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS))
+
+MICROLITE_CC_HDRS := \
+$(wildcard tensorflow/lite/experimental/micro/*.h) \
+$(wildcard tensorflow/lite/experimental/micro/kernels/*.h) \
+LICENSE \
+tensorflow/lite/c/c_api_internal.h \
+tensorflow/lite/c/builtin_op_data.h \
+tensorflow/lite/core/api/error_reporter.h \
+tensorflow/lite/core/api/flatbuffer_conversions.h \
+tensorflow/lite/core/api/op_resolver.h \
+tensorflow/lite/kernels/kernel_util.h \
+tensorflow/lite/kernels/op_macros.h \
+tensorflow/lite/kernels/padding.h \
+tensorflow/lite/kernels/internal/common.h \
+tensorflow/lite/kernels/internal/compatibility.h \
+tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h \
+tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h \
+tensorflow/lite/kernels/internal/reference/fully_connected.h \
+tensorflow/lite/kernels/internal/reference/softmax.h \
+tensorflow/lite/kernels/internal/round.h \
+tensorflow/lite/kernels/internal/tensor_ctypes.h \
+tensorflow/lite/kernels/internal/types.h \
+tensorflow/lite/kernels/internal/quantization_util.h \
+tensorflow/lite/schema/schema_generated.h \
+tensorflow/lite/version.h
+
+THIRD_PARTY_CC_HDRS := \
+third_party/gemmlowp/fixedpoint/fixedpoint.h \
+third_party/gemmlowp/fixedpoint/fixedpoint_sse.h \
+third_party/gemmlowp/internal/detect_platform.h \
+third_party/gemmlowp/LICENSE \
+third_party/flatbuffers/include/flatbuffers/base.h \
+third_party/flatbuffers/include/flatbuffers/stl_emulation.h \
+third_party/flatbuffers/include/flatbuffers/flatbuffers.h \
+third_party/flatbuffers/LICENSE.txt
+
+MAKE_PROJECT_FILES := \
+  README_MAKE.md \
+  Makefile
+
+MBED_PROJECT_FILES := \
+  README_MBED.md \
+  mbed-os.lib \
+  mbed_app.json
 
 # These target-specific makefiles should modify or replace options like
 # CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
@@ -73,6 +131,8 @@ MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SR
 # keep this main makefile focused on the sources and dependencies.
 include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
 
+ALL_TAGS += $(TARGET_ARCH)
+
 ALL_SRCS := \
 	$(MICROLITE_CC_SRCS) \
 	$(MICROLITE_TEST_SRCS)
@@ -82,6 +142,7 @@ GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
 OBJDIR := $(GENDIR)obj/
 BINDIR := $(GENDIR)bin/
 LIBDIR := $(GENDIR)lib/
+PRJDIR := $(GENDIR)prj/
 
 MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
 
@@ -95,9 +156,6 @@ include $(wildcard tensorflow/lite/experimental/micro/examples/*/Makefile.inc)
 MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
 
-MICROLITE_TEST_TARGETS := $(addprefix $(BINDIR), \
-$(patsubst %_test.cc,%.test_target,$(MICROLITE_TEST_SRCS)))
-
 # For normal manually-created TensorFlow C++ source files.
 $(OBJDIR)%.o: %.cc
 	@mkdir -p $(dir $@)
@@ -108,8 +166,13 @@ $(OBJDIR)%.o: %.c
 	@mkdir -p $(dir $@)
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
+# For normal manually-created TensorFlow ASM source files.
+$(OBJDIR)%.o: %.S
+	@mkdir -p $(dir $@)
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
 # The target that's compiled if there's no command-line arguments.
-all: $(MICROLITE_LIB_PATH) $(ALL_BINARIES)
+all: $(MICROLITE_LIB_PATH)
 
 microlite: $(MICROLITE_LIB_PATH)
 
@@ -131,7 +194,11 @@ $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 $(BINDIR)%.test_target: $(BINDIR)%_test
 	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
 
-test: test_micro_speech $(MICROLITE_TEST_TARGETS)
+# Generate standalone makefile projects for all of the test targets.
+$(foreach TEST_TARGET,$(MICROLITE_TEST_SRCS),\
+$(eval $(call microlite_test,$(notdir $(basename $(TEST_TARGET))),$(TEST_TARGET))))
+
+test: $(MICROLITE_TEST_TARGETS)
 
 # Gets rid of all generated files.
 clean:
diff --git a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
index 6749858bdb9ffe7942efcc1dc22acb4c6aa6a533..82c15e32f6572f36588945431918cf75299d3a64 100755
--- a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -33,6 +33,11 @@ GEMMLOWP_URL="https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
 CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
 STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip"
+SIFIVE_FE310_LIB_URL="https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip"
+RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz"
+AP3_URL="https://github.com/AmbiqMicro/TFLiteMicro_Apollo3/archive/dfbcef9a57276c087d95aab7cb234f1d4c9eaaba.zip"
+CUST_CMSIS_URL="https://github.com/AmbiqMicro/TFLiteMicro_CustCMSIS/archive/8f63966c5692e6a3a83956efd2e4aed77c4c9949.zip"
+GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2"
 
 download_and_extract() {
   local usage="Usage: download_and_extract URL DIR"
@@ -42,6 +47,8 @@ download_and_extract() {
   mkdir -p "${dir}"
   if [[ "${url}" == *gz ]]; then
     curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *bz2 ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xj
   elif [[ "${url}" == *zip ]]; then
     tempdir=$(mktemp -d)
     tempdir2=$(mktemp -d)
@@ -65,9 +72,37 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
+patch_apollo3_sdk() {
+  local ap3_dir="${1}"
+  if [ ! -f ${ap3_dir}/VERSION.txt ]; then
+    echo "Could not find ${ap3_dir}, skipping Apollo3 SDK";
+    return;
+  fi
+  local src_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc
+  local dest_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+  rm -rf ${dest_dir}
+  mkdir ${dest_dir}
+  cp "${src_dir}/startup_gcc.c" "${dest_dir}/startup_gcc.c"
+  cp "${src_dir}/hello_world.ld" "${dest_dir}/apollo3evb.ld"
+  sed -i -e '131s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
+  sed -i -e 's/main/_main/g' "${dest_dir}/startup_gcc.c"
+  sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '3s/startup_gnu/startup_gcc/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '6s/am_reset_isr/Reset_Handler/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "${dest_dir}/apollo3evb.ld"
+  echo "Finished preparing Apollo3 files"
+}
+
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
 download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
+download_and_extract "${SIFIVE_FE310_LIB_URL}" "${DOWNLOADS_DIR}/sifive_fe310_lib"
+download_and_extract "${RISCV_TOOLCHAIN_URL}" "${DOWNLOADS_DIR}/riscv_toolchain"
+download_and_extract "${AP3_URL}" "${DOWNLOADS_DIR}/apollo3_ext"
+patch_apollo3_sdk "${DOWNLOADS_DIR}/Apollo3-SDK-2018.08.13"
+download_and_extract "${CUST_CMSIS_URL}" "${DOWNLOADS_DIR}/CMSIS_ext"
+download_and_extract "${GCC_EMBEDDED_URL}" "${DOWNLOADS_DIR}/gcc_embedded"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
new file mode 100644
index 0000000000000000000000000000000000000000..c20cc5cfb33d624c9c6c9ab620f3bc6b7098166c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
@@ -0,0 +1,119 @@
+
+# Reverses a space-separated list of words.
+reverse = $(if $(1),$(call reverse,$(wordlist 2,$(words $(1)),$(1)))) $(firstword $(1))
+
+# Look for platform or target-specific implementation files to replace reference
+# implementations with, given a tag. These are expected to occur in subfolders
+# of a directory where a reference implementation exists, and have the same
+# interface and header file. For example,
+# tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+# defines a module for supplying audio data, but since no platform or OS can be
+# presumed, it just always returns zeroes for its samples. The MacOS-specific
+# tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
+# has an implementation that relies on CoreAudio, and there are equivalent
+# versions for other operating systems.
+# The specific implementation yielded by the first tag in the list that produces
+# a match is returned, else the reference version if none of the tags produce a
+# match.
+# All lists of source files are put through this substitution process with the
+# tags of their target OS and architecture, so that implementations can be added
+# by simply placing them in the file tree, with no changes to the build files
+# needed.
+# One confusing thing about this implementation is that we're using wildcard to
+# act as a 'does file exist?' function, rather than expanding an expression.
+# Wildcard will return an empty string if given a plain file path with no actual
+# wildcards, if the file doesn't exist, so taking the first word of the list
+# between that and the reference path will pick the specialized one if it's
+# available.
+substitute_specialized_implementation = \
+  $(firstword $(wildcard $(dir $(1))$(2)/$(notdir $(1))) $(wildcard $(1)))
+substitute_specialized_implementations = \
+  $(foreach source,$(1),$(call substitute_specialized_implementation,$(source),$(2)))
+# Here we're first looking for specialized implementations in ref_dir/$(TAG1)
+# and then ref_dir/$(TAG2), etc, before falling back to ref_dir's
+# implementation.
+# The argument to this function should be a list of space-separated file paths,
+# with any wildcards already expanded.
+define specialize_on_tags
+$(if $(2),$(call substitute_specialized_implementations,$(call specialize_on_tags,$(1),$(wordlist 2,$(words $(2)),$(2))),$(firstword $(2))),$(1))
+endef
+# The entry point that most targets should use to find implementation-specific
+# versions of their source files. The only argument is a list of file paths.
+specialize = $(call specialize_on_tags,$(1),$(strip $(call reverse,$(ALL_TAGS))))
+
+# Creates a set of rules to build a standalone makefile project for an
+# executable, including all of the source and header files required in a
+# separate folder and a simple makefile.
+# Arguments are:
+# 1 - Project type (make, mbed, etc).
+# 2 - Project file template name.
+# 3 - Name of executable.
+# 4 - List of C/C++ source files needed to build the target.
+# 5 - List of C/C++ header files needed to build the target.
+# 6 - Linker flags required.
+# 7 - C++ compilation flags needed.
+# Calling eval on the output will create a <Name>_makefile target that you
+# can invoke to create the standalone project.
+define generate_project
+$(PRJDIR)$(3)/$(1)/%: %
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+$(PRJDIR)$(3)/$(1)/third_party/%: tensorflow/lite/experimental/micro/tools/make/downloads/%
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/experimental/micro/tools/make/templates/%.tpl
+	@mkdir -p $$(dir $$@)
+	sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
+	sed -E 's#\%\{EXECUTABLE\}\%#$(3)#g' | \
+	sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
+	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' > $$@
+
+generate_$(3)_$(1)_project: $(addprefix $(PRJDIR)$(3)/$(1)/, $(4) $(5) $(2))
+endef
+
+# Specialized version of generate_project for TF Lite Micro test targets that
+# automatically includes standard library files, so you just need to pass the
+# test name and any extra source files required.
+# Arguments are:
+# 1 - Name of test.
+# 2 - C/C++ source files implementing the test.
+# 3 - C/C++ header files needed for the test.
+# Calling eval on the output will create targets that you can invoke to
+# generate the standalone project.
+define generate_microlite_projects
+$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
+$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
+endef
+
+
+# Handles the details of generating a binary target, including specializing
+# for the current platform, and generating project file targets.
+# Arguments are:
+# 1 - Name of test.
+# 2 - C/C++ source files implementing the test.
+# 3 - C/C++ header files needed for the test.
+# Calling eval on the output will create the targets that you need.
+define microlite_test
+$(1)_LOCAL_SRCS := $(2)
+$(1)_LOCAL_SRCS := $$(call specialize,$$($(1)_LOCAL_SRCS))
+ALL_SRCS += $$($(1)_LOCAL_SRCS)
+$(1)_LOCAL_HDRS := $(3)
+$(1)_LOCAL_OBJS := $$(addprefix $$(OBJDIR), \
+$$(patsubst %.cc,%.o,$$(patsubst %.c,%.o,$$($(1)_LOCAL_SRCS))))
+$(1)_BINARY := $$(BINDIR)$(1)
+$$($(1)_BINARY): $$($(1)_LOCAL_OBJS) $$(MICROLITE_LIB_PATH)
+	@mkdir -p $$(dir $$@)
+	$$(CXX) $$(CXXFLAGS) $$(INCLUDES) \
+	-o $$($(1)_BINARY) $$($(1)_LOCAL_OBJS) \
+	$$(LIBFLAGS) $$(MICROLITE_LIB_PATH) $$(LDFLAGS) $$(MICROLITE_LIBS)
+$(1): $$($(1)_BINARY)
+$(1)_bin: $$($(1)_BINARY).bin
+test_$(1): $$($(1)_BINARY)
+	$$(TEST_SCRIPT) $$($(1)_BINARY) '~~~ALL TESTS PASSED~~~'
+ifneq (,$(findstring _test,$(1)))
+  MICROLITE_TEST_TARGETS += test_$(1)
+endif
+$(eval $(call generate_microlite_projects,$(1),$(call specialize,$(2)),$(3)))
+endef
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cb646e29d9ab950e7697b284cc5a87a302397219
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore
@@ -0,0 +1,4 @@
+startup_gcc.c
+am_*.c
+libam*.a
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
deleted file mode 100644
index bd238ac55f96dbe62aa16a92180a5995ce395945..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <stdint.h>
-#include "am_mcu_apollo.h"              // Defines AM_CMSIS_REGS
-#include "am_bsp.h"
-#include "am_util.h"
-
-//*****************************************************************************
-//
-// The entry point for the application.
-//
-//*****************************************************************************
-extern int main(int argc, char**argv);
-
-void DebugLog(const char* s) { am_util_stdio_printf( "%s", s); }
-void DebugLogInt32(int32_t i) { am_util_stdio_printf( "%d", i); }
-void DebugLogUInt32(uint32_t i) { am_util_stdio_printf( "%d", i); }
-void DebugLogHex(uint32_t i) { am_util_stdio_printf( "0x%8x", i); }
-void DebugLogFloat(float i) { am_util_stdio_printf( "%f", i); }
-
-int _main(void)
-{
-    am_util_id_t sIdDevice;
-    uint32_t ui32StrBuf;
-
-    //
-    // Set the clock frequency.
-    //
-    am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
-
-    //
-    // Set the default cache configuration
-    //
-    am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
-    am_hal_cachectrl_enable();
-
-    //
-    // Configure the board for low power operation.
-    //
-    am_bsp_low_power_init();
-
-    //
-    // Initialize the printf interface for UART output
-    //
-    am_bsp_uart_printf_enable();
-
-    //
-    // Print the banner.
-    //
-    am_util_stdio_terminal_clear();
-    am_util_stdio_printf("Hello World!\n\n");
-
-    //
-    // Print the device info.
-    //
-    am_util_id_device(&sIdDevice);
-    am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
-    am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
-
-
-    am_util_stdio_printf("Qualified: %s\n",
-                         sIdDevice.sMcuCtrlDevice.ui32Qualified ?
-                         "Yes" : "No");
-
-    am_util_stdio_printf("Device Info:\n"
-                         "\tPart number: 0x%08X\n"
-                         "\tChip ID0:    0x%08X\n"
-                         "\tChip ID1:    0x%08X\n"
-                         "\tRevision:    0x%08X (Rev%c%c)\n",
-                         sIdDevice.sMcuCtrlDevice.ui32ChipPN,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipID0,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipID1,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipRev,
-                         sIdDevice.ui8ChipRevMaj, sIdDevice.ui8ChipRevMin );
-
-    //
-    // If not a multiple of 1024 bytes, append a plus sign to the KB.
-    //
-    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024 ) ? '+' : 0;
-    am_util_stdio_printf("\tFlash size:  %7d (%d KB%s)\n",
-                         sIdDevice.sMcuCtrlDevice.ui32FlashSize,
-                         sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024,
-                         &ui32StrBuf);
-
-    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024 ) ? '+' : 0;
-    am_util_stdio_printf("\tSRAM size:   %7d (%d KB%s)\n\n",
-                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
-                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024,
-                         &ui32StrBuf);
-
-    //
-    // Print the compiler version.
-    //
-    am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
-#ifdef AM_PART_APOLLO3
-    am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
-    am_util_stdio_printf("HAL SDK version: %d.%d.%d\n",
-                         g_ui32HALversion.s.Major,
-                         g_ui32HALversion.s.Minor,
-                         g_ui32HALversion.s.Revision);
-    am_util_stdio_printf("HAL compiled with %s-style registers\n",
-                         g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
-
-    am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice, &ui32StrBuf);
-    am_hal_security_info_t secInfo;
-    char sINFO[32];
-    uint32_t ui32Status;
-#endif // AM_PART_APOLLO3
-    main(0, NULL);
-}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7ef23095022b24922b28580ce3e8d1c76b81086f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+AP3_DIR="tensorflow/lite/experimental/micro/tools/make/downloads/Apollo3-SDK-2018.08.13"
+if [ ! -d $AP3_DIR ]; then
+    echo "Apollo 3 SDK does not exist"
+    echo "Either the SDK has not been downloaded, or this script is not being run from the root of the repository"
+else
+    DEST_DIR="tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb"
+    cp "$AP3_DIR/boards/apollo3_evb/examples/hello_world/gcc/startup_gcc.c" "$DEST_DIR"
+    cp "$AP3_DIR/boards/apollo3_evb/examples/hello_world/gcc/hello_world.ld" "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '131s/1024/1024\*20/g' "$DEST_DIR/startup_gcc.c"
+    sed -i -e 's/main/_main/g' "$DEST_DIR/startup_gcc.c"
+    sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '3s/startup_gnu/startup_gcc/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '6s/am_reset_isr/Reset_Handler/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "$DEST_DIR/apollo3evb.ld"
+    echo "Finished preparing Apollo3 files"
+    
+
+fi
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
index f722204feaded521945cd269b36576e560dac3e4..6ed402a623188a7c39a007a1cfd7dbc67b775103 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
@@ -1,13 +1,14 @@
 # Settings for apollo3 evb platforms.
 ifeq ($(TARGET), apollo3evb)
+  export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH)
   TARGET_ARCH := cortex-m4
   TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
   # Download the Ambiq Apollo3 SDK and set this variable to find the header
   # files:
-  APOLLO3_SDK := /ssd/ambiq/AmbiqSuite\ SDK\ for\ Apollo3/Apollo3-SDK-2018.08.13/
+  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/Apollo3-SDK-2018.08.13
   # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions
   # with the softfp interfaces.
-  GCC_ARM := /ssd/gnu_arm_toolchain/gcc-arm-none-eabi-7-2018-q2-update/
+  GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/
 
   PLATFORM_FLAGS = \
     -DPART_apollo3 \
@@ -16,6 +17,8 @@ ifeq ($(TARGET), apollo3evb)
     -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
     -DTF_LITE_STATIC_MEMORY \
     -DTF_LITE_MCU_DEBUG_LOG \
+    -D __FPU_PRESENT=1 \
+    -DARM_MATH_CM4 \
     -fno-rtti \
     -fmessage-length=0 \
     -fno-exceptions \
@@ -41,8 +44,8 @@ ifeq ($(TARGET), apollo3evb)
     -fomit-frame-pointer \
     -fpermissive \
     -nostdlib \
-    -g \
-    -Os
+    -ggdb \
+    -O3
   CXXFLAGS += $(PLATFORM_FLAGS)
   CCFLAGS += $(PLATFORM_FLAGS)
   LDFLAGS += \
@@ -52,17 +55,18 @@ ifeq ($(TARGET), apollo3evb)
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
     -fno-exceptions \
     -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
-    -Wl,-T,$(MAKEFILE_DIR)/targets/apollo3evb/apollo3evb.ld \
+    -Wl,-T,$(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/apollo3evb.ld \
     -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
   BUILD_TYPE := micro
-  # The apollo3evb libs should be copied from the SDK after building them.
   MICROLITE_LIBS := \
-    $(MAKEFILE_DIR)/targets/apollo3evb/libam_bsp.a \
-    $(MAKEFILE_DIR)/targets/apollo3evb/libam_hal.a \
+    $(APOLLO3_SDK)/boards/apollo3_evb/bsp/gcc/bin/libam_bsp.a \
+    $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \
     $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
     -lm
   INCLUDES += \
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/ \
     -I$(GCC_ARM)/arm-none-eabi/ \
     -I$(APOLLO3_SDK)/mcu/apollo3/ \
     -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \
@@ -79,26 +83,37 @@ ifeq ($(TARGET), apollo3evb)
   # setting clock speed, default uart setups, etc. and an implementation
   # of the DebugLog interfaces.
   MICROLITE_CC_SRCS += \
-    $(MAKEFILE_DIR)/targets/apollo3evb/startup_gcc.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/_main.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_delay.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_faultisr.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_id.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_stdio.c
+    $(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/startup_gcc.c \
+    $(APOLLO3_SDK)/utils/am_util_delay.c \
+    $(APOLLO3_SDK)/utils/am_util_faultisr.c \
+    $(APOLLO3_SDK)/utils/am_util_id.c \
+    $(APOLLO3_SDK)/utils/am_util_stdio.c
+
+  CMSIS_SRC_DIR := tensorflow/lite/experimental/micro/tools/make/downloads/cmsis/CMSIS/DSP/Source
+  CMSIS_SRCS := \
+  $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \
+  $(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \
+  $(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \
+  $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \
+  $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c
+
+  AP3_EXT_MICRO_DIR := $(MAKEFILE_DIR)/downloads/apollo3_ext
+  AP3_MICRO_DIR := tensorflow/lite/experimental/micro/examples/micro_speech/apollo3
+  CMSIS_DIR := tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS
+  CMSIS_EXT_DIR := $(MAKEFILE_DIR)/downloads/CMSIS_ext
+
+  MICRO_SPEECH_TEST_SRCS += \
+    $(AP3_MICRO_DIR)/_main.c
 
   TEST_SCRIPT := tensorflow/lite/experimental/log_test/test_apollo3evb_binary.sh
-  # These are tests that don't currently work on the blue pill.
+  # These are tests that don't currently work on the Apollo3 board.
   EXCLUDED_TESTS := \
     tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
     tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
   MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 
-# These are microcontroller-specific rules for converting the ELF output
-# of the linker into a binary image that can be loaded directly.
-OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
-
-$(BINDIR)/%.bin: $(BINDIR)/%
-	@mkdir -p $(dir $@)
-	$(OBJCOPY) $< $@ -O binary
-
 endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
index 5e3105a109b99b061a35b9c6f6c7c5f3681e2b45..b344f844bca7e7045eafaba141dc5e6371c3f496 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
@@ -47,7 +47,10 @@ ifeq ($(TARGET), bluepill)
   MICROLITE_CC_SRCS += \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc)
-    TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
+  EXCLUDED_SRCS := \
+    $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c
+  MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS))
+  TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
   # These are tests that don't currently work on the blue pill.
   EXCLUDED_TESTS := \
     tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3cb74a72437be8017527c0ea05a1b82eb1a4ac9e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
@@ -0,0 +1,32 @@
+Running The Micro Speech Example On Eta Compute's ECM3531EVB
+
+This code will enable you to compile and execute the Tensorflow Lite Micro Speech Example on Eta Computes's low power ECM3531 chip.
+
+
+GETTING STARTED:
+
+1. Download the Tensorflow code from Github and follow instructions there to download other dependencies.  
+
+2. Download the Eta Compute SDK, version 0.0.17.
+
+3. Install the Arm compiler arm-none-eabi-gcc, version = arm-none-eabi-gcc (GNU Tools for Arm Embedded Processors 7-2018-q2-update) 7.3.1 20180622 (release) [ARM/embedded-7-branch revision 261907]
+
+4. Edit the file   tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc  so that the variable ETA_SDK points to the location where the Eta Compute SDK is installed, and the variable GCC_ARM points to the Arm compiler.
+
+5. Compile the code with the command   "make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531 test".  This will create the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test.
+
+6. Connect the board to the host computer, start PuTTY (Connection type = Serial, Speed = 11520, Data bits = 8, Stop bits = 1,  Parity = None), and load the executable with ocd.  A sample script for loading the image is provided in tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program.  
+
+The following  will be printed on the Uart:
+
+Testing TestInvoke
+Ran successfully
+
+/ tests passed
+~~~ALL TESTS PASSED~~~
+
+
+
+CONTACT INFORMATION:
+
+Contact info@etacompute.com  for more information on obtaining the Eta Compute SDK and evalution board.
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..2764f3ba50de699fa72717585114369cf833d76e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
@@ -0,0 +1,95 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This is file contains the entry point to the application and is called after
+   startup.
+   The GPIOs, Uart and timer are intialized and Tensorflow is invoked with the
+   call to main().
+   Tensorflow will print out if the tests have passed or failed and the
+   execution time is also
+   printed. */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "eta_bsp.h"
+#include "eta_chip.h"
+#include "eta_csp.h"
+#include "eta_csp_buck.h"
+#include "eta_csp_gpio.h"
+#include "eta_csp_io.h"
+#include "eta_csp_pwr.h"
+#include "eta_csp_rtc.h"
+#include "eta_csp_socctrl.h"
+#include "eta_csp_sys_clock.h"
+#include "eta_csp_timer.h"
+#include "eta_csp_uart.h"
+
+tUart g_sUart0 = {eUartNum0, eUartBaud115200};
+tUart g_sUart1 = {eUartNum1, eUartBaud115200};
+
+int init_main(int);
+void EtaPrintExecutionTime(uint64_t);
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { EtaCspIoPrintf("%s", s); }
+void DebugLogInt32(int32_t i) { EtaCspIoPrintf("%d", i); }
+void DebugLogUInt32(uint32_t i) { EtaCspIoPrintf("%d", i); }
+void DebugLogHex(uint32_t i) { EtaCspIoPrintf("0x%8x", i); }
+void DebugLogFloat(float i) { EtaCspIoPrintf("%f", i); }
+
+int _main(void) {
+  uint64_t time_ms;
+
+  EtaCspInit();      // initialize csp registers
+  EtaCspGpioInit();  // initialize gpios
+  EtaCspUartInit(&g_sUart1, eUartNum0, eUartBaud115200,
+                 eUartFlowControlHardware);  // initialize Uart
+  EtaCspBuckInit(ETA_BSP_VDD_IO_SETTING, eBuckAo600Mv, eBuckM3Frequency60Mhz,
+                 eBuckMemVoltage900Mv);  // set M3 freq
+  EtaCspTimerInitMs();                   // start timer
+  main(0, NULL);  // Call to Tensorflow; this will print if test was successful.
+  time_ms = EtaCspTimerCountGetMs();  // read time
+  EtaPrintExecutionTime(time_ms);     // print execution time
+}
+
+void EtaPrintExecutionTime(uint64_t time_ms) {
+  uint8_t c;
+  int k1;
+  char time_string[] = "00000";
+
+  EtaCspIoPrintf("Execution time (msec) = ");
+  if (time_ms < 100000)  // Convert time to a string
+  {
+    for (k1 = 0; k1 < 5; k1++) {
+      c = time_ms % 10;
+      time_ms = time_ms / 10;
+      time_string[k1] = (char)(0x30 + c);
+    }
+    for (k1 = 4; k1 >= 0; k1--) {  // print out 1 char at a time
+      EtaCspUartPutc(&g_sUart1, time_string[k1]);
+    }
+  } else {
+    EtaCspIoPrintf("Execution time exceeds 100 sec\n");
+  }
+  EtaCspIoPrintf("\n\n");
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
new file mode 100644
index 0000000000000000000000000000000000000000..af34f988f2d04a0c1c87f20d6058df560db7e2c5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/*
+ * linker script for use with ECM3531
+ * All sections must map to 128KBytes of SRAM beginning at 0x10000000
+ *
+ */
+
+ /*
+  * Indicate to the linker the entry point.
+  */
+ENTRY(ResetISR)
+
+/*
+ *   SRAM is at 0x10000000 of length 0x00020000
+ */
+MEMORY
+{
+    SRAM (RWX) : ORIGIN = 0x10000000, LENGTH = 0x00020000
+}
+
+SECTIONS
+{
+    .text :
+    {
+        _text = .;
+        KEEP(*(.vectors))
+        . = ALIGN(0x4);
+        *(.text*)
+        . = ALIGN(0x4);
+        *(.rodata*)
+        . = ALIGN(0x4);
+        _etext = .;
+    } > SRAM= 0
+    .dummy :
+    {
+        . = ALIGN(0x4);
+        _eftext = .;
+    } > SRAM
+    .datax :
+    {
+        _datax = .;
+        KEEP(*(.mainStack))
+        . += 12288;
+        _edatax = .;
+        _stack_top = .;
+        . += 4;
+    } > SRAM
+    .data :
+       AT (ADDR(.text) + SIZEOF(.text) ) 
+    {
+        _data = .;
+        *(.data*)
+        KEEP(*(.mainHeap))
+        _edata = .;
+    } > SRAM
+
+    .bss :
+    {
+        _bss = .;
+        *(.bss*)
+        *(COMMON)
+        _ebss = .;
+    } > SRAM
+    .ARM.exidx :
+    {
+       *(.ARM.exidx*)
+    }
+
+}
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
new file mode 100755
index 0000000000000000000000000000000000000000..ac1f49962a61756ccbde02300c612bd7b4f48e84
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram to load the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test into SRAM
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+# git path to project elf file
+cur_dir = os.getcwd()
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech_test'
+print("elf_file = ",elf_file)
+
+
+# use these to download and run the elf fle
+ocd_commands = ["halt\n",
+                "load_image {}\n".format(elf_file),
+                "mww 0x1001FFF8 0xDEADBEEF\n",
+                "mww 0x1001FFFC 0xC369A517\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands: 
+    print(x)
+    send_ocd_cmd(x)
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c
new file mode 100644
index 0000000000000000000000000000000000000000..32d817ba4882f9123a9ed6321f9339355d82db5c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c
@@ -0,0 +1,432 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is called at power up time to initialize the chip.  It in turn
+calls _main() which is the entry point into the application */
+
+#include <stdint.h>
+#include "eta_chip.h"
+#include "memio.h"
+
+#ifndef NULL
+#define NULL (0)
+#endif
+
+//*****************************************************************************
+//
+// Macro for hardware access, both direct and via the bit-band region.
+//
+//*****************************************************************************
+
+int _main(int argc, char *argv[]);
+void set_vtor(void);
+void *startup_get_my_pc(void);
+
+//*****************************************************************************
+// Forward DECLS for interrupt service routines (ISR)
+//*****************************************************************************
+extern void ResetISR(void) __attribute__((weak, alias("default_ResetISR")));
+extern void NmiSR(void) __attribute__((weak, alias("default_NmiSR")));
+extern void FaultISR(void) __attribute__((weak, alias("default_FaultISR")));
+
+extern void DebugMonitor_ISR(void)
+    __attribute__((weak, alias("default_DebugMonitor_ISR")));
+extern void SVCall_ISR(void) __attribute__((weak, alias("default_SVCall_ISR")));
+extern void PENDSV_ISR(void) __attribute__((weak, alias("default_PENDSV_ISR")));
+
+extern void SYSTICK_ISR(void)
+    __attribute__((weak, alias("default_SYSTICK_ISR")));
+
+extern void GPIO0_ISR(void) __attribute__((weak, alias("default_GPIO0_ISR")));
+extern void GPIO1_ISR(void) __attribute__((weak, alias("default_GPIO1_ISR")));
+extern void TIMER0_ISR(void) __attribute__((weak, alias("default_TIMER0_ISR")));
+extern void TIMER1_ISR(void) __attribute__((weak, alias("default_TIMER1_ISR")));
+extern void UART0_ISR(void) __attribute__((weak, alias("default_UART0_ISR")));
+extern void UART1_ISR(void) __attribute__((weak, alias("default_UART1_ISR")));
+extern void SPI0_ISR(void) __attribute__((weak, alias("default_SPI0_ISR")));
+extern void SPI1_ISR(void) __attribute__((weak, alias("default_SPI1_ISR")));
+extern void I2C0_ISR(void) __attribute__((weak, alias("default_I2C0_ISR")));
+extern void I2C1_ISR(void) __attribute__((weak, alias("default_I2C1_ISR")));
+extern void RTC0_ISR(void) __attribute__((weak, alias("default_RTC0_ISR")));
+extern void RTC1_ISR(void) __attribute__((weak, alias("default_RTC1_ISR")));
+extern void DSP_ISR(void) __attribute__((weak, alias("default_DSP_ISR")));
+extern void ADC_ISR(void) __attribute__((weak, alias("default_ADC_ISR")));
+extern void SW0_ISR(void) __attribute__((weak, alias("default_SW0_ISR")));
+extern void SW1_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void PWM_ISR(void) __attribute__((weak, alias("default_PWM_ISR")));
+extern void WDT_ISR(void) __attribute__((weak, alias("default_WDT_ISR")));
+extern void RTC_TMR_ISR(void)
+    __attribute__((weak, alias("default_RTC_TMR_ISR")));
+
+extern void SW2_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW3_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW4_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW5_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW6_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+
+extern void IntDefaultHandler(void) __attribute__((weak));
+
+//*****************************************************************************
+//
+// Reserve space for the system stack.
+//
+//*****************************************************************************
+extern uint32_t _stack_top;
+//__attribute__ ((section(".mainStack"), used))
+// static uint32_t pui32Stack[2048];
+#define STARTUP_STACK_TOP (&_stack_top)
+
+//*****************************************************************************
+// VECTOR TABLE
+//*****************************************************************************
+__attribute__((section(".vectors"), used)) void (*const gVectors[])(void) = {
+    //(void (*)(void))((uint32_t)pui32Stack + sizeof(pui32Stack)), // Stack
+    // pointer
+    (void *)STARTUP_STACK_TOP,
+    ResetISR,           // Reset handler
+    NmiSR,              // The NMI handler
+    FaultISR,           // The hard fault handler
+    IntDefaultHandler,  // 4 The MPU fault handler
+    IntDefaultHandler,  // 5 The bus fault handler
+    IntDefaultHandler,  // 6 The usage fault handler
+    0,                  // 7 Reserved
+    0,                  // 8 Reserved
+    0,                  // 9 Reserved
+    0,                  // 10 Reserved
+    SVCall_ISR,         // 11 SVCall handler
+    DebugMonitor_ISR,   // 12 Debug monitor handler
+    0,                  // 13 Reserved
+    PENDSV_ISR,         // 14 The PendSV handler
+    SYSTICK_ISR,        // 15 The SysTick handler
+
+    // external interrupt service routines (ISR)
+    GPIO0_ISR,    // 16 GPIO Port A            [ 0]
+    GPIO1_ISR,    // 17 GPIO Port B            [ 1]
+    TIMER0_ISR,   // 18 Timer 0                [ 2]
+    TIMER1_ISR,   // 19 Timer 1                [ 3]
+    UART0_ISR,    // 20 UART 0                 [ 4]
+    UART1_ISR,    // 21 UART 1                 [ 5]
+    SPI0_ISR,     // 22 SPI0                   [ 6]
+    SPI1_ISR,     // 23 SPI1                   [ 7]
+    I2C0_ISR,     // 24 I2C 0                  [ 8]
+    I2C1_ISR,     // 25 I2C 1                  [ 9]
+    RTC0_ISR,     // 26 RTC 0                  [10]
+    RTC1_ISR,     // 27 RTC 1                  [11]
+    DSP_ISR,      // 28 DSP MAILBOX            [12]
+    ADC_ISR,      // 29 ADC                    [13]
+    PWM_ISR,      // 32 PWM                    [14]
+    WDT_ISR,      // 33 WDT                    [15]
+    RTC_TMR_ISR,  // 34 RTC                    [16]
+
+    SW0_ISR,  // 30 Software Interrupt 0   [17]
+    SW1_ISR,  // 31 Software Interrupt 1   [18]
+    SW2_ISR,  // 35 Software Interrupt 2   [19]
+    SW3_ISR,  // 36 Software Interrupt 3   [20]
+    SW4_ISR,  // 37 Software Interrupt 4   [21]
+    SW5_ISR,  // 38 Software Interrupt 5   [22]
+    SW6_ISR,  // 39 Software Interrupt 6   [23]
+
+};
+
+//*****************************************************************************
+//
+// The following are constructs created by the linker, indicating where the
+// the "data" and "bss" segments reside in memory.  The initializers for the
+// for the "data" segment resides immediately following the "text" segment.
+//
+//*****************************************************************************
+extern uint32_t _etext;
+extern uint32_t _eftext;
+extern uint32_t _data;
+extern uint32_t _edata;
+extern uint32_t _bss;
+extern uint32_t _ebss;
+
+//
+// And here are the weak interrupt handlers.
+//
+void default_NmiSR(void) {
+  __asm("    movs     r0, #2");
+  while (1) {
+  }
+}
+
+void default_FaultISR(void) {
+  __asm("    movs     r0, #3");
+  MEMIO32(0x1001FFF0) = 0xbad0beef;  // near the top of 128KB of SRAM
+  MEMIO32(0x1001FFF4) = 0xbad1beef;  // near the top of 128KB of SRAM
+  while (1) {
+    __asm("    BKPT      #1");
+  }
+}
+
+void IntDefaultHandler(void) {
+  __asm("    movs     r0, #20");
+  while (1) {
+    __asm("    BKPT      #1");
+  }
+}
+
+void default_SVCall_ISR(void) {
+  __asm("    movs     r0, #11");
+  while (1) {
+    __asm("    BKPT      #11");
+  }
+}
+
+void default_DebugMonitor_ISR(void) {
+  __asm("    movs     r0, #12");
+  while (1) {
+    __asm("    BKPT      #12");
+  }
+}
+
+void default_PENDSV_ISR(void) {
+  __asm("    movs     r0, #14");
+  while (1) {
+    __asm("    BKPT      #14");
+  }
+}
+
+void default_SYSTICK_ISR(void) {
+  __asm("    movs     r0, #15");
+  while (1) {
+    __asm("    BKPT      #15");
+  }
+}
+
+//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+void default_SPI0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SPI1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_I2C0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_I2C1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_UART0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_UART1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_GPIO0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_GPIO1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_ADC_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_DSP_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_TIMER0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_TIMER1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_PWM_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_WDT_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC_TMR_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SW0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SW1_ISR(void) {
+  __asm("    movs     r0, #17");
+  while (1) {
+    __asm("    BKPT      #17");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Reset ISR
+////////////////////////////////////////////////////////////////////////////////
+void default_ResetISR(void) {
+  int rc;
+  bool bRunningInFlash;
+
+  set_vtor();
+
+  bRunningInFlash =
+      ((((uint32_t)startup_get_my_pc()) & 0xFF000000) == 0x01000000);
+
+  if ((!REG_RTC_AO_CSR.BF.WARM_START_MODE) || bRunningInFlash) {
+    //
+    //  Copy any .ro bytes to .data so that initialized global variables
+    //  are actually properly initialized.
+    //
+    __asm(
+        "    ldr      r0, =_eftext\n"
+        "    ldr      r1, =_data\n"
+        "    ldr      r2, =_edata\n"
+        "ro_copy_loop:\n"
+        "    ldr      r3, [r0], #4\n"
+        "    str      r3, [r1], #4\n"
+        "    cmp      r1, r2\n"
+        "    ble      ro_copy_loop\n");
+
+    //
+    // Zero fill the .bss section.
+    //
+    __asm(
+        "    ldr      r0, =_bss\n"
+        "    ldr      r1, =_ebss\n"
+        "    mov      r2, #0\n"
+        "bss_zero_loop:\n"
+        "    cmp      r0, r1\n"
+        "    it       lt\n"
+        "    strlt    r2, [r0], #4\n"
+        "    blt      bss_zero_loop\n");
+  }
+
+  //
+  // call the main routine barefoot, i.e. without the normal CRTC0 entry
+  // point.
+  //
+  rc = _main(0, NULL);
+
+  //
+  //  If main ever returns, trap it here and wake up the debugger if it is
+  //  connected.
+  //
+  while (1)  // for FPGA/real chip use
+  {
+    __asm("    BKPT      #1");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get my PC
+////////////////////////////////////////////////////////////////////////////////
+void *startup_get_my_pc(void) {
+  void *pc;
+  asm("mov %0, pc" : "=r"(pc));
+  return pc;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get my SP
+////////////////////////////////////////////////////////////////////////////////
+void *startup_get_my_sp(void) {
+  void *sp;
+  asm("mov %0, sp" : "=r"(sp));
+  return sp;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Set VTOR based on PC
+////////////////////////////////////////////////////////////////////////////////
+void set_vtor(void) {
+  __asm(
+      "    ldr      r0, =0xe000ed08\n"
+      "    ldr      r1, =0xFF000000\n"
+      "    mov      r2, lr\n"
+      "    and      r1, r2\n"
+      "    str      r1, [r0]\n");
+
+  return;
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..baae58f87e1761c978a87256fda8b7e90edb79e5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
@@ -0,0 +1,103 @@
+# Settings for eta ecm3531 platform
+ifeq ($(TARGET), ecm3531)
+  TARGET_ARCH := cortex-m3
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+  ETA_SDK := /home/hari/TensaiSDK-v0.0.17/soc/
+  GCC_ARM := /home/hari/Downloads/gcc-arm-none-eabi-7-2018-q2-update/
+
+  ifeq ($(wildcard $(ETA_SDK)),)
+    $(error Path to ETA SDK is not set (ETA_SDK))
+  endif
+
+  ifeq ($(wildcard $(GCC_ARM)),)
+    $(error Path to gcc arm compiler is not set (GCC_ARM))
+  endif
+
+  PLATFORM_FLAGS = \
+    -DFIRMWARE_BUILD \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -DTF_LITE_STATIC_MEMORY \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -fno-rtti \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -mcpu=cortex-m3 \
+    -mthumb \
+    -mlittle-endian \
+    -mno-unaligned-access \
+    -std=gnu++11 \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -fpermissive \
+    -nostdlib \
+    -g \
+    -Os
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+# Adding the --specs=nano.specs flag causes the linker to use libc_nano.a
+# instead of libc.a.  This gets rid of lots of errors with various pieces
+# of the exception unwinding code not being found.  Not clear why it is
+# trying to link in this code to begin with, though.
+  LDFLAGS += \
+    -mthumb -mcpu=cortex-m3 \
+    -nostartfiles -static \
+    -Wl,--gc-sections -Wl,--entry,ResetISR \
+    -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
+    -fno-exceptions \
+    -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
+    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.lds \
+    -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
+  BUILD_TYPE := micro
+  MICROLITE_LIBS := \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    -lm
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -I$(GCC_ARM)/arm-none-eabi/include/ \
+    -I$(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/inc/ \
+    -I$(ETA_SDK)/ecm3531/m3/reg/inc/ \
+    -I$(ETA_SDK)/ecm3531/m3/csp/inc/ \
+    -I$(ETA_SDK)/ecm3531/common/csp/inc/ \
+    -I$(ETA_SDK)/common/inc/  \
+    -I$(ETA_SDK)/../utils/inc/  \
+    -I$(ETA_SDK)/ecm3531/boards/eta_evb/eta_bsp/inc
+
+  # _main.c contains application and target specific initialization, like
+  # setting clock speed, default uart setups, etc. and an implementation
+  # of the DebugLog interfaces.
+  MICROLITE_CC_SRCS += \
+    $(MAKEFILE_DIR)/targets/ecm3531/startup.c \
+    $(MAKEFILE_DIR)/targets/ecm3531/_main.c \
+    $(wildcard $(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/src/*.c) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.c) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s) \
+
+  TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
+  # These are tests that don't currently work on the blue pill.
+  EXCLUDED_TESTS := \
+    tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
+    tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
+  MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+
+$(BINDIR)/%.bin: $(BINDIR)/%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
+endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..161ff34cdbda07768d33b9af45ed9655665b9bfd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc
@@ -0,0 +1,4 @@
+# Settings for mbed platforms.
+ifeq ($(TARGET), mbed)
+  TARGET_ARCH := cortex-m4
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/mcu_riscv_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/mcu_riscv_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..a259f68a3e0759baff04105cc6776212b49e2755
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/mcu_riscv_makefile.inc
@@ -0,0 +1,76 @@
+# Settings for RISCV 32-bit MCU toolchain.
+ifeq ($(TARGET), riscv32_mcu)
+  TARGET_ARCH := riscv32_mcu
+  TARGET_TOOLCHAIN_PREFIX := riscv64-unknown-elf-
+
+  PLATFORM_FLAGS = \
+    -march=rv32imac \
+    -mabi=ilp32 \
+    -mcmodel=medany \
+    -mexplicit-relocs \
+    -fno-builtin-printf \
+    -fno-exceptions \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -DTF_LITE_USE_GLOBAL_ROUND \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -Os
+
+  CXXFLAGS += $(PLATFORM_FLAGS) \
+    -fpermissive \
+    -fno-rtti \
+    --std=gnu++11
+
+  CCFLAGS += $(PLATFORM_FLAGS)
+
+  BUILD_TYPE := micro
+
+  INCLUDES += \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/include \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/drivers/ \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/freedom-e300-hifive1
+
+  MICROLITE_CC_SRCS += \
+    $(wildcard tensorflow/lite/experimental/micro/riscv32_mcu/*.cc)
+  MICRO_SPEECH_TEST_SRCS += \
+    $(wildcard $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/sys/*.c) \
+    $(wildcard $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/sys/*.cc) \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/misc/write_hex.c \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/stdlib/malloc.c \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/start.S \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/entry.S \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/freedom-e300-hifive1/init.c
+  LIBWRAP_SYMS := malloc free \
+                  open lseek read write fstat stat close link unlink \
+                  execve fork getpid kill wait \
+                  isatty times sbrk _exit puts
+
+  LDFLAGS += $(foreach s,$(LIBWRAP_SYMS),-Wl,--wrap=$(s))
+  LDFLAGS += $(foreach s,$(LIBWRAP_SYMS),-Wl,--wrap=_$(s))
+  LDFLAGS += -L. -Wl,--start-group -lc -Wl,--end-group
+  LDFLAGS += \
+   -T$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/freedom-e300-hifive1/flash.lds \
+   -nostartfiles \
+   -L$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env \
+   --specs=nano.specs
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+  OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+  $(BINDIR)/%.bin: $(BINDIR)/%
+		@mkdir -p $(dir $@)
+		$(OBJCOPY) $< $@ -O binary
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3b91eeff9fd5f2df06caa9a5f73b221815f9bbdf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
@@ -0,0 +1,10 @@
+# Settings for Mac OS platforms.
+ifeq ($(TARGET), osx)
+
+  PLATFORM_FLAGS = \
+    -DTF_LITE_DISABLE_X86_NEON
+
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..11dae1ea16c4ac990af07aebd8b5e59ff748fc2d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/AUDIO_DISCO_F746NG/#7046ce26b7ed
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..48dc1317072d537b3c61b0481b272855eb5941be
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/BSP_DISCO_F746NG/#df2ea349c37a
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..74d54f1ebee12d7773edfd1b073ddf17dd3791d6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
@@ -0,0 +1,26 @@
+SRCS := \
+%{SRCS}%
+
+OBJS := \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
+
+INCLUDES := \
+-I. \
+-I./third_party/gemmlowp \
+-I./third_party/flatbuffers/include
+
+CXXFLAGS += %{CXX_FLAGS}%
+
+LDFLAGS += %{LINKER_FLAGS}%
+
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+%.o: %.c
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+%{EXECUTABLE}% : $(OBJS)
+	$(CXX) $(LDFLAGS) $(OBJS) \
+	-o $@
+
+all: %{EXECUTABLE}%
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..7906a3226ab5b475d3f0f93f39111e8e21d39a40
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl
@@ -0,0 +1,29 @@
+# TensorFlow Lite Micro Make Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the make tool.
+
+## Usage
+
+To build this, run:
+
+```
+make
+```
+
+This should attempt to build the target locally on your platform, using the
+standard Makefile variables like CFLAGS, CC, CXX, and so on.
+
+## Project Generation
+
+See
+[tensorflow/lite/experimental/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+for details on how projects like this can be generated from the main source
+tree.
+
+## License
+
+TensorFlow's code is covered by the Apache2 License included in the repository,
+and third party dependencies are covered by their respective licenses, in the
+third_party folder of this package.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..2682236edf5b847a95aa07fa6d0e30c5a9a10c9a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl
@@ -0,0 +1,48 @@
+# TensorFlow Lite Micro Mbed Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the Mbed command line interface.
+
+## Usage
+
+To load the dependencies this code requires, run:
+
+```
+mbed config root .
+mbed deploy
+```
+
+TensorFlow requires C++ 11, so you'll need to update your profiles to reflect
+this. Here's a short Python command that does that:
+
+```
+python -c 'import fileinput, glob;
+for filename in glob.glob("mbed-os/tools/profiles/*.json"):
+  for line in fileinput.input(filename, inplace=True):
+    print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")'
+```
+
+With that setting updated, you should now be able to compile:
+
+```
+mbed compile -m auto -t GCC_ARM
+```
+
+If this works, it will give you a .bin file that you can flash onto the device
+you're targeting. For example, using a Discovery STM3246G board, you can deploy
+it by copying the bin to the volume mounted as a USB drive, just by dragging
+over the file.
+
+## Project Generation
+
+See
+[tensorflow/lite/experimental/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+for details on how projects like this can be generated from the main source
+tree.
+
+## License
+
+TensorFlow's code is covered by the Apache2 License included in the repository,
+and third party dependencies are covered by their respective licenses, in the
+third_party folder of this package.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..e2ccd7b81b28df938f19638f953b500c387594dc
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/SDRAM_DISCO_F746NG/#370f402a2219
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..69fff22f335953f62576d3408fbf15e24be5280f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl
@@ -0,0 +1 @@
+https://github.com/ARMmbed/mbed-os/#6a0a86538c0b9b2bfcc4583b1e2b7fea8f4e71e9
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..1c547369fb2784b27a9152ba4b7ade77c12211b0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl
@@ -0,0 +1,7 @@
+{
+    "config": {
+	"main-stack-size": {
+            "value": 65536
+	}
+    }
+}
diff --git a/tensorflow/lite/experimental/writer/BUILD b/tensorflow/lite/experimental/writer/BUILD
index 57ce63636714aa616cb50e04fe2c15210cc2eb1c..6aa81ff173408d3378285e8b12a7acf2d347a0a5 100644
--- a/tensorflow/lite/experimental/writer/BUILD
+++ b/tensorflow/lite/experimental/writer/BUILD
@@ -33,7 +33,6 @@ cc_library(
         "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
-        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/schema:schema_fbs_with_reflection",
     ],
 )
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index fa360a2f47e3dba34e05d2e32616821294f0e678..73813446138a9d3b7686012c84310d456f502894 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -69,6 +69,7 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteOneHotParams",
                                       "TfLiteLeakyReluParams",
                                       "TfLiteMirrorPaddingParams",
+                                      "TfLiteUniqueParams",
                                       nullptr};
 }  // namespace
 
@@ -156,10 +157,12 @@ class OpOptionData {
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
+    op_to_option_["UNIQUE"] = "";      // TODO(karimnosseir): UniqueOptions.
     // Manually specified mappings between ops and options (none)
     op_to_option_["EMBEDDING_LOOKUP"] =
         "";  // TODO(aselle): maybe something else.
     op_to_option_["FLOOR"] = "";
+    op_to_option_["CEIL"] = "";
     op_to_option_["HASHTABLE_LOOKUP"] =
         "";  // TODO(aselle): maybe something else.
     op_to_option_["LOGISTIC"] = "";
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index 0c79e79fddbd43b5a7340ea334ba4011a8c540ac..9c48e1e54d153b9ff043e43f75f25cc36398bc60 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -59,6 +59,10 @@ upper_tabs:
       - title: Post-training quantization example
         path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb
         status: external
+      - title: GPU delegate
+        path: /lite/performance/gpu
+      - title: Advanced GPU
+        path: /lite/performance/gpu_advanced
 
       - title: TF Mobile
         style: accordion
diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml
index 1b3f1d616ae953e3c6a659301d7a7dd6860dcbf2..7153b7c6f670375df8183a9269bb7eaf096ac0c2 100644
--- a/tensorflow/lite/g3doc/_index.yaml
+++ b/tensorflow/lite/g3doc/_index.yaml
@@ -4,7 +4,7 @@ description: <!--no description-->
 landing_page:
   custom_css_path: /site-assets/css/style.css
   rows:
-  - heading: TensorFlow Lite is for mobile and embedded devices.
+  - heading: TensorFlow Lite is for mobile and embedded devices
     description: >
       <p style="max-width: 75%;">
         TensorFlow Lite is the official solution for running machine learning
@@ -13,9 +13,6 @@ landing_page:
         iOS, and other operating systems.
       </p>
       <style>
-      .tfo-landing-row-heading {
-        padding-top: 0 !important;
-      }
       .tfo-landing-row-heading h2 {
         margin-top: 0 !important;
       }
@@ -71,58 +68,16 @@ landing_page:
           icon_name: lens
           foreground: theme
 
-  - classname: devsite-landing-row-logos tfo-landing-row-heading
-    heading: Companies using TensorFlow Lite
-    items:
-    - custom_image:
-        path: ./images/landing-page/photos_logo.png
-      path: https://www.photos.google.com
-    - custom_image:
-        path: ./images/landing-page/gboard_logo.png
-      path: https://play.google.com/store/apps/details?id=com.google.android.inputmethod.latin&hl=en_US
-    - custom_image:
-        path: ./images/landing-page/gmail_logo.png
-      path: https://www.google.com/gmail/
-    - custom_image:
-        path: ./images/landing-page/assistant_logo.png
-      path: https://assistant.google.com/
-
-  - classname: devsite-landing-row-logos
-    items:
-    - custom_image:
-        path: ./images/landing-page/vsco_logo.png
-      path: https://vsco.co
-    - custom_image:
-        path: ./images/landing-page/shazam_logo.png
-      path: https://www.shazam.com/
-    - custom_image:
-        path: ./images/landing-page/nest_logo.png
-      path: https://nest.com/
-    - custom_image:
-        path: ./images/landing-page/loseit_logo.png
-      path: https://www.loseit.com/
-
-  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
-    background: grey
-    items:
-    - description: >
-        <em>“TensorFlow Lite helped us introduce machine learning and AI into our
-        app in an easy and streamlined way. We could reduce the size of our
-        models while keeping the accuracy high. This helped us create an amazing
-        fishing experience for our users by allowing them to identify any fish
-        species with just a photo.”</em>
-      image_path: ./images/landing-page/fishbrain_logo_big.png
-
   - heading: How it works
     items:
-    - heading: Build
+    - heading: Pick a model
       icon:
         icon_name: build
       description: >
-        Build a new model or retrain an existing one, such as using transfer learning.
+        Pick a new model or retrain an existing one.
       buttons:
-      - label: Read the developer guide
-        path: /lite/devguide
+      - label: Pick
+        path: /lite/devguide#1_choose_a_model
         classname: button button-primary tfo-button-primary
     - heading: Convert
       icon:
@@ -131,18 +86,29 @@ landing_page:
         Convert a TensorFlow model into a compressed flat buffer with the
         TensorFlow Lite Converter.
       buttons:
-      - label: Read the converter guide
-        path: /lite/convert/
+      - label: Convert
+        path: /lite/devguide#2_convert_the_model_format
         classname: button button-primary tfo-button-primary
     - heading: Deploy
+      icon:
+        icon_name: settings_cell
+      description: >
+        Take the compressed <code>.tflite</code> file and load it into a mobile or embedded device.
+      buttons:
+      - label: Deploy
+        path: /lite/devguide#3_use_the_tensorflow_lite_model_for_inference_in_a_mobile_app
+        classname: button button-primary tfo-button-primary
+    - heading: Optimize
       icon:
         icon_name: bolt
       description: >
-        Take the compressed <code>.tflite</code> file and load it into a mobile
-        or embedded device.<br/>
-        See the <a href="#build-your-first-tensorflow-lite-app">tutorials below</a> to build an app.
+        [optional] Quantize by converting 32-bit floats to more efficient 8-bit integers or run on GPU.
+      buttons:
+      - label: Optimize
+        path: /lite/devguide#4_optimize_your_model_optional
+        classname: button button-primary tfo-button-primary
 
-  - heading: Build your first TensorFlow Lite app
+  - heading: Build your first TensorFlow Lite app with Codelabs
     background: grey
     items:
     - classname: tfo-landing-row-item-inset-white
@@ -160,28 +126,40 @@ landing_page:
         We love to hear what you're working on—it may even get highlighted on
         our social media! <a href="https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss" class="external">Tell us</a>.
 
-  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
+  - classname: devsite-landing-row-logos tfo-landing-row-heading
+    heading: TensorFlow Lite users
     items:
-    - description: >
-        <p>
-          <em>“The release of TensorFlow Lite has allowed us to deploy an engaging
-          real-time experience to our users that eliminates the requirement
-          for a data connection. TensorFlow Lite’s ability to compress and
-          optimize the TensorFlow graph for mobile deployment has been
-          transformative in expanding the capabilities of Snap It.</em>
-        </p>
-        <p>
-          <em>Through TensorFlow Lite, our users can now enjoy a state of the
-          art, computer-vision-based food logging experience without worrying
-          about signal strength. We look forward to future collaborations
-          with the TensorFlow Lite team.”</em>
-        </p>
-      image_path: ./images/landing-page/loseit_logo_big.png
+    - custom_image:
+        path: ./images/landing-page/photos_logo.png
+    - custom_image:
+        path: ./images/landing-page/gboard_logo.png
+    - custom_image:
+        path: ./images/landing-page/gmail_logo.png
+    - custom_image:
+        path: ./images/landing-page/assistant_logo.png
+
+  - classname: devsite-landing-row-logos
+    items:
+    - custom_image:
+        path: ./images/landing-page/vsco_logo.png
+    - custom_image:
+        path: ./images/landing-page/shazam_logo.png
+    - custom_image:
+        path: ./images/landing-page/nest_logo.png
+    - custom_image:
+        path: ./images/landing-page/loseit_logo.png
+
 
   - classname: devsite-landing-row-cards
     background: grey
     heading: Updates
     items:
+    - heading: "TensorFlow Lite Now Faster with Mobile GPUs (Developer Preview)"
+      image_path: ./images/landing-page/facial_contour_detection.png
+      path: https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7
+      buttons:
+      - label: Read more
+        path: https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7
     - heading: "AI in motion: react in the real world"
       image_path: ./images/landing-page/ai_in_motion.png
       path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md
index b15159ce4145727863c335126557e06402f8dbd3..1a05142bc44b824e090fd6eb513360837eac2c69 100644
--- a/tensorflow/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/apis.md
@@ -1,4 +1,3 @@
-
 # TensorFlow Lite APIs
 
 TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
@@ -8,8 +7,7 @@ no surprise that the APIs try to avoid unnecessary copies at the expense of
 convenience.  Similarly, consistency with TensorFlow APIs was not an explicit
 goal and some variance is to be expected.
 
-There is also a Python API for TensorFlow Lite described
-[here](../toco/g3doc/python_api.md#interpreter).
+There is also a [Python API for TensorFlow Lite](g3doc/convert/python_api.md).
 
 ## C++
 
diff --git a/tensorflow/lite/g3doc/custom_operators.md b/tensorflow/lite/g3doc/custom_operators.md
index 4a22d6a67577cf5c06f2c0d32e30650fd4d4bb32..2d80668f37d645054596e1150f1eee6249122f75 100644
--- a/tensorflow/lite/g3doc/custom_operators.md
+++ b/tensorflow/lite/g3doc/custom_operators.md
@@ -137,9 +137,9 @@ operations instead of a single operator.
 
 ## Special TF Graph Attributes
 
-When Toco convertes a TF graph into TFLite format, it makes some assumption
-about custom operations that might be not correct. In this case, the generated
-graph can be not executable.
+When `tflite_convert` converts a TensorFlow graph into TFLite format, it makes
+some assumption about custom operations that might be not correct. In this case,
+the generated graph may not execute.
 
 It is possible to add aditional information about your custom op output to TF
 graph before it is converted. The following attributes are supported:
diff --git a/tensorflow/lite/g3doc/demo_ios.md b/tensorflow/lite/g3doc/demo_ios.md
index fbf1dd6392591183d0dc484018bba501de1851d8..f4b481dc6192db703dea4161ed28e2fd63812ebf 100644
--- a/tensorflow/lite/g3doc/demo_ios.md
+++ b/tensorflow/lite/g3doc/demo_ios.md
@@ -7,22 +7,23 @@ instructions walk you through building and running the demo on an iOS device.
 
 ## Prerequisites
 
-* You must have [Xcode](https://developer.apple.com/xcode/) installed and have a
-  valid Apple Developer ID, and have an iOS device set up and linked to your
-  developer account with all of the appropriate certificates. For these
-  instructions, we assume that you have already been able to build and deploy an
-  app to an iOS device with your current developer environment.
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
 
-* The demo app requires a camera and must be executed on a real iOS device. You
-  can build it and run with the iPhone Simulator but it won't have any camera
-  information to classify.
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
 
-* You don't need to build the entire TensorFlow library to run the demo, but you
-  will need to clone the TensorFlow repository if you haven't already:
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
 
         git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
 
-* You'll also need the Xcode command-line tools:
+*   You'll also need the Xcode command-line tools:
 
         xcode-select --install
 
@@ -31,35 +32,41 @@ instructions walk you through building and running the demo on an iOS device.
 
 ## Building the iOS Demo App
 
-1. Install CocoaPods if you don't have it:
+1.  Install CocoaPods if you don't have it:
 
         sudo gem install cocoapods
 
-2. Download the model files used by the demo app (this is done from inside the
-   cloned directory):
+2.  Download the model files used by the demo app (this is done from inside the
+    cloned directory):
 
         sh tensorflow/lite/examples/ios/download_models.sh
 
-3. Install the pod to generate the workspace file:
+3.  Install the pod to generate the workspace file:
 
         cd tensorflow/lite/examples/ios/camera
         pod install
 
     If you have installed this pod before and that command doesn't work, try
 
-        pod update
+        pod repo update
 
-    At the end of this step you should have a file called 
+    At the end of this step you should have a file called
     `tflite_camera_example.xcworkspace`.
 
-4. Open the project in Xcode by typing this on the command line:
+4.  Open the project in Xcode by typing this on the command line:
 
         open tflite_camera_example.xcworkspace
 
     This launches Xcode if it isn't open already and opens the
     `tflite_camera_example` project.
 
-5. Build and run the app in Xcode.
+5.  Under `Project navigator -> tflite_camera_example -> Targets ->
+    tflite_camera_example -> General` change the bundle identifier by
+    pre-pending your name:
+
+    ![pre-pend your name to the bundle identifier](images/ios/bundle_identifier.png)
+
+6.  Build and run the app in Xcode.
 
     Note that as mentioned earlier, you must already have a device set up and
     linked to your Apple Developer account in order to deploy the app on a
diff --git a/tensorflow/lite/g3doc/devguide.md b/tensorflow/lite/g3doc/devguide.md
index fdd02638f9b78e05e77cfeb22644bfb37878a580..cbad036407fabea9d49910e22b4c968470566211 100644
--- a/tensorflow/lite/g3doc/devguide.md
+++ b/tensorflow/lite/g3doc/devguide.md
@@ -180,7 +180,6 @@ bazel run tensorflow/lite/tools:visualize -- model.tflite model_viz.html
 This generates an interactive HTML page listing subgraphs, operations, and a
 graph visualization.
 
-
 ## 3. Use the TensorFlow Lite model for inference in a mobile app
 
 After completing the prior steps, you should now have a `.tflite` model file.
@@ -221,3 +220,47 @@ devices. To use the converter, refer to the
 Compile Tensorflow Lite for a Raspberry Pi by following the
 [RPi build instructions](rpi.md) This compiles a static library file (`.a`) used
 to build your app. There are plans for Python bindings and a demo app.
+
+## 4. Optimize your model (optional)
+
+There are two options. If you plan to run on CPU, we recommend that you quantize
+your weights and activation tensors. If the hardware is available, another
+option is to run on GPU for massively parallelizable workloads.
+
+### Quantization
+Compress your model size by lowering the precision of the parameters (i.e.
+neural network weights) from their training-time 32-bit floating-point
+representations into much smaller and efficient 8-bit integer ones.
+
+This will execute the heaviest computations fast in lower precision, but the
+most sensitive ones with higher precision, thus typically resulting in little to
+no final accuracy losses for the task, yet a significant speed-up over pure
+floating-point execution.
+
+The post-training quantization technique is integrated into the TensorFlow Lite
+conversion tool. Getting started is easy: after building your TensorFlow model,
+simply enable the ‘post_training_quantize’ flag in the TensorFlow Lite
+conversion tool. Assuming that the saved model is stored in saved_model_dir, the
+quantized tflite flatbuffer can be generated in command line:
+
+```
+converter=tf.contrib.lite.TocoConverter.from_saved_model(saved_model_dir)
+converter.post_training_quantize=True
+tflite_quantized_model=converter.convert()
+open(“quantized_model.tflite”, “wb”).write(tflite_quantized_model)
+```
+
+Read the full documentation [here](performance/post_training_quantization) and see a tutorial [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb).
+
+### GPU
+Run on GPU GPUs are designed to have high throughput for massively
+parallelizable workloads. Thus, they are well-suited for deep neural nets, which
+consist of a huge number of operators, each working on some input tensor(s) that
+can be easily divided into smaller workloads and carried out in parallel,
+typically resulting in lower latency.
+
+Another benefit with GPU inference is its power efficiency. GPUs carry out the
+computations in a very efficient and optimized manner, so that they consume less
+power and generate less heat than when the same task is run on CPUs.
+
+Read the tutorial [here](performance/gpu) and full documentation [here](performance/gpu_advanced).
diff --git a/tensorflow/lite/g3doc/images/convert/sample_after.png b/tensorflow/lite/g3doc/images/convert/sample_after.png
index 6c451f97903f7f70a9f28dee8abf6daeb7ec5693..db09d0a6ca70695205833acfd2bd8ac6682cb065 100644
Binary files a/tensorflow/lite/g3doc/images/convert/sample_after.png and b/tensorflow/lite/g3doc/images/convert/sample_after.png differ
diff --git a/tensorflow/lite/g3doc/images/convert/sample_before.png b/tensorflow/lite/g3doc/images/convert/sample_before.png
index e5317ef295062e79c66430512ef1c45925858ce0..55440d324977f0ff5b795bc80898857918066e96 100644
Binary files a/tensorflow/lite/g3doc/images/convert/sample_before.png and b/tensorflow/lite/g3doc/images/convert/sample_before.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/bundle_identifier.png b/tensorflow/lite/g3doc/images/ios/bundle_identifier.png
new file mode 100644
index 0000000000000000000000000000000000000000..398763916b353e61f236392e2b8898aad2aafe8e
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/bundle_identifier.png differ
diff --git a/tensorflow/lite/g3doc/images/landing-page/facial_contour_detection.png b/tensorflow/lite/g3doc/images/landing-page/facial_contour_detection.png
new file mode 100644
index 0000000000000000000000000000000000000000..27bb49826ff3246bd5a971e7ecd0926121dbf749
Binary files /dev/null and b/tensorflow/lite/g3doc/images/landing-page/facial_contour_detection.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/android.md b/tensorflow/lite/g3doc/models/image/label/android.md
index 9cd54aad1e933823eab169b313fdd6232dd16aa1..c755328ac059013d2d45bbeb3c67516dafbb0ff1 100644
--- a/tensorflow/lite/g3doc/models/image/label/android.md
+++ b/tensorflow/lite/g3doc/models/image/label/android.md
@@ -1,3 +1,207 @@
-# Android
+# TensorFlow Lite Android Image Classifier App Example
 
-lorem
+This tutorial provides a simple Android mobile application to classify images
+using the Android device camera. In this tutorial, you will download the demo
+application from the Tensorflow repository, build it on your computer, and
+install it on your Android Device. You will also learn how to customize the
+application to suit your requirements.
+
+### Prerequisites
+
+*   Android Studio 3.2 (installed on a Linux, Mac or Windows machine)
+
+*   Android Device
+
+*   USB cable (to connect Android device to your computer)
+
+### Step 1. Clone the TensorFlow source code
+
+First, we clone the GitHub repository on the computer in a folder to get the
+demo application.
+
+```
+
+git clone https://github.com/tensorflow/tensorflow
+
+```
+
+Open the TensorFlow source code in Android Studio. To do this, open Android
+Studio and select `Open an existing project` setting the folder to
+`tensorflow/lite/examples/android`
+
+![Step 1](images/classifydemo_img1.png)
+
+This folder contains the demo application for image classification, object
+detection, and speech hotword detection.
+
+### Step 2. Build the Android Studio project
+
+In this step, Select `Build -> Make Project` and check that the project builds
+successfully. You will need Android SDK configured in the settings. You'll need
+at least SDK version 23. The gradle file will prompt you to download any missing
+libraries.
+
+![Step 2](images/classifydemo_img4.png)
+
+![Step 2a](images/classifydemo_img2.png)
+
+#### TensorFlow Lite AAR from JCenter:
+
+Note that the `build.gradle` is configured to use TensorFlow Lite's nightly
+build.
+
+If you see a build error related to compatibility with Tensorflow Lite's Java
+API (example: method X is undefined for type Interpreter), there has likely been
+a backwards compatible change to the API. You will need to pull new app code
+that's compatible with the nightly build by running git pull.
+
+### Step 3. Install and Run the app
+
+Connect the Android device to the computer, and be sure to approve any ADB
+permission prompts that appear on your phone. Select `Run -> Run app.` Select
+the deployment target in the connected devices to the device on which app will
+be installed. This will install the app on the device.
+
+![Step 3](images/classifydemo_img5.png)
+
+![Step 3a](images/classifydemo_img6.png)
+
+![Step 3b](images/classifydemo_img7.png)
+
+![Step 3c](images/classifydemo_img8.png)
+
+To test the app, open the app named `TFL Classify` on the device. When you run
+the app first time, the app will request permission to access the camera.
+Re-installing the app may require you to uninstall the previous installations.
+
+## Understanding Android App Code
+
+### Get camera input
+
+This mobile application gets the camera input using the functions defined in the
+file CameraActivity.java in the folder
+`tensorflow/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java.`
+This file depends on `AndroidManifest.xml` in the folder
+`tensorflow/tensorflow/lite/examples/android/app/src/main` to set the camera
+orientation.
+
+### Pre-process of bitmap image
+
+The mobile application code that pre-processes the images and runs inference is
+in
+`tensorflow/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java.`
+Here, we take the input camera bitmap image and convert it to a Bytebuffer
+format for efficient processing. We pre-allocate the memory for ByteBuffer
+object based on the image dimensions because Bytebuffer objects can't infer the
+object shape.
+
+```
+c.imgData =
+ByteBuffer.allocateDirect( DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y *
+DIM_PIXEL_SIZE);
+c.imgData.order(ByteOrder.nativeOrder());
+```
+
+While running the application, we preprocess the incoming bitmap images from the
+camera to a Bytebuffer. Since this model is quantized 8-bit, we will put a
+single byte for each channel. `imgData` will contain an encoded `Color` for each
+pixel in ARGB format, so we need to mask the least significant 8 bits to get
+blue, and next 8 bits to get green and next 8 bits to get blue, and we have an
+opaque image so alpha can be ignored.
+
+```
+ imgData.rewind();
+ bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+ // Convert the image to floating point.
+ int pixel = 0;
+ for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
+   for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
+     final int val = intValues[pixel++];
+     imgData.put((byte) ((val >> 16) & 0xFF));
+     imgData.put((byte) ((val >> 8) & 0xFF));
+     imgData.put((byte) (val & 0xFF));
+     }
+  }
+```
+
+### Create interpreter
+
+To create the interpreter, we need to load the model file. In Android devices,
+we recommend pre-loading and memory mapping the model file as shown below to
+offer faster load times and reduce the dirty pages in memory. If your model file
+is compressed, then you will have to load the model as a `File`, as it cannot be
+directly mapped and used from memory.
+
+```
+// Memory-map the model file
+AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+FileInputStream inputStream = new
+FileInputStream(fileDescriptor.getFileDescriptor()); FileChannel fileChannel =
+inputStream.getChannel(); long startOffset = fileDescriptor.getStartOffset();
+long declaredLength = fileDescriptor.getDeclaredLength(); return
+fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+```
+
+Then, create the interpreter object using `new Interpreter()` that takes the
+model file as argument as shown below.
+
+```
+// Create Interpreter
+c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+```
+
+### Run inference
+
+The output of the inference is stored in a byte array `labelprob.` We
+pre-allocate the memory for the output buffer. Then, we run inference on the
+interpreter object using function `run()` that takes input and output buffers as
+arguments.
+
+```
+// Pre-allocate output buffers.
+c.labelProb = new byte[1][c.labels.size()];
+// Run Inference
+tfLite.run(imgData, labelProb);
+```
+
+### Post-process values
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores.
+
+```
+// Find the best classifications
+PriorityQueue<Recognition> pq = ...
+for (int i = 0; i < labels.size(); ++i)
+{
+  pq.add( new Recognition( ' '+ i,
+  labels.size() > i ? labels.get(i) : unknown,
+  (float) labelProb[0][i], null));
+}
+```
+
+And we display up to MAX_RESULTS number of classifications in the application,
+where Recognition is a generic class defined in `Classifier.java` that contains
+the following information of the classified object: id, title, label, and its
+location when the model is an object detection model.
+
+```
+// Display the best classifications
+final ArrayList<Recognition> recognitions =
+  new ArrayList<Recognition>();
+int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
+for (int i = 0; i < recognitionsSize; ++i) {
+  recognitions.add(pq.poll());
+}
+```
+
+### Load onto display
+
+We render the results on the Android device screen using the following lines in
+`processImage()` function in `ClassifierActivity.java` which uses the UI defined
+in `RecognitionScoreView.java.`
+
+```
+resultsView.setResults(results);
+requestRender();
+```
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png
new file mode 100644
index 0000000000000000000000000000000000000000..916639c067081b5a193f479d6a9ce61239fc0c6e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png
new file mode 100644
index 0000000000000000000000000000000000000000..366ec834a842fa8030369d35d21126cf22a93d5c
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png
new file mode 100644
index 0000000000000000000000000000000000000000..360b843c9430bb39191cf7e49adaaada5f372338
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png
new file mode 100644
index 0000000000000000000000000000000000000000..d6192ae9a76d78479fed168e48429c1f96d13593
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png
new file mode 100644
index 0000000000000000000000000000000000000000..4216153d3886ee814f9e13657795815fac280dce
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png
new file mode 100644
index 0000000000000000000000000000000000000000..034eedbc1e5370f597b5b6d95564efbf66074dcc
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png
new file mode 100644
index 0000000000000000000000000000000000000000..940395346510815a7d0454ddc5e47eca8c5be6cd
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png differ
diff --git a/tensorflow/lite/g3doc/performance/best_practices.md b/tensorflow/lite/g3doc/performance/best_practices.md
index b76414cebe0d7092086073a478eb6330cbea713e..5f41a7027538f571601c85a0a367208200155dd6 100644
--- a/tensorflow/lite/g3doc/performance/best_practices.md
+++ b/tensorflow/lite/g3doc/performance/best_practices.md
@@ -1,6 +1,9 @@
 # Performance best practices
 
-Mobile and embedded devices have limited computational resources and it is important to keep your application resource efficient. We have compiled a list of best practices and strategies you can use to optimize your model and application when using Tensorflow Lite.
+Mobile and embedded devices have limited computational resources and it is
+important to keep your application resource efficient. We have compiled a list
+of best practices and strategies that you can use to optimize your model and
+application when using TensorFlow Lite.
 
 ## Choose the best model for the task
 Depending on the task you will need to make a tradeoff between model complexity and size. If your task requires high accuracy then you may need a large and complex model. Some tasks may work with a less precise model, for these tasks it is better to use a smaller but less precise model. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. For example, graphs below show accuracy and latency tradeoff for some common image classification models.
@@ -10,7 +13,7 @@ Depending on the task you will need to make a tradeoff between model complexity
 
 ![latency vs model size](../images/performance/model_size_vs_latency.png "Latency vs Model size")
 
-One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](../models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
+One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. TensorFlow Lite [models page](../models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
 
 You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
 [image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
@@ -18,27 +21,70 @@ You can retrain the listed models on your own dataset by using transfer learning
 
 
 ## Profile your model
-Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
+Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. TensorFlow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
 
 ## Profile and optimize operators in the graph
 If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
- This scenario should be rare as Tensorflow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](../custom_operators.md).
+ This scenario should be rare as TensorFlow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](../custom_operators.md).
 
 ## Quantize your model
 If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](model_optimization.md) for details about optimizing your model. 
 
 ## Tweak the number of threads
-Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/lite/interpreter.h#L337) threads. Multi-threaded execution however comes at the cost of increased performance variability depending on what else is been executed concurrently. This is particularly the case for mobile apps. For example, isolated tests may show 2x speed up vs single-threaded but if another app is executing at the same time may result in worst performance than single-threaded.
+
+TensorFlow Lite supports multi-threaded kernels for many operators. You can
+increase the number of threads and speed up execution of operators. Increasing
+the number of threads will however make your model use more resources and power.
+For some applications latency may be more important than energy efficiency. You
+can increase the number of threads by setting the number of
+[interpreter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L333)
+threads. Multi-threaded execution however comes at the cost of increased
+performance variability depending on what else is been executed concurrently.
+This is particularly the case for mobile apps. For example, isolated tests may
+show 2x speed up vs single-threaded but if another app is executing at the same
+time may result in worst performance than single-threaded.
 
 ## Eliminate redundant copies
-If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
+
+If your application is not careful, there can be redundant copies when feeding
+the input to the model and reading output from the model. Make sure to eliminate
+redundant copies. If you are using higher level APIs like Java API, make sure to
+carefully check the documentation for performance caveats. For example, the Java
+API is a lot faster if ByteBuffers are used as
+[inputs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L175).
 
 ## Profile your application with platform specific tools
 Platform specific tools like [Android profiler](https://developer.android.com/studio/profile/android-profiler) and [Instruments](https://help.apple.com/instruments/mac/current/) provide a wealth of profiling information that can be used to debug your app. Sometimes the performance bug may be not in the model but in parts of application code that interact with the model. Make sure to familiarize yourself with platform specific profiling tools and best practices for your platform.
 
 ## Evaluate whether your model benefits from using hardware accelerators available on the device
-Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
-You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable Neural Networks API call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/lite/interpreter.h#L334) on the interpreter instance.
+
+TensorFlow Lite has added been new ways to accelerate models with faster
+hardware like GPUs, DSPs, and neural accelerators. Typically, these accelerators
+are exposed through *delegate* submodules that take over parts of the
+interpreter execution. TensorFlow Lite can use delegates by:
+
+*   Using Android's
+    [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/).
+    You can utilize these hardware accelerator backends to improve the speed and
+    efficiency of your model. To enable the Neural Networks API, call
+    [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L330)
+    on the interpreter instance.
+*   A binary-only GPU delegate has been released for Android and iOS—using
+    OpenGL and Metal, respectively. To try them out, see the
+    [GPU delegate tutorial](gpu.md) and [documentation](gpu_advanced.md).
+*   It is possible to create your own delegate if you have access to
+    non-standard hardware. View the NN API delegate in the source code as an
+    example.
+
+Be aware that some accelerators work better for different types of models. It is
+important to benchmark each delegate to see if it is a good choice for your
+application. For example, if you have a very small model, it may not be worth
+delegating the model to either the NN API or the GPU. Conversely, accelerators
+are a great choice for large models that have high arithmetic intensity.
 
 ## Need more help
-The Tensorflow team is happy to help diagnose and address specific performance issues you may be facing. Please file an issue on [GitHub](https://github.com/tensorflow/tensorflow/issues) with details of the issue.
+
+The TensorFlow team is happy to help diagnose and address specific performance
+issues you may be facing. Please file an issue on
+[GitHub](https://github.com/tensorflow/tensorflow/issues) with details of the
+issue.
diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7389226123746180c8c5e6020431ffe579112a7
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -0,0 +1,236 @@
+# TensorFlow Lite GPU Delegate Tutorial
+
+[TensorFlow Lite](https://www.tensorflow.org/lite) supports several hardware
+accelerators. This document describes how to preview the experimental GPU backend using the
+TensorFlow Lite delegate APIs on Android and iOS.
+
+GPUs are designed to have high throughput for massively parallelizable
+workloads. Thus, they are well-suited for deep neural nets, which consist of a
+huge number of operators, each working on some input tensor(s) that can be
+easily divided into smaller workloads and carried out in parallel, typically
+resulting in lower latency. In the best scenario, inference on the GPU may now
+run fast enough for previously not available real-time applications.
+
+Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
+not require quantization for optimal performance.
+
+Another benefit with GPU inference is its power efficiency. GPUs carry out the
+computations in a very efficient and optimized manner, so that they consume less
+power and generate less heat than when the same task is run on CPUs.
+
+## Demo App Tutorials
+
+The easiest way to try out the experimental GPU delegate is to follow the below tutorials, which go through building our classification demo applications with GPU support. The GPU code is only binary for now; it will be open-sourced soon. Once you understand how to get our demos working, you can try this out on your own custom models.
+
+### Android (with Android Studio)
+
+For a step-by-step tutorial, watch the
+[Experimental GPU Delegate for Android](https://youtu.be/Xkhgre8r5G0) video.
+
+Note: This requires OpenGL ES 3.1 or higher.
+
+#### Step 1. Clone the TensorFlow source code and open it in Android Studio
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+#### Step 2. Edit `app/build.gradle` to use the experimental GPU AAR
+
+Replace the existing `tensorflow-lite` package in the existing `dependencies`
+block.
+
+```
+dependencies {
+    ...
+    // implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-gpu-experimental'
+}
+```
+
+#### Step 3. Build and run
+
+Run → Run ‘app’.  When you run the application you will see a button for
+enabling the GPU. Change from quantized to a float model and then click GPU to
+run on the GPU.
+
+![running android gpu demo and switch to gpu](images/android_gpu_demo.gif)
+
+### iOS (with XCode)
+
+For a step-by-step tutorial, watch the
+[Experimental GPU Delegate for iOS](https://youtu.be/a5H4Zwjp49c) video.
+
+Note: This requires XCode v10.1 or later.
+
+#### Step 1. Get the demo source code and make sure it compiles.
+
+Follow our iOS Demo App [tutorial](https://www.tensorflow.org/lite/demo_ios).
+This will get you to a point where the unmodified iOS camera demo is working
+on your phone.
+
+
+#### Step 2. Modify the Podfile to use the TensorFlow Lite GPU CocoaPod
+
+We have built a binary CocoaPod that includes the GPU delegate. To switch the
+project to use it, modify the
+`tensorflow/tensorflow/lite/examples/ios/camera/Podfile` file to use
+the `TensorFlowLiteGpuExperimental` pod instead of `TensorFlowLite`.
+
+```
+target 'YourProjectName'
+  # pod 'TensorFlowLite', '1.12.0'
+  pod 'TensorFlowLiteGpuExperimental'
+```
+
+#### Step 3. Enable the GPU Delegate
+
+You will need to change two `#define` flags in `CameraExampleViewController.h`
+to enable the GPU delegate. First, change `TFLITE_USE_CONTRIB_LITE` from 1 to 0
+since TensorFlow Lite has moved from TensorFlow contrib into core.
+
+```c
+#define TFLITE_USE_CONTRIB_LITE 0
+```
+
+Next, change `TFLITE_USE_GPU_DELEGATE` from 0 to 1, to enable the code that will
+use the GPU delegate.
+
+```c
+#define TFLITE_USE_GPU_DELEGATE 1
+```
+
+#### Step 4. Build and run the demo app
+
+After following the previous step, you should be able to run the app.
+
+
+#### Step 5. Release mode.
+
+While in Step 4 you ran in debug mode, to get better performance, you should
+change to a release build with the appropriate optimal Metal settings. In
+particular, To edit these settings go to the `Product > Scheme > Edit
+Scheme...`. Select `Run`. On the `Info` tab, change `Build Configuration`, from
+`Debug` to `Release`, uncheck `Debug executable`.
+
+![setting up release](images/iosdebug.png)
+
+Then
+click the `Options` tab and change `GPU Frame Capture` to `Disabled` and
+`Metal API Validation` to `Disabled`.
+
+![setting up metal options](images/iosmetal.png)
+
+Lastly make sure Release only builds on 64-bit architecture. Under `Project
+navigator -> tflite_camera_example -> PROJECT -> tflite_camera_example -> Build
+Settings` set `Build Active Architecture Only > Release` to Yes.
+
+![setting up release options](images/iosrelease.png)
+
+## Trying the GPU Delegate on your own model
+
+### Android
+
+Look at the demo to see how to add the
+delegate. In your application, add the AAR as above, import
+`org.tensorflow.lite.experimental.GpuDelegate` module, and use the`addDelegate`
+function to register the GPU delegate to the interpreter:
+
+```java
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.experimental.GpuDelegate;
+
+// Initialize interpreter with GPU delegate
+GpuDelegate delegate = new GpuDelegate();
+Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
+Interpreter interpreter = new Interpreter(model, options);
+
+// Run inference
+while (true) {
+  writeToInput(input);
+  interpreter.run(input, output);
+  readFromOutput(output);
+}
+
+// Clean up
+delegate.close();
+```
+
+### iOS
+
+In your application code, include the GPU delegate header and call the
+`Interpreter::ModifyGraphWithDelegate` function to register the GPU delegate to
+the interpreter:
+
+```cpp
+#import "tensorflow/lite/delegates/gpu/metal_delegate.h"
+
+// Initialize interpreter with GPU delegate
+std::unique_ptr<Interpreter> interpreter;
+InterpreterBuilder(*model, resolver)(&interpreter);
+auto* delegate = NewGpuDelegate(nullptr);  // default config
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference
+while (true) {
+  WriteToInputTensor(interpreter->typed_input_tensor<float>(0));
+  if (interpreter->Invoke() != kTfLiteOk) return false;
+  ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));
+}
+
+// Clean up
+interpreter = nullptr;
+DeleteGpuDelegate(delegate);
+```
+
+## Supported Models and Ops
+
+With the release of the GPU delegate, we included a handful of models that can
+be run on the backend:
+
+* [MobileNet v1 (224x224) image classification](https://ai.googleblog.com/2017/06/mobilenets-open-source-models-for.html) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/mobilenet_v1_1.0_224.tflite)
+<br /><i>(image classification model designed for mobile and embedded based vision applications)</i>
+* [DeepLab segmentation (257x257)](https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/deeplabv3_257_mv_gpu.tflite)
+<br /><i>(image segmentation model that assigns semantic labels (e.g., dog, cat, car) to every pixel in the input image)</i>
+* [MobileNet SSD object detection](https://ai.googleblog.com/2018/07/accelerated-training-and-inference-with.html) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/mobile_ssd_v2_float_coco.tflite)
+<br /><i>(image classification model that detects multiple objects with bounding boxes)</i>
+* [PoseNet for pose estimation](https://github.com/tensorflow/tfjs-models/tree/master/posenet) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/multi_person_mobilenet_v1_075_float.tflite)
+<br /><i>(vision model that estimates the poses of a person(s) in image or video)</i>
+
+To see a full list of supported ops, please see the [advanced documentation](gpu_advanced.md).
+
+## Non-supported models and ops
+
+If some of the ops are not supported by the GPU delegate, the framework will
+only run a part of the graph on the GPU and the remaining part on the CPU.  Due
+to the high cost of CPU/GPU synchronization, a split execution mode like this
+will often result in a performance slower than when the whole network is run on
+the CPU alone.  In this case, the user will get a warning like:
+
+```
+WARNING: op code #42 cannot be handled by this delegate.
+```
+
+We did not provide a callback for this failure, as this is not a true run-time
+failure, but something that the developer can observe while trying to get the
+network to run on the delegate.
+
+## Tips for optimization
+
+Some operations that are trivial on the CPU may have a high cost for the GPU.
+One class of such operation is various forms of reshape operations, including
+`BATCH_TO_SPACE`, `SPACE_TO_BATCH`, `SPACE_TO_DEPTH`, and so forth. If those ops
+are inserted into the network just for the network architect's logical thinking,
+it is worth removing them for performance.
+
+On GPU, tensor data is sliced into 4-channels. Thus, a computation on a tensor
+of shape `[B,H,W,5]` will perform about the same on a tensor of shape
+`[B,H,W,8]` but significantly worse than `[B,H,W,4]`.
+
+In that sense, if the camera hardware supports image frames in RGBA, feeding
+that 4-channel input is significantly faster as a memory copy (from 3-channel
+RGB to 4-channel RGBX) can be avoided.
+
+For best performance, do not hesitate to retrain your classifier with a mobile-
+optimized network architecture. That is a significant part of optimization for
+on-device inference.
diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md
new file mode 100644
index 0000000000000000000000000000000000000000..627494804029a42d1fc0d89c6a7d5af888051d83
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@@ -0,0 +1,303 @@
+# TensorFlow Lite on GPU
+
+[TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) supports several
+hardware accelerators.  This document describes how to use the GPU backend using
+the TensorFlow Lite delegate APIs on Android (requires OpenGL ES 3.1 or higher)
+and iOS (requires iOS 8 or later).
+
+## Benefits of GPU Acceleration
+
+### Speed
+
+GPUs are designed to have high throughput for massively parallelizable
+workloads. Thus, they are well-suited for deep neural nets, which consist of a
+huge number of operators, each working on some input tensor(s) that can be
+easily divided into smaller workloads and carried out in parallel. This
+parallelism typically results in lower latency. In the best scenario, inference
+on the GPU may run fast enough to become suitable for real-time applications
+that were not previously possible.
+
+### Accuracy
+
+GPUs do their computation with 16-bit or 32-bit floating point numbers and
+(unlike the CPUs) do not require quantization for optimal performance. If
+decreased accuracy made quantization untenable for your models, running your
+neural network on a GPU may eliminate this concern.
+
+### Energy Efficiency
+
+Another benefit that comes with GPU inference is its power efficiency. A GPU
+carries out computations in a very efficient and optimized way, consuming less
+power and generating less heat than the same task run on a CPU.
+
+## Supported Ops
+
+TensorFlow Lite on GPU supports the following ops in 16-bit and 32-bit float
+precision:
+
+* `ADD v1`
+* `AVERAGE_POOL_2D v1`
+* `CONCATENATION v1`
+* `CONV_2D v1`
+* `DEPTHWISE_CONV_2D v1-2`
+* `FULLY_CONNECTED v1`
+* `LOGISTIC v1`
+* `MAX_POOL_2D v1`
+* `MUL v1`
+* `PAD v1`
+* `PRELU v1`
+* `RELU v1`
+* `RELU6 v1`
+* `RESHAPE v1`
+* `RESIZE_BILINEAR v1`
+* `SOFTMAX v1`
+* `STRIDED_SLICE v1`
+* `SUB v1`
+* `TRANSPOSE_CONV v1`
+
+## Basic Usage
+
+### Android
+
+Run TensorFlow Lite on GPU with `TfLiteDelegate`. In Java, you can specify the
+GpuDelegate through `Interpreter.Options`.
+
+```java
+// NEW: Prepare GPU delegate.
+GpuDelegate delegate = new GpuDelegate();
+Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
+
+// Set up interpreter.
+Interpreter interpreter = new Interpreter(model, options);
+
+// Run inference.
+writeToInputTensor(inputTensor);
+interpreter.run(inputTensor, outputTensor);
+readFromOutputTensor(outputTensor);
+
+// Clean up.
+delegate.close();
+```
+
+### iOS
+
+To use TensorFlow Lite on GPU, get the GPU delegate via `NewGpuDelegate()` and
+then pass it to `Interpreter::ModifyGraphWithDelegate()` (instead of calling
+`Interpreter::AllocateTensors()`).
+
+```c++
+// Set up interpreter.
+auto model = FlatBufferModel::BuildFromFile(model_path);
+if (!model) return false;
+tflite::ops::builtin::BuiltinOpResolver op_resolver;
+std::unique_ptr<Interpreter> interpreter;
+InterpreterBuilder(*model, op_resolver)(&interpreter);
+
+// NEW: Prepare GPU delegate.
+
+const GpuDelegateOptions options = {
+  .allow_precision_loss = false,
+  .wait_type = kGpuDelegateOptions::WaitType::Passive,
+};
+
+auto* delegate = NewGpuDelegate(options);
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference.
+WriteToInputTensor(interpreter->typed_input_tensor<float>(0));
+if (interpreter->Invoke() != kTfLiteOk) return false;
+ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));
+
+// Clean up.
+DeleteGpuDelegate(delegate);
+```
+
+Note: When calling `Interpreter::ModifyGraphWithDelegate()` or
+`Interpreter::Invoke()`, the caller must have an `EGLContext` in the current
+thread and `Interpreter::Invoke()` must be called from the same `EGLContext`. If
+an `EGLContext` does not exist, the delegate will internally create one, but
+then the developer must ensure that `Interpreter::Invoke()` is always called
+from the same thread in which `Interpreter::ModifyGraphWithDelegate()` was
+called.
+
+## Advanced Usage
+
+### Delegate Options for iOS
+
+`NewGpuDelegate()` accepts a `struct` of options.
+
+```c++
+struct GpuDelegateOptions {
+  // Allows to quantify tensors, downcast values, process in float16 etc.
+  bool allow_precision_loss;
+
+  enum class WaitType {
+    // waitUntilCompleted
+    kPassive,
+    // Minimize latency. It uses active spinning instead of mutex and consumes
+    // additional CPU resources.
+    kActive,
+    // Useful when the output is used with GPU pipeline then or if external
+    // command encoder is set
+    kDoNotWait,
+  };
+  WaitType wait_type;
+};
+```
+
+Passing `nullptr` into `NewGpuDelegate()` sets the default options (which are
+explicated in the Basic Usage example above).
+
+```c++
+
+// THIS:
+const GpuDelegateOptions options = {
+  .allow_precision_loss = false,
+  .wait_type = kGpuDelegateOptions::WaitType::Passive,
+};
+
+auto* delegate = NewGpuDelegate(options);
+
+// IS THE SAME AS THIS:
+auto* delegate = NewGpuDelegate(nullptr);
+
+```
+
+While it is convenient to use `nullptr`, we recommend that you explicitly set
+the options, to avoid any unexpected behavior if default values are changed in
+the future.
+
+### Input/Output Buffers
+
+To do computation on the GPU, data must be made available to the GPU. This often
+requires performing a memory copy. It is desirable not to cross the CPU/GPU
+memory boundary if possible, as this can take up a significant amount of time.
+Usually, such crossing is inevitable, but in some special cases, one or the
+other can be omitted.
+
+If the network's input is an image already loaded in the GPU memory (for
+example, a GPU texture containing the camera feed) it can stay in the GPU memory
+without ever entering the CPU memory. Similarly, if the network's output is in
+the form of a renderable image (for example,
+[image style transfer](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf)_)
+it can be directly displayed on the screen.
+
+To achieve best performance, TensorFlow Lite makes it possible for users to
+directly read from and write to the TensorFlow hardware buffer and bypass
+avoidable memory copies.
+
+#### Android
+
+Assuming the image input is in the GPU memory, it must first be converted to an
+OpenGL Shader Storage Buffer Object (SSBO). You can associate a TfLiteTensor to
+a user-prepared SSBO with `Interpreter.bindGlBufferToTensor()`. Note that
+`Interpreter.bindGlBufferToTensor()` must be called before
+`Interpreter.modifyGraphWithDelegate()`.
+
+```java
+// Ensure a valid EGL rendering context.
+EGLContext eglContext = eglGetCurrentContext();
+if (eglContext.equals(EGL_NO_CONTEXT)) return false;
+
+// Create an SSBO.
+int[] id = new int[1];
+glGenBuffers(id.length, id, 0);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]);
+glBufferData(GL_SHADER_STORAGE_BUFFER, inputSize, null, GL_STREAM_COPY);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);  // unbind
+int inputSsboId = id[0];
+
+// Create interpreter.
+Interpreter interpreter = new Interpreter(tfliteModel);
+Tensor inputTensor = interpreter.getInputTensor(0);
+GpuDelegate gpuDelegate = new GpuDelegate();
+// The buffer must be bound before the delegate is installed.
+gpuDelegate.bindGlBufferToTensor(inputTensor, inputSsboId);
+interpreter.modifyGraphWithDelegate(gpuDelegate);
+
+// Run inference; the null input argument indicates use of the bound buffer for input.
+fillSsboWithCameraImageTexture(inputSsboId);
+float[] outputArray = new float[outputSize];
+interpreter.runInference(null, outputArray);
+```
+
+A similar approach can be applied to the output tensor. In that case,
+`Interpreter.Options.setAllowBufferHandleOutput(true)` should be passed on, to
+disable the default copying of the network's output from GPU memory to CPU
+memory.
+
+```java
+// Ensure a valid EGL rendering context.
+EGLContext eglContext = eglGetCurrentContext();
+if (eglContext.equals(EGL_NO_CONTEXT)) return false;
+
+// Create a SSBO.
+int[] id = new int[1];
+glGenBuffers(id.length, id, 0);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]);
+glBufferData(GL_SHADER_STORAGE_BUFFER, outputSize, null, GL_STREAM_COPY);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);  // unbind
+int outputSsboId = id[0];
+
+// Create interpreter.
+Interpreter.Options options = (new Interpreter.Options()).setAllowBufferHandleOutput(true);
+Interpreter interpreter = new Interpreter(tfliteModel, options);
+Tensor outputTensor = interpreter.getOutputTensor(0);
+GpuDelegate gpuDelegate = new GpuDelegate();
+// The buffer must be bound before the delegate is installed.
+gpuDelegate.bindGlBufferToTensor(outputTensor, outputSsboId);
+interpreter.modifyGraphWithDelegate(gpuDelegate);
+
+// Run inference; the null output argument indicates use of the bound buffer for output.
+ByteBuffer input = getCameraImageByteBuffer();
+interpreter.runInference(input, null);
+renderOutputSsbo(outputSsboId);
+```
+
+#### iOS
+
+Assuming the image input is in GPU memory, it must first be converted to a
+`MTLBuffer` object for Metal. You can associate a TfLiteTensor to a
+user-prepared `MTLBuffer` with `BindMetalBufferToTensor()`. Note that
+`BindMetalBufferToTensor()` must be called before
+`Interpreter::ModifyGraphWithDelegate()`. Additionally, the inference output is,
+by default, copied from GPU memory to CPU memory. This behavior can be turned
+off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during
+initialization.
+
+```c++
+// Prepare GPU delegate.
+auto* delegate = NewGpuDelegate(nullptr);
+interpreter->SetAllowBufferHandleOutput(true);  // disable default gpu->cpu copy
+if (!BindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false;
+if (!BindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false;
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference.
+if (interpreter->Invoke() != kTfLiteOk) return false;
+```
+
+Note: Once the default behavior is turned off, copying the inference output from
+GPU memory to CPU memory requires an explicit call to
+`Interpreter::EnsureTensorDataIsReadable()` for each output tensor.
+
+## Tips and Tricks
+
+*   Some operations that are trivial on the CPU may be high cost on a GPU. One
+    class of such operation includes various forms of reshape operations
+    (including `BATCH_TO_SPACE`, `SPACE_TO_BATCH`, `SPACE_TO_DEPTH`, and similar
+    operation). If these operations are not required (for example, they were
+    inserted to help the network architect reason about the system but do not
+    otherwise affect output), it is worth removing them for performance.
+
+*   On a GPU, tensor data is sliced into 4-channels. Thus, a computation on a
+    tensor of shape `[B, H, W, 5]` will perform about the same on a tensor of
+    shape `[B, H, W, 8]`, but significantly worse than `[B, H, W, 4]`.
+
+    *   For example, if the camera hardware supports image frames in RGBA,
+        feeding that 4-channel input is significantly faster, because a memory
+        copy (from 3-channel RGB to 4-channel RGBX) can be avoided.
+
+*   For best performance, do not hesitate to re-train your classifier with
+    mobile-optimized network architecture. That is a significant part of
+    optimization for on-device inference.
diff --git a/tensorflow/lite/g3doc/performance/images/android_gpu_demo.gif b/tensorflow/lite/g3doc/performance/images/android_gpu_demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..31aee24c34b6451727abffd95bb7f2f0d40f55af
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/android_gpu_demo.gif differ
diff --git a/tensorflow/lite/g3doc/performance/images/iosdebug.png b/tensorflow/lite/g3doc/performance/images/iosdebug.png
new file mode 100644
index 0000000000000000000000000000000000000000..8cebbb84688b8129c149108ee4a47736a23dddff
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/iosdebug.png differ
diff --git a/tensorflow/lite/g3doc/performance/images/iosmetal.png b/tensorflow/lite/g3doc/performance/images/iosmetal.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e2b8bde8c1dac18ff66920f4f2a3f369f81bb3a
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/iosmetal.png differ
diff --git a/tensorflow/lite/g3doc/performance/images/iosrelease.png b/tensorflow/lite/g3doc/performance/images/iosrelease.png
new file mode 100644
index 0000000000000000000000000000000000000000..a160c6700e60726d8d9775c4a1c28b3e34b1e930
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/iosrelease.png differ
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
index dcfda72137cafbc676dec2fb5dbf5da8ab8cb45a..cff4afc250852e58e29c7a6f4dc57a2ef1158f16 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -185,7 +185,7 @@ Inputs {
   1: a tensor
 }
 Outputs {
-  0: A tensor of indices of minium values.
+  0: A tensor of indices of minimum values.
 }
 ```
 
@@ -362,6 +362,17 @@ Outputs {
 }
 ```
 
+**CEIL**
+
+```
+inputs {
+  0: tensor
+}
+outputs: {
+  0: result of computing element-wise ceil of the input tensor
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
diff --git a/tensorflow/lite/g3doc/tfmobile/android_build.md b/tensorflow/lite/g3doc/tfmobile/android_build.md
index 2eb776d10cf8ec68987d13b580eddf2f1bda8e78..f8c0243298e435382a7514e72ada89880fb00c1c 100644
--- a/tensorflow/lite/g3doc/tfmobile/android_build.md
+++ b/tensorflow/lite/g3doc/tfmobile/android_build.md
@@ -91,10 +91,10 @@ following lines to your Gradle build file:
         repositories {
             jcenter()
         }
-	}
+    }
 
     dependencies {
-        compile 'org.tensorflow:tensorflow-android:+'
+        implementation 'org.tensorflow:tensorflow-android:+'
     }
 
 This automatically downloads the latest stable version of TensorFlow as an AAR
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
index aa51f58baa4ecf01fbe75d2ce9095bb1a5286ae8..269774a4b10648f92aab5ee6bf5ae3687c263f75 100644
--- a/tensorflow/lite/g3doc/using_select_tf_ops.md
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -130,7 +130,7 @@ allprojects {
 }
 
 dependencies {
-    compile 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
+    implementation 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
 }
 ```
 
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index e2129ed46d94061211e02445a437f7adca51363e..7a6a074dedbd21aed30c27a39032f045cf0196be 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -32,6 +32,26 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+
+// Gets the current TfLiteQuantization from the legacy fLiteQuantizationParams.
+TfLiteQuantization GetQuantizationFromLegacy(
+    const TfLiteQuantizationParams& legacy_quantization) {
+  TfLiteQuantization quantization;
+  quantization.type = kTfLiteAffineQuantization;
+  auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  affine_quantization->scale->data[0] = legacy_quantization.scale;
+  affine_quantization->zero_point->data[0] = legacy_quantization.zero_point;
+  quantization.params = affine_quantization;
+
+  return quantization;
+}
+
+}  // namespace
+
 Interpreter::Interpreter(ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
@@ -102,15 +122,16 @@ TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
 }
 
 TfLiteStatus Interpreter::Invoke() {
-  TfLiteStatus status = primary_subgraph().Invoke();
+  TF_LITE_ENSURE_STATUS(primary_subgraph().Invoke());
 
   if (!allow_buffer_handle_output_) {
     for (int tensor_index : outputs()) {
-      primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
+      TF_LITE_ENSURE_STATUS(
+          primary_subgraph().EnsureTensorDataIsReadable(tensor_index));
     }
   }
 
-  return status;
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
@@ -122,24 +143,49 @@ TfLiteStatus Interpreter::ResetVariableTensors() {
   return primary_subgraph().ResetVariableTensors();
 }
 
+TfLiteStatus Interpreter::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantization quantization,
+    const char* buffer, size_t bytes, const Allocation* allocation) {
+  return primary_subgraph().SetTensorParametersReadOnly(
+      tensor_index, type, name, dims.size(), dims.data(), quantization, buffer,
+      bytes, allocation);
+}
+
+TfLiteStatus Interpreter::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantization quantization,
+    bool is_variable) {
+  return primary_subgraph().SetTensorParametersReadWrite(
+      tensor_index, type, name, dims.size(), dims.data(), quantization,
+      is_variable);
+}
+
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
-  return primary_subgraph().SetTensorParametersReadOnly(
-      tensor_index, type, name, rank, dims, quantization, buffer, bytes,
-      allocation);
+  TfLiteQuantization new_quantization = GetQuantizationFromLegacy(quantization);
+  if (primary_subgraph().SetTensorParametersReadOnly(
+          tensor_index, type, name, rank, dims, new_quantization, buffer, bytes,
+          allocation) != kTfLiteOk) {
+    TfLiteQuantizationFree(&new_quantization);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
 }
 
-// Set description of inputs/outputs/data/fptrs for node `node_index`.
-// This variant assumes an external buffer has been allocated of size
-// bytes. The lifetime of buffer must be ensured to be greater or equal
-// to Interpreter.
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
-  return primary_subgraph().SetTensorParametersReadWrite(
-      tensor_index, type, name, rank, dims, quantization, is_variable);
+  TfLiteQuantization new_quantization = GetQuantizationFromLegacy(quantization);
+  if (primary_subgraph().SetTensorParametersReadWrite(
+          tensor_index, type, name, rank, dims, new_quantization,
+          is_variable) != kTfLiteOk) {
+    TfLiteQuantizationFree(&new_quantization);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
@@ -167,6 +213,15 @@ void Interpreter::SetAllowFp16PrecisionForFp32(bool allow) {
   }
 }
 
+// TODO(b/121264966): Subgraphs added after cancellation is set will not get the
+// cancellation function added to their context.
+void Interpreter::SetCancellationFunction(void* data,
+                                          bool (*check_cancelled_func)(void*)) {
+  for (auto& subgraph : subgraphs_) {
+    subgraph->SetCancellationFunction(data, check_cancelled_func);
+  }
+}
+
 TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
   return primary_subgraph().ModifyGraphWithDelegate(delegate);
 }
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
index 6192d56ca2b5810d7ffaddbf4cc7ae3c1b27c268..806b66c12a0bf119985927e4e937c71fc6fed487 100644
--- a/tensorflow/lite/interpreter.h
+++ b/tensorflow/lite/interpreter.h
@@ -160,6 +160,12 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
+
+  // Legacy. Deprecated in favor of above.
   inline TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -179,6 +185,13 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(int tensor_index, TfLiteType type,
+                                            const char* name,
+                                            const std::vector<int>& dims,
+                                            TfLiteQuantization quantization,
+                                            bool is_variable = false);
+
+  // Legacy. Deprecated in favor of above.
   inline TfLiteStatus SetTensorParametersReadWrite(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -343,6 +356,15 @@ class Interpreter {
     return context_->allow_fp32_relax_to_fp16;
   }
 
+  // Sets the cancellation function pointer in order to cancel a request in the
+  // middle of a call to Invoke(). The interpreter queries this function during
+  // inference, between op invocations; when it returns true, the interpreter
+  // will abort execution and return `kTfLiteError`. The `data` parameter
+  // contains any data used by the cancellation function, and if non-null,
+  // remains owned by the caller.
+  // WARNING: This is an experimental API and subject to change.
+  void SetCancellationFunction(void* data, bool (*check_cancelled_func)(void*));
+
   // Owning handle to a TfLiteDelegate instance.
   using TfLiteDelegatePtr =
       std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index 40e5134bc7b3357f2a0479d096972c36928f545b..e1aedfe65ccdfcb4c38d2b13cf53007f17f5f798 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -73,8 +73,9 @@ TEST(BasicInterpreter, TestAllocateTensorsResetVariableTensors) {
   int tensor_index;
   ASSERT_EQ(interpreter.AddTensors(1, &tensor_index), kTfLiteOk);
   constexpr int kTensorSize = 16;
+  TfLiteQuantizationParams quant;
   interpreter.SetTensorParametersReadWrite(tensor_index, kTfLiteFloat32, "",
-                                           {kTensorSize}, {}, true);
+                                           {kTensorSize}, quant, true);
   interpreter.SetVariables({tensor_index});
   ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
   TfLiteTensor* tensor = interpreter.tensor(tensor_index);
@@ -170,6 +171,55 @@ TEST(BasicInterpreter, CheckAllocate) {
   }
 }
 
+TEST(BasicInterpreter, CheckQuantization) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+  interpreter.SetInputs({0, 1});
+  interpreter.SetOutputs({});
+  TfLiteType tensor_type = kTfLiteInt8;
+  const uint8_t int8s[] = {3, 4};
+  float scale = 0.5f;
+  int32_t zero_point = 12;
+
+  TfLiteQuantization rw_quantization;
+  rw_quantization.type = kTfLiteAffineQuantization;
+  auto* rw_affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  rw_affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  rw_affine_quantization->scale->data[0] = scale;
+  rw_affine_quantization->zero_point->data[0] = zero_point;
+  rw_quantization.params = rw_affine_quantization;
+
+  TfLiteQuantization ro_quantization;
+  ro_quantization.type = kTfLiteAffineQuantization;
+  auto* ro_affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  ro_affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  ro_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  ro_affine_quantization->scale->data[0] = scale;
+  ro_affine_quantization->zero_point->data[0] = zero_point;
+  ro_quantization.params = ro_affine_quantization;
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(0, tensor_type, "", {3},
+                                                     rw_quantization),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadOnly(
+                1, tensor_type, "", {2}, ro_quantization,
+                reinterpret_cast<const char*>(int8s), 2),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  // Check that the legacy scale and zero_point are set correctly.
+  ASSERT_EQ(interpreter.tensor(0)->params.scale, scale);
+  ASSERT_EQ(interpreter.tensor(0)->params.zero_point, zero_point);
+  ASSERT_EQ(interpreter.tensor(0)->quantization.type, rw_quantization.type);
+  ASSERT_EQ(interpreter.tensor(0)->quantization.type, rw_quantization.type);
+  ASSERT_EQ(interpreter.tensor(1)->params.scale, scale);
+  ASSERT_EQ(interpreter.tensor(1)->params.zero_point, zero_point);
+  ASSERT_EQ(interpreter.tensor(1)->quantization.type, ro_quantization.type);
+  ASSERT_EQ(interpreter.tensor(1)->quantization.type, ro_quantization.type);
+}
+
 TEST(BasicInterpreter, CheckResize) {
   const float floats[] = {-3., -4.};
   const int32_t int32s[] = {-3, -4};
@@ -1396,6 +1446,130 @@ TEST(TestDelegateOwnership, ProperlyDisposed) {
   EXPECT_TRUE(destroyed);
 }
 
+// CancellationData contains the data required to cancel a call to Invoke().
+struct CancellationData {
+  bool is_cancelled = false;
+};
+
+// Indicates whether Invoke() has been cancelled based on the value of the
+// CancellationData object passed in.
+bool CheckCancellation(void* data) {
+  CancellationData* cancellation_data =
+      static_cast<struct CancellationData*>(data);
+  return cancellation_data->is_cancelled;
+}
+
+static struct CancellationData cancellation_data_;
+
+// Test fixture to test cancellation within the Interpreter.
+class CancellationTest : public ::testing::Test {
+ public:
+  TfLiteStatus Invoke() { return interpreter_.Invoke(); }
+  void Cancel() { cancellation_data_.is_cancelled = true; }
+
+  // Adds an CancelOp with input tensor `input` and output tensor `output`.
+  void MakeCancelNode(int input, int output) {
+    TfLiteRegistration op = CancelOpRegistration();
+    ASSERT_EQ(interpreter_.AddNodeWithParameters({input}, {output}, nullptr, 0,
+                                                 nullptr, &op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  // Adds an OkOp with input tensor `input` and output tensor `output`.
+  void MakeOkNode(int input, int output) {
+    TfLiteRegistration op = OkOpRegistration();
+    ASSERT_EQ(interpreter_.AddNodeWithParameters({input}, {output}, nullptr, 0,
+                                                 nullptr, &op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  Interpreter interpreter_;
+
+ private:
+  // Build the kernel registration for an op that cancels the operation.
+  TfLiteRegistration CancelOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    // Set output size to the input size in CancelOp::Prepare(). Code exists to
+    // have a framework in Prepare. The input and output tensors are not used.
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      TfLiteTensor* in_tensor = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* out_tensor = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* new_size = TfLiteIntArrayCopy(in_tensor->dims);
+      return context->ResizeTensor(context, out_tensor, new_size);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      cancellation_data_.is_cancelled = true;
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  // Build the kernel registration for an op that returns kTfLiteOk.
+  TfLiteRegistration OkOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    // Set output size to the input size in OkOp::Prepare(). Code exists to have
+    // a framework in Prepare. The input and output tensors are not used.
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      TfLiteTensor* in_tensor = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* out_tensor = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* new_size = TfLiteIntArrayCopy(in_tensor->dims);
+      return context->ResizeTensor(context, out_tensor, new_size);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  void SetUp() final {
+    cancellation_data_.is_cancelled = false;
+
+    // Set up the interpreter. Create the input and output tensors.
+    int num_tensors = 3;
+    ASSERT_EQ(interpreter_.AddTensors(num_tensors), kTfLiteOk);
+    interpreter_.SetInputs({0});
+    interpreter_.SetOutputs({2});
+    TfLiteQuantizationParams quantized;
+    for (int tensor_index = 0; tensor_index < num_tensors; tensor_index++) {
+      ASSERT_EQ(interpreter_.SetTensorParametersReadWrite(
+                    tensor_index, kTfLiteFloat32, "", {3}, quantized),
+                kTfLiteOk);
+    }
+    interpreter_.SetCancellationFunction(&cancellation_data_,
+                                         &CheckCancellation);
+  }
+};
+
+TEST_F(CancellationTest, CancelBeforeInvoke) {
+  // Cancel prior to calling Invoke.
+  CancellationTest::MakeOkNode(1, 2);
+  ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+
+  CancellationTest::Cancel();
+  TfLiteStatus invoke_error_code = CancellationTest::Invoke();
+  ASSERT_EQ(invoke_error_code, kTfLiteError);
+}
+
+TEST_F(CancellationTest, CancelDuringInvoke) {
+  // Tests a model which sets the cancel in order to test cancellation works
+  // between ops.
+  //
+  // The first op will set the cancellation bit to true. The second op returns
+  // `kTfLiteOk` if executed.
+  CancellationTest::MakeCancelNode(0, 1);
+  CancellationTest::MakeOkNode(1, 2);
+  ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+
+  TfLiteStatus invoke_error_code = CancellationTest::Invoke();
+  ASSERT_EQ(invoke_error_code, kTfLiteError);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/java/AndroidManifest.xml b/tensorflow/lite/java/AndroidManifest.xml
index b91c6d149a213926be90b9b131bd632d4f79a0fc..a76a727ec75d231a506b4ef693b3dcd681515b1a 100644
--- a/tensorflow/lite/java/AndroidManifest.xml
+++ b/tensorflow/lite/java/AndroidManifest.xml
@@ -3,7 +3,6 @@
     package="org.tensorflow.lite">
 
     <uses-sdk
-        android:minSdkVersion="4"
         android:targetSdkVersion="19" />
 
     <application />
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index adf7bc9087878ad84824844139058c140d7084f8..8983079a31d7d99dbd666387c0a2c0ded63747e8 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -90,6 +90,9 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.TensorFlowLiteTest",
     deps = [
         ":tensorflowlitelib",
@@ -103,6 +106,9 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.DataTypeTest",
     deps = [
         ":tensorflowlitelib",
@@ -121,10 +127,14 @@ java_test(
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
         "src/testdata/quantized.bin",
+        "src/testdata/string.bin",
         "src/testdata/uint8.bin",
         "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
     deps = [
         ":tensorflowlitelib",
@@ -144,6 +154,9 @@ java_test(
         "//tensorflow/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.InterpreterTest",
     visibility = ["//visibility:private"],
     deps = [
@@ -162,6 +175,9 @@ java_test(
         "//tensorflow/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_oss",  # Currently requires --config=monolithic, b/118895218.
+    ],
     test_class = "org.tensorflow.lite.InterpreterFlexTest",
     visibility = ["//visibility:private"],
     deps = [
@@ -179,6 +195,9 @@ java_test(
         "src/testdata/add.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.TensorTest",
     deps = [
         ":tensorflowlitelib",
@@ -192,6 +211,9 @@ filegroup(
     srcs = select({
         "//conditions:default": [":libtensorflowlite_jni.so"],
     }),
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     visibility = ["//visibility:public"],
 )
 
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
index b8fc282cb1dfe8a9c80692759e985bf369fc163d..8ea16a3417ca9733f518776692114501c4162a0e 100644
--- a/tensorflow/lite/java/demo/app/build.gradle
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -2,7 +2,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion "26.0.1"
+    buildToolsVersion "27.0.3"
     defaultConfig {
         applicationId "android.example.com.tflitecamerademo"
         // Required by Camera2 API.
@@ -10,11 +10,6 @@ android {
         targetSdkVersion 26
         versionCode 1
         versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -40,6 +35,7 @@ repositories {
         url 'https://google.bintray.com/tensorflow'
     }
 }
+
 allprojects {
     repositories {
         // Uncomment if you want to use a local repo.
@@ -48,20 +44,18 @@ allprojects {
     }
 }
 
-
-
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'com.android.support:appcompat-v7:25.2.0'
+    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
+    implementation 'com.android.support:design:25.2.0'
+    implementation 'com.android.support:support-annotations:25.3.1'
+    implementation 'com.android.support:support-v13:25.2.0'
 
     // Build off of nightly TensorFlow Lite
-    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
     // Use local TensorFlow library
-    // compile 'org.tensorflow:tensorflow-lite-local:0.0.0'
+    // implementation 'org.tensorflow:tensorflow-lite-local:0.0.0'
 }
 
 def targetFolder = "src/main/assets"
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 165d33510131ac9c9fc08070f0a4d08653188fae..814d236872caff05e9fbd4dc5aa4a9a995eb586b 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -190,6 +190,8 @@ public class Camera2BasicFragment extends Fragment
 
   int currentModel = -1;
 
+  int currentNumThreads = -1;
+
   /** An additional thread for running tasks that shouldn't block the UI. */
   private HandlerThread backgroundThread;
 
@@ -323,13 +325,16 @@ public class Camera2BasicFragment extends Fragment
     // Get UI information before delegating to background
     final int modelIndex = modelView.getCheckedItemPosition();
     final int deviceIndex = deviceView.getCheckedItemPosition();
+    final int numThreads = np.getValue();
 
     backgroundHandler.post(() -> {
-      if (modelIndex == currentModel && deviceIndex == currentDevice) {
+      if (modelIndex == currentModel && deviceIndex == currentDevice
+              && numThreads == currentNumThreads) {
         return;
       }
       currentModel = modelIndex;
       currentDevice = deviceIndex;
+      currentNumThreads = numThreads;
 
       // Disable classifier while updating
       if (classifier != null) {
@@ -357,7 +362,11 @@ public class Camera2BasicFragment extends Fragment
         classifier = null;
       }
 
-      // Customzie the interpreter to the type of device we want to use.
+      // Customize the interpreter to the type of device we want to use.
+      if (classifier == null) {
+        return;
+      }
+      classifier.setNumThreads(numThreads);
       if (device.equals(cpu)) {
       } else if (device.equals(gpu)) {
         if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
@@ -437,7 +446,7 @@ public class Camera2BasicFragment extends Fragment
         new NumberPicker.OnValueChangeListener() {
           @Override
           public void onValueChange(NumberPicker picker, int oldVal, int newVal) {
-            backgroundHandler.post(() -> classifier.setNumThreads(newVal));
+            updateActiveModel();
           }
         });
 
@@ -476,7 +485,9 @@ public class Camera2BasicFragment extends Fragment
 
   @Override
   public void onDestroy() {
-    classifier.close();
+    if (classifier != null) {
+      classifier.close();
+    }
     super.onDestroy();
   }
 
@@ -805,7 +816,9 @@ public class Camera2BasicFragment extends Fragment
   /** Classifies a frame from the preview stream. */
   private void classifyFrame() {
     if (classifier == null || getActivity() == null || cameraDevice == null) {
-      showToast("Uninitialized Classifier or invalid context.");
+      // It's important to not call showToast every frame, or else the app will starve and
+      // hang. updateActiveModel() already puts a error message up with showToast.
+      // showToast("Uninitialized Classifier or invalid context.");
       return;
     }
     SpannableStringBuilder textToShow = new SpannableStringBuilder();
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index ee71ab808f4810ac092b37b0d996331072f44652..323b21dbcea3bd45f5dbca44aaf4823e4e8009b9 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -16,67 +16,103 @@
 
 <LinearLayout
     xmlns:android="http://schemas.android.com/apk/res/android"
-    android:layout_width="match_parent"
+    xmlns:tools="http://schemas.android.com/tools" android:layout_width="match_parent"
     android:layout_height="match_parent"
     android:background="#bb7700"
     android:orientation="horizontal">
 
-  <com.example.android.tflitecamerademo.AutoFitTextureView
+    <com.example.android.tflitecamerademo.AutoFitTextureView
       android:id="@+id/texture"
       android:layout_width="0dp"
       android:layout_height="match_parent"
       android:layout_weight=".8"/>
 
-  <LinearLayout
+    <LinearLayout
       android:layout_width="0dp"
       android:layout_height="match_parent"
       android:layout_weight=".2"
       android:orientation="vertical">
 
-    <ImageView
-        android:id="@+id/logoview"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:scaleType="centerInside"
-        android:src="@drawable/logo"/>
-
-    <RadioGroup
-        android:gravity="center"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:orientation="horizontal">
-        <RadioButton
-            android:id="@+id/radio_cpu"
-            android:background="#0000000f"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:text="@string/cpu"
-            android:textColor="@android:color/white" />
-        <RadioButton
-            android:id="@+id/radio_nnapi"
-            android:background="#0000000f"
+        <ImageView
+            android:id="@+id/logoview"
             android:layout_width="wrap_content"
+            android:layout_height="47dp"
+            android:scaleType="centerInside"
+            android:src="@drawable/logo"/>
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="160dp"
+            android:paddingTop="20dp"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold"/>
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="match_parent"
+            android:layout_height="150dp"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white"/>
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content">
+
+            </ListView>
+        </LinearLayout>
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="match_parent"
+            android:layout_height="150dp"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white"/>
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"/>
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:layout_width="match_parent"
             android:layout_height="wrap_content"
-            android:text="@string/nnapi"
-            android:textColor="@android:color/white" />
-        </RadioGroup>
-
-    <NumberPicker
-        android:id="@+id/np"
-        android:layout_width="wrap_content"
-        android:layout_height="47dp"
-        android:layout_gravity="center_horizontal"
-        android:visibility="visible"/>
-
-    <TextView
-        android:id="@+id/text"
-        android:textStyle="bold"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:paddingTop="20dp"
-        android:textColor="#FFF"
-        android:textSize="20sp"/>
-
-  </LinearLayout>
+        >
+
+            <TextView
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:gravity="center"
+                android:text="Threads"
+                android:textAlignment="center"
+                android:textColor="@android:color/white"/>
+
+            <NumberPicker
+                android:id="@+id/np"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"
+                android:layout_marginLeft="10dp"
+                android:theme="@style/AppTheme.Picker"
+                android:visibility="visible"/>
+
+        </LinearLayout>
+
+    </LinearLayout>
 </LinearLayout>
 
diff --git a/tensorflow/lite/java/demo/build.gradle b/tensorflow/lite/java/demo/build.gradle
index b78a0b86c939620b6f05483ce45c4d3ef0ef595e..a88b3fdc70d9bbd45fa15ad31b4d38a377621c16 100644
--- a/tensorflow/lite/java/demo/build.gradle
+++ b/tensorflow/lite/java/demo/build.gradle
@@ -2,10 +2,11 @@
 
 buildscript {
     repositories {
+        google()
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
@@ -14,6 +15,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
index fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a..9ff32fe2bb7afeaefdc8b3d6a1ecb0d32e1aed60 100644
--- a/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
+++ b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/tensorflow/lite/java/ovic/BUILD b/tensorflow/lite/java/ovic/BUILD
index 774320871eec9afb2fae31824dc021fb7d338e1e..b00c9cd05809c9a694f32a25ae4fde3c33d40a88 100644
--- a/tensorflow/lite/java/ovic/BUILD
+++ b/tensorflow/lite/java/ovic/BUILD
@@ -19,7 +19,10 @@ java_test(
         "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
+    tags = [
+        "no_mac",
+        "no_oss",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.ovic.OvicClassifierTest",
     visibility = ["//visibility:public"],
     deps = [
@@ -87,7 +90,10 @@ java_test(
         "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
+    tags = [
+        "no_mac",
+        "no_oss",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.ovic.OvicDetectorTest",
     visibility = ["//visibility:public"],
     deps = [
diff --git a/tensorflow/lite/java/ovic/demo/app/build.gradle b/tensorflow/lite/java/ovic/demo/app/build.gradle
index 4f3a6cdb2f8fe58008c9315bf08f4d328e720073..77f568448a810c61ece9feef65fad422356be2f1 100644
--- a/tensorflow/lite/java/ovic/demo/app/build.gradle
+++ b/tensorflow/lite/java/ovic/demo/app/build.gradle
@@ -2,18 +2,13 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion "26.0.1"
+    buildToolsVersion "27.0.3"
     defaultConfig {
         applicationId "android.example.com.ovicbenchmarker"
         minSdkVersion 15
         targetSdkVersion 26
         versionCode 1
         versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -41,12 +36,12 @@ repositories {
 }
 
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'com.android.support:appcompat-v7:25.2.0'
+    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
+    implementation 'com.android.support:design:25.2.0'
+    implementation 'com.android.support:support-annotations:25.3.1'
+    implementation 'com.android.support:support-v13:25.2.0'
 
-    compile 'org.tensorflow:tensorflow-lite:+'
+    implementation 'org.tensorflow:tensorflow-lite:+'
 }
diff --git a/tensorflow/lite/java/ovic/demo/build.gradle b/tensorflow/lite/java/ovic/demo/build.gradle
index b78a0b86c939620b6f05483ce45c4d3ef0ef595e..a88b3fdc70d9bbd45fa15ad31b4d38a377621c16 100644
--- a/tensorflow/lite/java/ovic/demo/build.gradle
+++ b/tensorflow/lite/java/ovic/demo/build.gradle
@@ -2,10 +2,11 @@
 
 buildscript {
     repositories {
+        google()
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
@@ -14,6 +15,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
index fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a..9ff32fe2bb7afeaefdc8b3d6a1ecb0d32e1aed60 100644
--- a/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
+++ b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 1b2d0d5aa841942a0202a876ddef7ce368e756fb..5aef4fb05723d170e0c8b08ac18bce44bd11eb7b 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -234,11 +234,15 @@ public final class Interpreter implements AutoCloseable {
    *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
    *     input data for primitive types, whereas string types require using the (multi-dimensional)
    *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
-   *     until model inference is done.
+   *     until model inference is done. A {@code null} value is allowed only if the caller is using
+   *     a {@link Delegate} that allows buffer handle interop, and such a buffer has been bound to
+   *     the input {@link Tensor}.
    * @param output a multidimensional array of output data, or a {@link ByteBuffer} of primitive
-   *     types including int, float, long, and byte.
+   *     types including int, float, long, and byte. A null value is allowed only if the caller is
+   *     using a {@link Delegate} that allows buffer handle interop, and such a buffer has been
+   *     bound to the output {@link Tensor}. See also {@link Options#setAllowBufferHandleOutput()}.
    */
-  public void run(@NonNull Object input, @NonNull Object output) {
+  public void run(Object input, Object output) {
     Object[] inputs = {input};
     Map<Integer, Object> outputs = new HashMap<>();
     outputs.put(0, output);
@@ -251,6 +255,10 @@ public final class Interpreter implements AutoCloseable {
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
    * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
+   * <p>Note: {@code null} values for invididual elements of {@code inputs} and {@code outputs} is
+   * allowed only if the caller is using a {@link Delegate} that allows buffer handle interop, and
+   * such a buffer has been bound to the corresponding input or output {@link Tensor}(s).
+   *
    * @param inputs an array of input data. The inputs should be in the same order as inputs of the
    *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
    *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index b56fcd772b1124e691e2f1c6e22d27bacb4235cb..725bb326ba1d6a9d9c206cd4fb01bdf687b0a79c 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -93,12 +93,20 @@ public final class Tensor {
    * Copies the contents of the provided {@code src} object to the Tensor.
    *
    * <p>The {@code src} should either be a (multi-dimensional) array with a shape matching that of
-   * this tensor, or a {@link ByteByffer} of compatible primitive type with a matching flat size.
+   * this tensor, a {@link ByteByffer} of compatible primitive type with a matching flat size, or
+   * {@code null} iff the tensor has an underlying delegate buffer handle.
    *
    * @throws IllegalArgumentException if the tensor is a scalar or if {@code src} is not compatible
    *     with the tensor (for example, mismatched data types or shapes).
    */
   void setTo(Object src) {
+    if (src == null) {
+      if (hasDelegateBufferHandle(nativeHandle)) {
+        return;
+      }
+      throw new IllegalArgumentException(
+          "Null inputs are allowed only if the Tensor is bound to a buffer handle.");
+    }
     throwExceptionIfTypeIsIncompatible(src);
     if (isByteBuffer(src)) {
       ByteBuffer srcBuffer = (ByteBuffer) src;
@@ -117,11 +125,19 @@ public final class Tensor {
   /**
    * Copies the contents of the tensor to {@code dst} and returns {@code dst}.
    *
-   * @param dst the destination buffer, either an explicitly-typed array or a {@link ByteBuffer}.
+   * @param dst the destination buffer, either an explicitly-typed array, a {@link ByteBuffer} or
+   *     {@code null} iff the tensor has an underlying delegate buffer handle.
    * @throws IllegalArgumentException if {@code dst} is not compatible with the tensor (for example,
    *     mismatched data types or shapes).
    */
   Object copyTo(Object dst) {
+    if (dst == null) {
+      if (hasDelegateBufferHandle(nativeHandle)) {
+        return dst;
+      }
+      throw new IllegalArgumentException(
+          "Null outputs are allowed only if the Tensor is bound to a buffer handle.");
+    }
     throwExceptionIfTypeIsIncompatible(dst);
     if (dst instanceof ByteBuffer) {
       ByteBuffer dstByteBuffer = (ByteBuffer) dst;
@@ -135,6 +151,9 @@ public final class Tensor {
   /** Returns the provided buffer's shape if specified and different from this Tensor's shape. */
   // TODO(b/80431971): Remove this method after deprecating multi-dimensional array inputs.
   int[] getInputShapeIfDifferent(Object input) {
+    if (input == null) {
+      return null;
+    }
     // Implicit resizes based on ByteBuffer capacity isn't supported, so short-circuit that path.
     // The ByteBuffer's size will be validated against this Tensor's size in {@link #setTo(Object)}.
     if (isByteBuffer(input)) {
@@ -287,9 +306,7 @@ public final class Tensor {
 
   private static native int numBytes(long handle);
 
-  private static native int setBufferHandle(long handle, long delegateHandle, int bufferHandle);
-
-  private static native int bufferHandle(long handle);
+  private static native boolean hasDelegateBufferHandle(long handle);
 
   private static native void readMultiDimensionalArray(long handle, Object dst);
 
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc
index cc81eb8d517f16378e6cf3e00f572a48b93178bc..f07437e7f318944d6d254c5820d58fccc5a74f87 100644
--- a/tensorflow/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc
@@ -410,6 +410,17 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
   return static_cast<jint>(tensor->bytes);
 }
 
+JNIEXPORT jboolean JNICALL
+Java_org_tensorflow_lite_Tensor_hasDelegateBufferHandle(JNIEnv* env,
+                                                        jclass clazz,
+                                                        jlong handle) {
+  const TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return false;
+  return tensor->delegate && (tensor->buffer_handle != kTfLiteNullBufferHandle)
+             ? JNI_TRUE
+             : JNI_FALSE;
+}
+
 JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_index(JNIEnv* env,
                                                              jclass clazz,
                                                              jlong handle) {
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.h b/tensorflow/lite/java/src/main/native/tensor_jni.h
index 52150bf3ab3106b082530de168a32090a00491ba..a14f24a47d0861881870558a4d7b0cd5082d713a 100644
--- a/tensorflow/lite/java/src/main/native/tensor_jni.h
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.h
@@ -84,6 +84,16 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
                                                                 jclass clazz,
                                                                 jlong handle);
 
+/*
+ *  Class:     org_tensorflow_lite_Tensor
+ *  Method:    hasDelegateBufferHandle
+ *  Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_org_tensorflow_lite_Tensor_hasDelegateBufferHandle(JNIEnv* env,
+                                                        jclass clazz,
+                                                        jlong handle);
+
 /*
  *  Class:     org_tensorflow_lite_Tensor
  *  Method:    readMultiDimensionalArray
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index f89062ba4584862044c86bc613398a8c2d3404ad..c5496e3a21e7f5d27c36d92e49dd6c8e622b0070 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -334,6 +334,30 @@ public final class InterpreterTest {
     interpreter.close();
   }
 
+  @Test
+  public void testNullInputs() throws Exception {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    try {
+      interpreter.run(null, new float[2][8][8][3]);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Expected failure.
+    }
+    interpreter.close();
+  }
+
+  @Test
+  public void testNullOutputs() throws Exception {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    try {
+      interpreter.run(new float[2][8][8][3], null);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Expected failure.
+    }
+    interpreter.close();
+  }
+
   /** Smoke test validating that flex model loading fails when the flex delegate is not linked. */
   @Test
   public void testFlexModel() throws Exception {
@@ -372,6 +396,25 @@ public final class InterpreterTest {
     interpreter.close();
   }
 
+  @Test
+  public void testNullInputsAndOutputsWithDelegate() throws Exception {
+    System.loadLibrary("tensorflowlite_test_jni");
+    Delegate delegate =
+        new Delegate() {
+          @Override
+          public long getNativeHandle() {
+            return getNativeHandleForDelegate();
+          }
+        };
+    Interpreter interpreter =
+        new Interpreter(MODEL_FILE, new Interpreter.Options().addDelegate(delegate));
+    // The delegate installs a custom buffer handle for all tensors, in turn allowing null to be
+    // provided for the inputs/outputs (as the client can reference the buffer directly).
+    interpreter.run(new float[2][8][8][3], null);
+    interpreter.run(null, new float[2][8][8][3]);
+    interpreter.close();
+  }
+
   @Test
   public void testModifyGraphWithDelegate() throws Exception {
     System.loadLibrary("tensorflowlite_test_jni");
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
index be6a706b8d475375145578cf47e73e5b2acf932e..d9b20510106909d53b9024986a4daa88fc355177 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
@@ -78,6 +78,16 @@ public final class TensorTest {
     assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
   }
 
+  @Test
+  public void testCopyToNull() {
+    try {
+      tensor.copyTo(null);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Success.
+    }
+  }
+
   @Test
   public void testCopyToByteBuffer() {
     ByteBuffer parsedOutput =
@@ -150,6 +160,16 @@ public final class TensorTest {
     assertThat(output[0][0][0][0]).isEqualTo(3.0f);
   }
 
+  @Test
+  public void testSetToNull() {
+    try {
+      tensor.setTo(null);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Success.
+    }
+  }
+
   @Test
   public void testSetToInvalidByteBuffer() {
     ByteBuffer input = ByteBuffer.allocateDirect(3 * 4).order(ByteOrder.nativeOrder());
diff --git a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
index 000e718ba7a641278fd132768d71d4eaea660bc0..f5bcc1249f0c2d6f624e7f9f4ae40ec912e3c401 100644
--- a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
+++ b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
@@ -49,8 +49,6 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
       .custom_name = "",
       .version = 1,
   };
-  // A simple delegate which replaces all ops with a single op that outputs a
-  // vector of length 1 with the value [7].
   static TfLiteDelegate delegate = {
       .data_ = nullptr,
       .Prepare = [](TfLiteContext* context,
@@ -60,6 +58,11 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
             context->GetExecutionPlan(context, &execution_plan));
         context->ReplaceNodeSubsetsWithDelegateKernels(
             context, registration, execution_plan, delegate);
+        // Now bind delegate buffer handles for all tensors.
+        for (size_t i = 0; i < context->tensors_size; ++i) {
+          context->tensors[i].delegate = delegate;
+          context->tensors[i].buffer_handle = static_cast<int>(i);
+        }
         return kTfLiteOk;
       },
       .CopyFromBufferHandle = nullptr,
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index bad1c4aebf1e9d9c7c6d33f87a6e7ea9cab8d700..8f7860242a3b7758c7f43cb0fbaee0d79842c9bb 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -25,9 +25,6 @@ tf_cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -49,6 +46,7 @@ cc_library(
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/tools/optimize:quantization_utils",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -114,7 +112,6 @@ cc_library(
     deps = [
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels/internal:round",
-        "//tensorflow/lite/kernels/internal:types",
     ],
 )
 
@@ -122,9 +119,6 @@ tf_cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":kernel_util",
         "//tensorflow/lite/testing:util",
@@ -136,9 +130,6 @@ tf_cc_test(
     name = "test_util_test",
     size = "small",
     srcs = ["test_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":test_util",
         "//tensorflow/lite/testing:util",
@@ -167,6 +158,7 @@ cc_library(
         "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
         "cast.cc",
+        "ceil.cc",
         "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
@@ -209,13 +201,13 @@ cc_library(
         "reshape.cc",
         "resize_bilinear.cc",
         "resize_nearest_neighbor.cc",
+        "reverse.cc",
         "select.cc",
         "shape.cc",
         "skip_gram.cc",
         "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
-        "sparse_output_fully_connected.cc",
         "sparse_to_dense.cc",
         "split.cc",
         "split_v.cc",
@@ -230,6 +222,7 @@ cc_library(
         "transpose_conv.cc",
         "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
+        "unique.cc",
         "unpack.cc",
         "zeros_like.cc",
     ],
@@ -285,13 +278,24 @@ cc_library(
     ],
 )
 
+# The builtin_ops target will resolve to optimized kernels when available. This
+# target uses reference kernels only, and is useful for validation and testing.
+# It should *not* generally be used in production.
+cc_library(
+    name = "reference_ops",
+    srcs = ["register_ref.cc"],
+    hdrs = ["register_ref.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
 tf_cc_test(
     name = "audio_spectrogram_test",
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -305,9 +309,6 @@ tf_cc_test(
     name = "mfcc_test",
     size = "small",
     srcs = ["mfcc_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -321,9 +322,6 @@ tf_cc_test(
     name = "detection_postprocess_test",
     size = "small",
     srcs = ["detection_postprocess_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -337,25 +335,6 @@ tf_cc_test(
     name = "relu1_test",
     size = "small",
     srcs = ["relu1_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "sparse_output_fully_connected_test",
-    size = "small",
-    srcs = ["sparse_output_fully_connected_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -369,7 +348,6 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -382,7 +360,6 @@ tf_cc_test(
     name = "add_test",
     size = "small",
     srcs = ["add_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -395,9 +372,6 @@ tf_cc_test(
     name = "arg_min_max_test",
     size = "small",
     srcs = ["arg_min_max_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -410,9 +384,6 @@ tf_cc_test(
     name = "div_test",
     size = "small",
     srcs = ["div_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -425,9 +396,6 @@ tf_cc_test(
     name = "sub_test",
     size = "small",
     srcs = ["sub_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -440,9 +408,6 @@ tf_cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -457,9 +422,6 @@ tf_cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -472,9 +434,6 @@ tf_cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -487,9 +446,6 @@ tf_cc_test(
     name = "cast_test",
     size = "small",
     srcs = ["cast_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -502,7 +458,6 @@ tf_cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -515,7 +470,6 @@ tf_cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -529,7 +483,6 @@ tf_cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -543,13 +496,11 @@ tf_cc_test(
     name = "dequantize_test",
     size = "small",
     srcs = ["dequantize_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:types",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
@@ -559,7 +510,6 @@ tf_cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -572,9 +522,6 @@ tf_cc_test(
     name = "bidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -588,6 +535,18 @@ tf_cc_test(
     name = "floor_test",
     size = "small",
     srcs = ["floor_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "ceil_test",
+    size = "small",
+    srcs = ["ceil_test.cc"],
     tags = [
         "tflite_not_portable_ios",
     ],
@@ -603,9 +562,6 @@ tf_cc_test(
     name = "elementwise_test",
     size = "small",
     srcs = ["elementwise_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -618,9 +574,6 @@ tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -633,9 +586,6 @@ tf_cc_test(
     name = "bidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -648,9 +598,6 @@ tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -663,7 +610,6 @@ tf_cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -676,9 +622,6 @@ tf_cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -691,9 +634,6 @@ tf_cc_test(
     name = "fake_quant_test",
     size = "small",
     srcs = ["fake_quant_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -706,9 +646,6 @@ tf_cc_test(
     name = "maximum_minimum_test",
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -721,9 +658,6 @@ tf_cc_test(
     name = "reduce_test",
     size = "small",
     srcs = ["reduce_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -736,7 +670,6 @@ tf_cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -749,9 +682,6 @@ tf_cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -764,7 +694,6 @@ tf_cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -777,9 +706,6 @@ tf_cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -793,9 +719,6 @@ tf_cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -809,7 +732,6 @@ tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -822,7 +744,6 @@ tf_cc_test(
     name = "resize_nearest_neighbor_test",
     size = "small",
     srcs = ["resize_nearest_neighbor_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -835,7 +756,6 @@ tf_cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -848,7 +768,6 @@ tf_cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -861,7 +780,6 @@ tf_cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -874,7 +792,6 @@ tf_cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -889,7 +806,6 @@ tf_cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -902,7 +818,6 @@ tf_cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -915,7 +830,6 @@ tf_cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -929,9 +843,6 @@ tf_cc_test(
     name = "log_softmax_test",
     size = "small",
     srcs = ["log_softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -945,7 +856,6 @@ tf_cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -958,7 +868,6 @@ tf_cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -972,7 +881,6 @@ tf_cc_test(
     name = "layer_norm_lstm_test",
     size = "small",
     srcs = ["layer_norm_lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -986,7 +894,6 @@ tf_cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -999,7 +906,6 @@ tf_cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1013,7 +919,6 @@ tf_cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1026,9 +931,6 @@ tf_cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1041,9 +943,6 @@ tf_cc_test(
     name = "split_v_test",
     size = "small",
     srcs = ["split_v_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1056,9 +955,6 @@ tf_cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1071,9 +967,6 @@ tf_cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1086,9 +979,6 @@ tf_cc_test(
     name = "tile_test",
     size = "small",
     srcs = ["tile_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1104,9 +994,6 @@ tf_cc_test(
     srcs = [
         "comparisons_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1119,9 +1006,6 @@ tf_cc_test(
     name = "neg_test",
     size = "small",
     srcs = ["neg_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1136,9 +1020,6 @@ tf_cc_test(
     srcs = [
         "select_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1153,9 +1034,6 @@ tf_cc_test(
     srcs = [
         "slice_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1168,9 +1046,6 @@ tf_cc_test(
     name = "transpose_conv_test",
     size = "small",
     srcs = ["transpose_conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1184,9 +1059,6 @@ tf_cc_test(
     name = "expand_dims_test",
     size = "small",
     srcs = ["expand_dims_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1200,9 +1072,6 @@ tf_cc_test(
     name = "sparse_to_dense_test",
     size = "small",
     srcs = ["sparse_to_dense_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1216,9 +1085,6 @@ tf_cc_test(
     name = "shape_test",
     size = "small",
     srcs = ["shape_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1232,9 +1098,6 @@ tf_cc_test(
     name = "pow_test",
     size = "small",
     srcs = ["pow_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1248,7 +1111,6 @@ tf_cc_test(
     name = "pack_test",
     size = "small",
     srcs = ["pack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1262,7 +1124,6 @@ tf_cc_test(
     name = "one_hot_test",
     size = "small",
     srcs = ["one_hot_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1275,7 +1136,6 @@ tf_cc_test(
     name = "logical_test",
     size = "small",
     srcs = ["logical_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1289,7 +1149,6 @@ tf_cc_test(
     name = "unpack_test",
     size = "small",
     srcs = ["unpack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1303,7 +1162,6 @@ tf_cc_test(
     name = "floor_div_test",
     size = "small",
     srcs = ["floor_div_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1317,7 +1175,6 @@ tf_cc_test(
     name = "zeros_like_test",
     size = "small",
     srcs = ["zeros_like_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1331,7 +1188,6 @@ tf_cc_test(
     name = "floor_mod_test",
     size = "small",
     srcs = ["floor_mod_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1345,7 +1201,6 @@ tf_cc_test(
     name = "range_test",
     size = "small",
     srcs = ["range_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1359,7 +1214,6 @@ tf_cc_test(
     name = "squared_difference_test",
     size = "small",
     srcs = ["squared_difference_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1372,7 +1226,29 @@ tf_cc_test(
     name = "fill_test",
     size = "small",
     srcs = ["fill_test.cc"],
-    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "unique_test",
+    srcs = ["unique_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_test(
+    name = "reverse_test",
+    size = "small",
+    srcs = ["reverse_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index ab09cf7196a951ded20f22e404570254be6ed233..4463a6c5a65bf848ad68635717750d3a214dd0a0 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -33,6 +34,11 @@ namespace ops {
 namespace builtin {
 namespace activations {
 
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
 struct OpData {
   int32_t input_multiplier = 0;
   int input_left_shift = 0;
@@ -50,6 +56,20 @@ struct PreluOpData : public OpData {
   int output_shift = 0;
 };
 
+namespace {
+TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
+                                    const TfLiteTensor* input,
+                                    const TfLiteTensor* output) {
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+  } else {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
+  }
+  return kTfLiteOk;
+}
+}  // namespace
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -215,12 +235,12 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   const int num_dims = NumDimensions(input);
   TF_LITE_ENSURE(context, num_dims >= 1 && num_dims <= 4);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (CheckOutputQuantParams(context, input, output) == kTfLiteError) {
+      return kTfLiteError;
+    }
 
     static const int kScaledDiffIntegerBits = 5;
-
     tflite::PreprocessSoftmaxScaling(
         params->beta, input->params.scale, kScaledDiffIntegerBits,
         &data->input_multiplier, &data->input_left_shift);
@@ -367,22 +387,36 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+template <KernelType kernel_type>
 TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
-      optimized_ops::Tanh(GetTensorShape(input), GetTensorData<float>(input),
-                          GetTensorShape(output), GetTensorData<float>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Tanh(GetTensorShape(input), GetTensorData<float>(input),
+                            GetTensorShape(output),
+                            GetTensorData<float>(output));
+      } else {
+        reference_ops::Tanh(GetTensorShape(input), GetTensorData<float>(input),
+                            GetTensorShape(output),
+                            GetTensorData<float>(output));
+      }
       return kTfLiteOk;
     } break;
     case kTfLiteInt16: {
       TanhParams params;
       params.input_left_shift = data->input_left_shift;
-      optimized_ops::Tanh(params, GetTensorShape(input),
-                          GetTensorData<int16_t>(input), GetTensorShape(output),
-                          GetTensorData<int16_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        reference_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      }
       return kTfLiteOk;
     } break;
     case kTfLiteUInt8: {
@@ -391,9 +425,15 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       params.input_range_radius = data->input_range_radius;
       params.input_multiplier = data->input_multiplier;
       params.input_left_shift = data->input_left_shift;
-      optimized_ops::Tanh(params, GetTensorShape(input),
-                          GetTensorData<uint8_t>(input), GetTensorShape(output),
-                          GetTensorData<uint8_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      } else {
+        reference_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      }
       return kTfLiteOk;
     } break;
     default:
@@ -404,6 +444,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 // Sigmoid is also know as "Logistic".
+template <KernelType kernel_type>
 TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
@@ -411,18 +452,28 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
-      size_t elements = input->bytes / sizeof(float);
-      float* in = input->data.f;
-      float* in_end = in + elements;
-      float* out = output->data.f;
-      for (; in < in_end; in++, out++) *out = 1.f / (1.f + std::exp(-*in));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Logistic(
+            GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      } else {
+        reference_ops::Logistic(
+            GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      }
       break;
     }
     case kTfLiteInt16: {
       LogisticParams params;
-      optimized_ops::Logistic(
-          params, GetTensorShape(input), GetTensorData<int16_t>(input),
-          GetTensorShape(output), GetTensorData<int16_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        reference_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      }
       break;
     }
     case kTfLiteUInt8: {
@@ -431,9 +482,15 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       params.input_range_radius = data->input_range_radius;
       params.input_multiplier = data->input_multiplier;
       params.input_left_shift = data->input_left_shift;
-      optimized_ops::Logistic(
-          params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      } else {
+        reference_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      }
       break;
     }
     default:
@@ -505,8 +562,8 @@ void Softmax3DFloat(const TfLiteTensor* input, TfLiteTensor* output,
       GetTensorData<float>(output));
 }
 
-void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax1DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   // TODO(ahentz): this is arguably a dirty trick. Since the implementation
   // always traverses the last dimension of a 4D tensor, we will pretend our 1D
   // tensor is 4D in a special way. We will convert a (Y) shape into a (1,
@@ -521,8 +578,8 @@ void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorShape({1, 1, 1, input_size}),
                          GetTensorData<uint8_t>(output));
 }
-void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax2DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   // TODO(ahentz): this is arguably a dirty trick. Since the implementation
   // always traverses the last dimension of a 4D tensor, we will pretend our 2D
   // tensor is 4D in a special way. We will convert a (X, Y) shape into a (X,
@@ -540,8 +597,8 @@ void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorData<uint8_t>(output));
 }
 
-void Softmax3DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax3DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   const int batch_size = input->dims->data[0];
   const int intermediate_size = input->dims->data[1];
   const int input_size = input->dims->data[2];
@@ -566,8 +623,8 @@ void Softmax4DFloat(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorData<float>(output));
 }
 
-void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax4DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   SoftmaxParams op_params;
   op_params.input_multiplier = data->input_multiplier;
   op_params.input_left_shift = data->input_left_shift;
@@ -577,6 +634,63 @@ void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorData<uint8_t>(output));
 }
 
+// TODO(jianlijianli): Try merging Softmax<n>DQuantizedInt8 with
+// Softmax<n>DQuantized, which needs a larger refactor.
+void Softmax1DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  const int input_size = input->dims->data[0];
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(
+      op_params, GetTensorShape({1, 1, 1, input_size}),
+      GetTensorData<int8_t>(input), GetTensorShape({1, 1, 1, input_size}),
+      GetTensorData<int8_t>(output));
+}
+
+void Softmax2DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(op_params,
+                                 GetTensorShape({batch_size, 1, 1, input_size}),
+                                 GetTensorData<int8_t>(input),
+                                 GetTensorShape({batch_size, 1, 1, input_size}),
+                                 GetTensorData<int8_t>(output));
+}
+
+void Softmax3DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  const int batch_size = input->dims->data[0];
+  const int intermediate_size = input->dims->data[1];
+  const int input_size = input->dims->data[2];
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(
+      op_params, GetTensorShape({batch_size, intermediate_size, 1, input_size}),
+      GetTensorData<int8_t>(input),
+      GetTensorShape({batch_size, intermediate_size, 1, input_size}),
+      GetTensorData<int8_t>(output));
+}
+
+void Softmax4DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(output), GetTensorData<int8_t>(output));
+}
+
 TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
@@ -611,19 +725,19 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
     }
     case kTfLiteUInt8: {
       if (NumDimensions(input) == 1) {
-        Softmax1DQuantized(input, output, params, data);
+        Softmax1DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       if (NumDimensions(input) == 2) {
-        Softmax2DQuantized(input, output, params, data);
+        Softmax2DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       if (NumDimensions(input) == 3) {
-        Softmax3DQuantized(input, output, params, data);
+        Softmax3DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       if (NumDimensions(input) == 4) {
-        Softmax4DQuantized(input, output, params, data);
+        Softmax4DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       context->ReportError(
@@ -631,6 +745,30 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
           NumDimensions(input));
       return kTfLiteError;
     }
+    case kTfLiteInt8: {
+      if (NumDimensions(input) == 1) {
+        Softmax1DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 2) {
+        Softmax2DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 3) {
+        Softmax3DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 4) {
+        Softmax4DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      context->ReportError(
+          context,
+          "Only 4D tensors supported currently for Int8 kernels, got %dD.",
+          NumDimensions(input));
+      return kTfLiteError;
+    }
+
     default:
       context->ReportError(
           context, "Only float32 and uint8_t supported currently, got %s.",
@@ -639,6 +777,7 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+template <KernelType kernel_type>
 TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   const LogSoftmaxOpData* data =
       reinterpret_cast<LogSoftmaxOpData*>(node->user_data);
@@ -647,9 +786,15 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32: {
       SoftmaxParams op_params;
-      optimized_ops::LogSoftmax(
-          op_params, GetTensorShape(input), GetTensorData<float>(input),
-          GetTensorShape(output), GetTensorData<float>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      } else {
+        reference_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      }
       return kTfLiteOk;
     }
     case kTfLiteUInt8: {
@@ -659,9 +804,15 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       op_params.reverse_scaling_divisor = data->reverse_scaling_divisor;
       op_params.reverse_scaling_right_shift = data->reverse_scaling_right_shift;
       op_params.diff_min = data->diff_min;
-      optimized_ops::LogSoftmax(
-          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      } else {
+        reference_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      }
       return kTfLiteOk;
     }
     default:
@@ -756,17 +907,31 @@ TfLiteRegistration* Register_RELU6() {
   return &r;
 }
 
+TfLiteRegistration* Register_TANH_REF() {
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::TanhPrepare,
+      activations::TanhEval<activations::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_TANH() {
-  static TfLiteRegistration r = {activations::Init, activations::Free,
-                                 activations::TanhPrepare,
-                                 activations::TanhEval};
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::TanhPrepare,
+      activations::TanhEval<activations::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_LOGISTIC_REF() {
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::SigmoidPrepare,
+      activations::SigmoidEval<activations::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_LOGISTIC() {
-  static TfLiteRegistration r = {activations::Init, activations::Free,
-                                 activations::SigmoidPrepare,
-                                 activations::SigmoidEval};
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::SigmoidPrepare,
+      activations::SigmoidEval<activations::kGenericOptimized>};
   return &r;
 }
 
@@ -777,10 +942,19 @@ TfLiteRegistration* Register_SOFTMAX() {
   return &r;
 }
 
+TfLiteRegistration* Register_LOG_SOFTMAX_REF() {
+  static TfLiteRegistration r = {
+      activations::LogSoftmaxInit, activations::LogSoftmaxFree,
+      activations::LogSoftmaxPrepare,
+      activations::LogSoftmaxEval<activations::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_LOG_SOFTMAX() {
   static TfLiteRegistration r = {
       activations::LogSoftmaxInit, activations::LogSoftmaxFree,
-      activations::LogSoftmaxPrepare, activations::LogSoftmaxEval};
+      activations::LogSoftmaxPrepare,
+      activations::LogSoftmaxEval<activations::kGenericOptimized>};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 67f137baff29808d7a03571e1880901e44c34712..5e3c56ed5bf7092581fbbced6d3735958c19580c 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -44,6 +44,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     input_ = AddInput(input);
     if (input.type == TensorType_UINT8) {
       output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else if (input.type == TensorType_INT8) {
+      output_ = AddOutput({TensorType_INT8, {}, 0, 0, 1. / 256, -128});
     } else {
       output_ = AddOutput({input.type, {}});
     }
@@ -52,8 +54,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     BuildInterpreter({GetShape(input_)});
   }
 
-  BaseActivationsOpModel(BuiltinOperator type, const TensorData &input,
-                         const TensorData &output) {
+  BaseActivationsOpModel(BuiltinOperator type, const TensorData& input,
+                         const TensorData& output) {
     input_ = AddInput(input);
     output_ = AddOutput(output);
     SetBuiltinOp(type, BuiltinOptions_NONE, 0);
@@ -323,7 +325,7 @@ TEST(FloatActivationsOpTest, Softmax4D) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax4D) {
+TEST(QuantizedActivationsOpTest, Softmax4DUint8) {
   QuantizedActivationsOpModel m(
       0.1,
       /*input=*/{TensorType_UINT8, {1, 2, 1, 4}, -10, 10});
@@ -362,6 +364,145 @@ TEST(QuantizedActivationsOpTest, Softmax4D) {
                   kQuantizedTolerance)));
 }
 
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax1D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax1DInt8) {
+  QuantizedActivationsOpModel m(0.1,
+                                /*input=*/{TensorType_INT8, {8}, -10, 10});
+  m.SetInput<int8_t>({0, -6, 2, 4, 3, -2, 10, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.09766, 0.05469, 0.12109, 0.14453,
+                                       0.13281, 0.07813, 0.26563, 0.10938},
+                                      kQuantizedTolerance)));
+}
+
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax2D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax2DInt8) {
+  QuantizedActivationsOpModel m(0.1,
+                                /*input=*/{TensorType_INT8, {2, 4}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(0.1,
+                                 /*input=*/{TensorType_INT8, {4, 2}, -10, 10});
+  m2.SetInput<int8_t>({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.645656, 0.354344,  //
+                      0.450166, 0.549834,  //
+                      0.622459, 0.377541,  //
+                      0.710949, 0.28905,   //
+                  },
+                  kQuantizedTolerance)));
+}
+
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax3D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax3DInt8) {
+  QuantizedActivationsOpModel m(
+      0.1,
+      /*input=*/{TensorType_INT8, {1, 2, 4}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   // depth = 0
+      3, -2, 10, 1,  // depth = 1
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(
+      0.1,
+      /*input=*/{TensorType_INT8, {4, 1, 2}, -10, 10});
+  m2.SetInput<int8_t>({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.645656, 0.354344,  //
+                      0.450166, 0.549834,  //
+                      0.622459, 0.377541,  //
+                      0.710949, 0.28905,   //
+                  },
+                  kQuantizedTolerance)));
+}
+
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax4D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax4DInt8) {
+  QuantizedActivationsOpModel m(
+      0.1,
+      /*input=*/{TensorType_INT8, {1, 2, 1, 4}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   // depth = 0
+      3, -2, 10, 1,  // depth = 1
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({
+                                         -68, -95, -54, -38,  //
+                                         -70, -93, -12, -81,  //
+                                     }));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(
+      0.1,
+      /*input=*/{TensorType_INT8, {4, 1, 1, 2}, -10, 10});
+  m2.SetInput<int8_t>({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.645656, 0.354344,  //
+                      0.450166, 0.549834,  //
+                      0.622459, 0.377541,  //
+                      0.710949, 0.28905,   //
+                  },
+                  kQuantizedTolerance)));
+}
+
 TEST(FloatActivationsOpTest, Softmax3D) {
   FloatActivationsOpModel m(0.1,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4}});
@@ -393,7 +534,7 @@ TEST(FloatActivationsOpTest, Softmax3D) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax3D) {
+TEST(QuantizedActivationsOpTest, Softmax3DUint8) {
   QuantizedActivationsOpModel m(
       0.1,
       /*input=*/{TensorType_UINT8, {1, 2, 4}, -10, 10});
@@ -443,7 +584,7 @@ TEST(FloatActivationsOpTest, Softmax1D) {
           {.09752, .05352, .11911, .14548, .13164, .07984, .26509, .10778})));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax1D) {
+TEST(QuantizedActivationsOpTest, Softmax1DUint8) {
   QuantizedActivationsOpModel m(0.1,
                                 /*input=*/{TensorType_UINT8, {8}, -10, 10});
   m.SetInput<uint8_t>({0, -6, 2, 4, 3, -2, 10, 1});
@@ -486,7 +627,7 @@ TEST(FloatActivationsOpTest, Softmax2D) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax2D) {
+TEST(QuantizedActivationsOpTest, Softmax2DUint8) {
   QuantizedActivationsOpModel m(0.1,
                                 /*input=*/{TensorType_UINT8, {2, 4}, -10, 10});
   m.SetInput<uint8_t>({
diff --git a/tensorflow/lite/kernels/arg_min_max.cc b/tensorflow/lite/kernels/arg_min_max.cc
index eea2de27f74af8bf73df92c28ed6042e4d8fa4ff..f9adf6bc4e8eea8a653f3b4a2f8843675d858417 100644
--- a/tensorflow/lite/kernels/arg_min_max.cc
+++ b/tensorflow/lite/kernels/arg_min_max.cc
@@ -36,9 +36,15 @@ TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* input,
     axis_value += NumDimensions(input);
   }
 
-  // Copy the input dimensions to output except make the axis dimension 1.
-  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
-  output_dims->data[axis_value] = 1;
+  // Copy the input dimensions to output except the axis dimension.
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(NumDimensions(input) - 1);
+  int j = 0;
+  for (int i = 0; i < NumDimensions(input); ++i) {
+    if (i != axis_value) {
+      output_dims->data[j] = SizeOfDimension(input, i);
+      ++j;
+    }
+  }
   return context->ResizeTensor(context, output, output_dims);
 }
 
diff --git a/tensorflow/lite/kernels/arg_min_max_test.cc b/tensorflow/lite/kernels/arg_min_max_test.cc
index dcdff74cc6f376b3418b64c025e8eb4a36c429a0..1b1000f29e88b6837544ac146dc8d7876cfe036b 100644
--- a/tensorflow/lite/kernels/arg_min_max_test.cc
+++ b/tensorflow/lite/kernels/arg_min_max_test.cc
@@ -83,7 +83,7 @@ TEST(ArgMaxOpTest, GetMaxArgFloat) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgInt) {
@@ -94,7 +94,7 @@ TEST(ArgMaxOpTest, GetMaxArgInt) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
@@ -105,7 +105,7 @@ TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
@@ -116,7 +116,7 @@ TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1, 0, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 4}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgOutput64) {
@@ -127,7 +127,7 @@ TEST(ArgMaxOpTest, GetMaxArgOutput64) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMinOpTest, GetMinArgFloat) {
@@ -138,7 +138,7 @@ TEST(ArgMinOpTest, GetMinArgFloat) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMinOpTest, GetMinArgInt) {
@@ -149,7 +149,7 @@ TEST(ArgMinOpTest, GetMinArgInt) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMinOpTest, GetMinArgMulDimensions) {
@@ -160,7 +160,7 @@ TEST(ArgMinOpTest, GetMinArgMulDimensions) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
@@ -171,7 +171,7 @@ TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0, 0, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 4}));
 }
 
 TEST(ArgMinOpTest, GetMinArgOutput64) {
@@ -182,7 +182,7 @@ TEST(ArgMinOpTest, GetMinArgOutput64) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/basic_rnn.cc b/tensorflow/lite/kernels/basic_rnn.cc
index 7c66ce1992f4c341d7518742cd209a53fa1de16b..a2c38b3b7d8c573244be803225398504a6c45f86 100644
--- a/tensorflow/lite/kernels/basic_rnn.cc
+++ b/tensorflow/lite/kernels/basic_rnn.cc
@@ -27,6 +27,16 @@ namespace ops {
 namespace builtin {
 namespace rnn {
 
+namespace {
+int8_t* GetInt8DataPtr(const TfLiteTensor* tensor, const bool is_uint8) {
+  if (is_uint8) {
+    return reinterpret_cast<int8_t*>(tensor->data.uint8);
+  } else {
+    return tensor->data.int8;
+  }
+}
+}  // namespace
+
 constexpr int kInputTensor = 0;
 constexpr int kWeightsTensor = 1;
 constexpr int kRecurrentWeightsTensor = 2;
@@ -85,15 +95,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, output, output_size_array));
 
+  bool is_hybrid =
+      input->type == kTfLiteFloat32 && (input_weights->type == kTfLiteUInt8 ||
+                                        input_weights->type == kTfLiteInt8);
+
   // Allocate temporary tensors to store quantized values of input and
   // hidden_state tensors.
-  if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
+  if (is_hybrid) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
     TfLiteIntArrayFree(node->temporaries);
     node->temporaries = TfLiteIntArrayCreate(3);
     node->temporaries->data[0] = *scratch_tensor_index;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -103,7 +117,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[1] = *scratch_tensor_index + 1;
     TfLiteTensor* hidden_state_quantized =
         GetTemporary(context, node, /*index=*/1);
-    hidden_state_quantized->type = kTfLiteUInt8;
+    hidden_state_quantized->type = input_weights->type;
     hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(hidden_state_quantized->dims,
                              hidden_state->dims)) {
@@ -165,6 +179,7 @@ TfLiteStatus EvalHybrid(const TfLiteTensor* input,
                         TfLiteTensor* hidden_state_scratch,
                         TfLiteTensor* scaling_factors,
                         TfLiteTensor* hidden_state, TfLiteTensor* output) {
+  const bool is_uint8_hybrid = input_weights->type == kTfLiteUInt8;
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -178,18 +193,17 @@ TfLiteStatus EvalHybrid(const TfLiteTensor* input,
   float* output_ptr_batch = output->data.f;
   // Initialize input_weights, recurrent_weights and bias.
   const int8_t* input_weights_ptr =
-      reinterpret_cast<const int8_t*>(input_weights->data.uint8);
+      GetInt8DataPtr(input_weights, is_uint8_hybrid);
   const int8_t* recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+      GetInt8DataPtr(recurrent_weights, is_uint8_hybrid);
   const float* bias_ptr = bias->data.f;
   // Get the scale of the quantized weights.
   float input_weights_scale = input_weights->params.scale;
   float recurrent_weights_scale = recurrent_weights->params.scale;
   // Initialize temporary storage for quantized values.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_scratch->data.uint8);
+  int8_t* quantized_input_ptr = GetInt8DataPtr(input_scratch, is_uint8_hybrid);
   int8_t* quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+      GetInt8DataPtr(hidden_state_scratch, is_uint8_hybrid);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
   kernel_utils::RnnBatchStep(
@@ -218,7 +232,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteFloat32:
       return EvalFloat(input, input_weights, recurrent_weights, bias, params,
                        hidden_state, output);
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       // TODO(mirkov): implement eval with quantized inputs as well.
       TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
       TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
diff --git a/tensorflow/lite/kernels/basic_rnn_test.cc b/tensorflow/lite/kernels/basic_rnn_test.cc
index 240057d18a176dbb77e4962b48493c1a8d2dddab..9eb20444a6d119ec940a140a66e59961f1451c1c 100644
--- a/tensorflow/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/lite/kernels/basic_rnn_test.cc
@@ -233,15 +233,25 @@ class RNNOpModel : public SingleOpModel {
 // The hybrid model has quantized weights and recurrent_weights.
 class HybridRNNOpModel : public RNNOpModel {
  public:
-  HybridRNNOpModel(int batches, int units, int size)
-      : RNNOpModel(batches, units, size, TensorType_UINT8, TensorType_UINT8) {}
+  HybridRNNOpModel(int batches, int units, int size, TensorType tensor_type)
+      : RNNOpModel(batches, units, size, tensor_type, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
 
-  void SetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
+  TensorType tensor_type_;
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
   }
 
+  void SetWeights(std::initializer_list<float> f) { SetWeights(weights_, f); }
+
   void SetRecurrentWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_weights_, f);
+    SetWeights(recurrent_weights_, f);
   }
 };
 
@@ -272,8 +282,36 @@ TEST(RnnOpTest, BlackBoxTest) {
   }
 }
 
-TEST(HybridRnnOpTest, BlackBoxTest) {
-  HybridRNNOpModel rnn(2, 16, 8);
+TEST(HybridRnnOpTest, BlackBoxTestUint8) {
+  HybridRNNOpModel rnn(2, 16, 8, TensorType_UINT8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
+                                  (rnn.input_size() * rnn.num_batches());
+
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    rnn.SetInput(0, batch_start, batch_end);
+    rnn.SetInput(rnn.input_size(), batch_start, batch_end);
+
+    rnn.Invoke();
+
+    float* golden_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_end = golden_start + rnn.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                     expected, /*max_abs_error=*/0.0104)));
+  }
+}
+
+TEST(HybridRnnOpTest, BlackBoxTestInt8) {
+  HybridRNNOpModel rnn(2, 16, 8, TensorType_INT8);
   rnn.SetWeights(rnn_weights);
   rnn.SetBias(rnn_bias);
   rnn.SetRecurrentWeights(rnn_recurrent_weights);
diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
index a3e06d4c89327050625ac514d41bc29c4f6493f3..f33089559992c1a6a6fa34161122c43b7954fbdb 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
@@ -114,6 +114,7 @@ TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
                                                4, 8, 11, 15, 12, 16}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
   EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
@@ -131,6 +132,7 @@ TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
   m.SetCrops({0, 0, -1, 0});
   EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
 }
+#endif
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
index b0be6d0dbd7fda35e1e167db22212ee5972da5f5..31c6e3f44c8323cee38d196b4cd24031586ad1b0 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
@@ -105,7 +105,10 @@ constexpr int kBwInputActivationStateTensor = 37;
 // Cell state tensors of size {n_batch, n_cell}
 constexpr int kBwInputCellStateTensor = 38;
 
-// Auxiliary input and weights when stacking.
+// Used as auxiliary input and weights when stacking for
+// tf.contrib.rnn.stack_bidirectional_rnn case (with cross links); Used as input
+// to the backward cell when stacking for tf.nn.static_bidirectional_rnn case
+// (without cross links).
 constexpr int kAuxInputTensor = 39;  // Optional
 // Forward weights.
 constexpr int kFwAuxInputToInputWeightsTensor = 40;   // Optional
@@ -459,8 +462,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_to_cell_weights != nullptr) &&
+  const bool aux_inputs_weights_all_or_none =
+      ((fw_aux_input_to_cell_weights != nullptr) &&
        (fw_aux_input_to_forget_weights != nullptr) &&
        (fw_aux_input_to_output_weights != nullptr) &&
        (bw_aux_input_to_cell_weights != nullptr) &&
@@ -472,8 +475,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
        (bw_aux_input_to_cell_weights == nullptr) &&
        (bw_aux_input_to_forget_weights == nullptr) &&
        (bw_aux_input_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
+  TF_LITE_ENSURE(context, aux_inputs_weights_all_or_none);
+
+  const bool has_aux_input = (fw_aux_input_to_forget_weights != nullptr);
 
   if (has_aux_input) {
     // Check that aux_input has the same dimensions (except last) as the input.
@@ -505,7 +509,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                     context->ResizeTensor(context, fw_output, fw_output_size));
 
   // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8);
+  const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8 ||
+                             fw_input_to_output_weights->type == kTfLiteInt8);
 
   TfLiteIntArrayFree(node->temporaries);
   if (is_hybrid_op) {
@@ -603,7 +608,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kInputQuantized;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = fw_input_to_output_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -615,7 +620,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kFwActivationStateQuantized;
     TfLiteTensor* fw_activation_state_quantized =
         GetTemporary(context, node, kFwActivationStateQuantized);
-    fw_activation_state_quantized->type = kTfLiteUInt8;
+    fw_activation_state_quantized->type = fw_input_to_output_weights->type;
     fw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(fw_activation_state_quantized->dims,
                              fw_activation_state->dims)) {
@@ -629,7 +634,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kBwActivationStateQuantized;
     TfLiteTensor* bw_activation_state_quantized =
         GetTemporary(context, node, kBwActivationStateQuantized);
-    bw_activation_state_quantized->type = kTfLiteUInt8;
+    bw_activation_state_quantized->type = fw_input_to_output_weights->type;
     bw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(bw_activation_state_quantized->dims,
                              bw_activation_state->dims)) {
@@ -643,7 +648,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kFwCellStateQuantized;
     TfLiteTensor* fw_cell_state_quantized =
         GetTemporary(context, node, kFwCellStateQuantized);
-    fw_cell_state_quantized->type = kTfLiteUInt8;
+    fw_cell_state_quantized->type = fw_input_to_output_weights->type;
     fw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(fw_cell_state_quantized->dims,
                              fw_cell_state->dims)) {
@@ -657,7 +662,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kBwCellStateQuantized;
     TfLiteTensor* bw_cell_state_quantized =
         GetTemporary(context, node, kBwCellStateQuantized);
-    bw_cell_state_quantized->type = kTfLiteUInt8;
+    bw_cell_state_quantized->type = fw_input_to_output_weights->type;
     bw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(bw_cell_state_quantized->dims,
                              bw_cell_state->dims)) {
@@ -726,7 +731,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
           *scratch_tensor_index + kAuxInputQuantized;
       TfLiteTensor* aux_input_quantized =
           GetTemporary(context, node, kAuxInputQuantized);
-      aux_input_quantized->type = kTfLiteUInt8;
+      aux_input_quantized->type = fw_input_to_output_weights->type;
       aux_input_quantized->allocation_type = kTfLiteArenaRw;
       if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
         TfLiteIntArray* aux_input_quantized_size =
@@ -869,6 +874,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
+  const bool has_previous_bw_output = (aux_input != nullptr);
+  const bool use_aux_input = (fw_aux_input_to_forget_weights != nullptr);
+
   // Populate a TfLiteLSTMParams struct for the evaluation functions.
   TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip,
                                   params->proj_clip, kTfLiteLSTMFullKernel};
@@ -878,6 +886,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto actual_bw_output = params->merge_outputs ? fw_output : bw_output;
 
   const bool time_major = params->time_major;
+
+  // We want to cover the following cases:
+  //
+  // If not stacking (not connected after other bidi lstms):
+  //   both fw & bw will just use `input`; aux_input will be null.
+  //
+  // If stacking with cross_links, TensorFlow equivalent
+  // (tf.contrib.rnn.stack_bidirectional_rnn):
+  //   both fw & bw will use `input`, but aux_input will be none null.
+  //   Note, this time, whether connected after other bidi lstms both works.
+  //
+  // If stacking without cross_links, but connected after other bidi lstms,
+  // TensorFlow equivalent (tf.nn.static_bidirectional_rnn):
+  //   fw will use `input`, bw will use aux_input, and the `real aux_input`
+  //   will be null.
+
+  const bool non_stacking_mode = !use_aux_input && has_previous_bw_output;
+  const TfLiteTensor* bw_input = non_stacking_mode ? aux_input : input;
+  const TfLiteTensor* real_aux_input = non_stacking_mode ? nullptr : aux_input;
+
   switch (fw_input_to_output_weights->type) {
     case kTfLiteFloat32: {
       TfLiteStatus fw_pass_status = lstm_eval::EvalFloat(
@@ -890,7 +918,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
           fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
           fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
@@ -901,7 +929,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = lstm_eval::EvalFloat(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
           bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
@@ -910,7 +938,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
           bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
           bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
@@ -922,7 +950,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_OK(context, bw_pass_status);
       return kTfLiteOk;
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized =
           GetTemporary(context, node, kInputQuantized);
       TfLiteTensor* fw_activation_state_quantized =
@@ -940,9 +969,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* recovered_cell_weights =
           GetTemporary(context, node, kRecoveredCellWeights);
       TfLiteTensor* aux_input_quantized =
-          (aux_input == nullptr)
-              ? nullptr
-              : GetTemporary(context, node, kAuxInputQuantized);
+          use_aux_input ? GetTemporary(context, node, kAuxInputQuantized)
+                        : nullptr;
 
       TfLiteStatus fw_pass_status = lstm_eval::EvalHybrid(
           input, fw_input_to_input_weights, fw_input_to_forget_weights,
@@ -954,7 +982,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
           fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
           fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
@@ -968,7 +996,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = lstm_eval::EvalHybrid(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
           bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
@@ -977,7 +1005,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*input_layer_norm_coefficients=*/nullptr,
           /*forget_layer_norm_coefficients=*/nullptr,
           /*cell_layer_norm_coefficients=*/nullptr,
-          /*output_layer_norm_coefficients=*/nullptr, aux_input,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
           bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
           bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
           bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
index f5df6d15af7912d663f61b9df93d92d4c029e2d5..707f06af8322234c3a09b12168445fe285573fee 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -38,7 +38,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
                            int sequence_length, bool use_cifg,
                            bool use_peephole, bool use_projection_weights,
                            bool use_projection_bias, bool merge_outputs,
-                           float cell_clip, float proj_clip,
+                           bool use_aux_input, float cell_clip, float proj_clip,
                            bool quantize_weights, bool time_major,
                            const std::vector<std::vector<int>>& input_shapes)
       : n_batch_(n_batch),
@@ -185,7 +185,11 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
       bw_output_ = AddOutput(TensorType_FLOAT32);
     }
 
-    aux_input_ = AddNullInput();
+    if (use_aux_input) {
+      aux_input_ = AddInput(TensorType_FLOAT32);
+    } else {
+      aux_input_ = AddNullInput();
+    }
     fw_aux_input_to_input_weights_ = AddNullInput();
     fw_aux_input_to_forget_weights_ = AddNullInput();
     fw_aux_input_to_cell_weights_ = AddNullInput();
@@ -302,6 +306,10 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
+  void SetAuxInput(int offset, float* begin, float* end) {
+    PopulateTensor(aux_input_, offset, begin, end);
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -392,7 +400,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
 // indicating whether to use quantization or not.
 class LSTMOpTest : public ::testing::TestWithParam<bool> {};
 
-INSTANTIATE_TEST_CASE_P(QuantizationOrNot, LSTMOpTest, ::testing::Bool());
+INSTANTIATE_TEST_SUITE_P(QuantizationOrNot, LSTMOpTest, ::testing::Bool());
 
 TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   const int n_batch = 1;
@@ -406,7 +414,8 @@ TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -570,7 +579,8 @@ TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/true, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/true,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -733,7 +743,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -895,7 +906,8 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -1047,7 +1059,8 @@ TEST(LSTMOpTest,
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -1199,7 +1212,8 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
@@ -1903,7 +1917,8 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
       /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/false,
       {
           {n_batch, sequence_length, n_input},  // input tensor
@@ -2590,6 +2605,175 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
   EXPECT_THAT(combined, ElementsAreArray(ArrayFloatNear(expected)));
 }
 
+// Same as the no cifg no peephole no projection no clipping test, but have an
+// aux input (without aux input weights), this is the case when stacking but no
+// cross-links.
+TEST_P(LSTMOpTest, BlackBoxTestWithAuxInput) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+  const bool quantize_weights = GetParam();
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/true, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          // Forward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          // Backward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, n_input},  // aux_input tensor
+          {n_cell, 0},                          // aux_fw_input_to_input tensor
+          {n_cell, 0},                          // aux_fw_input_to_forget tensor
+          {n_cell, 0},                          // aux_fw_input_to_cell tensor
+          {n_cell, 0},                          // aux_fw_input_to_output tensor
+          {n_cell, 0},                          // aux_bw_input_to_input tensor
+          {n_cell, 0},                          // aux_bw_input_to_forget tensor
+          {n_cell, 0},                          // aux_bw_input_to_cell tensor
+          {n_cell, 0},                          // aux_bw_input_to_output tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
+      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+  static float lstm_bw_golden_output[] = {
+      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
+      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+  // Aux input and input are the same, so we should observe the same outputs
+  // as there's no aux input.
+  lstm.SetAuxInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* fw_golden_start = lstm_fw_golden_output;
+  float* fw_golden_end =
+      fw_golden_start + lstm.num_fw_outputs() * lstm.sequence_length();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(fw_expected, quantize_weights ? 1e-2 : 1e-5)));
+
+  float* bw_golden_start = lstm_bw_golden_output;
+  float* bw_golden_end =
+      bw_golden_start + lstm.num_bw_outputs() * lstm.sequence_length();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(bw_expected, quantize_weights ? 1e-2 : 1e-5)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
index 5194c2463092eedd41f634dda8b8db201b03e699..0adf574bb0641b2ddd2774f1563a92a66023f7a2 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
@@ -31,6 +31,18 @@ namespace ops {
 namespace builtin {
 namespace bidirectional_sequence_rnn {
 
+namespace {
+
+int8_t* GetInt8DataPtr(const TfLiteTensor* tensor, const bool is_uint8) {
+  if (is_uint8) {
+    return reinterpret_cast<int8_t*>(tensor->data.uint8);
+  } else {
+    return tensor->data.int8;
+  }
+}
+
+}  // namespace
+
 constexpr int kInputTensor = 0;
 // Forward and backward cell tensors.
 constexpr int kFwWeightsTensor = 1;
@@ -41,7 +53,10 @@ constexpr int kBwWeightsTensor = 5;
 constexpr int kBwRecurrentWeightsTensor = 6;
 constexpr int kBwBiasTensor = 7;
 constexpr int kBwHiddenStateTensor = 8;
-// Auxiliary inputs.
+// Used as auxiliary input and weights when stacking for
+// tf.contrib.rnn.stack_bidirectional_rnn case (with cross links); Used as input
+// to the backward cell when stacking for tf.nn.static_bidirectional_rnn case
+// (without cross links).
 constexpr int kAuxInputTensor = 9;       // Optional.
 constexpr int kFwAuxWeightsTensor = 10;  // Optional.
 constexpr int kBwAuxWeightsTensor = 11;  // Optional.
@@ -101,13 +116,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_weights =
       GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
 
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_weights != nullptr) &&
+  const bool aux_inputs_weights_or_none =
+      ((fw_aux_input_weights != nullptr) &&
        (bw_aux_input_weights != nullptr)) ||
-      ((aux_input == nullptr) && (fw_aux_input_weights == nullptr) &&
-       (bw_aux_input_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
+      ((fw_aux_input_weights == nullptr) && (bw_aux_input_weights == nullptr));
+  TF_LITE_ENSURE(context, aux_inputs_weights_or_none);
+  const bool has_aux_input = (fw_aux_input_weights != nullptr);
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
@@ -154,8 +168,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                       bw_aux_input_weights->dims->data[1]);
   }
 
-  const bool is_hybrid_op =
-      (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
+  const bool is_hybrid_op = ((fw_input_weights->type == kTfLiteUInt8 ||
+                              fw_input_weights->type == kTfLiteInt8) &&
+                             input->type == kTfLiteFloat32);
 
   if (is_hybrid_op) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
@@ -172,7 +187,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kInputQuantized;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = fw_input_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -184,7 +199,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kFwHiddenStateQuantized;
     TfLiteTensor* fw_hidden_state_quantized =
         GetTemporary(context, node, kFwHiddenStateQuantized);
-    fw_hidden_state_quantized->type = kTfLiteUInt8;
+    fw_hidden_state_quantized->type = fw_input_weights->type;
     fw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(fw_hidden_state_quantized->dims,
                              fw_hidden_state->dims)) {
@@ -199,7 +214,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kBwHiddenStateQuantized;
     TfLiteTensor* bw_hidden_state_quantized =
         GetTemporary(context, node, kBwHiddenStateQuantized);
-    bw_hidden_state_quantized->type = kTfLiteUInt8;
+    bw_hidden_state_quantized->type = fw_input_weights->type;
     bw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(bw_hidden_state_quantized->dims,
                              bw_hidden_state->dims)) {
@@ -230,7 +245,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
           *scratch_tensor_index + kAuxInputQuantized;
       TfLiteTensor* aux_input_quantized =
           GetTemporary(context, node, kAuxInputQuantized);
-      aux_input_quantized->type = kTfLiteUInt8;
+      aux_input_quantized->type = fw_input_weights->type;
       aux_input_quantized->allocation_type = kTfLiteArenaRw;
       if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
         TfLiteIntArray* aux_input_quantized_size =
@@ -264,16 +279,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
-    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
-    const TfLiteTensor* bw_input_weights,
-    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
-    const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights,
-    const TfLiteTensor* bw_aux_input_weights,
-    const TfLiteBidirectionalSequenceRNNParams* params,
-    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
-    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
+TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* bw_input,
+                       const TfLiteTensor* fw_input_weights,
+                       const TfLiteTensor* fw_recurrent_weights,
+                       const TfLiteTensor* fw_bias,
+                       const TfLiteTensor* bw_input_weights,
+                       const TfLiteTensor* bw_recurrent_weights,
+                       const TfLiteTensor* bw_bias,
+                       const TfLiteTensor* aux_input,
+                       const TfLiteTensor* fw_aux_input_weights,
+                       const TfLiteTensor* bw_aux_input_weights,
+                       const TfLiteBidirectionalSequenceRNNParams* params,
+                       TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+                       TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
   const bool time_major = params->time_major;
   const int batch_size =
       (time_major) ? input->dims->data[1] : input->dims->data[0];
@@ -326,7 +344,7 @@ TfLiteStatus EvalFloat(
     float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
     for (int s = max_time - 1; s >= 0; s--) {
       const float* input_ptr_batch =
-          input->data.f + s * input_size * batch_size;
+          bw_input->data.f + s * input_size * batch_size;
       const float* aux_input_ptr_batch =
           (aux_input != nullptr)
               ? aux_input->data.f + s * input_size * batch_size
@@ -394,7 +412,8 @@ TfLiteStatus EvalFloat(
 }
 
 TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
+    const TfLiteTensor* input, const TfLiteTensor* bw_input,
+    const TfLiteTensor* fw_input_weights,
     const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
     const TfLiteTensor* bw_input_weights,
     const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
@@ -406,6 +425,7 @@ TfLiteStatus EvalHybrid(
     TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
     TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_hidden_state,
     TfLiteTensor* bw_output) {
+  const bool is_uint8_hybrid = fw_input_weights->type == kTfLiteUInt8;
   const bool time_major = params->time_major;
   const int batch_size =
       (time_major) ? input->dims->data[1] : input->dims->data[0];
@@ -417,19 +437,19 @@ TfLiteStatus EvalHybrid(
   const int fw_num_units = fw_input_weights->dims->data[0];
   const float* fw_bias_ptr = fw_bias->data.f;
   const int8_t* fw_input_weights_ptr =
-      reinterpret_cast<const int8_t*>(fw_input_weights->data.uint8);
+      GetInt8DataPtr(fw_input_weights, is_uint8_hybrid);
   float fw_input_weights_scale = fw_input_weights->params.scale;
   const int8_t* fw_recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(fw_recurrent_weights->data.uint8);
+      GetInt8DataPtr(fw_recurrent_weights, is_uint8_hybrid);
   float fw_recurrent_weights_scale = fw_recurrent_weights->params.scale;
 
   const int bw_num_units = bw_input_weights->dims->data[0];
   const float* bw_bias_ptr = bw_bias->data.f;
   const int8_t* bw_input_weights_ptr =
-      reinterpret_cast<const int8_t*>(bw_input_weights->data.uint8);
+      GetInt8DataPtr(bw_input_weights, is_uint8_hybrid);
   float bw_input_weights_scale = bw_input_weights->params.scale;
   const int8_t* bw_recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(bw_recurrent_weights->data.uint8);
+      GetInt8DataPtr(bw_recurrent_weights, is_uint8_hybrid);
   float bw_recurrent_weights_scale = bw_recurrent_weights->params.scale;
 
   // Set the auxiliary pointers and scales if needed.
@@ -440,21 +460,22 @@ TfLiteStatus EvalHybrid(
   int8_t* aux_quantized_input_ptr = nullptr;
   if (aux_input_size > 0) {
     aux_fw_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_fw_input_weights->data.uint8);
+        GetInt8DataPtr(aux_fw_input_weights, is_uint8_hybrid);
     aux_fw_input_weights_scale = aux_fw_input_weights->params.scale;
     aux_bw_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_bw_input_weights->data.uint8);
+        GetInt8DataPtr(aux_bw_input_weights, is_uint8_hybrid);
     aux_bw_input_weights_scale = aux_bw_input_weights->params.scale;
-    aux_quantized_input_ptr = reinterpret_cast<int8_t*>(aux_input_quantized);
+    aux_quantized_input_ptr =
+        GetInt8DataPtr(aux_input_quantized, is_uint8_hybrid);
   }
 
   // Initialize temporary storage for quantized values.
   int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+      GetInt8DataPtr(input_quantized, is_uint8_hybrid);
   int8_t* fw_quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(fw_hidden_state_quantized->data.uint8);
+      GetInt8DataPtr(fw_hidden_state_quantized, is_uint8_hybrid);
   int8_t* bw_quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(bw_hidden_state_quantized->data.uint8);
+      GetInt8DataPtr(bw_hidden_state_quantized, is_uint8_hybrid);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
   const int fw_output_step =
@@ -489,7 +510,7 @@ TfLiteStatus EvalHybrid(
       float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
       for (int s = max_time - 1; s >= 0; s--) {
         const float* input_ptr_batch =
-            input->data.f + s * input_size * batch_size;
+            bw_input->data.f + s * input_size * batch_size;
         const float* aux_input_ptr_batch =
             (aux_input != nullptr)
                 ? aux_input->data.f + s * input_size * batch_size
@@ -601,14 +622,37 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                 ? nullptr
                                 : GetOutput(context, node, kBwOutputTensor);
 
+  const bool has_previous_bw_output = (aux_input != nullptr);
+  const bool use_aux_input = (fw_aux_input_weights != nullptr);
+
+  // We want to cover the following cases:
+  //
+  // If not stacking (not connected after other bidi lstms):
+  //   both fw & bw will just use `input`; aux_input will be null.
+  //
+  // If stacking with cross_links, TensorFlow equivalent
+  // (tf.contrib.rnn.stack_bidirectional_rnn):
+  //   both fw & bw will use `input`, but aux_input will be none null.
+  //   Note, this time, whether connected after other bidi lstms both works.
+  //
+  // If stacking without cross_links, but connected after other bidi lstms,
+  // TensorFlow equivalent (tf.nn.static_bidirectional_rnn):
+  //   fw will use `input`, bw will use aux_input, and the `real aux_input`
+  //   will be null.
+
+  const bool non_stacking_mode = !use_aux_input && has_previous_bw_output;
+  const TfLiteTensor* bw_input = non_stacking_mode ? aux_input : input;
+  const TfLiteTensor* real_aux_input = non_stacking_mode ? nullptr : aux_input;
+
   switch (fw_input_weights->type) {
     case kTfLiteFloat32:
-      return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                       bw_input_weights, bw_recurrent_weights, bw_bias,
-                       aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                       params, fw_hidden_state, fw_output, bw_hidden_state,
-                       bw_output);
-    case kTfLiteUInt8: {
+      return EvalFloat(input, bw_input, fw_input_weights, fw_recurrent_weights,
+                       fw_bias, bw_input_weights, bw_recurrent_weights, bw_bias,
+                       real_aux_input, fw_aux_input_weights,
+                       bw_aux_input_weights, params, fw_hidden_state, fw_output,
+                       bw_hidden_state, bw_output);
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized =
           GetTemporary(context, node, kInputQuantized);
       TfLiteTensor* fw_hidden_state_quantized =
@@ -618,17 +662,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* scaling_factors =
           GetTemporary(context, node, kScalingFactors);
       TfLiteTensor* aux_input_quantized =
-          (aux_input != nullptr)
-              ? GetTemporary(context, node, kAuxInputQuantized)
-              : nullptr;
-
-      return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                        bw_input_weights, bw_recurrent_weights, bw_bias,
-                        aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                        params, scaling_factors, input_quantized,
-                        aux_input_quantized, fw_hidden_state_quantized,
-                        fw_hidden_state, fw_output, bw_hidden_state_quantized,
-                        bw_hidden_state, bw_output);
+          use_aux_input ? GetTemporary(context, node, kAuxInputQuantized)
+                        : nullptr;
+
+      return EvalHybrid(input, bw_input, fw_input_weights, fw_recurrent_weights,
+                        fw_bias, bw_input_weights, bw_recurrent_weights,
+                        bw_bias, real_aux_input, fw_aux_input_weights,
+                        bw_aux_input_weights, params, scaling_factors,
+                        input_quantized, aux_input_quantized,
+                        fw_hidden_state_quantized, fw_hidden_state, fw_output,
+                        bw_hidden_state_quantized, bw_hidden_state, bw_output);
     }
     default:
       context->ReportError(context, "Type not currently supported.");
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
index 5bad8e02c29608fa058d0d1104acbf09626f1b66..9b61f8238b558042e7a957d09dac162d8ea6450b 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -654,8 +654,8 @@ const std::initializer_list<float> recurrent_weights = {
 class BidirectionalRNNOpModel : public SingleOpModel {
  public:
   BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
-                          int bw_units, int input_size, bool time_major,
-                          bool merge_outputs)
+                          int bw_units, int input_size, bool use_aux_input,
+                          bool time_major, bool merge_outputs)
       : batches_(batches),
         sequence_len_(sequence_len),
         fw_units_(fw_units),
@@ -671,7 +671,13 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     bw_bias_ = AddInput(TensorType_FLOAT32);
     bw_hidden_state_ = AddInput(TensorType_FLOAT32, true);
 
-    aux_input_ = AddNullInput();
+    int aux_input_size = 0;
+    if (use_aux_input) {
+      aux_input_ = AddInput(TensorType_FLOAT32);
+      aux_input_size = input_size_;
+    } else {
+      aux_input_ = AddNullInput();
+    }
     aux_fw_weights_ = AddNullInput();
     aux_bw_weights_ = AddNullInput();
 
@@ -691,18 +697,18 @@ class BidirectionalRNNOpModel : public SingleOpModel {
                      : std::vector<int>({batches_, sequence_len_, input_size_});
 
     BuildInterpreter({
-        input_shape,                   // input
-        {fw_units_, input_size_},      // fw_weights
-        {fw_units_, fw_units_},        // fw_recurrent_weights
-        {fw_units_},                   // fw_bias
-        {batches_, fw_units_},         // fw_hidden_state
-        {bw_units_, input_size_},      // bw_weights
-        {bw_units_, bw_units_},        // bw_recurrent_weights
-        {bw_units_},                   // bw_bias
-        {batches_, bw_units_},         // bw_hidden_state
-        {batches_, sequence_len_, 0},  // aux_input
-        {fw_units_, 0},                // aux_fw_weights
-        {bw_units_, 0},                // aux_bw_weights
+        input_shape,                                // input
+        {fw_units_, input_size_},                   // fw_weights
+        {fw_units_, fw_units_},                     // fw_recurrent_weights
+        {fw_units_},                                // fw_bias
+        {batches_, fw_units_},                      // fw_hidden_state
+        {bw_units_, input_size_},                   // bw_weights
+        {bw_units_, bw_units_},                     // bw_recurrent_weights
+        {bw_units_},                                // bw_bias
+        {batches_, bw_units_},                      // bw_hidden_state
+        {batches_, sequence_len_, aux_input_size},  // aux_input
+        {fw_units_, 0},                             // aux_fw_weights
+        {bw_units_, 0},                             // aux_bw_weights
     });
   }
 
@@ -738,6 +744,10 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
+  void SetAuxInput(int offset, float* begin, float* end) {
+    PopulateTensor(aux_input_, offset, begin, end);
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -775,7 +785,8 @@ class BidirectionalRNNOpModel : public SingleOpModel {
 TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -813,7 +824,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/true,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/true,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -822,7 +834,6 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
   rnn.SetFwRecurrentWeights(recurrent_weights);
   rnn.SetBwRecurrentWeights(recurrent_weights);
 
-  // const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
   // Insert the inputs in time_major format. The batch_major format is:
   // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
   // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
@@ -850,7 +861,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -888,7 +900,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/true,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/true,
                               /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -932,7 +945,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -979,7 +993,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
 TEST(BidirectionalRNNOpTest, EndToEndTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   const int output_size = 4;
   float dnn_weights[] = {
@@ -1046,6 +1061,137 @@ TEST(BidirectionalRNNOpTest, EndToEndTest) {
   }
 }
 
+// Same as BlackBox test, but has aux input.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInput) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as previous test, but has aux input is all zeros.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInputZeros) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Initialize bw inputs with zeros.
+  std::vector<float> bw_inputs(rnn.sequence_len(), 0);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput(2 * i * rnn.input_size(), &bw_inputs[0],
+                    &bw_inputs[bw_inputs.size() - 1]);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), &bw_inputs[0],
+                    &bw_inputs[bw_inputs.size() - 1]);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as previous test, but has input is all zeros, and aux input is the real
+// input. This is testing the bw path is functional.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInputInputZeros) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Initialize bw inputs with zeros.
+  std::vector<float> fw_inputs(rnn.sequence_len(), 0);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetAuxInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput(2 * i * rnn.input_size(), &fw_inputs[0],
+                 &fw_inputs[fw_inputs.size() - 1]);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), &fw_inputs[0],
+                 &fw_inputs[fw_inputs.size() - 1]);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> bw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_bw_start = rnn_golden_bw_output + i * rnn.num_fw_units();
+    float* golden_bw_end = golden_bw_start + rnn.num_fw_units();
+    bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+    bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  }
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/ceil.cc b/tensorflow/lite/kernels/ceil.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bb763255b136f1d5103dd2e72ce6aebf38f06d3
--- /dev/null
+++ b/tensorflow/lite/kernels/ceil.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace ceil {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  output->type = input->type;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  optimized_ops::Ceil(GetTensorShape(input), GetTensorData<float>(input),
+                      GetTensorShape(output), GetTensorData<float>(output));
+
+  return kTfLiteOk;
+}
+}  // namespace ceil
+
+TfLiteRegistration* Register_CEIL() {
+  static TfLiteRegistration r = {/*init=*/nullptr,
+                                 /*free=*/nullptr, ceil::Prepare, ceil::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/ceil_test.cc b/tensorflow/lite/kernels/ceil_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e120105082751a732bb8812944c318ad9e5ecff5
--- /dev/null
+++ b/tensorflow/lite/kernels/ceil_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class CeilOpModel : public SingleOpModel {
+ public:
+  CeilOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_CEIL, BuiltinOptions_NONE, 0);
+    BuildInterpreter({
+        input_shape,
+    });
+  }
+
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(CeilOpTest, SingleDim) {
+  CeilOpModel model({2}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {8.5, 0.0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({9, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(CeilOpTest, MultiDims) {
+  CeilOpModel model({2, 1, 1, 5}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {
+                                                 0.0001,
+                                                 8.0001,
+                                                 0.9999,
+                                                 9.9999,
+                                                 0.5,
+                                                 -0.0001,
+                                                 -8.0001,
+                                                 -0.9999,
+                                                 -9.9999,
+                                                 -0.5,
+                                             });
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 9, 1, 10, 1, 0, -8, 0, -9, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
index 478df3354f56b67db1beb14ae419b8fb74e09a7d..d0350b2fa7f7bad804d4b1348f4d389cb102f68e 100644
--- a/tensorflow/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -1069,7 +1069,7 @@ TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) {
                   0.0474)));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     ConvolutionOpTest, ConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc
index d924e6f700781e4aceef3d8554ed3d88d17ed774..75aed4cc4a96e76f35499d3c26cf0fc25f463160 100644
--- a/tensorflow/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@@ -437,11 +437,11 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
               ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
index bb5f1e74a8b0174209043e14af9c35db32bf14b5..77254335fbde0ff4246af00291ccfba9ec8b0acf 100644
--- a/tensorflow/lite/kernels/dequantize_test.cc
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <cstdint>
+
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
@@ -27,13 +30,7 @@ class DequantizeOpModel : public SingleOpModel {
  public:
   DequantizeOpModel(TensorType type, std::initializer_list<int> shape,
                     float scale, int32_t zero_point) {
-    TensorData input_tensor_data;
-    input_tensor_data.type = type;
-    input_tensor_data.shape = shape;
-    input_tensor_data.min = 0;
-    input_tensor_data.max = 0;
-    input_tensor_data.scale = scale;
-    input_tensor_data.zero_point = zero_point;
+    const TensorData input_tensor_data = {type, shape, 0, 0, scale, zero_point};
     input_ = AddInput(input_tensor_data);
     output_ = AddOutput({TensorType_FLOAT32, shape});
     SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
@@ -58,7 +55,7 @@ TEST(DequantizeOpTest, UINT8) {
   // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8
   DequantizeOpModel m(TensorType_UINT8, {2, 5}, 0.5, 127);
 
-  m.SetInput<uint8>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.SetInput<uint8_t>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
@@ -69,7 +66,7 @@ TEST(DequantizeOpTest, INT8) {
   // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8
   DequantizeOpModel m(TensorType_INT8, {2, 5}, 0.5, -1);
 
-  m.SetInput<int8>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
+  m.SetInput<int8_t>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/detection_postprocess.cc b/tensorflow/lite/kernels/detection_postprocess.cc
index 84e2a0efb27c5e2381d76dba89ddf3445077576c..a0df4a10fa1bb2f5441c9a6bdf1b36b1fe05ada1 100644
--- a/tensorflow/lite/kernels/detection_postprocess.cc
+++ b/tensorflow/lite/kernels/detection_postprocess.cc
@@ -498,8 +498,9 @@ TfLiteStatus NonMaxSuppressionMultiClassRegularHelper(TfLiteContext* context,
     }
     // Perform non-maximal suppression on single class
     std::vector<int> selected;
-    NonMaxSuppressionSingleClassHelper(context, node, op_data, class_scores,
-                                       &selected, num_detections_per_class);
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionSingleClassHelper(
+        context, node, op_data, class_scores, &selected,
+        num_detections_per_class));
     // Add selected indices from non-max suppression of boxes in this class
     int output_index = size_of_sorted_indices;
     for (int selected_index : selected) {
@@ -614,8 +615,8 @@ TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context,
   }
   // Perform non-maximal suppression on max scores
   std::vector<int> selected;
-  NonMaxSuppressionSingleClassHelper(context, node, op_data, max_scores,
-                                     &selected, op_data->max_detections);
+  TF_LITE_ENSURE_STATUS(NonMaxSuppressionSingleClassHelper(
+      context, node, op_data, max_scores, &selected, op_data->max_detections));
   // Allocate output tensors
   int output_box_index = 0;
   for (const auto& selected_index : selected) {
@@ -688,11 +689,11 @@ TfLiteStatus NonMaxSuppressionMultiClass(TfLiteContext* context,
       return kTfLiteError;
   }
   if (op_data->use_regular_non_max_suppression)
-    NonMaxSuppressionMultiClassRegularHelper(context, node, op_data,
-                                             GetTensorData<float>(scores));
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionMultiClassRegularHelper(
+        context, node, op_data, GetTensorData<float>(scores)));
   else
-    NonMaxSuppressionMultiClassFastHelper(context, node, op_data,
-                                          GetTensorData<float>(scores));
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionMultiClassFastHelper(
+        context, node, op_data, GetTensorData<float>(scores)));
 
   return kTfLiteOk;
 }
@@ -710,12 +711,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // This fills in temporary decoded_boxes
   // by transforming input_box_encodings and input_anchors from
   // CenterSizeEncodings to BoxCornerEncoding
-  DecodeCenterSizeBoxes(context, node, op_data);
+  TF_LITE_ENSURE_STATUS(DecodeCenterSizeBoxes(context, node, op_data));
   // This fills in the output tensors
   // by choosing effective set of decoded boxes
   // based on Non Maximal Suppression, i.e. selecting
   // highest scoring non-overlapping boxes.
-  NonMaxSuppressionMultiClass(context, node, op_data);
+  TF_LITE_ENSURE_STATUS(NonMaxSuppressionMultiClass(context, node, op_data));
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc
index bad5975a7c187cc4bdcd65721d397897ff2cf09d..e2a2c4aac9456dfae2e26d75d903c300e382b1d0 100644
--- a/tensorflow/lite/kernels/eigen_support.cc
+++ b/tensorflow/lite/kernels/eigen_support.cc
@@ -39,7 +39,7 @@ void SetEigenNbThreads(int threads) {
 #if defined(EIGEN_HAS_OPENMP)
   // The global Eigen thread count is only used when OpenMP is enabled. As this
   // call causes problems with tsan, make it only when OpenMP is available.
-  Eigen::setNbThreads(context->recommended_num_threads);
+  Eigen::setNbThreads(threads);
 #endif  // defined(EIGEN_HAS_OPENMP)
 }
 
diff --git a/tensorflow/lite/kernels/embedding_lookup.cc b/tensorflow/lite/kernels/embedding_lookup.cc
index fad32607b4980ce5d0e6b6a8540adf3b19529403..3f1d62389f470744d1628cf586d486059b0582fc 100644
--- a/tensorflow/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/lite/kernels/embedding_lookup.cc
@@ -117,7 +117,12 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
       // TODO(alanchiao): refactor scalar multiply into separate function
       // for ease of adding a neon equivalent if ever necessary.
       for (int j = 0; j < col_size; j++) {
-        const int8_t* value_ptr = reinterpret_cast<int8_t*>(value->data.uint8);
+        const int8_t* value_ptr;
+        if (value->type == kTfLiteUInt8) {
+          value_ptr = reinterpret_cast<int8_t*>(value->data.uint8);
+        } else {
+          value_ptr = value->data.int8;
+        }
         output->data.f[j + i * col_size] =
             value_ptr[j + idx * col_size] * scaling_factor;
       }
@@ -135,6 +140,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteFloat32:
       return EvalFloat(context, node, lookup, value, output);
     case kTfLiteUInt8:
+    case kTfLiteInt8:
       return EvalHybrid(context, node, lookup, value, output);
     default:
       context->ReportError(context, "Type not currently supported.");
diff --git a/tensorflow/lite/kernels/embedding_lookup_test.cc b/tensorflow/lite/kernels/embedding_lookup_test.cc
index 8ea98a5f0dcbfbcec826c0b9dee0d28cd0bd2885..2462ff26933ef645769e87ca6b6a1eb8a650b662 100644
--- a/tensorflow/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/lite/kernels/embedding_lookup_test.cc
@@ -28,6 +28,8 @@ License.
 namespace tflite {
 namespace {
 
+float kTestTolerance = 7.41e-03;
+
 using ::testing::ElementsAreArray;
 
 class BaseEmbeddingLookupOpModel : public SingleOpModel {
@@ -76,13 +78,17 @@ class EmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel {
 class HybridEmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel {
  public:
   HybridEmbeddingLookupOpModel(std::initializer_list<int> index_shape,
-                               std::initializer_list<int> weight_shape)
-      : BaseEmbeddingLookupOpModel(index_shape, weight_shape,
-                                   TensorType_UINT8) {}
+                               std::initializer_list<int> weight_shape,
+                               TensorType type)
+      : BaseEmbeddingLookupOpModel(index_shape, weight_shape, type) {}
 
   void SetWeight(std::initializer_list<float> data) {
     SymmetricQuantizeAndPopulate(weight_, data);
   }
+
+  void SetSignedWeight(std::initializer_list<float> data) {
+    SignedSymmetricQuantizeAndPopulate(weight_, data);
+  }
 };
 
 // TODO(ahentz): write more tests that exercise the details of the op, such as
@@ -103,8 +109,8 @@ TEST(EmbeddingLookupOpTest, SimpleTest) {
               })));
 }
 
-TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTest) {
-  HybridEmbeddingLookupOpModel m({3}, {3, 8});
+TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTestUint8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 8}, TensorType_UINT8);
   m.SetInput({1, 0, 2});
   m.SetWeight({
       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -121,11 +127,11 @@ TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTest) {
                       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
                       2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
                   },
-                  7.41e-03)));
+                  kTestTolerance)));
 }
 
-TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTest) {
-  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4});
+TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTestUint8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4}, TensorType_UINT8);
   m.SetInput({1, 0, 2});
   m.SetWeight({
       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -142,11 +148,11 @@ TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTest) {
                       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
                       2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
                   },
-                  7.41e-03)));
+                  kTestTolerance)));
 }
 
-TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTest) {
-  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2});
+TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTestUint8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2}, TensorType_UINT8);
   m.SetInput({1, 0, 2});
   m.SetWeight({
       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -163,7 +169,70 @@ TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTest) {
                       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
                       2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
                   },
-                  7.41e-03)));
+                  kTestTolerance)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTestInt8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 8}, TensorType_INT8);
+  m.SetInput({1, 0, 2});
+  m.SetSignedWeight({
+      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  kTestTolerance)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTestInt8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4}, TensorType_INT8);
+  m.SetInput({1, 0, 2});
+  m.SetSignedWeight({
+      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  kTestTolerance)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTestInt8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2}, TensorType_INT8);
+  m.SetInput({1, 0, 2});
+  m.SetSignedWeight({
+      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  kTestTolerance)));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/floor.cc b/tensorflow/lite/kernels/floor.cc
index aa117e3cacfc4624d347ba812e23801c223bae7b..b6ccce3b938ed7b7a540b872daaea1459ca59e85 100644
--- a/tensorflow/lite/kernels/floor.cc
+++ b/tensorflow/lite/kernels/floor.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 
@@ -26,6 +27,11 @@ namespace floor {
 constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
@@ -37,20 +43,34 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, output, output_size);
 }
 
+template <KernelType type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  optimized_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
-                       GetTensorShape(output), GetTensorData<float>(output));
+  if (type == kGenericOptimized) {
+    optimized_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+  } else {
+    reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+  }
 
   return kTfLiteOk;
 }
 }  // namespace floor
 
+TfLiteRegistration* Register_FLOOR_REF() {
+  static TfLiteRegistration r = {/*init=*/nullptr,
+                                 /*free=*/nullptr, floor::Prepare,
+                                 floor::Eval<floor::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_FLOOR() {
   static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr, floor::Prepare, floor::Eval};
+                                 /*free=*/nullptr, floor::Prepare,
+                                 floor::Eval<floor::kGenericOptimized>};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc
index d1d29fc7e6c7d7ba4162ec0afc321b09350212a5..03f4ea71430f5d578288d913e8ba1d0222467882 100644
--- a/tensorflow/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/kernels/fully_connected_test.cc
@@ -725,11 +725,11 @@ TEST_P(QuantizedFullyConnectedOpTest,
               ElementsAre(175, 177, 179, 243, 245, 247));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     FloatFullyConnectedOpTest, FloatFullyConnectedOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     QuantizedFullyConnectedOpTest, QuantizedFullyConnectedOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapNoPie)));
 
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 69816583f5020843aeff76890f51c6c306f11a4f..3b64bcce834436f5a93ed49bd318b69817d4f06d 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -1,12 +1,13 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "transitive_hdrs")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
 package(default_visibility = [
     "//visibility:public",
 ])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-
 tflite_deps_intel = [
     "@arm_neon_2_x86_sse",
 ]
@@ -45,7 +46,6 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -59,7 +59,6 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -253,9 +252,6 @@ cc_library(
 cc_test(
     name = "tensor_test",
     srcs = ["tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":tensor",
         "@com_google_googletest//:gtest",
@@ -285,9 +281,6 @@ cc_library(
 cc_test(
     name = "quantization_util_test",
     srcs = ["quantization_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":quantization_util",
         "@com_google_googletest//:gtest",
@@ -314,6 +307,8 @@ cc_library(
         "reference/depthwiseconv_uint8.h",
         "reference/fully_connected.h",
         "reference/integer_ops/dequantize.h",
+        "reference/integer_ops/pooling.h",
+        "reference/integer_ops/softmax.h",
         "reference/reference_ops.h",
         "reference/softmax.h",
     ],
@@ -410,6 +405,7 @@ cc_library(
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:activation_functor",
         "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels/internal:types",
     ],
 )
 
@@ -543,16 +539,20 @@ cc_library(
     name = "test_util",
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     deps = [
         ":types",
-        "//tensorflow/lite:string",
     ],
 )
 
-cc_test(
+# TODO(b/122597976): Eliminate TF dependency from lite/kernels:test_util,
+# in turn eliminating the need to use tf_cc_test for any dependent tests.
+tf_cc_test(
     name = "tensor_utils_test",
     srcs = ["tensor_utils_test.cc"],
-    copts = NEON_FLAGS_IF_APPLICABLE,
     linkopts = select({
         "//tensorflow:android": [
             "-fPIE -pie",
@@ -560,9 +560,6 @@ cc_test(
         "//conditions:default": [],
     }),
     linkstatic = 1,
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":tensor_utils",
         "//tensorflow/lite/c:c_api_internal",
@@ -587,9 +584,6 @@ cc_test(
     name = "depthwiseconv_quantized_test",
     srcs = ["depthwiseconv_quantized_test.cc"],
     shard_count = 2,
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -602,9 +596,6 @@ cc_test(
 cc_test(
     name = "resize_bilinear_test",
     srcs = ["resize_bilinear_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -617,9 +608,6 @@ cc_test(
 cc_test(
     name = "resize_nearest_neighbor_test",
     srcs = ["resize_nearest_neighbor_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -635,6 +623,7 @@ cc_test(
     srcs = [
         "softmax_quantized_test.cc",
     ],
+    shard_count = 3,
     deps = [
         ":optimized_base",
         ":quantization_util",
@@ -651,7 +640,10 @@ cc_test(
     srcs = [
         "logsoftmax_quantized_test.cc",
     ],
+    shard_count = 3,
     tags = [
+        # TODO(b/122242739): Reenable after fixing the flakiness?
+        "nomac",
         "tflite_not_portable",
     ],
     deps = [
@@ -667,6 +659,10 @@ cc_test(
 cc_test(
     name = "log_quantized_test",
     srcs = ["log_quantized_test.cc"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -702,4 +698,78 @@ cc_test(
 
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
 
+filegroup(
+    name = "optimized_op_headers",
+    srcs = glob([
+        "optimized/*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+filegroup(
+    name = "reference_op_headers",
+    srcs = glob([
+        "reference/*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+filegroup(
+    name = "headers",
+    srcs = glob([
+        "*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+transitive_hdrs(
+    name = "nnapi_external_headers",
+    visibility = ["//tensorflow/lite:__subpackages__"],
+    deps = [
+        "//third_party/eigen3",
+        "@gemmlowp",
+    ],
+)
+
+# ---------------------------------------------------------
+# The public target "install_nnapi_extra_headers" is only
+# used for external targets that requires exporting optmized
+# and reference op headers.
+
+genrule(
+    name = "install_nnapi_extra_headers",
+    srcs = [
+        ":nnapi_external_headers",
+        ":headers",
+        ":optimized_op_headers",
+        ":reference_op_headers",
+    ],
+    outs = ["include"],
+    cmd = """
+    mkdir $@
+    for f in $(SRCS); do
+      d="$${f%/*}"
+      d="$${d#bazel-out*genfiles/}"
+      d="$${d#*external/eigen_archive/}"
+
+      if [[ $${d} == *local_config_* ]]; then
+        continue
+      fi
+
+      if [[ $${d} == external* ]]; then
+        extname="$${d#*external/}"
+        extname="$${extname%%/*}"
+        if [[ $${TF_SYSTEM_LIBS:-} == *$${extname}* ]]; then
+          continue
+        fi
+      fi
+
+      mkdir -p "$@/$${d}"
+      cp "$${f}" "$@/$${d}/"
+    done
+    """,
+    tags = ["manual"],
+    visibility = ["//visibility:private"],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index fdb72037f84e4cea9018516ef70eb8c8fa039082..bc30ac91220906588f204d6ff21c275faa2b6c25 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -131,6 +131,23 @@ int CountLeadingZeros(T integer_input) {
 #endif
 }
 
+inline int32 GetReciprocal(int32 x, int x_integer_digits,
+                           int* num_bits_over_unit) {
+  int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
+  // This is the number of bits to the left of the binary point above 1.0.
+  // Consider x=1.25.  In that case shifted_scale=0.8 and
+  // no later adjustment will be needed.
+  *num_bits_over_unit = x_integer_digits - headroom_plus_one;
+  const int32 shifted_sum_minus_one =
+      static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) -
+                         (static_cast<uint32>(1) << 31));
+
+  gemmlowp::FixedPoint<int32, 0> shifted_scale =
+      gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+          gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one));
+  return shifted_scale.raw();
+}
+
 // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
 // BROADCASTING.
 //
diff --git a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
index 889a726f3a915fb592511d34c036b9726542fee9..945300dad1653257db69c3440f6db0589e0c1a7b 100644
--- a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -225,7 +225,7 @@ bool TryOneSkyscraperLogSoftmax(bool small_depth) {
 }
 
 TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneUniformLogSoftmax()) {
     }
@@ -233,7 +233,7 @@ TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
 }
 
 TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(false)) {
     }
@@ -241,7 +241,7 @@ TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
 }
 
 TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(true)) {
     }
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index d3dca799a7cca4a3048cd2d19477ba2b57fbcdac..0f4226dd158ab14d29c4e8bfba61fb0bef0cf340 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1983,6 +1983,7 @@ inline void DepthwiseConv(
           input_shape, filter_shape, stride_width, stride_height,
           dilation_width_factor, dilation_height_factor, pad_width, pad_height,
           depth_multiplier, output_shape, output_shift)) {
+    gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
     DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
                            filter_data, bias_shape, bias_data, output_shape,
                            output_data);
@@ -1990,6 +1991,8 @@ inline void DepthwiseConv(
   }
 #endif
 
+  gemmlowp::ScopedProfilingLabel specialized_label(
+      "DepthwiseConv/8bit/General");
   DepthwiseConvGeneral(params, input_shape, input_data, filter_shape,
                        filter_data, bias_shape, bias_data, output_shape,
                        output_data);
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index cf40ebb241d013a4853854f57fd55ebbce8a1752..a69a547cb9f15268d60919f4b4cb718e832d08bd 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -90,20 +90,28 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
     int n_batch, float* __restrict__ result, int result_stride) {
   const int kWeightsPerUint32 = 4;
   const int kWeightsPerNeonLane = 16;
-  // If the number of rows is not divisible by kWeightsPerUint32, we set a
-  // flag and allocate an aligned memory block. The flag is used to use the
-  // aligned memory block later in the kernel loop.
+  // Assuming *matrix is kWeightsPerUint32-byte aligned,
+  // every row of the matrix is also
+  // kWeightsPerUint32-byte aligned as long as cols is
+  // a multiple of kWeightsPerUint32. The assumption
+  // is currently satisfied by TFLite's 16-byte memory
+  // alignment scheme.
+  //
+  // Otherwise, we allocate an aligned memory block and set
+  // a flag to later copy rows from matrix to the block
+  // for aligned multiplication.
   bool unaligned = false;
-  int8* aligned_row = nullptr;
+  int8_t* aligned_row = nullptr;
   void* aligned_row_free = nullptr;
   if ((m_cols & (kWeightsPerUint32 - 1)) != 0) {
     unaligned = true;
-    aligned_row = (int8*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
-                                       &aligned_row_free);
+    aligned_row = (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                                         &aligned_row_free);
   }
   void* aligned_vec_free = nullptr;
-  int8* aligned_vec = (int8*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
-                                           &aligned_vec_free);
+  int8_t* aligned_vec =
+      (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                             &aligned_vec_free);
 
   // If m_cols is not at least kWeightsPerNeonLane, we cannot use the main
   // vectorized loop, and we need to process sequentially. postamble_start shows
@@ -114,13 +122,13 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
   for (batch = 0; batch < n_batch; ++batch) {
     const float batch_scaling_factor = scaling_factors[batch];
     // Copy the vector data to an aligned vector.
-    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
+    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
     // Compute dot-product for every column.
     for (row = 0; row < m_rows; ++row, result += result_stride) {
       // Get the address of the first element of the row.
-      int8* row_ptr = (int8*)matrix + row * m_cols;  // NOLINT
+      int8_t* row_ptr = (int8_t*)matrix + row * m_cols;  // NOLINT
       if (unaligned) {
-        memcpy(aligned_row, row_ptr, sizeof(int8) * m_cols);
+        memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
         row_ptr = aligned_row;
       }
 
@@ -135,16 +143,17 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
       col = 0;
       for (; col < postamble_start; col += kWeightsPerNeonLane) {
         // Load 16 8-bit values from the row and vector, each, to operate on.
-        // Here the assumption is that each buffer is 4-byte aligned.
-        TFLITE_CHECK_EQ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1),
-                        0);
+        // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
+        // performance may suffer significantly.
+        TFLITE_DCHECK_EQ(  // NOLINT
+            (uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0);
         const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col));
         const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col));
         // Multiply the low bits (i.e. the lower 8 8bit numbers in the
         // registers).
         int16x8_t prod_16x8 =
             vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
-        // Multiply the high bits (i.e. the lower 8 8bit numbers in the
+        // Multiply the high bits (i.e. the higher 8 8bit numbers in the
         // registers), and accumulate with the result of the low bits product.
         // The assumption here is that overflow will not happen as we quantize
         // our values to be in the range [-127, 127]. As such the sum of the 2
@@ -164,8 +173,9 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
         if ((m_cols - postamble_start) >= (kWeightsPerNeonLane >> 1)) {
           // Load 8 8-bit values from the row and column each to operate on.
           // Here the assumption is that each buffer is 4-bytes aligned.
-          TFLITE_CHECK_EQ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1),
-                          0);
+          // Otherwise, performance may suffer significantly.
+          TFLITE_DCHECK_EQ(  // NOLINT
+              (uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0);
           const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col));
           const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col));
           const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
@@ -192,6 +202,118 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
   free(aligned_vec_free);
 }
 
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  const int kBlockSize = 16;
+  const int kNeonLanesPerBlock = 4;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int r = 0; r < m_rows; r++) {
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+        const float* vector_in_batch = vector + b * m_cols;
+
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+
+          for (int c = 0; c < kNeonLanesPerBlock; c++) {
+            // Load 4 float values from the vector and matrix row.
+            float32x4_t vector_f32x4 = vld1q_f32(vector_block_in_batch_ptr +
+                                                 c * kFloatWeightsPerNeonLane);
+            float32x4_t matrix_f32x4 =
+                vld1q_f32(matrix_ptr + c * kFloatWeightsPerNeonLane);
+            // Multiply the vector and matrix row and add to accumulator.
+            acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+          }
+          matrix_ptr += kBlockSize;
+        }
+        *result_in_batch +=
+            (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+             vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+      }
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  const int kWeightsPerUint32 = 4;
+  const int kWeightsPerNeonLane = 16;
+  const int kBlockSize = kWeightsPerNeonLane;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  void* aligned_vec_free = nullptr;
+  int8_t* aligned_vec =
+      (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                             &aligned_vec_free);
+
+  int batch, row;
+  for (batch = 0; batch < n_batch; ++batch) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Copy the vector data to an aligned vector.
+    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
+
+    const uint8_t* ledger_ptr = ledger;
+    const int8_t* row_ptr = matrix;
+    for (row = 0; row < m_rows; ++row, result += result_stride) {
+      // Initialize the dot product sum for the row to 0.
+      int32x4_t dotprod = vmovq_n_s32(0);
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        // Prefetch the row to cache.
+        __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                           3 /* temporal locality */);
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int col_index = *ledger_ptr++ * kBlockSize;
+          // Load 16 8-bit values from the row and vector, each, to operate on.
+          // Here the assumption is that each buffer is 4-byte aligned.
+          // Otherwise, performance may suffer significantly.
+          TFLITE_DCHECK_EQ(  // NOLINT
+              (uintptr_t)(&row_ptr) & (kWeightsPerUint32 - 1), 0);
+          const int8x16_t s1_8x16 =
+              vld1q_s8((const int8_t*)(aligned_vec + col_index));
+          const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr));
+          // Multiply the low bits (i.e. the lower 8 8bit numbers in the
+          // registers).
+          int16x8_t prod_16x8 =
+              vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
+          // Multiply the high bits (i.e. the lower 8 8bit numbers in the
+          // registers), and accumulate with the result of the low bits product.
+          // The assumption here is that overflow will not happen as we quantize
+          // our values to be in the range [-127, 127]. As such the sum of the 2
+          // products is always strictly smaller than 15-bits (32767 in absolute
+          // value).
+          prod_16x8 =
+              vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+          dotprod = vpadalq_s16(dotprod, prod_16x8);
+          row_ptr += kBlockSize;
+        }
+        // Add the 4 intermediate sum values to get the final dot-prod value for
+        // this row.
+        int64x2_t pairwiseAdded = vpaddlq_s32(dotprod);
+        int32 neon_sum =
+            vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
+        *result += neon_sum * batch_scaling_factor;
+      }
+    }  // for row
+  }    // for batch
+  free(aligned_vec_free);
+}
+
 void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                   int v_size, float* result) {
   // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
index 903f4c80139cd326b354ef6292a393c75af11608..a86457dba745dbe94ce3e1dc718012545f258804 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -40,6 +40,24 @@ void MatrixBatchVectorMultiplyAccumulate(
                    vectors, scaling_factors, n_batch, result, result_stride);
 }
 
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const float* vector, int n_batch, float* result,
+    int result_stride) {
+  NeonSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vector, n_batch, result, result_stride);
+}
+
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  NeonSparseMatrixBatchVectorMultiplyAccumulate(matrix, ledger, m_rows, m_cols,
+                                                vectors, scaling_factors,
+                                                n_batch, result, result_stride);
+}
+
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index bf3902ec31f98a6a1b388d10689b6167742b7bb9..744f5cc20bb12be758956eda57952c73c7451ec0 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -1906,7 +1906,20 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   MatrixRef matrix_c(c, m, n);
   ConstMatrixRef matrix_a(a, m, k);
   ConstMatrixRef matrix_b(b, n, k);
-  matrix_c.noalias() = matrix_a * matrix_b.transpose();
+
+  // The following special casing for when a or b is a vector is required
+  // as Eigen seem to fail to make this optimization on its own.
+  if (n == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    matrix_c.col(0).noalias() = matrix_a * matrix_b.row(0).transpose();
+  } else if (m == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    matrix_c.row(0).noalias() = matrix_a.row(0) * matrix_b.transpose();
+  } else {
+    gemmlowp::ScopedProfilingLabel label("GEMM");
+    matrix_c.noalias() = matrix_a * matrix_b.transpose();
+  }
+
 #endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
 
   optimized_ops::AddBiasAndEvalActivationFunction(
@@ -5039,6 +5052,14 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   output_map.array() = Eigen::floor(input_map.array());
 }
 
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Ceil");
+  auto input_map = MapAsVector(input_data, input_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  output_map.array() = Eigen::ceil(input_map.array());
+}
+
 #ifdef USE_NEON
 inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
                                  float scale, float* output_ptr) {
@@ -5374,9 +5395,6 @@ inline void ResizeBilinearGenericSmallChannel(
     int32 output_height, int32 output_width, float height_scale,
     float width_scale, const RuntimeShape& input_shape, const T* input_data,
     const RuntimeShape& output_shape, T* output_data) {
-  memset(output_data, 0,
-         batches * output_height * output_width * depth * sizeof(T));
-
   T* output_ptr = &output_data[0];
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
@@ -5385,7 +5403,7 @@ inline void ResizeBilinearGenericSmallChannel(
       int32 y1 = std::min(y0 + 1, input_height - 1);
       for (int x = 0; x < output_width; ++x) {
         float input_x = x * width_scale;
-        int32 x0 = static_cast<int32>(input_x);
+        int32 x0 = static_cast<int32>(std::floor((input_x)));
         int32 x1 = std::min(x0 + 1, input_width - 1);
 
         int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0),
@@ -6069,7 +6087,27 @@ inline void TransposeConv(
     const float* filter_data, const RuntimeShape& output_shape,
     float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
   gemmlowp::ScopedProfilingLabel label("TransposeConv");
-
+  // The complexity of the reference implementation is input.flat_size() *
+  // filter.flat_size() / in_channel.
+  //
+  // While the complexity of im2col->gemm
+  // implmentation is batch * output_height * output_width *
+  // (filter.flat_size() / out_channel)^2 * out_channel.
+  //
+  // so if input.flat_size() * out_channel^2 is much smaller than
+  // output.flat_size() * filter.size() * in_channel we should fall back to the
+  // reference implementation.
+  //
+  // TODO(b/122331966): optimize the intuitive implementation.
+  const int out_channel = output_shape.Dims(3);
+  const int in_channel = input_shape.Dims(3);
+  if ((input_shape.FlatSize() * out_channel * out_channel * 4) <
+      (filter_shape.FlatSize() * output_shape.FlatSize() * in_channel)) {
+    reference_ops::TransposeConv(params, input_shape, input_data, filter_shape,
+                                 filter_data, output_shape, output_data,
+                                 im2col_shape, im2col_data);
+    return;
+  }
   // Note we could use transposed weights with forward conv for unstrided
   // cases. But we are already getting good performance with this code as-is.
   TFLITE_DCHECK(im2col_data);
diff --git a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
index 8f52ef131dedf4d0270c0346b1094add57f52dfc..00b2d7e063254e2941fd3453f15dbaf2dbd4451e 100644
--- a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -54,6 +54,25 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
diff --git a/tensorflow/lite/kernels/internal/quantization_util.cc b/tensorflow/lite/kernels/internal/quantization_util.cc
index 0279d2a9229e02721c01d15d380db1919b7bfd23..71eef71372c0afd17c0dd3e416648dd20e983ba3 100644
--- a/tensorflow/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/lite/kernels/internal/quantization_util.cc
@@ -366,4 +366,13 @@ bool CheckedLog2(const float x, int* log2_result) {
   return std::abs(x_log2_fracpart) < 1e-3;
 }
 
+void QuantizeMultiplierArray(const double* effective_scales, size_t size,
+                             int32_t* effective_scale_significand,
+                             int* effective_shift) {
+  for (size_t i = 0; i < size; ++i) {
+    QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
+                       &effective_shift[i]);
+  }
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/quantization_util.h b/tensorflow/lite/kernels/internal/quantization_util.h
index bf313f39cd8b407f6fb57dcbdf0540e98d96b7e8..5d67c0d0277b84f5c1a74871d9acdd652beef83b 100644
--- a/tensorflow/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/lite/kernels/internal/quantization_util.h
@@ -275,6 +275,17 @@ void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
 // returns false.
 bool CheckedLog2(const float x, int* log2_result);
 
+// Decomposes an array of double multipliers into a Q0.31 int32 representation
+// of its significand, and shift representation of its exponent.
+//
+// Handles an arbitrary multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplierArray(const double* effective_scales, size_t size,
+                             int32_t* effective_scale_significand,
+                             int* effective_shift);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/lite/kernels/internal/quantization_util_test.cc b/tensorflow/lite/kernels/internal/quantization_util_test.cc
index 2f8f7713795bf0e736fe85fcb582744974654b9e..ca4ff370ad4dff4bc6c58a074ce96a8a52029d9e 100644
--- a/tensorflow/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/lite/kernels/internal/quantization_util_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 namespace tflite {
 namespace {
 
+using ::testing::ElementsAreArray;
 using ::testing::Pair;
 
 template <class FloatIn, class IntOut>
@@ -406,6 +407,52 @@ TEST(QuantizationUtilTest, CalculateInputRadius) {
   EXPECT_EQ(CalculateInputRadius(4, 2), 503316480);
 }
 
+TEST(QuantizationUtilTest, QuantizeMultiplierArray) {
+  const std::vector<double> weights = {-4,    -2,   -1,  -0.5, -0.25, -0.125, 0,
+                                       0.125, 0.25, 0.5, 1,    2,     4};
+  const int size = weights.size();
+  std::vector<int32> effective_scale_significand(size);
+  std::vector<int> effective_scale_shift(size);
+  QuantizeMultiplierArray(weights.data(), size,
+                          effective_scale_significand.data(),
+                          effective_scale_shift.data());
+  const std::vector<int32> expected_effective_scale_significand = {
+      -1073741824,  // float scale = -4
+      -1073741824,  // float scale = -2
+      -1073741824,  // float scale = -1
+      -1073741824,  // float scale = -0.5
+      -1073741824,  // float scale = -0.25
+      -1073741824,  // float scale = -0.125
+      0,            // float scale = 0
+      1073741824,   // float scale = 0.125
+      1073741824,   // float scale = 0.25
+      1073741824,   // float scale = 0.5
+      1073741824,   // float scale = 1
+      1073741824,   // float scale = 2
+      1073741824,   // float scale = 4
+  };
+
+  const std::vector<int> expected_effective_scale_shift = {
+      3,   // float scale = -4
+      2,   // float scale = -2
+      1,   // float scale = -1
+      0,   // float scale = -0.5
+      -1,  // float scale = -0.25
+      -2,  // float scale = -0.125
+      0,   // float scale = 0
+      -2,  // float scale = 0.125
+      -1,  // float scale = 0.25
+      0,   // float scale = 0.5
+      1,   // float scale = 1
+      2,   // float scale = 2
+      3,   // float scale = 4
+  };
+  EXPECT_THAT(effective_scale_significand,
+              ElementsAreArray(expected_effective_scale_significand));
+  EXPECT_THAT(effective_scale_shift,
+              ElementsAreArray(expected_effective_scale_shift));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..22750bc91a856b360459fbf9b5ed0519e4ac6c88
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape, const int8* input_data,
+                        const RuntimeShape& output_shape, int8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int32 acc = 0;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              acc +=
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              filter_count++;
+            }
+          }
+          // Round to the closest integer value.
+          acc = acc > 0 ? (acc + filter_count / 2) / filter_count
+                        : (acc - filter_count / 2) / filter_count;
+          acc = std::max(acc, params.quantized_activation_min);
+          acc = std::min(acc, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8>(acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h b/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f6bf1cb73e40b2bc396a59f5b47cefaea071d02
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_SOFTMAX_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Quantized softmax with int8 input and output.
+inline void Softmax(const SoftmaxParams& params,
+                    const RuntimeShape& input_shape, const int8* input_data,
+                    const RuntimeShape& output_shape, int8* output_data) {
+  const int32 input_beta_multiplier = params.input_multiplier;
+  const int32 input_beta_left_shift = params.input_left_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    int8 max_in_row = -128;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int num_bits_over_unit;
+    FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
+        sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        const int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+        const int32 shifted_output = unsat_output - 128;
+
+        output_data[i * depth + c] = static_cast<int8>(
+            std::max(std::min(shifted_output, static_cast<int32>(127)),
+                     static_cast<int32>(-128)));
+
+      } else {
+        output_data[i * depth + c] = -128;
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_SOFTMAX_H_
diff --git a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
index 380fc8f98ebbdd90bb68144a46903640734bff08..390bf08e30300625471f8fe0bfceac21fc43756d 100644
--- a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -2033,7 +2034,16 @@ template <typename T1, typename T2, typename T3>
 void ArgMax(const T3* axis, const T1* input_data,
             const tflite::Dims<4>& input_dims, T2* output_data,
             const tflite::Dims<4>& output_dims) {
-  ArgMinMax(DimsToShape(input_dims), input_data, axis, DimsToShape(output_dims),
+  // Assumes the input always has 4 dimensions, and therefore,
+  // output always has three dimensions.
+  auto output_shape = RuntimeShape(
+      {output_dims.sizes[2], output_dims.sizes[1], output_dims.sizes[0]});
+  // Another way to interpret this is that output_dims.sizes[4] is always 1.
+  TFLITE_DCHECK_EQ(output_shape.FlatSize(),
+                   DimsToShape(output_dims).FlatSize());
+  // Legacy path only supported this.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+  ArgMinMax(DimsToShape(input_dims), input_data, axis, output_shape,
             output_data, std::greater<T1>());
 }
 
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index d692063a968dab654eaf46b9956ddcd338b64410..f5c4b78dc1429f45e477ecc9528e976aeda2ab1f 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 
@@ -101,7 +102,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                          3 /* temporal locality */);
 #endif
-      // For every block of 16 8-bit elements (128-bit register) from each row.
       for (col = 0; col < m_cols; ++col, ++row_ptr) {
         dotprod += (*row_ptr) * (vectors[col]);
       }  // for col
@@ -110,6 +110,73 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
   }    // for batch
 }
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int r = 0; r < m_rows; r++) {
+      float dot_prod = 0.0f;
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        const float* vector_in_batch = vector + b * m_cols;
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
+          }
+        }
+      }
+      *result_in_batch += dot_prod;
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  int batch, row;
+  for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (row = 0; row < m_rows; ++row, result += result_stride) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const int8_t* vector_block_ptr = vectors + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dotprod += (*row_ptr++) * (*vector_block_ptr++);
+          }  // for block
+        }
+      }
+      *result += (dotprod * batch_scaling_factor);
+    }  // for row
+  }    // for batch
+}
+
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
                                       float* result) {
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index a06ebc1600d4fe47cf054b4e157bc21a5f70ddfc..49b59da0bbaf7aec6ba1b66b499df8d5426f5951 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -48,6 +48,16 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
@@ -165,6 +175,23 @@ void MatrixBatchVectorMultiplyAccumulate(
                                               result_stride);
 }
 
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vector, n_batch, result, result_stride);
+}
+
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
+      result_stride);
+}
+
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index b6a8f3859f0aa36184304ee9d3af32d9f77d6f57..125fa9567495e5e2091d7ff66e7725ae8f8ac9db 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -3122,6 +3122,16 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::ceil(input_data[offset]);
+  }
+}
+
 template <typename T, typename CoordsT = int32>
 inline void Gather(const tflite::GatherParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
@@ -3950,11 +3960,8 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
                const T3* input2_data, const RuntimeShape& output_shape,
                T2* output_data, const Cmp& cmp) {
   gemmlowp::ScopedProfilingLabel label("ArgMinMax");
-  // For ArgMax, the number of output dimensions = (number of input dimensions -
-  // 1). For the sake of simplicity, the output dimensions are equal to the
-  // input dimensions here. We enforce the constraint that the axis dimension
-  // must always be 1.
-  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount(),
+  TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
+  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
                    output_shape.DimensionsCount());
 
   int axis = input2_data[0];
@@ -3963,7 +3970,6 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
   }
 
   const int axis_size = input1_shape.Dims(axis);
-  TFLITE_DCHECK_EQ(output_shape.Dims(axis), 1);
 
   int outer_size = 1;
   for (int i = 0; i < axis; ++i) {
@@ -3974,7 +3980,7 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
   int inner_size = 1;
   const int dims_count = input1_shape.DimensionsCount();
   for (int i = axis + 1; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
     inner_size *= input1_shape.Dims(i);
   }
 
@@ -4714,6 +4720,33 @@ void Fill(const RuntimeShape& value_shape, const T* value_data,
   }
 }
 
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape& input_shape,
+             const Scalar* input_data, const RuntimeShape& output_shape,
+             Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Reverse");
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int copy_size = 1;
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_axis = input_shape.Dims(axis);
+  for (int i = 0; i < outer_size; ++i) {
+    for (int j = 0; j < dims_at_axis; ++j) {
+      const int start_pos = (i * dims_at_axis + j) * copy_size;
+      Scalar* output_ptr = output_data + start_pos;
+      int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+      memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h
index 51de6b51aa5308b69dd5b9ad6bf29cd18c0550ba..45a18cdb47f64b4a8f5f0c7cd53cb9b13956b151 100644
--- a/tensorflow/lite/kernels/internal/reference/softmax.h
+++ b/tensorflow/lite/kernels/internal/reference/softmax.h
@@ -102,19 +102,9 @@ inline void Softmax(const SoftmaxParams& params,
       }
     }
 
-    int32 fixed_sum_of_exps = sum_of_exps.raw();
-    int headroom_plus_one =
-        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-    // This is the number of bits to the left of the binary point above 1.0.
-    // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
-    // no later adjustment will be needed.
-    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-    int32 shifted_sum_minus_one = static_cast<int32>(
-        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-        (static_cast<uint32>(1) << 31));
-
-    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-        FixedPoint0::FromRaw(shifted_sum_minus_one));
+    int num_bits_over_unit;
+    FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
+        sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
 
     for (int c = 0; c < depth; ++c) {
       int32 input_diff =
diff --git a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
index 1c5ac1992f0f649ca47e2a5bc81ea332abc46bf5..4a19b69a7c9dfc70192d446f922052606c516365 100644
--- a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
@@ -76,6 +76,7 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
 }
 
 TEST(ResizeBilinear, TestResizeBilinear8Bit) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -91,6 +92,7 @@ TEST(ResizeBilinear, TestResizeBilinear8Bit) {
 }
 
 TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -106,6 +108,7 @@ TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
 }
 
 TEST(ResizeBilinear, TestResizeBilinear) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -121,6 +124,7 @@ TEST(ResizeBilinear, TestResizeBilinear) {
 }
 
 TEST(ResizeBilinear2x2, TestResizeBilinear) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
diff --git a/tensorflow/lite/kernels/internal/round.h b/tensorflow/lite/kernels/internal/round.h
index cb494bfd5374d90bac0c8f444e186f137f45a91f..135deced448afa63468bb018705e61bd03694a25 100644
--- a/tensorflow/lite/kernels/internal/round.h
+++ b/tensorflow/lite/kernels/internal/round.h
@@ -21,7 +21,8 @@ namespace tflite {
 
 // TODO(aselle): See if we can do this only on jdk. Also mikecase, check
 // if you need this for java host build.
-#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+#if defined(TF_LITE_USE_GLOBAL_ROUND) || \
+    (defined(__ANDROID__) && !defined(__NDK_MAJOR__))
 template <class T>
 inline float TfLiteRound(const float x) {
   return ::round(x);
diff --git a/tensorflow/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
index 743ce0355c96fd2766fd2315299c2419703f11b7..8ac62d9af787b2846a0f2031a3c9bcd9f2ab44d7 100644
--- a/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
@@ -210,7 +210,7 @@ bool TryOneSkyscraperSoftmax(bool small_depth) {
 }
 
 TEST(TestQuantizedSoftmax, UniformSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneUniformSoftmax()) {
     }
@@ -218,7 +218,7 @@ TEST(TestQuantizedSoftmax, UniformSoftmaxTests) {
 }
 
 TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperSoftmax(false)) {
     }
@@ -226,7 +226,7 @@ TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) {
 }
 
 TEST(TestQuantizedSoftmax, SmallSkyscraperSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperSoftmax(true)) {
     }
diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h
index 71ae69522f9a45745a9ed9eae211db3d048ba43d..4f18f283b6094c66fb89080115d359ffce776dd8 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/tensor_utils.h
@@ -55,6 +55,21 @@ void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                          int n_batch, float* result,
                                          int result_stride);
 
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//   with
+//      an integer representing the number of non-zero blocks for the
+//      corresponding row and follows with column indexes of the first element
+//      of each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
 // Same as the function above, but for values quantized using symmetric
 // quantization (e.g. by calling SymmetricQuantizeFloats).
 // The passed scaling factors is a buffer of the quantization scaling factors
@@ -67,6 +82,23 @@ void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//   with
+//      an integer representing the number of non-zero blocks for the
+//      corresponding row followed by column index of the first element of
+//      each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result);
diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
index 29866d066406e58e06e6caa2e5b410460564c966..3ba4af7c468421cbc8d559e3f8777854ba2fc53b 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
@@ -149,6 +149,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   // 16-block SIMD code, the 8-block postamble, and the leftover postamble.
   const int a_rows = 4, a_cols = 29;
   const int kWeightsPerUint32 = 4;
+  /* clang-format off */
   const float a_float_data[] = {
       /* 1st row */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
@@ -174,126 +175,18 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min,
                           &a_max, &scaling_factor_a);
   const int8_t expected_a_int8_data[] = {
-      /* 1st row */
-      5,
-      10,
-      15,
-      20,
-      25,
-      30,
-      35,
-      40,
-      44,
-      45,
-      50,
-      54,
-      59,
-      64,
-      68,
-      73,
-      77,
-      82,
-      86,
-      91,
-      95,
-      100,
-      104,
-      109,
-      113,
-      118,
-      122,
-      127,
-      0,
-      /* 2nd row */
-      -5,
-      -10,
-      -15,
-      -20,
-      -25,
-      -30,
-      -35,
-      -40,
-      -44,
-      -45,
-      -50,
-      -54,
-      -59,
-      -64,
-      -68,
-      -73,
-      -77,
-      -82,
-      -86,
-      -91,
-      -95,
-      -100,
-      -104,
-      -109,
-      -113,
-      -118,
-      -122,
-      -127,
-      0,
-      /* 3rd row */
-      5,
-      -10,
-      15,
-      -20,
-      25,
-      -30,
-      35,
-      -40,
-      44,
-      -45,
-      50,
-      -54,
-      59,
-      -64,
-      68,
-      -73,
-      77,
-      -82,
-      86,
-      -91,
-      95,
-      -100,
-      104,
-      -109,
-      113,
-      -118,
-      122,
-      -127,
-      0,
-      /* 4th row */
-      -5,
-      10,
-      -15,
-      20,
-      -25,
-      30,
-      -35,
-      40,
-      -44,
-      45,
-      -50,
-      54,
-      -59,
-      64,
-      -68,
-      73,
-      -77,
-      82,
-      -86,
-      91,
-      -95,
-      100,
-      -104,
-      109,
-      -113,
-      118,
-      -122,
-      127,
-      0,
+    /* 1st row */
+    5, 10, 15, 20, 25, 30, 35, 40, 44, 45, 50, 54, 59, 64, 68, 73, 77, 82, 86,
+    91, 95, 100, 104, 109, 113, 118, 122, 127, 0,
+    /* 2nd row */
+    -5, -10, -15, -20, -25, -30, -35, -40, -44, -45, -50, -54, -59, -64, -68,
+    -73, -77, -82, -86, -91, -95, -100, -104, -109, -113, -118, -122, -127, 0,
+    /* 3rd row */
+    5, -10, 15, -20, 25, -30, 35, -40, 44, -45, 50, -54, 59, -64, 68, -73, 77,
+    -82, 86, -91, 95, -100, 104, -109, 113, -118, 122, -127, 0,
+    /* 4th row */
+    -5, 10, -15, 20, -25, 30, -35, 40, -44, 45, -50, 54, -59, 64, -68, 73, -77,
+    82, -86, 91, -95, 100, -104, 109, -113, 118, -122, 127, 0,
   };
   for (int i = 0; i < a_rows * a_cols; ++i) {
     EXPECT_EQ(expected_a_int8_data[i], a_int8_data[i]);
@@ -301,66 +194,14 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
 
   const int b_rows = 29, b_cols = 1, batches = 2;
   const float b_float_data[] = {
-      /* batch 1 */
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      /* batch 2 */
-      2.5,
-      -2.1,
-      3.0,
-      -1.3,
-      1.3,
-      -1.1,
-      2.0,
-      -1.7,
-      1.9,
-      -1.5,
-      0.5,
-      -0.7,
-      0.8,
-      -0.3,
-      2.8,
-      -2.8,
-      1.1,
-      -2.3,
-      1.9,
-      -1.9,
-      2.1,
-      -0.5,
-      2.4,
-      -0.1,
-      1.0,
-      -2.5,
-      0.7,
-      -1.9,
-      0.2,
+    /* batch 1 */
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0,
+    /* batch 2 */
+    2.5, -2.1, 3.0, -1.3, 1.3, -1.1, 2.0, -1.7, 1.9, -1.5, 0.5, -0.7, 0.8, -0.3,
+    2.8, -2.8, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5, 0.7, -1.9,
+    0.2,
   };
 
   // Quantized values of B:
@@ -374,67 +215,15 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
                           &scaling_factor_b[1]);
 
   const int8_t expected_b_int8_data[] = {
-      /* batch 1 */
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      /* batch 2 */
-      106,
-      -89,
-      127,
-      -55,
-      55,
-      -47,
-      85,
-      -72,
-      80,
-      -64,
-      21,
-      -30,
-      34,
-      -13,
-      119,
-      -119,
-      47,
-      -97,
-      80,
-      -80,
-      89,
-      -21,
-      102,
-      -4,
-      42,
-      -106,
-      30,
-      -80,
-      8,
+    /* batch 1 */
+    127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+    127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+    127,
+    /* batch 2 */
+    106, -89, 127, -55, 55, -47, 85, -72, 80, -64, 21, -30, 34, -13, 119, -119,
+    47, -97, 80, -80, 89, -21, 102, -4, 42, -106, 30, -80, 8,
   };
+  /* clang-format on */
   for (int i = 0; i < b_rows * b_cols * batches; ++i) {
     EXPECT_EQ(expected_b_int8_data[i], b_int8_data[i]);
   }
@@ -468,6 +257,161 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
 }
 #endif  // __ANDROID__
 
+TEST(uKernels, SparseMatrixBatchVectorMultiplyAccumulateTest) {
+  const int kRow = 4;
+  const int kCol = 48;
+  const int kBatch = 2;
+  /* clang-format off */
+  float matrix[kRow * kCol] = {
+      /* 1st row */
+      1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
+      14.14, 15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38,
+      39.39, 40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
+      /* 2nd row */
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24,
+      -25.25, -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
+      /* 3rd row */
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25,
+      -26.26, 27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
+      /* 4th row */
+      -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
+      -13.13, 14.14, -15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -33.33, 34.34, -35.35, 36.36, -37.37,
+      38.38, -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
+
+  // BCSR format of the above matrix.
+  float matrix_values[] = {
+      /* 1st row */
+      1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
+      14.14, 15.15, 16.16, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38, 39.39,
+      40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
+      /* 2nd row */
+      -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24, -25.25,
+      -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0,
+      /* 3rd row */
+      17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25, -26.26,
+      27.27, -28.28, 0, 0.0, 0.0, 0.0,
+      /* 4th row */
+      -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
+      -13.13, 14.14, -15.15, 16.16, -33.33, 34.34, -35.35, 36.36, -37.37, 38.38,
+      -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
+  uint8_t ledger[] = {
+      2, 0,  2,  // 1st row
+      1, 1,      // 2nd row
+      1, 1,      // 3rd row
+      2, 0,  2   // 4th row
+  };
+
+  float vector[kBatch * kCol] = {
+    /* 1st batch */
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    /* 2nd batch */
+    2.5, 0.0, -2.1, 0.0, 3.0, 0.0, -1.3, 0.0, 1.3, 0.0, -1.1, 0.0, 2.0, 0.0,
+    -1.7, 0.0, 1.9, 0.0, -1.5, 0.0, 0.5, 0.0, -0.7, 0.0, 0.8, 0.0, -0.3, 0.0,
+    2.8, 0.0, -2.8, 0.0, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5,
+    0.7, -1.9, 0.2, 0.0, 0.1, 0.2,
+  };
+  /* clang-format on */
+
+  std::vector<float> dense_output(kRow * kBatch, 0.0);
+  MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+                                      dense_output.data(), /*result_stride=*/1);
+
+  std::vector<float> sparse_output(kRow * kBatch, 0.0);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      matrix_values, ledger, kRow, kCol, vector, kBatch, sparse_output.data(),
+      /*result_stride=*/1);
+
+  EXPECT_THAT(sparse_output,
+              ElementsAreArray(ArrayFloatNear(dense_output, 1e-4)));
+}
+
+#ifdef __ANDROID__
+TEST(uKernels,
+     SparseMatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
+  const int kRow = 4;
+  const int kCol = 48;
+  const int kBatch = 2;
+  /* clang-format off */
+  const int8_t quantized_matrix[] = {
+      /* 1st row */
+      3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 98, 101, 104, 107, 110, 113, 115,
+      118, 121, 124, 127, 0, 0, 0, 0,
+      /* 2nd row */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -49, -52, -55, -58, -61,
+      -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0,
+      /* 3rd row */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, -52, 55, -58, 61, -64,
+      66, -69, 72, -75, 78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0,
+      /* 4th row */
+      -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -95, 98, -101, 104, -107, 110,
+      -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
+  };
+  const int8_t quantized_matrix_values[] = {
+      /* 1st row */
+      3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 95, 98, 101,
+      104, 107, 110, 113, 115, 118, 121, 124, 127, 0, 0, 0, 0,
+      /* 2nd row */
+      -49, -52, -55, -58, -61, -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0,
+      /* 3rd row */
+      49, -52, 55, -58, 61, -64, 66, -69, 72, -75, 78, -81, 0, 0, 0, 0,
+      /* 4th row */
+      -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, -95,
+      98, -101, 104, -107, 110, -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
+  };
+  uint8_t ledger[] = {
+      2, 0,  2,  // 1st row
+      1, 1,      // 2nd row
+      1, 1,      // 3rd row
+      2, 0,  2   // 4th row
+  };
+
+  float matrix_scaling_factor = 0.349921;
+
+  const int8_t quantized_vector[] = {
+      /* 1st batch */
+      127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
+      -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+      127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
+      -127, 127, -127, 127, -127, 127, -127, 127, -127,
+      /* 2nd batch */
+      106, 0, -89, 0, 127, 0, -55, 0, 55, 0, -47, 0, 85, 0, -72, 0, 80, 0,
+      -64, 0, 21, 0, -30, 0, 34, 0, -13, 0, 119, 0, -119, 0, 47, -97, 80, -80,
+      89, -21, 102, -4, 42, -106, 30, -80, 8, 1, 2, 3,
+  };
+  float vector_scaling_factor[2] = {0.00787402, 0.023622};
+
+  /* clang-format on */
+  float result_scaling_factor[2] = {
+      matrix_scaling_factor * vector_scaling_factor[0],
+      matrix_scaling_factor * vector_scaling_factor[1],
+  };
+  std::vector<float> dense_output(kRow * kBatch, 0.0);
+  MatrixBatchVectorMultiplyAccumulate(quantized_matrix, kRow, kCol,
+                                      quantized_vector, result_scaling_factor,
+                                      kBatch, dense_output.data(),
+                                      /*result_stride=*/1);
+  std::vector<float> sparse_output(kRow * kBatch, 0.0);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      quantized_matrix_values, ledger, kRow, kCol, quantized_vector,
+      result_scaling_factor, kBatch, sparse_output.data(),
+      /*result_stride=*/1);
+  EXPECT_THAT(sparse_output, ElementsAreArray(ArrayFloatNear(dense_output)));
+}
+#endif  // __ANDROID__
+
 TEST(uKernels, VectorVectorCwiseProductTest) {
   constexpr int kVectorSize = 10;
   static float input1[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index e39890e3320eb4d1e2dcd0c8256bb96631e75011..57f4bfa9fa29ca39aa2506a08870ef6b2d61ab09 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -103,6 +103,16 @@ void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
                                         act_max);
 }
 
+void CalculateActivationRangeInt8(TfLiteFusedActivation activation,
+                                  TfLiteTensor* output, int32_t* act_min,
+                                  int32_t* act_max) {
+  const int32_t qmin = std::numeric_limits<int8_t>::min();
+  const int32_t qmax = std::numeric_limits<int8_t>::max();
+
+  CalculateActivationRangeQuantizedImpl(activation, qmin, qmax, output, act_min,
+                                        act_max);
+}
+
 bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
   return TfLiteIntArrayEqual(input1->dims, input2->dims);
 }
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index 3cc00588d63feddc90d17997cebe2c8d063c45eb..4cfc885f8939481f1515b445dfc9e261a4e79ed9 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -104,6 +104,9 @@ TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
 void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
                                    TfLiteTensor* output, int32_t* act_min,
                                    int32_t* act_max);
+void CalculateActivationRangeInt8(TfLiteFusedActivation activation,
+                                  TfLiteTensor* output, int32_t* act_min,
+                                  int32_t* act_max);
 // Calculates the useful range of an activation layer given its activation
 // tensor.a
 template <typename T>
diff --git a/tensorflow/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
index 70eb18365891097686d579bde4a5457703e84aee..185ddfaa9dfb350eebe25915cdd6b7f9437f880a 100644
--- a/tensorflow/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -28,6 +28,8 @@ class KernelUtilTest : public ::testing::Test {
   KernelUtilTest() {
     context_.ReportError = ReportError;
 
+    memset(&tensor1_, 0, sizeof(TfLiteTensor));
+    memset(&tensor2_, 0, sizeof(TfLiteTensor));
     tensor1_.dims = nullptr;
     tensor2_.dims = nullptr;
     tensor1_.allocation_type = kTfLiteMmapRo;
diff --git a/tensorflow/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
index 49e8a53c829a0c4a8ae355f8e7a6b97e3bbb81e1..ce0c21dfcba770b72f144c272d7ab12b2e77e399 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// DEPRECATED: Tensorflow Lite has implemented layer norm lstm as builtin Op and
+// the implementation of layer norm lstm as custom Op in this file is
+// deprecated. It is only kept for backward compatibility.
+//
 // Layer Normalization LSTM op that applies normalization by mean and standard
 // deviation to the activation of the LSTM layers. Please see
 // https://arxiv.org/abs/1607.06450 for details.
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
index 1c13cee1c3f66ed2a3459cd2bcc32211c3b1f00e..5aed818f2407a96acb8893654971fc5bb91a81ed 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm_test.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
@@ -133,85 +133,87 @@ class LayerNormLSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormWeights(std::vector<float> f) {
+  void SetInputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_weights_, f);
   }
 
-  void SetForgetLayerNormWeights(std::vector<float> f) {
+  void SetForgetLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_weights_, f);
   }
 
-  void SetCellLayerNormWeights(std::vector<float> f) {
+  void SetCellLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_weights_, f);
   }
 
-  void SetOutputLayerNormWeights(std::vector<float> f) {
+  void SetOutputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_weights_, f);
   }
 
-  void SetInputGateBias(std::vector<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::vector<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+  void SetCellBias(const std::vector<float>& f) {
+    PopulateTensor(cell_bias_, f);
+  }
 
-  void SetOutputGateBias(std::vector<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::vector<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -280,67 +282,67 @@ class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
                              use_projection_bias, cell_clip, proj_clip,
                              input_shapes, TensorType_UINT8) {}
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormWeights(std::vector<float> f) {
+  void SetInputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_weights_, f);
   }
 
-  void SetForgetLayerNormWeights(std::vector<float> f) {
+  void SetForgetLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_weights_, f);
   }
 
-  void SetCellLayerNormWeights(std::vector<float> f) {
+  void SetCellLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_weights_, f);
   }
 
-  void SetOutputLayerNormWeights(std::vector<float> f) {
+  void SetOutputLayerNormWeights(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_weights_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(projection_weights_, f);
   }
 };
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index 3689d77b012bf2ff4bd9bc791f10802c861c010d..470c74d207d51688c3c48de0fc8bdecda43097a7 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -382,7 +382,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // The weights are of consistent type, so it suffices to check one.
   // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+  const bool is_hybrid_op = ((input_to_output_weights->type == kTfLiteUInt8 ||
+                              input_to_output_weights->type == kTfLiteInt8) &&
                              input->type == kTfLiteFloat32);
 
   TfLiteIntArrayFree(node->temporaries);
@@ -418,7 +419,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // activation_state and cell_state tensors.
     node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_to_output_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -428,7 +429,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
     TfLiteTensor* activation_state_quantized =
         GetTemporary(context, node, /*index=*/2);
-    activation_state_quantized->type = kTfLiteUInt8;
+    activation_state_quantized->type = input_to_output_weights->type;
     activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
                              activation_state->dims)) {
@@ -441,7 +442,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
     TfLiteTensor* cell_state_quantized =
         GetTemporary(context, node, /*index=*/3);
-    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->type = input_to_output_weights->type;
     cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
       TfLiteIntArray* cell_state_quantized_size =
@@ -595,7 +596,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
           output);
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
       TfLiteTensor* activation_state_quantized =
           GetTemporary(context, node, /*index=*/2);
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 6ba1e19343746b8ffd7d4f34782a10d2a0f5a966..244cfae4a20b93b32022bee412f241397df53c49 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <cstdint>
 
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -39,9 +40,44 @@ const float kLayerNormEpsilon = 1e-8;
 //  - n_batch: size of batch,
 //  - n_cell: number of cells (or units),
 //  - n_input: the input size,
+//  - n_aux_input: the auxilary input size.
 //  - n_output: the output size.
 //  - output_batch_leading_dim: the leading dimension of the output buffer.
 //
+// LSTM weights:
+// Input weights of size 'n_cell * n_input':
+//   input_to_input_weights            - optional (can be nullptr)
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_output_weights
+// Auxilary input weights of size 'n_cell * n_aux_input':
+//   aux_input_to_input_weights        - optional
+//   aux_input_to_forget_weights       - optional
+//   aux_input_to_cell_weights         - optional
+//   aux_input_to_output_weights       - optional
+// Recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights        - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_input_weights
+// Peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights             - optional
+//   cell_to_cell_weights              - optional
+//   cell_to_output_weights            - optional
+// Projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr            - optional
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr               - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   input_layer_norm_coefficients_ptr  - optional
+//   forget_layer_norm_coefficients_ptr - optional
+//   cell_layer_norm_coefficients_ptr   - optional
+//   output_layer_norm_coefficients_ptr - optional
+//
 // The pointers to the cell and output state and the output are updated.
 //
 // The pointers with the suffix "_batch" point to data aligned in batch_major
@@ -335,6 +371,11 @@ inline void LstmStepWithAuxInput(
 //   input_to_forget_weights
 //   input_to_cell_weights
 //   input_to_input_weights
+// Quantized auxilary input weights of size 'n_cell * n_aux_input':
+//   aux_input_to_input_weights        - optional
+//   aux_input_to_forget_weights       - optional
+//   aux_input_to_cell_weights         - optional
+//   aux_input_to_output_weights       - optional
 // Quantized recurrent weights of size 'n_cell * n_output':
 //   recurrent_to_input_weights        - optional
 //   recurrent_to_forget_weights
@@ -351,6 +392,10 @@ inline void LstmStepWithAuxInput(
 //   input_to_forget_weights_scale
 //   input_to_cell_weights_scale
 //   input_to_output_weights_scale
+//   aux_input_to_input_weights_scale  - optional
+//   aux_input_to_forget_weights_scale - optional
+//   aux_input_to_cell_weights_scale   - optional
+//   aux_input_to_output_weights_scale - optional
 //   recurrent_to_input_weights_scale  - optional
 //   recurrent_to_forget_weights_scale
 //   recurrent_to_cell_weights_scale
@@ -365,6 +410,12 @@ inline void LstmStepWithAuxInput(
 //   cell_gate_bias_ptr
 //   output_gate_bias_ptr
 //
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   input_layer_norm_coefficients_ptr  - optional
+//   forget_layer_norm_coefficients_ptr - optional
+//   cell_layer_norm_coefficients_ptr   - optional
+//   output_layer_norm_coefficients_ptr - optional
+//
 // Temporary pre-allocated storage for quantized values:
 //   quantized_input_ptr_batch (same size as input_ptr_batch)
 //   quantized_output_state_ptr (same size as output_state_ptr)
@@ -805,6 +856,15 @@ inline void LstmStepWithAuxInput(
     }
   }
 }
+
+int8_t* GetInt8DataPtr(const TfLiteTensor* tensor, const bool is_uint8) {
+  if (is_uint8) {
+    return reinterpret_cast<int8_t*>(tensor->data.uint8);
+  } else {
+    return tensor->data.int8;
+  }
+}
+
 }  // namespace
 
 TfLiteStatus EvalFloat(
@@ -1036,6 +1096,9 @@ TfLiteStatus EvalHybrid(
     TfLiteTensor* aux_input_quantized, TfLiteTensor* output_state_quantized,
     TfLiteTensor* cell_state_quantized, TfLiteTensor* output_state,
     TfLiteTensor* cell_state, TfLiteTensor* output) {
+  // For operations that use int8 instead of uint8 we need to fetch raw data
+  // from the tensor different. We use this bool for that condition.
+  const bool is_uint8_hybrid = input_to_output_weights->type == kTfLiteUInt8;
   TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3);
   const int n_input = input->dims->data[input->dims->size - 1];
   int max_time, n_batch;
@@ -1081,9 +1144,9 @@ TfLiteStatus EvalHybrid(
   float* input_gate_bias_ptr = nullptr;
   if (!use_cifg) {
     input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
+        GetInt8DataPtr(input_to_input_weights, is_uint8_hybrid);
     recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
+        GetInt8DataPtr(recurrent_to_input_weights, is_uint8_hybrid);
     input_gate_bias_ptr = input_gate_bias->data.f;
     input_to_input_weights_scale = input_to_input_weights->params.scale;
     recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
@@ -1098,13 +1161,13 @@ TfLiteStatus EvalHybrid(
   if (use_peephole) {
     if (!use_cifg) {
       cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
+          GetInt8DataPtr(cell_to_input_weights, is_uint8_hybrid);
       cell_to_input_weights_scale = cell_to_input_weights->params.scale;
     }
     cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
+        GetInt8DataPtr(cell_to_forget_weights, is_uint8_hybrid);
     cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
+        GetInt8DataPtr(cell_to_output_weights, is_uint8_hybrid);
     cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
     cell_to_output_weights_scale = cell_to_output_weights->params.scale;
   }
@@ -1122,7 +1185,7 @@ TfLiteStatus EvalHybrid(
   const int8_t* projection_weights_ptr =
       (projection_weights == nullptr)
           ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
+          : GetInt8DataPtr(projection_weights, is_uint8_hybrid);
   const float projection_weights_scale =
       (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
   const float* projection_bias_ptr =
@@ -1130,26 +1193,26 @@ TfLiteStatus EvalHybrid(
 
   // Required tensors, pointers are non-null.
   const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
+      GetInt8DataPtr(input_to_forget_weights, is_uint8_hybrid);
   const float input_to_forget_weights_scale =
       input_to_forget_weights->params.scale;
   const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
+      GetInt8DataPtr(input_to_cell_weights, is_uint8_hybrid);
   const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
   const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
+      GetInt8DataPtr(input_to_output_weights, is_uint8_hybrid);
   const float input_to_output_weights_scale =
       input_to_output_weights->params.scale;
   const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
+      GetInt8DataPtr(recurrent_to_forget_weights, is_uint8_hybrid);
   const float recurrent_to_forget_weights_scale =
       recurrent_to_forget_weights->params.scale;
   const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
+      GetInt8DataPtr(recurrent_to_cell_weights, is_uint8_hybrid);
   const float recurrent_to_cell_weights_scale =
       recurrent_to_cell_weights->params.scale;
   const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
+      GetInt8DataPtr(recurrent_to_output_weights, is_uint8_hybrid);
   const float recurrent_to_output_weights_scale =
       recurrent_to_output_weights->params.scale;
   const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
@@ -1158,15 +1221,15 @@ TfLiteStatus EvalHybrid(
 
   // Temporary storage for quantized values and scaling factors.
   int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+      GetInt8DataPtr(input_quantized, is_uint8_hybrid);
   int8_t* quantized_aux_input_ptr =
       (aux_input_quantized == nullptr)
           ? nullptr
-          : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8);
+          : GetInt8DataPtr(aux_input_quantized, is_uint8_hybrid);
   int8_t* quantized_output_state_ptr =
-      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
+      GetInt8DataPtr(output_state_quantized, is_uint8_hybrid);
   int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
+      GetInt8DataPtr(cell_state_quantized, is_uint8_hybrid);
   float* scaling_factors_ptr = scaling_factors->data.f;
   float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
   float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
@@ -1184,14 +1247,14 @@ TfLiteStatus EvalHybrid(
   if (aux_input_size > 0) {
     if (!use_cifg) {
       aux_input_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8);
+          GetInt8DataPtr(aux_input_to_input_weights, is_uint8_hybrid);
     }
     aux_input_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8);
+        GetInt8DataPtr(aux_input_to_forget_weights, is_uint8_hybrid);
     aux_input_to_cell_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8);
+        GetInt8DataPtr(aux_input_to_cell_weights, is_uint8_hybrid);
     aux_input_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8);
+        GetInt8DataPtr(aux_input_to_output_weights, is_uint8_hybrid);
     if (!use_cifg) {
       aux_input_to_input_weights_scale =
           aux_input_to_input_weights->params.scale;
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index fea95aacb1f877bab14af65fd4777d4ab1d342b5..40ee94888136207eddcb38577377027c718a0a58 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -129,85 +129,87 @@ class LSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormCoefficients(std::vector<float> f) {
+  void SetInputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_coefficients_, f);
   }
 
-  void SetForgetLayerNormCoefficients(std::vector<float> f) {
+  void SetForgetLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_coefficients_, f);
   }
 
-  void SetCellLayerNormCoefficients(std::vector<float> f) {
+  void SetCellLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_coefficients_, f);
   }
 
-  void SetOutputLayerNormCoefficients(std::vector<float> f) {
+  void SetOutputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_coefficients_, f);
   }
 
-  void SetInputGateBias(std::vector<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::vector<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+  void SetCellBias(const std::vector<float>& f) {
+    PopulateTensor(cell_bias_, f);
+  }
 
-  void SetOutputGateBias(std::vector<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::vector<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -270,57 +272,70 @@ class HybridLSTMOpModel : public LSTMOpModel {
                     bool use_cifg, bool use_peephole,
                     bool use_projection_weights, bool use_projection_bias,
                     float cell_clip, float proj_clip,
-                    const std::vector<std::vector<int>>& input_shapes)
+                    const std::vector<std::vector<int>>& input_shapes,
+                    TensorType tensor_type)
       : LSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg, use_peephole,
                     use_projection_weights, use_projection_bias, cell_clip,
-                    proj_clip, input_shapes, TensorType_UINT8) {}
+                    proj_clip, input_shapes, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  TensorType tensor_type_;
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
 
-  void SetInputToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+  void SetInputToInputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+  void SetInputToForgetWeights(const std::vector<float>& f) {
+    SetWeights(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+  void SetInputToCellWeights(const std::vector<float>& f) {
+    SetWeights(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+  void SetInputToOutputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+  void SetCellToInputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+  void SetCellToForgetWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+  void SetCellToOutputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_output_weights_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
+  void SetProjectionWeights(const std::vector<float>& f) {
+    SetWeights(projection_weights_, f);
   }
 };
 
@@ -487,7 +502,8 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -523,7 +539,67 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {0, 0},  // projection_weight tensor
           {0},     // projection_bias tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm,
+                /*tolerance=*/0.0157651);
+}
+
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/false,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -647,7 +723,8 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -684,7 +761,67 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {0, 0},  // projection_weight tensor
           {0},     // projection_bias tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573);
+}
+
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToCellWeights(input_to_cell_weights_);
   lstm.SetInputToForgetWeights(input_to_forget_weights_);
@@ -1364,7 +1501,7 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTesInt8) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -1400,7 +1537,72 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_output, n_cell},  // projection_weight tensor
           {0},                 // projection_bias tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
+}
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -1447,74 +1649,87 @@ class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
                              bool use_projection_weights,
                              bool use_projection_bias, float cell_clip,
                              float proj_clip,
-                             const std::vector<std::vector<int>>& input_shapes)
+                             const std::vector<std::vector<int>>& input_shapes,
+                             TensorType tensor_type)
       : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg,
                              use_peephole, use_projection_weights,
                              use_projection_bias, cell_clip, proj_clip,
-                             input_shapes, TensorType_UINT8) {}
+                             input_shapes, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  TensorType tensor_type_;
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
 
-  void SetInputToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+  void SetInputToInputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+  void SetInputToForgetWeights(const std::vector<float>& f) {
+    SetWeights(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+  void SetInputToCellWeights(const std::vector<float>& f) {
+    SetWeights(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+  void SetInputToOutputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+  void SetCellToInputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+  void SetCellToForgetWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+  void SetCellToOutputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_output_weights_, f);
   }
 
-  void SetInputLayerNormCoefficients(std::vector<float> f) {
+  void SetInputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(input_layer_norm_coefficients_, f);
   }
 
-  void SetForgetLayerNormCoefficients(std::vector<float> f) {
+  void SetForgetLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(forget_layer_norm_coefficients_, f);
   }
 
-  void SetCellLayerNormCoefficients(std::vector<float> f) {
+  void SetCellLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(cell_layer_norm_coefficients_, f);
   }
 
-  void SetOutputLayerNormCoefficients(std::vector<float> f) {
+  void SetOutputLayerNormCoefficients(const std::vector<float>& f) {
     PopulateTensor(output_layer_norm_coefficients_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
+  void SetProjectionWeights(const std::vector<float>& f) {
+    SetWeights(projection_weights_, f);
   }
 };
 
@@ -1744,7 +1959,7 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
 }
 
 TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
+       HybridLayerNormLstmBlackBoxTestUint8) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 4;
@@ -1789,7 +2004,103 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
           {n_cell},  // forget_layer_norm_coefficient tensor
           {n_cell},  // cell_layer_norm_coefficient tensor
           {n_cell},  // output_layer_norm_coefficient tensor
-      });
+      },
+      TensorType_UINT8);
+
+  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetInputLayerNormCoefficients(input_layer_norm_coefficients_);
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0244576, 0.127847, -0.00181765,  // seq 0
+          0.0137518, 0.140892, 0.0402234,    // seq 1
+          -0.0048839, 0.155096, 0.0840309,   // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.00728636, 0.0843957, 0.0634786,  // seq 0
+          -0.00448382, 0.139278, 0.0737372,   // seq 1
+          0.00734616, 0.161793, 0.0560238,    // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTestInt8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_cell},  // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      },
+      TensorType_INT8);
 
   layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
   layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -1975,7 +2286,7 @@ TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
 }
 
 TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
+       HybridLayerNormLstmBlackBoxTestUint8) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 4;
@@ -2020,7 +2331,99 @@ TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
           {n_cell},  // forget_layer_norm_coefficient tensor
           {n_cell},  // cell_layer_norm_coefficient tensor
           {n_cell},  // output_layer_norm_coefficient tensor
-      });
+      },
+      TensorType_UINT8);
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
+          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
+          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
+          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
+          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTestInt8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {0},       // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      },
+      TensorType_INT8);
 
   layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
   layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 415a285c707e6aa7a5a2029822cdf54d57692839..3caa4065dcbadd699ee9e61b8e97a42281d32309 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -175,6 +175,7 @@ class PadOpDynamicModel : public PadOpModel<float> {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
       PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
@@ -195,6 +196,7 @@ TEST(PadOpTest, InvalidPadValue) {
                       {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
+#endif
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -306,6 +308,7 @@ class QuantizedPadOpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -314,6 +317,7 @@ TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
                                  {TensorType_UINT8, {}, 1.0, 2.0}),
                ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedPadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -371,6 +375,7 @@ TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(PadV2OpTest, TooManyDimensions) {
   EXPECT_DEATH(PadV2OpConstModel<float>(
                    {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
@@ -392,6 +397,7 @@ TEST(PadV2OpTest, InvalidPadValue) {
                                         {TensorType_FLOAT32}),
                "Pad value has to be greater than equal to 0.");
 }
+#endif
 
 TEST(PadV2OpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -495,6 +501,7 @@ class QuantizedPadV2OpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -504,6 +511,7 @@ TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
                                  {TensorType_UINT8, {}, 1.0, 2.0}),
       ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
index 694a36ffbcf3c8c9d8fe65e1b922ca03921883b3..e6155fcb8c67ed3b5e676c2530ec7966d6cec56f 100644
--- a/tensorflow/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -98,7 +99,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   data->padding.width = ComputePadding(params->stride_width, 1, width,
                                        params->filter_width, out_width);
 
-  if (input->type == kTfLiteUInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
     if (pool_type == kAverage || pool_type == kMax) {
       TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale);
       TF_LITE_ENSURE_EQ(context, input->params.zero_point,
@@ -147,9 +148,10 @@ void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node,
 }
 
 template <KernelType kernel_type>
-void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                          TfLitePoolParams* params, OpData* data,
-                          const TfLiteTensor* input, TfLiteTensor* output) {
+void AverageEvalQuantizedUint8(TfLiteContext* context, TfLiteNode* node,
+                               TfLitePoolParams* params, OpData* data,
+                               const TfLiteTensor* input,
+                               TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -175,6 +177,27 @@ void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_AVERAGE_POOL
 }
 
+void AverageEvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
+                              TfLitePoolParams* params, OpData* data,
+                              const TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeInt8(params->activation, output, &activation_min,
+                               &activation_max);
+  tflite::PoolParams op_params;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.filter_height = params->filter_height;
+  op_params.filter_width = params->filter_width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+  op_params.quantized_activation_min = activation_min;
+  op_params.quantized_activation_max = activation_max;
+  reference_integer_ops::AveragePool(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(output), GetTensorData<int8_t>(output));
+}
+
 template <KernelType kernel_type>
 void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
                   TfLitePoolParams* params, OpData* data,
@@ -272,8 +295,11 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
       AverageEvalFloat<kernel_type>(context, node, params, data, input, output);
       break;
     case kTfLiteUInt8:
-      AverageEvalQuantized<kernel_type>(context, node, params, data, input,
-                                        output);
+      AverageEvalQuantizedUint8<kernel_type>(context, node, params, data, input,
+                                             output);
+      break;
+    case kTfLiteInt8:
+      AverageEvalQuantizedInt8(context, node, params, data, input, output);
       break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index 98777f1c13ff97551c05cddc1d319918ea6ed69a..e1b79340115ad18e50ecdb6944904bf2ab7c9e84 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -78,6 +78,25 @@ class QuantizedPoolingOpModel : public BasePoolingOpModel {
   }
 };
 
+class SymmetricQuantizedPoolingOpModel : public BasePoolingOpModel {
+ public:
+  using BasePoolingOpModel::BasePoolingOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  void SetInput(const std::vector<float>& data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  std::vector<int8_t> GetOutput() { return ExtractVector<int8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<int8_t>(ExtractVector<int8_t>(output_), GetScale(output_),
+                              GetZeroPoint(output_));
+  }
+};
+
 TEST(FloatPoolingOpTest, AveragePool) {
   FloatPoolingOpModel m(BuiltinOperator_AVERAGE_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
@@ -128,6 +147,29 @@ TEST(QuantizedPoolingOpTest, AveragePoolImageSize16) {
   EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({16})));
 }
 
+// Test quantized AveragePool with int8 input and output. The input is the same
+// as the uint8 test QuantizedPoolingOpTest.AveragePool. The float output is
+// identical to uint8 test and quantized output is identical to uint8 test with
+// a 128 shift.
+TEST(QuantizedPoolingOpTest, SymmetricAveragePool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  SymmetricQuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_INT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({2.75, 5.75})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({44 - 128, 92 - 128}));
+}
+
 // Send in a white image, expect something other than a white pixel, due to
 // overflow.
 TEST(QuantizedPoolingOpTest, AveragePoolImageSize17) {
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index d5219bfa8333586ebd61cfe5f9e2fd7687074e04..fcfe0b25bbc0efd295a3b4d9fde787de259f5834 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -94,6 +94,7 @@ TfLiteRegistration* Register_GREATER_EQUAL();
 TfLiteRegistration* Register_LESS();
 TfLiteRegistration* Register_LESS_EQUAL();
 TfLiteRegistration* Register_FLOOR();
+TfLiteRegistration* Register_CEIL();
 TfLiteRegistration* Register_TILE();
 TfLiteRegistration* Register_NEG();
 TfLiteRegistration* Register_SUM();
@@ -129,6 +130,8 @@ TfLiteRegistration* Register_LEAKY_RELU();
 TfLiteRegistration* Register_SQUARED_DIFFERENCE();
 TfLiteRegistration* Register_FILL();
 TfLiteRegistration* Register_MIRROR_PAD();
+TfLiteRegistration* Register_UNIQUE();
+TfLiteRegistration* Register_REVERSE_V2();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -167,22 +170,34 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D());
   AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
   AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
-  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(),
              /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
-  AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_RNN, Register_RNN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
-             Register_BIDIRECTIONAL_SEQUENCE_RNN());
+             Register_BIDIRECTIONAL_SEQUENCE_RNN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
-             Register_UNIDIRECTIONAL_SEQUENCE_RNN());
-  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+             Register_UNIDIRECTIONAL_SEQUENCE_RNN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
              Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(),
              /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
   AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
   AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
@@ -195,12 +210,13 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
              Register_BIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
-             Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
+             Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PAD, Register_PAD());
   AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
@@ -236,6 +252,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LESS, Register_LESS());
   AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
   AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
+  AddBuiltin(BuiltinOperator_CEIL, Register_CEIL());
   AddBuiltin(BuiltinOperator_NEG, Register_NEG());
   AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
   AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
@@ -271,6 +288,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
   AddBuiltin(BuiltinOperator_FILL, Register_FILL());
   AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
+  AddBuiltin(BuiltinOperator_UNIQUE, Register_UNIQUE());
+  AddBuiltin(BuiltinOperator_REVERSE_V2, Register_REVERSE_V2());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6840ea39bf243f476f7935ed85a53aacb044e498
--- /dev/null
+++ b/tensorflow/lite/kernels/register_ref.cc
@@ -0,0 +1,297 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/register_ref.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+namespace ops {
+
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
+TfLiteRegistration* Register_LAYER_NORM_LSTM();
+TfLiteRegistration* Register_MFCC();
+TfLiteRegistration* Register_DETECTION_POSTPROCESS();
+TfLiteRegistration* Register_RELU_1();
+
+}  // namespace custom
+
+namespace builtin {
+
+// TODO(yunluli): Some of the registries, e.g. Tanh(), could only invoke
+// optimized kernels. Add a _REF() variant for them.
+TfLiteRegistration* Register_ABS();
+TfLiteRegistration* Register_RELU();
+TfLiteRegistration* Register_RELU_N1_TO_1();
+TfLiteRegistration* Register_RELU6();
+TfLiteRegistration* Register_TANH_REF();
+TfLiteRegistration* Register_LOGISTIC_REF();
+TfLiteRegistration* Register_AVERAGE_POOL_REF();
+TfLiteRegistration* Register_MAX_POOL_REF();
+TfLiteRegistration* Register_L2_POOL_REF();
+TfLiteRegistration* Register_CONVOLUTION_REF();
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF();
+TfLiteRegistration* Register_SVDF();
+TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
+TfLiteRegistration* Register_FULLY_CONNECTED_REF();
+TfLiteRegistration* Register_LSH_PROJECTION();
+TfLiteRegistration* Register_HASHTABLE_LOOKUP();
+TfLiteRegistration* Register_SOFTMAX();
+TfLiteRegistration* Register_CONCATENATION_REF();
+TfLiteRegistration* Register_ADD_REF();
+TfLiteRegistration* Register_SPACE_TO_BATCH_ND_REF();
+TfLiteRegistration* Register_DIV_REF();
+TfLiteRegistration* Register_SUB_REF();
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND_REF();
+TfLiteRegistration* Register_MUL_REF();
+TfLiteRegistration* Register_L2NORM_REF();
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORM_REF();
+TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration* Register_PAD_REF();
+TfLiteRegistration* Register_PADV2_REF();
+TfLiteRegistration* Register_RESHAPE();
+TfLiteRegistration* Register_RESIZE_BILINEAR_REF();
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR_REF();
+TfLiteRegistration* Register_SKIP_GRAM();
+TfLiteRegistration* Register_SPACE_TO_DEPTH_REF();
+TfLiteRegistration* Register_GATHER();
+TfLiteRegistration* Register_TRANSPOSE_REF();
+TfLiteRegistration* Register_MEAN_REF();
+TfLiteRegistration* Register_SPLIT();
+TfLiteRegistration* Register_SPLIT_V();
+TfLiteRegistration* Register_SQUEEZE();
+TfLiteRegistration* Register_STRIDED_SLICE_REF();
+TfLiteRegistration* Register_EXP();
+TfLiteRegistration* Register_TOPK_V2();
+TfLiteRegistration* Register_LOG();
+TfLiteRegistration* Register_LOG_SOFTMAX_REF();
+TfLiteRegistration* Register_CAST();
+TfLiteRegistration* Register_DEQUANTIZE();
+TfLiteRegistration* Register_PRELU();
+TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_MINIMUM();
+TfLiteRegistration* Register_ARG_MAX();
+TfLiteRegistration* Register_ARG_MIN();
+TfLiteRegistration* Register_GREATER();
+TfLiteRegistration* Register_GREATER_EQUAL();
+TfLiteRegistration* Register_LESS();
+TfLiteRegistration* Register_LESS_EQUAL();
+TfLiteRegistration* Register_FLOOR_REF();
+TfLiteRegistration* Register_TILE();
+TfLiteRegistration* Register_NEG();
+TfLiteRegistration* Register_SUM();
+TfLiteRegistration* Register_REDUCE_PROD();
+TfLiteRegistration* Register_REDUCE_MAX();
+TfLiteRegistration* Register_REDUCE_MIN();
+TfLiteRegistration* Register_REDUCE_ANY();
+TfLiteRegistration* Register_SELECT();
+TfLiteRegistration* Register_SLICE_REF();
+TfLiteRegistration* Register_SIN();
+TfLiteRegistration* Register_TRANSPOSECONV_REF();
+TfLiteRegistration* Register_EXPAND_DIMS();
+TfLiteRegistration* Register_SPARSE_TO_DENSE();
+TfLiteRegistration* Register_EQUAL();
+TfLiteRegistration* Register_NOT_EQUAL();
+TfLiteRegistration* Register_SQRT();
+TfLiteRegistration* Register_RSQRT();
+TfLiteRegistration* Register_SHAPE();
+TfLiteRegistration* Register_POW();
+TfLiteRegistration* Register_FAKE_QUANT();
+TfLiteRegistration* Register_PACK();
+TfLiteRegistration* Register_ONE_HOT();
+TfLiteRegistration* Register_LOGICAL_OR();
+TfLiteRegistration* Register_LOGICAL_AND();
+TfLiteRegistration* Register_LOGICAL_NOT();
+TfLiteRegistration* Register_UNPACK();
+TfLiteRegistration* Register_FLOOR_DIV();
+TfLiteRegistration* Register_SQUARE();
+TfLiteRegistration* Register_ZEROS_LIKE();
+TfLiteRegistration* Register_FLOOR_MOD();
+TfLiteRegistration* Register_RANGE();
+TfLiteRegistration* Register_LEAKY_RELU();
+TfLiteRegistration* Register_SQUARED_DIFFERENCE();
+TfLiteRegistration* Register_FILL();
+TfLiteRegistration* Register_MIRROR_PAD();
+
+namespace {
+
+TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
+  context->ReportError(
+      context,
+      "Regular TensorFlow ops are not supported by this interpreter. Make sure "
+      "you invoke the Flex delegate before inference.");
+  return kTfLiteError;
+}
+
+}  // namespace
+
+const TfLiteRegistration* BuiltinRefOpResolver::FindOp(
+    tflite::BuiltinOperator op, int version) const {
+  return MutableOpResolver::FindOp(op, version);
+}
+
+const TfLiteRegistration* BuiltinRefOpResolver::FindOp(const char* op,
+                                                       int version) const {
+  // Return the NULL Op for all ops whose name start with "Flex", allowing
+  // the interpreter to delegate their execution.
+  if (IsFlexOp(op)) {
+    static TfLiteRegistration null_op{
+        nullptr, nullptr, &UnsupportedTensorFlowOp,
+        nullptr, nullptr, BuiltinOperator_CUSTOM,
+        "Flex",  1};
+    return &null_op;
+  }
+  return MutableOpResolver::FindOp(op, version);
+}
+
+BuiltinRefOpResolver::BuiltinRefOpResolver() {
+  AddBuiltin(BuiltinOperator_ABS, Register_ABS());
+  AddBuiltin(BuiltinOperator_RELU, Register_RELU());
+  AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
+  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
+  AddBuiltin(BuiltinOperator_TANH, Register_TANH_REF());
+  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC_REF());
+  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_REF());
+  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_REF());
+  AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_REF());
+  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONVOLUTION_REF());
+  AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D,
+             Register_DEPTHWISE_CONVOLUTION_REF(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
+  AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+             Register_BIDIRECTIONAL_SEQUENCE_RNN());
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+             Register_UNIDIRECTIONAL_SEQUENCE_RNN());
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+             Register_EMBEDDING_LOOKUP_SPARSE());
+  AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED_REF(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
+  AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
+  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
+  AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION_REF());
+  AddBuiltin(BuiltinOperator_ADD, Register_ADD_REF());
+  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND,
+             Register_SPACE_TO_BATCH_ND_REF());
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND,
+             Register_BATCH_TO_SPACE_ND_REF());
+  AddBuiltin(BuiltinOperator_MUL, Register_MUL_REF());
+  AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2NORM_REF());
+  AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+             Register_LOCAL_RESPONSE_NORM_REF());
+  AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_BIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD_REF());
+  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2_REF());
+  AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
+  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR_REF());
+  AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+             Register_RESIZE_NEAREST_NEIGHBOR_REF());
+  AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
+  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH_REF());
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
+  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE_REF());
+  AddBuiltin(BuiltinOperator_MEAN, Register_MEAN_REF());
+  AddBuiltin(BuiltinOperator_DIV, Register_DIV_REF());
+  AddBuiltin(BuiltinOperator_SUB, Register_SUB_REF());
+  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
+  AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE_REF());
+  AddBuiltin(BuiltinOperator_EXP, Register_EXP());
+  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_LOG, Register_LOG());
+  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX_REF());
+  AddBuiltin(BuiltinOperator_CAST, Register_CAST());
+  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN());
+  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
+  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
+  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
+  AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR_REF());
+  AddBuiltin(BuiltinOperator_NEG, Register_NEG());
+  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
+  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE_REF());
+  AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+  AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSECONV_REF());
+  AddBuiltin(BuiltinOperator_TILE, Register_TILE());
+  AddBuiltin(BuiltinOperator_SUM, Register_SUM());
+  AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD());
+  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
+  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
+  AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY());
+  AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
+  AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
+  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
+  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
+  AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
+  AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
+  AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
+  AddBuiltin(BuiltinOperator_POW, Register_POW());
+  AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
+  AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+  AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
+  AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
+  AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
+  AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT());
+  AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK());
+  AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV());
+  AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE());
+  AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
+  AddBuiltin(BuiltinOperator_FLOOR_MOD, Register_FLOOR_MOD());
+  AddBuiltin(BuiltinOperator_RANGE, Register_RANGE());
+  AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU());
+  AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
+  AddBuiltin(BuiltinOperator_FILL, Register_FILL());
+  AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
+
+  // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
+  // custom ops aren't always included by default.
+  AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
+  AddCustom("AudioSpectrogram",
+            tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
+  AddCustom("LayerNormLstm", tflite::ops::custom::Register_LAYER_NORM_LSTM());
+  AddCustom("Relu1", tflite::ops::custom::Register_RELU_1());
+  AddCustom("TFLite_Detection_PostProcess",
+            tflite::ops::custom::Register_DETECTION_POSTPROCESS());
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/register_ref.h b/tensorflow/lite/kernels/register_ref.h
new file mode 100644
index 0000000000000000000000000000000000000000..c66d4a25bc43a9e336f071ce6058ccd7ecce4d31
--- /dev/null
+++ b/tensorflow/lite/kernels/register_ref.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_
+#define TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+class BuiltinRefOpResolver : public MutableOpResolver {
+ public:
+  BuiltinRefOpResolver();
+
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+};
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_
diff --git a/tensorflow/lite/kernels/reshape_test.cc b/tensorflow/lite/kernels/reshape_test.cc
index 00bbbef57eccef67d043e85c02ebe80c3f9387ef..e9d12a9def7e1a33bc0b6db47d7b2f09036b84f2 100644
--- a/tensorflow/lite/kernels/reshape_test.cc
+++ b/tensorflow/lite/kernels/reshape_test.cc
@@ -123,6 +123,7 @@ class ReshapeOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_P(ReshapeOpTest, MismatchedDimensions) {
   if (GetParam() == kAsTensor) {
     ReshapeOpModel<float> m({1, 2, 4, 1}, {2}, {2, 1}, GetParam());
@@ -133,23 +134,17 @@ TEST_P(ReshapeOpTest, MismatchedDimensions) {
                  "num_input_elements != num_output_elements");
   }
 }
+#endif
 
 TEST_P(ReshapeOpTest, TooManyDimensions) {
-  if (GetParam() == kAsReshapeOption) {
+#ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
                                        {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam()),
                  "Found too many dimensions");
-  } else {
-    ReshapeOpModel<float> m({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
-                            {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam());
-    m.SetInput({3, 4});
-    m.Invoke();
-    EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 4}));
-    EXPECT_THAT(m.GetOutputShape(),
-                ElementsAreArray({1, 1, 1, 1, 1, 1, 1, 1, 2}));
-  }
+#endif
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
   if (GetParam() != kAsTensor) {
     EXPECT_DEATH(
@@ -160,6 +155,7 @@ TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
     EXPECT_DEATH(m.Invoke(), "stretch_dim != -1");
   }
 }
+#endif
 
 // Create the model with a 2x2 shape. Processing still works because the new
 // shape ends up being hardcoded as a flat vector.
@@ -202,12 +198,16 @@ TEST_P(ReshapeOpTest, ScalarOutput) {
 // and output are scalars.
 TEST_P(ReshapeOpTest, LegacyScalarOutput) {
   if (GetParam() == kAsConstantTensor) {
+#ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1}, {1}, {0}, GetParam()),
                  "num_input_elements != num_output_elements");
+#endif
   } else if (GetParam() == kAsTensor) {
+#ifdef GTEST_HAS_DEATH_TEST
     ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
     m.SetInput({3});
     EXPECT_DEATH(m.Invoke(), "num_input_elements != num_output_elements");
+#endif
   } else {
     ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
     m.SetInput({3});
@@ -226,9 +226,9 @@ TEST_P(ReshapeOpTest, Strings) {
               ElementsAreArray({"1", "2", "3", "4", "5", "6", "7", "8"}));
 }
 
-INSTANTIATE_TEST_CASE_P(VariedShapeSpec, ReshapeOpTest,
-                        ::testing::Values(kAsReshapeOption, kAsConstantTensor,
-                                          kAsTensor));
+INSTANTIATE_TEST_SUITE_P(VariedShapeSpec, ReshapeOpTest,
+                         ::testing::Values(kAsReshapeOption, kAsConstantTensor,
+                                           kAsTensor));
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/reverse.cc b/tensorflow/lite/kernels/reverse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..855aee8df1c0969bba9ec7d32bee78e04aeccbca
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse.cc
@@ -0,0 +1,127 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reverse {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxisTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis = GetInput(context, node, kAxisTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(axis), 1);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= NumElements(axis));
+
+  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32 &&
+      input->type != kTfLiteUInt8 && input->type != kTfLiteInt16 &&
+      input->type != kTfLiteInt64) {
+    context->ReportError(context, "Type '%s' is not supported by reverse.",
+                         TfLiteTypeGetName(input->type));
+    return kTfLiteError;
+  }
+
+  if (axis->type != kTfLiteInt32) {
+    context->ReportError(context, "Axis Type '%s' is not supported by reverse.",
+                         TfLiteTypeGetName(axis->type));
+    return kTfLiteError;
+  }
+
+  // TODO(renjieliu): support multi-axis case.
+  if (NumElements(axis) > 1) {
+    context->ReportError(context, "Current does not support more than 1 axis.");
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteIntArray* output_shape = TfLiteIntArrayCopy(input->dims);
+  TF_LITE_ENSURE_EQ(context, output->type, input->type);
+
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis_tensor = GetInput(context, node, kAxisTensor);
+  int axis = GetTensorData<int32_t>(axis_tensor)[0];
+
+  TF_LITE_ENSURE(context, axis >= 0 && axis < NumDimensions(input));
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (output->type) {
+    case kTfLiteFloat32: {
+      reference_ops::Reverse<float>(
+          axis, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(output), GetTensorData<float>(output));
+      break;
+    }
+    case kTfLiteUInt8: {
+      reference_ops::Reverse<uint8_t>(
+          axis, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      break;
+    }
+    case kTfLiteInt16: {
+      reference_ops::Reverse<int16_t>(
+          axis, GetTensorShape(input), GetTensorData<int16_t>(input),
+          GetTensorShape(output), GetTensorData<int16_t>(output));
+      break;
+    }
+    case kTfLiteInt32: {
+      reference_ops::Reverse<int32_t>(
+          axis, GetTensorShape(input), GetTensorData<int32_t>(input),
+          GetTensorShape(output), GetTensorData<int32_t>(output));
+      break;
+    }
+    case kTfLiteInt64: {
+      reference_ops::Reverse<int64_t>(
+          axis, GetTensorShape(input), GetTensorData<int64_t>(input),
+          GetTensorShape(output), GetTensorData<int64_t>(output));
+      break;
+    }
+    default: {
+      context->ReportError(context, "Type '%s' is not supported by reverse.",
+                           TfLiteTypeGetName(output->type));
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+}  // namespace reverse
+
+TfLiteRegistration* Register_REVERSE_V2() {
+  static TfLiteRegistration r = {nullptr, nullptr, reverse::Prepare,
+                                 reverse::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc0c24b64c197d5c9a60ff74bdd53c5ae0352b9
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse_test.cc
@@ -0,0 +1,199 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ReverseOpModel : public SingleOpModel {
+ public:
+  ReverseOpModel(const TensorData& input, const TensorData& axis) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+
+    output_ = AddOutput({input.type, {}});
+
+    SetBuiltinOp(BuiltinOperator_REVERSE_V2, BuiltinOptions_ReverseV2Options,
+                 CreateReverseV2Options(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+// float32 tests.
+TEST(ReverseOpTest, FloatOneDimension) {
+  ReverseOpModel<float> model({TensorType_FLOAT32, {4}},
+                              {TensorType_INT32, {1}});
+  model.PopulateTensor<float>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, FloatMultiDimensions) {
+  ReverseOpModel<float> model({TensorType_FLOAT32, {4, 3, 2}},
+                              {TensorType_INT32, {1}});
+  model.PopulateTensor<float>(model.input(),
+                              {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                               13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int32 tests
+TEST(ReverseOpTest, Int32OneDimension) {
+  ReverseOpModel<int32_t> model({TensorType_INT32, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int32_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int32MultiDimensions) {
+  ReverseOpModel<int32_t> model({TensorType_INT32, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int32_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int64 tests
+TEST(ReverseOpTest, Int64OneDimension) {
+  ReverseOpModel<int64_t> model({TensorType_INT64, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int64_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int64MultiDimensions) {
+  ReverseOpModel<int64_t> model({TensorType_INT64, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int64_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// uint8 tests
+TEST(ReverseOpTest, Uint8OneDimension) {
+  ReverseOpModel<uint8_t> model({TensorType_UINT8, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<uint8_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Uint8MultiDimensions) {
+  ReverseOpModel<uint8_t> model({TensorType_UINT8, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<uint8_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int16 tests
+TEST(ReverseOpTest, Int16OneDimension) {
+  ReverseOpModel<int16_t> model({TensorType_INT16, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int16_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int16MultiDimensions) {
+  ReverseOpModel<int16_t> model({TensorType_INT16, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int16_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/slice.cc b/tensorflow/lite/kernels/slice.cc
index 116c81e4d57a9a27dfb0581fe0096f461aa6ab81..5fca7a3ea71aa41c6e466b7814921e2e1ac6293d 100644
--- a/tensorflow/lite/kernels/slice.cc
+++ b/tensorflow/lite/kernels/slice.cc
@@ -28,6 +28,11 @@ namespace ops {
 namespace builtin {
 namespace slice {
 
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
 constexpr int kInputTensor = 0;
 constexpr int kBeginTensor = 1;
 constexpr int kSizeTensor = 2;
@@ -126,6 +131,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return ResizeOutputShape(context, input, begin, size, output);
 }
 
+template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* begin = GetInput(context, node, kBeginTensor);
@@ -165,38 +171,44 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // The dimensions in the kernel used to be in reverse-order, and TFLite
   // arranged the begins and sizes vectors accordingly. This macro incorporates
   // the needed reversing.
-#define TF_LITE_SLICE(data_type)                                           \
-  {                                                                        \
-    TF_LITE_ENSURE_EQ(context, begins.size(), 4);                          \
-    TF_LITE_ENSURE_EQ(context, sizes.size(), 4);                           \
-    tflite::SliceParams op_params;                                         \
-    op_params.begin_count = 4;                                             \
-    op_params.size_count = 4;                                              \
-    for (int i = 0; i < 4; ++i) {                                          \
-      op_params.begin[i] = begins[3 - i];                                  \
-      op_params.size[i] = sizes[3 - i];                                    \
-    }                                                                      \
-                                                                           \
-    optimized_ops::Slice<data_type>(                                       \
-        op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
-        GetTensorShape(output), GetTensorData<data_type>(output));         \
+#define TF_LITE_SLICE(data_type, kernel_type)                                \
+  {                                                                          \
+    TF_LITE_ENSURE_EQ(context, begins.size(), 4);                            \
+    TF_LITE_ENSURE_EQ(context, sizes.size(), 4);                             \
+    tflite::SliceParams op_params;                                           \
+    op_params.begin_count = 4;                                               \
+    op_params.size_count = 4;                                                \
+    for (int i = 0; i < 4; ++i) {                                            \
+      op_params.begin[i] = begins[3 - i];                                    \
+      op_params.size[i] = sizes[3 - i];                                      \
+    }                                                                        \
+                                                                             \
+    if (kernel_type == kGenericOptimized) {                                  \
+      optimized_ops::Slice<data_type>(                                       \
+          op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
+          GetTensorShape(output), GetTensorData<data_type>(output));         \
+    } else {                                                                 \
+      reference_ops::Slice<data_type>(                                       \
+          op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
+          GetTensorShape(output), GetTensorData<data_type>(output));         \
+    }                                                                        \
   }
 
   switch (input->type) {
     case kTfLiteFloat32:
-      TF_LITE_SLICE(float);
+      TF_LITE_SLICE(float, kernel_type);
       break;
     case kTfLiteInt32:
-      TF_LITE_SLICE(int32_t);
+      TF_LITE_SLICE(int32_t, kernel_type);
       break;
     case kTfLiteInt64:
-      TF_LITE_SLICE(int64_t);
+      TF_LITE_SLICE(int64_t, kernel_type);
       break;
     case kTfLiteUInt8:
-      TF_LITE_SLICE(uint8_t);
+      TF_LITE_SLICE(uint8_t, kernel_type);
       break;
     case kTfLiteBool:
-      TF_LITE_SLICE(bool);
+      TF_LITE_SLICE(bool, kernel_type);
       break;
     default:
       context->ReportError(
@@ -209,8 +221,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace slice
 
+TfLiteRegistration* Register_SLICE_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare,
+                                 slice::Eval<slice::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_SLICE() {
-  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare, slice::Eval};
+  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare,
+                                 slice::Eval<slice::kGenericOptimized>};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
index 4d55ba56b71c5e0c44f0145981db56cbef6ec99a..c5d6e9a53062d97801b518f15305e2052f861e7c 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
@@ -106,12 +106,14 @@ class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
   EXPECT_DEATH(
       SpaceToBatchNDOpConstModel({TensorType_FLOAT32, {1, 3, 3, 1}}, {2, 2},
                                  {0, 0, 0, 0}, {TensorType_FLOAT32}),
       "Cannot allocate tensors");
 }
+#endif
 
 TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, {2, 2},
@@ -220,6 +222,7 @@ class QuantizedSpaceToBatchNDOpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -228,6 +231,7 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {}, 1.0, 2.0}),
                ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
diff --git a/tensorflow/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc
index 5744669b6d62af61a0b20e7723b78c72f6db952d..3fa8d86348ef899b9bd42c19f5b1510b4c4e33d3 100644
--- a/tensorflow/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/lite/kernels/space_to_depth_test.cc
@@ -50,10 +50,12 @@ class SpaceToDepthOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(SpaceToDepthOpModel, BadBlockSize) {
   EXPECT_DEATH(SpaceToDepthOpModel({TensorType_FLOAT32, {1, 2, 2, 1}}, 3),
                "Cannot allocate tensors");
 }
+#endif
 
 TEST(SpaceToDepthOpModel, Float32) {
   SpaceToDepthOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, 2);
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected.cc b/tensorflow/lite/kernels/sparse_output_fully_connected.cc
deleted file mode 100644
index 73d850f0e2d094e9cc620f4f4733354d603b2a77..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/sparse_output_fully_connected.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// SparseOutputFullyConnected is a fully connected layer that uses a single
-// row in the weights and bias via a lookup.
-#include "tensorflow/lite/context.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
-#include "tensorflow/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-namespace sparse_output_fully_connected {
-
-// Input tensors of size {n_batch, n_input}
-constexpr int kInputTensor = 0;
-// Auxiliary input tensor of size { 1 }
-constexpr int kInputLookupTensor = 1;
-
-// Weights tensor of size { n_embeddings , n_input }
-constexpr int kWeightsTensor = 2;
-// Bias tensor of size { n_embeddings }
-constexpr int kBiasTensor = 3;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-// Temporary tensors.
-enum TemporaryTensor {
-  kInputQuantized = 0,
-  kScalingFactors = 1,
-  kNumTemporaryTensors = 2
-};
-
-// Struct to hold op data.
-struct OpData {
-  int scratch_tensor_index;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-  context->AddTensors(context, /*tensors_to_add=*/kNumTemporaryTensors,
-                      &data->scratch_tensor_index);
-  return data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
-  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
-  // Only support single lookup.
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(lookup, 0), 1);
-
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 2);
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(weights, 1), n_input);
-
-  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
-  TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(weights, 0));
-
-  const bool is_hybrid_op =
-      (weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
-
-  // Resize output.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(1);
-  output_size_array->data[0] = 1;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size_array));
-
-  if (is_hybrid_op) {
-    TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
-
-    // Allocate temporary tensors to store quantized values of input.
-    node->temporaries->data[kInputQuantized] = op_data->scratch_tensor_index;
-    TfLiteTensor* input_quantized =
-        GetTemporary(context, node, /*index=*/kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-
-    // Tell interpreter to allocate temporary tensors to store scaling factors.
-    node->temporaries->data[kScalingFactors] =
-        op_data->scratch_tensor_index + kScalingFactors;
-    TfLiteTensor* scaling_factors =
-        GetTemporary(context, node, /*index=*/kScalingFactors);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    int scaling_dims[1] = {n_batch};
-    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
-      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-      scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* lookup,
-                       const TfLiteTensor* weights, const TfLiteTensor* bias,
-                       TfLiteTensor* output) {
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const float* input_ptr_batch = input->data.f;
-
-  // Initialize pointer to right row according to lookup value.
-  int32 lookup_index = lookup->data.i32[0];
-  const float* weights_ptr = weights->data.f + lookup_index * n_input;
-
-  // Initialize output to bias.
-  if (bias) {
-    float* bias_ptr = bias->data.f + lookup_index;
-    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      weights_ptr, /*m_rows=*/1, n_input, input_ptr_batch, n_batch,
-      output->data.f, /*result_stride=*/1);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(const TfLiteTensor* input, const TfLiteTensor* lookup,
-                        const TfLiteTensor* weights, const TfLiteTensor* bias,
-                        TfLiteTensor* scaling_factors,
-                        TfLiteTensor* input_quantized, TfLiteTensor* output) {
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const float* input_ptr_batch = input->data.f;
-  // Initialize the pointer to storage for quantized values and
-  // scaling factors.
-  int8_t* quantized_input_ptr_batch =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-
-  // Initialize pointer to right row according to lookup value.
-  int32 lookup_index = lookup->data.i32[0];
-  int8_t* weights_ptr =
-      reinterpret_cast<int8_t*>(weights->data.uint8) + lookup_index * n_input;
-
-  // Initialize output to bias.
-  if (bias) {
-    float* bias_ptr = bias->data.f + lookup_index;
-    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
-  }
-
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-    // Quantize input from float to int8.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_input;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
-          &unused_min, &unused_max, &scaling_factors_ptr[b]);
-      scaling_factors_ptr[b] *= weights->params.scale;
-    }
-
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        weights_ptr, /*m_rows=*/1, n_input, quantized_input_ptr_batch,
-        scaling_factors_ptr, n_batch, output->data.f, /*result_stride=*/1);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, lookup, weights, bias, output);
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized =
-          GetTemporary(context, node, /*index=*/kInputQuantized);
-      TfLiteTensor* scaling_factors =
-          GetTemporary(context, node, /*index=*/kScalingFactors);
-      return EvalHybrid(input, lookup, weights, bias, scaling_factors,
-                        input_quantized, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace sparse_output_fully_connected
-
-TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED() {
-  static TfLiteRegistration r = {sparse_output_fully_connected::Init,
-                                 sparse_output_fully_connected::Free,
-                                 sparse_output_fully_connected::Prepare,
-                                 sparse_output_fully_connected::Eval};
-  return &r;
-}
-
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
deleted file mode 100644
index c25a32bde001e632afff2a34ad168467c092bcf5..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite sparse output fully connected op.
-#include <iomanip>
-#include <random>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-
-namespace tflite {
-
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BaseSparseOutputFullyConnectedOpModel : public SingleOpModel {
- public:
-  BaseSparseOutputFullyConnectedOpModel(const TensorData& input,
-                                        const TensorData& weights,
-                                        const TensorData& output = {
-                                            TensorType_FLOAT32}) {
-    input_ = AddInput(input);
-    lookup_ = AddInput({TensorType_INT32, {1}});
-    weights_ = AddInput(weights);
-    int bias_size = GetShape(weights_)[0];
-    bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
-    output_ = AddOutput(output);
-
-    // Create empty (required) options map.
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {});
-    fbb.Finish();
-
-    SetCustomOp("SPARSE_OUTPUT_FULLY_CONNECTED", fbb.GetBuffer(),
-                Register_SPARSE_OUTPUT_FULLY_CONNECTED);
-    BuildInterpreter({GetShape(input_), GetShape(lookup_), GetShape(weights_),
-                      GetShape(bias_)});
-  }
-
-  void SetInput(const std::vector<float>& data) {
-    PopulateTensor(input_, data);
-  }
-
-  void SetLookup(const std::vector<int32>& f) { PopulateTensor(lookup_, f); }
-
-  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
- protected:
-  int input_;
-  int lookup_;
-  int weights_;
-  int bias_;
-  int output_;
-};
-
-class FloatSparseOutputFullyConnectedOpModel
-    : public BaseSparseOutputFullyConnectedOpModel {
- public:
-  using BaseSparseOutputFullyConnectedOpModel::
-      BaseSparseOutputFullyConnectedOpModel;
-
-  void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); }
-};
-
-class HybridSparseOutputFullyConnectedOpModel
-    : public BaseSparseOutputFullyConnectedOpModel {
- public:
-  using BaseSparseOutputFullyConnectedOpModel::
-      BaseSparseOutputFullyConnectedOpModel;
-
-  void SetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
-  }
-};
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestFloat) {
-  FloatSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                           {TensorType_FLOAT32, {3, 5}},
-                                           {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({28}));
-}
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestHybrid) {
-  HybridSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                            {TensorType_UINT8, {3, 5}},
-                                            {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  // We get 28.0552 instead of 28.
-  //
-  // Input -> -42, 0, 42, 85, 127 with scale factor of 127/3.
-  // Looked up weights ->  25, 51, 76, 102, 127 with scale factor of 127/5.
-  //
-  // (-42 * 25 + 0 * 51 + 42 * 76 + 85 * 102 + 127 * 127) * (3*5/127^2) + 3.0
-  // gives us the expected result.
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({28}, 0.0553)));
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc
index 59b53a6287dbbc863a61875be82090c1b9c6d442..3661cf9f98c5d0133090ae926f8d76e54f428eba 100644
--- a/tensorflow/lite/kernels/squared_difference.cc
+++ b/tensorflow/lite/kernels/squared_difference.cc
@@ -105,10 +105,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   } else if (output->type == kTfLiteInt32) {
     EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
   } else {
-    context->ReportError(context,
-                         "SquaredDifference only supports FLOAT32, INT32 and "
-                         "quantized UINT8 now, got %d.",
-                         output->type);
+    context->ReportError(
+        context,
+        "SquaredDifference only supports FLOAT32 and INT32 now, got %d.",
+        output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 122e01b99ecbed1255ea4b2d29e82b57f04be80c..34875bf0497a000da02f3d0212b042399046a492 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -72,6 +72,7 @@ class StridedSliceOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
       StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
@@ -84,6 +85,7 @@ TEST(StridedSliceOpTest, UnssupportedArgs) {
   EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
+#endif
 
 TEST(StridedSliceOpTest, In1D) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
diff --git a/tensorflow/lite/kernels/svdf.cc b/tensorflow/lite/kernels/svdf.cc
index f07937140e9ac4abfbae47a1679ddbfba4d30938..d8fc7ce1cea6f8bbf7b4f08fa80e635b0735d08c 100644
--- a/tensorflow/lite/kernels/svdf.cc
+++ b/tensorflow/lite/kernels/svdf.cc
@@ -176,8 +176,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                     context->ResizeTensor(context, output, output_size_array));
 
   // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op =
-      (input->type == kTfLiteFloat32 && weights_feature->type == kTfLiteUInt8);
+  const bool is_hybrid_op = (input->type == kTfLiteFloat32 &&
+                             (weights_feature->type == kTfLiteUInt8 ||
+                              weights_feature->type == kTfLiteInt8));
 
   // Resize scratch.
   TfLiteIntArrayFree(node->temporaries);
@@ -203,7 +204,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // of input tensors.
     node->temporaries->data[1] = scratch_tensor_index + 1;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = weights_feature->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -297,16 +298,24 @@ TfLiteStatus EvalHybrid(
   // Initialize the pointer to input.
   const float* input_ptr_batch = input->data.f;
 
-  // Initialize the pointer to storage for quantized values and
-  // scaling factors.
-  int8_t* quantized_input_ptr_batch =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  // Initialize the pointer to storage for quantized values and the weights
+  // feature.
+  int8_t* quantized_input_ptr_batch;
+  const int8_t* weights_feature_ptr;
+  if (weights_feature->type == kTfLiteUInt8) {
+    quantized_input_ptr_batch =
+        reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+    weights_feature_ptr =
+        reinterpret_cast<int8_t*>(weights_feature->data.uint8);
+  } else {
+    quantized_input_ptr_batch = input_quantized->data.int8;
+    weights_feature_ptr = weights_feature->data.int8;
+  }
 
+  // Initialize the pointer to storage for scaling factors.
   float* scaling_factors_ptr = scaling_factors->data.f;
 
-  // Other initializations.
-  const int8_t* weights_feature_ptr =
-      reinterpret_cast<int8_t*>(weights_feature->data.uint8);
+  // Initialize the weights scale.
   const float weights_feature_scale = weights_feature->params.scale;
 
   // Clear the activation (state left most column).
@@ -374,7 +383,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                        bias, params, scratch, activation_state, output);
       break;
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
       TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
       TfLiteTensor* float_weights_time =
@@ -388,8 +398,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       // TODO(alanchiao): refactor logic out into dequantize function.
       if (!op_data->float_weights_time_initialized) {
         const float dequantization_scale = weights_time->params.scale;
-        const int8_t* weights_time_ptr =
-            reinterpret_cast<int8_t*>(weights_time->data.uint8);
+        const int8_t* weights_time_ptr;
+        if (weights_feature->type == kTfLiteUInt8) {
+          weights_time_ptr =
+              reinterpret_cast<int8_t*>(weights_time->data.uint8);
+        } else {
+          weights_time_ptr = weights_time->data.int8;
+        }
         for (int i = 0; i < NumElements(float_weights_time); ++i) {
           float_weights_time->data.f[i] =
               weights_time_ptr[i] * dequantization_scale;
diff --git a/tensorflow/lite/kernels/svdf_test.cc b/tensorflow/lite/kernels/svdf_test.cc
index 8accaa465ca8a51f2b6e00648a6195f31039d3f7..c420260bf51bd45944a7b77a81e20e56999c8fbb 100644
--- a/tensorflow/lite/kernels/svdf_test.cc
+++ b/tensorflow/lite/kernels/svdf_test.cc
@@ -203,17 +203,30 @@ class SVDFOpModel : public BaseSVDFOpModel {
 class HybridSVDFOpModel : public BaseSVDFOpModel {
  public:
   HybridSVDFOpModel(int batches, int units, int input_size, int memory_size,
-                    int rank)
+                    int rank, TensorType tensor_type)
       : BaseSVDFOpModel(batches, units, input_size, memory_size, rank,
-                        TensorType_UINT8, TensorType_UINT8) {}
+                        tensor_type, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
 
   void SetWeightsFeature(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_feature_, f);
+    SetWeights(weights_feature_, f);
   }
 
   void SetWeightsTime(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_time_, f);
+    SetWeights(weights_time_, f);
   }
+
+ protected:
+  TensorType tensor_type_;
 };
 
 class SVDFOpTest : public ::testing::Test {
@@ -312,9 +325,74 @@ TEST_F(SVDFOpTest, BlackBoxTestRank2) {
                 &svdf);
 }
 
-TEST_F(SVDFOpTest, BlackBoxTestHybridRank1) {
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank1Uint8) {
+  HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                         /*memory_size=*/10, /*rank=*/1, TensorType_UINT8);
+  svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
+                          0.22197971, 0.12416199, 0.27901134, 0.27557442,
+                          0.3905206, -0.36137494, -0.06634006, -0.10640851});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
+
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input),
+                &svdf,
+                /*tolerance=*/0.002945);
+}
+
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank2Uint8) {
+  HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                         /*memory_size=*/10, /*rank=*/2, TensorType_UINT8);
+  svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
+                          0.12416199,  0.15785322,  0.27901134,  0.3905206,
+                          0.21931258,  -0.36137494, -0.10640851, 0.31053296,
+                          -0.36118156, -0.0976817,  -0.36916667, 0.22197971,
+                          0.15294972,  0.38031587,  0.27557442,  0.39635518,
+                          -0.21580373, -0.06634006, -0.02702999, 0.27072677});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657,
+
+       -0.14884081, 0.19931212,  -0.36002168, 0.34663299,  -0.11405486,
+       0.12672701,  0.39463779,  -0.07886535, -0.06384811, 0.08249187,
+
+       -0.26816407, -0.19905911, 0.29211238,  0.31264046,  -0.28664589,
+       0.05698794,  0.11613581,  0.14078894,  0.02187902,  -0.21781836,
+
+       -0.15567942, 0.08693647,  -0.38256618, 0.36580828,  -0.22922277,
+       -0.0226903,  0.12878349,  -0.28122205, -0.10850525, -0.11955214,
+
+       0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
+       0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
+
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input),
+                &svdf,
+                /*tolerance=*/0.00625109);
+}
+
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank1Int8) {
   HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
-                         /*memory_size=*/10, /*rank=*/1);
+                         /*memory_size=*/10, /*rank=*/1, TensorType_INT8);
   svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
                           0.22197971, 0.12416199, 0.27901134, 0.27557442,
                           0.3905206, -0.36137494, -0.06634006, -0.10640851});
@@ -337,9 +415,9 @@ TEST_F(SVDFOpTest, BlackBoxTestHybridRank1) {
                 /*tolerance=*/0.002945);
 }
 
-TEST_F(SVDFOpTest, BlackBoxTestHybridRank2) {
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank2Int8) {
   HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
-                         /*memory_size=*/10, /*rank=*/2);
+                         /*memory_size=*/10, /*rank=*/2, TensorType_INT8);
   svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
                           0.12416199,  0.15785322,  0.27901134,  0.3905206,
                           0.21931258,  -0.36137494, -0.10640851, 0.31053296,
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 19d7e37409cba2f4b018082d13a2d3e130a3c5c4..295204f62e56488b06f8d5ed23a1ae62a4d1b106 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -47,7 +47,12 @@ std::vector<Matcher<std::complex<float>>> ArrayComplex64Near(
 }
 
 int SingleOpModel::AddInput(const TensorData& t, bool is_variable) {
-  int id = AddTensor<float>(t, {}, is_variable);
+  int id = 0;
+  if (t.per_channel_quantization) {
+    id = AddTensorPerChannelQuant(t);
+  } else {
+    id = AddTensor<float>(t, {}, is_variable);
+  }
   inputs_.push_back(id);
   return id;
 }
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 4a442f9fa7554fbc5c149e1dd20f82c162d392d4..c5435b3546215103a4943f434873f230902cbc6a 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -21,13 +21,14 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/util.h"
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
 
 namespace tflite {
 
@@ -82,7 +83,7 @@ inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
 // A helper struct to construct test tensors. This is particularly useful for
 // quantized tensor which must have their scale and zero_point defined before
 // the actual data is known. This mimics what happens in practice: quantization
-// parameters are calculated during training.
+// parameters are calculated during training or post training..
 struct TensorData {
   TensorType type;
   std::vector<int> shape;
@@ -90,6 +91,10 @@ struct TensorData {
   float max;
   float scale;
   int32_t zero_point;
+  bool per_channel_quantization;
+  std::vector<float> per_channel_quantization_scales;
+  std::vector<int64_t> per_channel_quantization_offsets;
+  int32_t channel_index;
 };
 
 class SingleOpResolver : public OpResolver {
@@ -172,6 +177,46 @@ class SingleOpModel {
     PopulateTensor(index, /*offset=*/0, q.data(), q.data() + q.size());
   }
 
+  // Quantize and populate data for filter with per channel quantization.
+  void PerChannelSymmetricQuantizeAndPopulate(
+      int index, const std::vector<float>& input_data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto* params =
+        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
+    const int channel_index = params->quantized_dimension;
+
+    std::vector<int32_t> shape(t->dims->size);
+    for (int i = 0; i < shape.size(); ++i) {
+      shape[i] = t->dims->data[i];
+    }
+    const int32_t num_inputs = input_data.size();
+    const int32_t num_channel = shape[channel_index];
+    std::vector<int8_t> quantized_output(num_inputs);
+    std::vector<float> scales_inv(num_channel);
+    for (int i = 0; i < num_channel; ++i) {
+      scales_inv[i] = 1.0f / params->scale->data[i];
+    }
+    optimize::utils::SymmetricPerChannelQuantizeValues(
+        input_data.data(), scales_inv, shape, channel_index, &quantized_output);
+
+    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
+                   quantized_output.data() + quantized_output.size());
+  }
+
+  // Quantize and populate data for bias with per channel quantization.
+  void PerChannelQuantizeBias(int index, const std::vector<float>& input_data) {
+    const int32_t num_inputs = input_data.size();
+    std::vector<int32_t> quantized_output(num_inputs);
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto* params =
+        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
+    for (int i = 0; i < num_inputs; ++i) {
+      quantized_output[i] = input_data[i] * params->scale->data[i];
+    }
+    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
+                   quantized_output.data() + quantized_output.size());
+  }
+
   const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
 
   float GetScale(int id) { return tensor_data_.at(id).scale; }
@@ -292,6 +337,24 @@ class SingleOpModel {
     return {scale, zero_point};
   }
 
+  int AddTensorPerChannelQuant(TensorData t) {
+    const int id = tensors_.size();
+    flatbuffers::Offset<QuantizationParameters> q_params = 0;
+    q_params = CreateQuantizationParameters(
+        builder_, /*min=*/0, /*max=*/0,
+        /*scale=*/
+        builder_.CreateVector<float>(t.per_channel_quantization_scales),
+        /*zero point=*/
+        builder_.CreateVector<int64_t>(t.per_channel_quantization_offsets),
+        QuantizationDetails_NONE, 0, t.channel_index);
+    tensors_.push_back(
+        CreateTensor(builder_, builder_.CreateVector<int>(t.shape), t.type,
+                     /*buffer=*/0,
+                     /*name=*/0, q_params, /*is_variable=*/false));
+    tensor_data_[id] = t;
+    return id;
+  }
+
   template <typename T>
   int AddTensor(TensorData t, std::initializer_list<T> data,
                 bool is_variable = false) {
diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc
index 59eee51068c0efcf26d66d933e13ee2f931463bc..343f2ca59bad5df9c55b129bbf317b0bf25d26f0 100644
--- a/tensorflow/lite/kernels/transpose_conv.cc
+++ b/tensorflow/lite/kernels/transpose_conv.cc
@@ -119,8 +119,8 @@ TfLiteStatus ResizeIm2ColTensor(TfLiteContext* context,
   im2col_shape_array->data[1] = output_shape->data.i32[1];
   im2col_shape_array->data[2] = output_shape->data.i32[2];
   const int input_depth = SizeOfDimension(input, 3);
-  const int filter_width = SizeOfDimension(weights, 1);
-  const int filter_height = SizeOfDimension(weights, 2);
+  const int filter_width = SizeOfDimension(weights, 2);
+  const int filter_height = SizeOfDimension(weights, 1);
   im2col_shape_array->data[3] = input_depth * filter_height * filter_width;
 
   im2col->type = input->type;
@@ -197,8 +197,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Get height and width of the output image.
   const int width = SizeOfDimension(output, 2);
   const int height = SizeOfDimension(output, 1);
-  const int filter_width = SizeOfDimension(weights, 1);
-  const int filter_height = SizeOfDimension(weights, 2);
+  const int filter_width = SizeOfDimension(weights, 2);
+  const int filter_height = SizeOfDimension(weights, 1);
 
   const int stride_width = params->stride_width;
   const int stride_height = params->stride_height;
diff --git a/tensorflow/lite/kernels/transpose_conv_test.cc b/tensorflow/lite/kernels/transpose_conv_test.cc
index 0520d84a30b50212bb3d86288236b49da523f4c2..44d1336b99fe03535451c7dbacfe77be58fd6fad 100644
--- a/tensorflow/lite/kernels/transpose_conv_test.cc
+++ b/tensorflow/lite/kernels/transpose_conv_test.cc
@@ -252,7 +252,7 @@ TEST_P(TransposeConvOpTest, AccuracyTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 4, 1}));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     TransposeConvOpTest, TransposeConvOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
index 3ebaf3ca27ffd285ef86a81b2e63409fde565ef1..93df2c81db8c17de7a36d155c7d26b826c859c99 100644
--- a/tensorflow/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -184,6 +184,7 @@ class TransposeOpDynamicModel : public TransposeOpModel {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(TransposeTest, TestUnequalPermSize) {
   EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4");
 }
@@ -194,6 +195,7 @@ TEST(TransposeTest, TestPermOutOfBounds) {
   EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}),
                "Transpose op permutations array is out of bounds.");
 }
+#endif
 
 TEST(TransposeTest, Test1DInputConstTensor) {
   TransposeOpConstModel m({3}, {1}, {0});
@@ -252,10 +254,12 @@ TEST(TransposeTest, Test3DInputDynamicTensor) {
                                 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(TransposeTest, Test5DInputTensor) {
   EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}),
                "Transpose op only supports 1D-4D input arrays.");
 }
+#endif
 
 TEST(TransposeTest, SimpleTestNoReorderConstTensor) {
   TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3});
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
index 7d41491ba33ff0c6ef807c06da57b4d70be8895f..e2fc73ba29b5c96ad83536fb8752c11d70191d4d 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
@@ -306,7 +306,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // The weights are of consistent type, so it suffices to check one.
   // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+  const bool is_hybrid_op = ((input_to_output_weights->type == kTfLiteUInt8 ||
+                              input_to_output_weights->type == kTfLiteInt8) &&
                              input->type == kTfLiteFloat32);
 
   TfLiteIntArrayFree(node->temporaries);
@@ -344,7 +345,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kInputQuantized;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_to_output_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -355,7 +356,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kOutputStateQuantized;
     TfLiteTensor* activation_state_quantized =
         GetTemporary(context, node, kOutputStateQuantized);
-    activation_state_quantized->type = kTfLiteUInt8;
+    activation_state_quantized->type = input_to_output_weights->type;
     activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
                              activation_state->dims)) {
@@ -369,7 +370,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kCellStateQuantized;
     TfLiteTensor* cell_state_quantized =
         GetTemporary(context, node, kCellStateQuantized);
-    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->type = input_to_output_weights->type;
     cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
       TfLiteIntArray* cell_state_quantized_size =
@@ -516,7 +517,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
           output);
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
       TfLiteTensor* activation_state_quantized =
           GetTemporary(context, node, /*index=*/2);
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
index ae7dd6b2bee1da06d9dc48f259585f541c72842f..bc35d90773b522d22e4373c60ca83121ff7fd09e 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -243,59 +243,73 @@ class HybridUnidirectionalLSTMOpModel : public UnidirectionalLSTMOpModel {
       int n_batch, int n_input, int n_cell, int n_output, int sequence_length,
       bool time_major, bool use_cifg, bool use_peephole,
       bool use_projection_weights, bool use_projection_bias, float cell_clip,
-      float proj_clip, const std::vector<std::vector<int>>& input_shapes)
+      float proj_clip, const std::vector<std::vector<int>>& input_shapes,
+      TensorType tensor_type)
       : UnidirectionalLSTMOpModel(
             n_batch, n_input, n_cell, n_output, sequence_length, time_major,
             use_cifg, use_peephole, use_projection_weights, use_projection_bias,
-            cell_clip, proj_clip, input_shapes, TensorType_UINT8) {}
+            cell_clip, proj_clip, input_shapes, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
 
   void SetInputToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+    SetWeights(input_to_input_weights_, f);
   }
 
   void SetInputToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+    SetWeights(input_to_forget_weights_, f);
   }
 
   void SetInputToCellWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+    SetWeights(input_to_cell_weights_, f);
   }
 
   void SetInputToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+    SetWeights(input_to_output_weights_, f);
   }
 
   void SetRecurrentToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+    SetWeights(recurrent_to_input_weights_, f);
   }
 
   void SetRecurrentToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+    SetWeights(recurrent_to_forget_weights_, f);
   }
 
   void SetRecurrentToCellWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+    SetWeights(recurrent_to_cell_weights_, f);
   }
 
   void SetRecurrentToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+    SetWeights(recurrent_to_output_weights_, f);
   }
 
   void SetCellToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+    SetWeights(cell_to_input_weights_, f);
   }
 
   void SetCellToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+    SetWeights(cell_to_forget_weights_, f);
   }
 
   void SetCellToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+    SetWeights(cell_to_output_weights_, f);
   }
 
   void SetProjectionWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
+    SetWeights(projection_weights_, f);
   }
+
+ protected:
+  TensorType tensor_type_;
 };
 
 class BaseLstmTest : public ::testing::Test {
@@ -561,7 +575,8 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
                 /*time_major=*/false);
 }
 
-TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -601,7 +616,71 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm,
+                /*tolerance=*/0.0157651);
+}
+
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  HybridUnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/false,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -730,7 +809,8 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -771,7 +851,70 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573);
+}
+
+TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  HybridUnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToCellWeights(input_to_cell_weights_);
   lstm.SetInputToForgetWeights(input_to_forget_weights_);
@@ -1456,7 +1599,7 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTestUint8) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -1496,7 +1639,75 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
+}
+
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+  const int sequence_length = 4;
+
+  HybridUnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
index 4c0fe00272a04ef3edc0787839f235f12aa546cb..3854695d0bfde5d6c3a14b0c3aa449f5ca2eb4fa 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
@@ -96,15 +96,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, output, output_size_array));
 
+  const bool is_hybrid =
+      input->type == kTfLiteFloat32 && (input_weights->type == kTfLiteUInt8 ||
+                                        input_weights->type == kTfLiteInt8);
+
   // Allocate temporary tensors to store quantized values of input and
   // hidden_state tensors.
-  if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
+  if (is_hybrid) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
     TfLiteIntArrayFree(node->temporaries);
     node->temporaries = TfLiteIntArrayCreate(3);
     node->temporaries->data[0] = *scratch_tensor_index;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -114,7 +118,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[1] = *scratch_tensor_index + 1;
     TfLiteTensor* hidden_state_quantized =
         GetTemporary(context, node, /*index=*/1);
-    hidden_state_quantized->type = kTfLiteUInt8;
+    hidden_state_quantized->type = input_weights->type;
     hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(hidden_state_quantized->dims,
                              hidden_state->dims)) {
@@ -213,19 +217,31 @@ TfLiteStatus EvalHybrid(
 
   // Initialize the pointer bias.
   const float* bias_ptr = bias->data.f;
-  // Initialize input_weights and recurrent_weights.
-  const int8_t* input_weights_ptr =
-      reinterpret_cast<const int8_t*>(input_weights->data.uint8);
-  const int8_t* recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+
+  // Initialize input_weights, recurrent_weights, and temporary storage for
+  // quantized values.
+  const int8_t* input_weights_ptr;
+  const int8_t* recurrent_weights_ptr;
+  int8_t* quantized_input_ptr;
+  int8_t* quantized_hidden_state_ptr;
+  if (input_weights->type == kTfLiteUInt8) {
+    input_weights_ptr =
+        reinterpret_cast<const int8_t*>(input_weights->data.uint8);
+    recurrent_weights_ptr =
+        reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+    quantized_input_ptr = reinterpret_cast<int8_t*>(input_scratch->data.uint8);
+    quantized_hidden_state_ptr =
+        reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+  } else {
+    input_weights_ptr = input_weights->data.int8;
+    recurrent_weights_ptr = recurrent_weights->data.int8;
+    quantized_input_ptr = input_scratch->data.int8;
+    quantized_hidden_state_ptr = hidden_state_scratch->data.int8;
+  }
+
   // Get the scale of the quantized weights.
   float input_weights_scale = input_weights->params.scale;
   float recurrent_weights_scale = recurrent_weights->params.scale;
-  // Initialize temporary storage for quantized values.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_scratch->data.uint8);
-  int8_t* quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
   if (time_major) {
@@ -286,7 +302,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteFloat32:
       return EvalFloat(input, input_weights, recurrent_weights, bias, params,
                        hidden_state, output);
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       // TODO(mirkov): implement eval with quantized inputs as well.
       TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
       TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
index a2f82ac67b1b22b226e7046af7158ed6095dcc8e..de1f7818bd0f2a1420b6f277c08670f7e70fef27 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -248,17 +248,29 @@ class UnidirectionalRNNOpModel : public SingleOpModel {
 class HybridUnidirectionalRNNOpModel : public UnidirectionalRNNOpModel {
  public:
   HybridUnidirectionalRNNOpModel(int batches, int sequence_len, int units,
-                                 int size, bool time_major)
+                                 int size, bool time_major,
+                                 TensorType tensor_type)
       : UnidirectionalRNNOpModel(batches, sequence_len, units, size, time_major,
-                                 TensorType_UINT8, TensorType_UINT8) {}
+                                 tensor_type, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
 
-  void SetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
   }
 
+  void SetWeights(std::initializer_list<float> f) { SetWeights(weights_, f); }
+
   void SetRecurrentWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_weights_, f);
+    SetWeights(recurrent_weights_, f);
   }
+
+ protected:
+  TensorType tensor_type_;
 };
 
 TEST(UnidirectionalRNNOpTest, BlackBoxTest) {
@@ -285,10 +297,36 @@ TEST(UnidirectionalRNNOpTest, BlackBoxTest) {
   EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
 }
 
-TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTest) {
+TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTestUint8) {
   HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                                      /*units=*/16, /*size=*/8,
-                                     /*time_major=*/false);
+                                     /*time_major=*/false, TensorType_UINT8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_start = rnn_golden_output;
+  float* golden_end = golden_start + rnn.num_units() * rnn.sequence_len();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  expected.insert(expected.end(), golden_start, golden_end);
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                   expected, /*max_abs_error=*/0.013)));
+}
+
+TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTestInt8) {
+  HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                                     /*units=*/16, /*size=*/8,
+                                     /*time_major=*/false, TensorType_INT8);
   rnn.SetWeights(rnn_weights);
   rnn.SetBias(rnn_bias);
   rnn.SetRecurrentWeights(rnn_recurrent_weights);
@@ -340,10 +378,40 @@ TEST(UnidirectionalRNNOpTest, TimeMajorBlackBoxTest) {
   EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
 }
 
-TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTest) {
+TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTestUint8) {
+  HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                                     /*units=*/16, /*size=*/8,
+                                     /*time_major=*/true, TensorType_UINT8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_batch_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_batch_end = golden_batch_start + rnn.num_units();
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+  }
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                   expected, /*max_abs_error=*/0.013)));
+}
+
+TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTestInt8) {
   HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                                      /*units=*/16, /*size=*/8,
-                                     /*time_major=*/true);
+                                     /*time_major=*/true, TensorType_INT8);
   rnn.SetWeights(rnn_weights);
   rnn.SetBias(rnn_bias);
   rnn.SetRecurrentWeights(rnn_recurrent_weights);
diff --git a/tensorflow/lite/kernels/unique.cc b/tensorflow/lite/kernels/unique.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80c033aa5ce1f0fb302f7b2f06d3e2cae69b9062
--- /dev/null
+++ b/tensorflow/lite/kernels/unique.cc
@@ -0,0 +1,164 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <map>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unique {
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  static const int kOutputUniqueTensor = 0;
+  static const int kOutputIndexTensor = 1;
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output_unique_tensor =
+      GetOutput(context, node, kOutputUniqueTensor);
+  TfLiteTensor* output_index_tensor =
+      GetOutput(context, node, kOutputIndexTensor);
+
+  // The op only supports 1D input.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+  TfLiteIntArray* output_index_shape = TfLiteIntArrayCopy(input->dims);
+  // The unique values are determined during evaluation, so we don't know yet
+  // the size of the output tensor.
+  SetTensorToDynamic(output_unique_tensor);
+  return context->ResizeTensor(context, output_index_tensor,
+                               output_index_shape);
+}
+
+namespace {
+
+// Actual evaluation for the unique op.
+template <typename T, typename I>
+TfLiteStatus EvalImpl(TfLiteContext* context, const TfLiteTensor* input,
+                      TfLiteNode* node) {
+  // Map from value, to index in the unique elements vector.
+  // Note that we prefer to use map than unordered_map as it showed less
+  // increase in the binary size.
+  std::map<T, int> unique_values;
+  TfLiteTensor* output_indexes = GetOutput(context, node, 1);
+  I* indexes = GetTensorData<I>(output_indexes);
+  const T* data = GetTensorData<T>(input);
+  const int num_elements = NumElements(input);
+
+  for (int i = 0; i < num_elements; ++i) {
+    const auto element_it = unique_values.find(data[i]);
+    if (element_it != unique_values.end()) {
+      indexes[i] = element_it->second;
+    } else {
+      const int unique_index = unique_values.size();
+      unique_values[data[i]] = unique_index;
+      indexes[i] = unique_index;
+    }
+  }
+  // Allocate output tensor.
+  TfLiteTensor* unique_output = GetOutput(context, node, 0);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(NumDimensions(input)), TfLiteIntArrayFree);
+  shape->data[0] = unique_values.size();
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, unique_output, shape.release()));
+  // Set the values in the output tensor.
+  T* output_unique_values = GetTensorData<T>(unique_output);
+  for (int i = 0; i < unique_values.size(); ++i) {
+    output_unique_values[i] = data[indexes[i]];
+  }
+  return kTfLiteOk;
+}
+
+template <typename T>
+TfLiteStatus EvalImpl(TfLiteContext* context, const TfLiteTensor* input,
+                      TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteUniqueParams*>(node->builtin_data);
+  if (params == nullptr) {
+    context->ReportError(context, "Null params passed");
+    return kTfLiteError;
+  }
+  switch (params->index_out_type) {
+    case kTfLiteInt32:
+      return EvalImpl<T, int32_t>(context, input, node);
+    case kTfLiteInt64:
+      return EvalImpl<T, int64_t>(context, input, node);
+    default:
+      context->ReportError(
+          context,
+          "Unique index output array can only be Int32 or In64, requested: ",
+          TfLiteTypeGetName(params->index_out_type));
+  }
+  return kTfLiteError;
+}
+
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output_index_tensor = GetOutput(context, node, 1);
+  TF_LITE_ENSURE_EQ(context, NumElements(output_index_tensor),
+                    NumElements(input));
+
+  switch (input->type) {
+    case kTfLiteInt8:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int8_t>(context, input, node));
+      break;
+    case kTfLiteInt16:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int16_t>(context, input, node));
+      break;
+    case kTfLiteInt32:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int32_t>(context, input, node));
+      break;
+    case kTfLiteInt64:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int64_t>(context, input, node));
+      break;
+    case kTfLiteFloat32:
+      TF_LITE_ENSURE_STATUS(EvalImpl<float>(context, input, node));
+      break;
+    case kTfLiteUInt8:
+      TF_LITE_ENSURE_STATUS(EvalImpl<uint8_t>(context, input, node));
+      break;
+    default:
+      context->ReportError(context, "Currently Unique doesn't support type: %s",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unique
+
+TfLiteRegistration* Register_UNIQUE() {
+  static TfLiteRegistration r = {unique::Init, unique::Free, unique::Prepare,
+                                 unique::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/unique_test.cc b/tensorflow/lite/kernels/unique_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1df5e6b7967ea701c573e6d1f9abc04f0067b65a
--- /dev/null
+++ b/tensorflow/lite/kernels/unique_test.cc
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T, typename I>
+class UniqueOpModel : public SingleOpModel {
+ public:
+  UniqueOpModel(const TensorData& input, TensorType input_type,
+                TensorType index_out_type) {
+    input_id_ = AddInput(input);
+    output_id_ = AddOutput(input_type);
+    output_index_id_ = AddOutput(index_out_type);
+    SetBuiltinOp(BuiltinOperator_UNIQUE, BuiltinOptions_UniqueOptions,
+                 CreateUniqueOptions(builder_, index_out_type).Union());
+    BuildInterpreter({GetShape(input_id_)});
+  }
+
+  int input_tensor_id() { return input_id_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_id_); }
+  std::vector<I> GetIndexesOutput() {
+    return ExtractVector<I>(output_index_id_);
+  }
+
+ protected:
+  int input_id_;
+  int output_id_;
+  int output_index_id_;
+};
+
+TEST(UniqueOpModelTest, OneElement) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {1}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {5});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_THAT(model.GetIndexesOutput(), ElementsAreArray({0}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_AllUnique) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {8}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(),
+                              {5, 2, 3, 51, 6, 72, 7, 8});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5, 2, 3, 51, 6, 72, 7, 8}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 4, 5, 6, 7}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_AllDuplicates) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {5, 5, 5, 5, 5, 5, 5});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 0, 0, 0, 0, 0, 0}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_SomeDuplicates) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {2, 3, 5, 7, 2, 7, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2, 3, 5, 7}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 3, 1}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_SomeDuplicates_IndexInt64) {
+  UniqueOpModel<float, int64_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT64);
+  model.PopulateTensor<float>(model.input_tensor_id(), {2, 3, 5, 7, 2, 7, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2, 3, 5, 7}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 3, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index 663ee38280ed4d65d9dafb8353dd4746c6da6292..c736685a98097fd5e4aa2cd079926747e71ee2f6 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 #include "tensorflow/lite/model.h"
@@ -299,6 +300,56 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
   return status;
 }
 
+TfLiteStatus InterpreterBuilder::ParseQuantization(
+    const QuantizationParameters* src_quantization,
+    TfLiteQuantization* quantization) {
+  quantization->type = kTfLiteNoQuantization;
+  if (!src_quantization || !src_quantization->scale() ||
+      src_quantization->scale()->size() == 0) {
+    return kTfLiteOk;
+  }
+  if (!src_quantization->zero_point()) {
+    error_reporter_->Report(
+        "Quantization parameters has non-null scale but null zero_point.");
+    return kTfLiteError;
+  }
+
+  // Ensure that the number of scales matches the number of zero_points.
+  if (src_quantization->scale()->size() !=
+      src_quantization->zero_point()->size()) {
+    error_reporter_->Report(
+        "QuantizationParam has %d zero_point values and %d scale values. Must "
+        "have same number.",
+        src_quantization->zero_point()->size(),
+        src_quantization->scale()->size());
+    return kTfLiteError;
+  }
+
+  // Affine-quantization.
+  quantization->type = kTfLiteAffineQuantization;
+  auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  const size_t num_scales = src_quantization->scale()->size();
+  affine_quantization->scale = TfLiteFloatArrayCreate(num_scales);
+  affine_quantization->zero_point = TfLiteIntArrayCreate(num_scales);
+  for (size_t i = 0; i < num_scales; ++i) {
+    affine_quantization->scale->data[i] = src_quantization->scale()->Get(i);
+    affine_quantization->zero_point->data[i] =
+        src_quantization->zero_point()->Get(i);
+  }
+  if (src_quantization->quantized_dimension() < 0 ||
+      src_quantization->quantized_dimension() >= num_scales) {
+    error_reporter_->Report(
+        "quantized_dimension must be in range [0, %d). Was %d.", num_scales,
+        src_quantization->quantized_dimension());
+    return kTfLiteError;
+  }
+  affine_quantization->quantized_dimension =
+      src_quantization->quantized_dimension();
+  quantization->params = reinterpret_cast<void*>(affine_quantization);
+  return kTfLiteOk;
+}
+
 TfLiteStatus InterpreterBuilder::ParseTensors(
     const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
     const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
@@ -317,36 +368,11 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
     const auto* tensor = tensors->Get(i);
     std::vector<int> dims = FlatBufferIntArrayToVector(tensor->shape());
 
-    TfLiteQuantizationParams quantization;
-    quantization.scale = 0;
-    quantization.zero_point = 0;
-    auto* q_params = tensor->quantization();
-    if (q_params) {
-      // Note that the schema could hold per-channel quantization parameters
-      // but we really only support one value for the whole tensor.
-      // TODO(aselle): This breaks as well if these are nullptr's.
-      // TODO(aselle): This assumes non per-channel quantization.
-
-      if (q_params->scale()) {
-        if (q_params->scale()->size() != 1) {
-          error_reporter_->Report(
-              "QuantizationParam has %d scale values (only 1 is supported).",
-              q_params->scale()->size());
-          return kTfLiteError;
-        }
-        quantization.scale = q_params->scale()->Get(0);
-      }
-
-      if (q_params->zero_point()) {
-        if (q_params->zero_point()->size() != 1) {
-          error_reporter_->Report(
-              "QuantizationParam has %d zero_point values"
-              " (only 1 is supported).",
-              q_params->zero_point()->size());
-          return kTfLiteError;
-        }
-        quantization.zero_point = q_params->zero_point()->Get(0);
-      }
+    const auto* src_quantization = tensor->quantization();
+    TfLiteQuantization quantization;
+    if (ParseQuantization(src_quantization, &quantization) != kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
 
     TfLiteType type;
diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h
index 069cefabf91ceceaa6da79fdc8ebbdb31cf9a6d3..a9bd4c9c09c23609ec977866179eb7d6598c408d 100644
--- a/tensorflow/lite/model.h
+++ b/tensorflow/lite/model.h
@@ -35,6 +35,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_MODEL_H_
 
 #include <memory>
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/interpreter.h"
@@ -203,6 +204,8 @@ class InterpreterBuilder {
       const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
       Interpreter* interpreter);
   TfLiteStatus ApplyDelegates(Interpreter* interpreter);
+  TfLiteStatus ParseQuantization(const QuantizationParameters* src_quantization,
+                                 TfLiteQuantization* quantization);
 
   const ::tflite::Model* model_;
   const OpResolver& op_resolver_;
diff --git a/tensorflow/lite/models/smartreply/BUILD b/tensorflow/lite/models/smartreply/BUILD
index 078b8e6bc6a288542575293be66c19f7bb733fc4..5be2aaff1f2d39f961da9ae1d666b27f41ddb039 100644
--- a/tensorflow/lite/models/smartreply/BUILD
+++ b/tensorflow/lite/models/smartreply/BUILD
@@ -1,9 +1,14 @@
-package(default_visibility = ["//visibility:public"])
+package(default_visibility = [
+    "//visibility:public",
+])
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "gen_selected_ops", "tflite_copts")
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE"])
+
 gen_selected_ops(
     name = "smartreply_ops",
     model = "@tflite_smartreply//:smartreply.tflite",
@@ -22,10 +27,12 @@ cc_library(
         "//tensorflow/lite:framework",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:kernel_util",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
         "@farmhash_archive//:farmhash",
     ],
+    alwayslink = 1,
 )
 
 cc_library(
@@ -43,7 +50,25 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
+    name = "predictor_test",
+    srcs = ["predictor_test.cc"],
+    data = [
+        "//tensorflow/lite/models:testdata/smartreply_samples.tsv",
+        "@tflite_smartreply//:smartreply.tflite",
+    ],
+    tags = ["no_oss"],
+    deps = [
+        ":predictor_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/testing:util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
     name = "extract_feature_op_test",
     size = "small",
     srcs = ["ops/extract_feature_test.cc"],
@@ -58,7 +83,7 @@ cc_test(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "normalize_op_test",
     size = "small",
     srcs = ["ops/normalize_test.cc"],
@@ -73,7 +98,7 @@ cc_test(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "predict_op_test",
     size = "small",
     srcs = ["ops/predict_test.cc"],
diff --git a/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc b/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
index efe59eeb4667cc55fb0a70d3005c1f9c2aaa73ce..914b47c1a9deba4e601fdc1b787f3a03179c2e6a 100644
--- a/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
@@ -94,7 +94,7 @@ TEST(ExtractFeatureOpTest, AllBlacklistInput) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/models/smartreply/ops/normalize_test.cc b/tensorflow/lite/models/smartreply/ops/normalize_test.cc
index 8c5131565d5892be946a9a115bb7c6cad8733214..46d2aebe756b84f067def401010e5ee4b37cfd8b 100644
--- a/tensorflow/lite/models/smartreply/ops/normalize_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/normalize_test.cc
@@ -84,7 +84,7 @@ TEST(NormalizeOpTest, EmptyInput) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/models/smartreply/ops/predict.cc b/tensorflow/lite/models/smartreply/ops/predict.cc
index bb2ed4a3153ceb2ef2e6b6d7f8c640f41616d4b0..24b7d5489756de36c2bcc8a47ef1c8e478c3a9c0 100644
--- a/tensorflow/lite/models/smartreply/ops/predict.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict.cc
@@ -28,6 +28,7 @@ limitations under the License.
 //
 
 #include <algorithm>
+#include <cstdlib>
 #include <unordered_map>
 #include <vector>
 
diff --git a/tensorflow/lite/models/smartreply/ops/predict_test.cc b/tensorflow/lite/models/smartreply/ops/predict_test.cc
index ca64dcaad47108e346bd03f0b7b15edfbd6a50dc..6896a342c79a73390f1ad02a60db6cb70a1cf23b 100644
--- a/tensorflow/lite/models/smartreply/ops/predict_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict_test.cc
@@ -177,7 +177,7 @@ TEST(PredictOpTest, NoLabelGenerated) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/models/smartreply/predictor_test.cc b/tensorflow/lite/models/smartreply/predictor_test.cc
index 7eba26993e59172d8ae85a8961b6f3b171057a48..f4a9453b4220b45af937923a6b916c1516f9e22f 100644
--- a/tensorflow/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/lite/models/smartreply/predictor_test.cc
@@ -22,21 +22,24 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
-//#include "tensorflow/lite/models/test_utils.h"
-#include "tensorflow/lite/string_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace custom {
 namespace smartreply {
 namespace {
 
-const char kModelName[] = "smartreply_ondevice_model.bin";
 const char kSamples[] = "smartreply_samples.tsv";
 
-string TestDataPath() {
+string GetModelFilePath() {
+  return "external/tflite_smartreply/smartreply.tflite";  // NOLINT
+}
+
+string GetSamplesFilePath() {
   return string(absl::StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                             "lite/models/testdata/"));
+                             "lite/models/testdata/", kSamples));
 }
 
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
@@ -53,13 +56,14 @@ MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
 
 class PredictorTest : public ::testing::Test {
  protected:
-  PredictorTest() {
-    model_ = tflite::FlatBufferModel::BuildFromFile(
-        absl::StrCat(TestDataPath(), "/", kModelName).c_str());
-    CHECK(model_);
-  }
+  PredictorTest() {}
   ~PredictorTest() override {}
 
+  void SetUp() override {
+    model_ = tflite::FlatBufferModel::BuildFromFile(GetModelFilePath().c_str());
+    ASSERT_NE(model_.get(), nullptr);
+  }
+
   std::unique_ptr<::tflite::FlatBufferModel> model_;
 };
 
@@ -121,7 +125,7 @@ TEST_F(PredictorTest, BatchTest) {
   int total_triggers = 0;
 
   string line;
-  std::ifstream fin(absl::StrCat(TestDataPath(), "/", kSamples));
+  std::ifstream fin(GetSamplesFilePath());
   while (std::getline(fin, line)) {
     const std::vector<string> fields = absl::StrSplit(line, '\t');
     if (fields.empty()) {
@@ -151,3 +155,9 @@ TEST_F(PredictorTest, BatchTest) {
 }  // namespace smartreply
 }  // namespace custom
 }  // namespace tflite
+
+int main(int argc, char **argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/models/speech_test.cc b/tensorflow/lite/models/speech_test.cc
index 17b7e8f28e8fb0988ee2269d9d833626c2aec701..a3713c55312cb7cb6526b7e82606cb949e5c2af4 100644
--- a/tensorflow/lite/models/speech_test.cc
+++ b/tensorflow/lite/models/speech_test.cc
@@ -139,7 +139,7 @@ TEST_P(SpeechTest, DISABLED_SpeakerIdOkGoogleTest) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, DISABLED_AsrAmTest) {
+TEST_P(SpeechTest, AsrAmTest) {
   std::stringstream os;
   ASSERT_TRUE(
       ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
@@ -152,6 +152,19 @@ TEST_P(SpeechTest, DISABLED_AsrAmTest) {
       << test_driver.GetErrorMessage();
 }
 
+TEST_P(SpeechTest, AsrAmQuantizedTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_asr_am_model_int8.tflite", "speech_asr_am_model_in.csv",
+      "speech_asr_am_model_int8_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"104",
+      /*persistent_tensors=*/"18,19,38,39,58,59,78,79,98,99",
+      /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
+      << test_driver.GetErrorMessage();
+}
+
 // The original version of speech_asr_lm_model_test.cc ran a few sequences
 // through the interpreter and stored the sum of all the output, which was them
 // compared for correctness. In this test we are comparing all the intermediate
@@ -196,10 +209,10 @@ TEST_P(SpeechTest, DISABLED_TtsTest) {
 // 200s just to bring up the Android emulator.)
 static const int kAllInvocations = -1;
 static const int kFirstFewInvocations = 10;
-INSTANTIATE_TEST_CASE_P(LongTests, SpeechTest,
-                        ::testing::Values(kAllInvocations));
-INSTANTIATE_TEST_CASE_P(ShortTests, SpeechTest,
-                        ::testing::Values(kFirstFewInvocations));
+INSTANTIATE_TEST_SUITE_P(LongTests, SpeechTest,
+                         ::testing::Values(kAllInvocations));
+INSTANTIATE_TEST_SUITE_P(ShortTests, SpeechTest,
+                         ::testing::Values(kFirstFewInvocations));
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/models/testdata/g3doc/README.md b/tensorflow/lite/models/testdata/g3doc/README.md
index 2a4f1c143a21722945e8e396b81bd23e3312e87e..afe5f16b383b26efd7aab866c3215a8d2a203f4c 100644
--- a/tensorflow/lite/models/testdata/g3doc/README.md
+++ b/tensorflow/lite/models/testdata/g3doc/README.md
@@ -3,6 +3,42 @@
 Sample test data has been provided for speech related models in Tensorflow Lite
 to help users working with speech models to verify and test their models.
 
+### Models and Inputs and Outputs:
+
+[ASR AM model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model.tflite)
+
+[ASR AM quantized model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_int8.tflite)
+
+[ASR AM test inputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_in.csv)
+
+[ASR AM test outputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_out.csv)
+
+[ASR AM int8 test outputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_int8_out.csv)
+
+The models below are not maintained.
+
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+
+### Test Bench
+
+[Model tests](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_test.cc)
+
+Download the ASR AM test models and inputs and output files to the
+models/testdata directory to run the tests.
+
+
+## Speech Model Architectures
+
 For the hotword, speaker-id and automatic speech recognition sample models, the
 architecture assumes that the models receive their input from a speech
 pre-processing module. The speech pre-processing module receives the audio
@@ -87,57 +123,3 @@ The model consists of a convolutional layer, followed by a fully-connected
 layer, two LSTM layers, and two additional fully-connected layers.
 The corresponding parameters as shown in the figure.
 ![endpointer_model](endpointer.svg "Endpointer model")
-
-
-## Speech models test input/output generation
-
-As mentioned above the input to models are generated from a pre-processing
-module (output of a log-mel filterbank, or linguistic features), and the outputs
-are generated by running the equivalent TensorFlow model by feeding them the
-same input.
-
-## Link to the open source code
-
-### Models:
-
-[Speech hotword model (Svdf
-rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
-
-[Speech hotword model (Svdf
-rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
-
-[Speaker-id
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
-
-[TTS
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
-
-[ASR AM
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
-
-### Test benches
-
-[Speech hotword model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_hotword_model_test.cc)
-
-[Speaker-id model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_speakerid_model_test.cc)
-
-[TTS model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_tts_model_test.cc)
-
-[ASR AM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_am_model_test.cc)
-
-[ASR LM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_lm_model_test.cc)
-
-[Endpointer model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_endpointer_model_test.cc)
-
-## Android Support
-The models have been tested on Android phones, using the following tests:
-
-[Hotword] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=25)
-
-[Speaker-id] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=36)
diff --git a/tensorflow/lite/nnapi/BUILD b/tensorflow/lite/nnapi/BUILD
index 467a2b7a7bc9a40135428240585cd2c2a133cf9f..7af2b099e75565ec74e0861499063a0ba87dec37 100644
--- a/tensorflow/lite/nnapi/BUILD
+++ b/tensorflow/lite/nnapi/BUILD
@@ -8,6 +8,43 @@ cc_library(
     name = "nnapi_lib",
     hdrs = [
         "NeuralNetworksShim.h",
+        "NeuralNetworksTypes.h",
     ],
     linkopts = ["-ldl"],
 )
+
+cc_library(
+    name = "nnapi_implementation",
+    srcs = select({
+        "//tensorflow:ios": [
+            "nnapi_implementation_disabled.cc",
+        ],
+        "//tensorflow:windows": [
+            "nnapi_implementation_disabled.cc",
+        ],
+        "//conditions:default": [
+            "nnapi_implementation.cc",
+        ],
+    }),
+    hdrs = [
+        "nnapi_implementation.h",
+    ],
+    linkopts = ["-ldl"] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lrt"],
+    }),
+    deps = [
+        "//tensorflow/lite/nnapi:nnapi_lib",
+    ],
+)
+
+cc_test(
+    name = "nnapi_implementation_test",
+    srcs = ["nnapi_implementation_test.cc"],
+    deps = [
+        "//tensorflow/lite/nnapi:nnapi_implementation",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/nnapi/NeuralNetworksShim.h b/tensorflow/lite/nnapi/NeuralNetworksShim.h
index c39502f4acc5dc6262746a61688cd075861e6135..3a4e15006e8bee4adb1c23984657269947853082 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksShim.h
@@ -20,6 +20,13 @@ limitations under the License.
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
+
+// This interface is now deprecated. You should use instead
+// nnapi_implementation.
+
+// TODO(b/123017568): Update all current usages of this file.
+
 // helpers
 
 #define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
@@ -44,8 +51,6 @@ inline void* loadLibrary(const char* name) {
   return handle;
 }
 
-typedef int (*ASharedMemory_create_fn)(const char* name, size_t size);
-
 // ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI
 // which was added in 8.1.
 inline int ASharedMemory_create(const char* name, size_t size) {
@@ -81,332 +86,6 @@ inline bool NNAPIExists() {
 // NN api types based on NNAPI header file
 // https://developer.android.com/ndk/reference/group/neural-networks
 
-/**
- * Operand types.
- *
- * The type of operands that can be added to a model.
- *
- * Although we define many types, most operators accept just a few
- * types.  Most used are ANEURALNETWORKS_TENSOR_FLOAT32,
- * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, and ANEURALNETWORKS_INT32.
- */
-enum {
-  ANEURALNETWORKS_FLOAT32 = 0,
-  ANEURALNETWORKS_INT32 = 1,
-  ANEURALNETWORKS_UINT32 = 2,
-  ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
-  ANEURALNETWORKS_TENSOR_INT32 = 4,
-  ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
-};
-
-/**
- * Operation types.
- *
- * The type of operations that can be added to a model.
- */
-enum {
-  ANEURALNETWORKS_ADD = 0,
-  ANEURALNETWORKS_AVERAGE_POOL_2D = 1,
-  ANEURALNETWORKS_CONCATENATION = 2,
-  ANEURALNETWORKS_CONV_2D = 3,
-  ANEURALNETWORKS_DEPTHWISE_CONV_2D = 4,
-  ANEURALNETWORKS_DEPTH_TO_SPACE = 5,
-  ANEURALNETWORKS_DEQUANTIZE = 6,
-  ANEURALNETWORKS_EMBEDDING_LOOKUP = 7,
-  ANEURALNETWORKS_FLOOR = 8,
-  ANEURALNETWORKS_FULLY_CONNECTED = 9,
-  ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
-  ANEURALNETWORKS_L2_NORMALIZATION = 11,
-  ANEURALNETWORKS_L2_POOL_2D = 12,
-  ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION = 13,
-  ANEURALNETWORKS_LOGISTIC = 14,
-  ANEURALNETWORKS_LSH_PROJECTION = 15,
-  ANEURALNETWORKS_LSTM = 16,
-  ANEURALNETWORKS_MAX_POOL_2D = 17,
-  ANEURALNETWORKS_MUL = 18,
-  ANEURALNETWORKS_RELU = 19,
-  ANEURALNETWORKS_RELU1 = 20,
-  ANEURALNETWORKS_RELU6 = 21,
-  ANEURALNETWORKS_RESHAPE = 22,
-  ANEURALNETWORKS_RESIZE_BILINEAR = 23,
-  ANEURALNETWORKS_RNN = 24,
-  ANEURALNETWORKS_SOFTMAX = 25,
-  ANEURALNETWORKS_SPACE_TO_DEPTH = 26,
-  ANEURALNETWORKS_SVDF = 27,
-  ANEURALNETWORKS_TANH = 28,
-  ANEURALNETWORKS_BATCH_TO_SPACE_ND = 29,
-  ANEURALNETWORKS_DIV = 30,
-  ANEURALNETWORKS_MEAN = 31,
-  ANEURALNETWORKS_PAD = 32,
-  ANEURALNETWORKS_SPACE_TO_BATCH_ND = 33,
-  ANEURALNETWORKS_SQUEEZE = 34,
-  ANEURALNETWORKS_STRIDED_SLICE = 35,
-  ANEURALNETWORKS_SUB = 36,
-  ANEURALNETWORKS_TRANSPOSE = 37,
-};
-
-/**
- * Fused activation function types.
- *
- */
-enum {
-  ANEURALNETWORKS_FUSED_NONE = 0,
-  ANEURALNETWORKS_FUSED_RELU = 1,
-  ANEURALNETWORKS_FUSED_RELU1 = 2,
-  ANEURALNETWORKS_FUSED_RELU6 = 3,
-};
-
-/**
- * Execution preferences.
- */
-enum {
-  ANEURALNETWORKS_PREFER_LOW_POWER = 0,
-  ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
-  ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
-};
-
-/**
- * Result codes.
- */
-enum {
-  ANEURALNETWORKS_NO_ERROR = 0,
-  ANEURALNETWORKS_OUT_OF_MEMORY = 1,
-  ANEURALNETWORKS_INCOMPLETE = 2,
-  ANEURALNETWORKS_UNEXPECTED_NULL = 3,
-  ANEURALNETWORKS_BAD_DATA = 4,
-  ANEURALNETWORKS_OP_FAILED = 5,
-  ANEURALNETWORKS_UNMAPPABLE = 5,
-  ANEURALNETWORKS_BAD_STATE = 6,
-};
-
-/**
- * Implicit padding algorithms.
- */
-enum {
-  ANEURALNETWORKS_PADDING_SAME = 1,
-  ANEURALNETWORKS_PADDING_VALID = 2,
-};
-
-/**
- * ANeuralNetworksMemory is an opaque type that represents memory.
- *
- * This type is used to represent shared memory, memory mapped files,
- * and similar memories.
- *
- * By using shared memory, a program can efficiently communicate to the
- * runtime and drivers the tensors that define a model. See
- * {@link ANeuralNetworksModel_setOperandValueFromMemory}. An application
- * should typically create one shared memory object that contains every tensor
- * needed to define a model. {@link ANeuralNetworksMemory_createFromFd} can be
- * used to create shared memory from a file handle. {@link
- * ANeuralNetworksMemory_createShared} can be used to directly created shared
- * memory.
- *
- * Memory objects can also be used to specify the input and output arguments of
- * an execution. See {@link ANeuralNetworksExecution_setInputFromMemory}
- * and {@link ANeuralNetworksExecution_setOutputFromMemory}.
- */
-typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
-
-/**
- * ANeuralNetworksModel is an opaque type that contains a description of the
- * mathematical operations that constitute the model.
- *
- * <p>The model will be built by calling<ul>
- * <li>{@link ANeuralNetworksModel_create},</li>
- * <li>{@link ANeuralNetworksModel_addOperation},</li>
- * <li>{@link ANeuralNetworksModel_addOperand},</li>
- * </ul>
- *
- * A model is completed by calling {@link ANeuralNetworksModel_finish}.
- * A model is destroyed by calling {@link ANeuralNetworksModel_free}.
- *
- * <p>It is the application's responsibility to make sure that only one thread
- * modifies a model at a given time. It is however safe for more than one
- * thread to use the model once {@link ANeuralNetworksModel_finish} has
- * returned.</p>
- *
- * <p>It is also the application's responsibility to ensure that there are no
- * other uses of the model after calling {@link ANeuralNetworksModel_free}. This
- * includes any compilation or execution object created using the model.</p>
- */
-typedef struct ANeuralNetworksModel ANeuralNetworksModel;
-
-/**
- * ANeuralNetworksCompilation is an opaque type that can be used to compile
- * a machine learning model.
- *
- * <p>To use:<ul>
- *    <li>Create a new compilation instance by calling the
- *        {@link ANeuralNetworksCompilation_create} function.</li>
- *    <li>Perform the compilation with {@link
- * ANeuralNetworksCompilation_start}.</li> <li>Wait for the compilation to
- * complete with {@link ANeuralNetworksCompilation_wait}.</li> <li>Use the
- * compilation as many times as needed with {@link
- * ANeuralNetworksExecution_create}.</li> <li>Destroy the compilation with
- * {@link ANeuralNetworksCompilation_free} once all executions using the
- * compilation have completed.</li></ul></p>
- *
- * <p>A compilation cannot be modified once {@link
- * ANeuralNetworksCompilation_start} has been called on it.</p>
- *
- * <p>It is the application's responsibility to make sure that only one thread
- * modifies a compilation at a given time. It is however safe for more than one
- * thread to use {@link ANeuralNetworksCompilation_wait} at the same time.
- * It is also safe for multiple threads to use a compilation object once
- * {@link ANeuralNetworksCompilation_wait} has completed.</p>
- *
- * <p>It is also the application's responsibility to ensure that there are no
- * other uses of the compilation after calling {@link
- * ANeuralNetworksCompilation_free}. This includes any execution object created
- * using the compilation.</p>
- */
-typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
-
-/**
- * ANeuralNetworksExecution is an opaque type that can be used to apply a
- * machine learning model to a set of inputs.
- *
- * <p>To use:<ul>
- *    <li>Create a new execution instance by calling the
- *        {@link ANeuralNetworksExecution_create} function.</li>
- *    <li>Associate data to the model inputs with
- *        {@link ANeuralNetworksExecution_setInput} or
- *        {@link ANeuralNetworksExecution_setInputFromMemory}.</li>
- *    <li>Associate output buffers to the model outputs with
- *        {@link ANeuralNetworksExecution_setOutput} or
- *        {@link ANeuralNetworksExecution_setOutputFromMemory}.</li>
- *    <li>Apply the model with {@link
- * ANeuralNetworksExecution_startCompute}.</li> <li>Wait for the execution to
- * complete with {@link ANeuralNetworksExecution_wait}.</li> <li>Destroy the
- * execution with
- *        {@link ANeuralNetworksExecution_free}.</li></ul></p>
- *
- * <p>An execution cannot be modified once {@link
- * ANeuralNetworksExecution_start} has been called on it.</p>
- *
- * <p>An execution can be applied to a model with
- * {@link ANeuralNetworksExecution_startCompute} only once. Create new
- * executions to do new evaluations of the model.</p>
- *
- * <p>It is the application's responsibility to make sure that only one thread
- * modifies an execution at a given time. It is however safe for more than one
- * thread to use {@link ANeuralNetworksExecution_wait} at the same time.</p>
- *
- * <p>It is also the application's responsibility to ensure that there are no
- * other uses of the request after calling {@link
- * ANeuralNetworksRequest_free}.</p>
- */
-typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
-
-/**
- * ANeuralNetworksOperandType describes the type of an operand.
- * This structure is used to describe both scalars and tensors.
- */
-typedef struct ANeuralNetworksOperandType {
-  /** The data type, e.g ANEURALNETWORKS_INT8. */
-  int32_t type;
-  /** The number of dimensions. It should be 0 for scalars. */
-  uint32_t dimensionCount;
-  /** The dimensions of the tensor. It should be nullptr for scalars. */
-  const uint32_t* dimensions;
-  /** These two fields are only used for quantized tensors.
-   * They should be zero for scalars and non-fixed point tensors.
-   * The dequantized value of each entry is (value - offset) * scale.
-   */
-  float scale;
-  int32_t zeroPoint;
-} ANeuralNetworksOperandType;
-
-/**
- * ANeuralNetworksEvent is an opaque type that represents an event
- * that will be signaled once an execution completes.
- */
-typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
-
-typedef int32_t ANeuralNetworksOperationType;
-
-// nn api function types
-
-typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
-    size_t size, int protect, int fd, size_t offset,
-    ANeuralNetworksMemory** memory);
-
-typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory);
-
-typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model);
-
-typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model);
-
-typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model);
-
-typedef int (*ANeuralNetworksCompilation_create_fn)(
-    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
-
-typedef void (*ANeuralNetworksCompilation_free_fn)(
-    ANeuralNetworksCompilation* compilation);
-
-typedef int (*ANeuralNetworksCompilation_setPreference_fn)(
-    ANeuralNetworksCompilation* compilation, int32_t preference);
-
-typedef int (*ANeuralNetworksCompilation_finish_fn)(
-    ANeuralNetworksCompilation* compilation);
-
-typedef int (*ANeuralNetworksModel_addOperand_fn)(
-    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
-
-typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
-    ANeuralNetworksModel* model, int32_t index, const void* buffer,
-    size_t length);
-
-typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
-    ANeuralNetworksModel* model, int32_t index,
-    const ANeuralNetworksMemory* memory, size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksModel_addOperation_fn)(
-    ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
-    uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
-    const uint32_t* outputs);
-
-typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
-    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
-    uint32_t outputCount, const uint32_t* outputs);
-
-typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)(
-    ANeuralNetworksModel* model, bool allow);
-
-typedef int (*ANeuralNetworksExecution_create_fn)(
-    ANeuralNetworksCompilation* compilation,
-    ANeuralNetworksExecution** execution);
-
-typedef void (*ANeuralNetworksExecution_free_fn)(
-    ANeuralNetworksExecution* execution);
-
-typedef int (*ANeuralNetworksExecution_setInput_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setOutput_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, void* buffer, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksExecution_startCompute_fn)(
-    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
-
-typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event);
-
-typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
-
 /**
  * Creates a shared memory object from a file descriptor.
  *
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
new file mode 100644
index 0000000000000000000000000000000000000000..9291391491a3926202febcdb468a79cb7e0d5818
--- /dev/null
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -0,0 +1,354 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
+#define TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+// NN api types based on NNAPI header file
+// https://developer.android.com/ndk/reference/group/neural-networks
+
+/**
+ * Operand types.
+ *
+ * The type of operands that can be added to a model.
+ *
+ * Although we define many types, most operators accept just a few
+ * types.  Most used are ANEURALNETWORKS_TENSOR_FLOAT32,
+ * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, and ANEURALNETWORKS_INT32.
+ */
+enum {
+  ANEURALNETWORKS_FLOAT32 = 0,
+  ANEURALNETWORKS_INT32 = 1,
+  ANEURALNETWORKS_UINT32 = 2,
+  ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
+  ANEURALNETWORKS_TENSOR_INT32 = 4,
+  ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
+};
+
+/**
+ * Operation types.
+ *
+ * The type of operations that can be added to a model.
+ */
+enum {
+  ANEURALNETWORKS_ADD = 0,
+  ANEURALNETWORKS_AVERAGE_POOL_2D = 1,
+  ANEURALNETWORKS_CONCATENATION = 2,
+  ANEURALNETWORKS_CONV_2D = 3,
+  ANEURALNETWORKS_DEPTHWISE_CONV_2D = 4,
+  ANEURALNETWORKS_DEPTH_TO_SPACE = 5,
+  ANEURALNETWORKS_DEQUANTIZE = 6,
+  ANEURALNETWORKS_EMBEDDING_LOOKUP = 7,
+  ANEURALNETWORKS_FLOOR = 8,
+  ANEURALNETWORKS_FULLY_CONNECTED = 9,
+  ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
+  ANEURALNETWORKS_L2_NORMALIZATION = 11,
+  ANEURALNETWORKS_L2_POOL_2D = 12,
+  ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION = 13,
+  ANEURALNETWORKS_LOGISTIC = 14,
+  ANEURALNETWORKS_LSH_PROJECTION = 15,
+  ANEURALNETWORKS_LSTM = 16,
+  ANEURALNETWORKS_MAX_POOL_2D = 17,
+  ANEURALNETWORKS_MUL = 18,
+  ANEURALNETWORKS_RELU = 19,
+  ANEURALNETWORKS_RELU1 = 20,
+  ANEURALNETWORKS_RELU6 = 21,
+  ANEURALNETWORKS_RESHAPE = 22,
+  ANEURALNETWORKS_RESIZE_BILINEAR = 23,
+  ANEURALNETWORKS_RNN = 24,
+  ANEURALNETWORKS_SOFTMAX = 25,
+  ANEURALNETWORKS_SPACE_TO_DEPTH = 26,
+  ANEURALNETWORKS_SVDF = 27,
+  ANEURALNETWORKS_TANH = 28,
+  ANEURALNETWORKS_BATCH_TO_SPACE_ND = 29,
+  ANEURALNETWORKS_DIV = 30,
+  ANEURALNETWORKS_MEAN = 31,
+  ANEURALNETWORKS_PAD = 32,
+  ANEURALNETWORKS_SPACE_TO_BATCH_ND = 33,
+  ANEURALNETWORKS_SQUEEZE = 34,
+  ANEURALNETWORKS_STRIDED_SLICE = 35,
+  ANEURALNETWORKS_SUB = 36,
+  ANEURALNETWORKS_TRANSPOSE = 37,
+};
+
+/**
+ * Fused activation function types.
+ *
+ */
+enum {
+  ANEURALNETWORKS_FUSED_NONE = 0,
+  ANEURALNETWORKS_FUSED_RELU = 1,
+  ANEURALNETWORKS_FUSED_RELU1 = 2,
+  ANEURALNETWORKS_FUSED_RELU6 = 3,
+};
+
+/**
+ * Execution preferences.
+ */
+enum {
+  ANEURALNETWORKS_PREFER_LOW_POWER = 0,
+  ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
+  ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
+};
+
+/**
+ * Result codes.
+ */
+enum {
+  ANEURALNETWORKS_NO_ERROR = 0,
+  ANEURALNETWORKS_OUT_OF_MEMORY = 1,
+  ANEURALNETWORKS_INCOMPLETE = 2,
+  ANEURALNETWORKS_UNEXPECTED_NULL = 3,
+  ANEURALNETWORKS_BAD_DATA = 4,
+  ANEURALNETWORKS_OP_FAILED = 5,
+  ANEURALNETWORKS_BAD_STATE = 6,
+  ANEURALNETWORKS_UNMAPPABLE = 7,
+  ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE = 8,
+  ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9,
+};
+
+/**
+ * Implicit padding algorithms.
+ */
+enum {
+  ANEURALNETWORKS_PADDING_SAME = 1,
+  ANEURALNETWORKS_PADDING_VALID = 2,
+};
+
+/**
+ * ANeuralNetworksMemory is an opaque type that represents memory.
+ *
+ * This type is used to represent shared memory, memory mapped files,
+ * and similar memories.
+ *
+ * By using shared memory, a program can efficiently communicate to the
+ * runtime and drivers the tensors that define a model. See
+ * {@link ANeuralNetworksModel_setOperandValueFromMemory}. An application
+ * should typically create one shared memory object that contains every tensor
+ * needed to define a model. {@link ANeuralNetworksMemory_createFromFd} can be
+ * used to create shared memory from a file handle. {@link
+ * ANeuralNetworksMemory_createShared} can be used to directly created shared
+ * memory.
+ *
+ * Memory objects can also be used to specify the input and output arguments of
+ * an execution. See {@link ANeuralNetworksExecution_setInputFromMemory}
+ * and {@link ANeuralNetworksExecution_setOutputFromMemory}.
+ */
+typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
+
+/**
+ * ANeuralNetworksModel is an opaque type that contains a description of the
+ * mathematical operations that constitute the model.
+ *
+ * <p>The model will be built by calling<ul>
+ * <li>{@link ANeuralNetworksModel_create},</li>
+ * <li>{@link ANeuralNetworksModel_addOperation},</li>
+ * <li>{@link ANeuralNetworksModel_addOperand},</li>
+ * </ul>
+ *
+ * A model is completed by calling {@link ANeuralNetworksModel_finish}.
+ * A model is destroyed by calling {@link ANeuralNetworksModel_free}.
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies a model at a given time. It is however safe for more than one
+ * thread to use the model once {@link ANeuralNetworksModel_finish} has
+ * returned.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the model after calling {@link ANeuralNetworksModel_free}. This
+ * includes any compilation or execution object created using the model.</p>
+ */
+typedef struct ANeuralNetworksModel ANeuralNetworksModel;
+
+/**
+ * ANeuralNetworksCompilation is an opaque type that can be used to compile
+ * a machine learning model.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new compilation instance by calling the
+ *        {@link ANeuralNetworksCompilation_create} function.</li>
+ *    <li>Perform the compilation with {@link
+ * ANeuralNetworksCompilation_start}.</li> <li>Wait for the compilation to
+ * complete with {@link ANeuralNetworksCompilation_wait}.</li> <li>Use the
+ * compilation as many times as needed with {@link
+ * ANeuralNetworksExecution_create}.</li> <li>Destroy the compilation with
+ * {@link ANeuralNetworksCompilation_free} once all executions using the
+ * compilation have completed.</li></ul></p>
+ *
+ * <p>A compilation cannot be modified once {@link
+ * ANeuralNetworksCompilation_start} has been called on it.</p>
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies a compilation at a given time. It is however safe for more than one
+ * thread to use {@link ANeuralNetworksCompilation_wait} at the same time.
+ * It is also safe for multiple threads to use a compilation object once
+ * {@link ANeuralNetworksCompilation_wait} has completed.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the compilation after calling {@link
+ * ANeuralNetworksCompilation_free}. This includes any execution object created
+ * using the compilation.</p>
+ */
+typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
+
+/**
+ * ANeuralNetworksExecution is an opaque type that can be used to apply a
+ * machine learning model to a set of inputs.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new execution instance by calling the
+ *        {@link ANeuralNetworksExecution_create} function.</li>
+ *    <li>Associate data to the model inputs with
+ *        {@link ANeuralNetworksExecution_setInput} or
+ *        {@link ANeuralNetworksExecution_setInputFromMemory}.</li>
+ *    <li>Associate output buffers to the model outputs with
+ *        {@link ANeuralNetworksExecution_setOutput} or
+ *        {@link ANeuralNetworksExecution_setOutputFromMemory}.</li>
+ *    <li>Apply the model with {@link
+ * ANeuralNetworksExecution_startCompute}.</li> <li>Wait for the execution to
+ * complete with {@link ANeuralNetworksExecution_wait}.</li> <li>Destroy the
+ * execution with
+ *        {@link ANeuralNetworksExecution_free}.</li></ul></p>
+ *
+ * <p>An execution cannot be modified once {@link
+ * ANeuralNetworksExecution_start} has been called on it.</p>
+ *
+ * <p>An execution can be applied to a model with
+ * {@link ANeuralNetworksExecution_startCompute} only once. Create new
+ * executions to do new evaluations of the model.</p>
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies an execution at a given time. It is however safe for more than one
+ * thread to use {@link ANeuralNetworksExecution_wait} at the same time.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the request after calling {@link
+ * ANeuralNetworksRequest_free}.</p>
+ */
+typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
+
+/**
+ * ANeuralNetworksOperandType describes the type of an operand.
+ * This structure is used to describe both scalars and tensors.
+ */
+typedef struct ANeuralNetworksOperandType {
+  /** The data type, e.g ANEURALNETWORKS_INT8. */
+  int32_t type;
+  /** The number of dimensions. It should be 0 for scalars. */
+  uint32_t dimensionCount;
+  /** The dimensions of the tensor. It should be nullptr for scalars. */
+  const uint32_t* dimensions;
+  /** These two fields are only used for quantized tensors.
+   * They should be zero for scalars and non-fixed point tensors.
+   * The dequantized value of each entry is (value - offset) * scale.
+   */
+  float scale;
+  int32_t zeroPoint;
+} ANeuralNetworksOperandType;
+
+/**
+ * ANeuralNetworksEvent is an opaque type that represents an event
+ * that will be signaled once an execution completes.
+ */
+typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
+
+typedef int32_t ANeuralNetworksOperationType;
+
+// nn api function types
+
+typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
+    size_t size, int protect, int fd, size_t offset,
+    ANeuralNetworksMemory** memory);
+
+typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory);
+
+typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model);
+
+typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model);
+
+typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model);
+
+typedef int (*ANeuralNetworksCompilation_create_fn)(
+    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
+
+typedef void (*ANeuralNetworksCompilation_free_fn)(
+    ANeuralNetworksCompilation* compilation);
+
+typedef int (*ANeuralNetworksCompilation_setPreference_fn)(
+    ANeuralNetworksCompilation* compilation, int32_t preference);
+
+typedef int (*ANeuralNetworksCompilation_finish_fn)(
+    ANeuralNetworksCompilation* compilation);
+
+typedef int (*ANeuralNetworksModel_addOperand_fn)(
+    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
+
+typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
+    ANeuralNetworksModel* model, int32_t index, const void* buffer,
+    size_t length);
+
+typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksModel_addOperation_fn)(
+    ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
+    uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
+    const uint32_t* outputs);
+
+typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
+    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+    uint32_t outputCount, const uint32_t* outputs);
+
+typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)(
+    ANeuralNetworksModel* model, bool allow);
+
+typedef int (*ANeuralNetworksExecution_create_fn)(
+    ANeuralNetworksCompilation* compilation,
+    ANeuralNetworksExecution** execution);
+
+typedef void (*ANeuralNetworksExecution_free_fn)(
+    ANeuralNetworksExecution* execution);
+
+typedef int (*ANeuralNetworksExecution_setInput_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setOutput_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, void* buffer, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksExecution_startCompute_fn)(
+    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
+
+typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event);
+
+typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
+
+typedef int (*ASharedMemory_create_fn)(const char* name, size_t size);
+
+#endif  // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8b9aed4226d9a1ad1817b38d62c5c3af726b2f6
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation.cc
@@ -0,0 +1,155 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdlib>
+
+#ifdef __ANDROID__
+#include <sys/system_properties.h>
+#endif  // __ANDROID__
+
+#define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
+
+namespace {
+
+#ifdef __ANDROID__
+int32_t GetAndroidSdkVersion() {
+  const char* sdkProp = "ro.build.version.sdk";
+  char sdkVersion[PROP_VALUE_MAX];
+  int length = __system_property_get(sdkProp, sdkVersion);
+  if (length != 0) {
+    int32_t result = 0;
+    for (int i = 0; i < length; ++i) {
+      int digit = sdkVersion[i] - '0';
+      if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher than expected;
+        return 0xffff;
+      }
+      result = result * 10 + digit;
+    }
+    return result;
+  }
+  return 0;
+}
+#endif  // __ANDROID__
+
+void* LoadFunction(void* handle, const char* name) {
+  if (handle == nullptr) {
+    return nullptr;
+  }
+  void* fn = dlsym(handle, name);
+  if (fn == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open function %s", name);
+  }
+  return fn;
+}
+
+#ifndef __ANDROID__
+// Add /dev/shm implementation of shared memory for non-Android platforms
+int ASharedMemory_create(const char* name, size_t size) {
+  int fd = shm_open(name, O_RDWR | O_CREAT, 0644);
+  if (fd < 0) {
+    return fd;
+  }
+  int result = ftruncate(fd, size);
+  if (result < 0) {
+    close(fd);
+    return -1;
+  }
+  return fd;
+}
+#endif  // __ANDROID__
+
+#define LOAD_FUNCTION(handle, name) \
+  nnapi.name = reinterpret_cast<name##_fn>(LoadFunction(handle, #name));
+
+const NnApi LoadNnApi() {
+  NnApi nnapi = {};
+  nnapi.android_sdk_version = 0;
+
+#ifdef __ANDROID__
+  void* libandroid = nullptr;
+  nnapi.android_sdk_version = GetAndroidSdkVersion();
+  if (nnapi.android_sdk_version < 27) {
+    NNAPI_LOG("nnapi error: requires android sdk version to be at least %d",
+              27);
+    nnapi.nnapi_exists = false;
+    return nnapi;
+  }
+  libandroid = dlopen("libandroid.so", RTLD_LAZY | RTLD_LOCAL);
+  if (libandroid == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", "libandroid.so");
+  }
+#endif  // __ANDROID__
+
+  void* libneuralnetworks = nullptr;
+  // TODO(b/123243014): change RTLD_LOCAL? Assumes there can be multiple
+  // instances of nn api RT
+  libneuralnetworks = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
+  if (libneuralnetworks == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", "libneuralnetworks.so");
+  }
+
+  nnapi.nnapi_exists = libneuralnetworks != nullptr;
+
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksMemory_createFromFd);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksMemory_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_finish);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_addOperand);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_setOperandValue);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_setOperandValueFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_addOperation);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_identifyInputsAndOutputs);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_relaxComputationFloat32toFloat16);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_setPreference);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_finish);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setInput);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setInputFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setOutput);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksExecution_setOutputFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_startCompute);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksEvent_wait);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksEvent_free);
+#ifdef __ANDROID__
+  LOAD_FUNCTION(libandroid, ASharedMemory_create);
+#else
+  nnapi.ASharedMemory_create = ASharedMemory_create;
+#endif  // __ANDROID__
+
+  return nnapi;
+}
+
+}  // namespace
+
+const NnApi* NnApiImplementation() {
+  static const NnApi nnapi = LoadNnApi();
+  return &nnapi;
+}
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h
new file mode 100644
index 0000000000000000000000000000000000000000..82d7cc75c1277b3cef2bc91aa00aee6033637b20
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation.h
@@ -0,0 +1,582 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
+#define TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
+
+struct NnApi {
+  bool nnapi_exists;
+  int32_t android_sdk_version;
+
+  /**
+   * Creates a shared memory object from a file descriptor.
+   *
+   * The shared memory is backed by a file descriptor via mmap.
+   * See {@link ANeuralNetworksMemory} for a description on how to use
+   * this shared memory.
+   *
+   * @param size The requested size in bytes.
+   *             Must not be larger than the file size.
+   * @param prot The desired memory protection for the mapping.
+   *             It is either PROT_NONE or the bitwise OR of one or
+   *             more of the following flags: PROT_READ, PROT_WRITE.
+   * @param fd The requested file descriptor.
+   *           The file descriptor has to be mmap-able. The file
+   *           descriptor will be duplicated.
+   * @param offset The offset to the beginning of the file of the area to map.
+   *               The offset has to be aligned to a page size.
+   * @param memory The memory object to be created.
+   *               Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+   */
+  int (*ANeuralNetworksMemory_createFromFd)(size_t size, int protect, int fd,
+                                            size_t offset,
+                                            ANeuralNetworksMemory** memory);
+
+  /**
+   * Delete a memory object.
+   *
+   * Destroys the object used by the run time to keep track of the memory.
+   * This will free the underlying actual memory if no other code has open
+   * handles to this memory.
+   *
+   * @param memory The memory object to be freed.
+   */
+  void (*ANeuralNetworksMemory_free)(ANeuralNetworksMemory* memory);
+
+  /**
+   * Create an empty {@link ANeuralNetworksModel}.
+   *
+   * <p>This only creates the object. Computation is performed once
+   * {@link ANeuralNetworksExecution_startCompute} is invoked.
+   *
+   * The model should be constructed with calls to
+   * {@link ANeuralNetworksModel_addOperation} and
+   * {@link ANeuralNetworksModel_addOperand}
+   *
+   * <p>{@link ANeuralNetworksModel_finish} should be called once the model
+   * has been fully constructed.</p>
+   *
+   * <p>{@link ANeuralNetworksModel_free} should be called once the model
+   * is no longer needed.</p>
+   *
+   * @param model The {@link ANeuralNetworksModel} to be created.
+   *              Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_create)(ANeuralNetworksModel** model);
+
+  /**
+   * Destroy a model.
+   *
+   * The model need not have been finished by a call to
+   * {@link ANeuralNetworksModel_finish}.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be destroyed. Passing NULL is acceptable and
+   *              results in no operation.
+   */
+  void (*ANeuralNetworksModel_free)(ANeuralNetworksModel* model);
+
+  /**
+   * Indicate that we have finished modifying a model. Required before
+   * calling {@link ANeuralNetworksCompilation_compile}.
+   *
+   * An application is responsible to make sure that no other thread uses
+   * the model at the same time.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be finished.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_finish)(ANeuralNetworksModel* model);
+
+  /**
+   * Add an operand to a model.
+   *
+   * The order in which the operands are added is important. The first one added
+   * to a model will have the index value 0, the second 1, etc. These indexes
+   * are used as operand identifiers in
+   * {@link ANeuralNetworksModel_addOperation},
+   * {@link ANeuralNetworksExecution_setInput},
+   * {@link ANeuralNetworksExecution_setInputFromMemory},
+   * {@link ANeuralNetworksExecution_setOutput},
+   * {@link ANeuralNetworksExecution_setOutputFromMemory} and
+   * {@link ANeuralNetworksExecution_setOperandValue}.
+   *
+   * To build a model that can accommodate inputs of various sizes, as you may
+   * want to do for a CNN, set the size of the dimensions that will vary at run
+   * time to 0. If you do so, provide the full dimensions when calling
+   * {@link ANeuralNetworksExecution_setInput} or {@link
+   * ANeuralNetworksExecution_setInputFromMemory}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param type The {@link ANeuralNetworksOperandType} that describes the shape
+   * of the operand.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_addOperand)(
+      ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
+
+  /**
+   * Sets an operand to a constant value.
+   *
+   * For scalar values, the content of buffer is copied into the model.
+   *
+   * For tensor values, a pointer to the buffer is stored within the model.
+   * The application is responsible for not changing the content of this region
+   * until all executions using this model have completed. As the data may
+   * be copied during processing, modifying the data after this call yields
+   * undefined results.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param buffer A pointer to the data to use.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandValue)(ANeuralNetworksModel* model,
+                                              int32_t index, const void* buffer,
+                                              size_t length);
+
+  /**
+   * Sets an operand to a value stored in a memory object.
+   *
+   * The content of the memory is not copied. A reference to that memory is
+   * stored inside the model. The application is responsible for not changing
+   * the content of the memory region until all executions using this model have
+   * completed.
+   * As the data may be copied during processing, modifying the data after this
+   * call yields undefined results.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param buffer A pointer to the data to use.
+   * @param memory The memory containing the data.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandValueFromMemory)(
+      ANeuralNetworksModel* model, int32_t index,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Add an operation to a model.
+   *
+   * @param model The model to be modified.
+   * @param type The type of the operation.
+   * @param inputCount The number of entries in the inputs array.
+   * @param inputs An array of indexes identifying each operand.
+   * @param outputCount The number of entries in the outputs array.
+   * @param outputs An array of indexes identifying each operand.
+   *
+   * The operands specified by inputs and outputs must have been
+   * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_addOperation)(ANeuralNetworksModel* model,
+                                           ANeuralNetworksOperationType type,
+                                           uint32_t inputCount,
+                                           const uint32_t* inputs,
+                                           uint32_t outputCount,
+                                           const uint32_t* outputs);
+
+  /**
+   * Specifies which operands will be the model's inputs and outputs.
+   *
+   * An operand cannot be used for both input and output. Doing so will
+   * return an error.
+   *
+   * @param model The model to be modified.
+   * @param inputCount The number of entries in the inputs array.
+   * @param inputs An array of indexes identifying the input operands.
+   * @param outputCount The number of entries in the outputs array.
+   * @param outputs An array of indexes identifying the output operands.
+   *
+   * The operands specified by inputs and outputs must have been
+   * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   */
+  int (*ANeuralNetworksModel_identifyInputsAndOutputs)(
+      ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+      uint32_t outputCount, const uint32_t* outputs);
+
+  /**
+   * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be
+   * calculated with range and/or precision as low as that of the
+   * IEEE 754 16-bit floating-point format. By default,
+   * {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using at least
+   * the range and precision of the IEEE 754 32-bit floating-point format.
+   *
+   * @param model The model to be modified.
+   * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be
+   *              calculated with range and/or precision as low as that of the
+   *              IEEE 754 16-bit floating point format. 'false' indicates
+   *              {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated
+   *              using at least the range and precision of the IEEE 754 32-bit
+   *              floating point format.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * Available since API level 28.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   */
+  int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16)(
+      ANeuralNetworksModel* model, bool allow);
+
+  /**
+   * Create a {@link ANeuralNetworksCompilation} to compile the given model.
+   * This only creates the object. Compilation is only performed once
+   * {@link ANeuralNetworksCompilation_start} is invoked.
+   *
+   * <p>The provided model must outlive the compilation.</p>
+   *
+   * The model must already have been finished by a call to
+   * {@link ANeuralNetworksModel_finish}.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param model The {@link ANeuralNetworksModel} to be compiled.
+   * @param compilation The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the model is invalid.
+   */
+  int (*ANeuralNetworksCompilation_create)(
+      ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
+
+  /**
+   * Destroy a compilation.
+   *
+   * <p>If called on a compilation for which
+   * {@link ANeuralNetworksCompilation_start} has been called, the
+   * function will return immediately but will mark the compilation to be
+   * deleted once the compilation completes. The
+   * {@link ANeuralNetworksCompilation_wait} will return ERROR_DELETED.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be destroyed. Passing NULL is
+   * acceptable and results in no operation.
+   */
+  void (*ANeuralNetworksCompilation_free)(
+      ANeuralNetworksCompilation* compilation);
+
+  /**
+   * Sets the execution preference.
+   *
+   * <p>Provides guidance to the runtime when trade-offs are possible.</p>
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be modified.
+   * @param preference Either {@link PREFER_LOW_POWER},
+   *                  {@link PREFER_SINGLE_FAST_ANSWER}, or
+   *                  {@link PREFER_SUSTAINED_SPEED}.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksCompilation_setPreference)(
+      ANeuralNetworksCompilation* compilation, int32_t preference);
+
+  /**
+   * Waits until the compilation completes.
+   *
+   * More than one thread can wait on a compilation. When the compilation
+   * completes, all threads will be released.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally.
+   */
+  int (*ANeuralNetworksCompilation_finish)(
+      ANeuralNetworksCompilation* compilation);
+
+  /**
+   * Create a {@link ANeuralNetworksExecution} to apply the given compilation.
+   * This only creates the object. Computation is only performed once
+   * {@link ANeuralNetworksExecution_startCompute} is invoked.
+   *
+   * <p>The provided compilation must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+   * @param execution The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the compilation is invalid.
+   */
+  int (*ANeuralNetworksExecution_create)(
+      ANeuralNetworksCompilation* compilation,
+      ANeuralNetworksExecution** execution);
+
+  /**
+   * Destroy an execution.
+   *
+   * <p>If called on an execution for which
+   * {@link ANeuralNetworksExecution_startCompute} has been called, the
+   * function will return immediately but will mark the execution to be deleted
+   * once the computation completes.   The {link ANeuralNetworksExecution_wait}
+   * will return ANEURALNETWORKS_ERROR_DELETED.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be destroyed. Passing NULL is acceptable
+   * and results in no operation.
+   */
+  void (*ANeuralNetworksExecution_free)(ANeuralNetworksExecution* execution);
+
+  /**
+   * Associate a user buffer with an input of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided buffer must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the input argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This should be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other properties of the type must be the same as
+   *             specified in the model. If the type is the same as specified
+   *             when the model was built, NULL can be passed.
+   * @param buffer The buffer containing the data.
+   * @param length The length in bytes of the buffer.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the input.
+   */
+  int (*ANeuralNetworksExecution_setInput)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type, const void* buffer,
+      size_t length);
+
+  /**
+   * Associate part of a memory object with an input of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided memory must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the input argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param memory The memory containing the data.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the input.
+   */
+  int (*ANeuralNetworksExecution_setInputFromMemory)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Associate a user buffer with an output of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided buffer must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the output argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param buffer The buffer where the data is to be written.
+   * @param length The length in bytes of the buffer.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the output.
+   */
+  int (*ANeuralNetworksExecution_setOutput)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type, void* buffer, size_t length);
+
+  /**
+   * Associate part of a memory object with an output of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided memory must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the output argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param memory The memory where the data is to be stored.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The length in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the output.
+   */
+  int (*ANeuralNetworksExecution_setOutputFromMemory)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Schedule evaluation of the execution.
+   *
+   * <p>Schedules evaluation of the execution. Once the model has been
+   * applied and the outputs are ready to be consumed, the execution will be
+   * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that
+   * signal.
+   * </p>
+   *
+   * Multiple executions can be scheduled and evaluated concurrently, and
+   * compilations can be performed concurrently with executions. The runtime
+   * makes no guarantee on the ordering of the completion of compilations and
+   * executions. If it's important to the application, the application should
+   * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and
+   * {@link ANeuralNetworksExecution_wait}.
+   *
+   * ANeuralNetworksExecution_wait must be called to recuperate the resources
+   * used by the execution.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be scheduled and executed.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_startCompute)(
+      ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
+
+  /**
+   * Waits until the execution completes.
+   *
+   * More than one thread can wait on an event. When the execution completes,
+   * all threads will be released.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   */
+  int (*ANeuralNetworksEvent_wait)(ANeuralNetworksEvent* event);
+
+  /**
+   * Destroys the event.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   */
+  void (*ANeuralNetworksEvent_free)(ANeuralNetworksEvent* event);
+
+  // ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI
+  // which was added in 8.1.
+  int (*ASharedMemory_create)(const char* name, size_t size);
+
+  /**/
+};
+
+/**
+ * Load the NNAPI implementation from the shared libraries.
+ * The NnApi structure is filled with all the pointers. If one function doesn't
+ * exist, a null pointer is stored.
+ */
+const NnApi* NnApiImplementation();
+
+#endif  // TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
diff --git a/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc b/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc78e53da64b209d53bfcfc97e194e7430f016c
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
@@ -0,0 +1,20 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+
+const NnApi* NnApiImplementation() {
+  static const NnApi nnapi = {};
+  return &nnapi;
+}
diff --git a/tensorflow/lite/nnapi/nnapi_implementation_test.cc b/tensorflow/lite/nnapi/nnapi_implementation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51fc404ff8f6f25170c6a7ef79b6623efa884ab1
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+#include <gtest/gtest.h>
+
+namespace {
+
+TEST(NnapiLibTest, NnApiImplementation) {
+  const NnApi* nnapi = NnApiImplementation();
+  EXPECT_NE(nnapi, nullptr);
+#ifdef __ANDROID__
+  EXPECT_GT(nnapi->android_sdk_version, 0);
+  if (nnapi.android_sdk_version < 27) {
+    EXPECT_FALSE(nnapi->nnapi_exists);
+    EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksMemory_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_finish, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+              nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksEvent_wait, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksEvent_free, nullptr);
+    EXPECT_EQ(nnapi->ASharedMemory_create, nullptr);
+  } else {
+    EXPECT_TRUE(nnapi->nnapi_exists);
+    EXPECT_NE(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksMemory_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_finish, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+    if (nnapi->android_sdk_version >= 28) {
+      // relaxComputationFloat32toFloat16 only available with Android 9.0 (P).
+      EXPECT_NE(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+                nullptr);
+    } else {
+      EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+                nullptr);
+    }
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksEvent_wait, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksEvent_free, nullptr);
+    EXPECT_NE(nnapi->ASharedMemory_create, nullptr);
+  }
+#else
+  EXPECT_FALSE(nnapi->nnapi_exists);
+  EXPECT_EQ(nnapi->android_sdk_version, 0);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_finish, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksEvent_wait, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksEvent_free, nullptr);
+  EXPECT_NE(nnapi->ASharedMemory_create, nullptr);
+#endif
+}
+
+}  // namespace
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index 26d75696a1c889d752f9715358701da6300f49df..f7cb1588e1768dfe2926f92563d0796e82e47296 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 #ifdef __ANDROID__
 #include <android/log.h>
@@ -84,56 +84,27 @@ void logError(const char* format, ...) {
 static const int64_t kOperandIdNotSet = -1;
 static const int64_t kOperandNotNeeded = -2;
 
-namespace {
-
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return 0xFFFF;
-      }
-    }
-    return atoi(sdkVersion);
-  }
-  FATAL("No %s prop", sdkProp);
-#endif  // __ANDROID__
-  return 0;
-}
-
-int32_t GetAndroidSdkVersionCached() {
-  static int32_t androidSdkVersion = GetAndroidSdkVersion();
-  return androidSdkVersion;
-}
-
-}  // namespace
-
 NNAPIAllocation::NNAPIAllocation(const char* filename,
                                  ErrorReporter* error_reporter)
     : MMAPAllocation(filename, error_reporter) {
   if (mmapped_buffer_ != MAP_FAILED)
-    CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
-                                                mmap_fd_, 0, &handle_));
+    CHECK_NN(NnApiImplementation()->ANeuralNetworksMemory_createFromFd(
+        buffer_size_bytes_, PROT_READ, mmap_fd_, 0, &handle_));
 }
 
 NNAPIAllocation::~NNAPIAllocation() {
   if (handle_) {
-    ANeuralNetworksMemory_free(handle_);
+    NnApiImplementation()->ANeuralNetworksMemory_free(handle_);
   }
 }
 
 NNAPIDelegate::~NNAPIDelegate() {
   if (nn_compiled_model_) {
-    ANeuralNetworksCompilation_free(nn_compiled_model_);
+    NnApiImplementation()->ANeuralNetworksCompilation_free(nn_compiled_model_);
     nn_compiled_model_ = nullptr;
   }
   if (nn_model_) {
-    ANeuralNetworksModel_free(nn_model_);
+    NnApiImplementation()->ANeuralNetworksModel_free(nn_model_);
     nn_model_ = nullptr;
     // TODO(aselle): Is this thread-safe and callable multiple times?
   }
@@ -145,6 +116,7 @@ TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
                                ANeuralNetworksModel* nn_model,
                                uint32_t* no_of_operands_added,
                                std::vector<int64_t>* nnapi_ids) {
+  const NnApi* nnapi = NnApiImplementation();
   uint32_t next_id = 0;
   for (size_t i = 0; i < subgraph->tensors_size(); i++) {
     // Skip temporaries and RNN back-edges.
@@ -198,24 +170,24 @@ TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
     RETURN_ERROR_IF_NN_FAILED(
-        ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+        nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
     // TODO(aselle): Based on Michael's suggestion, limiting this to read
     // only memory
     if (tensor->allocation_type == kTfLiteMmapRo) {
       if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
               static_cast<const Allocation*>(tensor->allocation))) {
         RETURN_ERROR_IF_NN_FAILED(
-            ANeuralNetworksModel_setOperandValueFromMemory(
+            nnapi->ANeuralNetworksModel_setOperandValueFromMemory(
                 nn_model, next_id, alloc->memory(),
                 alloc->offset(tensor->data.raw), tensor->bytes));
       } else {
-        RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
+        RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
             nn_model, next_id, tensor->data.raw, tensor->bytes));
       }
     } else if (tensor->bytes == 0) {
       // These size 0 tensors are optional tensors reserved.
-      RETURN_ERROR_IF_NN_FAILED(
-          ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
+      RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, nullptr, 0));
     }
 
     ++next_id;
@@ -244,6 +216,7 @@ TfLiteStatus AddOpsAndParams(
     uint32_t next_id, std::vector<int>* model_state_inputs,
     std::vector<int>* model_state_outputs,
     const std::vector<int64_t>& tensor_id_to_nnapi_id) {
+  const NnApi* nnapi = NnApiImplementation();
   for (size_t i = 0; i < subgraph->nodes_size(); i++) {
     const auto* node_and_registration = subgraph->node_and_registration(i);
     const TfLiteNode& node = node_and_registration->first;
@@ -258,21 +231,21 @@ TfLiteStatus AddOpsAndParams(
     MapAndAddTensorIds(node.outputs->data, node.outputs->size,
                        &augmented_outputs, tensor_id_to_nnapi_id);
 
-    auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+    auto add_scalar_int32 = [nnapi, &nn_model, &augmented_inputs,
                              &next_id](int value) {
       ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(int32_t)))
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, &value, sizeof(int32_t)))
       augmented_inputs.push_back(next_id++);
     };
 
-    auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+    auto add_scalar_float32 = [nnapi, &nn_model, &augmented_inputs,
                                &next_id](float value) {
       ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(float)))
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, &value, sizeof(float)))
       augmented_inputs.push_back(next_id++);
     };
 
@@ -281,8 +254,8 @@ TfLiteStatus AddOpsAndParams(
           .type = ANEURALNETWORKS_TENSOR_INT32,
           .dimensionCount = 1,
           .dimensions = &num_values};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
           nn_model, next_id, values, sizeof(int32_t) * num_values));
       augmented_inputs.push_back(next_id++);
     };
@@ -291,15 +264,16 @@ TfLiteStatus AddOpsAndParams(
     // For each state_out tensor, a corresponding state_in operand needs to be
     // created for NNAPI.
     auto duplicate_state_tensor_float32 =
-        [subgraph, &nn_model, &next_id, &augmented_inputs, &model_state_inputs,
-         &model_state_outputs](int tensor_id) {
+        [nnapi, subgraph, &nn_model, &next_id, &augmented_inputs,
+         &model_state_inputs, &model_state_outputs](int tensor_id) {
           const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
           ANeuralNetworksOperandType operand_type{
               ANEURALNETWORKS_TENSOR_FLOAT32,
               static_cast<uint32_t>(tensor->dims->size),
               reinterpret_cast<uint32_t*>(tensor->dims->data),
               tensor->params.scale, tensor->params.zero_point};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+          CHECK_NN(
+              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
           augmented_inputs.push_back(next_id);
           model_state_inputs->push_back(next_id);
           model_state_outputs->push_back(tensor_id);
@@ -388,7 +362,7 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [subgraph, &node, &nn_model,
+    auto add_lstm_scratch_tensor_float32 = [nnapi, subgraph, &node, &nn_model,
                                             &next_id, &augmented_outputs]() {
       if (node.temporaries->size == 0) return;
       int scratch_buffer_index = node.temporaries->data[0];
@@ -398,7 +372,7 @@ TfLiteStatus AddOpsAndParams(
           static_cast<uint32_t>(tensor->dims->size),
           reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
           tensor->params.zero_point};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
       augmented_outputs.insert(augmented_outputs.begin(), next_id++);
     };
 
@@ -427,15 +401,16 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // Handle optional input tensors.
-    auto add_optional_tensors = [&nn_model, &augmented_inputs,
+    auto add_optional_tensors = [nnapi, &nn_model, &augmented_inputs,
                                  &next_id](int nn_type) {
       for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
         if (augmented_inputs[idx] == kOptionalTensor) {
           const std::vector<uint32_t> dim = {0, 0};
           ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-          CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
-                                                        nullptr, 0))
+          CHECK_NN(
+              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+          CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+              nn_model, next_id, nullptr, 0))
           augmented_inputs[idx] = next_id++;
         }
       }
@@ -686,6 +661,9 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_MIRROR_PAD:
       case tflite::BuiltinOperator_ABS:
       case tflite::BuiltinOperator_SPLIT_V:
+      case tflite::BuiltinOperator_UNIQUE:
+      case tflite::BuiltinOperator_CEIL:
+      case tflite::BuiltinOperator_REVERSE_V2:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
@@ -695,13 +673,13 @@ TfLiteStatus AddOpsAndParams(
         break;
     }
 
-    if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
+    if (nnapi_version == 11 && nnapi->android_sdk_version < 28) {
       logError("Op %d needs NNAPI1.1", builtin);
       return kTfLiteError;
     }
 
     // Add the operation.
-    RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
+    RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_addOperation(
         nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
         augmented_inputs.data(),
         static_cast<uint32_t>(augmented_outputs.size()),
@@ -713,9 +691,10 @@ TfLiteStatus AddOpsAndParams(
 TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   if (nn_model_ && nn_compiled_model_) return model_status_;
 
+  const NnApi* nnapi = NnApiImplementation();
   // TODO(aselle): This is not correct. need to handle resize invalidation.
   if (!nn_model_) {
-    CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+    CHECK_NN(nnapi->ANeuralNetworksModel_create(&nn_model_));
 
     // Find which tensors should be added to NNAPI. TFLite has temporaries
     // and RNN back-edges which are are not valid for NNAPI. We look through all
@@ -762,21 +741,22 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
                        model_states_outputs_.size(), &augmented_outputs,
                        tensor_id_to_nnapi_id);
 
-    CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+    CHECK_NN(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs(
         nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
         static_cast<uint32_t>(augmented_outputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
 
-    if (GetAndroidSdkVersionCached() >= 28) {
-      CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+    if (nnapi->android_sdk_version >= 28) {
+      CHECK_NN(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
           nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
     }
-    CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+    CHECK_NN(nnapi->ANeuralNetworksModel_finish(nn_model_));
   }
   if (!nn_compiled_model_) {
-    CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
-    CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+    CHECK_NN(nnapi->ANeuralNetworksCompilation_create(nn_model_,
+                                                      &nn_compiled_model_));
+    CHECK_NN(nnapi->ANeuralNetworksCompilation_finish(nn_compiled_model_));
   }
   return kTfLiteOk;
 }
@@ -792,8 +772,10 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     return model_status_;
   }
 
+  const NnApi* nnapi = NnApiImplementation();
   ANeuralNetworksExecution* execution = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+  CHECK_NN(
+      nnapi->ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
 
   // Currently perform deep copy of input buffer
   for (size_t i = 0; i < subgraph->inputs().size(); i++) {
@@ -801,7 +783,7 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     // TODO(aselle): Is this what we want or do we want input instead?
     // TODO(aselle): This should be called setInputValue maybe to be cons.
     TfLiteTensor* tensor = subgraph->tensor(input);
-    CHECK_NN(ANeuralNetworksExecution_setInput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
@@ -809,7 +791,7 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   for (size_t i = 0; i < subgraph->outputs().size(); i++) {
     int output = subgraph->outputs()[i];
     TfLiteTensor* tensor = subgraph->tensor(output);
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
@@ -821,21 +803,21 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     // Here we are using a deep copy for state_in tensors so that we are not
     // reading and writing into the same buffer during a invocation.
     // TODO(miaowang): using double shared buffer to minimize the copies.
-    CHECK_NN(ANeuralNetworksExecution_setInput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
         execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
     // Tell NNAPI where to output the state_out.
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
         execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
   }
 
   // Currently use blocking compute.
   ANeuralNetworksEvent* event = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
-  CHECK_NN(ANeuralNetworksEvent_wait(event));
-  ANeuralNetworksEvent_free(event);
-  ANeuralNetworksExecution_free(execution);
+  CHECK_NN(nnapi->ANeuralNetworksExecution_startCompute(execution, &event));
+  CHECK_NN(nnapi->ANeuralNetworksEvent_wait(event));
+  nnapi->ANeuralNetworksEvent_free(event);
+  nnapi->ANeuralNetworksExecution_free(execution);
 
 #if 0
   printf("From the NN API:\n");
@@ -853,6 +835,8 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   return kTfLiteOk;
 }
 
-bool NNAPIDelegate::IsSupported() { return NNAPIExists(); }
+bool NNAPIDelegate::IsSupported() {
+  return NnApiImplementation()->nnapi_exists;
+}
 
 }  // namespace tflite
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
index 52ea6fe636247ec0a4d5fedb41c56fc095e6ac61..bbc252045baad0316333bf9bc19dd78b8bd58590 100644
--- a/tensorflow/lite/profiling/BUILD
+++ b/tensorflow/lite/profiling/BUILD
@@ -2,6 +2,7 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 common_copts = [
@@ -41,6 +42,17 @@ cc_library(
     copts = common_copts,
 )
 
+cc_test(
+    name = "time_test",
+    srcs = ["time_test.cc"],
+    copts = common_copts,
+    deps = [
+        ":time",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "profile_summarizer",
     srcs = ["profile_summarizer.cc"],
@@ -54,15 +66,14 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "profile_summarizer_test",
     srcs = ["profile_summarizer_test.cc"],
-    copts = common_copts,
+    extra_copts = common_copts,
     deps = [
         ":profile_summarizer",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
-        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:test_util",
         "//tensorflow/lite/testing:util",
diff --git a/tensorflow/lite/profiling/time.cc b/tensorflow/lite/profiling/time.cc
index 3e7db03d9d8df1eeb0c82d388324716c5e7d7896..32eb30070fb7d882cf7fd206fcfca2f81a09cfff 100644
--- a/tensorflow/lite/profiling/time.cc
+++ b/tensorflow/lite/profiling/time.cc
@@ -16,8 +16,10 @@ limitations under the License.
 
 #if defined(_MSC_VER)
 #include <chrono>  // NOLINT(build/c++11)
+#include <thread>  // NOLINT(build/c++11)
 #else
 #include <sys/time.h>
+#include <time.h>
 #endif
 
 namespace tflite {
@@ -32,12 +34,24 @@ uint64_t NowMicros() {
       .count();
 }
 
+void SleepForMicros(uint64_t micros) {
+  std::this_thread::sleep_for(std::chrono::microseconds(micros));
+}
+
 #else
 
 uint64_t NowMicros() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
-  return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  return static_cast<uint64_t>(tv.tv_sec) * 1e6 + tv.tv_usec;
+}
+
+void SleepForMicros(uint64_t micros) {
+  timespec sleep_time;
+  sleep_time.tv_sec = micros / 1e6;
+  micros -= sleep_time.tv_sec * 1e6;
+  sleep_time.tv_nsec = micros * 1e3;
+  nanosleep(&sleep_time, nullptr);
 }
 
 #endif  // defined(_MSC_VER)
diff --git a/tensorflow/lite/profiling/time.h b/tensorflow/lite/profiling/time.h
index 66233a480fd390619629e26a05284202057e0f4a..c7527ad0d2943e048518c78cf7375a65857c8dfe 100644
--- a/tensorflow/lite/profiling/time.h
+++ b/tensorflow/lite/profiling/time.h
@@ -21,6 +21,7 @@ namespace tflite {
 namespace profiling {
 namespace time {
 uint64_t NowMicros();
+void SleepForMicros(uint64_t micros);
 }  // namespace time
 }  // namespace profiling
 }  // namespace tflite
diff --git a/tensorflow/lite/profiling/time_test.cc b/tensorflow/lite/profiling/time_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6f08479adeb9311f7cf098f64edd8f3656928eeb
--- /dev/null
+++ b/tensorflow/lite/profiling/time_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/profiling/time.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+
+namespace tflite {
+namespace profiling {
+namespace time {
+
+TEST(TimeTest, NowMicros) {
+  auto now0 = NowMicros();
+  EXPECT_GT(now0, 0);
+  auto now1 = NowMicros();
+  EXPECT_GE(now1, now0);
+}
+
+TEST(TimeTest, SleepForMicros) {
+  // A zero sleep shouldn't cause issues.
+  SleepForMicros(0);
+
+  // Sleeping should be reflected in the current time.
+  auto now0 = NowMicros();
+  SleepForMicros(50);
+  auto now1 = NowMicros();
+  EXPECT_GE(now1, now0 + 50);
+
+  // Sleeping more than a second should function properly.
+  now0 = NowMicros();
+  SleepForMicros(1e6 + 50);
+  now1 = NowMicros();
+  EXPECT_GE(now1, now0 + 1e6 + 50);
+}
+
+}  // namespace time
+}  // namespace profiling
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index acf827892bfd0081f1bbc7d0c3fa4f65af3a0817..1fa8494a4c7723bcb1888270c6dc30db420233a8 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -61,6 +61,7 @@ py_library(
         ":lite_constants",
         ":op_hint",
         "//tensorflow/python:graph_util",
+        "//tensorflow/python:tf_optimizer",
         "//tensorflow/python/keras",
         "//tensorflow/python/saved_model:constants",
         "//tensorflow/python/saved_model:loader",
@@ -71,6 +72,7 @@ py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
     data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -115,8 +117,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform",
@@ -145,7 +145,6 @@ py_library(
     srcs = ["convert_saved_model.py"],
     srcs_version = "PY2AND3",
     visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index cf49ee2b472d2c6617811cde0978eb8ae3a16f8e..e270abaa5afa0f2b3bb255e896c706794277c26e 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
 from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
+from tensorflow.python.framework.graph_util_impl import _node_name
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -389,6 +390,29 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       convert.convert_dtype_to_tflite_type(dtypes.bool)
 
+  def testFindHintedOutputNodes(self):
+    """Test if all hinted output nodes are correctly found."""
+
+    def _build_ophinted_op(name, input1, input2):
+      custom_op = op_hint.OpHint(name)
+      input1 = custom_op.add_input(input1)
+      input2 = custom_op.add_input(input2)
+      output = math_ops.mul(input1, input2)
+      return custom_op.add_output(output)
+
+    output_1 = _build_ophinted_op("custom_op_1", array_ops.constant([1.]),
+                                  array_ops.constant([2.]))
+    output_2 = _build_ophinted_op("custom_op_2", array_ops.constant([3.]),
+                                  array_ops.constant([4.]))
+    with self.cached_session() as sess:
+      hinted_outputs_nodes = op_hint.find_all_hinted_output_nodes(sess)
+      expected_hinted_output_nodes = [
+          _node_name(output_1.name),
+          _node_name(output_2.name)
+      ]
+      self.assertEqual(
+          len(hinted_outputs_nodes), len(expected_hinted_output_nodes))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py
index a6183d13b56c787aac0d9d9fc190eff277eb4c8e..a1325f0b1ff8bec11f0ad90846154401b1bb0134 100644
--- a/tensorflow/lite/python/interpreter.py
+++ b/tensorflow/lite/python/interpreter.py
@@ -19,20 +19,32 @@ from __future__ import print_function
 
 import sys
 import numpy as np
-from tensorflow.python.util.lazy_loader import LazyLoader
-from tensorflow.python.util.tf_export import tf_export as _tf_export
-
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies. Must use double quotes to match code internal rewrite
-# rule.
-# pylint: disable=g-inconsistent-quotes
-_interpreter_wrapper = LazyLoader(
-    "_interpreter_wrapper", globals(),
-    "tensorflow.lite.python.interpreter_wrapper."
-    "tensorflow_wrap_interpreter_wrapper")
-# pylint: enable=g-inconsistent-quotes
-
-del LazyLoader
+
+# pylint: disable=g-import-not-at-top
+try:
+  from tensorflow.python.util.lazy_loader import LazyLoader
+  from tensorflow.python.util.tf_export import tf_export as _tf_export
+
+  # Lazy load since some of the performance benchmark skylark rules
+  # break dependencies. Must use double quotes to match code internal rewrite
+  # rule.
+  # pylint: disable=g-inconsistent-quotes
+  _interpreter_wrapper = LazyLoader(
+      "_interpreter_wrapper", globals(),
+      "tensorflow.lite.python.interpreter_wrapper."
+      "tensorflow_wrap_interpreter_wrapper")
+  # pylint: enable=g-inconsistent-quotes
+
+  del LazyLoader
+except ImportError:
+  # When full Tensorflow Python PIP is not available do not use lazy load
+  # and instead uf the tflite_runtime path.
+  from tflite_runtime.lite.python import interpreter_wrapper as _interpreter_wrapper
+
+  def tf_export_dummy(*x, **kwargs):
+    del x, kwargs
+    return lambda x: x
+  _tf_export = tf_export_dummy
 
 
 @_tf_export('lite.Interpreter')
diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
index 767a9fc476398dd8fb60128f73f8ae7c518d9a21..6de6fb48f78022652de3cd2cecf7fe51c39e79f6 100644
--- a/tensorflow/lite/python/interpreter_wrapper/BUILD
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -11,6 +11,8 @@ cc_library(
     srcs = ["interpreter_wrapper.cc"],
     hdrs = ["interpreter_wrapper.h"],
     deps = [
+        ":python_error_reporter",
+        ":python_utils",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:builtin_ops",
         "//third_party/py/numpy:headers",
@@ -19,6 +21,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "python_error_reporter",
+    srcs = ["python_error_reporter.cc"],
+    hdrs = ["python_error_reporter.h"],
+    deps = [
+        "//tensorflow/lite/core/api",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+cc_library(
+    name = "python_utils",
+    srcs = ["python_utils.cc"],
+    hdrs = ["python_utils.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 tf_py_wrap_cc(
     name = "tensorflow_wrap_interpreter_wrapper",
     srcs = [
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index d14af439ec0ab600ea260da17ef0041cca25d629..9ccaabbfe97a3b024abc1080de75f2434b64962e 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
 
 // Disallow Numpy 1.7 deprecated symbols.
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
@@ -60,36 +62,6 @@ limitations under the License.
 namespace tflite {
 namespace interpreter_wrapper {
 
-class PythonErrorReporter : public tflite::ErrorReporter {
- public:
-  PythonErrorReporter() {}
-
-  // Report an error message
-  int Report(const char* format, va_list args) override {
-    char buf[1024];
-    int formatted = vsnprintf(buf, sizeof(buf), format, args);
-    buffer_ << buf;
-    return formatted;
-  }
-
-  // Set's a Python runtime exception with the last error.
-  PyObject* exception() {
-    std::string last_message = message();
-    PyErr_SetString(PyExc_RuntimeError, last_message.c_str());
-    return nullptr;
-  }
-
-  // Gets the last error message and clears the buffer.
-  std::string message() {
-    std::string value = buffer_.str();
-    buffer_.clear();
-    return value;
-  }
-
- private:
-  std::stringstream buffer_;
-};
-
 namespace {
 
 // Calls PyArray's initialization to initialize all the API pointers. Note that
@@ -114,61 +86,6 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
   return interpreter;
 }
 
-int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
-  switch (tf_lite_type) {
-    case kTfLiteFloat32:
-      return NPY_FLOAT32;
-    case kTfLiteInt32:
-      return NPY_INT32;
-    case kTfLiteInt16:
-      return NPY_INT16;
-    case kTfLiteUInt8:
-      return NPY_UINT8;
-    case kTfLiteInt8:
-      return NPY_INT8;
-    case kTfLiteInt64:
-      return NPY_INT64;
-    case kTfLiteString:
-      return NPY_OBJECT;
-    case kTfLiteBool:
-      return NPY_BOOL;
-    case kTfLiteComplex64:
-      return NPY_COMPLEX64;
-    case kTfLiteNoType:
-      return NPY_NOTYPE;
-      // Avoid default so compiler errors created when new types are made.
-  }
-  return NPY_NOTYPE;
-}
-
-TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
-  int pyarray_type = PyArray_TYPE(array);
-  switch (pyarray_type) {
-    case NPY_FLOAT32:
-      return kTfLiteFloat32;
-    case NPY_INT32:
-      return kTfLiteInt32;
-    case NPY_INT16:
-      return kTfLiteInt16;
-    case NPY_UINT8:
-      return kTfLiteUInt8;
-    case NPY_INT8:
-      return kTfLiteInt8;
-    case NPY_INT64:
-      return kTfLiteInt64;
-    case NPY_BOOL:
-      return kTfLiteBool;
-    case NPY_OBJECT:
-    case NPY_STRING:
-    case NPY_UNICODE:
-      return kTfLiteString;
-    case NPY_COMPLEX64:
-      return kTfLiteComplex64;
-      // Avoid default so compiler errors created when new types are made.
-  }
-  return kTfLiteNoType;
-}
-
 struct PyDecrefDeleter {
   void operator()(PyObject* p) const { Py_DECREF(p); }
 };
@@ -307,7 +224,7 @@ PyObject* InterpreterWrapper::TensorType(int i) const {
     return nullptr;
   }
 
-  int code = TfLiteTypeToPyArrayType(tensor->type);
+  int code = python_utils::TfLiteTypeToPyArrayType(tensor->type);
   if (code == -1) {
     PyErr_Format(PyExc_ValueError, "Invalid tflite type code %d", code);
     return nullptr;
@@ -352,12 +269,12 @@ PyObject* InterpreterWrapper::SetTensor(int i, PyObject* value) {
   PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
   const TfLiteTensor* tensor = interpreter_->tensor(i);
 
-  if (TfLiteTypeFromPyArray(array) != tensor->type) {
+  if (python_utils::TfLiteTypeFromPyArray(array) != tensor->type) {
     PyErr_Format(PyExc_ValueError,
                  "Cannot set tensor:"
                  " Got tensor of type %d"
                  " but expected type %d for input %d ",
-                 TfLiteTypeFromPyArray(array), tensor->type, i);
+                 python_utils::TfLiteTypeFromPyArray(array), tensor->type, i);
     return nullptr;
   }
 
@@ -400,7 +317,7 @@ PyObject* CheckGetTensorArgs(Interpreter* interpreter_, int tensor_index,
     return nullptr;
   }
 
-  *type_num = TfLiteTypeToPyArrayType((*tensor)->type);
+  *type_num = python_utils::TfLiteTypeToPyArrayType((*tensor)->type);
   if (*type_num == -1) {
     PyErr_SetString(PyExc_ValueError, "Unknown tensor type.");
     return nullptr;
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
index f52ef1eeca7db397d84d249b74445a3276bc65fb..ef4b28f04723ab8d7f4f395a028bb565b4ca9cf3 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
 %}
 
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..803a4c29345a44bcdba41d851884fa86d6e87d3e
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+// Report an error message
+int PythonErrorReporter::Report(const char* format, va_list args) {
+  char buf[1024];
+  int formatted = vsnprintf(buf, sizeof(buf), format, args);
+  buffer_ << buf;
+  return formatted;
+}
+
+// Set's a Python runtime exception with the last error.
+PyObject* PythonErrorReporter::exception() {
+  std::string last_message = message();
+  PyErr_SetString(PyExc_RuntimeError, last_message.c_str());
+  return nullptr;
+}
+
+// Gets the last error message and clears the buffer.
+std::string PythonErrorReporter::message() {
+  std::string value = buffer_.str();
+  buffer_.clear();
+  return value;
+}
+}  // namespace interpreter_wrapper
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d4e308834a21b795644f0c1f89607a3b75ad7ce
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
+
+#include <Python.h>
+
+#include <sstream>
+#include <string>
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+class PythonErrorReporter : public tflite::ErrorReporter {
+ public:
+  PythonErrorReporter() {}
+
+  // Report an error message
+  int Report(const char* format, va_list args) override;
+
+  // Sets a Python runtime exception with the last error and
+  // clears the error message buffer.
+  PyObject* exception();
+
+  // Gets the last error message and clears the buffer.
+  std::string message();
+
+ private:
+  std::stringstream buffer_;
+};
+
+}  // namespace interpreter_wrapper
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2dc604356ab7d312f13e5c432fd3a8b05bbbc14c
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+
+namespace tflite {
+namespace python_utils {
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
+  switch (tf_lite_type) {
+    case kTfLiteFloat32:
+      return NPY_FLOAT32;
+    case kTfLiteInt32:
+      return NPY_INT32;
+    case kTfLiteInt16:
+      return NPY_INT16;
+    case kTfLiteUInt8:
+      return NPY_UINT8;
+    case kTfLiteInt8:
+      return NPY_INT8;
+    case kTfLiteInt64:
+      return NPY_INT64;
+    case kTfLiteString:
+      return NPY_OBJECT;
+    case kTfLiteBool:
+      return NPY_BOOL;
+    case kTfLiteComplex64:
+      return NPY_COMPLEX64;
+    case kTfLiteNoType:
+      return NPY_NOTYPE;
+      // Avoid default so compiler errors created when new types are made.
+  }
+  return NPY_NOTYPE;
+}
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
+  int pyarray_type = PyArray_TYPE(array);
+  switch (pyarray_type) {
+    case NPY_FLOAT32:
+      return kTfLiteFloat32;
+    case NPY_INT32:
+      return kTfLiteInt32;
+    case NPY_INT16:
+      return kTfLiteInt16;
+    case NPY_UINT8:
+      return kTfLiteUInt8;
+    case NPY_INT8:
+      return kTfLiteInt8;
+    case NPY_INT64:
+      return kTfLiteInt64;
+    case NPY_BOOL:
+      return kTfLiteBool;
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE:
+      return kTfLiteString;
+    case NPY_COMPLEX64:
+      return kTfLiteComplex64;
+      // Avoid default so compiler errors created when new types are made.
+  }
+  return kTfLiteNoType;
+}
+
+}  // namespace python_utils
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.h b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..30a44226b8fb1a26ced71f3128dc363ec864e9bd
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
+
+#include "tensorflow/lite/context.h"
+
+// Disallow Numpy 1.7 deprecated symbols.
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+
+#include <Python.h>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+
+namespace tflite {
+namespace python_utils {
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type);
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array);
+
+}  // namespace python_utils
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 1b20ff2f92b6a84c21972ccccbc27ec6f999d74b..3b0aa02b7c1c5215908c86b35525566669a0cd30 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -53,19 +53,53 @@ from tensorflow.lite.python.interpreter import Interpreter  # pylint: disable=un
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import OpHint  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2 as _rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2 as _config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2 as _meta_graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
 from tensorflow.python.framework import graph_util as _tf_graph_util
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework.errors_impl import NotFoundError as _NotFoundError
 from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
+from tensorflow.python.grappler import tf_optimizer as _tf_optimizer
 from tensorflow.python.lib.io import file_io as _file_io
 from tensorflow.python.saved_model import signature_constants as _signature_constants
 from tensorflow.python.saved_model import tag_constants as _tag_constants
+from tensorflow.python.training.saver import export_meta_graph as _export_meta_graph
 from tensorflow.python.util import deprecation as _deprecation
 from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 
+def _run_graph_optimizations(graph_def, input_arrays, output_arrays):
+  """Apply standard TensorFlow optimizations to the graph_def.
+
+  Args:
+    graph_def: Frozen GraphDef to be optimized.
+    input_arrays: List of arrays that are considered inputs of the graph.
+    output_arrays: List of arrays that are considered outputs of the graph.
+
+  Returns:
+    A new, optimized GraphDef.
+  """
+  meta_graph = _export_meta_graph(graph_def=graph_def)
+
+  # We need to add a collection called 'train_op' so that grappler
+  # knows what the outputs are.
+  fetch_collection = _meta_graph_pb2.CollectionDef()
+  for array in input_arrays + output_arrays:
+    fetch_collection.node_list.value.append(array.name)
+  meta_graph.collection_def["train_op"].CopyFrom(fetch_collection)
+
+  config = _config_pb2.ConfigProto()
+  rewrite_options = config.graph_options.rewrite_options
+  rewrite_options.layout_optimizer = _rewriter_config_pb2.RewriterConfig.ON
+  # Avoid remapping as it creates ops like _FusedConv2D, which are not
+  # supported by TF Lite.
+  rewrite_options.remapping = _rewriter_config_pb2.RewriterConfig.OFF
+  return _tf_optimizer.OptimizeGraph(config, meta_graph)
+
+
 @_tf_export("lite.TFLiteConverter")
 class TFLiteConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
@@ -401,15 +435,16 @@ class TFLiteConverter(object):
     if self._has_valid_tensors():
       for tensor in self._input_tensors:
         shape = tensor.get_shape()
-        if not shape or not shape.as_list():
+        if not shape:
           raise ValueError("Provide an input shape for input array "
                            "'{0}'.".format(_tensor_name(tensor)))
+        # Note that shape_list might be empty for scalar shapes.
         shape_list = shape.as_list()
         if None in shape_list[1:]:
           raise ValueError(
               "None is only supported in the 1st dimension. Tensor '{0}' has "
               "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
-        elif shape_list[0] is None:
+        elif shape_list and shape_list[0] is None:
           self._set_batch_size(batch_size=1)
 
     # Get quantization stats. Ensures there is one stat per name if the stats
@@ -446,16 +481,26 @@ class TFLiteConverter(object):
         "dump_graphviz_video": self.dump_graphviz_video
     }
 
+    optimized_graph = None
+    if self.inference_type == constants.QUANTIZED_UINT8:
+      optimized_graph = self._graph_def
+    else:
+      try:
+        optimized_graph = _run_graph_optimizations(
+            self._graph_def, self._input_tensors, self._output_tensors)
+      except Exception:
+        optimized_graph = self._graph_def
+
     # Converts model.
     if self._has_valid_tensors():
       result = _toco_convert_impl(
-          input_data=self._graph_def,
+          input_data=optimized_graph,
           input_tensors=self._input_tensors,
           output_tensors=self._output_tensors,
           **converter_kwargs)
     else:
       result = _toco_convert_graph_def(
-          input_data=self._graph_def,
+          input_data=optimized_graph,
           input_arrays_with_shape=self._input_arrays_with_shape,
           output_arrays=self._output_arrays,
           **converter_kwargs)
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index 1f9c768b4441cc1385d93285d26eeee9b651ca83..83fd56bf1d2617b7132d0eb2314c80460e968c18 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -113,6 +113,35 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+  def testString(self):
+    in_tensor = array_ops.placeholder(shape=[4], dtype=dtypes.string)
+    out_tensor = array_ops.reshape(in_tensor, shape=[2, 2])
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.object_, input_details[0]['dtype'])
+    self.assertTrue(([4] == input_details[0]['shape']).all())
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('Reshape', output_details[0]['name'])
+    self.assertEqual(np.object_, output_details[0]['dtype'])
+    self.assertTrue(([2, 2] == output_details[0]['shape']).all())
+    # TODO(b/122659643): Test setting/getting string data via the python
+    # interpreter API after support has been added.
+
   def testQuantization(self):
     in_tensor_1 = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA')
@@ -223,18 +252,42 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
                      str(error.exception))
 
-  def testSizeEmptyInvalid(self):
+  def testScalarValid(self):
+    # Construct a graph using a scalar (empty shape) input.
     in_tensor = array_ops.placeholder(dtype=dtypes.float32, shape=[])
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
 
-    # Test empty shape.
+    # Test conversion with the scalar input shape.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
-    with self.assertRaises(ValueError) as error:
-      converter.convert()
-    self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
-                     str(error.exception))
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([] == input_details[0]['shape']).all())
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([] == input_details[0]['shape']).all())
+
+    # Validate inference using the scalar inputs/outputs.
+    test_input = np.array(4.0, dtype=np.float32)
+    expected_output = np.array(8.0, dtype=np.float32)
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
 
   def testSizeInvalid(self):
     in_tensor = array_ops.placeholder(
diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py
index 8d7f9316bfe81255510fc5aca9ffdf9671cd64df..8df37c111b949ac1e373434ff666cf11c0aed6d6 100644
--- a/tensorflow/lite/python/op_hint.py
+++ b/tensorflow/lite/python/op_hint.py
@@ -36,9 +36,7 @@ Example:
   session = tf.Session()
 
   graphdef_to_convert = tf.lite.convert_op_hints_to_stubs(session)
-  tflite_graph = tf.lite.toco_convert(graphdef_to_convert,
-                                              [image], [output])
-                                              [image], [output])
+  tflite_graph = tf.lite.toco_convert(graphdef_to_convert, [image], [output])
   with open("/tmp/graph.fb", "wb") as fp:
     fp.write(tflite_graph)
 
@@ -964,6 +962,35 @@ def _convert_op_hints_to_stubs_helper(
   return curr_graph_def
 
 
+def find_all_hinted_output_nodes(session=None, graph_def=None):
+  """Find all Ophints output nodes in the graph.
+
+  This is used to get all the output nodes those are ophinted, it is important
+  for operation like convert_variables_to_constants keep all ophints structure.
+  Note: only one of session or graph_def should be used, not both.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+    graph_def: A graph def that we should convert.
+
+  Returns:
+    A list of OpHints output nodes.
+  Raises:
+    ValueError: If both session and graph_def are provided.
+  """
+  if session is not None and graph_def is not None:
+    raise ValueError("Provide only one of session and graph_def.")
+  hinted_outputs_nodes = []
+  if session is not None:
+    hints = _find_all_hints_in_graph_def(session.graph_def)
+  elif graph_def is not None:
+    hints = _find_all_hints_in_graph_def(graph_def)
+  for hint in _six.itervalues(hints):
+    _, ouput_nodes = hint.flattened_inputs_and_outputs()
+    hinted_outputs_nodes.extend(ouput_nodes)
+  return hinted_outputs_nodes
+
+
 def convert_op_hints_to_stubs(session=None,
                               graph_def=None,
                               write_callback=lambda graph_def, comments: None):
@@ -996,6 +1023,7 @@ def convert_op_hints_to_stubs(session=None,
 
 
 _allowed_symbols = [
-    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new"
+    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new",
+    "find_all_hinted_output_nodes"
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
index 341b539bead296ca28c1f5f8c17928e553ebabc4..401a592273c9c76f1f371bb8972f7f9a3d494278 100644
--- a/tensorflow/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -343,13 +343,13 @@ def run_main(_):
             "floats. Used for quantized input tensors. (default None)"))
   parser.add_argument(
       "--default_ranges_min",
-      type=int,
+      type=float,
       help=("Default value for min bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
   parser.add_argument(
       "--default_ranges_max",
-      type=int,
+      type=float,
       help=("Default value for max bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
index 69d5458c6e432a2370a2ca4998a5d4664398c528..ea516764c929080bc42e48a7cfcdd171f2d6cc57 100644
--- a/tensorflow/lite/schema/BUILD
+++ b/tensorflow/lite/schema/BUILD
@@ -70,7 +70,6 @@ flatbuffer_cc_library(
         "--no-union-value-namespacing",
         "--gen-object-api",
     ],
-    gen_reflections = True,
     out_prefix = "reflection/",
 )
 
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 91d8049301b235624d924c023eb1dd29c5e86689..fbcb18fbebc7256080917f845677ca1f6e7683e7 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -64,9 +64,20 @@ table QuantizationParameters {
   scale:[float];  // For dequantizing the tensor's values.
   zero_point:[long];
 
-  // If this is not none, the quantization parameters above are ignored and the
-  // value of the QuantizationDetails union below should be used.
+  // If this is not none, the other quantization parameters (i.e. min, max,
+  // scale, zero_point fields above) are ignored and the value of the
+  // QuantizationDetails union should be used.
   details:QuantizationDetails;
+
+  // Specifies the dimension of the Tensor's shape that the scales and
+  // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1]
+  // with quantization params:
+  //   scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantization_dimension=1
+  // will be quantized across the second dimension of t.
+  //   t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1
+  //   t[:, 1, :, :] will have scale[1]=2.0, zero_point[0]=2
+  //   t[:, 2, :, :] will have scale[2]=3.0, zero_point[0]=3
+  quantized_dimension:int;
 }
 
 table Tensor {
@@ -205,6 +216,9 @@ enum BuiltinOperator : byte {
   MIRROR_PAD = 100,
   ABS = 101,
   SPLIT_V = 102,
+  UNIQUE = 103,
+  CEIL = 104,
+  REVERSE_V2 = 105,
 }
 
 // Options for the builtin operators.
@@ -288,6 +302,8 @@ union BuiltinOptions {
   MirrorPadOptions,
   AbsOptions,
   SplitVOptions,
+  UniqueOptions,
+  ReverseV2Options,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -701,6 +717,13 @@ table MirrorPadOptions {
   mode:MirrorPadMode;
 }
 
+table UniqueOptions {
+  idx_out_type:TensorType = INT32;
+}
+
+table ReverseV2Options {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index 0883cce497d663c6f5eed768564d7a8624f7295e..6ad7df050e752f37af005f9e3d4ca77349d7d58f 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -268,6 +268,12 @@ struct SquaredDifferenceOptionsT;
 struct MirrorPadOptions;
 struct MirrorPadOptionsT;
 
+struct UniqueOptions;
+struct UniqueOptionsT;
+
+struct ReverseV2Options;
+struct ReverseV2OptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -520,11 +526,14 @@ enum BuiltinOperator {
   BuiltinOperator_MIRROR_PAD = 100,
   BuiltinOperator_ABS = 101,
   BuiltinOperator_SPLIT_V = 102,
+  BuiltinOperator_UNIQUE = 103,
+  BuiltinOperator_CEIL = 104,
+  BuiltinOperator_REVERSE_V2 = 105,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_SPLIT_V
+  BuiltinOperator_MAX = BuiltinOperator_REVERSE_V2
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[105] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -627,7 +636,10 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
     BuiltinOperator_SQUARED_DIFFERENCE,
     BuiltinOperator_MIRROR_PAD,
     BuiltinOperator_ABS,
-    BuiltinOperator_SPLIT_V
+    BuiltinOperator_SPLIT_V,
+    BuiltinOperator_UNIQUE,
+    BuiltinOperator_CEIL,
+    BuiltinOperator_REVERSE_V2
   };
   return values;
 }
@@ -737,6 +749,9 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "MIRROR_PAD",
     "ABS",
     "SPLIT_V",
+    "UNIQUE",
+    "CEIL",
+    "REVERSE_V2",
     nullptr
   };
   return names;
@@ -828,11 +843,13 @@ enum BuiltinOptions {
   BuiltinOptions_MirrorPadOptions = 77,
   BuiltinOptions_AbsOptions = 78,
   BuiltinOptions_SplitVOptions = 79,
+  BuiltinOptions_UniqueOptions = 80,
+  BuiltinOptions_ReverseV2Options = 81,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_SplitVOptions
+  BuiltinOptions_MAX = BuiltinOptions_ReverseV2Options
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[82] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -913,7 +930,9 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
     BuiltinOptions_SquaredDifferenceOptions,
     BuiltinOptions_MirrorPadOptions,
     BuiltinOptions_AbsOptions,
-    BuiltinOptions_SplitVOptions
+    BuiltinOptions_SplitVOptions,
+    BuiltinOptions_UniqueOptions,
+    BuiltinOptions_ReverseV2Options
   };
   return values;
 }
@@ -1000,6 +1019,8 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "MirrorPadOptions",
     "AbsOptions",
     "SplitVOptions",
+    "UniqueOptions",
+    "ReverseV2Options",
     nullptr
   };
   return names;
@@ -1330,6 +1351,14 @@ template<> struct BuiltinOptionsTraits<SplitVOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions;
 };
 
+template<> struct BuiltinOptionsTraits<UniqueOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UniqueOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ReverseV2Options> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ReverseV2Options;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1993,6 +2022,22 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_SplitVOptions ?
       reinterpret_cast<const SplitVOptionsT *>(value) : nullptr;
   }
+  UniqueOptionsT *AsUniqueOptions() {
+    return type == BuiltinOptions_UniqueOptions ?
+      reinterpret_cast<UniqueOptionsT *>(value) : nullptr;
+  }
+  const UniqueOptionsT *AsUniqueOptions() const {
+    return type == BuiltinOptions_UniqueOptions ?
+      reinterpret_cast<const UniqueOptionsT *>(value) : nullptr;
+  }
+  ReverseV2OptionsT *AsReverseV2Options() {
+    return type == BuiltinOptions_ReverseV2Options ?
+      reinterpret_cast<ReverseV2OptionsT *>(value) : nullptr;
+  }
+  const ReverseV2OptionsT *AsReverseV2Options() const {
+    return type == BuiltinOptions_ReverseV2Options ?
+      reinterpret_cast<const ReverseV2OptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -2314,7 +2359,9 @@ struct QuantizationParametersT : public flatbuffers::NativeTable {
   std::vector<float> scale;
   std::vector<int64_t> zero_point;
   QuantizationDetailsUnion details;
-  QuantizationParametersT() {
+  int32_t quantized_dimension;
+  QuantizationParametersT()
+      : quantized_dimension(0) {
   }
 };
 
@@ -2326,7 +2373,8 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
     VT_SCALE = 8,
     VT_ZERO_POINT = 10,
     VT_DETAILS_TYPE = 12,
-    VT_DETAILS = 14
+    VT_DETAILS = 14,
+    VT_QUANTIZED_DIMENSION = 16
   };
   const flatbuffers::Vector<float> *min() const {
     return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
@@ -2350,6 +2398,9 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   const CustomQuantization *details_as_CustomQuantization() const {
     return details_type() == QuantizationDetails_CustomQuantization ? static_cast<const CustomQuantization *>(details()) : nullptr;
   }
+  int32_t quantized_dimension() const {
+    return GetField<int32_t>(VT_QUANTIZED_DIMENSION, 0);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyOffset(verifier, VT_MIN) &&
@@ -2363,6 +2414,7 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
            VerifyField<uint8_t>(verifier, VT_DETAILS_TYPE) &&
            VerifyOffset(verifier, VT_DETAILS) &&
            VerifyQuantizationDetails(verifier, details(), details_type()) &&
+           VerifyField<int32_t>(verifier, VT_QUANTIZED_DIMENSION) &&
            verifier.EndTable();
   }
   QuantizationParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2395,6 +2447,9 @@ struct QuantizationParametersBuilder {
   void add_details(flatbuffers::Offset<void> details) {
     fbb_.AddOffset(QuantizationParameters::VT_DETAILS, details);
   }
+  void add_quantized_dimension(int32_t quantized_dimension) {
+    fbb_.AddElement<int32_t>(QuantizationParameters::VT_QUANTIZED_DIMENSION, quantized_dimension, 0);
+  }
   explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2414,8 +2469,10 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
     flatbuffers::Offset<flatbuffers::Vector<float>> scale = 0,
     flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0,
     QuantizationDetails details_type = QuantizationDetails_NONE,
-    flatbuffers::Offset<void> details = 0) {
+    flatbuffers::Offset<void> details = 0,
+    int32_t quantized_dimension = 0) {
   QuantizationParametersBuilder builder_(_fbb);
+  builder_.add_quantized_dimension(quantized_dimension);
   builder_.add_details(details);
   builder_.add_zero_point(zero_point);
   builder_.add_scale(scale);
@@ -2432,7 +2489,8 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD
     const std::vector<float> *scale = nullptr,
     const std::vector<int64_t> *zero_point = nullptr,
     QuantizationDetails details_type = QuantizationDetails_NONE,
-    flatbuffers::Offset<void> details = 0) {
+    flatbuffers::Offset<void> details = 0,
+    int32_t quantized_dimension = 0) {
   return tflite::CreateQuantizationParameters(
       _fbb,
       min ? _fbb.CreateVector<float>(*min) : 0,
@@ -2440,7 +2498,8 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD
       scale ? _fbb.CreateVector<float>(*scale) : 0,
       zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0,
       details_type,
-      details);
+      details,
+      quantized_dimension);
 }
 
 flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -7021,6 +7080,100 @@ inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(
 
 flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct UniqueOptionsT : public flatbuffers::NativeTable {
+  typedef UniqueOptions TableType;
+  TensorType idx_out_type;
+  UniqueOptionsT()
+      : idx_out_type(TensorType_INT32) {
+  }
+};
+
+struct UniqueOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef UniqueOptionsT NativeTableType;
+  enum {
+    VT_IDX_OUT_TYPE = 4
+  };
+  TensorType idx_out_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IDX_OUT_TYPE, 2));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IDX_OUT_TYPE) &&
+           verifier.EndTable();
+  }
+  UniqueOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(UniqueOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<UniqueOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct UniqueOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_idx_out_type(TensorType idx_out_type) {
+    fbb_.AddElement<int8_t>(UniqueOptions::VT_IDX_OUT_TYPE, static_cast<int8_t>(idx_out_type), 2);
+  }
+  explicit UniqueOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  UniqueOptionsBuilder &operator=(const UniqueOptionsBuilder &);
+  flatbuffers::Offset<UniqueOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<UniqueOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType idx_out_type = TensorType_INT32) {
+  UniqueOptionsBuilder builder_(_fbb);
+  builder_.add_idx_out_type(idx_out_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ReverseV2OptionsT : public flatbuffers::NativeTable {
+  typedef ReverseV2Options TableType;
+  ReverseV2OptionsT() {
+  }
+};
+
+struct ReverseV2Options FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ReverseV2OptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  ReverseV2OptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ReverseV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReverseV2Options> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ReverseV2OptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit ReverseV2OptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ReverseV2OptionsBuilder &operator=(const ReverseV2OptionsBuilder &);
+  flatbuffers::Offset<ReverseV2Options> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ReverseV2Options>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  ReverseV2OptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -7391,6 +7544,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const SplitVOptions *builtin_options_as_SplitVOptions() const {
     return builtin_options_type() == BuiltinOptions_SplitVOptions ? static_cast<const SplitVOptions *>(builtin_options()) : nullptr;
   }
+  const UniqueOptions *builtin_options_as_UniqueOptions() const {
+    return builtin_options_type() == BuiltinOptions_UniqueOptions ? static_cast<const UniqueOptions *>(builtin_options()) : nullptr;
+  }
+  const ReverseV2Options *builtin_options_as_ReverseV2Options() const {
+    return builtin_options_type() == BuiltinOptions_ReverseV2Options ? static_cast<const ReverseV2Options *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7738,6 +7897,14 @@ template<> inline const SplitVOptions *Operator::builtin_options_as<SplitVOption
   return builtin_options_as_SplitVOptions();
 }
 
+template<> inline const UniqueOptions *Operator::builtin_options_as<UniqueOptions>() const {
+  return builtin_options_as_UniqueOptions();
+}
+
+template<> inline const ReverseV2Options *Operator::builtin_options_as<ReverseV2Options>() const {
+  return builtin_options_as_ReverseV2Options();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -8181,6 +8348,7 @@ inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const
   { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } };
   { auto _e = details_type(); _o->details.type = _e; };
   { auto _e = details(); if (_e) _o->details.value = QuantizationDetailsUnion::UnPack(_e, details_type(), _resolver); };
+  { auto _e = quantized_dimension(); _o->quantized_dimension = _e; };
 }
 
 inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -8197,6 +8365,7 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
   auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
   auto _details_type = _o->details.type;
   auto _details = _o->details.Pack(_fbb);
+  auto _quantized_dimension = _o->quantized_dimension;
   return tflite::CreateQuantizationParameters(
       _fbb,
       _min,
@@ -8204,7 +8373,8 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
       _scale,
       _zero_point,
       _details_type,
-      _details);
+      _details,
+      _quantized_dimension);
 }
 
 inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -10356,6 +10526,55 @@ inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers:
       _mode);
 }
 
+inline UniqueOptionsT *UniqueOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new UniqueOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void UniqueOptions::UnPackTo(UniqueOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = idx_out_type(); _o->idx_out_type = _e; };
+}
+
+inline flatbuffers::Offset<UniqueOptions> UniqueOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUniqueOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UniqueOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _idx_out_type = _o->idx_out_type;
+  return tflite::CreateUniqueOptions(
+      _fbb,
+      _idx_out_type);
+}
+
+inline ReverseV2OptionsT *ReverseV2Options::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ReverseV2OptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ReverseV2Options::UnPackTo(ReverseV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<ReverseV2Options> ReverseV2Options::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReverseV2Options(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ReverseV2OptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateReverseV2Options(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -10930,6 +11149,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2Options *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -11264,6 +11491,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2Options *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -11586,6 +11821,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const SplitVOptionsT *>(value);
       return CreateSplitVOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptionsT *>(value);
+      return CreateUniqueOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2OptionsT *>(value);
+      return CreateReverseV2Options(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -11908,6 +12151,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new SplitVOptionsT(*reinterpret_cast<SplitVOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_UniqueOptions: {
+      value = new UniqueOptionsT(*reinterpret_cast<UniqueOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      value = new ReverseV2OptionsT(*reinterpret_cast<ReverseV2OptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -12310,6 +12561,16 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<UniqueOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<ReverseV2OptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/string_util_test.cc b/tensorflow/lite/string_util_test.cc
index cbf1d7b226af20251d5f70a354a21f1eb40ae1c6..6fc7de90ea534f9c8c4f61b4607ff7d2d8647d00 100644
--- a/tensorflow/lite/string_util_test.cc
+++ b/tensorflow/lite/string_util_test.cc
@@ -35,8 +35,11 @@ TEST(StringUtil, TestStringUtil) {
 
   char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'X', 'Y', 'Z'};
 
-  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, {}, data,
-                                          15);
+  TfLiteQuantization quant;
+  quant.type = kTfLiteNoQuantization;
+  quant.params = nullptr;
+  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, quant,
+                                          data, 15);
   TfLiteTensor* t2 = interpreter.tensor(2);
   interpreter.AllocateTensors();
 
diff --git a/tensorflow/lite/testdata/multi_add.json b/tensorflow/lite/testdata/multi_add.json
index 97b931dba8b1050ecf91939d1d9dcea5e0ea56fb..ae559255a85300bfacf5c3658b2915ce7738f5b7 100644
--- a/tensorflow/lite/testdata/multi_add.json
+++ b/tensorflow/lite/testdata/multi_add.json
@@ -1,46 +1,131 @@
 {
-  "version": 1,
+  "version": 3,
   "operator_codes": [
     {
-      "builtin_code": "ADD"
     }
   ],
   "subgraphs": [
     {
       "tensors": [
-        { "shape": [ 1, 8, 8, 3 ], "name": "a" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "b" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "c" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "d" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "i" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "x" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "y" }
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "a"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "b"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "c"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "d"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "i"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "x"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "y"
+        }
+      ],
+      "inputs": [
+        0,
+        1,
+        2,
+        3
+      ],
+      "outputs": [
+        5,
+        6
       ],
-      "inputs": [ 0, 1, 2, 3 ],
-      "outputs": [ 5, 6 ],
       "operators": [
         {
-          "inputs": [ 1, 2 ],
-          "outputs": [ 4 ],
+          "inputs": [
+            1,
+            2
+          ],
+          "outputs": [
+            4
+          ],
           "builtin_options_type": "AddOptions",
           "builtin_options": {
           }
         },
         {
-          "inputs": [ 0, 4 ],
-          "outputs": [ 5 ],
+          "inputs": [
+            0,
+            4
+          ],
+          "outputs": [
+            5
+          ],
           "builtin_options_type": "AddOptions",
           "builtin_options": {
           }
         },
         {
-          "inputs": [ 3, 4 ],
-          "outputs": [ 6 ],
+          "inputs": [
+            3,
+            4
+          ],
+          "outputs": [
+            6
+          ],
           "builtin_options_type": "AddOptions",
           "builtin_options": {
           }
         }
       ]
     }
+  ],
+  "buffers": [
+    {
+      "data": [
+
+      ]
+    }
   ]
 }
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index 22ffed43cc0e08ac45a9a07077450d2642ba7f26..3840b52a0ad6a639d67d8240ddfa9177e776c31e 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -12,6 +12,7 @@ load(
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
     "tf_cc_test",
     "py_test",
 )
@@ -165,6 +166,7 @@ cc_library(
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/delegates/flex:delegate",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:reference_ops",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -229,13 +231,13 @@ cc_test(
     ],
 )
 
-cc_binary(
+tf_cc_binary(
     name = "nnapi_example",
     srcs = ["nnapi_example.cc"],
     deps = [
         ":parse_testdata_lib",
         ":tflite_driver",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
     ],
 )
 
@@ -256,7 +258,7 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "tf_driver_test",
     size = "small",
     srcs = ["tf_driver_test.cc"],
@@ -285,7 +287,7 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "generate_testspec_test",
     size = "small",
     srcs = ["generate_testspec_test.cc"],
@@ -379,7 +381,7 @@ tf_cc_test(
     ],
 )
 
-cc_binary(
+tf_cc_binary(
     name = "tflite_diff",
     srcs = ["tflite_diff_example_test.cc"],
     deps = [
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index dd7b3d07456fbd9943e9f45b815e6015f4973a94..3aaaf545dfa14388c43a7db1565c0a92b118efb0 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -1424,6 +1424,36 @@ def make_conv_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+# Note: This is a regression test for a bug (b/122651451) that Toco incorrectly
+# erases the reduction indices array while it's shared with other ops.
+def make_l2norm_shared_epsilon_tests(zip_path):
+  """Regression test for a bug (b/122651451)."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[5, 7]],
+      "dim": [1],
+      "epsilon": [1e-8],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    epsilon = tf.constant(parameters["epsilon"])
+    out1 = tf.nn.l2_normalize(input_tensor, parameters["dim"], epsilon=epsilon)
+    out2 = tf.nn.l2_normalize(input_tensor, parameters["dim"], epsilon=epsilon)
+    out = out1 + out2
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Note: This is a regression test for a bug (b/112436267) that Toco incorrectly
 # fuses weights when multiple Conv2D/FULLY_CONNECTED ops share the same constant
 # weight tensor.
@@ -3012,6 +3042,31 @@ def make_floor_tests(zip_path):
     out = tf.floor(input_value)
     return [input_value], [out]
 
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(outputs, feed_dict={inputs[0]: input_value})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_ceil_tests(zip_path):
+  """Make a set of tests to do ceil."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    """Build the ceil op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.ceil(input_value)
+    return [input_value], [out]
+
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
@@ -3222,6 +3277,48 @@ def make_slice_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_conv2d_transpose_tests(zip_path):
+  """Make a set of tests to do transpose_conv."""
+
+  test_parameters = [{
+      "input_shape": [[1, 50, 54, 3]],
+      "filter_shape": [[1, 1, 8, 3], [1, 2, 8, 3], [1, 3, 8, 3], [1, 4, 8, 3]],
+      "output_shape": [[1, 100, 108, 8]],
+  }]
+
+  def build_graph(parameters):
+    """Build a transpose_conv graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+
+    filter_tensor = tf.placeholder(
+        dtype=tf.float32, name="filter", shape=parameters["filter_shape"])
+
+    out = tf.nn.conv2d_transpose(
+        input_tensor,
+        filter_tensor,
+        output_shape=parameters["output_shape"],
+        padding="SAME",
+        strides=(1, 2, 2, 1))
+
+    input_tensors = [input_tensor, filter_tensor]
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(np.float32, parameters["input_shape"]),
+        create_tensor_data(np.float32, parameters["filter_shape"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=4)
+
+
 # Since compute output_shape is fairly complicated for
 # tf.nn.conv2d_backprop_input input_sizes argument, so we here first perform a
 # "conv2d" operation to get the output, then we use the output to feed in
@@ -3749,6 +3846,86 @@ def make_placeholder_with_default_tests(zip_path):
                     expected_tf_success=3)
 
 
+def make_unique_tests(zip_path):
+  """Make a set of tests for Unique op."""
+
+  test_parameters = [
+      {
+          "input_shape": [[1]],
+          "index_type": [tf.int32, tf.int64, None],
+          "input_values": [3]
+      },
+      {
+          "input_shape": [[5]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[3, 2, 1, 2, 3]]
+      },
+      {
+          "input_shape": [[7]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[1, 1, 1, 1, 1, 1, 1]]
+      },
+      {
+          "input_shape": [[5]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[3, 2, 1, 0, -1]]
+      }]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.int32, name="input", shape=parameters["input_shape"])
+    if parameters["index_type"] is None:
+      output = tf.unique(input_tensor)
+    else:
+      output = tf.unique(input_tensor, parameters["index_type"])
+
+    return [input_tensor], output
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [create_tensor_data(tf.int32, parameters["input_shape"])]
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=9)
+
+
+def make_reverse_v2_tests(zip_path):
+  """Make a set of tests to do reverse_v2."""
+
+  test_parameters = [{
+      "base_shape": [[3, 4, 3], [3, 4], [5, 6, 7, 8]],
+      "axis": [0, 1, 2, 3],
+  }]
+
+  def get_valid_axis(parameters):
+    """Return a tweaked version of 'axis'."""
+    axis = parameters["axis"]
+    shape = parameters["base_shape"][:]
+    while axis > len(shape) - 1:
+      axis -= 1
+    return axis
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name=("input"), shape=parameters["base_shape"])
+    outs = tf.reverse(input_tensor, axis=[get_valid_axis(parameters)])
+    return [input_tensor], [outs]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(np.float32, shape=parameters["base_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index a9a31ad088e6f4b0297ba313c585abbe6189728b..45bd59a67d10baf61ad981f2fef29e948c2e77d2 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -107,6 +107,28 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/strided_slice_buggy)", "119786029"},
 };
 
+// Additional list of tests that are expected to fail when
+//   --test_arg=--ignore_known_bugs=false
+// and
+//   --test_arg=--use_nnapi=true
+// Note that issues related to lack of NNAPI support for a particular op are
+// handled separately; this list is specifically for broken cases where
+// execution produces broken output.
+// Key is a substring of the test name and value is a bug number.
+std::map<string, string> kBrokenNnapiTests = {
+    // Certain NNAPI kernels silently fail with int32 types.
+    {R"(^\/add.*dtype=tf\.int32)", "122987564"},
+    {R"(^\/concat.*dtype=tf\.int32)", "122987564"},
+    {R"(^\/mul.*dtype=tf\.int32)", "122987564"},
+    {R"(^\/space_to_depth.*dtype=tf\.int32)", "122987564"},
+
+    // Certain NNAPI fully_connected shape permutations fail.
+    {R"(^\/fully_connected_constant_filter=True.*shape1=\[3,3\])", "122987564"},
+    {R"(^\/fully_connected_constant_filter=True.*shape1=\[4,4\])", "122987564"},
+    {R"(^\/fully_connected.*shape1=\[3,3\].*transpose_b=True)", "122987564"},
+    {R"(^\/fully_connected.*shape1=\[4,4\].*shape2=\[4,1\])", "122987564"},
+};
+
 // Allows test data to be unarchived into a temporary directory and makes
 // sure those temporary directories are removed later.
 class ArchiveEnvironment : public ::testing::Environment {
@@ -242,8 +264,13 @@ TEST_P(OpsTest, RunZipTests) {
   tflite::testing::TfLiteDriver test_driver(FLAGS_use_nnapi);
   test_driver.SetModelBaseDir(tflite_dir);
 
+  auto broken_tests = kBrokenTests;
+  if (FLAGS_use_nnapi) {
+    broken_tests.insert(kBrokenNnapiTests.begin(), kBrokenNnapiTests.end());
+  }
+
   string bug_number;
-  for (const auto& p : kBrokenTests) {
+  for (const auto& p : broken_tests) {
     if (RE2::PartialMatch(test_name, p.first)) {
       bug_number = p.second;
     }
diff --git a/tensorflow/lite/testing/join.h b/tensorflow/lite/testing/join.h
index d1c314608687f045b346cc5526ea46c8149c2755..d10d2909b5ec4a269fd1a67d7a22f4c1e76f707e 100644
--- a/tensorflow/lite/testing/join.h
+++ b/tensorflow/lite/testing/join.h
@@ -24,7 +24,21 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-// Join a list of data separated by delimiter.
+// Join a list of data with default precision separated by delimiter.
+template <typename T>
+string JoinDefault(T* data, size_t len, const string& delimiter) {
+  if (len == 0 || data == nullptr) {
+    return "";
+  }
+  std::stringstream result;
+  result << data[0];
+  for (int i = 1; i < len; i++) {
+    result << delimiter << data[i];
+  }
+  return result.str();
+}
+
+// Join a list of data with fixed precision separated by delimiter.
 template <typename T>
 string Join(T* data, size_t len, const string& delimiter) {
   if (len == 0 || data == nullptr) {
diff --git a/tensorflow/lite/testing/join_test.cc b/tensorflow/lite/testing/join_test.cc
index 0b3c07f37e14e3815ac1eb4acd0aefac3515064c..476a7f20591691ccddff6829c894c640608f6471 100644
--- a/tensorflow/lite/testing/join_test.cc
+++ b/tensorflow/lite/testing/join_test.cc
@@ -26,6 +26,11 @@ TEST(JoinTest, JoinInt) {
   EXPECT_EQ(Join(data.data(), data.size(), ","), "1,2,3");
 }
 
+TEST(JoinDefaultTest, JoinFloat) {
+  float data[] = {1.0, -3, 2.3, 1e-5};
+  EXPECT_EQ(JoinDefault(data, 4, " "), "1 -3 2.3 1e-05");
+}
+
 TEST(JoinTest, JoinFloat) {
   float data[] = {1.0, -3, 2.3, 1e-5};
   EXPECT_EQ(Join(data, 4, " "), "1 -3 2.29999995 9.99999975e-06");
diff --git a/tensorflow/lite/testing/nnapi_example.cc b/tensorflow/lite/testing/nnapi_example.cc
index 22df8dbd8821436ab9a960d0acd4423278c078d8..309cb19628cd54a39ea926a6f3506cf570ff3679 100644
--- a/tensorflow/lite/testing/nnapi_example.cc
+++ b/tensorflow/lite/testing/nnapi_example.cc
@@ -25,11 +25,14 @@ limitations under the License.
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include <string>
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 #include "tensorflow/lite/testing/parse_testdata.h"
 #include "tensorflow/lite/testing/tflite_driver.h"
 
-string dirname(const string& s) { return s.substr(0, s.find_last_of("/")); }
+std::string dirname(const std::string& s) {
+  return s.substr(0, s.find_last_of("/"));
+}
 
 bool Interpret(const char* examples_filename, bool use_nnapi) {
   std::ifstream tflite_stream(examples_filename);
@@ -65,14 +68,14 @@ int main(int argc, char* argv[]) {
     return 1;
   }
 
-  string base_dir = dirname(argv[1]);
+  std::string base_dir = dirname(argv[1]);
   DIR* dir = opendir(base_dir.c_str());
   if (dir == nullptr) {
     fprintf(stderr, "Can't open dir %s\n", base_dir.c_str());
     return 1;
   }
   while (struct dirent* ent = readdir(dir)) {
-    string name = ent->d_name;
+    std::string name = ent->d_name;
     if (name.rfind(".txt") == name.length() - 4) {
       printf("%s: ", name.c_str());
       if (Interpret((base_dir + "/" + name).c_str(), use_nnapi)) {
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index 363d162d56a1670821d29768bc36411bf22d61e9..e79704d616cf59585228851b91c2e93259d84c0b 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -93,7 +93,7 @@ TEST(TfDriverTest, SimpleTest) {
                    {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
 
   runner->LoadModel(
-      "third_party/tensorflow/lite/testdata/multi_add.pb");
+      "tensorflow/lite/testdata/multi_add.pb");
   EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
 
   ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index 4e11d49f252818f9f7024b8bbafa8b17ad77ad48..a637dc86c020d4e16fb4fc02e9f62e8dec6a3a25 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "absl/strings/escaping.h"
 #include "tensorflow/lite/builtin_op_data.h"
 #include "tensorflow/lite/delegates/flex/delegate.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/split.h"
 
@@ -77,32 +79,7 @@ class TfLiteDriver::Expectation {
     SetTensorData(values, &data_);
   }
 
-  template <>
-  void SetData<string>(const string& csv_values) {
-    string s = absl::HexStringToBytes(csv_values);
-    data_.raw = new char[s.size()];
-    memcpy(data_.raw, s.data(), s.size());
-  }
-
-  bool Check(bool verbose, const TfLiteTensor& tensor) {
-    switch (tensor.type) {
-      case kTfLiteFloat32:
-        return TypedCheck<float>(verbose, tensor);
-      case kTfLiteInt32:
-        return TypedCheck<int32_t>(verbose, tensor);
-      case kTfLiteInt64:
-        return TypedCheck<int64_t>(verbose, tensor);
-      case kTfLiteUInt8:
-        return TypedCheck<uint8_t>(verbose, tensor);
-      case kTfLiteBool:
-        return TypedCheck<bool>(verbose, tensor);
-      case kTfLiteString:
-        return TypedCheck<string>(verbose, tensor);
-      default:
-        fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
-        return false;
-    }
-  }
+  bool Check(bool verbose, const TfLiteTensor& tensor);
 
  private:
   template <typename T>
@@ -144,52 +121,87 @@ class TfLiteDriver::Expectation {
     return good_output;
   }
 
-  template <>
-  bool TypedCheck<string>(bool verbose, const TfLiteTensor& tensor) {
-    if (tensor.data.raw == nullptr) {
+  TfLitePtrUnion data_;
+  size_t num_elements_;
+};
+
+template <>
+void TfLiteDriver::Expectation::SetData<string>(const string& csv_values) {
+  string s = absl::HexStringToBytes(csv_values);
+  data_.raw = new char[s.size()];
+  memcpy(data_.raw, s.data(), s.size());
+}
+
+template <>
+bool TfLiteDriver::Expectation::TypedCheck<string>(bool verbose,
+                                                   const TfLiteTensor& tensor) {
+  if (tensor.data.raw == nullptr) {
+    if (verbose) {
+      std::cerr << "  got empty string" << std::endl;
+    }
+    return false;
+  }
+  int expected_num_strings = GetStringCount(data_.raw);
+  int returned_num_strings = GetStringCount(tensor.data.raw);
+  if (expected_num_strings != returned_num_strings) {
+    if (verbose) {
+      std::cerr << "  string count differ: got " << returned_num_strings
+                << ", but expected " << expected_num_strings << std::endl;
+    }
+    return false;
+  }
+  for (int i = 0; i < returned_num_strings; ++i) {
+    auto expected_ref = GetString(data_.raw, i);
+    auto returned_ref = GetString(tensor.data.raw, i);
+    if (expected_ref.len != returned_ref.len) {
       if (verbose) {
-        std::cerr << "  got empty string" << std::endl;
+        std::cerr << "  index " << i << ": got string of size "
+                  << returned_ref.len << ", but expected size "
+                  << expected_ref.len << std::endl;
       }
       return false;
     }
-    int expected_num_strings = GetStringCount(data_.raw);
-    int returned_num_strings = GetStringCount(tensor.data.raw);
-    if (expected_num_strings != returned_num_strings) {
+    if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
       if (verbose) {
-        std::cerr << "  string count differ: got " << returned_num_strings
-                  << ", but expected " << expected_num_strings << std::endl;
+        std::cerr << "  index " << i << ": strings are different" << std::endl;
       }
       return false;
     }
-    for (int i = 0; i < returned_num_strings; ++i) {
-      auto expected_ref = GetString(data_.raw, i);
-      auto returned_ref = GetString(tensor.data.raw, i);
-      if (expected_ref.len != returned_ref.len) {
-        if (verbose) {
-          std::cerr << "  index " << i << ": got string of size "
-                    << returned_ref.len << ", but expected size "
-                    << expected_ref.len << std::endl;
-        }
-        return false;
-      }
-      if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
-        if (verbose) {
-          std::cerr << "  index " << i << ": strings are different"
-                    << std::endl;
-        }
-        return false;
-      }
-    }
-
-    return true;
   }
 
-  TfLitePtrUnion data_;
-  size_t num_elements_;
-};
+  return true;
+}
+
+bool TfLiteDriver::Expectation::Check(bool verbose,
+                                      const TfLiteTensor& tensor) {
+  switch (tensor.type) {
+    case kTfLiteFloat32:
+      return TypedCheck<float>(verbose, tensor);
+    case kTfLiteInt32:
+      return TypedCheck<int32_t>(verbose, tensor);
+    case kTfLiteInt64:
+      return TypedCheck<int64_t>(verbose, tensor);
+    case kTfLiteUInt8:
+      return TypedCheck<uint8_t>(verbose, tensor);
+    case kTfLiteBool:
+      return TypedCheck<bool>(verbose, tensor);
+    case kTfLiteString:
+      return TypedCheck<string>(verbose, tensor);
+    default:
+      fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
+      return false;
+  }
+}
 
-TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
+TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name,
+                           bool reference_kernel)
     : use_nnapi_(use_nnapi) {
+  if (reference_kernel) {
+    resolver_.reset(new ops::builtin::BuiltinRefOpResolver);
+  } else {
+    resolver_.reset(new ops::builtin::BuiltinOpResolver);
+  }
+
   if (delegate_name == "FLEX") {
     delegate_ = FlexDelegate::Create();
   }
@@ -221,8 +233,7 @@ void TfLiteDriver::LoadModel(const string& bin_file_path) {
     Invalidate("Failed to mmap model " + bin_file_path);
     return;
   }
-  ops::builtin::BuiltinOpResolver builtins;
-  InterpreterBuilder(*model_, builtins)(&interpreter_);
+  InterpreterBuilder(*model_, *resolver_)(&interpreter_);
   if (!interpreter_) {
     Invalidate("Failed build interpreter");
     return;
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index 1da0533c57cf51f442253f28b6d9ba13078ef9a7..537f20dfbfd6c6fe0fbefd854358146129d33b7a 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -16,10 +16,12 @@ limitations under the License.
 #define TENSORFLOW_LITE_TESTING_TFLITE_DRIVER_H_
 
 #include <map>
+#include <memory>
 
 #include "tensorflow/lite/delegates/flex/delegate.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/testing/test_runner.h"
 
@@ -29,7 +31,8 @@ namespace testing {
 // A test runner that feeds inputs into TF Lite and verifies its outputs.
 class TfLiteDriver : public TestRunner {
  public:
-  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "");
+  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "",
+                        bool reference_kernel = false);
   ~TfLiteDriver() override;
 
   void LoadModel(const string& bin_file_path) override;
@@ -65,6 +68,7 @@ class TfLiteDriver : public TestRunner {
 
   class Expectation;
 
+  std::unique_ptr<OpResolver> resolver_;
   std::unique_ptr<FlexDelegate> delegate_;
   bool use_nnapi_ = false;
   std::unique_ptr<FlatBufferModel> model_;
diff --git a/tensorflow/lite/testing/tflite_driver_test.cc b/tensorflow/lite/testing/tflite_driver_test.cc
index 6e953e5e19b8f6cac1a4349145b03a7f8b5e1969..81bf6700cb898796a72bea38ea0711556a7215a5 100644
--- a/tensorflow/lite/testing/tflite_driver_test.cc
+++ b/tensorflow/lite/testing/tflite_driver_test.cc
@@ -56,6 +56,40 @@ TEST(TfliteDriverTest, SimpleTest) {
   ASSERT_TRUE(runner->CheckResults());
 }
 
+TEST(TfliteDriverTest, SingleAddOpTest) {
+  std::unique_ptr<TestRunner> runner(new TfLiteDriver(
+      /*use_nnapi*/ false, /*delegate*/ "", /*reference_kernel*/ true));
+
+  runner->SetModelBaseDir("tensorflow/lite");
+  runner->LoadModel("testdata/multi_add.bin");
+  ASSERT_TRUE(runner->IsValid());
+
+  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
+  ASSERT_THAT(runner->GetOutputs(), ElementsAre(5, 6));
+
+  for (int i : {0, 1, 2, 3}) {
+    runner->ReshapeTensor(i, "1,2,2,1");
+  }
+  ASSERT_TRUE(runner->IsValid());
+
+  runner->AllocateTensors();
+
+  runner->SetInput(0, "0.1,0.2,0.3,0.4");
+  runner->SetInput(1, "0.001,0.002,0.003,0.004");
+  runner->SetInput(2, "0.001,0.002,0.003,0.004");
+  runner->SetInput(3, "0.01,0.02,0.03,0.04");
+
+  runner->ResetTensor(2);
+
+  runner->SetExpectation(5, "0.101,0.202,0.303,0.404");
+  runner->SetExpectation(6, "0.011,0.022,0.033,0.044");
+
+  runner->Invoke();
+  ASSERT_TRUE(runner->IsValid());
+
+  ASSERT_TRUE(runner->CheckResults());
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 93d41fcae14c8130de87471bdce64edad131c11f..47880c1f0b026beaceba61132d7aaa6db7a7e8de 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -192,6 +192,7 @@ cc_library(
         "graph_transformations/fuse_binary_into_preceding_affine.cc",
         "graph_transformations/fuse_broadcast_into_following_binary.cc",
         "graph_transformations/graph_transformations.cc",
+        "graph_transformations/group_bidirectional_sequence_ops.cc",
         "graph_transformations/hardcode_min_max.cc",
         "graph_transformations/identify_dilated_conv.cc",
         "graph_transformations/identify_l2_normalization.cc",
@@ -342,13 +343,15 @@ tf_cc_test(
     name = "import_tensorflow_test",
     srcs = ["import_tensorflow_test.cc"],
     deps = [
+        ":toco_port",
         ":toco_tooling",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -385,9 +388,11 @@ tf_cc_test(
     srcs = ["tooling_util_test.cc"],
     deps = [
         ":model",
+        ":toco_port",
         ":tooling_util",
         "//tensorflow/core:lib",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -451,12 +456,13 @@ tf_cc_test(
         ":toco_port",
         ":toco_tooling",
         ":types_proto_cc",
-        "@com_google_googletest//:gtest_main",
+        "@com_google_googletest//:gtest",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:lib",
         # We cannot embed the core:ops dependency directly into :toco_tooling as
         # it can conflict with downstream deps when toco is used as a library.
         "//tensorflow/core:ops",
+        "//tensorflow/lite/testing:util",
     ],
 )
 
@@ -468,6 +474,7 @@ tf_cc_test(
     ],
     deps = [
         ":toco_port",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index 9fff0015527ebadf501f571bdd5ed0a7643d66e0..50a30f51621c15766a1a2bb58628055ca07baecb 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -1205,6 +1205,16 @@ void ConvertFloorOperator(const Model& model, const FloorOperator& src_op,
   (*floor_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertCeilOperator(const Model& model, const CeilOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* ceil_op = tensorflow_graph->add_node();
+  ceil_op->set_op("Ceil");
+  ceil_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *ceil_op->add_input() = src_op.inputs[0];
+  (*ceil_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
                            GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* gather_op = tensorflow_graph->add_node();
@@ -2052,6 +2062,20 @@ void ConvertZerosLikeOperator(const Model& model,
   (*zeros_like_op->mutable_attr())["T"].set_type(data_type);
 }
 
+void ConvertReverseV2Operator(const Model& model,
+                              const ReverseV2Operator& src_op,
+                              const char* op_name, GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* reverse_v2_op = tensorflow_graph->add_node();
+  reverse_v2_op->set_op(op_name);
+  reverse_v2_op->set_name(src_op.outputs[0]);
+  DCHECK_EQ(src_op.inputs.size(), 2);
+  *reverse_v2_op->add_input() = src_op.inputs[0];
+  *reverse_v2_op->add_input() = src_op.inputs[1];
+  const tensorflow::DataType data_type =
+      GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*reverse_v2_op->mutable_attr())["T"].set_type(data_type);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -2169,6 +2193,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kFloor) {
     ConvertFloorOperator(model, static_cast<const FloorOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kCeil) {
+    ConvertCeilOperator(model, static_cast<const CeilOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kGather) {
     ConvertGatherOperator(model, static_cast<const GatherOperator&>(src_op),
                           tensorflow_graph);
@@ -2328,6 +2355,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertZerosLikeOperator(
         model, static_cast<const TensorFlowZerosLikeOperator&>(src_op),
         "ZerosLike", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kReverseV2) {
+    ConvertReverseV2Operator(model,
+                             static_cast<const ReverseV2Operator&>(src_op),
+                             "Reverse_V2", tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index 436b639253f2e190fcaab895cd077b06796c1ca1..9ea8d8fa5b9792ccc9a9402ddc132462251b00c2 100644
--- a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -218,6 +218,12 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
     return ::tensorflow::Status::OK();
   }
 
+  if (CountOpsWithInput(*model, binary_op->outputs[0]) != 1) {
+    AddMessageF("Not fusing %s because it's consumed by multiple ops",
+                LogName(*binary_op));
+    return ::tensorflow::Status::OK();
+  }
+
   Operator* following_op = GetOpWithInput(*model, binary_op->outputs[0]);
 
   if (!following_op) {
@@ -287,9 +293,7 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  if (CountOpsWithInput(*model, binary_op->outputs[0]) == 1) {
-    model->EraseArray(binary_op->outputs[0]);
-  }
+  model->EraseArray(binary_op->outputs[0]);
 
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 187b584b6989cc55894160fc5508c13474a1d2d3..4008bbdb4d3a4a9051de70ef63d73d74fd44281c 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -127,6 +127,8 @@ DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoFollowingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBroadcastIntoFollowingBinary)
+DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceLstm)
+DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceRnn)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
diff --git a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10e817b16559ad551f6eb366f67ff1edcd486ff2
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
@@ -0,0 +1,495 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <iterator>
+#include <memory>
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator& op) {
+  return std::find_if(
+      model->operators.begin(), model->operators.end(),
+      [&op](const std::unique_ptr<Operator>& ptr) { return ptr.get() == &op; });
+}
+
+bool MatchTwoUnpackOps(const Operator& op, const Model& model,
+                       Operator** fw_output, Operator** bw_output) {
+  if (op.inputs.size() != 2) {
+    return false;
+  }
+
+  *fw_output = GetOpWithOutput(model, op.inputs[0]);
+  *bw_output = GetOpWithOutput(model, op.inputs[1]);
+  if (*fw_output == nullptr || *bw_output == nullptr) {
+    return false;
+  }
+
+  if ((*fw_output)->type != OperatorType::kUnpack ||
+      (*bw_output)->type != OperatorType::kUnpack) {
+    return false;
+  }
+
+  // TODO(renjieliu): Check the shapes are matching.
+
+  return true;
+}
+
+bool FindUnidirectionalSequenceOp(const Model& model, const Operator& output_op,
+                                  OperatorType operator_type,
+                                  std::stack<Operator*>* sequence_ops,
+                                  Operator** input_op) {
+  Operator* op_it = nullptr;
+  op_it = GetOpWithOutput(model, output_op.inputs[0]);
+  if (op_it == nullptr) {
+    return false;
+  }
+
+  while (op_it->type == operator_type) {
+    sequence_ops->push(op_it);
+    // Check the first input of the unidirectional squence lstm op.
+    op_it = GetOpWithOutput(model, op_it->inputs[0]);
+    if (op_it == nullptr) {
+      return false;
+    }
+  }
+
+  *input_op = op_it;
+  return true;
+}
+
+bool CheckTwoUnidirectionalSequenceOpsAreValid(
+    const std::stack<Operator*>& fw_unidirectional_sequence_ops,
+    const std::stack<Operator*>& bw_unidirectional_sequence_ops,
+    const Operator* first_fw_sequence_op_input,
+    const Operator* first_bw_sequence_op_input) {
+  if (fw_unidirectional_sequence_ops.size() !=
+          bw_unidirectional_sequence_ops.size() ||
+      fw_unidirectional_sequence_ops.empty()) {
+    return false;
+  }
+
+  // For static bidirectional sequence lstm, we should have two pack ops.
+  if (first_fw_sequence_op_input->type != OperatorType::kPack ||
+      first_bw_sequence_op_input->type != OperatorType::kPack) {
+    return false;
+  }
+
+  // fw_lstm & bw_lstm should point to the same input, but reversed sequence.
+  for (int i = 0; i < first_fw_sequence_op_input->inputs.size(); ++i) {
+    if (first_fw_sequence_op_input->inputs[i] !=
+        first_bw_sequence_op_input
+            ->inputs[first_fw_sequence_op_input->inputs.size() - i - 1]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ConstructBidirectionalSequenceOp(
+    const Operator& fw_lstm_op, const Operator& bw_lstm_op, Model* model,
+    BidirectionalSequenceLstmOperator** bi_op) {
+  // TODO(renjieliu): Check the shapes & configurations are equal.
+  constexpr int kBidirectionalSequenceLstmInputsCount = 47;
+  constexpr int kFwLstmInputsStartIndex = 1;
+  constexpr int kBwLstmInputsStartIndex = 18;
+  constexpr int kFwInputActivationStartIndex = 35;
+  constexpr int kBwInputActivationStartIndex = 37;
+  constexpr int kAuxInputStartIndex = 39;
+  (*bi_op)->inputs.reserve(kBidirectionalSequenceLstmInputsCount);
+  const string& input_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_input_0");
+  model->GetOrCreateArray(input_array_name);
+  // The input will be changed later.
+  (*bi_op)->inputs.push_back(input_array_name);
+  int i = 1;
+  // Fill in the fw_lstm weights.
+  for (; i < kBwLstmInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(fw_lstm_op.inputs[i]);
+  }
+
+  // Fill in the bw_lstm weights. bidirectional lstm backward weights start
+  // from 18.
+  for (; i < kFwInputActivationStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_lstm_op
+            .inputs[i - (kBwLstmInputsStartIndex - kFwLstmInputsStartIndex)]);
+  }
+
+  // Fill in fw_lstm previous states.
+  for (; i < kBwInputActivationStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        fw_lstm_op.inputs[i - (kFwInputActivationStartIndex -
+                               kBwLstmInputsStartIndex)]);
+  }
+
+  // Fill in bw_lstm previous states.
+  for (; i < kAuxInputStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_lstm_op.inputs[i - (kBwInputActivationStartIndex -
+                               kBwLstmInputsStartIndex)]);
+  }
+
+  // TODO(renjieliu): Deal with Auxiliary input and weights for 39 - 47.
+  for (; i <= kBidirectionalSequenceLstmInputsCount; ++i) {
+    const string& temp_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_lstm_temp_" + std::to_string(i));
+    model->CreateOptionalArray(temp_array_name);
+    (*bi_op)->inputs.push_back(temp_array_name);
+  }
+
+  // Deal with outputs.
+  (*bi_op)->outputs.reserve(2);
+  const string& fw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_fw_output_0");
+  const string& bw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_bw_output_0");
+  model->GetOrCreateArray(fw_output_array_name);
+  model->GetOrCreateArray(bw_output_array_name);
+  (*bi_op)->outputs.push_back(fw_output_array_name);
+  (*bi_op)->outputs.push_back(bw_output_array_name);
+  (*bi_op)->merge_outputs = false;
+  return true;
+}
+
+bool ConstructBidirectionalSequenceOp(
+    const Operator& fw_rnn_op, const Operator& bw_rnn_op, Model* model,
+    BidirectionalSequenceRnnOperator** bi_op) {
+  // TODO(renjieliu): Check the shapes & configurations are equal.
+  constexpr int kBidirectionalSequenceRnnInputsCount = 12;
+  constexpr int kFwInputsStartIndex = 1;
+  constexpr int kBwInputsStartIndex = 5;
+  constexpr int kAuxInputsStartIndex = 9;
+  (*bi_op)->inputs.reserve(kBidirectionalSequenceRnnInputsCount);
+  const string& input_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_input_0");
+  model->GetOrCreateArray(input_array_name);
+  // The input will be changed later.
+  (*bi_op)->inputs.push_back(input_array_name);
+  int i = 1;
+
+  // Fill in the fw_rnn weights.
+  for (; i < kBwInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(fw_rnn_op.inputs[i]);
+  }
+
+  // Fill in the bw_rnn weights.
+  for (; i < kAuxInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_rnn_op.inputs[i - (kBwInputsStartIndex - kFwInputsStartIndex)]);
+  }
+
+  // TODO(renjieliu): Deal with optional weights.
+  for (; i < kBidirectionalSequenceRnnInputsCount; ++i) {
+    const string& temp_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_rnn_temp_" + std::to_string(i));
+    model->CreateOptionalArray(temp_array_name);
+    (*bi_op)->inputs.push_back(temp_array_name);
+  }
+
+  // Deal with outputs.
+  (*bi_op)->outputs.reserve(2);
+  const string& fw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_fw_output_0");
+  const string& bw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_bw_output_0");
+  model->GetOrCreateArray(fw_output_array_name);
+  model->GetOrCreateArray(bw_output_array_name);
+  (*bi_op)->outputs.push_back(fw_output_array_name);
+  (*bi_op)->outputs.push_back(bw_output_array_name);
+  (*bi_op)->merge_outputs = false;
+  return true;
+}
+
+template <typename T>
+bool GroupFwBwSequenceOps(Model* model, std::stack<Operator*> fw_sequence_ops,
+                          std::stack<Operator*> bw_sequence_ops,
+                          std::vector<T*>* bidirectional_sequence_ops) {
+  while (!fw_sequence_ops.empty()) {
+    Operator* fw_sequence_op = fw_sequence_ops.top();
+    Operator* bw_sequence_op = bw_sequence_ops.top();
+    T* bidirectional_sequence_op = new T;
+    if (!ConstructBidirectionalSequenceOp(*fw_sequence_op, *bw_sequence_op,
+                                          model, &bidirectional_sequence_op)) {
+      return false;
+    }
+
+    bidirectional_sequence_ops->push_back(bidirectional_sequence_op);
+    fw_sequence_ops.pop();
+    bw_sequence_ops.pop();
+  }
+  return true;
+}
+
+template <typename T>
+void RewireBidirectionalSequenceSequenceOpsConnections(
+    OperatorType operator_type, const string& input_array_name,
+    const std::vector<T*>& bidirectional_sequence_ops,
+    std::vector<std::unique_ptr<Operator>>::iterator* op_it, Model* model) {
+  int aux_input_index = -1;
+  switch (operator_type) {
+    case OperatorType::kBidirectionalSequenceLstm:
+      aux_input_index = 39;
+      break;
+    case OperatorType::kBidirectionalSequenceRnn:
+      aux_input_index = 9;
+      break;
+    default:
+      // Should not reach here.
+      DCHECK(false);
+  }
+  string cur_fw_input = input_array_name;
+  string cur_bw_input = input_array_name;
+  for (int i = 0; i < bidirectional_sequence_ops.size(); ++i) {
+    DeleteArrayIfUsedOnce(bidirectional_sequence_ops[i]->inputs[0], model);
+    bidirectional_sequence_ops[i]->inputs[0] = cur_fw_input;
+    if (i != 0) {
+      DeleteArrayIfUsedOnce(
+          bidirectional_sequence_ops[i]->inputs[aux_input_index], model);
+      bidirectional_sequence_ops[i]->inputs[aux_input_index] = cur_bw_input;
+    }
+    cur_fw_input = bidirectional_sequence_ops[i]->outputs[0];
+    cur_bw_input = bidirectional_sequence_ops[i]->outputs[1];
+    if (i != (bidirectional_sequence_ops.size() - 1)) {
+      bidirectional_sequence_ops[i]->merge_outputs = false;
+    } else {
+      // TODO(renjieliu): We need to check whether the outputs of the last bidi
+      // lstms needs merged outputs or not.
+      bidirectional_sequence_ops[i]->merge_outputs = true;
+      DeleteArrayIfUnused(bidirectional_sequence_ops[i]->outputs[1], model);
+      bidirectional_sequence_ops[i]->outputs.pop_back();
+    }
+    model->operators.emplace(*op_it, bidirectional_sequence_ops[i]);
+    *op_it += 1;
+  }
+}
+
+template <typename T>
+void RewireFinalUnpackOutputs(const UnpackOperator& original_unpack_operator,
+                              UnpackOperator** final_unpack_operator,
+                              T** final_bidi_sequence_operator, Model* model) {
+  (*final_unpack_operator)
+      ->inputs.push_back((*final_bidi_sequence_operator)->outputs[0]);
+  (*final_unpack_operator)->axis = original_unpack_operator.axis;
+  (*final_unpack_operator)->num = original_unpack_operator.num;
+
+  for (int i = 0; i < original_unpack_operator.outputs.size(); ++i) {
+    const string& output_array_name = original_unpack_operator.outputs[i];
+    const string& final_unpack_output_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_unpack_" + std::to_string(i));
+    model->GetOrCreateArray(final_unpack_output_array_name);
+    (*final_unpack_operator)->outputs.push_back(final_unpack_output_array_name);
+    Operator* unpack_following_op = GetOpWithInput(*model, output_array_name);
+    if (unpack_following_op != nullptr) {
+      // If there's a following op after the unpack, it must be a concat op.
+      DCHECK(unpack_following_op->type == OperatorType::kConcatenation);
+      // For every output of the concat, rewire the outputs.
+      for (const string& concat_output : unpack_following_op->outputs) {
+        (*final_unpack_operator)->outputs[i] = concat_output;
+      }
+      // Remove the concat op.
+      model->operators.erase(FindOperator(model, *unpack_following_op));
+    }
+  }
+}
+
+void RemoveUnpackOperator(const Operator& unpack_op, Model* model) {
+  for (const string& output_array_name : unpack_op.outputs) {
+    DeleteArrayIfUnused(output_array_name, model);
+  }
+  model->operators.erase(FindOperator(model, unpack_op));
+}
+
+void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
+                                     Model* model) {
+  while (!uni_sequence_ops.empty()) {
+    Operator* uni_sequence_op = uni_sequence_ops.top();
+    DeleteArrayIfUnused(uni_sequence_op->outputs[0], model);
+    model->operators.erase(FindOperator(model, *uni_sequence_op));
+    uni_sequence_ops.pop();
+  }
+}
+
+}  // namespace
+
+// TODO(renjieliu): Support graph generated by dynamic rnn as well.
+::tensorflow::Status GroupBidirectionalSequenceLstm::Run(Model* model,
+                                                         std::size_t op_index,
+                                                         bool* modified) {
+  *modified = false;
+  // Bidirectional sequence lstm will generate two separate unidirectional
+  // sequence lstm ops, for static bidirectional sequence lstm, there will be
+  // a concatenation op at very end; for dynamic bidirectional squence lstm,
+  // it is not guaranteed, but currently we do not support that.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Match fw unidirectional lstm outputs and bw unidirectional lstm outputs:
+  // should be two unstack ops.
+  Operator *fw_lstm_output, *bw_lstm_output;
+  if (!MatchTwoUnpackOps(*final_concat_op, *model, &fw_lstm_output,
+                         &bw_lstm_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional lstm ops.
+  std::stack<Operator*> fw_unidirectional_sequence_lstm_ops,
+      bw_unidirectional_sequence_lstm_ops;
+  Operator *first_fw_lstm_input, *first_bw_lstm_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_lstm_output, OperatorType::kUnidirectionalSequenceLstm,
+          &fw_unidirectional_sequence_lstm_ops, &first_fw_lstm_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_lstm_output, OperatorType::kUnidirectionalSequenceLstm,
+          &bw_unidirectional_sequence_lstm_ops, &first_bw_lstm_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          fw_unidirectional_sequence_lstm_ops,
+          bw_unidirectional_sequence_lstm_ops, first_fw_lstm_input,
+          first_bw_lstm_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  std::vector<BidirectionalSequenceLstmOperator*>
+      bidirectional_sequence_lstm_ops;
+  if (!GroupFwBwSequenceOps(model, fw_unidirectional_sequence_lstm_ops,
+                            bw_unidirectional_sequence_lstm_ops,
+                            &bidirectional_sequence_lstm_ops)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_lstm_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      OperatorType::kBidirectionalSequenceLstm, current_input,
+      bidirectional_sequence_lstm_ops, &op_it, model);
+
+  // Insert a unpack op for the output.
+  UnpackOperator* unpack_operator = new UnpackOperator;
+
+  RewireFinalUnpackOutputs(
+      static_cast<const UnpackOperator&>(*fw_lstm_output), &unpack_operator,
+      &bidirectional_sequence_lstm_ops[bidirectional_sequence_lstm_ops.size() -
+                                       1],
+      model);
+  model->operators.emplace(op_it, unpack_operator);
+
+  // Delete unused ops.
+  RemoveUnpackOperator(*fw_lstm_output, model);
+  RemoveUnpackOperator(*bw_lstm_output, model);
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_lstm_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_lstm_ops, model);
+  // Only keep the fw lstm's pack input.
+  DeleteArrayIfUnused(first_bw_lstm_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_lstm_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+// TODO(renjieliu): Support graph generated by dynamic rnn as well.
+::tensorflow::Status GroupBidirectionalSequenceRnn::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
+  // Bidirectional sequence rnn will generate two separate unidirectional
+  // sequence rnn ops, for static bidirectional sequence rnn, there will be
+  // a concatenation op at very end; for dynamic bidirectional squence rnn,
+  // it is not guaranteed, but currently we do not support that.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Match fw unidirectional rnn outputs and bw unidirectional rnn outputs:
+  // should be two unstack ops.
+  Operator *fw_rnn_output, *bw_rnn_output;
+  if (!MatchTwoUnpackOps(*final_concat_op, *model, &fw_rnn_output,
+                         &bw_rnn_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional rnn ops.
+  std::stack<Operator*> fw_unidirectional_sequence_rnn_ops,
+      bw_unidirectional_sequence_rnn_ops;
+  Operator *first_fw_rnn_input, *first_bw_rnn_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_rnn_output, OperatorType::kUnidirectionalSequenceRnn,
+          &fw_unidirectional_sequence_rnn_ops, &first_fw_rnn_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_rnn_output, OperatorType::kUnidirectionalSequenceRnn,
+          &bw_unidirectional_sequence_rnn_ops, &first_bw_rnn_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          fw_unidirectional_sequence_rnn_ops,
+          bw_unidirectional_sequence_rnn_ops, first_fw_rnn_input,
+          first_bw_rnn_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  std::vector<BidirectionalSequenceRnnOperator*> bidirectional_sequence_rnn_ops;
+  if (!GroupFwBwSequenceOps(model, fw_unidirectional_sequence_rnn_ops,
+                            bw_unidirectional_sequence_rnn_ops,
+                            &bidirectional_sequence_rnn_ops)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_rnn_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      OperatorType::kBidirectionalSequenceRnn, current_input,
+      bidirectional_sequence_rnn_ops, &op_it, model);
+
+  // Insert a unpack op for the output.
+  UnpackOperator* unpack_operator = new UnpackOperator;
+  RewireFinalUnpackOutputs(
+      static_cast<const UnpackOperator&>(*fw_rnn_output), &unpack_operator,
+      &bidirectional_sequence_rnn_ops[bidirectional_sequence_rnn_ops.size() -
+                                      1],
+      model);
+  model->operators.emplace(op_it, unpack_operator);
+
+  // Delete unused ops.
+  RemoveUnpackOperator(*fw_rnn_output, model);
+  RemoveUnpackOperator(*bw_rnn_output, model);
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_rnn_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_rnn_ops, model);
+  // Only keep the fw rnn's pack input.
+  DeleteArrayIfUnused(first_bw_rnn_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_rnn_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
index 2e41767095fb3cde09a7fb5d690ac57b1cfcd762..6882a19801538f64e71e317d6c947dd2316815c1 100644
--- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -440,6 +440,8 @@ bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
     case OperatorType::kGather:
     case OperatorType::kTranspose:
     case OperatorType::kMean:
+    case OperatorType::kReduceMax:
+    case OperatorType::kReduceMin:
       changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
     case OperatorType::kSum:
@@ -448,7 +450,7 @@ bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
       // in special circumstances like when computing expected value using
       // reduce_sum the input range and the output range matches. Hence the
       // below code would act as a fallback. If a fake_quant node is observed in
-      // the output that takes precendence over the hard coding logic below.
+      // the output that takes precedence over the hard coding logic below.
       changed = HardcodeMinMaxFromFirstInput(model, op);
       if (changed) {
         LOG(WARNING) << "Using the input range for output in reduce_sum op."
diff --git a/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
index dabd4bd209f450645d12b76c782b36fa5198f84a..3b7c88ac62e48e6a8a571cfd046cc50c2c35f813 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -151,20 +151,12 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
 
   // Erase the subgraph that is now replaced by L2Normalization
   model->operators.erase(FindOperator(model, square_op));
-  model->EraseArray(sum_op->inputs[0]);
-  if (sum_op->inputs.size() > 1) {
-    model->EraseArray(sum_op->inputs[1]);
-  }
-  model->operators.erase(FindOperator(model, sum_op));
+  DeleteOpAndArraysIfUnused(model, sum_op);
   if (add_op) {
-    model->EraseArray(add_op->inputs[0]);
-    model->EraseArray(add_op->inputs[1]);
-    model->operators.erase(FindOperator(model, add_op));
+    DeleteOpAndArraysIfUnused(model, add_op);
   }
-  model->EraseArray(sqrt_or_rsqrt_op->inputs[0]);
-  model->operators.erase(FindOperator(model, sqrt_or_rsqrt_op));
-  model->EraseArray(div_or_mul_op->inputs[1]);
-  model->operators.erase(FindOperator(model, div_or_mul_op));
+  DeleteOpAndArraysIfUnused(model, sqrt_or_rsqrt_op);
+  DeleteOpAndArraysIfUnused(model, div_or_mul_op);
   *modified = true;
   return ::tensorflow::Status::OK();
 }
diff --git a/tensorflow/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
index 089ecee959a3ab80474782a88fa176b7a9f42001..65dbb8a1766a6aae4347435b392ff4af49e3d44e 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
@@ -147,12 +147,26 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
   if (final_output_mul->type != OperatorType::kMul) {
     return ::tensorflow::Status::OK();
   }
+  // final_output_mul->outputs[0] would be one of the two outputs of our
+  // LstmCell. Exit if it does not already have a data type.
+  // We won't be able to propagate data types through a fused LstmCell.
+  if (model->GetArray(final_output_mul->outputs[0]).data_type ==
+      ArrayDataType::kNone) {
+    return ::tensorflow::Status::OK();
+  }
   Operator *state_output_tanh, *fc_output_sig;
   if (!MatchOperatorInputs(*final_output_mul, *model, OperatorType::kTanh,
                            &state_output_tanh, OperatorType::kLogistic,
                            &fc_output_sig)) {
     return ::tensorflow::Status::OK();
   }
+  // state_output_tanh->inputs[0] would be one of the two outputs of our
+  // LstmCell. Exit if it does not already have a data type.
+  // We won't be able to propagate data types through a fused LstmCell.
+  if (model->GetArray(state_output_tanh->inputs[0]).data_type ==
+      ArrayDataType::kNone) {
+    return ::tensorflow::Status::OK();
+  }
 
   // State output TanH
   // (We don't count an operator as ID'd until we verify it has the correct
@@ -262,11 +276,15 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
       lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT]));
   const string& concat_temp_array_name =
       AvailableArrayName(*model, base_name + "concat_temp");
-  model->GetOrCreateArray(concat_temp_array_name);
+  auto& concat_temp_array = model->GetOrCreateArray(concat_temp_array_name);
+  concat_temp_array.data_type =
+      model->GetArray(concat_inputs->outputs[0]).data_type;
   lstm_cell_op->outputs[LstmCellOperator::CONCAT_TEMP] = concat_temp_array_name;
   const string& activ_temp_array_name =
       AvailableArrayName(*model, base_name + "activ_temp");
-  model->GetOrCreateArray(activ_temp_array_name);
+  auto& activ_temp_array = model->GetOrCreateArray(activ_temp_array_name);
+  activ_temp_array.data_type =
+      model->GetArray(fully_connected->outputs[0]).data_type;
   lstm_cell_op->outputs[LstmCellOperator::ACTIV_TEMP] = activ_temp_array_name;
   AddMessageF("Created temp outputs %s and %s on operator %s",
               concat_temp_array_name, activ_temp_array_name,
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
index cbae6610d7f4703a898d8d6f35351a09cd70173c..cb66a2372fdd3edf484902c336821b35befae48d 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -252,6 +252,40 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op,
       SetDataTypeForAllOutputs(model, op, data_type);
       break;
     }
+    case OperatorType::kUnidirectionalSequenceRnn: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kUnique: {
+      CHECK_EQ(op->outputs.size(), 2);
+      const UniqueOperator* unique_op = static_cast<UniqueOperator*>(op);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      model->GetArray(op->outputs[0]).data_type = data_type;
+      model->GetArray(op->outputs[1]).data_type = unique_op->idx_out_type;
+      break;
+    }
+    case OperatorType::kBidirectionalSequenceLstm: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kBidirectionalSequenceRnn: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kLstmCell: {
+      // It's tricky to propagate data types through a LstmCell, as that has
+      // multiple inputs and outputs, and there are quantized cases with
+      // mixed (8bit vs 16bit) cases. Fortunately, that should never be needed,
+      // as the data formats, such as TFLITE, that have LstmCell nodes, also
+      // have data type fields for all their arrays.
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 0e653f08a04f237c861038639a1469eb62f35dfa..329ef92fe9b7046f5d5eece45b5204ce1b025c2b 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1109,6 +1109,154 @@ void ProcessUnidirectionalSequenceLstmOperator(
   output_shape->ReplaceDims({timestamp, batch_size, output_size});
 }
 
+void ProcessUnidirectionalSequenceRnnOperator(
+    Model* model, UnidirectionalSequenceRnnOperator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated.
+    return;
+  }
+
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  const auto& bias_array = model->GetArray(op->inputs[3]);
+  // Yield until input dims have been resolved.
+  if (!bias_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kHiddenStateTensor = 4;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kHiddenStateTensor]).buffer.reset();
+
+  const auto& bias_shape = bias_array.shape();
+  const int output_size = bias_shape.dims(0);
+
+  Shape* output_shape = output_array.mutable_shape();
+  output_shape->ReplaceDims({timestamp, batch_size, output_size});
+}
+
+void ProcessBidirectionalSequenceLstmOperator(
+    Model* model, BidirectionalSequenceLstmOperator* op) {
+  // We assume time major.
+  auto& fw_output_array = model->GetArray(op->outputs[0]);
+  auto& bw_output_array = model->GetArray(op->outputs[1]);
+  if (fw_output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (fw_output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  constexpr int kBwRecurrentToOutputWeightsTensor = 25;
+  const auto& recurrent_to_output_weights_array =
+      model->GetArray(op->inputs[kBwRecurrentToOutputWeightsTensor]);
+  // Yield until input dims have been resolved.
+  if (!recurrent_to_output_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kFwInputActivationStateTensor = 35;
+  constexpr int kFwInputCellStateTensor = 36;
+  constexpr int kBwInputActivationStateTensor = 37;
+  constexpr int kBwInputCellStateTensor = 38;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kFwInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kFwInputCellStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwInputCellStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = recurrent_to_output_weights_array.shape();
+  const int output_size = output_weights_shape.dims(1);
+
+  Shape* fw_output_shape = fw_output_array.mutable_shape();
+  if (op->merge_outputs) {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, 2 * output_size});
+  } else {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+    Shape* bw_output_shape = bw_output_array.mutable_shape();
+    bw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+  }
+}
+
+void ProcessBidirectionalSequenceRnnOperator(
+    Model* model, BidirectionalSequenceRnnOperator* op) {
+  // We assume time major.
+  auto& fw_output_array = model->GetArray(op->outputs[0]);
+  auto& bw_output_array = model->GetArray(op->outputs[1]);
+  if (fw_output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (fw_output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  constexpr int kFwWeightsTensor = 1;
+  const auto& forward_weights_array =
+      model->GetArray(op->inputs[kFwWeightsTensor]);
+  // Yield until input dims have been resolved.
+  if (!forward_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kFwHiddenStateTensor = 4;
+  constexpr int kBwHiddenStateTensor = 8;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kFwHiddenStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwHiddenStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = forward_weights_array.shape();
+  const int output_size = output_weights_shape.dims(0);
+
+  Shape* fw_output_shape = fw_output_array.mutable_shape();
+  if (op->merge_outputs) {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, 2 * output_size});
+  } else {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+    Shape* bw_output_shape = bw_output_array.mutable_shape();
+    bw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+  }
+}
+
 void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
@@ -1616,14 +1764,37 @@ void ProcessArgMinMaxOperator(Model* model, Op* op) {
     return;
   }
 
+  const Array& axis_array = model->GetArray(op->inputs[1]);
+  // Yield until input axis array shape has been resolved.
+  if (!axis_array.has_shape()) {
+    return;
+  }
+
   const std::vector<int>& input_dims = input_array.shape().dims();
+
+  CHECK(axis_array.data_type == ArrayDataType::kInt32 ||
+        axis_array.data_type == ArrayDataType::kInt64)
+      << "axis_array must be int32, int64";
+
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1)
+      << "Axis array must be scalar.";
+
+  int64 axis;
+  if (axis_array.data_type == ArrayDataType::kInt32) {
+    axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  } else {
+    axis = axis_array.GetBuffer<ArrayDataType::kInt64>().data[0];
+  }
+
   std::vector<int> output_dims;
 
-  output_dims.reserve(input_dims.size());
-  for (int i = 0; i < input_dims.size() - 1; ++i) {
-    output_dims.push_back(input_dims[i]);
+  output_dims.reserve(input_dims.size() - 1);
+  for (int i = 0; i < input_dims.size(); ++i) {
+    if (i != axis) {
+      output_dims.push_back(input_dims[i]);
+    }
   }
-  output_dims.push_back(1);
+
   const string& output_name = op->outputs[0];
   auto& output_array = model->GetArray(output_name);
   if (output_array.has_shape()) {
@@ -1828,6 +1999,20 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
   output_array.copy_shape(output_shape);
 }
 
+void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // We have 2 outputs, the shape of the index tensor, is the same size
+  // as the input array. The unique values tensor, is unknown until runtime.
+  CHECK_EQ(op->outputs.size(), 2);
+  auto& idx_output_array = model->GetArray(op->outputs[1]);
+
+  // Yield until input dims have been resolved, or output already computed
+  if (!input_array.has_shape() || idx_output_array.has_shape()) {
+    return;
+  }
+  idx_output_array.copy_shape(input_array.shape());
+}
+
 }  // namespace
 
 ::tensorflow::Status PropagateFixedSizes::Run(Model* model,
@@ -1869,12 +2054,14 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
     case OperatorType::kAssert:
     case OperatorType::kCast:
     case OperatorType::kFloor:
+    case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kSin:
     case OperatorType::kLogicalAnd:
     case OperatorType::kLogicalNot:
     case OperatorType::kLogicalOr:
     case OperatorType::kZerosLike:
+    case OperatorType::kReverseV2:
       ProcessSimpleOperator(model, op, 0);
       break;
     case OperatorType::kGather:
@@ -2023,6 +2210,18 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
       ProcessUnidirectionalSequenceLstmOperator(
           model, static_cast<UnidirectionalSequenceLstmOperator*>(op));
       break;
+    case OperatorType::kUnidirectionalSequenceRnn:
+      ProcessUnidirectionalSequenceRnnOperator(
+          model, static_cast<UnidirectionalSequenceRnnOperator*>(op));
+      break;
+    case OperatorType::kBidirectionalSequenceLstm:
+      ProcessBidirectionalSequenceLstmOperator(
+          model, static_cast<BidirectionalSequenceLstmOperator*>(op));
+      break;
+    case OperatorType::kBidirectionalSequenceRnn:
+      ProcessBidirectionalSequenceRnnOperator(
+          model, static_cast<BidirectionalSequenceRnnOperator*>(op));
+      break;
     case OperatorType::kLstmCell:
       ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
       break;
@@ -2103,6 +2302,9 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
     case OperatorType::kMirrorPad:
       ProcessMirrorPadOperator(model, static_cast<MirrorPadOperator*>(op));
       break;
+    case OperatorType::kUnique:
+      ProcessUniqueOperator(model, static_cast<UniqueOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index 2fa80f2edac2bc4e1c6a9147afca20798fca372b..ee65f92e00cd9f9347e62db314ca3a3f5e8bb396 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -66,7 +66,8 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
          type == OperatorType::kRandomUniform ||
          type == OperatorType::kResizeNearestNeighbor ||
-         type == OperatorType::kPRelu;
+         type == OperatorType::kPRelu || type == OperatorType::kReduceMax ||
+         type == OperatorType::kReduceMin;
 }
 
 // The quantized op allows output arrays of type float using
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
index 6a4b9198548956217d24693bceff2bd6b3b8f0a6..98105d384e176573b248ffc3fd75710768002750 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -30,6 +30,7 @@ namespace {
 bool IsElementwiseOperator(OperatorType optype) {
   switch (optype) {
     case OperatorType::kCast:
+    case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kFloor:
     case OperatorType::kNeg:
diff --git a/tensorflow/lite/toco/graph_transformations/tests/BUILD b/tensorflow/lite/toco/graph_transformations/tests/BUILD
index bbbedbe3a93065e3a7007073aad7f6e7600e2651..03d331226d885e86bf47d219691591a5a8c53d7a 100644
--- a/tensorflow/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/lite/toco/graph_transformations/tests/BUILD
@@ -40,3 +40,15 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+tf_cc_test(
+    name = "fuse_binary_into_following_affine_test",
+    srcs = ["fuse_binary_into_following_affine_test.cc"],
+    deps = [
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2cba6824cfbe55f05b92f70cc45fc87b58d56559
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5) {
+  std::vector<testing::Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(testing::FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+}  // namespace
+
+class FuseBinaryIntoFollowingAffineTest : public ::testing::Test {
+ protected:
+  FuseBinaryIntoFollowingAffineTest() {}
+
+  void SetUp() override { model_.reset(new Model); }
+
+  void CreateArray(const string& name, const std::vector<int>& shape) {
+    Array& array = model_->GetOrCreateArray(name);
+    array.data_type = ArrayDataType::kFloat;
+    Shape* array_shape = array.mutable_shape();
+    *(array_shape->mutable_dims()) = shape;
+  }
+
+  void CreateConstantArray(const string& name, const std::vector<int>& shape,
+                           const std::vector<float>& data) {
+    CreateArray(name, shape);
+    Array& array = model_->GetOrCreateArray(name);
+    auto& array_buffer = array.GetMutableBuffer<ArrayDataType::kFloat>();
+    int bufsize = 1;
+    for (int dim : shape) {
+      bufsize *= dim;
+    }
+    array_buffer.data.resize(bufsize);
+    float* buf_ptr = array_buffer.data.data();
+    for (int i = 0; i < bufsize; ++i) {
+      buf_ptr[i] = data[i];
+    }
+  }
+
+  std::unique_ptr<Model> model_;
+};
+
+TEST_F(FuseBinaryIntoFollowingAffineTest, FuseMulIntoFullyConnected) {
+  // Creating a model.
+  {
+    CreateArray("Input", {2, 2});
+    CreateConstantArray("MulInput2", {1}, {2.0});
+    CreateArray("MulOutput", {2, 2});
+    CreateConstantArray("FCWeight", {2, 2}, {1.0, 2.0, 3.0, 4.0});
+    CreateConstantArray("FCBias", {1}, {1.0});
+    CreateArray("Output", {2, 2});
+
+    auto* mul_op = new MulOperator;
+    mul_op->inputs = {"Input", "MulInput2"};
+    mul_op->outputs = {"MulOutput"};
+    model_->operators.push_back(std::unique_ptr<Operator>(mul_op));
+
+    auto* fc_op = new FullyConnectedOperator;
+    fc_op->inputs = {"MulOutput", "FCWeight", "FCBias"};
+    fc_op->outputs = {"Output"};
+    model_->operators.push_back(std::unique_ptr<Operator>(fc_op));
+  }
+  toco::FuseBinaryIntoFollowingAffine transformation;
+  bool modified;
+  ASSERT_TRUE(transformation.Run(model_.get(), /*op_index=*/0, &modified).ok());
+  EXPECT_TRUE(modified);
+
+  // `Mul` should be fused into `FullyConnected`. Only 1 op is left.
+  ASSERT_EQ(model_->operators.size(), 1);
+  const auto& op = model_->operators[0];
+  ASSERT_EQ(op->type, OperatorType::kFullyConnected);
+  ASSERT_EQ(op->inputs.size(), 3);
+
+  auto& weights_array = model_->GetArray(op->inputs[1]);
+  EXPECT_THAT(weights_array.GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear({2.0, 4.0, 6.0, 8.0})));
+
+  auto& bias_array = model_->GetArray(op->inputs[2]);
+  EXPECT_THAT(bias_array.GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear({1.0})));
+}
+
+// This is a regression test of b/121287325. Toco crashes before the fix.
+TEST_F(FuseBinaryIntoFollowingAffineTest, DoNotFuseWithMultipleConsumers) {
+  // Creating a model.
+  {
+    CreateArray("Input", {2, 2});
+    CreateConstantArray("MulInput2", {1}, {2.0});
+    CreateArray("MulOutput", {2, 2});
+    CreateConstantArray("FCWeight", {2, 2}, {1.0, 2.0, 3.0, 4.0});
+    CreateConstantArray("FCBias", {1}, {1.0});
+    CreateArray("Output", {2, 2});
+    CreateArray("AnotherOutput", {2, 2});
+
+    auto* mul_op = new MulOperator;
+    mul_op->inputs = {"Input", "MulInput2"};
+    mul_op->outputs = {"MulOutput"};
+    model_->operators.push_back(std::unique_ptr<Operator>(mul_op));
+
+    auto* fc_op = new FullyConnectedOperator;
+    fc_op->inputs = {"MulOutput", "FCWeight", "FCBias"};
+    fc_op->outputs = {"Output"};
+    model_->operators.push_back(std::unique_ptr<Operator>(fc_op));
+
+    auto identity_op = new TensorFlowIdentityOperator;
+    identity_op->inputs = {"MulOutput"};
+    identity_op->outputs = {"AnotherOutput"};
+    model_->operators.push_back(std::unique_ptr<Operator>(identity_op));
+  }
+
+  toco::FuseBinaryIntoFollowingAffine transformation;
+  bool modified;
+  ASSERT_TRUE(transformation.Run(model_.get(), /*op_index=*/0, &modified).ok());
+  // Do not modify the graph if the binary operator has another output.
+  EXPECT_FALSE(modified);
+  EXPECT_EQ(model_->operators.size(), 3);
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index 6a496875f9def78879b75b9f693c548aeca62360..b1b04949bad7f96866cc5687940f4fef2a8ee1a8 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -235,6 +235,131 @@ tensorflow::Status ImportShape(
   return NumElements(input_dims_only_sizes, input_flat_size);
 }
 
+// Define ways to retrieve data from tensors of different types.
+// TODO(b/80208043): simply use tensorflow::Tensor::FromProto() instead.
+template <typename T>
+struct TensorTraits;
+
+template <>
+struct TensorTraits<float> {
+  static int size(const TensorProto& p) { return p.float_val_size(); }
+  static float get(const TensorProto& p, int i) { return p.float_val(i); }
+  static string accessor_name() { return "float_val"; }
+  static string type_name() { return "float"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<float>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<uint8_t> {
+  static int size(const TensorProto& p) { return p.int_val_size(); }
+  static uint8_t get(const TensorProto& p, int i) { return p.int_val(i); }
+  static string accessor_name() { return "int_val"; }
+  static string type_name() { return "uint8"; }
+  static void CopyFromContent(const TensorProto& p,
+                              std::vector<uint8_t>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<std::complex<float>> {
+  static int size(const TensorProto& p) { return p.scomplex_val_size() / 2; }
+  static std::complex<float> get(const TensorProto& p, int i) {
+    return std::complex<float>(p.scomplex_val(2 * i),
+                               p.scomplex_val(2 * i + 1));
+  }
+  static string accessor_name() { return "scomplex_val"; }
+  static string type_name() { return "complex64"; }
+  static void CopyFromContent(const TensorProto& p,
+                              std::vector<std::complex<float>>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<int32> {
+  static int size(const TensorProto& p) { return p.int_val_size(); }
+  static int32 get(const TensorProto& p, int i) { return p.int_val(i); }
+  static string accessor_name() { return "int_val"; }
+  static string type_name() { return "int32"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<int32>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<int64> {
+  static int size(const TensorProto& p) { return p.int64_val_size(); }
+  static int64 get(const TensorProto& p, int i) { return p.int64_val(i); }
+  static string accessor_name() { return "int64_val"; }
+  static string type_name() { return "int64"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<int64>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<bool> {
+  static int size(const TensorProto& p) { return p.bool_val_size(); }
+  static bool get(const TensorProto& p, int i) { return p.bool_val(i); }
+  static string accessor_name() { return "bool_val"; }
+  static string type_name() { return "bool"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<bool>* data) {
+    std::vector<char> buf(p.tensor_content().size());
+    toco::port::CopyToBuffer(p.tensor_content(), buf.data());
+    for (int i = 0; i < p.tensor_content().size(); i++) {
+      (*data)[i] = static_cast<bool>(buf[i]);
+    }
+  }
+};
+
+template <typename T>
+tensorflow::Status ImportTensorData(const TensorProto& input_tensor,
+                                    int input_flat_size,
+                                    std::vector<T>* output_data) {
+  CHECK_GE(output_data->size(), input_flat_size);
+  int num_elements_in_tensor = TensorTraits<T>::size(input_tensor);
+  if (num_elements_in_tensor == input_flat_size) {
+    for (int i = 0; i < num_elements_in_tensor; i++) {
+      (*output_data)[i] = TensorTraits<T>::get(input_tensor, i);
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(T)) {
+    TensorTraits<T>::CopyFromContent(input_tensor, output_data);
+  } else if (num_elements_in_tensor > 0 &&
+             num_elements_in_tensor < input_flat_size) {
+    // TODO(b/80208043): use tensorflow::Tensor::FromProto() which is the
+    // official way to import tensor data. This particular else-if handles a
+    // grappler optimization where the last few elements in a tensor are
+    // omitted if they are repeated.
+    int i = 0;
+    for (; i < num_elements_in_tensor; ++i) {
+      (*output_data)[i] = TensorTraits<T>::get(input_tensor, i);
+    }
+    auto last = (*output_data)[i - 1];
+    for (; i < input_flat_size; ++i) {
+      (*output_data)[i] = last;
+    }
+  } else {
+    string accessor_name = TensorTraits<T>::accessor_name();
+    string type_name = TensorTraits<T>::type_name();
+    return tensorflow::errors::InvalidArgument(
+        absl::StrCat("Neither input_content (",
+                     input_tensor.tensor_content().size() / sizeof(T), ") nor ",
+                     accessor_name, " (", num_elements_in_tensor,
+                     ") have the right dimensions (", input_flat_size,
+                     ") for this ", type_name, " tensor"));
+  }
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
                                     Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_FLOAT);
@@ -249,28 +374,8 @@ tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
       output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
   output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                            0.f);
-  CHECK_GE(output_float_data.size(), input_flat_size);
-  if (input_tensor.float_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_float_data[i] = input_tensor.float_val(0);
-    }
-  } else if (input_tensor.float_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.float_val_size(); i++) {
-      output_float_data[i] = input_tensor.float_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(float)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_float_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        absl::StrCat("Neither input_content (",
-                     input_tensor.tensor_content().size() / sizeof(float),
-                     ") nor float_val (", input_tensor.float_val_size(),
-                     ") have the right dimensions (", input_flat_size,
-                     ") for this float tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<float>(input_tensor, input_flat_size,
+                                 &output_float_data);
 }
 
 tensorflow::Status ImportComplex64Array(const TensorProto& input_tensor,
@@ -287,32 +392,8 @@ tensorflow::Status ImportComplex64Array(const TensorProto& input_tensor,
       output_array->GetMutableBuffer<ArrayDataType::kComplex64>().data;
   output_complex_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                              std::complex<float>(0.f, 0.f));
-  CHECK_GE(output_complex_data.size(), input_flat_size);
-  if (input_tensor.scomplex_val_size() == 2) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_complex_data[i] = std::complex<float>(
-          input_tensor.scomplex_val(0), input_tensor.scomplex_val(1));
-    }
-  } else if (input_tensor.scomplex_val_size() == 2 * input_flat_size) {
-    for (int i = 0; i < input_flat_size; ++i) {
-      output_complex_data[i] =
-          std::complex<float>(input_tensor.scomplex_val(2 * i),
-                              input_tensor.scomplex_val(2 * i + 1));
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(std::complex<float>)) {
-    toco::port::CopyToBuffer(
-        input_tensor.tensor_content(),
-        reinterpret_cast<char*>(output_complex_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(absl::StrCat(
-        "Neither input_content (",
-        input_tensor.tensor_content().size() / sizeof(std::complex<float>),
-        ") nor scomplex_val (", input_tensor.scomplex_val_size(),
-        ") have the right dimensions (", input_flat_size,
-        ") for this complex64 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<std::complex<float>>(input_tensor, input_flat_size,
+                                               &output_complex_data);
 }
 
 tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
@@ -328,28 +409,8 @@ tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kUint8>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
-  CHECK_GE(output_int_data.size(), input_flat_size);
-  if (input_tensor.int_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_int_data[i] = input_tensor.int_val(0);
-    }
-  } else if (input_tensor.int_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.int_val_size(); i++) {
-      output_int_data[i] = input_tensor.int_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(uint8_t)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_int_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        absl::StrCat("Neither input_content (",
-                     input_tensor.tensor_content().size() / sizeof(uint8_t),
-                     ") nor int_val (", input_tensor.int_val_size(),
-                     ") have the right dimensions (", input_flat_size,
-                     ") for this uint8 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<uint8_t>(input_tensor, input_flat_size,
+                                   &output_int_data);
 }
 
 tensorflow::Status ImportInt32Array(const TensorProto& input_tensor,
@@ -365,27 +426,8 @@ tensorflow::Status ImportInt32Array(const TensorProto& input_tensor,
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
-  CHECK_GE(output_int_data.size(), input_flat_size);
-  if (input_tensor.int_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_int_data[i] = input_tensor.int_val(0);
-    }
-  } else if (input_tensor.int_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.int_val_size(); i++) {
-      output_int_data[i] = input_tensor.int_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(int32)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_int_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(absl::StrCat(
-        "Neither input_content (",
-        input_tensor.tensor_content().size() / sizeof(int32), ") nor int_val (",
-        input_tensor.int_val_size(), ") have the right dimensions (",
-        input_flat_size, ") for this int32 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<int32>(input_tensor, input_flat_size,
+                                 &output_int_data);
 }
 
 tensorflow::Status ImportInt64Array(const TensorProto& input_tensor,
@@ -401,28 +443,8 @@ tensorflow::Status ImportInt64Array(const TensorProto& input_tensor,
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
-  CHECK_GE(output_int_data.size(), input_flat_size);
-  if (input_tensor.int64_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_int_data[i] = input_tensor.int64_val(0);
-    }
-  } else if (input_tensor.int64_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.float_val_size(); i++) {
-      output_int_data[i] = input_tensor.int64_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(int64)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_int_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        absl::StrCat("Neither input_content (",
-                     input_tensor.tensor_content().size() / sizeof(int64),
-                     ") nor int64_val (", input_tensor.int64_val_size(),
-                     ") have the right dimensions (", input_flat_size,
-                     ") for this int64 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<int64>(input_tensor, input_flat_size,
+                                 &output_int_data);
 }
 
 tensorflow::Status ImportBoolArray(const TensorProto& input_tensor,
@@ -439,36 +461,17 @@ tensorflow::Status ImportBoolArray(const TensorProto& input_tensor,
       output_array->GetMutableBuffer<ArrayDataType::kBool>().data;
   output_bool_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                           false);
-  CHECK_GE(output_bool_data.size(), input_flat_size);
-  if (input_tensor.bool_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_bool_data[i] = input_tensor.bool_val(0);
-    }
-  } else if (input_tensor.bool_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.bool_val_size(); i++) {
-      output_bool_data[i] = input_tensor.bool_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() == input_flat_size) {
-    std::vector<char> buf(input_tensor.tensor_content().size());
-    toco::port::CopyToBuffer(input_tensor.tensor_content(), buf.data());
-    for (int i = 0; i < input_tensor.tensor_content().size(); i++) {
-      output_bool_data[i] = static_cast<bool>(buf[i]);
-    }
-  } else {
+  status =
+      ImportTensorData<bool>(input_tensor, input_flat_size, &output_bool_data);
+  if (!status.ok() && output_bool_data.size() == 1) {
     // Some graphs have bool const nodes without actual value...
     // assuming that 'false' is implied.
     // So far only encountered that in an array with 1 entry, let's
     // require that until we encounter a graph where that's not the case.
-    if (output_bool_data.size() != 1) {
-      return tensorflow::errors::InvalidArgument(absl::StrCat(
-          "Neither input_content (", input_tensor.tensor_content().size(),
-          ") nor bool_val (", input_tensor.bool_val_size(),
-          ") have the right dimensions (", input_flat_size,
-          ") for this bool tensor"));
-    }
     output_bool_data[0] = false;
+    return tensorflow::Status::OK();
   }
-  return tensorflow::Status::OK();
+  return status;
 }
 
 tensorflow::Status ImportStringArray(const TensorProto& input_tensor,
@@ -1187,7 +1190,7 @@ enum FlexSupport { kFlexOk, kFlexNotOk };
 // taken from the given NodeDef, and its number must match NumInputs, unless
 // kAnyNumInputs is passed in. If kFlexOk is passed in the resulting operator
 // will be eligible for being exported as a flex op.
-template <typename Op, int NumInputs, FlexSupport flex>
+template <typename Op, int NumInputs, int NumOutputs, FlexSupport flex>
 tensorflow::Status ConvertSimpleOperatorGeneric(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1200,6 +1203,11 @@ tensorflow::Status ConvertSimpleOperatorGeneric(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  if (NumOutputs > 1) {
+    for (int i = 1; i < NumOutputs; ++i) {
+      op->outputs.push_back(node.name() + ":" + std::to_string(i));
+    }
+  }
 
   if (flex == kFlexOk) {
     RetainTensorFlowNodeDef(node, op);
@@ -1210,20 +1218,20 @@ tensorflow::Status ConvertSimpleOperatorGeneric(
 }
 
 // Convert a simple operator which is not valid as a flex op.
-template <typename Op, int NumInputs = kAnyNumInputs>
+template <typename Op, int NumInputs, int NumOutputs>
 tensorflow::Status ConvertSimpleOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexNotOk>(
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, NumOutputs, kFlexNotOk>(
       node, tf_import_flags, model);
 }
 
 // Convert a simple operator which is valid as a flex op.
-template <typename Op, int NumInputs = kAnyNumInputs>
+template <typename Op, int NumInputs, int NumOutputs>
 tensorflow::Status ConvertSimpleOperatorFlexOk(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexOk>(
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, NumOutputs, kFlexOk>(
       node, tf_import_flags, model);
 }
 
@@ -1521,6 +1529,20 @@ tensorflow::Status ConvertFloorOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertCeilOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "Ceil");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  const auto data_type = GetDataTypeAttr(node, "T");
+  CHECK(data_type == DT_FLOAT);
+  auto* op = new CeilOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertGatherOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -2309,6 +2331,27 @@ tensorflow::Status ConvertLeakyReluOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertUnidirectionalSequenceRnn(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  DCHECK_EQ(node.op(), "UnidirectionalSequenceRnn");
+
+  auto* op = new UnidirectionalSequenceRnnOperator();
+  const auto& indices = GetListAttr(node, "_tflite_input_indices");
+  if (indices.i_size() != node.input().size()) {
+    return tensorflow::errors::InvalidArgument("Input size does not match.");
+  }
+
+  for (const string& input : node.input()) {
+    op->inputs.push_back(input);
+  }
+  // Only use the last one as input.
+  op->outputs.push_back(node.name() + ":1");
+  model->operators.emplace_back(op);
+
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 namespace internal {
@@ -2330,14 +2373,15 @@ ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
 
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
-      {"Abs", ConvertSimpleOperator<AbsOperator>},
-      {"Add", ConvertSimpleOperator<AddOperator, 2>},
-      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator>},
-      {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
+      {"Abs", ConvertSimpleOperator<AbsOperator, kAnyNumInputs, 1>},
+      {"Add", ConvertSimpleOperator<AddOperator, 2, 1>},
+      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator, kAnyNumInputs, 1>},
+      {"All", ConvertSimpleOperator<TensorFlowAllOperator, kAnyNumInputs, 1>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
       {"ArgMin", ConvertArgMinOperator},
-      {"Assert", ConvertSimpleOperator<TensorFlowAssertOperator>},
+      {"Assert",
+       ConvertSimpleOperator<TensorFlowAssertOperator, kAnyNumInputs, 1>},
       {"AvgPool", ConvertAvgPoolOperator},
       {"BatchMatMul", ConvertBatchMatMulOperator},
       {"BatchNormWithGlobalNormalization",
@@ -2345,6 +2389,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"BatchToSpaceND", ConvertBatchToSpaceNDOperator},
       {"BiasAdd", ConvertBiasAddOperator},
       {"Cast", ConvertCastOperator},
+      {"Ceil", ConvertCeilOperator},
       {"CheckNumerics", ConvertIdentityOperator},
       {"Concat", ConvertConcatOperator},
       {"ConcatV2", ConvertConcatOperator},
@@ -2354,98 +2399,101 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"CTCBeamSearchDecoder", ConvertCTCBeamSearchDecoderOperator},
       {"DepthToSpace", ConvertDepthToSpaceOperator},
       {"DepthwiseConv2dNative", ConvertDepthwiseConvOperator},
-      {"Div", ConvertSimpleOperator<DivOperator, 2>},
+      {"Div", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"DynamicPartition", ConvertDynamicPartitionOperator},
       {"DynamicStitch", ConvertDynamicStitchOperator},
-      {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2>},
-      {"Exp", ConvertSimpleOperator<ExpOperator, 1>},
-      {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2>},
+      {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2, 1>},
+      {"Exp", ConvertSimpleOperator<ExpOperator, 1, 1>},
+      {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2, 1>},
       {"FakeQuantWithMinMaxArgs", ConvertFakeQuantWithMinMaxArgs},
       {"FakeQuantWithMinMaxVars", ConvertFakeQuantWithMinMaxVars},
-      {"Fill", ConvertSimpleOperator<FillOperator, 2>},
+      {"Fill", ConvertSimpleOperator<FillOperator, 2, 1>},
       {"Floor", ConvertFloorOperator},
-      {"FloorDiv", ConvertSimpleOperator<FloorDivOperator, 2>},
-      {"FloorMod", ConvertSimpleOperator<FloorModOperator, 2>},
+      {"FloorDiv", ConvertSimpleOperator<FloorDivOperator, 2, 1>},
+      {"FloorMod", ConvertSimpleOperator<FloorModOperator, 2, 1>},
       {"FusedBatchNorm", ConvertFusedBatchNormOperator},
       {"Gather", ConvertGatherOperator},
       {"GatherV2", ConvertGatherOperator},
-      {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2>},
+      {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2, 1>},
       {"GreaterEqual",
-       ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>},
+       ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2, 1>},
       {"Identity", ConvertIdentityOperator},
       {"LRN", ConvertLRNOperator},
       {"LeakyRelu", ConvertLeakyReluOperator},
       {"LegacyFedInput", ConvertPlaceholderOperator},
-      {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
-      {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
-      {"Log", ConvertSimpleOperator<LogOperator, 1>},
-      {"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2>},
-      {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2>},
-      {"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1>},
-      {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1>},
+      {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2, 1>},
+      {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2, 1>},
+      {"Log", ConvertSimpleOperator<LogOperator, 1, 1>},
+      {"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2, 1>},
+      {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2, 1>},
+      {"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1, 1>},
+      {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1, 1>},
       {"MatMul", ConvertMatMulOperator},
       {"Max", ConvertReduceOperator<TensorFlowMaxOperator>},
       {"MaxPool", ConvertMaxPoolOperator},
-      {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2>},
+      {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2, 1>},
       {"Mean", ConvertReduceOperator<MeanOperator>},
-      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2>},
+      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2, 1>},
       {"Min", ConvertReduceOperator<TensorFlowMinOperator>},
-      {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2>},
-      {"Mul", ConvertSimpleOperator<MulOperator, 2>},
-      {"Neg", ConvertSimpleOperator<NegOperator, 1>},
+      {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2, 1>},
+      {"Mul", ConvertSimpleOperator<MulOperator, 2, 1>},
+      {"Neg", ConvertSimpleOperator<NegOperator, 1, 1>},
       {"NextIteration", ConvertOperatorSpecialCasedAsRNNBackEdge},
       {"NoOp", ConvertNoOpOperator},
-      {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2>},
+      {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2, 1>},
       {"OneHot", ConvertOneHotOperator},
       {"Pack", ConvertPackOperator},
-      {"Pad", ConvertSimpleOperator<PadOperator, 2>},
-      {"PadV2", ConvertSimpleOperator<PadV2Operator, 3>},
+      {"Pad", ConvertSimpleOperator<PadOperator, 2, 1>},
+      {"PadV2", ConvertSimpleOperator<PadV2Operator, 3, 1>},
       {"ParallelDynamicStitch", ConvertDynamicStitchOperator},
       {"Placeholder", ConvertPlaceholderOperator},
       {"PlaceholderWithDefault", ConvertIdentityOperator},
-      {"Pow", ConvertSimpleOperator<PowOperator, 2>},
+      {"Pow", ConvertSimpleOperator<PowOperator, 2, 1>},
       {"Prod", ConvertReduceOperator<TensorFlowProdOperator>},
       {"RandomUniform", ConvertRandomUniform},
       {"Range", ConvertRangeOperator},
-      {"Rank", ConvertSimpleOperator<RankOperator, 1>},
-      {"RealDiv", ConvertSimpleOperator<DivOperator, 2>},
-      {"Relu", ConvertSimpleOperator<ReluOperator, 1>},
-      {"Relu6", ConvertSimpleOperator<Relu6Operator, 1>},
-      {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2>},
+      {"Rank", ConvertSimpleOperator<RankOperator, 1, 1>},
+      {"RealDiv", ConvertSimpleOperator<DivOperator, 2, 1>},
+      {"Relu", ConvertSimpleOperator<ReluOperator, 1, 1>},
+      {"Relu6", ConvertSimpleOperator<Relu6Operator, 1, 1>},
+      {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2, 1>},
       {"ResizeBilinear", ConvertResizeBilinearOperator},
       {"ResizeNearestNeighbor", ConvertResizeNearestNeighborOperator},
-      {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1>},
-      {"Select", ConvertSimpleOperator<SelectOperator, 3>},
+      {"ReverseV2", ConvertSimpleOperator<ReverseV2Operator, 2, 1>},
+      {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1, 1>},
+      {"Select", ConvertSimpleOperator<SelectOperator, 3, 1>},
       {"Shape", ConvertShapeOperator},
-      {"Sigmoid", ConvertSimpleOperator<LogisticOperator, 1>},
-      {"Sin", ConvertSimpleOperator<SinOperator, 1>},
-      {"Slice", ConvertSimpleOperator<SliceOperator, 3>},
+      {"Sigmoid", ConvertSimpleOperator<LogisticOperator, 1, 1>},
+      {"Sin", ConvertSimpleOperator<SinOperator, 1, 1>},
+      {"Slice", ConvertSimpleOperator<SliceOperator, 3, 1>},
       {"Softmax", ConvertSoftmaxOperator},
       {"SpaceToBatchND", ConvertSpaceToBatchNDOperator},
       {"SpaceToDepth", ConvertSpaceToDepthOperator},
       {"SparseToDense", ConvertSparseToDenseOperator},
       {"Split", ConvertSplitOperator},
       {"SplitV", ConvertSplitVOperator},
-      {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1>},
-      {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1>},
+      {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1, 1>},
+      {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1, 1>},
       {"SquaredDifference",
-       ConvertSimpleOperator<SquaredDifferenceOperator, 2>},
+       ConvertSimpleOperator<SquaredDifferenceOperator, 2, 1>},
       {"Squeeze", ConvertSqueezeOperator},
       {"StopGradient", ConvertIdentityOperator},
       {"StridedSlice", ConvertStridedSliceOperator},
-      {"Sub", ConvertSimpleOperator<SubOperator, 2>},
+      {"Sub", ConvertSimpleOperator<SubOperator, 2, 1>},
       {"Sum", ConvertReduceOperator<TensorFlowSumOperator>},
       {"Svdf", ConvertSvdfOperator},
       {"Switch", ConvertSwitchOperator},
-      {"Tanh", ConvertSimpleOperator<TanhOperator, 1>},
-      {"Tile", ConvertSimpleOperator<TensorFlowTileOperator, 2>},
+      {"Tanh", ConvertSimpleOperator<TanhOperator, 1, 1>},
+      {"Tile", ConvertSimpleOperator<TensorFlowTileOperator, 2, 1>},
       {"TopK", ConvertTopKV2Operator},
       {"TopKV2", ConvertTopKV2Operator},
-      {"Transpose", ConvertSimpleOperator<TransposeOperator, 2>},
+      {"Transpose", ConvertSimpleOperator<TransposeOperator, 2, 1>},
       {"Unpack", ConvertUnpackOperator},
-      {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1>},
+      {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1, 1>},
       {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm},
+      {"UnidirectionalSequenceRnn", ConvertUnidirectionalSequenceRnn},
       {"MirrorPad", ConvertMirrorPadOperator},
+      {"Unique", ConvertSimpleOperator<UniqueOperator, 1, 2>},
   });
 }
 
diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc
index ac020c104921e918613a30aece04ad896203258c..8ff3f7733afb4355a8e7863594633a6555287c10 100644
--- a/tensorflow/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/lite/toco/import_tensorflow_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/import_tensorflow.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -23,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace toco {
 
@@ -37,6 +39,7 @@ using tensorflow::DT_QUINT8;
 using tensorflow::DT_STRING;
 using tensorflow::NodeDef;
 using tensorflow::Status;
+using ::testing::ElementsAre;
 
 namespace internal {
 using ConverterType = tensorflow::Status (*)(
@@ -116,35 +119,35 @@ void BuildConstNode(std::initializer_list<int64_t> shape,
   switch (dtype) {
     case DT_FLOAT:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_float_val(i / 10000.0);
+        t.add_float_val(i / 10000.0 + 1);
       }
       break;
     case DT_INT32:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_int_val(i % std::numeric_limits<int>::max());
+        t.add_int_val(i % std::numeric_limits<int>::max() + 1);
       }
       break;
     case DT_QUINT8:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_int_val(i % std::numeric_limits<uint8_t>::max());
+        t.add_int_val(i % std::numeric_limits<uint8_t>::max() + 1);
       }
       break;
     case DT_INT64:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_int64_val(i);
+        t.add_int64_val(i + 1);
       }
       break;
     case DT_STRING:
       break;
     case DT_BOOL:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_bool_val(i % 2);
+        t.add_bool_val((i % 2) == 0);
       }
       break;
     case DT_COMPLEX64:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_scomplex_val(i / 10000.0);
-        t.add_scomplex_val(-i / 10000.0);
+        t.add_scomplex_val(i / 10000.0 + 1);
+        t.add_scomplex_val(-i / 10000.0 - 1);
       }
       break;
     default:
@@ -254,23 +257,126 @@ std::vector<tensorflow::DataType> TestTypes() {
   return {DT_FLOAT, DT_INT32, DT_INT64, DT_BOOL, DT_QUINT8, DT_COMPLEX64};
 }
 
-INSTANTIATE_TEST_CASE_P(ShapeImportTest, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
+INSTANTIATE_TEST_SUITE_P(ShapeImportTest, ShapeImportTest,
+                         ::testing::ValuesIn(TestTypes()));
+
+class ContentImportTest : public ::testing::Test {
+ public:
+  template <ArrayDataType T>
+  std::vector<DataType<T>> ImportAndGetData(const NodeDef& node) {
+    Model model;
+    auto status = ImportNode(node, &model);
+    CHECK(status.ok()) << status.error_message();
+    const auto& array = model.GetArray("Node1");
+    return array.GetBuffer<T>().data;
+  }
+  void RemoveTrailingElements(NodeDef* node, int num) {
+    tensorflow::TensorProto* p =
+        node->mutable_attr()->at("value").mutable_tensor();
+    for (int i = 0; i < num; ++i) {
+      if (p->int_val_size() > 0) p->mutable_int_val()->RemoveLast();
+      if (p->int64_val_size() > 0) p->mutable_int64_val()->RemoveLast();
+      if (p->float_val_size() > 0) p->mutable_float_val()->RemoveLast();
+      if (p->bool_val_size() > 0) p->mutable_bool_val()->RemoveLast();
+      if (p->scomplex_val_size() > 0) p->mutable_scomplex_val()->RemoveLast();
+      if (p->scomplex_val_size() > 0) p->mutable_scomplex_val()->RemoveLast();
+    }
+  }
+};
+
+TEST_F(ContentImportTest, Int32) {
+  constexpr ArrayDataType kType = ArrayDataType::kInt32;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_INT32, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 5));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Int64) {
+  constexpr ArrayDataType kType = ArrayDataType::kInt64;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_INT64, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 5));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Quint8) {
+  constexpr ArrayDataType kType = ArrayDataType::kUint8;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_QUINT8, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 5));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Bool) {
+  constexpr ArrayDataType kType = ArrayDataType::kBool;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_BOOL, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 0, 1, 0, 1, 0));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 0, 1, 0, 1, 1));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Float) {
+  constexpr ArrayDataType kType = ArrayDataType::kFloat;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_FLOAT, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0001, 1.0002, 1.0003, 1.0004, 1.0005));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0001, 1.0002, 1.0003, 1.0004, 1.0004));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000));
+}
+
+TEST_F(ContentImportTest, Complex64) {
+  constexpr ArrayDataType kType = ArrayDataType::kComplex64;
 
-TEST(ImportTest, Complex64ConstNode) {
   NodeDef node;
   BuildConstNode({1, 2, 3}, DT_COMPLEX64, 6, &node);
-  Model model;
-  EXPECT_TRUE(ImportNode(node, &model).ok());
-  const auto& array = model.GetArray("Node1");
-  EXPECT_EQ(ArrayDataType::kComplex64, array.data_type);
-  EXPECT_EQ(6, array.GetBuffer<ArrayDataType::kComplex64>().Length());
-  int64_t i = 0;
-  for (const auto& datum : array.GetBuffer<ArrayDataType::kComplex64>().data) {
-    EXPECT_EQ(i / 10000.0f, std::real(datum));
-    EXPECT_EQ(-i / 10000.0f, std::imag(datum));
-    i++;
-  }
+
+  using cplx = std::complex<float>;
+  EXPECT_THAT(
+      ImportAndGetData<kType>(node),
+      ElementsAre(std::complex<float>(1.0000, -1.0000), cplx(1.0001, -1.0001),
+                  cplx(1.0002, -1.0002), cplx(1.0003, -1.0003),
+                  cplx(1.0004, -1.0004), cplx(1.0005, -1.0005)));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(
+      ImportAndGetData<kType>(node),
+      ElementsAre(std::complex<float>(1.0000, -1.0000), cplx(1.0001, -1.0001),
+                  cplx(1.0002, -1.0002), cplx(1.0003, -1.0003),
+                  cplx(1.0004, -1.0004), cplx(1.0004, -1.0004)));
+
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(
+      ImportAndGetData<kType>(node),
+      ElementsAre(std::complex<float>(1.0000, -1.0000), cplx(1.0000, -1.0000),
+                  cplx(1.0000, -1.0000), cplx(1.0000, -1.0000),
+                  cplx(1.0000, -1.0000), cplx(1.0000, -1.0000)));
 }
 
 std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
@@ -312,8 +418,8 @@ TEST_P(TypeImportTest, BasicTypeInference) {
           model.operators[0].get());
   ASSERT_THAT(op->output_data_types, ::testing::ElementsAre(GetParam().second));
 }
-INSTANTIATE_TEST_CASE_P(BasicTypeInference, TypeImportTest,
-                        ::testing::ValuesIn(UnaryTestTypes()));
+INSTANTIATE_TEST_SUITE_P(BasicTypeInference, TypeImportTest,
+                         ::testing::ValuesIn(UnaryTestTypes()));
 
 TEST(ImportTest, TypeInferenceWithFixedOutputType) {
   // Create an op that has a fixed output type (bool).
@@ -460,3 +566,10 @@ TEST(ImportTest, UnsupportedOpWithMultipleOutputs) {
 
 }  // namespace
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index e71d36583e8ca3e94ef3aae699b3df4e4dfdd981..b99cb74a161dc8468e37ba8ceaac464d63f5abc8 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -42,6 +42,7 @@ enum class OperatorType : uint8 {
   kAveragePool,
   kBatchMatMul,
   kBatchNormalization,
+  kCeil,
   kConv,
   kConcatenation,
   kDepthwiseConv,
@@ -157,7 +158,12 @@ enum class OperatorType : uint8 {
   kResizeNearestNeighbor,
   kLeakyRelu,
   kAbs,
-  kMirrorPad
+  kMirrorPad,
+  kUnique,
+  kUnidirectionalSequenceRnn,
+  kBidirectionalSequenceLstm,
+  kReverseV2,
+  kBidirectionalSequenceRnn
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -647,6 +653,18 @@ struct UnidirectionalSequenceLstmOperator : Operator {
       : Operator(OperatorType::kUnidirectionalSequenceLstm) {}
 };
 
+struct BidirectionalSequenceLstmOperator : Operator {
+  BidirectionalSequenceLstmOperator()
+      : Operator(OperatorType::kBidirectionalSequenceLstm) {}
+  bool merge_outputs;
+};
+
+struct BidirectionalSequenceRnnOperator : Operator {
+  BidirectionalSequenceRnnOperator()
+      : Operator(OperatorType::kBidirectionalSequenceRnn) {}
+  bool merge_outputs;
+};
+
 // Element-wise multiplication operator.
 //
 // Inputs:
@@ -1658,6 +1676,16 @@ struct FloorOperator : Operator {
   FloorOperator() : Operator(OperatorType::kFloor) {}
 };
 
+// Ceil operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Ceil
+struct CeilOperator : Operator {
+  CeilOperator() : Operator(OperatorType::kCeil) {}
+};
+
 // Gather operator. It gathers slices from params according to indices.
 // Only 1-D indices are supported at the moment.
 //
@@ -1683,6 +1711,7 @@ struct GatherOperator : Operator {
 //
 // Inputs:
 //   inputs[0]: required: the input tensor
+//   inputs[1]: optional: 0-D (scalar) axis
 //
 // TensorFlow equivalent: ArgMax
 struct ArgMaxOperator : Operator {
@@ -1694,6 +1723,7 @@ struct ArgMaxOperator : Operator {
 //
 // Inputs:
 //   inputs[0]: required: the input tensor
+//   inputs[1]: optional: 0-D (scalar) axis
 //
 // TensorFlow equivalent: ArgMin
 struct ArgMinOperator : Operator {
@@ -1936,6 +1966,16 @@ struct TensorFlowZerosLikeOperator : Operator {
   TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
 };
 
+// ReverseV2 operator:
+//
+// Inputs:
+// Inputs[0]: required: the input array.
+//
+// TensorFlow equivalent: ReverseV2.
+struct ReverseV2Operator : Operator {
+  ReverseV2Operator() : Operator(OperatorType::kReverseV2) {}
+};
+
 enum class MirrorPadMode { kNone, kSymmetric, kReflect };
 
 // MirrorPad Operator:
@@ -1953,6 +1993,24 @@ struct MirrorPadOperator : Operator {
   MirrorPadMode mode;
 };
 
+// Unique Operator:
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Unique
+struct UniqueOperator : Operator {
+  UniqueOperator() : Operator(OperatorType::kUnique) {}
+  ArrayDataType idx_out_type = ArrayDataType::kInt32;
+};
+
+struct UnidirectionalSequenceRnnOperator : Operator {
+  UnidirectionalSequenceRnnOperator()
+      : Operator(OperatorType::kUnidirectionalSequenceRnn) {}
+  bool time_major;
+  FusedActivationFunctionType fused_activation_function;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
index 8a6e82ec46445b5ec5440de129177eae836f8db8..6da48333a292df6a9fd21da620be9bb00afe0b7d 100644
--- a/tensorflow/lite/toco/python/BUILD
+++ b/tensorflow/lite/toco/python/BUILD
@@ -1,5 +1,4 @@
 package(default_visibility = [
-    "//tensorflow/contrib/lite:__subpackages__",
     "//tensorflow/lite:__subpackages__",
     "//tensorflow/tools/pip_package:__subpackages__",
 ])
@@ -47,7 +46,6 @@ tf_py_wrap_cc(
     visibility = [
         "//learning/expander/pod/deep_pod/utils:__subpackages__",
         "//research/handwriting/converters/tflite:__subpackages__",
-        "//tensorflow/contrib/lite:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index abfd370b86e94d9152521e27eda186349d1d4176..f61488ef353bbc0c3a22c26c1cbd87755fdb4184 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/util/ptr_util.h"
 // TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
+#include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tflite/builtin_operator.h"
@@ -62,6 +63,11 @@ class AveragePool
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -467,6 +473,20 @@ class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const string& weights_feature_name = op_signature.op->inputs[1];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& weights_feature_array =
+        op_signature.model->GetArray(weights_feature_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // If the op is a signed int8 hybrid operation, we need to return
+    // version 2.
+    if (input_array.data_type == ArrayDataType::kFloat &&
+        weights_feature_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kFloat) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -706,6 +726,11 @@ class Softmax
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -795,9 +820,24 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
     const auto& lstm_op =
         static_cast<const LstmCellOperator&>(*op_signature.op);
     switch (lstm_op.kernel_type) {
-      case LstmCellOperator::KERNEL_FULL:
+      case LstmCellOperator::KERNEL_FULL: {
+        // If the input tensor is float and a weight is int8, this is a version
+        // 3 hybrid operation.
+        const string& input_name = op_signature.op->inputs[0];
+        const string& weights_name = op_signature.op->inputs[2];
+        const string& output_name = op_signature.op->outputs[0];
+        const Array& input_array = op_signature.model->GetArray(input_name);
+        const Array& weights_array = op_signature.model->GetArray(weights_name);
+        const Array& output_array = op_signature.model->GetArray(output_name);
+        if (input_array.data_type == ArrayDataType::kFloat &&
+            weights_array.data_type == ArrayDataType::kInt8 &&
+            output_array.data_type == ArrayDataType::kFloat) {
+          return 3;
+        }
         return 1;
+      }
       case LstmCellOperator::KERNEL_BASIC:
+        // KERNEL_BASIC was added in version 2.
         return 2;
     }
   }
@@ -850,6 +890,19 @@ class UnidirectionalSequenceLstm
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    // If the input tensor is float and a weight is int8, this is a version
+    // 2 hybrid operation.
+    const string& input_name = op_signature.op->inputs[0];
+    const string& weights_name = op_signature.op->inputs[2];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& weights_array = op_signature.model->GetArray(weights_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    if (input_array.data_type == ArrayDataType::kFloat &&
+        weights_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kFloat) {
+      return 2;
+    }
     return 1;
   }
 
@@ -862,6 +915,94 @@ class UnidirectionalSequenceLstm
   }
 };
 
+class BidirectionalSequenceLstm
+    : public BuiltinOperator<
+          BidirectionalSequenceLstmOperator,
+          ::tflite::BidirectionalSequenceLSTMOptions,
+          ::tflite::BuiltinOptions_BidirectionalSequenceLSTMOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateBidirectionalSequenceLSTMOptions(
+        *builder, /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*cell_clip=*/0.0,
+        /*proj_clip=*/0.0,
+        /*merge_outputs=*/op.merge_outputs,
+        /*time_major=*/true);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+    op->merge_outputs = options.merge_outputs();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    // Forward input activation state.
+    mutating_input_variables[35] = true;
+    // Forward input cell state.
+    mutating_input_variables[36] = true;
+    // Backward input activation state.
+    mutating_input_variables[37] = true;
+    // Backward input cell state.
+    mutating_input_variables[38] = true;
+    return mutating_input_variables;
+  }
+};
+
+class BidirectionalSequenceRnn
+    : public BuiltinOperator<
+          BidirectionalSequenceRnnOperator,
+          ::tflite::BidirectionalSequenceRNNOptions,
+          ::tflite::BuiltinOptions_BidirectionalSequenceRNNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateBidirectionalSequenceRNNOptions(
+        *builder, /*time_major=*/true,
+        /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*merge_outputs=*/op.merge_outputs);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+    op->merge_outputs = options.merge_outputs();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    // Forward hidden state.
+    mutating_input_variables[4] = true;
+    // Backward hidden state.
+    mutating_input_variables[8] = true;
+    return mutating_input_variables;
+  }
+};
+
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::ReducerOptions,
                                     ::tflite::BuiltinOptions_ReducerOptions> {
  public:
@@ -1426,9 +1567,67 @@ class MirrorPad
                    : MirrorPadMode::kSymmetric;
   }
 
+  int GetVersion(const OperatorSignature& op) const override { return 1; }
+};
+
+class Unique : public BuiltinOperator<UniqueOperator, ::tflite::UniqueOptions,
+                                      ::tflite::BuiltinOptions_UniqueOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    const UniqueOperator& unique_op = static_cast<const UniqueOperator&>(op);
+    return ::tflite::CreateUniqueOptions(
+        *builder, unique_op.idx_out_type == toco::ArrayDataType::kInt64
+                      ? ::tflite::TensorType::TensorType_INT64
+                      : ::tflite::TensorType_INT32);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    UniqueOperator* unique_op = static_cast<UniqueOperator*>(op);
+    unique_op->idx_out_type =
+        options.idx_out_type() == ::tflite::TensorType_INT64
+            ? toco::ArrayDataType::kInt64
+            : toco::ArrayDataType::kInt32;
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
+class UnidirectionalSequenceRnn
+    : public BuiltinOperator<UnidirectionalSequenceRnnOperator,
+                             ::tflite::SequenceRNNOptions,
+                             ::tflite::BuiltinOptions_SequenceRNNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSequenceRNNOptions(
+        *builder, /*time_major=*/true,
+        /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh actication, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+  }
+
   int GetVersion(const OperatorSignature& op_signature) const override {
     return 1;
   }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    mutating_input_variables[4] = true;
+    return mutating_input_variables;
+  }
 };
 
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
@@ -1529,13 +1728,27 @@ class TensorFlowUnsupported : public BaseOperator {
           has_valid_attr = true;
           break;
         case tensorflow::AttrValue::kList:
-          if (attr.list().i_size() > 0) {
+          if (attr.list().s_size() > 0) {
+            auto start = fbb->StartVector(key);
+            for (const string& v : attr.list().s()) {
+              fbb->Add(v);
+            }
+            fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
+            has_valid_attr = true;
+          } else if (attr.list().i_size() > 0) {
             auto start = fbb->StartVector(key);
             for (const int64_t v : attr.list().i()) {
               fbb->Add(v);
             }
             fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
             has_valid_attr = true;
+          } else if (attr.list().f_size() > 0) {
+            auto start = fbb->StartVector(key);
+            for (const float v : attr.list().f()) {
+              fbb->Add(v);
+            }
+            fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
+            has_valid_attr = true;
           } else {
             LOG(WARNING)
                 << "Ignoring unsupported type in list attribute with key '"
@@ -1556,10 +1769,6 @@ class TensorFlowUnsupported : public BaseOperator {
     return std::unique_ptr<flexbuffers::Builder>(fbb.release());
   }
 
-// TODO(wvo): hack to make this code compile with 2 different API versions.
-// Please remove once OS/internal versions are in sync.
-// See hardcoded values in the switch below.
-
   void ReadOptions(const flexbuffers::Map& m,
                    TensorFlowUnsupportedOperator* op) const {
     ::tensorflow::NodeDef node_def;
@@ -1569,6 +1778,10 @@ class TensorFlowUnsupported : public BaseOperator {
     for (size_t i = 0; i < keys.size(); ++i) {
       const auto key = keys[i].AsKey();
       const auto& value = m[key];
+      // TODO(wvo): hack to make this code compile with 2 different API
+      // versions.
+      // Please remove once OS/internal versions are in sync.
+      // See hardcoded values in the switch below.
       switch (value.GetType()) {
         case 5:  // flexbuffers::FBT_STRING:
           (*attr)[key].set_s(value.AsString().c_str());
@@ -1596,6 +1809,22 @@ class TensorFlowUnsupported : public BaseOperator {
           }
           break;
         }
+        case 13: {  // flexbuffers::FBT_VECTOR_FLOAT: {
+          auto* list = (*attr)[key].mutable_list();
+          const auto& vector = value.AsTypedVector();
+          for (size_t i = 0; i < vector.size(); i++) {
+            list->add_f(vector[i].AsFloat());
+          }
+          break;
+        }
+        case 15: {  // flexbuffers::FBT_VECTOR_STRING: {
+          auto* list = (*attr)[key].mutable_list();
+          const auto& vector = value.AsTypedVector();
+          for (size_t i = 0; i < vector.size(); i++) {
+            list->add_s(vector[i].AsString().str());
+          }
+          break;
+        }
         default:
           LOG(WARNING) << "Ignoring unsupported attribute type with key '"
                        << key << "'";
@@ -1756,6 +1985,12 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.emplace_back(MakeUnique<UnidirectionalSequenceLstm>(
       ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
       OperatorType::kUnidirectionalSequenceLstm));
+  ops.emplace_back(MakeUnique<BidirectionalSequenceLstm>(
+      ::tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+      OperatorType::kBidirectionalSequenceLstm));
+  ops.emplace_back(MakeUnique<BidirectionalSequenceRnn>(
+      ::tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+      OperatorType::kBidirectionalSequenceRnn));
   ops.push_back(MakeUnique<OneHot>(::tflite::BuiltinOperator_ONE_HOT,
                                    OperatorType::kOneHot));
   ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
@@ -1767,6 +2002,11 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       OperatorType::kSquaredDifference));
   ops.push_back(MakeUnique<MirrorPad>(::tflite::BuiltinOperator_MIRROR_PAD,
                                       OperatorType::kMirrorPad));
+  ops.push_back(MakeUnique<Unique>(::tflite::BuiltinOperator_UNIQUE,
+                                   OperatorType::kUnique));
+  ops.push_back(MakeUnique<UnidirectionalSequenceRnn>(
+      ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+      OperatorType::kUnidirectionalSequenceRnn));
 
   // Custom Operators.
   ops.push_back(
@@ -1784,6 +2024,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   // builtins.
   ops.push_back(
       MakeUnique<SimpleOperator<FloorOperator>>("FLOOR", OperatorType::kFloor));
+  ops.push_back(
+      MakeUnique<SimpleOperator<CeilOperator>>("CEIL", OperatorType::kCeil));
   ops.push_back(
       MakeUnique<SimpleOperator<ReluOperator>>("RELU", OperatorType::kRelu));
   ops.push_back(MakeUnique<SimpleOperator<Relu1Operator>>(
@@ -1853,6 +2095,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       MakeUnique<SimpleOperator<AbsOperator>>("ABS", OperatorType::kAbs));
   ops.push_back(
       MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
+  ops.push_back(MakeUnique<SimpleOperator<ReverseV2Operator>>(
+      "REVERSE_V2", OperatorType::kReverseV2));
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index f2f7221eb1dc7a78d5da1e361e819c6476658493..88f68f7ebf919bc53255cff46dc3b0178093a70e 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -112,6 +112,7 @@ class OperatorTest : public ::testing::Test {
 
 TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
+  CheckSimpleOperator<CeilOperator>("CEIL", OperatorType::kCeil);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
@@ -150,6 +151,8 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FloorModOperator>("FLOOR_MOD", OperatorType::kFloorMod);
   CheckSimpleOperator<RangeOperator>("RANGE", OperatorType::kRange);
   CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
+  CheckSimpleOperator<ReverseV2Operator>("REVERSE_V2",
+                                         OperatorType::kReverseV2);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -569,6 +572,20 @@ TEST_F(OperatorTest, TensorFlowUnsupported) {
   (*attr)["str_attr"].set_s("Hello World");
   (*attr)["int_attr"].set_i(17);
   (*attr)["bool_attr"].set_b(true);
+  {
+    auto* list = (*attr)["list_string_attr"].mutable_list();
+    list->add_s("abcde");
+    list->add_s("1234");
+    list->add_s("");
+    list->add_s("zyxwv");
+    list->add_s("!-.");
+  }
+  {
+    auto* list = (*attr)["list_float_attr"].mutable_list();
+    list->add_f(std::numeric_limits<float>::min());
+    list->add_f(2.0);
+    list->add_f(-std::numeric_limits<float>::max());
+  }
   {
     auto* list = (*attr)["list_int_attr"].mutable_list();
     list->add_i(1);
@@ -588,7 +605,22 @@ TEST_F(OperatorTest, TensorFlowUnsupported) {
   EXPECT_EQ("Hello World", output_attr.at("str_attr").s());
   EXPECT_EQ(17, output_attr.at("int_attr").i());
   EXPECT_EQ(true, output_attr.at("bool_attr").b());
-
+  {
+    const auto& list = output_attr.at("list_string_attr").list();
+    ASSERT_EQ(5, list.s_size());
+    EXPECT_EQ("abcde", list.s(0));
+    EXPECT_EQ("1234", list.s(1));
+    EXPECT_EQ("", list.s(2));
+    EXPECT_EQ("zyxwv", list.s(3));
+    EXPECT_EQ("!-.", list.s(4));
+  }
+  {
+    const auto& list = output_attr.at("list_float_attr").list();
+    ASSERT_EQ(3, list.f_size());
+    EXPECT_EQ(std::numeric_limits<float>::min(), list.f(0));
+    EXPECT_EQ(2.0, list.f(1));
+    EXPECT_EQ(-std::numeric_limits<float>::max(), list.f(2));
+  }
   {
     const auto& list = output_attr.at("list_int_attr").list();
     ASSERT_EQ(4, list.i_size());
@@ -614,10 +646,11 @@ TEST_F(OperatorTest, TestShouldExportAsFlexOp) {
   EXPECT_FALSE(ShouldExportAsFlexOp(false, "Conv2D"));
   EXPECT_TRUE(ShouldExportAsFlexOp(true, "Conv2D"));
   EXPECT_TRUE(ShouldExportAsFlexOp(true, "EluGrad"));
+  EXPECT_TRUE(ShouldExportAsFlexOp(true, "RFFT"));
   EXPECT_FALSE(ShouldExportAsFlexOp(true, "MyAwesomeCustomOp"));
-  // While the RFFT op is available on desktop, it is not in the kernel
+  // While the RandomShuffle op is available on desktop, it is not in the kernel
   // set available on mobile and should be excluded.
-  EXPECT_FALSE(ShouldExportAsFlexOp(true, "RFFT"));
+  EXPECT_FALSE(ShouldExportAsFlexOp(true, "RandomShuffle"));
 }
 
 TEST_F(OperatorTest, BuiltinMirrorPad) {
@@ -628,6 +661,15 @@ TEST_F(OperatorTest, BuiltinMirrorPad) {
   EXPECT_EQ(op.mode, output_toco_op->mode);
 }
 
+TEST_F(OperatorTest, BuiltinUnique) {
+  UniqueOperator op;
+  op.idx_out_type = ArrayDataType::kInt64;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("UNIQUE", OperatorType::kUnique), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+  EXPECT_EQ(output_toco_op->idx_out_type, op.idx_out_type);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
index 039a918af16019292214f982326fba3eb5695c62..1b337ebc85f627b2ee90824cacd2a1f9a090428c 100644
--- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -68,6 +68,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "BroadcastArgs",
           "BroadcastGradientArgs",
           "Cast",
+          "Ceil",
           "CheckNumerics",
           "ComplexAbs",
           "Concat",
@@ -118,6 +119,9 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "FakeQuantWithMinMaxVarsPerChannel",
           "FakeQuantWithMinMaxVarsPerChannelGradient",
           "FakeQueue",
+          "FFT",
+          "FFT2D",
+          "FFT3D",
           "FIFOQueue",
           "FIFOQueueV2",
           "Fill",
@@ -143,6 +147,12 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "_HostSend",
           "Identity",
           "IdentityN",
+          "IFFT",
+          "IFFT2D",
+          "IFFT3D",
+          "IRFFT",
+          "IRFFT2D",
+          "IRFFT3D",
           "ImmutableConst",
           "InTopK",
           "InTopKV2",
@@ -311,6 +321,9 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "Reverse",
           "ReverseSequence",
           "ReverseV2",
+          "RFFT",
+          "RFFT2D",
+          "RFFT3D",
           "Round",
           "Rsqrt",
           "RsqrtGrad",
diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc
index c3c440db94396def2f8cfd40242642767d11a63a..739b924607e7aa60bcdb6f081de52aed65a87d58 100644
--- a/tensorflow/lite/toco/toco_convert_test.cc
+++ b/tensorflow/lite/toco/toco_convert_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 #include "tensorflow/lite/toco/toco_convert.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 namespace toco {
 namespace {
@@ -171,3 +173,10 @@ TEST(TocoTest, TransientStringTensors) {
 
 }  // namespace
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/toco_port.cc b/tensorflow/lite/toco/toco_port.cc
index fb8c1b8337f1e509ed9c9ee2522e63e84d143927..b222032e61418224efddbae2c6ec2f110286ab0b 100644
--- a/tensorflow/lite/toco/toco_port.cc
+++ b/tensorflow/lite/toco/toco_port.cc
@@ -57,6 +57,11 @@ void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
   ::InitGoogle(usage, argc, argv, remove_flags);
 }
 
+void InitGoogleWasDoneElsewhere() {
+  // Nothing need be done since ::CheckInitGoogleIsDone() is aware of other
+  // possible initialization entry points.
+}
+
 void CheckInitGoogleIsDone(const char* message) {
   ::CheckInitGoogleIsDone(message);
 }
@@ -152,6 +157,8 @@ constexpr int kFileWriteFlags = O_CREAT | O_WRONLY;
 
 static bool port_initialized = false;
 
+void InitGoogleWasDoneElsewhere() { port_initialized = true; }
+
 void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
   if (!port_initialized) {
 #if defined(PLATFORM_GOOGLE)
diff --git a/tensorflow/lite/toco/toco_port.h b/tensorflow/lite/toco/toco_port.h
index 2f39e3d6d5c02457e9ade320e7525fbf881b5389..231612ecd43f3d77fc959a38642690ff6beed19b 100644
--- a/tensorflow/lite/toco/toco_port.h
+++ b/tensorflow/lite/toco/toco_port.h
@@ -55,6 +55,10 @@ double round(double x);
 namespace toco {
 namespace port {
 
+// Things like tests use other initialization routines that need control
+// of flags. However, for testing we still want to use toco_port.h facilities.
+// This function sets initialized flag trivially.
+void InitGoogleWasDoneElsewhere();
 void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags);
 void CheckInitGoogleIsDone(const char* message);
 
diff --git a/tensorflow/lite/toco/toco_port_test.cc b/tensorflow/lite/toco/toco_port_test.cc
index f5fbb4caeb2882d51c4b586293eb202fcf60a9de..997da58b8f64386dfbf6e41ff5838373dd8d64c2 100644
--- a/tensorflow/lite/toco/toco_port_test.cc
+++ b/tensorflow/lite/toco/toco_port_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/toco/toco_types.h"
 
 #include <gmock/gmock.h>
@@ -56,3 +57,10 @@ TEST(TocoPortTest, JoinPath) {
 }  // namespace
 }  // namespace port
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index 55a454e66de4d0afce18421450d875911bea01f4..69d7a7a61a5065ba0284edb3b0fdecf40df5d267 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -178,6 +178,23 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
       // Ignore non-real data types.
       continue;
     }
+    // The enum value QUANTIZED_UINT8 for --inference_type and
+    // --inference_input_type has long meant just 'QUANTIZED', being used as
+    // well in mixed 8-bit / 16-bit quantized models. However,
+    // ConvertIODataTypeToArrayDataType still interpretes it as meaning 8bit,
+    // and people have run into issues in the situation where they have an
+    // already mixed 8-bit / 16-bit quantized model in TFLITE format and
+    // want to run it again through toco, without having to re-specify all the
+    // extra array info that was used in the (complicated) process of initially
+    // quantizing that model. In order to have --inference_type=QUANTIZED_UINT8
+    // just work in that case, we implement the logic that when an array is
+    // already quantized, if  --inference_type is quantized (so we're not
+    // asking to dequantize here), no change of quantized data type is to be
+    // recorded.
+    if (array->data_type != toco::ArrayDataType::kFloat &&
+        type != toco::ArrayDataType::kFloat) {
+      continue;
+    }
 
     array->final_data_type = type;
   }
@@ -306,6 +323,14 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
                             });
   }
 
+  // Try to merge bidirectional sequence lstm or rnn if present.
+  GraphTransformationsSet bidirectional_transformations;
+  bidirectional_transformations.Add(new RemoveUnusedOp);
+  bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceLstm);
+  bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceRnn);
+  RunGraphTransformations(model, "Group bidirectional sequence lstm/rnn",
+                          bidirectional_transformations);
+
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
   FixEdgeArrays(model);
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index af4cd386a209d82cb56a877410abe6fbdbf99c7b..6978dde668af932d44ed824d4724234d3b96d3d9 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -173,7 +173,7 @@ bool DeleteArrayIfUsedOnce(const string& array_name, Model* model) {
   return false;
 }
 
-void DeleteOpAndArraysIfUnused(Model* model, Operator* op) {
+void DeleteOpAndArraysIfUnused(Model* model, const Operator* op) {
   for (const string& array_name : op->inputs) {
     DeleteArrayIfUsedOnce(array_name, model);
   }
@@ -385,6 +385,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ConcatV2)
     HANDLE_OPERATORTYPENAME_CASE(Cast)
     HANDLE_OPERATORTYPENAME_CASE(Floor)
+    HANDLE_OPERATORTYPENAME_CASE(Ceil)
     HANDLE_OPERATORTYPENAME_CASE(Gather)
     HANDLE_OPERATORTYPENAME_CASE(ResizeBilinear)
     HANDLE_OPERATORTYPENAME_CASE(SpaceToBatchND)
@@ -412,10 +413,15 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Unpack)
     HANDLE_OPERATORTYPENAME_CASE(ZerosLike)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(BidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(BidirectionalSequenceRnn)
     HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
     HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
     HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
     HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
+    HANDLE_OPERATORTYPENAME_CASE(Unique)
+    HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceRnn)
+    HANDLE_OPERATORTYPENAME_CASE(ReverseV2)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
diff --git a/tensorflow/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h
index 53131824b532853afc1660354de92da40db0da86..517da784d0e6395eb06a0bf0fb9004645d292e42 100644
--- a/tensorflow/lite/toco/tooling_util.h
+++ b/tensorflow/lite/toco/tooling_util.h
@@ -72,7 +72,7 @@ bool DeleteArrayIfUsedOnce(const string& array_name, Model* model);
 
 // Deletes the op and any of its input and output arrays if they are unused
 // after the op has been deleted.
-void DeleteOpAndArraysIfUnused(Model* model, Operator* op);
+void DeleteOpAndArraysIfUnused(Model* model, const Operator* op);
 
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
     const Model& model, const string& array_name);
diff --git a/tensorflow/lite/toco/tooling_util_test.cc b/tensorflow/lite/toco/tooling_util_test.cc
index 6f1c9c563ada01891b67094caa93cfd1847cdf6b..f063ce71e9156ce85b7b4fe1bfeb8ad5d57cda0c 100644
--- a/tensorflow/lite/toco/tooling_util_test.cc
+++ b/tensorflow/lite/toco/tooling_util_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/lib/core/status.h"
 
 namespace toco {
 
@@ -94,8 +96,8 @@ TEST_P(ShapeTest, Agrees) {
   }
 }
 
-INSTANTIATE_TEST_CASE_P(AgreeBroadcast, ShapeTest,
-                        ::testing::ValuesIn(CreateShapePairs()));
+INSTANTIATE_TEST_SUITE_P(AgreeBroadcast, ShapeTest,
+                         ::testing::ValuesIn(CreateShapePairs()));
 
 static const char kNegativeValuesMessage[] =
     "Tensor shape should not include negative values";
@@ -203,3 +205,10 @@ TEST(FusedActivationTest, DefaultsToUnfused) {
 }
 
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
index 9a74e221c13e72c286512175a7f633c87f75eedd..129747fe4d5c93630f9f6552a9486cbe8f8c37b7 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
@@ -22,6 +22,12 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
 #include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
 #include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
@@ -29,12 +35,6 @@ limitations under the License.
 #include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
 #include "tensorflow/lite/tools/accuracy/utils.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace {
 using tensorflow::string;
@@ -185,21 +185,17 @@ Status EvaluateModelForShard(const uint64_t shard_id,
   const TensorShape& input_shape = model_info.input_shapes[0];
   const int image_height = input_shape.dim_size(1);
   const int image_width = input_shape.dim_size(2);
-  const bool is_quantized = (model_info.input_types[0] == DT_UINT8);
 
   RunTFLiteModelStage::Params tfl_model_params;
   tfl_model_params.model_file_path = params.model_file_path;
-  if (is_quantized) {
-    tfl_model_params.input_type = {DT_UINT8};
-    tfl_model_params.output_type = {DT_UINT8};
-  } else {
-    tfl_model_params.input_type = {DT_FLOAT};
-    tfl_model_params.output_type = {DT_FLOAT};
-  }
+
+  tfl_model_params.input_type = {model_info.input_types[0]};
+  tfl_model_params.output_type = {model_info.input_types[0]};
 
   Scope root = Scope::NewRootScope();
   FileReaderStage reader;
-  InceptionPreprocessingStage inc(image_height, image_width, is_quantized);
+  InceptionPreprocessingStage inc(image_height, image_width,
+                                  model_info.input_types[0]);
   RunTFLiteModelStage tfl_model_stage(tfl_model_params);
   EvalPipelineBuilder builder;
 
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
index 2b086cdf7075d7e6328ce0a41b17ca611ea3c4e2..f5642d52a89d86930023fd21a6d81e628073927c 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
@@ -67,11 +67,18 @@ Status ImagenetTopKAccuracy::ComputeEval(
     for (size_t i = 0; i < probs.size(); i++) {
       probabilities.push_back(probs(i));
     }
-  } else {
+  } else if (output.dtype() == DT_UINT8) {
     auto probs = output.flat<uint8>();
     for (size_t i = 0; i < probs.size(); i++) {
       probabilities.push_back(probs(i));
     }
+  } else if (output.dtype() == DT_INT8) {
+    auto probs = output.flat<int8>();
+    for (size_t i = 0; i < probs.size(); i++) {
+      probabilities.push_back(probs(i));
+    }
+  } else {
+    return errors::InvalidArgument("Invalid datatype");
   }
 
   CHECK_EQ(kNumCategories, probabilities.size());
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
index 9a889f0dd88bc4c51b2c060baf0e89c126c98c1f..04b6cb755892bd218d899587bd81b818a51f85d8 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -57,23 +57,33 @@ void InceptionPreprocessingStage::AddToGraph(const Scope& scope,
   tensorflow::Output cropped_image;
   CentralCropImage(s, decoded_jpeg, params_.cropping_fraction, &cropped_image);
   auto dims_expander = ops::ExpandDims(s, cropped_image, 0);
-  auto resized_image = ops::ResizeBilinear(
-      s, dims_expander,
-      ops::Const(s.WithOpName("size"), {image_height_, image_width_}));
-  if (is_quantized_) {
-    this->stage_output_ =
-        ops::Cast(s.WithOpName(output_name()), resized_image, DT_UINT8);
-  } else {
-    auto squeezed_image = ops::Squeeze(s, resized_image);
-    auto normalized_image =
-        ops::Div(s,
-                 ops::Sub(s, squeezed_image,
-                          {params_.input_means[0], params_.input_means[1],
-                           params_.input_means[2]}),
-                 {params_.scale});
-    this->stage_output_ =
-        ops::ExpandDims(s.WithOpName(output_name()), normalized_image, {0});
+  auto resized_image =
+      ops::ResizeBilinear(s.WithOpName("resize"), dims_expander,
+                          ops::Const(s, {image_height_, image_width_}));
+
+  ::tensorflow::Output preprocessed_image = resized_image;
+
+  if (!params_.input_means.empty()) {
+    preprocessed_image =
+        ops::Sub(s.WithOpName("sub"), preprocessed_image,
+                 {params_.input_means[0], params_.input_means[1],
+                  params_.input_means[2]});
+  }
+
+  if (std::abs(params_.scale) > 1e-7f) {
+    auto squeezed_image = ops::Squeeze(s, preprocessed_image);
+    preprocessed_image = ops::Div(s, squeezed_image, {params_.scale});
+    preprocessed_image = ops::ExpandDims(s, preprocessed_image, {0});
   }
+
+  // Cast the output from float to output datatype.
+  if (output_datatype_ != DT_FLOAT) {
+    preprocessed_image =
+        ops::Cast(s.WithOpName("cast"), preprocessed_image, output_datatype_);
+  }
+
+  this->stage_output_ =
+      ops::Identity(s.WithOpName(output_name()), preprocessed_image);
 }
 
 }  // namespace metrics
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
index 4a1d3ce4769d1a7d3f46f39941eb3e9bcde7785c..371feb3e76515a714286983a393c10dbaf4be3c8 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
 
 #include <utility>
 
-#include "tensorflow/lite/tools/accuracy/stage.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -31,28 +31,53 @@ namespace metrics {
 // shape {1, image_height, image_width, 3}, where 3 is the number of channels.
 class InceptionPreprocessingStage : public Stage {
  public:
+  // Preprocessing params that govern scaling and normalization of channels of
+  // the image.
   struct Params {
+    // Input means are subtracted from each channel.
+    // In case of an empty vector this is skipped.
     std::vector<float> input_means;
+    // Scale is used to divide the input.
+    // A scale of 0 means divison is skipped.
     float scale;
     double cropping_fraction;
   };
 
-  static Params DefaultParams() {
-    return {.input_means = {127.5, 127.5, 127.5},
-            .scale = 127.5,
-            .cropping_fraction = 0.875};
+  // Default preprocessing for inception stage based on |output_type|
+  static Params DefaultParamsForType(DataType output_type) {
+    const float kCroppingFraction = 0.875;
+    Params params = {};
+    params.cropping_fraction = kCroppingFraction;
+    if (output_type == DT_UINT8) {
+    } else if (output_type == DT_INT8) {
+      params.input_means = {128.0, 128.0, 128.0};
+    } else {
+      // Assume floating point preprocessing.
+      params.input_means = {127.5, 127.5, 127.5};
+      params.scale = 127.5;
+    }
+    return params;
+  }
+
+  // Creates a new preprocessing stage object with provided |image_width|
+  // |image_height| as the size of output image.
+  // |output_datatype| is the datatype of output of the stage.
+  InceptionPreprocessingStage(int image_width, int image_height,
+                              DataType output_datatype)
+      : output_datatype_(output_datatype),
+        image_width_(image_width),
+        image_height_(image_height) {
+    params_ = DefaultParamsForType(output_datatype);
   }
 
   // Creates a new preprocessing stage object with provided |image_width|
   // |image_height| as the size of output image.
-  // If |is_quantized| is set to true then |params| is ignored since quantized
-  // images don't go through any preprocessing.
+  // |output_datatype| is the datatype of output of the stage.
   InceptionPreprocessingStage(int image_width, int image_height,
-                              bool is_quantized,
-                              Params params = DefaultParams())
-      : image_width_(image_width),
+                              DataType output_datatype, Params params)
+      : output_datatype_(output_datatype),
+        image_width_(image_width),
         image_height_(image_height),
-        is_quantized_(is_quantized),
         params_(std::move(params)) {}
 
   string name() const override { return "stage_inception_preprocess"; }
@@ -63,6 +88,7 @@ class InceptionPreprocessingStage : public Stage {
   void AddToGraph(const Scope& scope, const Input& input) override;
 
  private:
+  DataType output_datatype_;
   int image_width_;
   int image_height_;
   bool is_quantized_;
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
index 5d0e01d7d18c451b978edbd08fc27934c8379961..f88847035f21ee41eb7403aae99c9d7db1484499 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 
 namespace {
 tensorflow::string* g_test_image_file = nullptr;
@@ -48,7 +48,7 @@ Status GetContents(const string& filename, string* output) {
   }
 }
 
-TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
+TEST(InceptionPreprocessingTest, TestImagePreprocessUInt8Quantized) {
   ASSERT_TRUE(g_test_image_file != nullptr);
   string image_contents;
   string image_path = *g_test_image_file;
@@ -56,8 +56,8 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
   ASSERT_TRUE(status.ok()) << status.error_message();
   const int width = 224;
   const int height = 224;
-  const bool is_quantized = true;
-  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_UINT8);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_UINT8, params);
   Scope scope = Scope::NewRootScope();
   preprocess_stage.AddToGraph(scope, image_contents);
   TF_CHECK_OK(scope.status());
@@ -77,6 +77,35 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
   EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
 }
 
+TEST(InceptionPreprocessingTest, TestImagePreprocessInt8Quantized) {
+  ASSERT_TRUE(g_test_image_file != nullptr);
+  string image_contents;
+  string image_path = *g_test_image_file;
+  auto status = GetContents(image_path, &image_contents);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+  const int width = 224;
+  const int height = 224;
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_INT8);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_INT8, params);
+  Scope scope = Scope::NewRootScope();
+  preprocess_stage.AddToGraph(scope, image_contents);
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                                   /*inputs*/
+                   {preprocess_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  TF_CHECK_OK(run_status);
+  EXPECT_EQ(1, outputs.size());
+  EXPECT_EQ(DT_INT8, outputs[0].dtype());
+  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
+}
+
 TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
   ASSERT_TRUE(g_test_image_file != nullptr);
   string image_contents;
@@ -85,8 +114,8 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
   ASSERT_TRUE(status.ok()) << status.error_message();
   const int width = 224;
   const int height = 224;
-  const bool is_quantized = false;
-  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_FLOAT);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_FLOAT, params);
   Scope scope = Scope::NewRootScope();
   preprocess_stage.AddToGraph(scope, image_contents);
   TF_CHECK_OK(scope.status());
diff --git a/tensorflow/lite/tools/accuracy/utils.cc b/tensorflow/lite/tools/accuracy/utils.cc
index c19dc1ff7cca10745a367c027bef1067d117eb4a..953892b8ddff2e60d2e1618df97d867b2d553c29 100644
--- a/tensorflow/lite/tools/accuracy/utils.cc
+++ b/tensorflow/lite/tools/accuracy/utils.cc
@@ -38,6 +38,12 @@ DataType GetTFDataType(TfLiteType tflite_type) {
       return DT_FLOAT;
     case kTfLiteUInt8:
       return DT_UINT8;
+    case kTfLiteInt8:
+      return DT_INT8;
+    case kTfLiteInt32:
+      return DT_INT32;
+    case kTfLiteInt64:
+      return DT_INT64;
     default:
       return DT_INVALID;
   }
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index bc47406cd92d406a0900743986ea67a4ba39240e..ce31eaf42f170b6ce52a961bb984197313e63f96 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -4,6 +4,7 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
@@ -35,7 +36,7 @@ cc_binary(
     ],
 )
 
-cc_binary(
+tf_cc_binary(
     name = "benchmark_model_plus_flex",
     srcs = [
         "benchmark_plus_flex_main.cc",
@@ -140,10 +141,6 @@ cc_library(
         ":logging",
         "//tensorflow/core:stats_calculator_portable",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/profiling:profile_summarizer",
-        "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/profiling:time",
     ],
 )
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index a4d9c879eb645019a7626502207e9a3f4e89b1c1..e6ba818c71f23f39e511b7866ce2356848d46493 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -5,7 +5,7 @@
 A simple C++ binary to benchmark a TFLite model and its individual operators,
 both on desktop machines and on Android. The binary takes a TFLite model,
 generates random inputs and then repeatedly runs the model for specified number
-of runs. Aggregrate latency statistics are reported after running the benchmark.
+of runs. Aggregate latency statistics are reported after running the benchmark.
 
 The instructions below are for running the binary on Desktop and Android,
 for iOS please use the
diff --git a/tensorflow/lite/tools/benchmark/android/README.md b/tensorflow/lite/tools/benchmark/android/README.md
index f5b67e3f79aa669c5424d46c23f053213ad3a101..db82c59acd3de38bbd8ffcf1542f34adf02c9098 100644
--- a/tensorflow/lite/tools/benchmark/android/README.md
+++ b/tensorflow/lite/tools/benchmark/android/README.md
@@ -51,7 +51,7 @@ and can be appended to the `args` string alongside the required `--graph` flag
 args key).
 
 ```
-adb shell am start -S -n
+adb shell am start -S -n \
   org.tensorflow.lite.benchmark/org.tensorflow.lite.benchmark.BenchmarkModelActivity \
   --es args '"--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite --num_threads=4"'
 ```
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc
index e9b485efcaa81b011c598d5dfa39d4f253090dc8..70f4c94d3588b1645ce6c8422ca3cfe94eddc8e6 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/lite/tools/benchmark/benchmark_model.h"
 
-#include <time.h>
-
 #include <iostream>
 #include <sstream>
 
@@ -28,18 +26,11 @@ void SleepForSeconds(double sleep_seconds) {
   if (sleep_seconds <= 0.0) {
     return;
   }
-  // Convert the run_delay string into a timespec.
-  timespec req;
-  req.tv_sec = static_cast<time_t>(sleep_seconds);
-  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
   // If requested, sleep between runs for an arbitrary amount of time.
   // This can be helpful to determine the effect of mobile processor
   // scaling and thermal throttling.
-#ifdef PLATFORM_WINDOWS
-  Sleep(sleep_seconds * 1000);
-#else
-  nanosleep(&req, nullptr);
-#endif
+  return tflite::profiling::time::SleepForMicros(
+      static_cast<uint64_t>(sleep_seconds * 1e6));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
index fed9e7ea7e8633e00413118fa3e9e4f12d5188a4..8142f48ad9002cd6578b790e794e06094f349b74 100644
--- a/tensorflow/lite/tools/benchmark/ios/README.md
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -24,8 +24,9 @@ to build TFLite.
 Running
 
 ```bash
-tensorflow/lite/build_ios_universal_lib.sh
+tensorflow/lite/tools/make/build_ios_universal_lib.sh
 ```
+
 will also build `tensorflow/lite/gen/lib/benchmark-lib.a` .
 
 - Now copy the downloaded model file to `benchmark_data` directory. 
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index 994f660dba7742de162525dcf6a8c6a288ee71c6..e98ba9b2e0720b11b3a3a83d3c4c82d41027aeb6 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -111,15 +111,27 @@ $(wildcard tensorflow/lite/*/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*/*test.cc) \
 $(wildcard tensorflow/lite/kernels/test_util.cc) \
 $(MINIMAL_SRCS)
+
 ifeq ($(BUILD_TYPE),micro)
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/lite/mmap_allocation.cc \
-tensorflow/lite/nnapi_delegate.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/mmap_allocation.cc
 else
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/lite/mmap_allocation_disabled.cc \
-tensorflow/lite/nnapi_delegate_disabled.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/mmap_allocation_disabled.cc
+endif
+
+BUILD_WITH_NNAPI=true
+ifeq ($(BUILD_TYPE),micro)
+	BUILD_WITH_NNAPI=false
+endif
+ifeq ($(TARGET),ios)
+	BUILD_WITH_NNAPI=false
 endif
+ifeq ($(BUILD_WITH_NNAPI),true)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate_disabled.cc
+else
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate.cc
+endif
+
+
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index 0a0d5cc4123ba64c7208c5e74344248b28af6851..ecc48f807e4a5daa6bb7f7368e82c918002fa7d0 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -1,25 +1,309 @@
-# TODO(suharshs): Write quantize_weights tests that use small exportable files.
-# Then we can remove this file.
-package(
-    default_visibility = ["//visibility:public"],
-)
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+package(default_visibility = [
+    "//visibility:public",
+])
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
+exports_files(glob([
+    "testdata/*.bin",
+]))
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+cc_library(
+    name = "quantization_utils",
+    srcs = ["quantization_utils.cc"],
+    hdrs = ["quantization_utils.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:round",
+        "//tensorflow/lite/kernels/internal:types",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+tf_cc_test(
+    name = "quantization_utils_test",
+    srcs = ["quantization_utils_test.cc"],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantization_utils",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
 
 cc_library(
     name = "quantize_weights",
     srcs = ["quantize_weights.cc"],
     hdrs = ["quantize_weights.h"],
     deps = [
+        ":quantization_utils",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+        "//tensorflow/lite:framework",
+        # TODO(suharshs): Move the relevant quantization utils to a non-internal location.
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/core:tflite_portable_logging",
+    ],
+)
+
+cc_library(
+    name = "calibrator_lib",
+    srcs = ["calibrator.cc"],
+    hdrs = ["calibrator.h"],
+    deps = [
+        ":calibration_common",
+        ":calibration_logger",
+        ":calibration_reader",
+        ":logging_op_resolver",
+        ":node_info_delegate",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "calibrator_test",
+    srcs = ["calibrator_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":calibrator_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "logging_op_resolver",
+    srcs = ["logging_op_resolver.cc"],
+    hdrs = ["logging_op_resolver.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_test(
+    name = "logging_op_resolver_test",
+    srcs = ["logging_op_resolver_test.cc"],
+    deps = [
+        ":logging_op_resolver",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "calibration_reader",
+    srcs = ["calibration_reader.cc"],
+    hdrs = ["calibration_reader.h"],
+    deps = [
+        ":calibration_logger",
+        "//tensorflow/lite:framework",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_library(
+    name = "calibration_logger",
+    hdrs = ["calibration_logger.h"],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "calibration_common",
+    hdrs = ["calibration_common.h"],
+    deps = [
         "//tensorflow/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "node_info_delegate",
+    srcs = ["node_info_delegate.cc"],
+    hdrs = ["node_info_delegate.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "node_info_delegate_test",
+    srcs = ["node_info_delegate_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":node_info_delegate",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "quantize_weights_test",
+    srcs = ["quantize_weights_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+        "//tensorflow/lite/tools/optimize:testdata/weight_shared_between_convs.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_weights",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "subgraph_quantizer",
+    srcs = ["subgraph_quantizer.cc"],
+    hdrs = ["subgraph_quantizer.h"],
+    deps = [
+        ":quantization_utils",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels/internal:round",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
         "@com_google_absl//absl/memory",
         "@flatbuffers",
     ],
 )
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "subgraph_quantizer_test",
+    srcs = ["subgraph_quantizer_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_minus_127_max_plus_127.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_softmax_min_minus_5_max_plus_5.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":subgraph_quantizer",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "quantize_model",
+    srcs = ["quantize_model.cc"],
+    hdrs = ["quantize_model.h"],
+    deps = [
+        ":subgraph_quantizer",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "quantize_model_test",
+    srcs = ["quantize_model_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_model",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/optimize/calibration_common.h b/tensorflow/lite/tools/optimize/calibration_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ff2d3f18a66ca4323727b8403515e857e54d8cc
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_common.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+using BuiltinOperatorKey = std::pair<BuiltinOperator, int>;
+
+using BuiltinOpsSet = std::unordered_set<
+    BuiltinOperatorKey,
+    op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey>>;
+
+template <typename T>
+class BuiltinOpsMap
+    : public std::unordered_map<
+          BuiltinOperatorKey, T,
+          op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey>> {};
+
+// An alias for |TfLiteRegistration.invoke|.
+using KernelEvalFuncPtr = TfLiteStatus (*)(TfLiteContext*, TfLiteNode*);
+
+enum class OperatorTensorType { kNone, kInput, kOutput, kIntermediate };
+
+// Information about an operator in the TfLite graph.
+struct OperatorInfo {
+  int node_index;
+  std::string name;
+  BuiltinOperator builtin_op_code;
+  bool is_custom_op;
+  std::vector<int> inputs;
+  std::vector<int> outputs;
+  // Inputs that need to be logged.
+  std::vector<int> loggable_inputs;
+  // Outputs that need to be logged.
+  std::vector<int> loggable_outputs;
+  const TfLiteRegistration* registration;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
diff --git a/tensorflow/lite/tools/optimize/calibration_logger.h b/tensorflow/lite/tools/optimize/calibration_logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fd380423a3ee0e671fcedd5c3e2cdf566c993eb
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_logger.h
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+class MinMax {
+ public:
+  void Update(const float* values, size_t tensor_size) {
+    // TODO(shashishekhar): Really slow implementation, optimize
+    if (tensor_size <= 0) return;
+
+    if (!has_values_) {
+      min_ = max_ = values[0];
+      has_values_ = true;
+      return;
+    }
+
+    // We are only logging absolute min/max here.
+    // TODO(shashishekhar): Make it possible to use weighted/moving average.
+    for (size_t i = 0; i < tensor_size; i++) {
+      float val = values[i];
+      if (min_ > val) {
+        min_ = val;
+      } else if (max_ < val) {
+        max_ = val;
+      }
+    }
+  }
+
+  bool HasValues() const { return has_values_; }
+
+  TfLiteStatus Get(float* min_val, float* max_val) const {
+    if (!has_values_) return kTfLiteError;
+    *min_val = min_;
+    *max_val = max_;
+    return kTfLiteOk;
+  }
+
+ private:
+  bool has_values_;
+  float min_, max_;
+};
+
+// Captures min max values for tensors.
+class Logger {
+ public:
+  // Log the value for tensor at |tensor_index| which has |tensor_values|
+  void LogTensorValue(int tensor_index, const float* tensor_values,
+                      size_t tensor_size) {
+    tensor_id_to_stats_map_[tensor_index].Update(tensor_values, tensor_size);
+  }
+
+  // Returns a map from tensor_index -> observed min max values.
+  const std::unordered_map<int, MinMax>& GetCalibrationValues() const {
+    return tensor_id_to_stats_map_;
+  }
+
+ private:
+  std::unordered_map<int, MinMax> tensor_id_to_stats_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.cc b/tensorflow/lite/tools/optimize/calibration_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b01a62bd6c15dee5b60edf5f3abdd40ba4c3a56b
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_reader.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+
+#include "absl/memory/memory.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+TfLiteStatus CalibrationReader::GetTensorStatsAsMap(
+    std::unordered_map<int, CalibrationStats>* tensor_id_to_stats_map) const {
+  tensor_id_to_stats_map->clear();
+  for (const auto& tensorid_stat : logger_->GetCalibrationValues()) {
+    auto minmax = tensorid_stat.second;
+    CalibrationReader::CalibrationStats stats;
+    TF_LITE_ENSURE_STATUS(minmax.Get(&stats.min, &stats.max));
+    tensor_id_to_stats_map->insert({tensorid_stat.first, stats});
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus CalibrationReader::AddCalibrationToModel(ModelT* model) const {
+  if (!model || model->subgraphs.empty()) {
+    return kTfLiteError;
+  }
+  const auto& subgraph = model->subgraphs[0];
+  for (const auto& tensorid_stat : logger_->GetCalibrationValues()) {
+    auto minmax = tensorid_stat.second;
+    float min, max;
+    TF_LITE_ENSURE_STATUS(minmax.Get(&min, &max));
+    auto quant_params = absl::make_unique<tflite::QuantizationParametersT>();
+    quant_params->min.push_back(min);
+    quant_params->max.push_back(max);
+    subgraph->tensors[tensorid_stat.first]->quantization =
+        std::move(quant_params);
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.h b/tensorflow/lite/tools/optimize/calibration_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..af0da1bb3835493e69ef7a6bccb7149ef14b1db9
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration_reader.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration_logger.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+// Warning: This is not a public API and subject to change.
+//
+// Reads calibrator data collected by running the interpreter through
+// a calibration set.
+class CalibrationReader {
+ public:
+  struct CalibrationStats {
+    float min;
+    float max;
+  };
+  explicit CalibrationReader(const Logger* logger) : logger_(logger) {}
+
+  // Gets a map from tensor index to recorded calibration values.
+  virtual TfLiteStatus GetTensorStatsAsMap(
+      std::unordered_map<int, CalibrationStats>* tensor_id_to_stats_map) const;
+
+  // Annotates the tensors in the given model with statistics captured during
+  // calibration.
+  virtual TfLiteStatus AddCalibrationToModel(ModelT* model) const;
+
+  virtual ~CalibrationReader() {}
+
+ private:
+  const Logger* logger_;
+};
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
diff --git a/tensorflow/lite/tools/optimize/calibrator.cc b/tensorflow/lite/tools/optimize/calibrator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ccb55c3081f331189c35cc3dc302ae3f7725b2b5
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibrator.cc
@@ -0,0 +1,347 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibrator.h"
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/tools/optimize/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+namespace {
+
+// Calibrator is used to hold information that can be accessed during kernel
+// invocations.
+// TfLite kernel invocations are C functions and cannot look at the global
+// structure of the graph. Calibrator allows the kernel invoke functions to
+// access the global structure of graph and know which node is currently being
+// executed. This also allows us to write a simple kernel invoke wrapper
+// (see LoggingEval) that can work for most builtin ops.
+class Calibrator {
+ public:
+  Calibrator(const std::unordered_map<const TfLiteNode*, OperatorInfo>&
+                 node_ptr_opinfo_map,
+             std::unique_ptr<LoggingOpResolver> logging_op_resolver)
+      : node_ptr_opinfo_map_(node_ptr_opinfo_map),
+        logging_op_resolver_(std::move(logging_op_resolver)) {
+    logger_ = absl::make_unique<Logger>();
+  }
+
+  // Returns the wrapped kernel invoke function |TfLiteRegistration.invoke|.
+  KernelEvalFuncPtr GetKernelInvoke(const TfLiteNode* node) const;
+
+  // Gets the instance of logger associated with the current context.
+  Logger* GetLogger() const { return logger_.get(); }
+
+  // Gets the operator information about the given TfLiteNode.
+  const OperatorInfo& GetOpInfo(const TfLiteNode* node) const {
+    return node_ptr_opinfo_map_.at(node);
+  }
+
+ private:
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_ptr_opinfo_map_;
+  std::unique_ptr<LoggingOpResolver> logging_op_resolver_;
+  const std::unordered_map<int, OperatorInfo> index_opinfo_;
+  std::unique_ptr<Logger> logger_;
+};
+
+KernelEvalFuncPtr Calibrator::GetKernelInvoke(const TfLiteNode* node) const {
+  auto op_info = node_ptr_opinfo_map_.at(node);
+  return logging_op_resolver_->GetWrappedKernelInvoke(op_info.builtin_op_code,
+                                                      1);
+}
+
+// A registry of |Calibrator| objects per |TfLiteContext|.
+// This global registry is needed to access |Calibrator| objects in the kernel
+// invoke functions i.e. |TfLiteRegistration.invoke|.
+// Kernel invoke functions are C functions that have limited access to
+// |TfLiteContext|. Kernel invoke functions don't have access to global state of
+// graph. That means during a kernel invocation, the function cannot know which
+// node it was invoked for. E.g. in case of a model with |Conv| op at two
+// locations, there is no easy way for the Conv.invoke function to disambiguate
+// the calls.
+//
+// For calibration we solve this problem by creating a map of calibrators
+// per |TfLiteContext|. This map is |GlobalCalibrationRegistry|.
+//
+// This registry is then accessed using a global getter function:
+// |GetCalibratorRegistry|.
+// E.g.
+// TfLiteStatus SomeKernelInvokeFn(TfLiteContext* context, TfLiteNode* node) {
+//   .... code ....
+//   auto registry = GetCalibratorRegistry();
+//   auto calibrator = registry->GetCalibrator(context);
+//   ..... code ....
+//  }
+//
+// This way the kernel invoke functions can get the access to the Calibrator
+// object associated with the |TfLiteContext|.
+class GlobalCalibratorRegistry {
+ public:
+  // Get the |Calibrator| associated with given context, returns null if no
+  // calibrator is associated with the given context.
+  Calibrator* GetCalibrator(const TfLiteContext* context) const {
+    if (calibrator_registry_.find(context) == calibrator_registry_.cend()) {
+      return nullptr;
+    }
+    return calibrator_registry_.at(context).get();
+  }
+
+  // Removes the association between calibrator and context.
+  // Note: This deletes the calibrator as well.
+  void RemoveCalibrator(const TfLiteContext* context) {
+    calibrator_registry_.erase(context);
+  }
+
+  // Creates an instance of |Calibrator|.
+  // Registry owns the |Calibrator| object which can be deleted by calling
+  // |RemoveCalibrator|.
+  TfLiteStatus CreateCalibrator(
+      const TfLiteContext* context,
+      const std::unordered_map<const TfLiteNode*, OperatorInfo>& node_to_opinfo,
+      std::unique_ptr<LoggingOpResolver> logging_op_resolver,
+      Calibrator** calibrator_ptr, ErrorReporter* reporter) {
+    if (calibrator_registry_.find(context) != calibrator_registry_.cend()) {
+      reporter->Report(
+          "Failed to create calibrator, context already registered.");
+      return kTfLiteError;
+    }
+    std::unique_ptr<Calibrator> calibrator = absl::make_unique<Calibrator>(
+        node_to_opinfo, std::move(logging_op_resolver));
+    calibrator_registry_[context] = std::move(calibrator);
+    *calibrator_ptr = calibrator_registry_.at(context).get();
+    return kTfLiteOk;
+  }
+
+ private:
+  std::unordered_map<const TfLiteContext*, std::unique_ptr<Calibrator>>
+      calibrator_registry_;
+};
+
+GlobalCalibratorRegistry* GetCalibratorRegistry() {
+  static GlobalCalibratorRegistry* registry = new GlobalCalibratorRegistry();
+  return registry;
+}
+
+// A wrapper implementation for |TfLiteRegistration.invoke| that logs inputs,
+// invokes the wrapped implementation and then logs the outputs.
+TfLiteStatus LoggingEval(TfLiteContext* context, TfLiteNode* node) {
+  Calibrator* calibrator = GetCalibratorRegistry()->GetCalibrator(context);
+
+  if (!calibrator) {
+    context->ReportError(context, "No calibrator found for context.");
+    return kTfLiteError;
+  }
+
+  auto kernel_invoke = calibrator->GetKernelInvoke(node);
+  auto logger = calibrator->GetLogger();
+  auto op_info = calibrator->GetOpInfo(node);
+
+  for (int i : op_info.loggable_inputs) {
+    auto tensor = context->tensors[i];
+    logger->LogTensorValue(i, tensor.data.f, tensor.bytes / sizeof(float));
+  }
+
+  auto status = kernel_invoke(context, node);
+  // TODO(shashishekhar): An intermediate tensor in graph will get logged twice
+  // once as an input and second time as output. This doesn't change the min max
+  // values but is inefficient.
+  // Using moving average will also break this.
+
+  for (int i : op_info.loggable_outputs) {
+    auto tensor = context->tensors[i];
+    logger->LogTensorValue(i, tensor.data.f, tensor.bytes / sizeof(float));
+  }
+
+  return status;
+}
+
+// Returns the loggable tensors. Not all inputs and outputs need to be logged.
+// For example, const weight tensors which have buffers associated with them
+// don't need to be logged.
+std::vector<int> GetLoggableTensorIndices(
+    const std::vector<int>& tensor_indices,
+    const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
+    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* tensor_buffers) {
+  std::vector<int> loggable;
+  for (auto tensor_index : tensor_indices) {
+    auto tensor = tensors->Get(tensor_index);
+    auto buffer_index = tensor->buffer();
+    const bool has_no_buffer =
+        (tensor_buffers->Get(buffer_index) == nullptr) ||
+        (tensor_buffers->Get(buffer_index)->data() == nullptr) ||
+        (tensor_buffers->Get(buffer_index)->data()->size() == 0);
+    if (has_no_buffer && tensor->type() == tflite::TensorType_FLOAT32) {
+      loggable.push_back(tensor_index);
+    }
+  }
+  return loggable;
+}
+
+// Creates a mapping between the static model graph and the runtime TfLiteNode*
+// nodes in the graph for the given context.
+// This is done by querying the TfLiteContext for node and registrations using
+// the |NodeInfoDelegateObserver|.
+TfLiteStatus GetNodeOpInfoMapAndContext(
+    const std::unordered_map<int, OperatorInfo>& node_to_opinfo,
+    tflite::Interpreter* const interpreter,
+    std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map,
+    const TfLiteContext** context
+
+) {
+  NodeInfoDelegateObserver delegate_observer(node_to_opinfo,
+                                             node_ptr_opinfo_map);
+  NodeInfoDelegateParams delegate_params;
+  delegate_params.delegate_observer = &delegate_observer;
+  TfLiteDelegate logging_delegate = CreateNodeInfoDelegate(&delegate_params);
+
+  auto modify_status = interpreter->ModifyGraphWithDelegate(&logging_delegate);
+  if (modify_status != kTfLiteOk) {
+    return kTfLiteError;
+  }
+  *context = delegate_observer.GetContext();
+  return kTfLiteOk;
+}
+
+string GetOpName(const tflite::OperatorCode& opcode) {
+  if (opcode.custom_code() != nullptr) {
+    return opcode.custom_code()->str();
+  }
+  return tflite::EnumNamesBuiltinOperator()[opcode.builtin_code()];
+}
+
+// A |CalibrationReader| that owns the Calibrator.
+class Reader : public CalibrationReader {
+ public:
+  Reader(const TfLiteContext* context, const Logger* logger)
+      : CalibrationReader(logger), context_(context) {}
+
+  ~Reader() override { GetCalibratorRegistry()->RemoveCalibrator(context_); }
+
+ private:
+  const TfLiteContext* context_;
+};
+
+}  // namespace
+
+TfLiteStatus BuildLoggingInterpreter(
+    const FlatBufferModel& model, const OpResolver& op_resolver,
+    std::unique_ptr<Interpreter>* interpreter,
+    std::unique_ptr<CalibrationReader>* calibration_reader) {
+  auto tflite_model = model.GetModel();
+  auto subgraphs = tflite_model->subgraphs();
+  auto tensor_buffers = tflite_model->buffers();
+
+  if (subgraphs->size() != 1) {
+    model.error_reporter()->Report(
+        "Only models with a single subgraph are supported, model had %d "
+        "subgraphs",
+        subgraphs->size());
+    return kTfLiteError;
+  }
+
+  // Populate the node index to operator info map.
+  // We want to collect this information so we can use it during runtime to
+  // log details of which inputs and outputs.
+  // At runtime TFLite kernel invoke functions can only look into their
+  // own node in the graph (TFLiteNode*) and some limited context information.
+  auto primary_subgraph = subgraphs->Get(0);
+  auto operator_codes = tflite_model->operator_codes();
+  auto operators = primary_subgraph->operators();
+  auto tensors = primary_subgraph->tensors();
+  std::unordered_map<int, OperatorInfo> node_to_opinfo;
+  BuiltinOpsSet op_and_versions;
+
+  for (size_t i = 0; i < operators->size(); i++) {
+    OperatorInfo op_info;
+    op_info.node_index = i;
+    auto op = operators->Get(i);
+    auto operator_code = operator_codes->Get(op->opcode_index());
+    op_info.builtin_op_code = operator_code->builtin_code();
+    op_info.name = GetOpName(*operator_code);
+    op_info.is_custom_op = operator_code->custom_code() != nullptr;
+
+    auto op_inputs = op->inputs();
+    auto op_outputs = op->outputs();
+    op_info.inputs = std::vector<int>(op_inputs->begin(), op_inputs->end());
+    op_info.outputs = std::vector<int>(op_outputs->begin(), op_outputs->end());
+    op_info.loggable_inputs =
+        GetLoggableTensorIndices(op_info.inputs, tensors, tensor_buffers);
+    op_info.loggable_outputs =
+        GetLoggableTensorIndices(op_info.outputs, tensors, tensor_buffers);
+    if (!op_info.is_custom_op) {
+      op_info.registration = op_resolver.FindOp(operator_code->builtin_code(),
+                                                operator_code->version());
+    } else {
+      op_info.registration =
+          op_resolver.FindOp(op_info.name.c_str(), operator_code->version());
+    }
+    node_to_opinfo[i] = op_info;
+    op_and_versions.insert({op_info.builtin_op_code, operator_code->version()});
+  }
+
+  // Prepare the logging op resolver to use |LoggingEval| for kernel
+  // invocations.
+  auto logging_op_resolver = absl::make_unique<LoggingOpResolver>(
+      op_and_versions, op_resolver, LoggingEval);
+  tflite::InterpreterBuilder(model, *logging_op_resolver)(interpreter);
+
+  if (!(*interpreter)) {
+    model.error_reporter()->Report("Failed to construct interpreter");
+    return kTfLiteError;
+  }
+
+  // Compute the mapping between runtime and static graph structure, i.e.
+  // (TfLiteContext, TfLiteNode) -> OperatorInfo
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_ptr_opinfo_map;
+  const TfLiteContext* context = nullptr;
+  GetNodeOpInfoMapAndContext(node_to_opinfo, interpreter->get(),
+                             &node_ptr_opinfo_map, &context);
+
+  Calibrator* calibrator = nullptr;
+  // Register a calibrator object for the context. This can be accessed
+  // during invocations by the logging kernels.
+  TF_LITE_ENSURE_STATUS(GetCalibratorRegistry()->CreateCalibrator(
+      context, node_ptr_opinfo_map, std::move(logging_op_resolver), &calibrator,
+      model.error_reporter()));
+  *calibration_reader = std::unique_ptr<CalibrationReader>(
+      new Reader(context, calibrator->GetLogger()));
+
+  return kTfLiteOk;
+}
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibrator.h b/tensorflow/lite/tools/optimize/calibrator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab3cb27eb7518b7327655023739e310e2a6b0249
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibrator.h
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
+
+#include <unordered_map>
+
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+// Warning: This is not a public API and subject to change.
+
+// Builds a interpreter that logs the calibration data in memory.
+// The calibration data can be recovered using |calibration_reader|.
+//
+// Sample usage:
+// std::unique_ptr<Interpreter> interpreter;
+// std::unique_ptr<CalibrationReader> calibration_reader;
+// BuiltinOpResolver resolver = ...
+// FlatBufferModel model = ..
+//
+// BuildLoggingInterpreter(model, resolver, &interpreter,
+//  &calibration_reader);
+//
+//
+// * Allocate tensors...
+// * Call interpreter->invoke on calibration dataset.
+//
+// Calibration data can be read either directly by calling
+// std::unordered_map<int,  CalibrationStats>> tensor_index_to_stats;
+// calibration_reader->GetTensorStatsAsMap(&tensor_index_to_stats);
+//
+// or adding calibration data to model itself.
+// ModelT * original_floating_point_model = ...
+// calibration_reader->AddCalibrationToModel(original_floating_point_model);
+//
+TfLiteStatus BuildLoggingInterpreter(
+    const FlatBufferModel& model, const OpResolver& op_resolver,
+    std::unique_ptr<Interpreter>* interpreter,
+    std::unique_ptr<CalibrationReader>* calibration_reader);
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
diff --git a/tensorflow/lite/tools/optimize/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibrator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a415adc55b1b7790da190874b184a4da77716aa
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibrator_test.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstring>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibrator.h"
+
+namespace {
+tensorflow::string* g_test_model_file = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadModel() {
+  if (g_test_model_file) {
+    return FlatBufferModel::BuildFromFile(g_test_model_file->c_str());
+  }
+  return nullptr;
+}
+
+TEST(CalibratorTest, CalibrationStatsAreCollected) {
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  std::unique_ptr<CalibrationReader> reader;
+  auto status = BuildLoggingInterpreter(
+      *model, ops::builtin::BuiltinOpResolver{}, &interpreter, &reader);
+  EXPECT_EQ(kTfLiteOk, status);
+
+  ASSERT_TRUE(interpreter);
+  ASSERT_TRUE(reader);
+  std::unordered_map<int, CalibrationReader::CalibrationStats> stats;
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_TRUE(stats.empty());
+
+  status = interpreter->AllocateTensors();
+  ASSERT_EQ(kTfLiteOk, status);
+  // Model does the following:
+  // 0        1       2        3
+  // |        |__ ____|        |
+  // |           |             |
+  // |          Add(tensor:4)  |
+  // |____ ______|______ ______|
+  //      |             |
+  //      Add          Add
+  //      |             |
+  //    Output:5      Output:6
+
+  const size_t tensor_size = 1 * 8 * 8 * 3;
+
+  std::vector<float> ones(tensor_size, 1.0f);
+  // Fill input tensor i with i+1, i.e. input[0] = 1.0f, input[1] = 2.0f,
+  // input[2] = 3.0f
+
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_tensor_idx = interpreter->inputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(input_tensor_idx);
+    ASSERT_EQ(tensor->bytes, tensor_size * sizeof(float));
+    for (size_t j = 0; j < tensor_size; j++) {
+      tensor->data.f[j] = i + 1;
+    }
+  }
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  const float eps = 1e-6f;
+  // Verify that tensor 5: is 6
+  // Verify that tensor 6: is 9
+  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  for (size_t i = 0; i < tensor_size; i++) {
+    EXPECT_NEAR(tensor->data.f[i], 6.0f, eps);
+  }
+  tensor = interpreter->tensor(interpreter->outputs()[1]);
+  for (size_t i = 0; i < tensor_size; i++) {
+    EXPECT_NEAR(tensor->data.f[i], 9.0f, eps);
+  }
+
+  // Verify that min max of tensors.
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  // Check inputs
+  for (int tensor_idx = 0; tensor_idx < 4; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, tensor_idx + 1, eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, tensor_idx + 1, eps);
+  }
+  // Check tensor 4 max.
+  EXPECT_NEAR(stats.at(4).min, 5, eps);
+  EXPECT_NEAR(stats.at(4).max, 5, eps);
+
+  // Check outputs
+  EXPECT_NEAR(stats.at(5).min, 6, eps);
+  EXPECT_NEAR(stats.at(5).max, 6, eps);
+
+  EXPECT_NEAR(stats.at(6).min, 9, eps);
+  EXPECT_NEAR(stats.at(6).max, 9, eps);
+}
+
+TEST(CalibratorTest, MultipleInvokes) {
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  std::unique_ptr<CalibrationReader> reader;
+  auto status = BuildLoggingInterpreter(
+      *model, ops::builtin::BuiltinOpResolver{}, &interpreter, &reader);
+  EXPECT_EQ(kTfLiteOk, status);
+
+  ASSERT_TRUE(interpreter);
+  ASSERT_TRUE(reader);
+  status = interpreter->AllocateTensors();
+
+  EXPECT_EQ(kTfLiteOk, status);
+  const size_t tensor_size = 1 * 8 * 8 * 3;
+  // Fill input tensor i with i+1, i.e. input[0] = 1.0f, input[1] = 2.0f,
+  // input[2] = 3.0f
+
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_tensor_idx = interpreter->inputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(input_tensor_idx);
+    ASSERT_EQ(tensor->bytes, tensor_size * sizeof(float));
+    for (size_t j = 0; j < tensor_size; j++) {
+      tensor->data.f[j] = i + 1;
+    }
+  }
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  const float eps = 1e-6f;
+  // Verify that min max of tensors.
+  std::unordered_map<int, CalibrationReader::CalibrationStats> stats;
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  const float expected_values[7] = {
+      1.0f,  // input 0
+      2.0f,  // input 1
+      3.0f,  // input 2
+      4.0f,  // input 3
+      5.0f,  // Add(1, 2)
+      6.0f,  // Output 5: Add(0, Add(1,2))
+      9.0f,  // Output 6: Add(Add(1,2), 3)
+  };
+  for (int tensor_idx = 0; tensor_idx < 7; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, expected_values[tensor_idx], eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, expected_values[tensor_idx], eps);
+  }
+  // Set input[0][0] = 1.5 and input[0][1] = 0.5 this should change the values
+  // only for input[0] and tensor 4 and ouputs 5, 6.
+  TfLiteTensor* input0 = interpreter->tensor(0);
+  input0->data.f[0] = 1.5f;
+  input0->data.f[1] = 0.5f;
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  EXPECT_NEAR(stats.at(0).min, 0.5f, eps);
+  EXPECT_NEAR(stats.at(0).max, 1.5f, eps);
+
+  for (int tensor_idx = 1; tensor_idx < 5; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, expected_values[tensor_idx], eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, expected_values[tensor_idx], eps);
+  }
+
+  EXPECT_NEAR(stats.at(5).min, 5.5f, eps);
+  EXPECT_NEAR(stats.at(5).max, 6.5f, eps);
+
+  EXPECT_NEAR(stats.at(6).min, 9.0f, eps);
+  EXPECT_NEAR(stats.at(6).max, 9.0f, eps);
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_file = new tensorflow::string(model_file);
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.cc b/tensorflow/lite/tools/optimize/logging_op_resolver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7633ebb8dd9d7aee0b8a5befa5d51911f68a7e32
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/logging_op_resolver.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+
+#include "absl/memory/memory.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+LoggingOpResolver::LoggingOpResolver(const BuiltinOpsSet& ops_to_replace,
+                                     const OpResolver& base_resolver,
+                                     KernelEvalFuncPtr logging_eval_fn) {
+  for (const auto& op_and_version : ops_to_replace) {
+    const TfLiteRegistration* base_registration =
+        base_resolver.FindOp(op_and_version.first, op_and_version.second);
+    BuiltinOperatorKey key = op_and_version;
+    builtin_op_evalfn_map_[key] = base_registration->invoke;
+    std::unique_ptr<TfLiteRegistration> logging_registation =
+        absl::make_unique<TfLiteRegistration>(*base_registration);
+    logging_registation->invoke = logging_eval_fn;
+    builtin_op_registration_map_[key] = std::move(logging_registation);
+  }
+}
+
+const TfLiteRegistration* LoggingOpResolver::FindOp(BuiltinOperator op,
+                                                    int version) const {
+  BuiltinOperatorKey key = {op, version};
+  if (builtin_op_registration_map_.find(key) !=
+      builtin_op_registration_map_.end()) {
+    return builtin_op_registration_map_.at(key).get();
+  }
+
+  return nullptr;
+}
+
+KernelEvalFuncPtr LoggingOpResolver::GetWrappedKernelInvoke(BuiltinOperator op,
+                                                            int version) const {
+  return builtin_op_evalfn_map_.at({op, version});
+}
+
+const TfLiteRegistration* LoggingOpResolver::FindOp(const char* op,
+                                                    int version) const {
+  // TODO(b/121374947): Support custom ops as well.
+  return nullptr;
+}
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.h b/tensorflow/lite/tools/optimize/logging_op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..58a3a0fe3c08288ccba6881a64b1fd581103da10
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/logging_op_resolver.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
+
+#include <set>
+#include <unordered_map>
+
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration_common.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+// A resolver that replaces the kernel invocations with a wrapper
+// eval function.
+class LoggingOpResolver : public OpResolver {
+ public:
+  // Creates an instance of |LoggingOpResolver|.
+  // All |TfLiteRegistration.invoke| functions are replaced by
+  // |logging_eval_fn|.
+  // TODO(shashishekhar): This interface needs to change for custom ops and
+  // BuiltinOps that need special logging implementations.
+  LoggingOpResolver(const BuiltinOpsSet& ops_to_replace,
+                    const OpResolver& base_resolver,
+                    KernelEvalFuncPtr logging_eval_fn);
+
+  const TfLiteRegistration* FindOp(BuiltinOperator op,
+                                   int version) const override;
+
+  KernelEvalFuncPtr GetWrappedKernelInvoke(BuiltinOperator op,
+                                           int version) const;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+
+ private:
+  BuiltinOpsMap<std::unique_ptr<TfLiteRegistration>>
+      builtin_op_registration_map_;
+  BuiltinOpsMap<KernelEvalFuncPtr> builtin_op_evalfn_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc b/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7fe2d37ce49e4e467ef92c963ef33692d5d998b
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus WrappingInvoke(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TEST(LoggingOpResolverTest, KernelInvokesAreReplaced) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+      {BuiltinOperator_ADD, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+
+  auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_TRUE(reg->prepare == ConvPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+
+  reg = resolver.FindOp(BuiltinOperator_ADD, 1);
+
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_ADD);
+  EXPECT_TRUE(reg->prepare == AddPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+}
+
+TEST(LoggingOpResolverTest, OriginalKernelInvokesAreRetained) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+      {BuiltinOperator_ADD, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+  auto kernel_invoke =
+      resolver.GetWrappedKernelInvoke(BuiltinOperator_CONV_2D, 1);
+  EXPECT_TRUE(kernel_invoke == ConvEval);
+  kernel_invoke = resolver.GetWrappedKernelInvoke(BuiltinOperator_ADD, 1);
+  EXPECT_TRUE(kernel_invoke == AddEval);
+}
+
+TEST(LoggingOpResolverTest, OnlyOpsInReplacementSetAreReplaces) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  // Only replace conv2d
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+  auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_TRUE(reg->prepare == ConvPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+
+  reg = resolver.FindOp(BuiltinOperator_ADD, 1);
+  EXPECT_EQ(nullptr, reg);
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.cc b/tensorflow/lite/tools/optimize/node_info_delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ccaa69373fcf55adaef21a948089ea59821ca763
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/node_info_delegate.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+namespace {
+// The prepare function for delegate that forwards the prepare call to the
+// delegate observer in node info delegate params.
+// The function simply calls a delegate observer OnDelegatePrepareMethod.
+TfLiteStatus NodeInfoDelegatePrepare(TfLiteContext* context,
+                                     TfLiteDelegate* delegate) {
+  if (delegate == nullptr) return TfLiteStatus::kTfLiteError;
+
+  NodeInfoDelegateParams* params =
+      reinterpret_cast<NodeInfoDelegateParams*>(delegate->data_);
+  return params->delegate_observer->OnDelegatePrepareCalled(context);
+}
+}  // namespace
+
+TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params) {
+  return {.data_ = params,
+          .Prepare = NodeInfoDelegatePrepare,
+          .CopyFromBufferHandle = nullptr,
+          .CopyToBufferHandle = nullptr,
+          .FreeBufferHandle = nullptr};
+}
+
+TfLiteStatus NodeInfoDelegateObserver::OnDelegatePrepareCalled(
+    TfLiteContext* context) {
+  context_ = context;
+  const size_t num_nodes = node_index_opinfo_map_.size();
+  for (size_t node_index = 0; node_index < num_nodes; node_index++) {
+    TfLiteNode* node = nullptr;
+    TfLiteRegistration* reg = nullptr;
+    TF_LITE_ENSURE_STATUS(
+        context->GetNodeAndRegistration(context, node_index, &node, &reg));
+    auto op_info = node_index_opinfo_map_.at(node_index);
+    op_info.registration = reg;
+    node_ptr_opinfo_map_->insert({node, op_info});
+  }
+
+  if (node_ptr_opinfo_map_->size() != node_index_opinfo_map_.size()) {
+    // Something wrong.
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.h b/tensorflow/lite/tools/optimize/node_info_delegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ee2ce1978cf87b104518c4b64e84df166cef32d
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/node_info_delegate.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/tools/optimize/calibration_common.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+// An interface for delegate observer that can listen to TfLiteDelegate::Prepare
+// calls.
+class DelegateObserver {
+ public:
+  virtual TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) = 0;
+  virtual ~DelegateObserver() {}
+};
+
+// The parameters for the node info delegate.
+struct NodeInfoDelegateParams {
+  DelegateObserver* delegate_observer;
+};
+
+// Creates a delegate with the given |params|.
+TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params);
+
+// A delegate observer that can construct the map from TfLiteNode* ->
+// OperatorInfo.
+class NodeInfoDelegateObserver : public DelegateObserver {
+ public:
+  NodeInfoDelegateObserver(
+      const std::unordered_map<int, OperatorInfo>& node_index_to_op,
+      std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map)
+      : node_index_opinfo_map_(node_index_to_op),
+        node_ptr_opinfo_map_(node_ptr_opinfo_map) {}
+
+  TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) override;
+
+  // Returns the context that was used to called the prepare method.
+  const TfLiteContext* GetContext() const { return context_; }
+
+ private:
+  const TfLiteContext* context_ = nullptr;
+  const std::unordered_map<int, OperatorInfo>& node_index_opinfo_map_;
+  std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate_test.cc b/tensorflow/lite/tools/optimize/node_info_delegate_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05332c56b549a49d72d67cbed4fa0832d38a8dcc
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/node_info_delegate_test.cc
@@ -0,0 +1,178 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unordered_map>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadModel(const char* model) {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadModel() {
+  return ReadModel(internal::kConvModelWith0Plus10Weights);
+}
+
+class TestDelegateObserver : public DelegateObserver {
+ public:
+  explicit TestDelegateObserver(TfLiteStatus status_to_return)
+      : status_to_return_(status_to_return) {}
+
+  TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) override {
+    num_times_called_++;
+    return status_to_return_;
+  }
+  int num_times_called() { return num_times_called_; }
+
+ private:
+  int num_times_called_ = 0;
+  TfLiteStatus status_to_return_;
+};
+
+TEST(NodeInfoDelegateTest, DelegateObserverIsCalled) {
+  TestDelegateObserver observer(kTfLiteOk);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+  EXPECT_EQ(0, observer.num_times_called());
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(1, observer.num_times_called());
+}
+
+TEST(NodeInfoDelegateTest, ObserverErrorCausesModifyGraphFailure) {
+  // Observer returns error
+  TestDelegateObserver observer(kTfLiteError);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteError, status);
+}
+
+TEST(NodeInfoDelegateTest, NodeInfoDelegateObserver) {
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+
+  std::unordered_map<int, OperatorInfo> index_to_opinfo;
+  auto primary_subgraph = model->GetModel()->subgraphs()->Get(0);
+  auto operators = primary_subgraph->operators();
+  auto subgraph_tensors = primary_subgraph->tensors();
+  for (size_t i = 0; i < operators->size(); i++) {
+    OperatorInfo info;
+    auto op_inputs = operators->Get(i)->inputs();
+    auto op_outputs = operators->Get(i)->outputs();
+    info.inputs = std::vector<int>(op_inputs->begin(), op_inputs->end());
+    info.outputs = std::vector<int>(op_outputs->begin(), op_outputs->end());
+    index_to_opinfo[i] = info;
+  }
+
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_to_opinfo;
+  NodeInfoDelegateObserver observer(index_to_opinfo, &node_to_opinfo);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(index_to_opinfo.size(), node_to_opinfo.size());
+  EXPECT_EQ(interpreter->nodes_size(), node_to_opinfo.size());
+
+  for (const auto& node_and_opinfo : node_to_opinfo) {
+    const TfLiteNode* tflite_node = node_and_opinfo.first;
+    const OperatorInfo& info = node_and_opinfo.second;
+    ASSERT_EQ(tflite_node->inputs->size, info.inputs.size());
+    ASSERT_EQ(tflite_node->outputs->size, info.outputs.size());
+
+    for (size_t input_index = 0; input_index < info.inputs.size();
+         input_index++) {
+      const TfLiteTensor* tflite_tensor =
+          interpreter->tensor(tflite_node->inputs->data[input_index]);
+      EXPECT_EQ(tflite_tensor->name,
+                subgraph_tensors->Get(info.inputs[input_index])->name()->str());
+    }
+
+    for (size_t output_index = 0; output_index < info.outputs.size();
+         output_index++) {
+      const TfLiteTensor* tflite_tensor =
+          interpreter->tensor(tflite_node->outputs->data[output_index]);
+      EXPECT_EQ(
+          tflite_tensor->name,
+          subgraph_tensors->Get(info.outputs[output_index])->name()->str());
+    }
+  }
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..445fffb8dd4256b001f72576902f47f425ef9161
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantization_utils.cc
@@ -0,0 +1,163 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+#include <cmath>
+#include <cstdint>
+
+namespace tflite {
+namespace optimize {
+namespace utils {
+
+namespace {
+const int8_t kMinQuantizedValue = -127;
+const int8_t kMaxQuantizedValue = 127;
+}  // namespace
+
+TfLiteStatus NumElements(const TensorT& tensor, uint64_t* num_elements) {
+  if (tensor.shape.empty()) {
+    return kTfLiteError;
+  }
+  *num_elements = 1;
+  for (const uint64_t dim : tensor.shape) {
+    *num_elements *= dim;
+  }
+  return kTfLiteOk;
+}
+
+// Nudge min and max so that floating point 0 falls exactly on a quantized
+// value, returning the nudges scale and zero_point.
+//
+// Although this code originates from FakeQuantization in quantized training,
+// we may deviate from that implementation as we please since we do not fine
+// tune the weights with quantized training.
+void GetAsymmetricQuantizationParams(
+    float min, float max, const int quant_min, const int quant_max,
+    QuantizationParametersT* quantization_params) {
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  // Adjust the boundaries to guarantee 0 is included.
+  min = std::min(static_cast<float>(min), 0.0f);
+  max = std::max(static_cast<float>(max), 0.0f);
+  const float scale = (max - min) / (quant_max_float - quant_min_float);
+  // Scale can be zero if min and max are exactly 0.0f.
+  float zero_point_from_min = quant_min_float;
+  if (scale != 0) {
+    zero_point_from_min = quant_min_float - min / scale;
+  }
+  int64_t zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    zero_point = static_cast<int64_t>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    zero_point = static_cast<int64_t>(quant_max);
+  } else {
+    zero_point = static_cast<int64_t>(std::round(zero_point_from_min));
+  }
+  quantization_params->min = std::vector<float>(1, min);
+  quantization_params->max = std::vector<float>(1, max);
+  quantization_params->scale = std::vector<float>(1, scale);
+  quantization_params->zero_point = std::vector<int64_t>(1, zero_point);
+}
+
+// Per-channel quantize a tensor at the given index and returns both scales and
+// quantized values.
+void SymmetricPerChannelQuantization(const float* const input,
+                                     const std::vector<int>& dimension,
+                                     int32_t channel_dim_index,
+                                     std::vector<float>* output_scales,
+                                     std::vector<int8_t>* output_value) {
+  const int32_t channel_dim_size = dimension[channel_dim_index];
+  std::vector<float> min_vals(channel_dim_size);
+  std::vector<float> max_vals(channel_dim_size);
+  std::vector<bool> has_min_max_value(channel_dim_size, false);
+  int indices[4];
+  RuntimeShape tensor_dims{dimension[0], dimension[1], dimension[2],
+                           dimension[3]};
+
+  // Compute min max ranges per channel
+  for (indices[0] = 0; indices[0] < dimension[0]; indices[0]++) {
+    for (indices[1] = 0; indices[1] < dimension[1]; indices[1]++) {
+      for (indices[2] = 0; indices[2] < dimension[2]; indices[2]++) {
+        for (indices[3] = 0; indices[3] < dimension[3]; indices[3]++) {
+          int channel_idx = indices[channel_dim_index];
+          const float val = input[Offset(tensor_dims, indices)];
+          if (has_min_max_value[channel_idx]) {
+            if (min_vals[channel_idx] > val) {
+              min_vals[channel_idx] = val;
+            } else if (max_vals[channel_idx] < val) {
+              max_vals[channel_idx] = val;
+            }
+          } else {
+            min_vals[channel_idx] = val;
+            max_vals[channel_idx] = val;
+            has_min_max_value[channel_idx] = true;
+          }
+        }
+      }
+    }
+  }
+
+  // Calculate scales per channel
+  std::vector<float> scale_invs(channel_dim_size);
+  const float half_scale = kMaxQuantizedValue;
+  for (size_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
+    const float half_range = std::max(std::abs(min_vals[channel_idx]),
+                                      std::abs(max_vals[channel_idx]));
+    output_scales->at(channel_idx) = half_range / half_scale;
+    if (half_range == 0) {
+      scale_invs[channel_idx] = 0;
+    } else {
+      scale_invs[channel_idx] = half_scale / half_range;
+    }
+  }
+
+  // Quantize the values.
+  SymmetricPerChannelQuantizeValues(input, scale_invs, dimension,
+                                    channel_dim_index, output_value);
+}
+
+void SymmetricPerChannelQuantizeValues(const float* const input,
+                                       const std::vector<float>& scales_inv,
+                                       const std::vector<int>& dimension,
+                                       int32_t channel_dim_index,
+                                       std::vector<int8_t>* output_value) {
+  // Quantize the values.
+  int indices[4];
+  RuntimeShape tensor_dims{dimension[0], dimension[1], dimension[2],
+                           dimension[3]};
+  for (indices[0] = 0; indices[0] < dimension[0]; indices[0]++) {
+    for (indices[1] = 0; indices[1] < dimension[1]; indices[1]++) {
+      for (indices[2] = 0; indices[2] < dimension[2]; indices[2]++) {
+        for (indices[3] = 0; indices[3] < dimension[3]; indices[3]++) {
+          int channel_idx = indices[channel_dim_index];
+          int index = Offset(tensor_dims, indices);
+          const float val = input[index];
+          const int32_t quantized_value =
+              static_cast<int32_t>(TfLiteRound(val * scales_inv[channel_idx]));
+          output_value->at(index) = std::min<int8_t>(
+              kMaxQuantizedValue,
+              std::max<int8_t>(kMinQuantizedValue, quantized_value));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace utils
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantization_utils.h b/tensorflow/lite/tools/optimize/quantization_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..d20b3176bf389be1a9661610426e3b1403a3ef4d
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantization_utils.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+namespace utils {
+
+// Returns the number of elements in the given tensor.
+TfLiteStatus NumElements(const TensorT& tensor, uint64_t* num_elements);
+
+// Populates the scale and zero point for quantization parameters.
+//
+// Nudges min and max so that floating point 0 falls exactly on a quantized
+// value, returning the nudges scale and zero_point.
+void GetAsymmetricQuantizationParams(
+    float min, float max, const int quant_min, const int quant_max,
+    QuantizationParametersT* quantization_params);
+
+// Per-channel quantize a tensor at the given index and returns both scales and
+// quantized values.
+// Parameters:
+// - input is the float input data to be quantized.
+// - dimension is the dimension of the input data. Only supports dimension of
+//   size 4.
+// - channel_dim_index is the channel index within "dimension".
+//   dimension[channel_dim_index] gives the number of channels.
+// - output_scale is the output scale, the size of which equals the number of
+//   channels.
+// - output_value is the output data, the size of which equals the number of
+//   inputs.
+void SymmetricPerChannelQuantization(const float* const input,
+                                     const std::vector<int>& dimension,
+                                     int32_t channel_dim_index,
+                                     std::vector<float>* output_scales,
+                                     std::vector<int8_t>* output_value);
+
+// Quantize the values given an array of scales.
+void SymmetricPerChannelQuantizeValues(const float* const input,
+                                       const std::vector<float>& scales_inv,
+                                       const std::vector<int>& dimension,
+                                       int32_t channel_dim_index,
+                                       std::vector<int8_t>* output_value);
+
+}  // namespace utils
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_
diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ecad09ed61225c2b6e0ed5a20b52561e1e2c35ef
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace optimize {
+namespace utils {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+TEST(QuantizationUtilsTest, NumElements) {
+  TensorT tensor;
+  tensor.shape = {1, 2, 3, 4};
+  uint64_t num_elements;
+  EXPECT_EQ(kTfLiteOk, NumElements(tensor, &num_elements));
+  EXPECT_EQ(num_elements, 1 * 2 * 3 * 4);
+
+  tensor.shape = {5};
+  EXPECT_EQ(kTfLiteOk, NumElements(tensor, &num_elements));
+  EXPECT_EQ(num_elements, 5);
+
+  tensor.shape = {};
+  EXPECT_EQ(kTfLiteError, NumElements(tensor, &num_elements));
+}
+
+TEST(QuantizationUtilsTest, GetAsymmetricQuantizationParamsUnitRange) {
+  const float float_min = -128.0;
+  const float float_max = 127.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_EQ(zero_point, 0);
+  EXPECT_NEAR(scale, 1, eps);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithAllPositiveRange) {
+  // The min should get nudged to include 0, so the effective range is [0, 6].
+  const float float_min = 1.0;
+  const float float_max = 6.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], 0.0);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_EQ(zero_point, -128);
+  EXPECT_NEAR(scale, 6 / 255.0f, eps);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithAllNegativeRange) {
+  // The min should get nudged to include 0, so the effective range is [-6, 0].
+  const float float_min = -6.0;
+  const float float_max = -1.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], 0.0);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_EQ(zero_point, 127);
+  EXPECT_NEAR(scale, 6 / 255.0f, eps);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithZeroInRange) {
+  const float float_min = -5.0;
+  const float float_max = 1.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_NEAR(scale, 6 / 255.0f, eps);
+  EXPECT_GT(zero_point, quant_min);
+  EXPECT_LT(zero_point, quant_max);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithZeroMinMax) {
+  const float float_min = 0;
+  const float float_max = 0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_NEAR(scale, 0, eps);
+  EXPECT_NEAR(zero_point, quant_min, eps);
+  EXPECT_LT(zero_point, quant_max);
+}
+
+TEST(QuantizationUtilsTest, SymmetricPerChannelQuantization) {
+  // Set up an input with [3, 2, 2, 2] size and 0 is the channel index.
+  const std::vector<float> input = {
+      3.0, 2.0, 5.0,  -2.0, 3.0,  2.0,  5.0,  -2.0,  // Channel 1.
+      1.0, 2.0, 3.0,  4.0,  5.0,  6.0,  7.0,  8.0,   // Channel 2.
+      1.0, 0.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0,  // Channel 3.
+  };
+  const std::vector<int32_t> dimension = {3, 2, 2, 2};
+  const int channel_index = 0;
+
+  // Create holder for output scale and data.
+  std::vector<float> output_scales(3);
+  std::vector<int8_t> output_data(3 * 2 * 2 * 2);
+
+  // Call SymmetricPerChannelQuantization and verify the result.
+  SymmetricPerChannelQuantization(input.data(), dimension, channel_index,
+                                  &output_scales, &output_data);
+  const std::vector<float> expected_output_scales = {0.0393700786, 0.0629921257,
+                                                     0.0472440943};
+  const std::vector<int8_t> expected_output_data = {
+      76, 51, 127, -51, 76,  51,  127,  -51,   // Channel 1.
+      16, 32, 48,  64,  79,  95,  111,  127,   // Channel 2.
+      21, 0,  -21, -42, -64, -85, -106, -127,  // Channel 3.
+  };
+  EXPECT_THAT(output_scales, ElementsAreArray(expected_output_scales));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_output_data));
+}
+
+TEST(QuantizationUtilsTest, SymmetricPerChannelQuantizeValues) {
+  // Set up an input with [3, 1, 1, 2] size and 0 is the channel index.
+  const std::vector<float> input = {
+      13.0, 21.0,  // Channel 1.
+      21.0, 22.0,  // Channel 2.
+      31.0, 40.0,  // Channel 3.
+  };
+  const std::vector<float> scales_inv = {2, 0.5, 3};
+  const std::vector<int32_t> dimension = {3, 1, 1, 2};
+  const int channel_index = 0;
+
+  // Create holder for output data.
+  std::vector<int8_t> output_data(3 * 1 * 1 * 2);
+
+  // Call SymmetricPerChannelQuantizeValues and verify the result.
+  SymmetricPerChannelQuantizeValues(input.data(), scales_inv, dimension,
+                                    channel_index, &output_data);
+  const std::vector<int8_t> expected_output_data = {
+      26, 42,   // Channel 1.
+      11, 11,   // Channel 2.
+      93, 120,  // Channel 3.
+  };
+  EXPECT_THAT(output_data, ElementsAreArray(expected_output_data));
+}
+
+}  // namespace
+}  // namespace utils
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55a9b1c580a4c08a2f9dabeee527dbc919c74467
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantize_model.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/quantize_model.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"
+
+namespace tflite {
+namespace optimize {
+
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* model, ErrorReporter* error_reporter) {
+  for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
+       subgraph_idx++) {
+    SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+    internal::SubgraphQuantizer quantizer(model, subgraph, error_reporter);
+    for (int op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
+      auto status = quantizer.QuantizeOperator(op_idx);
+      if (status != kTfLiteOk) {
+        OperatorT* op = subgraph->operators[op_idx].get();
+        const BuiltinOperator op_code =
+            model->operator_codes[op->opcode_index]->builtin_code;
+        error_reporter->Report(
+            "Failed to quantized operator: %s in subgraph %d, node: %d",
+            EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
+        return kTfLiteError;
+      }
+    }
+  }
+
+  flatbuffers::Offset<Model> output_model_location =
+      Model::Pack(*builder, model);
+  FinishModelBuffer(*builder, output_model_location);
+
+  return kTfLiteOk;
+}
+
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4a62435d3f5c719aab60755e86928487f05e4f0
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantize_model.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_MODEL_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_MODEL_H_
+
+#include <memory>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+
+// Quantizes input_model and populates the provided builder with the new model.
+// input_model is required to have min/max information populated in its
+// quantization params.
+//
+// Note: This is a private API, subject to change.
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* input_model, ErrorReporter* error_reporter);
+
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_MODEL_H_
diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf3eb2dde6c3aa95963178041545b9cd8a1909c7
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantize_model.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadTestModel() {
+  auto model_path = tensorflow::io::JoinPath(
+      *g_test_model_dir, internal::kConvModelWith0Plus10Weights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+template <typename T>
+std::vector<T> GetAsVector(const flatbuffers::Vector<T>* vec) {
+  return std::vector<T>(vec->begin(), vec->end());
+}
+
+class QuantizeModelTest : public testing::Test {
+ protected:
+  QuantizeModelTest() {
+    input_model_ = ReadTestModel();
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+
+  std::unique_ptr<FlatBufferModel> input_model_;
+  const Model* readonly_model_;
+  tflite::ModelT model_;
+  flatbuffers::FlatBufferBuilder builder_;
+  internal::FailOnErrorReporter error_reporter_;
+};
+
+TEST_F(QuantizeModelTest, QuantizationSucceeds) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  const uint8_t* buffer = builder_.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+}
+
+TEST_F(QuantizeModelTest, TensorShapesAndStructureIsUnchanged) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size();
+       subgraph_idx++) {
+    const auto quantized_graph = model_.subgraphs[subgraph_idx].get();
+    const auto float_graph = readonly_model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors.size(), float_graph->tensors()->size());
+    for (size_t i = 0; i < quantized_graph->tensors.size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors[i].get();
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      EXPECT_EQ(quant_tensor->buffer, float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable, float_tensor->is_variable());
+      EXPECT_EQ(quant_tensor->shape, GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name, float_tensor->name()->str());
+    }
+  }
+}
+
+TEST_F(QuantizeModelTest, OperatorsAreUnchanged) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  ASSERT_EQ(model_.operator_codes.size(),
+            readonly_model_->operator_codes()->size());
+  for (size_t i = 0; i < model_.operator_codes.size(); i++) {
+    const auto float_model_op = readonly_model_->operator_codes()->Get(i);
+    EXPECT_EQ(model_.operator_codes[i]->builtin_code,
+              float_model_op->builtin_code());
+    EXPECT_EQ(model_.operator_codes[i]->version, float_model_op->version());
+  }
+
+  ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size();
+       subgraph_idx++) {
+    const auto quantized_graph = model_.subgraphs[subgraph_idx].get();
+    const auto float_graph = readonly_model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->operators.size(),
+              float_graph->operators()->size());
+    for (size_t i = 0; i < quantized_graph->operators.size(); i++) {
+      const auto quant_op = quantized_graph->operators[i].get();
+      const auto float_op = float_graph->operators()->Get(i);
+      EXPECT_EQ(quant_op->inputs, GetAsVector(float_op->inputs()));
+      EXPECT_EQ(quant_op->outputs, GetAsVector(float_op->outputs()));
+      EXPECT_EQ(quant_op->opcode_index, float_op->opcode_index());
+    }
+  }
+}
+
+TEST_F(QuantizeModelTest, GraphIsFullyQuantized) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  for (const auto& subgraph : model_.subgraphs) {
+    for (const auto& tensor : subgraph->tensors) {
+      EXPECT_TRUE(tensor->type == TensorType_INT32 ||
+                  tensor->type == TensorType_INT8);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc
index de3c0b03237c1c85d1cfbeafc2ce8db4faf70ff6..f0a280f1c1fc2a3ab45c8e5916d9c26254a8849b 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights.cc
@@ -21,11 +21,12 @@ limitations under the License.
 
 #include "flatbuffers/flexbuffers.h"
 #include "absl/memory/memory.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/context.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
 
 namespace tflite {
 namespace optimize {
@@ -33,72 +34,36 @@ namespace optimize {
 namespace {
 
 typedef struct {
-  TensorT* tensor;
+  OperatorT* op;
+  // The index of the op in the operators vector.
+  int32_t op_idx;
   // The index of the tensor to quantize in subgraph->tensors.
-  int32_t tensor_idx;
-  // The index of the tensor of the weight tensor to be quantize in op->inputs.
   int32_t op_input_idx;
-  // True if the tensor supports hybrid evaluation.
-  bool eval_hybrid;
-} TensorInfo;
+} ConsumerOpInfo;
 
 // The default minimum number of elements a weights array must have to be
 // quantized by this transformation.
 const int kWeightsMinNumElementsDefault = 1024;
 
-// Nudge min and max so that floating point 0 falls exactly on a quantized
-// value, returning the nudges scale and zero_point.
-//
-// Although this code originates from FakeQuantization in quantized training,
-// we may deviate from that implementation as we please since we do not fine
-// tune the weights with quantized training.
-void GetAsymmetricQuantizationParams(
-    const float min, const float max, const int quant_min, const int quant_max,
-    QuantizationParametersT* quantization_params) {
-  // Adjust the boundaries to guarantee 0 is included.
-  const float quant_min_float = std::min(static_cast<float>(quant_min), 0.0f);
-  const float quant_max_float = std::max(static_cast<float>(quant_max), 0.0f);
-  const float scale = (max - min) / (quant_max_float - quant_min_float);
-  const float zero_point_from_min = quant_min_float - min / scale;
-  int64_t zero_point;
-  if (zero_point_from_min < quant_min_float) {
-    zero_point = static_cast<int64_t>(quant_min);
-  } else if (zero_point_from_min > quant_max_float) {
-    zero_point = static_cast<int64_t>(quant_max);
-  } else {
-    zero_point = static_cast<int64_t>(std::round(zero_point_from_min));
-  }
-  quantization_params->scale = std::vector<float>(1, scale);
-  quantization_params->zero_point = std::vector<int64_t>(1, zero_point);
-}
-
-// Returns the number of elements in tensor.
-uint64_t NumElements(const TensorT* tensor) {
-  if (tensor->shape.empty()) {
-    LOG(FATAL) << "Tensor has no shape information.";
-  }
-  uint64_t num_elements = 1;
-  for (const uint64_t dim : tensor->shape) {
-    num_elements *= dim;
-  }
-  return num_elements;
-}
-
-uint64_t CountTensorConsumers(const ModelT* model, const SubGraphT* subgraph,
-                              int32_t tensor_idx) {
-  uint64_t count = 0;
+// Gets the operators that consume tensor_idx.
+std::vector<ConsumerOpInfo> GetTensorConsumers(const ModelT* model,
+                                               const SubGraphT* subgraph,
+                                               int32_t tensor_idx) {
+  // TODO(suharshs): If this proves to be too slow, avoid calling it per tensor,
+  // instead doing one sweep for the entire model.
+  std::vector<ConsumerOpInfo> consumer_ops;
   for (int op_idx = 0; op_idx < subgraph->operators.size(); ++op_idx) {
-    const OperatorT* op = subgraph->operators[op_idx].get();
+    OperatorT* op = subgraph->operators[op_idx].get();
     if (op == nullptr) {
       continue;
     }
     for (int i = 0; i < op->inputs.size(); ++i) {
       if (op->inputs[i] == tensor_idx) {
-        count++;
+        consumer_ops.push_back({op, op_idx, i});
       }
     }
   }
-  return count;
+  return consumer_ops;
 }
 
 // Gets the list of op->inputs indices of the weights inputs to be quantized for
@@ -156,23 +121,39 @@ bool IsHybridEvaluationOp(const OperatorT* op, const BuiltinOperator& op_code) {
   return eval_hybrid;
 }
 
-// Returns a vector of TensorInfos for each input tensor of op that should be
-// quantized.
-std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
+// Returns true if all of the op's inputs are quantized.
+bool CheckAllOpInputsQuantized(const SubGraphT* subgraph, const OperatorT* op,
+                               const BuiltinOperator& op_code) {
+  std::vector<int32_t> op_input_indices = GetWeightInputIndices(op_code);
+  for (const int32_t op_input_idx : op_input_indices) {
+    int32_t tensor_idx = op->inputs[op_input_idx];
+
+    if (tensor_idx == -1) {
+      // Optional tensor.
+      continue;
+    }
+
+    TensorT* tensor = subgraph->tensors[tensor_idx].get();
+
+    if (tensor->type != TensorType_INT8) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Inserts Tensors for each input tensor of op that should be
+// quantized into tensor_map.
+TfLiteStatus InsertQuantizableInputTensorsFromOperator(
     const ModelT* model, const OperatorT* op, uint64_t weights_min_num_elements,
-    bool use_hybrid_evaluation) {
+    std::unordered_map<int32_t, TensorT*>* tensor_map) {
   SubGraphT* subgraph = model->subgraphs.at(0).get();
   const BuiltinOperator op_code =
       model->operator_codes[op->opcode_index]->builtin_code;
 
-  std::vector<TensorInfo> tensor_infos;
-
-  bool eval_hybrid = use_hybrid_evaluation && IsHybridEvaluationOp(op, op_code);
-
   std::vector<int32_t> op_input_indices = GetWeightInputIndices(op_code);
   for (const int32_t op_input_idx : op_input_indices) {
     int32_t tensor_idx = op->inputs[op_input_idx];
-
     if (tensor_idx == -1) {
       LOG(INFO) << "Skipping optional tensor input " << op_input_idx
                 << " of operation " << EnumNameBuiltinOperator(op_code);
@@ -180,28 +161,18 @@ std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
     }
 
     TensorT* tensor = subgraph->tensors[tensor_idx].get();
-    // TODO(suharshs): Support shared weights, i.e. If two tensors share the
-    // same weight array, things may break. (i.e. SSD object detection)
-    if (CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
-      LOG(INFO) << "Skipping quantization of tensor " << tensor->name
-                << " that is shared between multiple multiple operations.";
-      continue;
-    }
-
     if (tensor->type != TensorType_FLOAT32) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " that is not type float.";
       continue;
     }
 
-    const uint64_t num_elements = NumElements(tensor);
+    uint64_t num_elements;
+    TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
     if (num_elements < weights_min_num_elements) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " because it has fewer than " << weights_min_num_elements
                 << " elements (" << num_elements << ").";
-      // If one of the weights isn't quantized, then we cannot use the hybrid
-      // kernel for this operation, since it expects everything to be quantized.
-      eval_hybrid = false;
       continue;
     }
 
@@ -213,57 +184,8 @@ std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
       continue;
     }
 
-    TensorInfo tensor_info;
-    tensor_info.eval_hybrid = eval_hybrid;
-    tensor_info.op_input_idx = op_input_idx;
-    tensor_info.tensor_idx = tensor_idx;
-    tensor_info.tensor = tensor;
-
-    tensor_infos.push_back(tensor_info);
-  }
-
-  return tensor_infos;
-}
-
-// Quantizes tensor using asymmetric quantization with the min and max elements
-// of the tensor. This is needed to pass to Dequantize operations.
-TfLiteStatus AsymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
-  BufferT* buffer = model->buffers[tensor->buffer].get();
-  float* float_data = reinterpret_cast<float*>(buffer->data.data());
-  const uint64_t num_elements = NumElements(tensor);
-  LOG(INFO) << "Quantizing tensor " << tensor->name << " with " << num_elements
-            << " elements for float evaluation.";
-
-  // Compute the quantization params.
-  float min_value = *std::min_element(float_data, float_data + num_elements);
-  float max_value = *std::max_element(float_data, float_data + num_elements);
-
-  if (tensor->quantization == nullptr) {
-    tensor->quantization = absl::make_unique<QuantizationParametersT>();
-  }
-  GetAsymmetricQuantizationParams(min_value, max_value, 0, 255,
-                                  tensor->quantization.get());
-
-  // Quantize the buffer.
-  std::vector<uint8_t> quantized_buffer;
-  quantized_buffer.resize(num_elements);
-  const double inverse_scale = 1. / tensor->quantization->scale[0];
-  for (std::size_t i = 0; i < num_elements; i++) {
-    const float src_val = float_data[i];
-    double scaled_val;
-    if (tensor->quantization->scale[0] == 0) {
-      scaled_val = tensor->quantization->zero_point[0];
-    } else {
-      scaled_val =
-          tensor->quantization->zero_point[0] + inverse_scale * src_val;
-    }
-    uint8_t integer_val = static_cast<uint8_t>(std::round(scaled_val));
-    quantized_buffer[i] = integer_val;
+    tensor_map->insert({tensor_idx, tensor});
   }
-  model->buffers[tensor->buffer]->data = quantized_buffer;
-
-  // Update the tensor type.
-  tensor->type = TensorType_UINT8;
 
   return kTfLiteOk;
 }
@@ -274,9 +196,10 @@ TfLiteStatus AsymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
 TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
   BufferT* buffer = model->buffers[tensor->buffer].get();
   float* float_data = reinterpret_cast<float*>(buffer->data.data());
-  const uint64_t num_elements = NumElements(tensor);
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
   LOG(INFO) << "Quantizing tensor " << tensor->name << " with " << num_elements
-            << " elements for hybrid evaluation.";
+            << " elements.";
 
   std::vector<int8_t> quantized_buffer;
   quantized_buffer.resize(num_elements);
@@ -297,7 +220,7 @@ TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
                                               uint8_buffer + num_elements);
 
   // Update the tensor type.
-  tensor->type = TensorType_UINT8;
+  tensor->type = TensorType_INT8;
 
   return kTfLiteOk;
 }
@@ -313,7 +236,8 @@ int32_t GetOrInsertDequantizeOpCodeIndex(ModelT* model) {
   model->operator_codes.push_back(absl::make_unique<OperatorCodeT>());
   int op_code_idx = model->operator_codes.size() - 1;
   model->operator_codes[op_code_idx]->builtin_code = BuiltinOperator_DEQUANTIZE;
-  // TODO(suharshs): How should the version be set in this op_code?
+  // Version 2 and onwards supports INT8 inputs.
+  model->operator_codes[op_code_idx]->version = 2;
 
   // Return the index of the newly placed OperatorCodeT.
   return op_code_idx;
@@ -340,6 +264,26 @@ void MakeTensor(const string& name, const std::vector<int32_t>& shape,
   tensor->reset(tensor_raw);
 }
 
+// Updates operator code versions for the operators with INT8 inputs.
+void UpdateInt8OperatorVersions(ModelT* model) {
+  for (int i = 0; i < model->operator_codes.size(); ++i) {
+    const BuiltinOperator& op_code = model->operator_codes[i]->builtin_code;
+    if (op_code == BuiltinOperator_CONV_2D || op_code == BuiltinOperator_SVDF ||
+        op_code == BuiltinOperator_EMBEDDING_LOOKUP ||
+        op_code == BuiltinOperator_RNN ||
+        op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN ||
+        op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM ||
+        op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN) {
+      model->operator_codes[i]->version = 2;
+
+    } else if (op_code == BuiltinOperator_FULLY_CONNECTED ||
+               op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM ||
+               op_code == BuiltinOperator_LSTM) {
+      model->operator_codes[i]->version = 3;
+    }
+  }
+}
+
 TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
                                      const Model* input_model,
                                      bool use_hybrid_evaluation,
@@ -357,48 +301,82 @@ TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
   SubGraphT* subgraph = model->subgraphs.at(0).get();
 
   std::vector<std::unique_ptr<OperatorT>> new_operators;
+  std::unordered_map<int32_t, TensorT*> tensor_map;
   for (int i = 0; i < subgraph->operators.size(); ++i) {
     OperatorT* op = subgraph->operators[i].get();
+    TF_LITE_ENSURE_STATUS(InsertQuantizableInputTensorsFromOperator(
+        model.get(), op, weights_min_num_elements, &tensor_map));
+  }
+
+  // The unordered_map ensures that we quantize each tensor exactly once.
+  // TODO(suharshs): This map key isn't sufficient when we support multiple
+  // subgraphs.
+  for (std::pair<int32_t, TensorT*> tensor_pair : tensor_map) {
+    // Quantize the tensor.
+    TF_LITE_ENSURE_STATUS(
+        SymmetricQuantizeTensor(model.get(), tensor_pair.second));
+  }
 
-    std::vector<TensorInfo> tensor_infos = GetQuantizableTensorsFromOperator(
-        model.get(), op, weights_min_num_elements, use_hybrid_evaluation);
-
-    for (const TensorInfo& tensor_info : tensor_infos) {
-      if (tensor_info.eval_hybrid) {
-        // Quantize the tensor.
-        TF_LITE_ENSURE_STATUS(
-            SymmetricQuantizeTensor(model.get(), tensor_info.tensor));
-      } else {
-        // Quantize the tensor.
-        TF_LITE_ENSURE_STATUS(
-            AsymmetricQuantizeTensor(model.get(), tensor_info.tensor));
-
-        // Create a new tensor to be the output of the dequantize op.
-        std::unique_ptr<TensorT> dequantize_output;
-        MakeTensor(tensor_info.tensor->name + "_dequantize",
-                   tensor_info.tensor->shape, &dequantize_output);
-        const int32_t dequantize_output_idx = subgraph->tensors.size();
-        subgraph->tensors.push_back(std::move(dequantize_output));
-
-        // Create the Dequantize operation.
-        std::unique_ptr<OperatorT> dequantize_op;
-        MakeDequantizeOperator(model.get(), &dequantize_op,
-                               tensor_info.tensor_idx, dequantize_output_idx);
-
-        // Update the op_input of tensor_idx to dequantize_output_idx.
-        op->inputs[tensor_info.op_input_idx] = dequantize_output_idx;
-
-        // Insert the newly created Dequantize operation.
-        new_operators.push_back(std::move(dequantize_op));
+  // Examine the tensor consumers to determine which require dequantize ops.
+  for (const auto& tensor_pair : tensor_map) {
+    const int32_t tensor_idx = tensor_pair.first;
+    TensorT* tensor = tensor_pair.second;
+    std::vector<ConsumerOpInfo> consumer_op_infos =
+        GetTensorConsumers(model.get(), subgraph, tensor_idx);
+
+    std::vector<ConsumerOpInfo> dequant_op_infos;  // Ops that need dequants.
+    for (ConsumerOpInfo& consumer_op_info : consumer_op_infos) {
+      OperatorT* consumer_op = consumer_op_info.op;
+      const BuiltinOperator consumer_op_code =
+          model->operator_codes[consumer_op->opcode_index]->builtin_code;
+      // If the op is a hybrid op and all the required tensors are quantized,
+      // we have no further work to do, but for all ops that require
+      // dequantization we need to add a Dequantize op.
+      bool eval_hybrid =
+          use_hybrid_evaluation &&
+          IsHybridEvaluationOp(consumer_op, consumer_op_code) &&
+          CheckAllOpInputsQuantized(subgraph, consumer_op, consumer_op_code);
+      if (!eval_hybrid) {
+        dequant_op_infos.push_back(consumer_op_info);
       }
     }
-    // After (maybe) quantizing inputs, we copy the operator into the new list.
-    new_operators.push_back(std::move(subgraph->operators[i]));
+
+    // If no ops require dequant, we are done for this tensor.
+    if (dequant_op_infos.empty()) {
+      continue;
+    }
+
+    // Create a new tensor to be the output of the dequantize op.
+    std::unique_ptr<TensorT> dequantize_output;
+    const string dequant_name = tensor->name + "_dequantize";
+    MakeTensor(dequant_name, tensor->shape, &dequantize_output);
+    const int32_t dequantize_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(dequantize_output));
+
+    // Create the Dequantize operation.
+    std::unique_ptr<OperatorT> dequantize_op;
+    MakeDequantizeOperator(model.get(), &dequantize_op, tensor_idx,
+                           dequantize_output_idx);
+
+    LOG(INFO) << "Creating Dequantize op with name " << dequant_name << ".";
+
+    // Update the op_input of all the ops that need the created dequantize
+    // operation.
+    int32_t min_op_idx = 0;
+    for (ConsumerOpInfo& dequant_op_info : dequant_op_infos) {
+      dequant_op_info.op->inputs[dequant_op_info.op_input_idx] =
+          dequantize_output_idx;
+      min_op_idx = std::min(dequant_op_info.op_idx, min_op_idx);
+    }
+
+    // Insert the newly created Dequantize operation before the earliest
+    // consumer, since TFLite requires operators to be topo-sorted.
+    subgraph->operators.insert(subgraph->operators.begin() + min_op_idx,
+                               std::move(dequantize_op));
   }
 
-  // At this point all unique_ptrs in the original operators are invalid, and
-  // we need to replace it with the new_operators vector.
-  subgraph->operators = std::move(new_operators);
+  // Update the modified operator code versions.
+  UpdateInt8OperatorVersions(model.get());
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model.get());
@@ -412,11 +390,12 @@ TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
 namespace internal {
 TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
                              const Model* input_model,
+                             uint64_t weights_min_num_elements,
                              bool use_hybrid_evaluation) {
   // By default we require that only weights with more than
   // kWeightsMinSizeDefault elements are quantized.
   return QuantizeWeightsInternal(builder, input_model, use_hybrid_evaluation,
-                                 kWeightsMinNumElementsDefault);
+                                 weights_min_num_elements);
 }
 }  // namespace internal
 
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.h b/tensorflow/lite/tools/optimize/quantize_weights.h
index c2c0b0ce83435dc423a62cea598e35ba45a0561f..6baecc210fa0b52ddccace05a3fc7d6a9908712d 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.h
+++ b/tensorflow/lite/tools/optimize/quantize_weights.h
@@ -48,6 +48,7 @@ namespace internal {
 // evaluation disabled.
 TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
                              const Model* input_model,
+                             uint64_t weights_min_num_elements,
                              bool use_hybrid_evaluation);
 }  // namespace internal
 
diff --git a/tensorflow/lite/tools/optimize/quantize_weights_test.cc b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
index 32725e5ee29c364d56754c08a2cb1084ef049fdb..a18b3bb7ffecfa71f24890fb0cbfbdc94d66c0c2 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
@@ -12,215 +12,346 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/quantize_weights.h"
-
+#include <cstddef>
+#include <cstdint>
 #include <memory>
 
-#include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantize_weights.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
 
 namespace tflite {
 namespace optimize {
 namespace {
 
-class QuantizeWeightsTest : public ::testing::Test {
+std::unique_ptr<FlatBufferModel> ReadTestModel() {
+  auto model_path = tensorflow::io::JoinPath(
+      *g_test_model_dir, internal::kConvModelWith0Plus10Weights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadSharedWeightsTestModel() {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir,
+                                             internal::kModelWithSharedWeights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+template <typename T>
+std::vector<T> GetAsVector(const flatbuffers::Vector<T>* vec) {
+  return std::vector<T>(vec->begin(), vec->end());
+}
+
+class QuantizeWeightsTest : public testing::Test {
  protected:
-  int GetElementsNum(const TensorT* tensor) {
-    int tensor_size = 1;
-    for (const int dim : tensor->shape) {
-      tensor_size *= dim;
-    }
-    return tensor_size;
+  QuantizeWeightsTest() {}
+
+  void LoadBasicModel() {
+    input_model_ = ReadTestModel();
+    model_ = input_model_->GetModel();
   }
 
-  const OperatorT* GetOpWithOutput(const SubGraphT* subgraph,
-                                   int32_t output_tensor_idx) {
-    for (int i = 0; i < subgraph->operators.size(); ++i) {
-      OperatorT* op = subgraph->operators[i].get();
-      if (std::find(op->outputs.begin(), op->outputs.end(),
-                    output_tensor_idx) != op->outputs.end()) {
-        return op;
-      }
-    }
-    return nullptr;
+  void LoadSharedWeightsModel() {
+    input_model_ = ReadSharedWeightsTestModel();
+    model_ = input_model_->GetModel();
   }
 
-  void SymmetricDequantizeAndCompare(const BufferT* input_buffer,
-                                     const BufferT* output_buffer,
-                                     float scale) {
-    const float* input_buffer_data =
-        reinterpret_cast<const float*>(input_buffer->data.data());
-    const int8_t* output_buffer_data =
-        reinterpret_cast<const int8_t*>(output_buffer->data.data());
-    for (int i = 0; i < output_buffer->data.size(); i++) {
-      float diff = input_buffer_data[i] - (output_buffer_data[i] * scale);
-      ASSERT_TRUE(std::abs(diff) <= scale);
+  std::unique_ptr<FlatBufferModel> input_model_;
+  const Model* model_;
+
+  bool IsModelInputOrOutput(const Model* model, uint32_t tensor_idx) {
+    for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+         ++subgraph_idx) {
+      const auto subgraph = model->subgraphs()->Get(subgraph_idx);
+      for (size_t i = 0; i < subgraph->inputs()->size(); ++i) {
+        if (subgraph->inputs()->Get(i) == tensor_idx) {
+          return true;
+        }
+      }
+      for (size_t i = 0; i < subgraph->outputs()->size(); ++i) {
+        if (subgraph->outputs()->Get(i) == tensor_idx) {
+          return true;
+        }
+      }
     }
+    return false;
   }
 
-  void AsymmetricDequantizeAndCompare(const BufferT* input_buffer,
-                                      const BufferT* output_buffer, float scale,
-                                      int64_t zero_point) {
-    const float* input_buffer_data =
-        reinterpret_cast<const float*>(input_buffer->data.data());
-    const uint8_t* output_buffer_data = output_buffer->data.data();
-    for (int i = 0; i < output_buffer->data.size(); i++) {
-      float diff =
-          input_buffer_data[i] - ((output_buffer_data[i] - zero_point) * scale);
-      ASSERT_TRUE(std::abs(diff) <= scale);
+  // Returns the producer op code of the specified tensor_idx.
+  bool GetProducerOpCode(const Model* model, uint32_t subgraph_idx,
+                         uint32_t tensor_idx,
+                         tflite::BuiltinOperator* op_code) {
+    const auto subgraph = model->subgraphs()->Get(subgraph_idx);
+    for (size_t op_idx = 0; op_idx < subgraph->operators()->size(); ++op_idx) {
+      const auto op = subgraph->operators()->Get(op_idx);
+      for (size_t i = 0; i < op->outputs()->size(); ++i) {
+        if (op->outputs()->Get(i) == tensor_idx) {
+          const uint32_t op_code_idx = op->opcode_index();
+          *op_code = model->operator_codes()->Get(op_code_idx)->builtin_code();
+          return true;
+        }
+      }
     }
+    return false;
   }
+};
 
-  void CheckWeights(const Model* input_model_packed,
-                    const Model* output_model_packed,
-                    bool use_hybrid_evaluation,
-                    uint64_t weights_min_num_elements = 1024) {
-    std::unique_ptr<ModelT> input_model;
-    input_model.reset(input_model_packed->UnPack());
-
-    std::unique_ptr<ModelT> output_model;
-    output_model.reset(output_model_packed->UnPack());
-
-    SubGraphT* subgraph = output_model->subgraphs.at(0).get();
-
-    for (int i = 0; i < subgraph->operators.size(); ++i) {
-      OperatorT* op = subgraph->operators[i].get();
-      const BuiltinOperator op_code =
-          output_model->operator_codes[op->opcode_index]->builtin_code;
-
-      // These are the operations that should be quantized.
-      // TODO(suharshs): Right now this test only checks the relevant operations
-      // for the mobilenet v1 model used in the tests below.
-      int32_t tensor_idx;
-      if (op_code == BuiltinOperator_CONV_2D ||
-          op_code == BuiltinOperator_DEPTHWISE_CONV_2D ||
-          op_code == BuiltinOperator_FULLY_CONNECTED) {
-        tensor_idx = op->inputs[1];
-      } else {
-        continue;
-      }
+TEST_F(QuantizeWeightsTest, QuantizationSucceeds) {
+  LoadBasicModel();
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
 
-      bool eval_hybrid = false;
-      // These are the ops that support hybrid evaluation.
-      if (op_code == BuiltinOperator_FULLY_CONNECTED ||
-          op_code == BuiltinOperator_CONV_2D) {
-        eval_hybrid = true;
-      }
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+}
 
-      const TensorT* tensor = subgraph->tensors[tensor_idx].get();
-      int tensor_size = GetElementsNum(tensor);
-      // If the tensor_size is less than 1024 we expect the tensor to remain
-      // unquantized.
-      if (tensor_size < weights_min_num_elements) {
-        ASSERT_TRUE(tensor->type == TensorType_FLOAT32)
-            << tensor->name << " of type " << tensor->type;
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        // The weight tensor should not come from a dequantize op.
-        ASSERT_TRUE(preceding_op == nullptr);
-      } else if (use_hybrid_evaluation && eval_hybrid) {
-        // The input to the op should still be uint8.
-        ASSERT_TRUE(tensor->type == TensorType_UINT8) << tensor->name;
-        // The weight tensor should not come from a dequantize op.
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        ASSERT_TRUE(preceding_op == nullptr);
-
-        // Test symmetric quantization.
-        SymmetricDequantizeAndCompare(
-            input_model->buffers[tensor->buffer].get(),
-            output_model->buffers[tensor->buffer].get(),
-            tensor->quantization->scale[0]);
+TEST_F(QuantizeWeightsTest, WeightsMinNumElements) {
+  LoadBasicModel();
+  // Make weights_min_size sufficiently large such that no quantization should
+  // happen, i.e. the original model is the same size as the old one.
+  flatbuffers::FlatBufferBuilder builder;
+  const uint64_t kWeightsMinNumElements = 1000000;
+  EXPECT_EQ(QuantizeWeights(&builder, model_, kWeightsMinNumElements),
+            kTfLiteOk);
 
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size());
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      // Everything should remain equal between the two graphs.
+      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
+      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
+                GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
+      EXPECT_EQ(quant_tensor->type(), float_tensor->type());
+    }
+  }
+}
+
+TEST_F(QuantizeWeightsTest, HybridConv) {
+  LoadBasicModel();
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
+
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+
+  // Nothing should change.
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size());
+    // Make sure the graph only has one Conv operation.
+    ASSERT_EQ(quantized_graph->operators()->size(), 1);
+    const auto op = quantized_graph->operators()->Get(0);
+    const uint32_t op_code_idx = op->opcode_index();
+    ASSERT_EQ(output_model->operator_codes()->Get(op_code_idx)->builtin_code(),
+              BuiltinOperator_CONV_2D);
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
+      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
+                GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
+      // If the tensor is a weight, it should have type INT8, otherwise it
+      // should stay with type FLOAT32.
+      // If the tensor is a bias, it should have type FLOAT32.
+      if (quant_tensor->name()->str() == "conv_bias") {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (IsModelInputOrOutput(output_model, i)) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->buffer() != 0) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8)
+            << quant_tensor->name()->str();
       } else {
-        // The input to the op should still be float.
-        ASSERT_TRUE(tensor->type == TensorType_FLOAT32) << tensor->name;
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        ASSERT_TRUE(preceding_op != nullptr);
-        // The float input should be the dequantize output.
-        ASSERT_TRUE(output_model->operator_codes[preceding_op->opcode_index]
-                        ->builtin_code == BuiltinOperator_DEQUANTIZE);
-        // Finally, ensure that the input to the dequantize operation is
-        // quantized.
-        const TensorT* quantized_tensor =
-            subgraph->tensors[preceding_op->inputs[0]].get();
-        ASSERT_TRUE(quantized_tensor->type == TensorType_UINT8);
-
-        // Test the assymetric quantization.
-        AsymmetricDequantizeAndCompare(
-            input_model->buffers[quantized_tensor->buffer].get(),
-            output_model->buffers[quantized_tensor->buffer].get(),
-            quantized_tensor->quantization->scale[0],
-            quantized_tensor->quantization->zero_point[0]);
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
       }
     }
   }
-};
-
-TEST_F(QuantizeWeightsTest, SimpleTestWithHybrid) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
+}
 
+TEST_F(QuantizeWeightsTest, DequantizeConv) {
+  LoadBasicModel();
   flatbuffers::FlatBufferBuilder builder;
-  EXPECT_EQ(QuantizeWeights(&builder, input_model), kTfLiteOk);
+  auto status = internal::QuantizeWeights(&builder, model_, 0,
+                                          /*use_hybrid_evaluation=*/false);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-
-  CheckWeights(input_model, output_model, true);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    // The output graph should have an extra tensor from the added dequantize
+    // op.
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size() + 1);
+    // Check that a dequantize op exists.
+    int32_t dequant_input_idx = -1;
+    int32_t dequant_output_idx = -1;
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      if (output_model->operator_codes()->Get(op_code_idx)->builtin_code() ==
+          BuiltinOperator_DEQUANTIZE) {
+        dequant_input_idx = op->inputs()->Get(0);
+        dequant_output_idx = op->outputs()->Get(0);
+      }
+    }
+    ASSERT_GT(dequant_input_idx, -1);
+    ASSERT_GT(dequant_output_idx, -1);
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); ++i) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      // If the tensor is a weight, it should have type INT8.
+      // If the tensor is a bias, it should have type FLOAT32.
+      // If the tensor is an input or output it should have type FLOAT32.
+      // The input to dequantize should be INT8, and all other tensors should be
+      // FLOAT32.
+      if (i == dequant_input_idx) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
+      } else if (i == dequant_output_idx) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (IsModelInputOrOutput(output_model, i)) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->name()->str() == "conv_bias") {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->buffer() != 0) {
+        // If its a non-bias constant tensor, is must be the weight.
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
+      } else {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      }
+    }
+  }
 }
 
-TEST_F(QuantizeWeightsTest, SimpleTestWithoutHybrid) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
-
+TEST_F(QuantizeWeightsTest, SharedWeights_Hybrid) {
+  LoadSharedWeightsModel();
   flatbuffers::FlatBufferBuilder builder;
-  // Disable hybrid evaluation.
-  EXPECT_EQ(internal::QuantizeWeights(&builder, input_model, false), kTfLiteOk);
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-
-  CheckWeights(input_model, output_model, false);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  uint32_t num_conv_ops = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      const auto op_code =
+          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
+      if (op_code == BuiltinOperator_CONV_2D) {
+        num_conv_ops++;
+        // Ensure that each convolution's weights tensor is now INT8.
+        const auto weights_tensor =
+            quantized_graph->tensors()->Get(op->inputs()->Get(1));
+        EXPECT_EQ(weights_tensor->type(), TensorType_INT8);
+      }
+    }
+  }
+  // Ensure that there were exactly two convolutions in the model.
+  EXPECT_EQ(num_conv_ops, 2);
 }
 
-TEST_F(QuantizeWeightsTest, SimpleTestWithWeightsMinNumElements) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
-
+TEST_F(QuantizeWeightsTest, SharedWeights_Dequantize) {
+  LoadSharedWeightsModel();
   flatbuffers::FlatBufferBuilder builder;
-  // Make weights_min_size sufficiently large such that no quantization should
-  // happen, i.e. the original model is the same size as the old one.
-  const uint64_t kWeightsMinNumElements = 1000000;
-  EXPECT_EQ(QuantizeWeights(&builder, input_model, kWeightsMinNumElements),
-            kTfLiteOk);
+  auto status = internal::QuantizeWeights(&builder, model_, 0,
+                                          /*use_hybrid_evaluation*/ false);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-  CheckWeights(input_model, output_model, true, kWeightsMinNumElements);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  uint32_t num_conv_ops = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      const auto op_code =
+          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
+      if (op_code == BuiltinOperator_CONV_2D) {
+        num_conv_ops++;
+        // Ensure that each convolution's weights tensor is still FLOAT
+        // (the output of the dequantize).
+        uint32_t weights_tensor_index = op->inputs()->Get(1);
+        const auto weights_tensor =
+            quantized_graph->tensors()->Get(weights_tensor_index);
+        EXPECT_EQ(weights_tensor->type(), TensorType_FLOAT32);
+
+        // Check that it comes from a dequantize operation.
+        BuiltinOperator producer_op_code;
+        ASSERT_TRUE(GetProducerOpCode(output_model, subgraph_idx,
+                                      weights_tensor_index, &producer_op_code));
+        EXPECT_EQ(producer_op_code, BuiltinOperator_DEQUANTIZE);
+      }
+    }
+  }
+  // Ensure that there were exactly two convolutions in the model.
+  EXPECT_EQ(num_conv_ops, 2);
 }
 
-// TODO(suharshs): Add tests that run the resulting model.
-
 }  // namespace
 }  // namespace optimize
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: FLAGS_logtostderr = true;
-  ::testing::InitGoogleTest(&argc, argv);
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..118e055058f37693c95cd7278f3229f35434ddb4
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
@@ -0,0 +1,356 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "flatbuffers/flexbuffers.h"
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+
+namespace {
+TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
+                                   const std::vector<int64_t>& zero_point,
+                                   int quantized_dimension,
+                                   const uint8_t* buffer_data,
+                                   size_t buffer_size, TensorType output_type,
+                                   ModelT* model, TensorT* tensor) {
+  tensor->quantization = absl::make_unique<QuantizationParametersT>();
+  tensor->quantization->scale.assign(scales.begin(), scales.end());
+  if (zero_point.size() != scales.size()) {
+    return kTfLiteError;
+  }
+  tensor->quantization->zero_point.assign(zero_point.begin(), zero_point.end());
+  tensor->quantization->quantized_dimension = quantized_dimension;
+  model->buffers[tensor->buffer]->data.assign(buffer_data,
+                                              buffer_data + buffer_size);
+  // Update the tensor type.
+  tensor->type = output_type;
+  return kTfLiteOk;
+}
+
+bool OpHasOptionalBiasTensor(BuiltinOperator op_code) {
+  return op_code == BuiltinOperator_CONV_2D ||
+         op_code == BuiltinOperator_DEPTHWISE_CONV_2D;
+}
+
+struct OpWithBiasTensors {
+  int activation_input_index;
+  int weights_input_index;
+  int bias_input_index;
+  int index_for_channel_in_weights;
+};
+
+const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) {
+  if (op_code == BuiltinOperator_CONV_2D) {
+    static OpWithBiasTensors op_info = {.activation_input_index = 0,
+                                        .weights_input_index = 1,
+                                        .bias_input_index = 2,
+                                        .index_for_channel_in_weights = 0};
+    return &op_info;
+  }
+  if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) {
+    static OpWithBiasTensors op_info = {.activation_input_index = 0,
+                                        .weights_input_index = 1,
+                                        .bias_input_index = 2,
+                                        .index_for_channel_in_weights = 3};
+    return &op_info;
+  }
+
+  return nullptr;
+}
+
+// Symmetrically Quantizes the given tensor as int8 values.
+TfLiteStatus SymmetricPerChannelQuantizeTensor(ModelT* model, TensorT* tensor,
+                                               int32_t channel_dim_index,
+                                               ErrorReporter* error_reporter) {
+  if (tensor->shape.size() != 4) {
+    error_reporter->Report("Only dims=4 is supported, tensor dims: %d",
+                           tensor->shape.size());
+    return kTfLiteError;
+  }
+
+  // Get dimensions.
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
+  const int32_t channel_dim_size = tensor->shape[channel_dim_index];
+
+  // Get input float data.
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_input_data = reinterpret_cast<float*>(buffer->data.data());
+
+  // Create container for output scale and output data.
+  std::vector<float> scales(channel_dim_size);
+  std::vector<int8_t> final_buffer(num_elements);
+
+  // Quantize the input data with respect to channel_dim_index.
+  const std::vector<int> tensor_dims = {tensor->shape[0], tensor->shape[1],
+                                        tensor->shape[2], tensor->shape[3]};
+  utils::SymmetricPerChannelQuantization(
+      float_input_data, tensor_dims, channel_dim_index, &scales, &final_buffer);
+
+  // Set the buffers and output type.
+  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
+  const size_t buffer_size = num_elements * sizeof(int8_t);
+  std::vector<int64_t> zero_point(scales.size(), 0);
+  return AddQuantizationParams(scales, zero_point, channel_dim_index,
+                               uint8_buffer, buffer_size, TensorType_INT8,
+                               model, tensor);
+}
+
+// Symmetrically quantizes the bias for ops like Conv and DepthwiseConv.
+// The scale of bias if weight_per_channel_scale[channel] * input_scale
+TfLiteStatus SymmetricPerChannelBiasQuantize(const TensorT* input_tensor,
+                                             const TensorT* weight_tensor,
+                                             int channel_dim_index,
+                                             ModelT* model, TensorT* tensor,
+                                             ErrorReporter* error_reporter) {
+  if (tensor->shape.size() != 1) {
+    error_reporter->Report("Expected bias tensor shape to be 1.");
+    return kTfLiteError;
+  }
+
+  if (tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+
+  // TODO(shashishekhar): Make this support scalar biases.
+  if (tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
+    error_reporter->Report(
+        "Channel mismatch between bias and weight tensors %d vs %d",
+        tensor->shape[0], weight_tensor->shape[channel_dim_index]);
+    return kTfLiteError;
+  }
+  int32_t channel_dim_size = tensor->shape[0];
+  if (!input_tensor->quantization ||
+      input_tensor->quantization->scale.size() != 1) {
+    error_reporter->Report("Input tensor missing quantization information");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
+  const std::vector<float>& weight_scales = weight_tensor->quantization->scale;
+
+  if (weight_scales.size() != channel_dim_size) {
+    error_reporter->Report("Mismatch weight scale dimension: %d",
+                           weight_scales.size());
+    return kTfLiteError;
+  }
+
+  // Compute scales.
+  std::vector<float> scales(channel_dim_size);
+  for (size_t i = 0; i < channel_dim_size; i++) {
+    scales[i] = input_tensor->quantization->scale[0] * weight_scales[i];
+  }
+
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_data = reinterpret_cast<float*>(buffer->data.data());
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
+
+  std::vector<int32_t> final_buffer(num_elements);
+  const int32_t kScale = std::numeric_limits<int32_t>::max();
+
+  for (int32_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
+    float scaling_factor = scales[channel_idx];
+    float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
+    const int32_t quantized_value = static_cast<int32_t>(
+        TfLiteRound(float_data[channel_idx] * scaling_factor_inv));
+    final_buffer[channel_idx] =
+        std::min(kScale, std::max(-kScale, quantized_value));
+  }
+
+  // Set the buffers and output type.
+  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
+  size_t buffer_size = num_elements * sizeof(int32_t);
+  std::vector<int64_t> zero_point(scales.size(), 0);
+  return AddQuantizationParams(scales, zero_point, channel_dim_index,
+                               uint8_buffer, buffer_size, TensorType_INT32,
+                               model, tensor);
+}
+}  // namespace
+
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeTensor(
+    BuiltinOperator op_code, int32_t tensor_idx) {
+  TensorT* tensor = subgraph_->tensors[tensor_idx].get();
+  if (tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+
+  if (model_->buffers[tensor->buffer]->data.data() != nullptr) {
+    return kTfLiteError;
+  }
+  if (!tensor->quantization || tensor->quantization->min.empty() ||
+      tensor->quantization->max.empty()) {
+    error_reporter_->Report(
+        "Missing required min/max information for tensor_idx %d of operation: "
+        "%s",
+        tensor_idx, EnumNameBuiltinOperator(op_code));
+    return kTfLiteError;
+  }
+  utils::GetAsymmetricQuantizationParams(
+      tensor->quantization->min[0], tensor->quantization->max[0],
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
+      tensor->quantization.get());
+  tensor->type = TensorType_INT8;
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::QuantizeOpWithBias(BuiltinOperator op_code,
+                                                   OperatorT* op) {
+  auto op_tensor_info = GetInfoForOpWithBiasTensor(op_code);
+  if (!op_tensor_info) {
+    error_reporter_->Report("Cannot quantize op: %s",
+                            EnumNameBuiltinOperator(op_code));
+    return kTfLiteError;
+  }
+
+  // Conv/Depthwise conv have 2 inputs when there is no bias, 3 otherwise.
+  if (op->inputs.size() != 2 && op->inputs.size() != 3) {
+    return kTfLiteError;
+  }
+  auto input_tensor_idx = op->inputs[op_tensor_info->activation_input_index];
+  if (IsSubgraphInput(input_tensor_idx)) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, input_tensor_idx));
+  }
+  auto weights_tensor_idx = op->inputs[op_tensor_info->weights_input_index];
+
+  TensorT* weights_tensor = subgraph_->tensors[weights_tensor_idx].get();
+  int weights_channel_index = op_tensor_info->index_for_channel_in_weights;
+
+  auto status = SymmetricPerChannelQuantizeTensor(
+      model_, weights_tensor, weights_channel_index, error_reporter_);
+  TF_LITE_ENSURE_STATUS(status);
+
+  // If there is bias, quantize it.
+  if (op->inputs.size() == 3) {
+    auto bias_tensor_idx = op->inputs[op_tensor_info->bias_input_index];
+    const TensorT* input_tensor = subgraph_->tensors[input_tensor_idx].get();
+    TensorT* bias_tensor = subgraph_->tensors[bias_tensor_idx].get();
+    TF_LITE_ENSURE_STATUS(SymmetricPerChannelBiasQuantize(
+        input_tensor, weights_tensor, weights_channel_index, model_,
+        bias_tensor, error_reporter_));
+  }
+
+  if (op->outputs.size() != 1) {
+    return kTfLiteError;
+  }
+  auto output_tensor_idx = op->outputs[0];
+  TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, output_tensor_idx));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::PropagateMinMaxForAvgAndMaxPool(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
+
+  if (IsSubgraphInput(op->inputs[0])) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
+  }
+
+  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
+  if (output_tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+  auto input_tensor = subgraph_->tensors[op->inputs[0]].get();
+  if (!input_tensor->quantization) {
+    error_reporter_->Report(
+        "Missing required min/max information for input of operation: %s",
+        EnumNameBuiltinOperator(op_code));
+    return kTfLiteError;
+  }
+  if (input_tensor->quantization->min.size() != 1 ||
+      input_tensor->quantization->max.size() != 1 ||
+      input_tensor->quantization->scale.size() != 1 ||
+      input_tensor->quantization->zero_point.size() != 1) {
+    error_reporter_->Report(
+        "Invalid quantization information for Op: %s, tensor: %s",
+        EnumNameBuiltinOperator(op_code), input_tensor->name.c_str());
+    return kTfLiteError;
+  }
+  auto quant_params = absl::make_unique<QuantizationParametersT>();
+  // Nudge min, max to include the floating point zero.
+  const float min = std::min(0.f, input_tensor->quantization->min[0]);
+  const float max = std::max(0.f, input_tensor->quantization->max[0]);
+  quant_params->min.push_back(min);
+  quant_params->max.push_back(max);
+  quant_params->scale.push_back(input_tensor->quantization->scale[0]);
+  quant_params->zero_point.push_back(input_tensor->quantization->zero_point[0]);
+  // TODO(shashishekhar): Log a warning here if overriding existing
+  // min/max/scales differ from input scales.
+  output_tensor->quantization = std::move(quant_params);
+  output_tensor->type = TensorType_INT8;
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSingleInputOutputOp(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1);
+
+  if (IsSubgraphInput(op->inputs[0])) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
+  }
+
+  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
+  if (output_tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+  auto quant_params = absl::make_unique<QuantizationParametersT>();
+  TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->outputs[0]));
+  return kTfLiteOk;
+}
+
+bool SubgraphQuantizer::IsSubgraphInput(int32_t tensor_idx) const {
+  return std::find(subgraph_->inputs.begin(), subgraph_->inputs.end(),
+                   tensor_idx) != subgraph_->inputs.end();
+}
+
+TfLiteStatus SubgraphQuantizer::QuantizeOperator(int op_idx) {
+  OperatorT* op = subgraph_->operators[op_idx].get();
+  const BuiltinOperator op_code =
+      model_->operator_codes[op->opcode_index]->builtin_code;
+  if (OpHasOptionalBiasTensor(op_code)) {
+    return QuantizeOpWithBias(op_code, op);
+  }
+  switch (op_code) {
+    case BuiltinOperator_AVERAGE_POOL_2D:
+    case BuiltinOperator_MAX_POOL_2D:
+      return PropagateMinMaxForAvgAndMaxPool(op_code, op);
+    case BuiltinOperator_SQUEEZE:
+    case BuiltinOperator_SOFTMAX:
+      return AsymmetricQuantizeSingleInputOutputOp(op_code, op);
+    default:
+      return kTfLiteError;
+  }
+
+  return kTfLiteError;
+}
+
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.h b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d6ca7fad594f4831847c2b2f9de5d6bc0be5e6d
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_SUBGRAPH_QUANTIZER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_SUBGRAPH_QUANTIZER_H_
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+
+// Quantizes a given subgraph, the subgraph needs to min/max information
+// present.
+//
+// Assumes that some ops like Conv and Depthwise conv are quantized by
+// per channel symmetric quantization.
+class SubgraphQuantizer {
+ public:
+  SubgraphQuantizer(ModelT* model, SubGraphT* subgraph,
+                    ErrorReporter* error_reporter)
+      : model_(model), subgraph_(subgraph), error_reporter_(error_reporter) {}
+
+  // Quantize operator at the given index.
+  TfLiteStatus QuantizeOperator(int op_idx);
+
+ private:
+  // Quantizes ops with bias tensors.
+  TfLiteStatus QuantizeOpWithBias(BuiltinOperator op_code, OperatorT* op);
+
+  // Average and Max pool need special treatement. The scales are propagated
+  // from inputs to outputs.
+  TfLiteStatus PropagateMinMaxForAvgAndMaxPool(BuiltinOperator op_code,
+                                               OperatorT* op);
+
+  // Asymmetric quantizes inputs and outputs of an Op that has single input and
+  // single output. E.g. Squeeze.
+  TfLiteStatus AsymmetricQuantizeSingleInputOutputOp(BuiltinOperator op_code,
+                                                     OperatorT* op);
+
+  TfLiteStatus AsymmetricQuantizeTensor(BuiltinOperator op_code,
+                                        int32_t tensor_idx);
+
+  // Returns true if |tensor_idx| is one of the inputs in the subgraph.
+  bool IsSubgraphInput(int32_t tensor_idx) const;
+
+  ModelT* model_;
+  SubGraphT* subgraph_;
+  ErrorReporter* error_reporter_;
+};
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_SUBGRAPH_QUANTIZER_H_
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b23ced0fb9cc77309c5d25c01c8b53c08edaafe
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
@@ -0,0 +1,395 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadModel(const char* model) {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadConvModel1() {
+  return ReadModel(kConvModelWithMinus128Plus127Weights);
+}
+
+std::unique_ptr<FlatBufferModel> ReadConvModel2() {
+  return ReadModel(kConvModelWith0Plus10Weights);
+}
+
+std::unique_ptr<FlatBufferModel> ReadSoftmaxModel() {
+  return ReadModel(kSingleSoftmaxModelMinMinus5MaxPlus5);
+}
+
+std::unique_ptr<FlatBufferModel> ReadAvgPoolModel() {
+  return ReadModel(kSingleAvgPoolModelMinMinus5MaxPlus5);
+}
+
+TEST(SubgraphQuantizerTest, VerifyConvQuantizationWithUnitScale) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadConvModel1();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto conv_op = subgraph->operators[0].get();
+  const int input_tensor_idx = 0;
+  const int weights_tensor_idx = 1;
+  const int bias_tensor_index = 2;
+  const int output_tensor_idx = 0;
+  const auto bias_tensor =
+      subgraph->tensors[conv_op->inputs[bias_tensor_index]].get();
+  const auto input_tensor =
+      subgraph->tensors[conv_op->inputs[input_tensor_idx]].get();
+  const auto weights_tensor =
+      subgraph->tensors[conv_op->inputs[weights_tensor_idx]].get();
+  const auto output_tensor =
+      subgraph->tensors[conv_op->outputs[output_tensor_idx]].get();
+
+  EXPECT_EQ(bias_tensor->type, TensorType_INT32);
+  EXPECT_EQ(input_tensor->type, TensorType_INT8);
+  EXPECT_EQ(weights_tensor->type, TensorType_INT8);
+
+  ASSERT_TRUE(weights_tensor->quantization);
+  const int out_channel_size = weights_tensor->shape[0];
+  ASSERT_TRUE(bias_tensor->quantization);
+  ASSERT_TRUE(weights_tensor->quantization);
+  const std::vector<float>& bias_scales = bias_tensor->quantization->scale;
+  const std::vector<float>& weights_scales =
+      weights_tensor->quantization->scale;
+
+  const std::vector<int64_t>& weights_zero_points =
+      weights_tensor->quantization->zero_point;
+
+  ASSERT_EQ(bias_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_zero_points.size(), out_channel_size);
+  ASSERT_EQ(input_tensor->quantization->scale.size(), 1);
+  ASSERT_EQ(output_tensor->quantization->scale.size(), 1);
+
+
+  for (size_t i = 0; i < out_channel_size; i++) {
+    EXPECT_EQ(weights_scales[i], 1);
+    EXPECT_EQ(bias_scales[i], 1);
+    EXPECT_EQ(weights_zero_points[i], 0);
+  }
+
+  EXPECT_EQ(input_tensor->quantization->scale[0], 1);
+  EXPECT_EQ(output_tensor->quantization->scale[0], 1);
+
+  const auto bias_buffer = model.buffers[bias_tensor->buffer].get();
+  ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]);
+  const int32_t* bias_values =
+      reinterpret_cast<int32_t*>(bias_buffer->data.data());
+  const auto original_bias_buffer =
+      readonly_model->buffers()->Get(bias_tensor->buffer);
+  const float* bias_float_buffer =
+      reinterpret_cast<const float*>(original_bias_buffer->data()->data());
+
+  const float eps = 1e-7;
+  for (size_t i = 0; i < bias_tensor->shape[0]; i++) {
+    const float bias_scale =
+        input_tensor->quantization->scale[0] * weights_scales[i];
+    auto dequantized_value = bias_values[i] * bias_scale;
+    EXPECT_NEAR(dequantized_value, bias_float_buffer[i], eps);
+  }
+
+  const auto weights_buffer = model.buffers[weights_tensor->buffer].get();
+  const auto original_weights_buffer =
+      readonly_model->buffers()->Get(weights_tensor->buffer);
+  const int8_t* weight_values =
+      reinterpret_cast<int8_t*>(weights_buffer->data.data());
+  const float* weights_float_buffer =
+      reinterpret_cast<const float*>(original_weights_buffer->data()->data());
+  ASSERT_EQ(sizeof(float) * weights_buffer->data.size(),
+            original_weights_buffer->data()->size());
+  int num_values_in_channel = weights_buffer->data.size() / out_channel_size;
+  for (size_t channel_idx = 0; channel_idx < out_channel_size; channel_idx++) {
+    for (size_t j = 0; j < num_values_in_channel; j++) {
+      size_t element_idx = channel_idx * out_channel_size + j;
+      auto dequantized_value =
+          weight_values[element_idx] * weights_scales[channel_idx];
+      EXPECT_NEAR(dequantized_value, weights_float_buffer[element_idx], eps);
+    }
+  }
+}
+
+TEST(SubgraphQuantizerTest, VerifyConvQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadConvModel2();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto conv_op = subgraph->operators[0].get();
+  const int input_tensor_idx = 0;
+  const int weights_tensor_idx = 1;
+  const int bias_tensor_index = 2;
+  const int output_tensor_idx = 0;
+  const auto bias_tensor =
+      subgraph->tensors[conv_op->inputs[bias_tensor_index]].get();
+  const auto input_tensor =
+      subgraph->tensors[conv_op->inputs[input_tensor_idx]].get();
+  const auto weights_tensor =
+      subgraph->tensors[conv_op->inputs[weights_tensor_idx]].get();
+  const auto output_tensor =
+      subgraph->tensors[conv_op->outputs[output_tensor_idx]].get();
+
+  EXPECT_EQ(bias_tensor->type, TensorType_INT32);
+  EXPECT_EQ(input_tensor->type, TensorType_INT8);
+  EXPECT_EQ(weights_tensor->type, TensorType_INT8);
+
+  ASSERT_TRUE(weights_tensor->quantization);
+  const int out_channel_size = weights_tensor->shape[0];
+  ASSERT_TRUE(bias_tensor->quantization);
+  ASSERT_TRUE(weights_tensor->quantization);
+  const std::vector<float>& bias_scales = bias_tensor->quantization->scale;
+  const std::vector<float>& weights_scales =
+      weights_tensor->quantization->scale;
+  const std::vector<int64_t>& weights_zero_points =
+      weights_tensor->quantization->zero_point;
+
+  ASSERT_EQ(bias_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_zero_points.size(), out_channel_size);
+  ASSERT_EQ(input_tensor->quantization->scale.size(), 1);
+  ASSERT_EQ(output_tensor->quantization->scale.size(), 1);
+
+  const float eps = 1e-7;
+
+  // Bias scale should be input * per_channel_weight_scale.
+  for (size_t i = 0; i < out_channel_size; i++) {
+    EXPECT_NEAR(bias_scales[i],
+                input_tensor->quantization->scale[0] * weights_scales[i], eps);
+  }
+
+  const auto bias_buffer = model.buffers[bias_tensor->buffer].get();
+  ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]);
+  const int32_t* bias_values =
+      reinterpret_cast<int32_t*>(bias_buffer->data.data());
+  const auto original_bias_buffer =
+      readonly_model->buffers()->Get(bias_tensor->buffer);
+  const float* bias_float_buffer =
+      reinterpret_cast<const float*>(original_bias_buffer->data()->data());
+
+  for (size_t i = 0; i < out_channel_size; i++) {
+    auto dequantized_value = bias_values[i] * bias_scales[i];
+    EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2);
+  }
+
+  const auto weights_buffer = model.buffers[weights_tensor->buffer].get();
+  const auto original_weights_buffer =
+      readonly_model->buffers()->Get(weights_tensor->buffer);
+  const int8_t* weight_values =
+      reinterpret_cast<int8_t*>(weights_buffer->data.data());
+  const float* weights_float_buffer =
+      reinterpret_cast<const float*>(original_weights_buffer->data()->data());
+  ASSERT_EQ(sizeof(float) * weights_buffer->data.size(),
+            original_weights_buffer->data()->size());
+  int num_values_in_channel = weights_buffer->data.size() / out_channel_size;
+  for (size_t channel_idx = 0; channel_idx < out_channel_size; channel_idx++) {
+    for (size_t j = 0; j < num_values_in_channel; j++) {
+      size_t element_idx = channel_idx * out_channel_size + j;
+      auto scale = weights_scales[channel_idx];
+      auto zero_point = weights_zero_points[channel_idx];
+      auto dequantized_value = weight_values[element_idx] * scale;
+      EXPECT_NEAR(dequantized_value, weights_float_buffer[element_idx],
+                  scale / 2);
+      EXPECT_EQ(zero_point, 0);
+    }
+  }
+}
+
+void VerifyAsymmetricQuantizationScale(
+    const QuantizationParameters& float_quant_params,
+    const QuantizationParametersT& quantized_quant_params) {
+  const float eps = 1e-7;
+  ASSERT_EQ(float_quant_params.min()->size(), 1);
+  ASSERT_EQ(float_quant_params.max()->size(), 1);
+  float float_min = std::min(0.f, float_quant_params.min()->Get(0));
+  float float_max = std::max(0.f, float_quant_params.max()->Get(0));
+
+  ASSERT_EQ(quantized_quant_params.scale.size(), 1);
+  ASSERT_EQ(quantized_quant_params.zero_point.size(), 1);
+
+  float scale = (float_max - float_min) / 255;
+  EXPECT_NEAR(scale, quantized_quant_params.scale[0], eps);
+}
+
+TEST(SubgraphQuantizerTest, VerifySoftmaxQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadSoftmaxModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto op = subgraph->operators[0].get();
+  // Model has a single softmax op.
+  ASSERT_EQ(op->opcode_index, 0);
+  ASSERT_EQ(model.operator_codes[0].get()->builtin_code,
+            BuiltinOperator_SOFTMAX);
+
+  ASSERT_EQ(op->inputs.size(), 1);
+  ASSERT_EQ(op->outputs.size(), 1);
+  auto float_graph = readonly_model->subgraphs()->Get(0);
+
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+
+  EXPECT_EQ(subgraph->tensors[op->inputs[0]].get()->type, TensorType_INT8);
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  auto float_input_quant_params =
+      float_graph->tensors()->Get(op->inputs[0])->quantization();
+  auto input_quant_params =
+      subgraph->tensors[op->inputs[0]]->quantization.get();
+  VerifyAsymmetricQuantizationScale(*float_input_quant_params,
+                                    *input_quant_params);
+
+  auto float_output_quant_params =
+      float_graph->tensors()->Get(op->outputs[0])->quantization();
+  auto output_quant_params =
+      subgraph->tensors[op->outputs[0]]->quantization.get();
+  VerifyAsymmetricQuantizationScale(*float_output_quant_params,
+                                    *output_quant_params);
+}
+
+TEST(SubgraphQuantizerTest, VerifyAvgPoolQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadAvgPoolModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto op = subgraph->operators[0].get();
+  // Model has a single AveragePool op.
+  ASSERT_EQ(op->opcode_index, 0);
+  ASSERT_EQ(model.operator_codes[0].get()->builtin_code,
+            BuiltinOperator_AVERAGE_POOL_2D);
+
+  ASSERT_EQ(op->inputs.size(), 1);
+  ASSERT_EQ(op->outputs.size(), 1);
+
+  auto float_graph = readonly_model->subgraphs()->Get(0);
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+
+  EXPECT_EQ(subgraph->tensors[op->inputs[0]].get()->type, TensorType_INT8);
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  auto float_input_quant_params =
+      float_graph->tensors()->Get(op->inputs[0])->quantization();
+  auto input_quant_params =
+      subgraph->tensors[op->inputs[0]]->quantization.get();
+  VerifyAsymmetricQuantizationScale(*float_input_quant_params,
+                                    *input_quant_params);
+
+  auto float_output_quant_params =
+      float_graph->tensors()->Get(op->outputs[0])->quantization();
+  auto output_quant_params =
+      subgraph->tensors[op->outputs[0]]->quantization.get();
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+  ASSERT_EQ(output_quant_params->min.size(), 1);
+  ASSERT_EQ(output_quant_params->max.size(), 1);
+
+  // Make sure the input min/maxes are propagated to outputs.
+  EXPECT_EQ(input_quant_params->min[0], output_quant_params->min[0]);
+  EXPECT_EQ(input_quant_params->max[0], output_quant_params->max[0]);
+  EXPECT_EQ(input_quant_params->scale[0], output_quant_params->scale[0]);
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..190242402b37c74f123a1f24bc2980ce70da7ae7
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/test_util.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+const char* kConvModelWithMinus128Plus127Weights =
+    "single_conv_weights_min_minus_127_max_plus_127.bin";
+
+const char* kConvModelWith0Plus10Weights =
+    "single_conv_weights_min_0_max_plus_10.bin";
+
+const char* kSingleSoftmaxModelMinMinus5MaxPlus5 =
+    "single_softmax_min_minus_5_max_plus_5.bin";
+
+const char* kSingleAvgPoolModelMinMinus5MaxPlus5 =
+    "single_avg_pool_min_minus_5_max_plus_5.bin";
+
+const char* kModelWithSharedWeights = "weight_shared_between_convs.bin";
+
+int FailOnErrorReporter::Report(const char* format, va_list args) {
+  char buf[1024];
+  vsnprintf(buf, sizeof(buf), format, args);
+  EXPECT_TRUE(false) << "Error happened: " << buf;
+  return 0;
+}
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..21f8b3ceb0ba48abc3c95810ee1b12a8c2b00b0c
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/test_util.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_TEST_UTIL_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_TEST_UTIL_H_
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+// Test model with a single convolution.
+// Floating point weights of the model are all integers and lie in
+// range[-127, 127]. The weights have been put in such a way that each
+// channel has at least one weight as -127 and one weight as 127.
+// The activations are all in range: [-128, 127]
+// This means all bias computations should result in 1.0 scale.
+extern const char* kConvModelWithMinus128Plus127Weights;
+
+// Test model with single convolution where all weights are integers between
+// [0, 10] weights are randomly distributed. It is not guaranteed that min max
+// for weights are going to appear in each channel.
+// Activations have min = 0, max = 10.
+extern const char* kConvModelWith0Plus10Weights;
+
+// A floating point model with a single softmax. The input tensor has min
+// and max in range [-5, 5], not necessarily -5 or +5.
+extern const char* kSingleSoftmaxModelMinMinus5MaxPlus5;
+
+// A floating point model with a single average pool. The input tensor has min
+// and max in range [-5, 5], not necessarily -5 or +5.
+extern const char* kSingleAvgPoolModelMinMinus5MaxPlus5;
+
+// Test model with a weights variable that is shared between a convolution layer
+// and an add operation.
+extern const char* kModelWithSharedWeights;
+
+// An error reporter that fails on testing.
+class FailOnErrorReporter : public ErrorReporter {
+ public:
+  int Report(const char* format, va_list args) override;
+};
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_TEST_UTIL_H_
diff --git a/tensorflow/lite/tools/optimize/testdata/README.md b/tensorflow/lite/tools/optimize/testdata/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..21fcd32b1ee85a6a821e60c4336ecd7a32e677a0
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/testdata/README.md
@@ -0,0 +1,25 @@
+# Test models for testing quantization
+
+This directory contains test models for testing quantization.
+
+## Models
+
+* `single_conv_weights_min_0_max_plus_10.bin` \
+   A floating point model with single convolution where all weights are
+   integers between [0, 10] weights are randomly distributed. It is not
+   guaranteed that min max for weights are going to appear in each channel.
+   All activations have min maxes and activations are in range [0,10].
+* `single_conv_weights_min_minus_127_max_plus_127.bin` \
+   A floating point model with a single convolution where weights of the model
+   are all integers that lie in range[-127, 127]. The weights have been put in
+   such a way that each channel has at least one weight as -127 and one weight
+   as 127. The activations are all in range: [-128, 127].
+   This means all bias computations should result in 1.0 scale.
+* `single_softmax_min_minus_5_max_5.bin` \
+   A floating point model with a single softmax. The input tensor has min
+   and max in range [-5, 5], not necessarily -5 or +5.
+* `single_avg_pool_input_min_minus_5_max_5.bin` \
+   A floating point model with a single average pool. The input tensor has min
+   and max in range [-5, 5], not necessarily -5 or +5.
+* `weight_shared_between_convs.tflite` \
+   A floating point model with two convs that have a use the same weight tensor.
diff --git a/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin b/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a65f39ee29514b27ea3af861c10dd452ab9e5ce2
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..70cbc0620ad7222817cf241030acb98387083154
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29b9f47097d466b65831514cec3a00f19f5cbdf3
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin b/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3caba63492e174229ef605bfbb0d2ddeda2ba61d
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin b/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4c903c82eec32df8aa0d3462262b61daa30fc251
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin differ
diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
index 64d62ee1f2d5d0cc1fa1d1804c637f8220937128..c5141c17537f355fc80b37b3a7e2ed3b2c0a2dfd 100644
--- a/tensorflow/lite/tools/pip_package/setup.py
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -57,12 +57,22 @@ RELATIVE_MAKEFILE_PATH = os.path.join(RELATIVE_MAKE_DIR, 'Makefile')
 DOWNLOAD_SCRIPT_PATH = os.path.join(MAKE_DIR, 'download_dependencies.sh')
 
 
+# Check physical memory and if we are on a reasonable non small SOC machine
+# with more than 4GB, use all the CPUs, otherwisxe only 1.
+def get_build_cpus():
+  physical_bytes = os.sysconf('SC_PAGESIZE') * os.sysconf('SC_PHYS_PAGES')
+  if physical_bytes < (1<<30) * 4:
+    return 1
+  else:
+    return multiprocessing.cpu_count()
+
+
 def make_args(target='', quiet=True):
   """Construct make command line."""
   args = (['make', 'SHELL=/bin/bash', '-C', TENSORFLOW_DIR]
           + MAKE_CROSS_OPTIONS +
           ['-f', RELATIVE_MAKEFILE_PATH, '-j',
-           str(multiprocessing.cpu_count())])
+           str(get_build_cpus())])
   if quiet:
     args.append('--quiet')
   if target:
@@ -136,7 +146,7 @@ setup(
     long_description='\n'.join(DOCLINES[2:]),
     url='https://www.tensorflow.org/lite/',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     license='Apache 2.0',
     include_package_data=True,
     keywords='tflite tensorflow tensor machine learning',
diff --git a/tensorflow/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc
index 02d6e6b23cdd66c9dd87700e4be6bb2cfbee407f..8631bef67b28e290357913a72614a86e3129f65a 100644
--- a/tensorflow/lite/tools/verifier.cc
+++ b/tensorflow/lite/tools/verifier.cc
@@ -105,6 +105,10 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                                ErrorReporter* error_reporter) {
   uint64_t bytes_required = 1;
+  if (!tensor.shape()) {
+    ReportError(error_reporter, "Tensor shape is empty");
+    return false;
+  }
   for (int dim : *tensor.shape()) {
     bytes_required *= dim;
     if (bytes_required > UINT_MAX) {
diff --git a/tensorflow/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc
index 98abafad927ae45cd7de428d0011e234f345dd6e..083f00d445ae0274274b0b83641a6d7c6a718577 100644
--- a/tensorflow/lite/tools/verifier_test.cc
+++ b/tensorflow/lite/tools/verifier_test.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/error_reporter.h"
 #include "tensorflow/lite/op_resolver.h"
@@ -25,13 +27,29 @@ limitations under the License.
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/tools/verifier.h"
 #include "tensorflow/lite/version.h"
-#include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
 
 using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
 
+class MockErrorReporter : public ErrorReporter {
+ public:
+  MockErrorReporter() : buffer_size_(0) {}
+  int Report(const char* format, va_list args) override {
+    buffer_size_ = vsnprintf(buffer_, kBufferSize, format, args);
+    return buffer_size_;
+  }
+  int GetBufferSize() { return buffer_size_; }
+
+  string GetAsString() const { return string(buffer_, buffer_size_); }
+
+ private:
+  static constexpr int kBufferSize = 256;
+  char buffer_[kBufferSize];
+  int buffer_size_;
+};
+
 // Build single subgraph model.
 class TfLiteFlatbufferModelBuilder {
  public:
@@ -60,6 +78,12 @@ class TfLiteFlatbufferModelBuilder {
       buffer_index = buffers_.size();
       buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
     }
+    if (shape.empty()) {
+      tensors_.push_back(CreateTensorDirect(builder_, /*shape=*/nullptr, type,
+                                            buffer_index, name,
+                                            /*quantization=*/0));
+      return;
+    }
     tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
                                           name, /*quantization=*/0));
   }
@@ -92,13 +116,16 @@ class TfLiteFlatbufferModelBuilder {
 
   bool Verify() {
     return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
-                          resolver_, DefaultErrorReporter());
+                          resolver_, &mock_reporter_);
   }
 
+  string GetErrorString() { return mock_reporter_.GetAsString(); }
+
  private:
   FlatBufferBuilder builder_;
   MutableOpResolver resolver_;
   TfLiteRegistration fake_op_;
+  MockErrorReporter mock_reporter_;
   std::vector<Offset<Operator>> operators_;
   std::vector<Offset<OperatorCode>> operator_codes_;
   std::vector<Offset<Tensor>> tensors_;
@@ -112,8 +139,27 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Missing 'subgraphs' section."));
+}
+
+TEST(VerifyModel, TestEmptyShape) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor({}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "inputtwo");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor shape is empty"));
 }
 
 TEST(VerifyModel, TestSimpleModel) {
@@ -127,12 +173,16 @@ TEST(VerifyModel, TestSimpleModel) {
   builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
   builder.FinishModel({0, 1}, {2});
   ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 TEST(VerifyModel, TestCorruptedData) {
   std::string model = "123";
-  ASSERT_FALSE(Verify(model.data(), model.size(), MutableOpResolver{},
-                      /*error_reporter=*/nullptr));
+  MockErrorReporter mock_reporter;
+  ASSERT_FALSE(
+      Verify(model.data(), model.size(), MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Invalid flatbuffer format"));
 }
 
 TEST(VerifyModel, TestUnsupportedVersion) {
@@ -140,8 +190,11 @@ TEST(VerifyModel, TestUnsupportedVersion) {
   auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Invalid model version 1"));
 }
 
 TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
@@ -166,6 +219,10 @@ TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
   builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex(
+          "Tensor requires 6 bytes, but is allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
@@ -173,6 +230,10 @@ TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
   builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex(
+          "Tensor requires 2 bytes, but is allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeOverflow) {
@@ -181,6 +242,8 @@ TEST(VerifyModel, TestIntTensorShapeOverflow) {
                     "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor dimension overflow"));
 }
 
 TEST(VerifyModel, TensorBufferIsNotValid) {
@@ -203,8 +266,12 @@ TEST(VerifyModel, TensorBufferIsNotValid) {
                            builder.CreateString("SmartReply"), buffers);
 
   ::tflite::FinishModelBuffer(builder, model);
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(
+      mock_reporter.GetAsString(),
+      ::testing::ContainsRegex("Missing 'operators' section in subgraph."));
 }
 
 TEST(VerifyModel, StringTensorHasInvalidNumString) {
@@ -215,6 +282,10 @@ TEST(VerifyModel, StringTensorHasInvalidNumString) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor buffer requires at least -2147483640 bytes, "
+                  "but is allocated with 18 bytes"));
 }
 
 TEST(VerifyModel, StringTensorOffsetTooSmall) {
@@ -224,6 +295,9 @@ TEST(VerifyModel, StringTensorOffsetTooSmall) {
       {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor buffer initial offset must be: 16"));
 }
 
 TEST(VerifyModel, StringTensorOffsetOutOfRange) {
@@ -233,6 +307,9 @@ TEST(VerifyModel, StringTensorOffsetOutOfRange) {
       {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex("String tensor buffer is invalid: index 2"));
 }
 
 TEST(VerifyModel, StringTensorIsLargerThanRequired) {
@@ -243,37 +320,47 @@ TEST(VerifyModel, StringTensorIsLargerThanRequired) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex("String tensor buffer last offset must be 19"));
 }
 
 TEST(VerifyModel, AllOpsAreSupported) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
   builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
+  ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 TEST(VerifyModel, UseUnsupportedBuiltinOps) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_SUB}, {"CustomOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex("Unsupported builtin op: ADD, version: 1"));
 }
 
 TEST(VerifyModel, UseUnsupportedCustomOps) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"NewOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "Not supported");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "Unsupported custom op: Not supported, version: 1"));
 }
 
 // TODO(yichengfan): make up malicious files to test with.
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 88800c295124cbb7e1f292c6970b81e3b0594ab3..dc98702c087e2fa2d22e7bb296a1a43158460e05 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -37,6 +37,7 @@ tensorflow/third_party/toolchains/clang6/README.md
 tensorflow/third_party/toolchains/clang6/repo.bzl
 tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl
 tensorflow/third_party/toolchains/clang6/clang.BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
@@ -47,12 +48,14 @@ tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUI
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
-tensorflow/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
 tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl
 tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
 tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
 tensorflow/third_party/toolchains/preconfig/generate/archives.bzl
 tensorflow/third_party/toolchains/preconfig/generate/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
 tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
@@ -68,21 +71,17 @@ tensorflow/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
 tensorflow/third_party/toolchains/cpus/arm/BUILD
 tensorflow/third_party/toolchains/cpus/py3/BUILD
 tensorflow/third_party/toolchains/cpus/py/BUILD
+tensorflow/third_party/toolchains/remote/configure.bzl
+tensorflow/third_party/toolchains/remote/BUILD.tpl
+tensorflow/third_party/toolchains/remote/BUILD
+tensorflow/third_party/toolchains/remote/execution.bzl.tpl
 tensorflow/third_party/toolchains/BUILD
-tensorflow/third_party/nccl/remote.BUILD.tpl
-tensorflow/third_party/nccl/archive.BUILD
-tensorflow/third_party/nccl/LICENSE
-tensorflow/third_party/nccl/system.BUILD.tpl
-tensorflow/third_party/nccl/nccl_configure.bzl
-tensorflow/third_party/nccl/build_defs.bzl.tpl
-tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/gpus/BUILD
 tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
 tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
 tensorflow/third_party/gpus/crosstool/CROSSTOOL.tpl
 tensorflow/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
 tensorflow/third_party/gpus/crosstool/LICENSE
-tensorflow/third_party/gpus/crosstool/remote.BUILD.tpl
 tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
 tensorflow/third_party/gpus/crosstool/BUILD.tpl
 tensorflow/third_party/gpus/crosstool/BUILD
@@ -90,7 +89,6 @@ tensorflow/third_party/gpus/cuda/LICENSE
 tensorflow/third_party/gpus/cuda/BUILD.tpl
 tensorflow/third_party/gpus/cuda/BUILD.windows.tpl
 tensorflow/third_party/gpus/cuda/cuda_config.h.tpl
-tensorflow/third_party/gpus/cuda/remote.BUILD.tpl
 tensorflow/third_party/gpus/cuda/BUILD
 tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl
 tensorflow/third_party/gpus/rocm/rocm_config.h.tpl
@@ -123,6 +121,7 @@ tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
 tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
 tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
 tensorflow/third_party/eigen3/LICENSE
+tensorflow/third_party/eigen3/gebp_neon.patch
 tensorflow/third_party/eigen3/BUILD
 tensorflow/third_party/systemlibs/build_defs.bzl.tpl
 tensorflow/third_party/systemlibs/absl_py.BUILD
@@ -175,6 +174,12 @@ tensorflow/third_party/llvm/expand_cmake_vars.py
 tensorflow/third_party/llvm/llvm.autogenerated.BUILD
 tensorflow/third_party/llvm/llvm.bzl
 tensorflow/third_party/icu/udata.patch
+tensorflow/third_party/nccl/archive.BUILD
+tensorflow/third_party/nccl/LICENSE
+tensorflow/third_party/nccl/system.BUILD.tpl
+tensorflow/third_party/nccl/nccl_configure.bzl
+tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/fft2d/BUILD
 tensorflow/third_party/fft2d/fft.h
 tensorflow/third_party/fft2d/LICENSE
@@ -187,7 +192,6 @@ tensorflow/third_party/tensorrt/BUILD
 tensorflow/third_party/tensorrt/build_defs.bzl.tpl
 tensorflow/third_party/tensorrt/BUILD.tpl
 tensorflow/third_party/tensorrt/tensorrt_configure.bzl
-tensorflow/third_party/tensorrt/remote.BUILD.tpl
 tensorflow/third_party/kafka/config.patch
 tensorflow/third_party/kafka/BUILD
 tensorflow/third_party/android/BUILD
@@ -210,10 +214,10 @@ tensorflow/third_party/git/BUILD.tpl
 tensorflow/third_party/git/BUILD
 tensorflow/third_party/git/git_configure.bzl
 tensorflow/third_party/protobuf/BUILD
+tensorflow/third_party/enum34.BUILD
 tensorflow/third_party/tflite_mobilenet.BUILD
 tensorflow/third_party/py/BUILD
 tensorflow/third_party/py/BUILD.tpl
-tensorflow/third_party/py/remote.BUILD.tpl
 tensorflow/third_party/py/numpy/BUILD
 tensorflow/third_party/py/python_configure.bzl
 tensorflow/third_party/termcolor.BUILD
@@ -239,7 +243,7 @@ tensorflow/third_party/tflite_ovic_testdata.BUILD
 tensorflow/third_party/libxsmm.BUILD
 tensorflow/third_party/zlib.BUILD
 tensorflow/third_party/eigen.BUILD
-tensorflow/stream_executor/BUILD
+tensorflow/stream_executor/build_defs.bzl
 tensorflow/api_template_v1.__init__.py
 tensorflow/compat_template_v1.__init__.py
 tensorflow/api_template.__init__.py
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c12e9ee3f3d844bec07a4e6f93669216614efe43..c89027582c0f74381ccb54e2a6b0212951840431 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -11,6 +11,7 @@ visibility = [
     "//tensorflow/lite/toco/python:__pkg__",
     "//tensorflow_models:__subpackages__",
     "//tensorflow_model_optimization:__subpackages__",
+    "//third_party/py/cleverhans:__subpackages__",
     # TODO(aselle): to pass open source test.
     "//bazel_pip/tensorflow/lite/toco/python:__pkg__",
 ]
@@ -23,6 +24,7 @@ exports_files(["LICENSE"])
 
 exports_files(["platform/base.i"])
 
+load("//tensorflow:tensorflow.bzl", "if_not_v2")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
@@ -69,8 +71,29 @@ py_library(
     ],
     deps = [
         ":no_contrib",
-        "//tensorflow/contrib:contrib_py",
         "//tensorflow/python/estimator:estimator_py",
+    ] + if_not_v2(["//tensorflow/contrib:contrib_py"]),
+)
+
+py_library(
+    name = "keras_lib",
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow:__pkg__",
+        "//tensorflow:internal",
+        "//tensorflow/python/estimator:__subpackages__",
+        "//tensorflow/python/keras:__subpackages__",
+        "//tensorflow/python/tools:__pkg__",
+        "//tensorflow/python/tools/api/generator:__pkg__",
+        "//tensorflow/tools/api/tests:__pkg__",
+        "//tensorflow/tools/compatibility/update:__pkg__",
+        "//tensorflow_estimator:__subpackages__",
+    ],
+    deps = [
+        ":rnn",
+        "//tensorflow/python:layers",
+        "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/keras",
     ],
 )
 
@@ -89,11 +112,13 @@ py_library(
     ],
     deps = [
         ":array_ops",
+        ":audio_ops_gen",
         ":bitwise_ops",
         ":boosted_trees_ops",
         ":check_ops",
         ":client",
         ":client_testlib",
+        ":clustering_ops",
         ":collective_ops",
         ":cond_v2",
         ":confusion_matrix",
@@ -110,8 +135,8 @@ py_library(
         ":image_ops",
         ":initializers_ns",
         ":io_ops",
+        ":keras_lib",
         ":kernels",
-        ":layers",
         ":lib",
         ":list_ops",
         ":manip_ops",
@@ -147,13 +172,14 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
+        "//tensorflow/python/compat:v2_compat",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:profiler",
         "//tensorflow/python/eager:remote",
-        "//tensorflow/python/feature_column:feature_column_py",
-        "//tensorflow/python/keras",
+        "//tensorflow/python/module",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
         "//tensorflow/python/ops/losses",
@@ -188,6 +214,7 @@ py_library(
         ":pywrap_tensorflow",
         ":util",
         "//tensorflow/core:protos_all_py",
+        "@absl_py//absl:app",
         "@absl_py//absl/flags",
         "@six_archive//:six",
     ],
@@ -208,7 +235,10 @@ py_library(
     name = "platform_test",
     srcs = ["platform/googletest.py"],
     srcs_version = "PY2AND3",
-    deps = [":platform_benchmark"],
+    deps = [
+        ":platform_benchmark",
+        "@absl_py//absl/testing:absltest",
+    ],
 )
 
 tf_py_test(
@@ -474,19 +504,11 @@ tf_cc_shared_object(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "file_system_test",
     size = "small",
     srcs = ["framework/file_system_test.py"],
-    data = [":framework/test_file_system.so"],
-    main = "framework/file_system_test.py",
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",  # Path issues due to test environment
-        "no_windows",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":data_flow_ops",
         ":framework",
@@ -495,57 +517,59 @@ py_test(
         ":platform",
         ":util",
     ],
+    data = [":framework/test_file_system.so"],
+    main = "framework/file_system_test.py",
+    tags = [
+        "no_pip",  # Path issues due to test environment
+        "no_windows",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "decorator_utils_test",
     srcs = ["util/decorator_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_export_test",
     srcs = ["util/tf_export_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "deprecation_test",
     srcs = ["util/deprecation_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "dispatch_test",
     srcs = ["util/dispatch_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "keyword_args_test",
     srcs = ["util/keyword_args_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -652,6 +676,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":common_shapes",
+        ":composite_tensor",
         ":cpp_shape_inference_proto_py",
         ":errors",
         ":framework_fast_tensor_util",
@@ -735,6 +760,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":c_api_util",
+        ":error_interpolation",
         ":util",
     ],
 )
@@ -791,13 +817,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "function_def_to_graph_test",
     size = "small",
     srcs = ["framework/function_def_to_graph_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":constant_op",
@@ -809,6 +833,7 @@ py_test(
         ":math_ops",
         ":test_ops",
     ],
+    tags = ["no_pip"],
 )
 
 py_library(
@@ -919,12 +944,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "auto_control_deps_test",
     size = "small",
     srcs = ["framework/auto_control_deps_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":auto_control_deps",
         ":client_testlib",
     ],
@@ -959,12 +983,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "smart_cond_test",
     size = "small",
     srcs = ["framework/smart_cond_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":constant_op",
         ":framework_ops",
@@ -978,6 +1001,18 @@ py_library(
     name = "sparse_tensor",
     srcs = ["framework/sparse_tensor.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":composite_tensor",
+        ":dtypes",
+        ":framework_ops",
+        ":tensor_util",
+    ],
+)
+
+py_library(
+    name = "composite_tensor",
+    srcs = ["framework/composite_tensor.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":dtypes",
         ":framework_ops",
@@ -985,6 +1020,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "framework_composite_tensor_test",
+    srcs = ["framework/composite_tensor_test.py"],
+    main = "framework/composite_tensor_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":composite_tensor",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":platform_test",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 # This target is maintained separately from :util to provide separate visibility
 # for legacy users who were granted visibility when the functions were private
 # members of ops.Graph.
@@ -1055,6 +1105,7 @@ py_library(
     name = "extra_py_tests_deps",
     srcs_version = "PY2AND3",
     deps = [
+        ":keras_lib",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1088,6 +1139,14 @@ py_library(
     ],
 )
 
+# Including this as a dependency will result in tests using
+# :framework_test_lib to use XLA.
+py_library(
+    name = "is_xla_test_true",
+    srcs = ["framework/is_xla_test_true.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "distributed_framework_test_lib",
     srcs_version = "PY2AND3",
@@ -1110,52 +1169,47 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_registry_test",
     size = "small",
     srcs = ["framework/registry_test.py"],
-    main = "framework/registry_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         "//tensorflow/python:client_testlib",
     ],
+    main = "framework/registry_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_errors_test",
     size = "small",
     srcs = ["framework/errors_test.py"],
-    main = "framework/errors_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/errors_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_error_interpolation_test",
     size = "small",
     srcs = ["framework/error_interpolation_test.py"],
-    main = "framework/error_interpolation_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":constant_op",
         ":error_interpolation",
         ":traceable_stack",
     ],
+    main = "framework/error_interpolation_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_subscribe_test",
     size = "small",
     srcs = ["framework/subscribe_test.py"],
-    main = "framework/subscribe_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1164,50 +1218,48 @@ py_test(
         ":script_ops",
         ":subscribe",
     ],
+    main = "framework/subscribe_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "contrib_test",
     size = "small",
     srcs = ["framework/contrib_test.py"],
-    main = "framework/contrib_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
     ],
+    main = "framework/contrib_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "build_info_test",
     size = "small",
     srcs = [
         "platform/build_info.py",
         "platform/build_info_test.py",
     ],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
     main = "platform/build_info_test.py",
-    srcs_version = "PY2AND3",
     tags = [
         "no_pip",
         "notap",
     ],
-    deps = [
-        ":client_testlib",
-        ":platform",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "proto_test",
     size = "small",
     srcs = ["framework/proto_test.py"],
-    main = "framework/proto_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    main = "framework/proto_test.py",
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1270,25 +1322,22 @@ cuda_py_tests(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_versions_test",
     size = "small",
     srcs = ["framework/versions_test.py"],
-    main = "framework/versions_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
     ],
+    main = "framework/versions_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_importer_test",
     size = "large",
     srcs = ["framework/importer_test.py"],
-    main = "framework/importer_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -1300,9 +1349,10 @@ py_test(
         ":random_ops",
         ":test_ops",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/importer_test.py",
 )
 
 filegroup(
@@ -1313,18 +1363,11 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_meta_graph_test",
     size = "small",
     srcs = ["framework/meta_graph_test.py"],
-    data = ["//tensorflow/python:meta_graph_testdata"],
-    main = "framework/meta_graph_test.py",
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":control_flow_ops",
@@ -1339,21 +1382,26 @@ py_test(
         ":training",
         ":variables",
     ],
+    data = ["//tensorflow/python:meta_graph_testdata"],
+    main = "framework/meta_graph_test.py",
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_traceable_stack_test",
     size = "small",
     srcs = ["framework/traceable_stack_test.py"],
-    main = "framework/traceable_stack_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_test_lib",
         ":platform_test",
         ":test_ops",
         ":traceable_stack",
         ":util",
     ],
+    main = "framework/traceable_stack_test.py",
 )
 
 tf_gen_op_wrapper_py(
@@ -1388,29 +1436,25 @@ cc_library(
     alwayslink = 1,
 )
 
-py_test(
+tf_py_test(
     name = "framework_common_shapes_test",
     size = "small",
     srcs = ["framework/common_shapes_test.py"],
-    main = "framework/common_shapes_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/common_shapes_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_ops_test",
     size = "small",
     srcs = ["framework/ops_test.py"],
-    main = "framework/ops_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # test_ops_2 is not available in pip.
-    deps = [
+    additional_deps = [
         ":cond_v2",
         ":control_flow_ops",
         ":errors",
@@ -1431,114 +1475,106 @@ py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
     ],
+    main = "framework/ops_test.py",
+    tags = ["no_pip"],  # test_ops_2 is not available in pip.
 )
 
-py_test(
+tf_py_test(
     name = "framework_ops_enable_eager_test",
     size = "small",
     srcs = ["framework/ops_enable_eager_test.py"],
-    main = "framework/ops_enable_eager_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":platform_test",
         "//tensorflow/python/eager:context",
     ],
+    main = "framework/ops_enable_eager_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_shape_test",
     size = "small",
     srcs = ["framework/tensor_shape_test.py"],
-    main = "framework/tensor_shape_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/tensor_shape_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_spec_test",
     size = "small",
     srcs = ["framework/tensor_spec_test.py"],
-    main = "framework/tensor_spec_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         ":tensor_spec",
         "//third_party/py/numpy",
     ],
+    main = "framework/tensor_spec_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_sparse_tensor_test",
     size = "small",
     srcs = ["framework/sparse_tensor_test.py"],
-    main = "framework/sparse_tensor_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/sparse_tensor_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_device_test",
     size = "small",
     srcs = ["framework/device_test.py"],
-    main = "framework/device_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/device_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_random_seed_test",
     size = "small",
     srcs = ["framework/random_seed_test.py"],
-    main = "framework/random_seed_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework",
     ],
+    main = "framework/random_seed_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_shape_div_test",
     size = "small",
     srcs = ["framework/tensor_shape_div_test.py"],
-    main = "framework/tensor_shape_div_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
-        "//tensorflow/core:protos_all_py",
         "@six_archive//:six",
+        "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/tensor_shape_div_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_util_test",
     size = "small",
     srcs = ["framework/tensor_util_test.py"],
-    main = "framework/tensor_util_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -1548,16 +1584,15 @@ py_test(
         ":state_ops_gen",
         "//third_party/py/numpy",
     ],
+    main = "framework/tensor_util_test.py",
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_test_util_test",
     size = "small",
     srcs = ["framework/test_util_test.py"],
-    main = "framework/test_util_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":control_flow_ops",
         ":errors",
         ":framework_for_generated_wrappers",
@@ -1568,35 +1603,35 @@ py_test(
         ":session",
         ":test_ops",
         ":variables",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    main = "framework/test_util_test.py",
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_dtypes_test",
     size = "small",
     srcs = ["framework/dtypes_test.py"],
-    main = "framework/dtypes_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
+        "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
         "//tensorflow/core:protos_all_py",
-        "//third_party/py/numpy",
     ],
+    main = "framework/dtypes_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "op_def_library_test",
     size = "small",
     srcs = ["framework/op_def_library_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
@@ -1604,18 +1639,17 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_kernels_test",
     size = "small",
     srcs = ["framework/kernels_test.py"],
-    main = "framework/kernels_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_test_lib",
         ":kernels",
         ":platform_test",
         ":test_ops",
     ],
+    main = "framework/kernels_test.py",
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1690,6 +1724,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "clustering_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:clustering_ops_op_lib",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "collective_ops_gen",
     visibility = ["//tensorflow:internal"],
@@ -1825,6 +1867,11 @@ tf_gen_op_wrapper_private_py(
     visibility = ["//learning/brain/python/ops:__pkg__"],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "stateful_random_ops_gen",
+    visibility = ["//learning/brain/python/ops:__pkg__"],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "resource_variable_ops_gen",
     visibility = [
@@ -2058,14 +2105,37 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "clip_ops_test",
     size = "small",
     srcs = ["ops/clip_ops_test.py"],
+    additional_deps = [
+        ":client_testlib",
+        ":clip_ops",
+        ":framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "clustering_ops",
+    srcs = ["ops/clustering_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":clustering_ops_gen",
+        ":framework",
+        ":ops",
+        ":training",
+    ],
+)
+
+tf_py_test(
+    name = "clustering_ops_test",
+    size = "medium",
+    srcs = ["ops/clustering_ops_test.py"],
+    additional_deps = [
         ":client_testlib",
-        ":clip_ops",
+        ":clustering_ops",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
@@ -2081,12 +2151,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "collective_ops_test",
     size = "small",
     srcs = ["ops/collective_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":collective_ops",
         ":framework_for_generated_wrappers",
@@ -2417,6 +2486,23 @@ py_library(
     ],
 )
 
+py_library(
+    name = "init_ops_v2",
+    srcs = ["ops/init_ops_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":dtypes",
+        ":linalg_ops_gen",
+        ":linalg_ops_impl",
+        ":math_ops",
+        ":random_ops",
+        ":util",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "initializers_ns",
     srcs = ["ops/initializers_ns.py"],
@@ -2745,6 +2831,32 @@ py_library(
     ],
 )
 
+py_library(
+    name = "stateful_random_ops",
+    srcs = ["ops/stateful_random_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":stateful_random_ops_gen",
+        ":variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "stateful_random_ops_test",
+    size = "medium",
+    srcs = ["ops/stateful_random_ops_test.py"],
+    additional_deps = [
+        ":stateful_random_ops",
+        ":client_testlib",
+        ":logging_ops",
+        ":random_ops_gen",
+    ],
+)
+
 py_library(
     name = "stateless_random_ops",
     srcs = ["ops/stateless_random_ops.py"],
@@ -2864,11 +2976,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sparse_ops_test",
     srcs = ["ops/sparse_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":constant_op",
         ":dtypes",
         ":framework_test_lib",
@@ -2891,11 +3002,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sort_ops_test",
     srcs = ["ops/sort_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -3025,6 +3135,7 @@ py_library(
         ":special_math_ops",
         ":state_grad",
         ":state_ops",
+        ":stateful_random_ops",
         ":stateless_random_ops",
         ":string_ops",
         ":template",
@@ -3309,7 +3420,6 @@ cuda_py_test(
         ":framework_test_lib",
         ":functional_ops",
         ":gradients",
-        ":layers",
         ":list_ops",
         ":math_grad",
         ":math_ops",
@@ -3393,6 +3503,21 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "init_ops_v2_test",
+    size = "medium",
+    srcs = ["ops/init_ops_v2_test.py"],
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":init_ops_v2",
+        ":random_ops",
+        ":framework_ops",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 cuda_py_test(
     name = "math_grad_test",
     size = "small",
@@ -3449,7 +3574,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "nn_fused_batchnorm_test",
-    size = "large",
+    size = "medium",
     srcs = ["ops/nn_fused_batchnorm_test.py"],
     additional_deps = [
         ":array_ops",
@@ -3551,7 +3676,7 @@ py_library(
         ":gradients",
         ":init_ops",
         ":io_ops",
-        ":layers_base",
+        ":layers_util",
         ":lookup_ops",
         ":math_ops",
         ":platform",
@@ -3572,19 +3697,17 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:distribute_coordinator_context",
+        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        # `layers` dependency only exists due to the use of a small utility.
-        "//tensorflow/python/keras:layers",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/training/checkpointable:base",
         "//tensorflow/python/training/checkpointable:util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -3682,24 +3805,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "evaluation_test",
     size = "small",
     srcs = ["training/evaluation_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "notap",  # Disabling until b/33000128 and b/33040312 are fixed.
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
-        ":layers",
         ":math_ops",
         ":metrics",
         ":platform",
@@ -3707,9 +3823,14 @@ py_test(
         ":summary",
         ":training",
         ":variables",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
+    ],
+    shard_count = 3,
+    tags = [
+        "manual",
+        "notap",  # Disabling until b/33000128 and b/33040312 are fixed.
     ],
 )
 
@@ -3757,76 +3878,68 @@ py_library(
 )
 
 # Placeholder for intenal nest_test comments.
-py_test(
+tf_py_test(
     name = "util_nest_test",
     size = "small",
     srcs = ["util/nest_test.py"],
-    main = "util/nest_test.py",
-    srcs_version = "PY2AND3",
-    visibility = visibility + [
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":util",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
     ],
+    main = "util/nest_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "util_serialization_test",
     size = "small",
     srcs = ["util/serialization_test.py"],
-    main = "util/serialization_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
+    main = "util/serialization_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "future_api_test",
     size = "small",
     srcs = ["util/future_api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":util",
         "//tensorflow:tensorflow_py",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "function_utils_test",
     srcs = ["util/function_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_contextlib_test",
     size = "small",
     srcs = ["util/tf_contextlib_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_decorator_test",
     size = "small",
     srcs = ["util/tf_decorator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -3844,23 +3957,21 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_should_use_test",
     size = "small",
     srcs = ["util/tf_should_use_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":tf_should_use",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_inspect_test",
     size = "small",
     srcs = ["util/tf_inspect_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -3878,17 +3989,16 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "lock_util_test",
     size = "small",
     srcs = ["util/lock_util_test.py"],
-    main = "util/lock_util_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
         "@absl_py//absl/testing:parameterized",
     ],
+    main = "util/lock_util_test.py",
 )
 
 tf_proto_library(
@@ -3917,28 +4027,25 @@ tf_proto_library(
     visibility = ["//tensorflow:internal"],
 )
 
-py_test(
+tf_py_test(
     name = "protobuf_compare_test",
     size = "small",
     srcs = ["util/protobuf/compare_test.py"],
-    main = "util/protobuf/compare_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # compare_test_pb2 proto is not available in pip.
-    deps = [
+    additional_deps = [
         ":compare_test_proto_py",
         ":platform_test",
         ":util",
         "@six_archive//:six",
     ],
+    main = "util/protobuf/compare_test.py",
+    tags = ["no_pip"],  # compare_test_pb2 proto is not available in pip.
 )
 
-py_test(
+tf_py_test(
     name = "util_example_parser_configuration_test",
     size = "small",
     srcs = ["util/example_parser_configuration_test.py"],
-    main = "util/example_parser_configuration_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
@@ -3946,14 +4053,14 @@ py_test(
         ":parsing_ops",
         ":util_example_parser_configuration",
     ],
+    main = "util/example_parser_configuration_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "events_writer_test",
     size = "small",
     srcs = ["client/events_writer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":errors",
         ":framework_test_lib",
         ":lib",
@@ -4119,6 +4226,7 @@ tf_py_wrap_cc(
         "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
         "//tensorflow/core/distributed_runtime/rpc:grpc_rpc_factory_registration",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
@@ -4568,24 +4676,22 @@ cuda_py_test(
     tags = ["no_windows_gpu"],
 )
 
-py_test(
+tf_py_test(
     name = "c_api_util_test",
     size = "small",
     srcs = ["framework/c_api_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":c_api_util",
         ":framework_test_lib",
         ":platform_test",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_util_test",
     size = "small",
     srcs = ["framework/graph_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework",
@@ -4598,37 +4704,34 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "bfloat16_test",
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":lib",
         ":pywrap_tensorflow",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "file_io_test",
     size = "small",
     srcs = ["lib/io/file_io_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         ":lib",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "tf_record_test",
     size = "small",
     srcs = ["lib/io/tf_record_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         ":lib",
@@ -4818,17 +4921,11 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saver_large_variable_test",
     size = "medium",
     srcs = ["training/saver_large_variable_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "noasan",  # http://b/30379628
-        "notsan",  # http://b/30379628
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":errors",
@@ -4837,18 +4934,18 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "manual",
+        "noasan",  # http://b/30379628
+        "notsan",  # http://b/30379628
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "saver_large_partitioned_variable_test",
     size = "medium",
     srcs = ["training/saver_large_partitioned_variable_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "noasan",  # http://b/30782289
-        "notsan",  # http://b/30782289
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4856,6 +4953,10 @@ py_test(
         ":training",
         ":variables",
     ],
+    tags = [
+        "noasan",  # http://b/30782289
+        "notsan",  # http://b/30782289
+    ],
 )
 
 cuda_py_test(
@@ -4901,16 +5002,11 @@ tf_py_test(
     tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "basic_session_run_hooks_test",
     size = "medium",
     srcs = ["training/basic_session_run_hooks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # intermittent races on a few percent of runs
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":control_flow_ops",
@@ -4927,21 +5023,17 @@ py_test(
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "no_windows",
+        "notsan",  # intermittent races on a few percent of runs
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "checkpoint_utils_test",
     size = "small",
     srcs = ["training/checkpoint_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_cuda_on_cpu_tap",
-        "no_oss",
-        "no_windows",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4955,14 +5047,20 @@ py_test(
         ":variable_scope",
         ":variables",
     ],
+    tags = [
+        "manual",
+        "no_cuda_on_cpu_tap",
+        "no_oss",
+        "no_windows",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "checkpoint_ops_test",
     size = "small",
     srcs = ["training/checkpoint_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":checkpoint_ops_gen",
         ":client",
         ":client_testlib",
@@ -4978,12 +5076,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "warm_starting_util_test",
     size = "medium",
     srcs = ["training/warm_starting_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":dtypes",
@@ -4992,21 +5089,15 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "monitored_session_test",
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67945581
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":checkpoint_management",
         ":client_testlib",
@@ -5025,6 +5116,10 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
+    tags = [
+        "no_pip",
+        "notsan",  # b/67945581
+    ],
 )
 
 py_library(
@@ -5046,12 +5141,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "training_util_test",
     size = "small",
     srcs = ["training/training_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework",
         ":platform",
@@ -5148,13 +5242,13 @@ py_library(
     srcs = [
         "layers/__init__.py",
         "layers/base.py",
-        "layers/utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
         ":control_flow_ops",
         ":framework_for_generated_wrappers",
+        ":layers_util",
         ":platform",
         ":smart_cond",
         ":tensor_util",
@@ -5167,6 +5261,20 @@ py_library(
     ],
 )
 
+py_library(
+    name = "layers_util",
+    srcs = [
+        "layers/utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":smart_cond",
+        ":util",
+        ":variables",
+    ],
+)
+
 py_library(
     name = "layers",
     srcs = [
@@ -5204,13 +5312,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "layers_base_test",
     size = "small",
     srcs = ["layers/base_test.py"],
-    main = "layers/base_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5223,15 +5329,14 @@ py_test(
         ":variable_scope",
         "//tensorflow/python/eager:context",
     ],
+    main = "layers/base_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_core_test",
     size = "small",
     srcs = ["layers/core_test.py"],
-    main = "layers/core_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5244,15 +5349,14 @@ py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    main = "layers/core_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_convolutional_test",
     size = "small",
     srcs = ["layers/convolutional_test.py"],
-    main = "layers/convolutional_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -5261,32 +5365,31 @@ py_test(
         ":nn_ops",
         ":random_ops",
     ],
+    main = "layers/convolutional_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_utils_test",
     size = "small",
     srcs = ["layers/utils_test.py"],
-    main = "layers/utils_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":layers",
     ],
+    main = "layers/utils_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_pooling_test",
     size = "small",
     srcs = ["layers/pooling_test.py"],
-    main = "layers/pooling_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_test_lib",
         ":layers",
         ":random_ops",
     ],
+    main = "layers/pooling_test.py",
 )
 
 cuda_py_test(
@@ -5311,51 +5414,48 @@ cuda_py_test(
 # -----------------------------------------------------------------------------
 # Quantization
 
-py_test(
+tf_py_test(
     name = "dequantize_op_test",
     size = "small",
     srcs = ["ops/dequantize_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "quantized_ops_test",
     size = "small",
     srcs = ["ops/quantized_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "quantized_conv_ops_test",
     size = "small",
     srcs = ["ops/quantized_conv_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":nn_ops",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
     name = "accumulate_n_benchmark",
-    size = "large",
+    size = "medium",
     srcs = ["ops/accumulate_n_benchmark.py"],
     additional_deps = [
         ":array_ops",
@@ -5370,6 +5470,7 @@ cuda_py_test(
         ":state_ops_gen",
     ],
     main = "ops/accumulate_n_benchmark.py",
+    shard_count = 6,
 )
 
 cuda_py_test(
@@ -5565,38 +5666,32 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "item_test",
     size = "small",
     srcs = [
         "grappler/item_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":tf_item",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "datasets_test",
     size = "small",
     srcs = [
         "grappler/datasets_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5604,6 +5699,10 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
 py_library(
@@ -5652,25 +5751,24 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_optimizer_test",
     size = "small",
     srcs = [
         "grappler/tf_optimizer_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":tf_item",
         ":tf_optimizer",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
     ],
 )
 
@@ -5687,32 +5785,28 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_placer_test",
     size = "large",
     srcs = ["grappler/graph_placer_test.py"],
-    tags = [
-        "grappler",
-        "no_pip",  # graph_placer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":graph_placer",
         "//tensorflow/python:math_ops",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # graph_placer is not available in pip.
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "memory_optimizer_test",
     size = "medium",
     srcs = [
         "grappler/memory_optimizer_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
@@ -5723,8 +5817,11 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
     ],
 )
 
@@ -5764,7 +5861,6 @@ cuda_py_test(
         ":constant_op",
         ":dtypes",
         ":functional_ops",
-        ":layers",
         ":math_ops",
         ":nn",
         ":ops",
@@ -5809,17 +5905,11 @@ py_binary(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "cost_analyzer_test",
     size = "small",
     srcs = ["grappler/cost_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":cost_analyzer",
@@ -5831,8 +5921,13 @@ py_test(
         ":state_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_cuda_on_cpu_tap",
+        "no_pip",
     ],
 )
 
@@ -5845,24 +5940,23 @@ py_library(
     deps = [":pywrap_tensorflow_internal"],
 )
 
-py_test(
+tf_py_test(
     name = "model_analyzer_test",
     size = "small",
     srcs = ["grappler/model_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":model_analyzer",
         ":state_ops",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_pip",
     ],
 )
 
@@ -5884,6 +5978,8 @@ py_library(
     deps = [
         ":framework_for_generated_wrappers",
         ":nccl_ops_gen",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
     ],
 )
 
@@ -5931,14 +6027,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "mode_keys_test",
     size = "small",
     srcs = [
         "training/mode_keys_test.py",
     ],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":mode_keys",
     ],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 9f1e52b42bb73261e13ca37e29543242f682640e..398fb375e1453866f3f1953a53012aaae2c22dd6 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -82,6 +82,7 @@ from tensorflow.python import distribute
 from tensorflow.python import keras
 from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
+from tensorflow.python.module import module
 from tensorflow.python.ops import bitwise_ops as bitwise
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import manip_ops as manip
@@ -99,6 +100,9 @@ from tensorflow.python.summary import summary
 from tensorflow.python.user_ops import user_ops
 from tensorflow.python.util import compat
 
+# Import audio ops to make sure the ops are registered.
+from tensorflow.python.ops import gen_audio_ops as _
+
 # Import boosted trees ops to make sure the ops are registered (but unused).
 from tensorflow.python.ops import gen_boosted_trees_ops as _gen_boosted_trees_ops
 
@@ -121,6 +125,8 @@ from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import sysconfig
 from tensorflow.python.platform import test
 
+from tensorflow.python.compat import v2_compat
+
 from tensorflow.python.util.all_util import make_all
 from tensorflow.python.util.tf_export import tf_export
 
@@ -148,7 +154,7 @@ nn.rnn_cell = rnn_cell
 # pylint: disable=undefined-variable
 tf_export(v1=['AttrValue'])(AttrValue)
 tf_export(v1=['ConfigProto'])(ConfigProto)
-tf_export('Event', 'summary.Event')(Event)
+tf_export(v1=['Event', 'summary.Event'])(Event)
 tf_export(v1=['GPUOptions'])(GPUOptions)
 tf_export(v1=['GraphDef'])(GraphDef)
 tf_export(v1=['GraphOptions'])(GraphOptions)
@@ -161,10 +167,10 @@ tf_export(v1=['OptimizerOptions'])(OptimizerOptions)
 tf_export(v1=['RunMetadata'])(RunMetadata)
 tf_export(v1=['RunOptions'])(RunOptions)
 tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog)
-tf_export('Summary', 'summary.Summary')(Summary)
-tf_export('summary.SummaryDescription')(SummaryDescription)
-tf_export('SummaryMetadata')(SummaryMetadata)
-tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
+tf_export(v1=['Summary', 'summary.Summary'])(Summary)
+tf_export(v1=['summary.SummaryDescription'])(SummaryDescription)
+tf_export(v1=['SummaryMetadata'])(SummaryMetadata)
+tf_export(v1=['summary.TaggedRunMetadata'])(TaggedRunMetadata)
 tf_export(v1=['TensorInfo'])(TensorInfo)
 # pylint: enable=undefined-variable
 
diff --git a/tensorflow/python/autograph/LIMITATIONS.md b/tensorflow/python/autograph/LIMITATIONS.md
index d8b1cb7616ac348981bf2b69d6e2fd8d8a6e6b78..b4e4ca661ad7a4c6d69019ce56a0832fd1cbb03f 100644
--- a/tensorflow/python/autograph/LIMITATIONS.md
+++ b/tensorflow/python/autograph/LIMITATIONS.md
@@ -8,39 +8,39 @@ Python is a large language, so hoping to convert arbitrary Python code directly
 
 Note: as more complex features in TensorFlow are made more accessible using AutoGraph, we expect to come across use cases that haven't been tried before, some of which might reveal rare bugs. If we do find any such bugs, we may add additional restrictions for the affected configurations, until those bugs are resolved.
 
- Construct | Supported now? | Plan to support? | Notes
- :--------- | :--------------: | :----------------: | :-----
-If statement | Yes |  | Converts to `tf.cond`. If variables are created in one branch that don’t exist in another, which is inexpressible in TF, we throw a clear error.
-For statement | Yes | | We will specialize `for` loops with unknown and known lengths, as well as for loops over TF datasets. Converts to `tf.while_loop`, with an additional `maximum_iterations` hint, if that is known. Creating variables inside the loop that are used later outside the loop is not supported, as the loop may have no iterations.
-While statement | Yes | | Converts to `tf.while_loop`. Creating variables inside the loop is not supported, as the loop may have no iterations.
-Continue and break | Yes | | Converts to boolean flags and extra predicates in loop tests.
-Composition of control flow | Yes | | Arbitrary composition of `if`, `while`, `for`, `break`, and `continue`, along with other supported language elements, is supported and tested.
-Iterators | Some | Yes | Not all iterators supported, but we plan to support everything that can be desugared, such as `enumerate` and `zip`.
-Multiple return values | Yes | | We desugar them into variables, boolean flags and conditionals so that the function has a single return value at the end, and provide a clear error if we are unable to do so.
-Print expression | Yes | | Wrapped in `PyFunc`, and given proper control dependencies. Optional support for using tf.Log when py_func is undesirable exists.
-Static function calls | Yes | | Non-recursive function calls
-Nested call trees | Yes | | For example, `f` calls `g` which calls `h`, all of which need conversion.
-Recursive function calls | No | Maybe | Based on available support in TF. Currently `function.Defun` is the best candidate, but it is not reentrant.
-Python built-ins | Some | Yes | `print`, `len`, `range`, `xrange`, `int`, `float` are supported, and we plan to support or clearly error on all [Python built-ins](https://docs.python.org/3/library/functions.html).
-List operations | Yes | | We convert list creation, append, pop and indexing to their TF TensorArray equivalents. However, we do need some extra type hints to fully convert correctly. We hope to remove this limitation.
-Function variables | Yes | | e.g. `f_new = f_orig; f_new()`
-Lambda functions | No | Yes | Planned feature.
-Classes | Yes | | Classes can be converted all at once, or method-by-method. Some limitations exist around static and class methods.
-Subclasses | Yes | | Subclassing library objects like tf.keras.Model is also supported.
-Dynamic types | Some | | `o = C1() if foo else C2(); o.bar()`. Some scenarios where types are data-dependent may not be supported. We will raise a meaningful error in that case.
-Dynamic code / exec | No | |
-Reflection | No | |
-Try / Except | No | No | No current sane TF equivalent.
-Global variables | Restricted | | In general, we only support read-only access to arguments or variables defined outside the converted code. A few exceptions include TensorFlow library code.
-Functions with side effects | Some | | Side effects are allowed, under certain circumstances.
-Collections | Some | Yes | We currently support lists. There are currently no TF equivalents of dictionaries or tuples.
-List Comprehensions | Yes | | We desugar `ListComp` into the appropriate combination of `For` and `If` statements. Other comprehensions are currently very low priority.
-Custom context managers | No | Yes | Currently low priority. Left unconverted currently.
-Generators | No | Maybe | Could be achievable using queues; very low priority.
-Assertions | Yes | | As `tf.Assert`
-Deletion | Yes | Maybe | Currently unconverted. If new semanti cs are required for `del`, we are able to add it in.
-Inline imports | No | Yes | For example, `import numpy as np; np.eye(3)`. Currently low priority.
-Async | No | No |
+Construct                   | Supported now? | Plan to support? | Notes
+:-------------------------- | :------------: | :--------------: | :----
+If statement                | Yes            |                  | Converts to `tf.cond`. If variables are created in one branch that don’t exist in another, which is inexpressible in TF, we throw a clear error.
+For statement               | Yes            |                  | We will specialize `for` loops with unknown and known lengths, as well as for loops over TF datasets. Converts to `tf.while_loop`, with an additional `maximum_iterations` hint, if that is known. Creating variables inside the loop that are used later outside the loop is not supported, as the loop may have no iterations.
+While statement             | Yes            |                  | Converts to `tf.while_loop`. Creating variables inside the loop is not supported, as the loop may have no iterations.
+Continue and break          | Yes            |                  | Converts to boolean flags and extra predicates in loop tests.
+Composition of control flow | Yes            |                  | Arbitrary composition of `if`, `while`, `for`, `break`, and `continue`, along with other supported language elements, is supported and tested.
+Iterators                   | Some           | Yes              | Not all iterators supported, but we plan to support everything that can be desugared, such as `enumerate` and `zip`.
+Multiple return values      | Yes            |                  | We desugar them into variables, boolean flags and conditionals so that the function has a single return value at the end, and provide a clear error if we are unable to do so.
+Print expression            | Yes            |                  | Wrapped in `PyFunc`, and given proper control dependencies. Optional support for using tf.Log when py_func is undesirable exists.
+Static function calls       | Yes            |                  | Non-recursive function calls
+Nested call trees           | Yes            |                  | For example, `f` calls `g` which calls `h`, all of which need conversion.
+Recursive function calls    | No             | Maybe            | Based on available support in TF. Currently `function.Defun` is the best candidate, but it is not reentrant.
+Python built-ins            | Some           | Yes              | `print`, `len`, `range`, `xrange`, `int`, `float` are supported, and we plan to support or clearly error on all [Python built-ins](https://docs.python.org/3/library/functions.html).
+List operations             | Yes            |                  | We convert list creation, append, pop and indexing to their TF TensorArray equivalents. However, we do need some extra type hints to fully convert correctly. We hope to remove this limitation.
+Function variables          | Yes            |                  | e.g. `f_new = f_orig; f_new()`
+Lambda functions            | No             | Yes              | Planned feature.
+Classes                     | Yes            |                  | Classes can be converted all at once, or method-by-method. Some limitations exist around static and class methods.
+Subclasses                  | Yes            |                  | Subclassing library objects like tf.keras.Model is also supported.
+Dynamic types               | Some           |                  | `o = C1() if foo else C2(); o.bar()`. Some scenarios where types are data-dependent may not be supported. We will raise a meaningful error in that case.
+Dynamic code / exec         | No             |                  |
+Reflection                  | No             |                  |
+Try / Except                | No             | No               | No current sane TF equivalent.
+Global variables            | Restricted     |                  | In general, we only support read-only access to arguments or variables defined outside the converted code. A few exceptions include TensorFlow library code.
+Functions with side effects | Some           |                  | Side effects are allowed, under certain circumstances.
+Collections                 | Some           | Yes              | We currently support lists. There are currently no TF equivalents of dictionaries or tuples.
+List Comprehensions         | Yes            |                  | We desugar `ListComp` into the appropriate combination of `For` and `If` statements. Other comprehensions are currently very low priority.
+Custom context managers     | No             | Yes              | Currently low priority. Left unconverted currently.
+Generators                  | No             | Maybe            | Could be achievable using queues; very low priority.
+Assertions                  | Yes            |                  | As `tf.Assert`
+Deletion                    | Yes            | Maybe            | Currently unconverted. If new semantics are required for `del`, we are able to add it in.
+Inline imports              | No             | Yes              | For example, `import numpy as np; np.eye(3)`. Currently low priority.
+Async                       | No             | No               |
 
 ## Extra capabilities
 
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 6faeb016072479ab7e860b6520515edb4c88fab9..a1956d1d8af20924a50168401a7229808adeeaac 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -50,7 +50,8 @@ from tensorflow.python.autograph.lang.directives import set_element_type
 from tensorflow.python.autograph.lang.directives import set_loop_options
 from tensorflow.python.autograph.lang.special_functions import stack
 from tensorflow.python.autograph.lang.special_functions import tensor_list
-from tensorflow.python.autograph.pyct.transformer import AutographParseError
+from tensorflow.python.autograph.pyct.transformer import AutoGraphParseError
+from tensorflow.python.autograph.utils import ag_logging
 from tensorflow.python.util.all_util import remove_undocumented
 
 # TODO(mdan): Revisit this list once we finalize the generated code mechanism.
@@ -77,7 +78,7 @@ _allowed_symbols = [
     'stack',
     'tensor_list',
     # Exceptions
-    'AutographParseError',
+    'AutoGraphParseError',
     # Utilities: to be removed
     'utils',
 ]
diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index 3ac446db02c6ef1946e76a8b549a85c67fed2872..bafc5b0ca7c203255f098f6e03fa8b417b74d4f6 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -25,7 +25,6 @@ py_library(
         "conditional_expressions.py",
         "continue_statements.py",
         "control_flow.py",
-        "decorators.py",
         "directives.py",
         "error_handlers.py",
         "function_scopes.py",
@@ -139,21 +138,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "decorators_test",
-    srcs = ["decorators_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
-        ":converters",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python/autograph/core:test_lib",
-    ],
-)
-
 py_test(
     name = "directives_test",
     srcs = ["directives_test.py"],
diff --git a/tensorflow/python/autograph/converters/builtin_functions_test.py b/tensorflow/python/autograph/converters/builtin_functions_test.py
index 2683be16ec7ffa91b1df3cd272336366502d9f4f..2e6cf16b9c5af5aad32e6746bf7c5503917200dd 100644
--- a/tensorflow/python/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/python/autograph/converters/builtin_functions_test.py
@@ -55,7 +55,9 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
       with self.session() as sess:
         with self.assertPrints('a\n'):
-          sess.run(result.test_fn('a'))
+          sess.run(result.test_fn(constant_op.constant('a')))
+      with self.assertPrints('a\n'):
+        result.test_fn('a')
 
   @test_util.run_deprecated_v1
   def test_print_multiple_values(self):
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index d4eb17e976f6fdf321903a878326e668aeb6ea49..7026a162a28da40476930928f2327ef0b93dc352 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -268,17 +268,10 @@ class CallTreeTransformer(converter.Base):
     new_call.keywords = node.keywords
     return new_call
 
-  def _visit_decorators(self, decorator_list):
-    if not self.ctx.program.options.uses(converter.Feature.DECORATORS):
-      # When not processing decorators, strip everything that is encountered.
-      return []
-
-    return self.visit_block(decorator_list)
-
   def visit_FunctionDef(self, node):
     node.args = self.visit(node.args)
     node.body = self.visit_block(node.body)
-    node.decorator_list = self._visit_decorators(node.decorator_list)
+    node.decorator_list = []
     node.returns = self.visit_block(node.returns)
     return node
 
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index a39a0b0cdb16280312b830c9c9bbe78c06ab77b0..4f38a2926b601f4cb342aa3da74a76679ce77b03 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -297,12 +297,11 @@ class ControlFlowTransformer(converter.Base):
         if str(name) != ssf
     }
 
+    state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
+
     if len(loop_state) == 1:
       loop_state = loop_state[0]
       state_ssf = state_ssf[0]
-      state_ast_tuple = loop_state
-    else:
-      state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
 
     return loop_state, state_ssf, state_ast_tuple, ssf_map
 
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index 034fcbe3865cdd78cdaad19631da98359cb4690d..9009e67ef474877abaf396fa315a059d0a5ef809 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import control_flow
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -184,7 +183,7 @@ class ControlFlowTest(converter_testing.TestCase):
       return b
 
     node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(transformer.AutographParseError):
+    with self.assertRaises(ValueError):
       control_flow.transform(node, ctx)
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/autograph/converters/decorators.py b/tensorflow/python/autograph/converters/decorators.py
deleted file mode 100644
index f0ea51277468499937089c89eedb344149cb1ae7..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/converters/decorators.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Handles decorators.
-
-Note: this module only deals with functions whose decorators are still recorded
-in the AST. This does not always happen. See the unit test for an example.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.util import tf_inspect
-
-
-class DecoratorsTransformer(converter.Base):
-  """Converts or removes decorators."""
-
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-    kept_decorators = []
-    for dec in node.decorator_list:
-      if isinstance(dec, gast.Call):
-        dec_func = dec.func
-      else:
-        dec_func = dec
-
-      # Special cases.
-      # TODO(mdan): Is there any way we can treat these more generically?
-      # We may want to forego using decorators altogether if we can't
-      # properly support them.
-      if isinstance(dec_func, gast.Name) and dec_func.id in ('classmethod',):
-        # Assumption: decorators are only visible in the AST when converting
-        # a function inline (via another decorator).
-        # In that case, the converted function is no longer part of the
-        # original object that it was declared into.
-        # This is currently verified by tests.
-        continue
-
-      if not anno.hasanno(dec_func, 'live_val'):
-        raise ValueError('could not resolve the decorator "@%s"' %
-                         (anno.getanno(dec_func, anno.Basic.QN)))
-
-      original_dec = anno.getanno(dec_func, anno.Basic.QN)
-      dec_value = anno.getanno(dec_func, 'live_val')
-
-      if dec_value in self.ctx.program.options.strip_decorators:
-        continue
-
-      # When using foo.bar.baz, we only really need to grab foo and import
-      # that.
-      dec_support_node = dec_func
-      while isinstance(dec_support_node, gast.Attribute):
-        dec_support_node = dec_support_node.value
-
-      if not anno.hasanno(dec_support_node, 'live_val'):
-        raise ValueError(
-            'could not resolve symbol "%s" when looking up decorator "%s"' %
-            (anno.getanno(dec_support_node, anno.Basic.QN), original_dec))
-
-      dec_support = anno.getanno(dec_support_node, 'live_val')
-      # The tuple contains:
-      #  * the AST that represents the decorator
-      #  * the entity supporting the decorator (i.e., what we need to import)
-      #  * the name of the module that needs to be imported for this decorator
-      #    to properly resolve.
-      # Examples:
-      #  for foo.bar, the tuple is (<ast>, <module foo>, 'foo')
-      #  for baz, the tuple is (<ast>, <module baz.__module__>, 'baz')
-      kept_decorators.append((dec, dec_support,
-                              anno.getanno(dec_support_node, anno.Basic.QN)))
-
-    for _, dec_support, name in kept_decorators:
-      if tf_inspect.ismodule(dec_support):
-        self.ctx.program.additional_imports.add(
-            'import %s as %s' % (dec_support.__name__, name))
-      else:
-        if dec_support.__module__ == '__main__':
-          raise ValueError(
-              'decorator "%s" was not allowed because it is declared '
-              'in the module "%s". To fix this, declare it in a separate '
-              'module that we can import it from.' % (dec_support,
-                                                      dec_support.__module__))
-        self.ctx.program.additional_imports.add(
-            'from %s import %s' % (dec_support.__module__, name))
-
-    node.decorator_list = [dec for dec, _, _ in kept_decorators]
-    return node
-
-
-def transform(node, ctx):
-  return DecoratorsTransformer(ctx).visit(node)
diff --git a/tensorflow/python/autograph/converters/decorators_test.py b/tensorflow/python/autograph/converters/decorators_test.py
deleted file mode 100644
index abd76849d6eafd92c2d7fa540a30d699e3a57e52..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/converters/decorators_test.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for decorators module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import wraps
-import imp
-
-from tensorflow.python import autograph
-from tensorflow.python.autograph.converters import decorators
-from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.platform import test
-
-
-# The Python parser only briefly captures decorators into the AST.
-# The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is normally what you would expect, since
-# they are meant to be transparent).
-# However, decorators are still visible when you analyze the function
-# from inside a decorator, before it was applied - as is the case
-# with our conversion decorators.
-
-
-def simple_decorator(f):
-  return lambda a: f(a) + 1
-
-
-def self_transform_decorator(transform):
-
-  def decorator(f):
-    @wraps(f)
-    def wrapper(*args):
-      # This removing wrapper is defined in the test below. This setup is so
-      # intricate in order to simulate how we use the transformer in practice.
-      transformed_f = transform(f, (self_transform_decorator,))
-      return transformed_f(*args) + 1
-    return wrapper
-  return decorator
-
-
-class DecoratorsTest(converter_testing.TestCase):
-
-  def _transform(self, f, strip_decorators):
-    namespace = {
-        'self_transform_decorator': self_transform_decorator,
-        'simple_decorator': simple_decorator,
-        'converter_testing': converter_testing,
-    }
-    node, ctx = self.prepare(
-        f, namespace, recursive=False, strip_decorators=strip_decorators)
-    node = decorators.transform(node, ctx)
-    import_line = '\n'.join(ctx.program.additional_imports)
-    result, _ = compiler.ast_to_object(node, source_prefix=import_line)
-    return getattr(result, f.__name__)
-
-  def test_noop(self):
-
-    def test_fn(a):
-      return a
-
-    with self.converted(test_fn, decorators, {}) as result:
-      self.assertEqual(1, result.test_fn(1))
-
-  def test_function(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-      return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, test_fn(1))
-
-  def test_method(self):
-
-    class TestClass(object):
-
-      @self_transform_decorator(self._transform)
-      def test_fn(self, a):
-        return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, TestClass().test_fn(1))
-
-  def test_multiple_decorators(self):
-
-    class TestClass(object):
-
-      # Note that reversing the order of this two doesn't work.
-      @classmethod
-      @self_transform_decorator(self._transform)
-      def test_fn(cls, a):
-        return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, TestClass.test_fn(1))
-
-  def test_nested_decorators_local(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-      @simple_decorator
-      def inner_fn(b):
-        return b + 11
-      return inner_fn(a)
-
-    # Expected to fail because simple_decorator could not be imported.
-    with self.assertRaises(transformer.AutographParseError):
-      test_fn(1)
-
-  def test_nested_decorators_imported(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-
-      @converter_testing.imported_decorator
-      def inner_fn(b):
-        return b + 11
-
-      return inner_fn(a)
-
-    # Work around TensorFlow's symbol suppression mechanism that causes core to
-    # be invisible in the generated code.
-    core_mod = imp.new_module('core')
-    core_mod.converter_testing = converter_testing
-    autograph.core = core_mod
-
-    # 14 = 1 (a) + 1 (simple_decorator) + 11 (inner_fn)
-    self.assertEqual(14, test_fn(1))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/autograph/converters/logical_expressions.py b/tensorflow/python/autograph/converters/logical_expressions.py
index dfcaafdc9eba61bcb3c03432eadf309484d48dee..ea9740a22e1c065f04401fa3f15e8086349eb513 100644
--- a/tensorflow/python/autograph/converters/logical_expressions.py
+++ b/tensorflow/python/autograph/converters/logical_expressions.py
@@ -38,29 +38,29 @@ from tensorflow.python.autograph.pyct import templates
 SAFE_BOOLEAN_OPERAND = 'SAFE_BOOLEAN_OPERAND'
 
 
+OP_MAPPING = {
+    gast.And: 'ag__.and_',
+    gast.Eq: 'ag__.eq',
+    gast.NotEq: 'ag__.not_eq',
+    gast.Lt: 'ag__.lt',
+    gast.LtE: 'ag__.lt_e',
+    gast.Gt: 'ag__.gt',
+    gast.GtE: 'ag__.gt_e',
+    gast.Is: 'ag__.is_',
+    gast.IsNot: 'ag__.is_not',
+    gast.In: 'ag__.in_',
+    gast.Not: 'ag__.not_',
+    gast.NotIn: 'ag__.not_in',
+    gast.Or: 'ag__.or_',
+    gast.UAdd: 'ag__.u_add',
+    gast.USub: 'ag__.u_sub',
+    gast.Invert: 'ag__.invert',
+}
+
+
 class LogicalExpressionTransformer(converter.Base):
   """Converts logical expressions to corresponding TF calls."""
 
-  def __init__(self, ctx):
-    super(LogicalExpressionTransformer, self).__init__(ctx)
-    # TODO(mdan): For completeness and consistency, overload everything.
-    self.op_mapping = {
-        gast.And: 'ag__.and_',
-        gast.Eq: 'ag__.eq',
-        gast.NotEq: 'ag__.not_eq',
-        gast.Lt: 'ag__.lt',
-        gast.LtE: 'ag__.lt_e',
-        gast.Gt: 'ag__.gt',
-        gast.GtE: 'ag__.gt_e',
-        gast.Is: 'ag__.is_',
-        gast.IsNot: 'ag__.is_not',
-        gast.In: 'ag__.in_',
-        gast.Not: 'ag__.not_',
-        gast.NotIn: 'ag__.not_in',
-        gast.Or: 'ag__.or_',
-        gast.USub: 'ag__.u_sub',
-    }
-
   def _expect_simple_symbol(self, operand):
     if isinstance(operand, gast.Name):
       return
@@ -74,11 +74,11 @@ class LogicalExpressionTransformer(converter.Base):
 
   def _has_matching_func(self, operator):
     op_type = type(operator)
-    return op_type in self.op_mapping
+    return op_type in OP_MAPPING
 
   def _matching_func(self, operator):
     op_type = type(operator)
-    return self.op_mapping[op_type]
+    return OP_MAPPING[op_type]
 
   def _as_function(self, func_name, args, args_as_lambda=False):
     if args_as_lambda:
diff --git a/tensorflow/python/autograph/converters/logical_expressions_test.py b/tensorflow/python/autograph/converters/logical_expressions_test.py
index 687412750e0b2d3e7db275f6c25e5923ffaaa831..67ccd1fb47955053e0896df07e20903d4406370b 100644
--- a/tensorflow/python/autograph/converters/logical_expressions_test.py
+++ b/tensorflow/python/autograph/converters/logical_expressions_test.py
@@ -77,6 +77,13 @@ class LogicalExpressionTest(converter_testing.TestCase):
     with self.converted(test_fn, logical_expressions, {}) as result:
       self.assertTrue(result.test_fn('a', ('a',)))
 
+  def test_unary_ops(self):
+    def test_fn(a):
+      return ~a, -a, +a
+
+    with self.converted(test_fn, logical_expressions, {}) as result:
+      self.assertEqual(result.test_fn(1), (-2, -1, 1))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py
index 496c99e3b5247c174f8a74e9b3f23517ddc649f3..80a555385df0a21470a448055ca5c0cafd302434 100644
--- a/tensorflow/python/autograph/converters/return_statements.py
+++ b/tensorflow/python/autograph/converters/return_statements.py
@@ -22,310 +22,326 @@ import gast
 
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-# TODO(mdan): Move this logic into transformer_base.
-class BodyVisitor(converter.Base):
-  """Walks breadth- or depth-first the list-of-nodes bodies of AST nodes."""
+BODY_DEFINITELY_RETURNS = 'BODY_DEFINITELY_RETURNS'
+ORELSE_DEFINITELY_RETURNS = 'ORELSE_DEFINITELY_RETURNS'
+STMT_DEFINITELY_RETURNS = 'STMT_DEFINITELY_RETURNS'
 
-  def __init__(self, ctx, depth_first=False):
-    super(BodyVisitor, self).__init__(ctx)
-    self.depth_first = depth_first
-    self.changes_made = False
 
-  def visit_nodelist(self, nodelist):
-    for node in nodelist:
-      if isinstance(node, list):
-        node = self.visit_nodelist(node)
+class _Block(object):
+
+  def __init__(self):
+    self.definitely_returns = False
+
+
+class ConditionalReturnLowering(converter.Base):
+  """Rewrites a a pattern where it's unbovious that all paths return a value.
+
+  This rewrite allows avoiding intermediate None return values.
+
+  The following pattern:
+
+      if cond:
+        <block 1>
+        return
       else:
-        node = self.generic_visit(node)
-    return nodelist
+        <block 2>
+      <block 3>
 
-  def visit_If(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
-    return node
+  is converted to:
 
-  def visit_For(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+      if cond:
+        <block 1>
+        return
+      else:
+        <block 2>
+        <block 3>
+
+  and vice-versa (if the else returns, subsequent statements are moved unthe the
+  if branch).
+  """
+
+  def visit_Return(self, node):
+    self.state[_Block].definitely_returns = True
     return node
 
+  def _postprocess_statement(self, node):
+    # If the node definitely returns (e.g. it's a with statement with a
+    # return stateent in it), then the current block also definitely returns.
+    if anno.getanno(node, STMT_DEFINITELY_RETURNS, default=False):
+      self.state[_Block].definitely_returns = True
+
+    # The special case: collapse a typical conditional return pattern into
+    # a single conditional with possibly returns on both branches. This
+    # reduces the use of None return values, which don't work with TF
+    # conditionals.
+    if (isinstance(node, gast.If)
+        and anno.getanno(node, BODY_DEFINITELY_RETURNS, default=False)):
+      return node, node.orelse
+    elif (isinstance(node, gast.If)
+          and anno.getanno(node, ORELSE_DEFINITELY_RETURNS, default=False)):
+      return node, node.body
+
+    return node, None
+
+  def _visit_statement_block(self, node, nodes):
+    self.state[_Block].enter()
+    new_nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    block_definitely_returns = self.state[_Block].definitely_returns
+    self.state[_Block].exit()
+    return new_nodes, block_definitely_returns
+
   def visit_While(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+    node.test = self.visit(node.test)
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
     return node
 
-  def visit_Try(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    node.finalbody = self.visit_nodelist(node.finalbody)
-    for i in range(len(node.handlers)):
-      node.handlers[i].body = self.visit_nodelist(node.handlers[i].body)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+  def visit_For(self, node):
+    node.iter = self.visit(node.iter)
+    node.target = self.visit(node.target)
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
     return node
 
-  def visit_With(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    if not self.depth_first:
-      node = self.generic_visit(node)
-    return node
+  def visit_If(self, node):
+    node.test = self.visit(node.test)
 
-  def visit_FunctionDef(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    self.generic_visit(node)
-    if not self.depth_first:
-      node = self.generic_visit(node)
-    return node
+    node.body, body_definitely_returns = self._visit_statement_block(
+        node, node.body)
+    if body_definitely_returns:
+      anno.setanno(node, BODY_DEFINITELY_RETURNS, True)
 
+    node.orelse, orelse_definitely_returns = self._visit_statement_block(
+        node, node.orelse)
+    if orelse_definitely_returns:
+      anno.setanno(node, ORELSE_DEFINITELY_RETURNS, True)
 
-class FoldElse(BodyVisitor):
-
-  def visit_nodelist(self, nodelist):
-    for i in range(len(nodelist)):
-      node = nodelist[i]
-      if isinstance(node, gast.If):
-        true_branch_returns = isinstance(node.body[-1], gast.Return)
-        false_branch_returns = len(node.orelse) and isinstance(
-            node.orelse[-1], gast.Return)
-        # If the last node in the if body is a return,
-        # then every line after this if statement effectively
-        # belongs in the else.
-        if true_branch_returns and not false_branch_returns:
-          for j in range(i + 1, len(nodelist)):
-            nodelist[i].orelse.append(ast_util.copy_clean(nodelist[j]))
-          if nodelist[i + 1:]:
-            self.changes_made = True
-          return nodelist[:i + 1]
-        elif not true_branch_returns and false_branch_returns:
-          for j in range(i + 1, len(nodelist)):
-            nodelist[i].body.append(ast_util.copy_clean(nodelist[j]))
-          if nodelist[i + 1:]:
-            self.changes_made = True
-          return nodelist[:i + 1]
-        elif true_branch_returns and false_branch_returns:
-          if nodelist[i + 1:]:
-            raise ValueError(
-                'Unreachable code after conditional where both branches return.'
-            )
-          return nodelist
-      elif isinstance(node, gast.Return) and nodelist[i + 1:]:
-        raise ValueError(
-            'Cannot have statements after a return in the same basic block')
-    return nodelist
-
-
-def contains_return(node):
-  for n in gast.walk(node):
-    if isinstance(n, gast.Return):
-      return True
-  return False
-
-
-class LiftReturn(converter.Base):
-  """Move return statements out of If and With blocks."""
-
-  def __init__(self, ctx):
-    super(LiftReturn, self).__init__(ctx)
-    self.changes_made = False
-    self.common_return_name = None
+    if body_definitely_returns and orelse_definitely_returns:
+      self.state[_Block].definitely_returns = True
 
-  def visit_If(self, node):
-    # Depth-first traversal of if statements
-    node = self.generic_visit(node)
-
-    # We check if both branches return, and if so, lift the return out of the
-    # conditional. We don't enforce that the true and false branches either
-    # both return or both do not, because FoldElse might move a return
-    # into a branch after this transform completes. FoldElse and LiftReturn
-    # are alternately run until the code reaches a fixed point.
-    true_branch_returns = isinstance(node.body[-1], gast.Return)
-    false_branch_returns = len(node.orelse) and isinstance(
-        node.orelse[-1], gast.Return)
-    if true_branch_returns and false_branch_returns:
-      node.body[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
-      node.orelse[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.orelse[-1].value)[0]
-      return_node = templates.replace('return a', a=self.common_return_name)[0]
-      self.changes_made = True
-      return [node, return_node]
-    else:
-      return node
-
-  def visit_With(self, node):
-    # Depth-first traversal of syntax
-    node = self.generic_visit(node)
-
-    # If the with statement returns, lift the return
-    if isinstance(node.body[-1], gast.Return):
-      node.body[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
-      return_node = templates.replace('return a', a=self.common_return_name)[0]
-      node = self.generic_visit(node)
-      self.changes_made = True
-      return [node, return_node]
-    else:
-      return node
+    return node
 
   def visit_FunctionDef(self, node):
-    # Ensure we're doing depth-first traversal
-    last_return_name = self.common_return_name
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    referenced_names = body_scope.referenced
-    self.common_return_name = self.ctx.namer.new_symbol('return_',
-                                                        referenced_names)
-    node = self.generic_visit(node)
-    self.common_return_name = last_return_name
+    node.args = self.visit(node.args)
+    node.body, _ = self._visit_statement_block(node, node.body)
     return node
 
 
-class DetectReturnInUnsupportedControlFlow(gast.NodeVisitor):
-  """Throws an error if code returns inside loops or try/except."""
+class _Return(object):
 
-  # First, throw an error if we detect a return statement in a loop.
-  # TODO(alexbw): we need to learn to handle returns inside a loop,
-  # but don't currently have the TF constructs to do so (need something
-  # that looks vaguely like a goto).
+  def __init__(self):
+    self.used = False
+    self.create_guard = False
+    self.guard_created = False
+
+  def __repr__(self):
+    return 'used: {}'.format(
+        self.used)
+
+
+class _Function(object):
 
   def __init__(self):
-    self.cant_return = False
-    self.function_level = 0
-    super(DetectReturnInUnsupportedControlFlow, self).__init__()
+    self.do_return_var_name = None
+    self.retval_var_name = None
 
-  def visit_While(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+  def __repr__(self):
+    return 'return control: {}, return value: {}'.format(
+        self.do_return_var_name, self.retval_var_name)
 
-  def visit_For(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
 
-  def visit_Try(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+class ReturnStatementsTransformer(converter.Base):
+  """Lowers return statements into variables and conditionals.
 
-  def visit_FunctionDef(self, node):
-    if not self.function_level:
-      self.function_level += 1
-      self.generic_visit(node)
-      self.function_level -= 1
+  Specifically, the following pattern:
 
-  def visit_Return(self, node):
-    if self.cant_return:
-      raise ValueError(
-          '`return` statements are not supported in loops. '
-          'Try assigning to a variable in the while loop, and returning '
-          'outside of the loop')
+      <block 1>
+      return val
+      <block 2>
 
+  is converted to:
 
-class DetectReturnInConditional(gast.NodeVisitor):
-  """Assert that no return statements are present in conditionals."""
+      do_return = False
+      retval = None
 
-  def __init__(self):
-    self.cant_return = False
-    self.function_level = 0
-    super(DetectReturnInConditional, self).__init__()
+      <block 1>
 
-  def visit_If(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+      do_return = True
+      retval = val
 
-  def visit_FunctionDef(self, node):
-    if not self.function_level:
-      self.function_level += 1
-      self.generic_visit(node)
-      self.function_level -= 1
+      if not do_return:
+        <block 2>
+
+      return retval
+
+  The conversion adjusts loops as well:
+
+      <block 1>
+      while cond:
+        <block 2>
+        return retval
+
+  is converted to:
+
+      <block 1>
+      while not do_return and cond:
+        <block 2>
+        do_return = True
+        retval = val
+  """
 
   def visit_Return(self, node):
-    if self.cant_return:
-      raise ValueError(
-          'After transforms, a conditional contained a `return `statement, '
-          'which is not allowed. This is a bug, and should not happen.')
+    self.state[_Return].used = True
 
+    retval = node.value if node.value else parser.parse_expresison('None')
 
-class DetectReturnInFunctionDef(gast.NodeVisitor):
+    template = """
+      do_return_var_name = True
+      retval_var_name = retval
+    """
+    node = templates.replace(
+        template,
+        do_return_var_name=self.state[_Function].do_return_var_name,
+        retval_var_name=self.state[_Function].retval_var_name,
+        retval=retval)
 
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-    if not contains_return(node):
-      raise ValueError(
-          'Each function definition should contain at least one return.')
+    return node
 
+  def _postprocess_statement(self, node):
+    # Example of how the state machine below works:
+    #
+    #   1| stmt           # State: _Return.used = False
+    #    |                # Action: none
+    #   3| return         # State: _Return.used = True,
+    #    |                #        _Return.guard_created = False,
+    #    |                #        _Return.create_guard = False
+    #    |                # Action: _Return.create_guard = True
+    #   4| stmt           # State: _Return.used = True,
+    #    |                #        _Return.guard_created = False,
+    #    |                #        _Return.create_guard = True
+    #    |                # Action: create `if not return_used`,
+    #    |                #         set _Return.guard_created = True
+    #   5| stmt           # State: _Return.used = True,
+    #    |                #        _Return.guard_created = True
+    #    |                # Action: none (will be wrapped under previously
+    #    |                #         created if node)
+    if self.state[_Return].used:
+      if self.state[_Return].guard_created:
+        return node, None
+
+      elif not self.state[_Return].create_guard:
+        self.state[_Return].create_guard = True
+        return node, None
+
+      elif (not self.state[_Return].guard_created and
+            self.state[_Return].create_guard):
+        self.state[_Return].guard_created = True
+        template = """
+          if ag__.not_(do_return_var_name):
+            original_node
+        """
+        cond, = templates.replace(
+            template,
+            do_return_var_name=self.state[_Function].do_return_var_name,
+            original_node=node)
+        return cond, cond.body
 
-def transform(node, ctx):
-  """Ensure a function has only a single return.
-
-  This transforms an AST node with multiple returns successively into containing
-  only a single return node.
-  There are a few restrictions on what we can handle:
-   - An AST being transformed must contain at least one return.
-   - No returns allowed in loops. We have to know the type of the return value,
-   and we currently don't have either a type inference system to discover it,
-   nor do we have a mechanism for late type binding in TensorFlow.
-   - After all transformations are finished, a Return node is not allowed inside
-   control flow. If we were unable to move a return outside of control flow,
-   this is an error.
-
-  Args:
-     node: ast.AST
-     ctx: converter.EntityContext
-
-  Returns:
-     new_node: an AST with a single return value
-
-  Raises:
-    ValueError: if the AST is structured so that we can't perform the
-   transform.
-  """
-  # Make sure that the function has at least one return statement
-  # TODO(alexbw): turning off this assertion for now --
-  # we need to not require this in e.g. class constructors.
-  # DetectReturnInFunctionDef().visit(node)
+      else:
+        assert False, 'should handle all states'
 
-  # Make sure there's no returns in unsupported locations (loops, try/except)
-  DetectReturnInUnsupportedControlFlow().visit(node)
+    return node, None
 
-  while True:
+  def _visit_statement_block(self, node, nodes):
+    self.state[_Return].enter()
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    return_used = self.state[_Return].used
+    self.state[_Return].exit()
+    if return_used:
+      self.state[_Return].used = True
+    return nodes
 
-    # Try to lift all returns out of if statements and with blocks
-    lr = LiftReturn(ctx)
-    node = lr.visit(node)
-    changes_made = lr.changes_made
-    fe = FoldElse(ctx)
-    node = fe.visit(node)
-    changes_made = changes_made or fe.changes_made
+  def visit_While(self, node):
+    node.test = self.visit(node.test)
+
+    # Add the check for return to the loop condition.
+    node.body = self._visit_statement_block(node, node.body)
+    if self.state[_Return].used:
+      node.test = templates.replace_as_expression(
+          'ag__.and_(lambda: ag__.not_(control_var), lambda: test)',
+          test=node.test,
+          control_var=self.state[_Function].do_return_var_name)
+
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
 
-    if not changes_made:
-      break
+  def visit_For(self, node):
+    node.iter = self.visit(node.iter)
+    node.target = self.visit(node.target)
+
+    # Add the check for return to the loop condition.
+    node.body = self._visit_statement_block(node, node.body)
+    if self.state[_Return].used:
+      extra_test = anno.getanno(node, 'extra_test', default=None)
+      if extra_test is not None:
+        extra_test = templates.replace_as_expression(
+            'ag__.and_(lambda: ag__.not_(control_var), lambda: extra_test)',
+            extra_test=extra_test,
+            control_var=self.state[_Function].do_return_var_name)
+      else:
+        extra_test = templates.replace_as_expression(
+            'ag__.not_(control_var)',
+            control_var=self.state[_Function].do_return_var_name)
+      anno.setanno(node, 'extra_test', extra_test)
 
-  # Make sure we've scrubbed all returns from conditionals
-  DetectReturnInConditional().visit(node)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
 
+  def visit_If(self, node):
+    node.test = self.visit(node.test)
+    node.body = self._visit_statement_block(node, node.body)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_FunctionDef(self, node):
+    self.state[_Function].enter()
+    self.state[_Return].enter()
+
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    do_return_var_name = self.ctx.namer.new_symbol(
+        'do_return', scope.referenced)
+    retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced)
+    self.state[_Function].do_return_var_name = do_return_var_name
+    self.state[_Function].retval_var_name = retval_var_name
+
+    converted_body = self._visit_statement_block(node, node.body)
+
+    if self.state[_Return].used:
+      template = """
+        do_return_var_name = False
+        retval_var_name = None
+        body
+        return retval_var_name
+      """
+      node.body = templates.replace(
+          template,
+          body=converted_body,
+          do_return_var_name=do_return_var_name,
+          retval_var_name=retval_var_name)
+
+    self.state[_Return].exit()
+    self.state[_Function].exit()
+    return node
+
+
+def transform(node, ctx):
+  """Ensure a function has only a single return."""
+  # Note: Technically, these two could be merged into a single walk, but
+  # keeping them separate helps with readability.
+  node = ConditionalReturnLowering(ctx).visit(node)
+  node = ReturnStatementsTransformer(ctx).visit(node)
   return node
diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py
index 762fbc6f607f56ed6d80dd82f59f8c7653c7312a..2b160d5ac7c5845e34c013c7fa01a75b06283d15 100644
--- a/tensorflow/python/autograph/converters/return_statements_test.py
+++ b/tensorflow/python/autograph/converters/return_statements_test.py
@@ -49,17 +49,16 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_orelse(self):
+  def test_missing_else(self):
 
     def test_fn(x):
       if x > 0:
         return x
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(ValueError):
-      return_statements.transform(node, ctx)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_orelse_recovrable(self):
+  def test_missing_else_then_default(self):
 
     def test_fn(x):
       if x > 0:
@@ -69,7 +68,7 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_branch_return_recoverable(self):
+  def test_else_only_then_default(self):
 
     def test_fn(x):
       if x < 0:
@@ -136,7 +135,7 @@ class SingleReturnTest(converter_testing.TestCase):
 
     self.assertTransformedEquivalent(test_fn, 2)
 
-  def test_nested_functions(self):
+  def test_nested_function(self):
 
     def test_fn(x):
 
@@ -151,7 +150,7 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_nested_functions_in_control_flow(self):
+  def test_nested_function_in_control_flow(self):
 
     def test_fn(x):
 
@@ -163,16 +162,30 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_loop(self):
+  def test_for_loop(self):
 
-    def test_fn(x):
-      for _ in range(10):
-        return x
-      return x
+    def test_fn(n):
+      for _ in range(n):
+        return 1
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(ValueError):
-      return_statements.transform(node, ctx)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 0)
+
+  def test_while_loop(self):
+
+    def test_fn(n):
+      i = 0
+      s = 0
+      while i < n:
+        i += 1
+        s += i
+        if s > 4:
+          return s
+      return -1
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py
index bd049afdfcef4c839bcb3d9ba5444d885c3061cc..11e3736d4fb9e8d06d5f02c991ea66410b35b374 100644
--- a/tensorflow/python/autograph/converters/slices_test.py
+++ b/tensorflow/python/autograph/converters/slices_test.py
@@ -23,7 +23,6 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.lang import directives
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import parser
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import list_ops
@@ -68,7 +67,7 @@ class SliceTest(converter_testing.TestCase):
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.float32')
     }
-    with self.assertRaises(transformer.AutographParseError):
+    with self.assertRaises(ValueError):
       slices.transform(node, ctx)
 
 
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index e6d626f215927941dffae9da45ce6b4d24b6402f..870c213e63b74292b30996fa1fdfe58f6c9a5fc2 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -125,11 +125,22 @@ class Feature(enum.Enum):
   ALL = 'ALL'
 
   AUTO_CONTROL_DEPS = 'AUTO_CONTROL_DEPS'
-  DECORATORS = 'DECORATORS'
   ERROR_REWRITING = 'ERROR_REWRITING'
   LISTS = 'LISTS'
   NAME_SCOPES = 'NAME_SCOPES'
 
+  @classmethod
+  def all(cls):
+    """Returns a tuple that enables all options."""
+    return tuple(cls.__members__.values())
+
+  @classmethod
+  def all_but(cls, exclude):
+    """Returns a tuple that enables all but the excluded options."""
+    if not isinstance(exclude, (list, tuple, set)):
+      exclude = (exclude,)
+    return tuple(set(cls.all()) - set(exclude) - {cls.ALL})
+
 
 class ConversionOptions(object):
   """Immutable container for global conversion flags.
@@ -206,7 +217,7 @@ class ConversionOptions(object):
       ast.Node
     """
     template = """
-      constructor_name(
+      ag__.ConversionOptions(
           recursive=recursive_val,
           verbose=verbose_val,
           strip_decorators=strip_decorators_val,
@@ -216,7 +227,8 @@ class ConversionOptions(object):
     """
 
     def as_qualified_name(o):
-      name = inspect_utils.getqualifiedname(ctx.info.namespace, o, max_depth=1)
+      name = inspect_utils.getqualifiedname(
+          ctx.info.namespace, o, max_depth=1)
       if not name:
         if isinstance(o, weakref.ref):
           # `o` might already be a weak reference, if this object was
@@ -234,17 +246,15 @@ class ConversionOptions(object):
 
     def list_of_features(values):
       return parser.parse_expression('({})'.format(', '.join(
-          'ag__.Feature.{}'.format(v)
-          for v in Feature.__members__
+          'ag__.{}'.format(v)
+          for v in Feature.__members__.values()
           if v in values)))
 
-    if internal_convert_user_code is not None:
+    if internal_convert_user_code is None:
       internal_convert_user_code = self.internal_convert_user_code
 
     expr_ast = templates.replace(
         template,
-        constructor_name=parser.parse_expression(
-            as_qualified_name(ConversionOptions)),
         recursive_val=parser.parse_expression(str(self.recursive)),
         verbose_val=parser.parse_expression(str(int(self.verbose))),
         strip_decorators_val=list_of_names(self._strip_decorators),
@@ -278,6 +288,11 @@ class ProgramContext(object):
     required_imports: str, containing an import statement on each line. These
       are all the imports necessary for the compiled code to run, in addition to
       the closures of each entity, which are attached dynamically.
+    partial_types: Tuple[Type], deprecated.
+    conversion_order: Tuple[Any], deprecated.
+    additional_symbols: Dict[str, Any], a map of new symbols that have been
+      created under this context, and need to be added to the namespace of the
+      generated code.
   """
 
   def __init__(
@@ -349,7 +364,7 @@ class ProgramContext(object):
     self.dependency_cache[original_entity] = converted_ast
 
 
-class EntityContext(object):
+class EntityContext(transformer.Context):
   """Tracks the conversion of a single entity.
 
   This object is mutable, and is updated during conversion. Not thread safe.
@@ -361,8 +376,8 @@ class EntityContext(object):
   """
 
   def __init__(self, namer, entity_info, program_ctx):
+    super(EntityContext, self).__init__(entity_info)
     self.namer = namer
-    self.info = entity_info
     self.program = program_ctx
 
 
@@ -374,8 +389,7 @@ class Base(transformer.Base):
   """
 
   def __init__(self, ctx):
-    super(Base, self).__init__(ctx.info)
-    self.ctx = ctx  # Keeping this short because it's used frequently.
+    super(Base, self).__init__(ctx)
 
     self._used = False
     self._ast_depth = 0
@@ -475,13 +489,13 @@ def standard_analysis(node, context, is_initial=False):
   # TODO(mdan): Don't return a node because it's modified by reference.
   graphs = cfg.build(node)
   node = qual_names.resolve(node)
-  node = activity.resolve(node, context.info, None)
-  node = reaching_definitions.resolve(node, context.info, graphs, AnnotatedDef)
-  node = liveness.resolve(node, context.info, graphs)
-  node = live_values.resolve(node, context.info, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, context.info)
+  node = activity.resolve(node, context, None)
+  node = reaching_definitions.resolve(node, context, graphs, AnnotatedDef)
+  node = liveness.resolve(node, context, graphs)
+  node = live_values.resolve(node, context, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, context)
   # This second call allows resolving first-order class attributes.
-  node = live_values.resolve(node, context.info, config.PYTHON_LITERALS)
+  node = live_values.resolve(node, context, config.PYTHON_LITERALS)
   if is_initial:
     anno.dup(
         node,
diff --git a/tensorflow/python/autograph/core/converter_test.py b/tensorflow/python/autograph/core/converter_test.py
index 864ea6c7d2b891cd1f21f4b1c83f66949cd6ab9b..4050878b929b097bc61169040368a4d56876e45e 100644
--- a/tensorflow/python/autograph/core/converter_test.py
+++ b/tensorflow/python/autograph/core/converter_test.py
@@ -23,7 +23,10 @@ import weakref
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import anno
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import templates
+from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.platform import test
 
 
@@ -31,7 +34,46 @@ class TestConverter(converter.Base):
   pass
 
 
-class ConversionOptionsTest(test.TestCase):
+class ConversionOptionsTest(converter_testing.TestCase):
+
+  def test_to_ast(self):
+    opts = converter.ConversionOptions()
+
+    namer = converter_testing.FakeNamer()
+    program_ctx = converter.ProgramContext(
+        options=opts,
+        partial_types=None,
+        autograph_module=None,
+        uncompiled_modules=())
+    entity_info = transformer.EntityInfo(
+        source_code='',
+        source_file='<fragment>',
+        namespace={},
+        arg_values=None,
+        arg_types={},
+        owner_type=None)
+    ctx = converter.EntityContext(namer, entity_info, program_ctx)
+    opts_ast = opts.to_ast(ctx)
+
+    template = '''
+    def test_fn():
+      return opts_ast
+    '''
+    opts_packed = templates.replace(template, opts_ast=opts_ast)
+
+    reparsed, _ = compiler.ast_to_object(opts_packed)
+    reparsed.__dict__['ag__'] = self.make_fake_mod(
+        'fake_ag', converter.ConversionOptions, converter.Feature)
+
+    reparsed_opts = reparsed.test_fn()
+
+    self.assertEqual(opts.recursive, reparsed_opts.recursive)
+    self.assertEqual(opts.verbose, reparsed_opts.verbose)
+    self.assertEqual(opts.force_conversion, reparsed_opts.force_conversion)
+    self.assertEqual(
+        opts.internal_convert_user_code,
+        reparsed_opts.internal_convert_user_code)
+    self.assertEqual(opts.optional_features, reparsed_opts.optional_features)
 
   def test_should_strip_weakrefs(self):
     def test_fn():
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index f1374081d3c6e0dd93c39d331c76404859b2f40a..56445dbd456eb07d5e3b5fec6a3da3023cd069f4 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -39,9 +39,7 @@ from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.platform import test
 
-
-def imported_decorator(f):
-  return lambda a: f(a) + 1
+RESULT_OF_MOCK_CONVERTED_CALL = 7
 
 
 # TODO(mdan): We should use the real namer here.
@@ -50,6 +48,7 @@ class FakeNamer(object):
 
   def __init__(self):
     self.i = 0
+    self.partial_types = ()
 
   def new_symbol(self, name_root, used):
     while True:
@@ -95,8 +94,8 @@ class TestCase(test.TestCase):
     self.dynamic_calls = []
     def converted_call(*args):
       """Mock version of api.converted_call."""
-      self.dynamic_calls.append(args)
-      return 7
+      self.dynamic_calls.append(args[3:])  # args only; see api.converted_call
+      return RESULT_OF_MOCK_CONVERTED_CALL
 
     try:
       result, source = compiler.ast_to_object(node, include_source_map=True)
@@ -107,11 +106,13 @@ class TestCase(test.TestCase):
                                    converter.ConversionOptions)
       fake_ag.__dict__.update(operators.__dict__)
       fake_ag.__dict__.update(special_functions.__dict__)
-      fake_ag.__dict__['utils'] = utils
-      fake_ag.__dict__['rewrite_graph_construction_error'] = (
+      fake_ag.ConversionOptions = converter.ConversionOptions
+      fake_ag.Feature = converter.Feature
+      fake_ag.utils = utils
+      fake_ag.rewrite_graph_construction_error = (
           errors.rewrite_graph_construction_error)
-      fake_ag.__dict__['function_scope'] = function_wrapping.function_scope
-      result.__dict__['ag__'] = fake_ag
+      fake_ag.function_scope = function_wrapping.function_scope
+      result.ag__ = fake_ag
       for k, v in namespace.items():
         result.__dict__[k] = v
       yield result
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index b8d79daebaa6d6dcf5f324f637a3b496f3742b92..245795c3d2e1c8c33f7de6ee01e17f43433bd410 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -18,8 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import enum
+
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import qual_names
+from tensorflow.python.autograph.utils import misc
+
+
+class _NamingStyle(enum.Enum):
+  SNAKE = 1
+  CAMEL = 2
 
 
 class Namer(object):
@@ -46,17 +54,52 @@ class Namer(object):
 
     self.generated_names = set()
 
+  def _as_symbol_name(self, fqn, style=_NamingStyle.SNAKE):
+    """Returns a symbol name that matches a fully-qualified name.
+
+    The returned name is safe to use for Python symbols. Any special characters
+    present in fqn are replaced according to the style argument.
+
+    Examples:
+
+      self._as_symbol_name('foo.bar', style=_NamingStyle.CAMEL) == 'FooBar'
+      self._as_symbol_name('foo.bar', style=_NamingStyle.SNAKE) == 'foo_bar'
+
+    See the unit tests for more examples.
+
+    Args:
+      fqn: Union[Text, Tuple[Text]] a fully-qualified symbol name. The qualifier
+        may include module, class names, attributes, etc.
+      style: _NamingStyle
+    Returns:
+      Text
+    """
+    assert style in _NamingStyle
+
+    if isinstance(fqn, tuple):
+      cn = '.'.join(fqn)
+    else:
+      cn = fqn
+
+    # Until we clean up the whole FQN mechanism, `fqn` may not be
+    # canonical, that is, in can appear as ('foo.bar', 'baz')
+    # This replaces any characters that might remain because of that.
+    pieces = cn.split('.')
+
+    if style == _NamingStyle.CAMEL:
+      pieces = tuple(misc.capitalize_initial(p) for p in pieces)
+      return ''.join(pieces)
+    elif style == _NamingStyle.SNAKE:
+      return '_'.join(pieces)
+
   def compiled_class_name(self, original_fqn, live_entity=None):
     """See call_trees.FunctionNamer.compiled_class_name."""
     if live_entity is not None and live_entity in self.renamed_calls:
       return self.renamed_calls[live_entity]
 
-    if isinstance(original_fqn, tuple):
-      original_name = '__'.join(original_fqn)
-    else:
-      original_name = original_fqn
-
-    new_name_root = 'Tf%s' % original_name
+    canonical_name = self._as_symbol_name(
+        original_fqn, style=_NamingStyle.CAMEL)
+    new_name_root = 'Tf%s' % canonical_name
     new_name = new_name_root
     n = 0
     while new_name in self.global_namespace:
@@ -73,7 +116,6 @@ class Namer(object):
                              live_entity=None,
                              owner_type=None):
     """See call_trees.FunctionNamer.compiled_function_name."""
-
     if not self.recursive:
       return None, False
 
@@ -84,15 +126,12 @@ class Namer(object):
       # Members are not renamed when part of an entire converted class.
       return None, False
 
-    if isinstance(original_fqn, tuple):
-      original_name = '__'.join(original_fqn)
-    else:
-      original_name = original_fqn
-
     if live_entity is not None and live_entity in self.renamed_calls:
       return self.renamed_calls[live_entity], True
 
-    new_name_root = 'tf__%s' % original_name
+    canonical_name = self._as_symbol_name(
+        original_fqn, style=_NamingStyle.SNAKE)
+    new_name_root = 'tf__%s' % canonical_name
     new_name = new_name_root
     n = 0
     while new_name in self.global_namespace:
diff --git a/tensorflow/python/autograph/core/naming_test.py b/tensorflow/python/autograph/core/naming_test.py
index 2db98836d1e3bce73aacd736867c96d4d19390d2..cc8c4314a700ac43ff5d21ad32706a0c3d5be0f5 100644
--- a/tensorflow/python/autograph/core/naming_test.py
+++ b/tensorflow/python/autograph/core/naming_test.py
@@ -45,6 +45,22 @@ class NamerTest(test.TestCase):
     self.assertEqual(('tf__foo', True), namer.compiled_function_name(
         'foo', foo))
 
+  def test_compiled_function_name_unsanitized_fqn(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual(('tf__foo_bar', True),
+                     namer.compiled_function_name('foo.bar'))
+    self.assertEqual(('tf__foo_bar_baz', True), namer.compiled_function_name(
+        ('foo.bar', 'baz')))
+
+  def test_compiled_class_name_basic(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual('TfFooBar', namer.compiled_class_name(('foo', 'Bar')))
+
+  def test_compiled_class_name_unsanitized_fqn(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual('TfFooBarBaz',
+                     namer.compiled_class_name(('foo.bar', 'Baz')))
+
   def test_compiled_function_name_avoids_global_conflicts(self):
     def foo():
       pass
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index 201a88875413982b0f1a791f3408b403a3259eb8..66f7915696ec400675810b8b954e6812294f0760 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 filegroup(
     name = "all_files",
@@ -37,25 +37,23 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "api_test",
     srcs = ["api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":impl",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/autograph/utils",
-        "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":impl",
-        "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
+        "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index b1c16b116945ebc71885c2ab83e5eadb65981b79..122ea1726b60c641d702fe737db524205b70e389 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -35,9 +35,9 @@ from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.utils import py_func
-from tensorflow.python.data.util import nest
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -263,7 +263,7 @@ def converted_call(f, owner, options, *args, **kwargs):
     partial_types = (f.__class__,)
 
   else:
-    NotImplementedError('unknown callable type "%s"' % type(f))
+    raise NotImplementedError('unknown callable type "%s"' % type(f))
 
   arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
   arg_types = {}
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 0ca84b1f7a488e28f1900cb3ba76577814562094..1cd5671eae7b5ddd40955df77b72b54f12df9b6e 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -33,7 +33,6 @@ from tensorflow.python.autograph.converters import call_trees
 from tensorflow.python.autograph.converters import conditional_expressions
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.converters import control_flow
-from tensorflow.python.autograph.converters import decorators
 from tensorflow.python.autograph.converters import directives
 from tensorflow.python.autograph.converters import error_handlers
 from tensorflow.python.autograph.converters import function_scopes
@@ -44,18 +43,19 @@ from tensorflow.python.autograph.converters import side_effect_guards
 from tensorflow.python.autograph.converters import slices
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.core import errors
+from tensorflow.python.autograph.core import errors as ag_errors
 from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -81,23 +81,27 @@ def is_whitelisted_for_graph(o):
   else:
     m = tf_inspect.getmodule(o)
   if not hasattr(m, '__name__'):
-    logging.vlog(1, '%s is NOT whitelisted for graph: unknown module name', o)
+    # Note: typically it's builtins that fall in this category. Builtins will
+    # be handled by specific code that follows this screening layer.
+    logging.log(2, '%s is NOT whitelisted: unknown module name', o)
     return False
 
   for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
     if m.__name__.startswith(prefix):
-      logging.vlog(1, '%s is whitelisted: name starts with "%s"', o, prefix)
+      logging.log(2, '%s is whitelisted: name starts with "%s"', o, prefix)
       return True
 
-  if hasattr(o, 'autograph_info__'):
+  if hasattr(o, 'autograph_info__') or hasattr(o, '__ag_compiled'):
+    logging.log(2, '%s is whitelisted: already converted', o)
     return True
 
   if (not inspect_utils.isweakrefself(o) and not tf_inspect.isclass(o) and
       hasattr(o, '__call__') and hasattr(o, '__class__')):
     # Callable objects: whitelisted if their __call__ method is.
-    retval = is_whitelisted_for_graph(o.__call__)
-    logging.vlog(1, '%s is whitelisted: object __call__ whitelisted', o)
-    return retval
+    call_whitelisted = is_whitelisted_for_graph(o.__call__)
+    if call_whitelisted:
+      logging.log(2, '%s is whitelisted: object __call__ whitelisted', o)
+      return call_whitelisted
 
   if tf_inspect.ismethod(o):
     # Methods of whitelisted classes are also whitelisted, even if they are
@@ -119,8 +123,8 @@ def is_whitelisted_for_graph(o):
     if owner_class is not None:
       owner_class = inspect_utils.getdefiningclass(o, owner_class)
       if is_whitelisted_for_graph(owner_class):
-        logging.vlog(1, '%s is whitelisted: owner is whitelisted %s', o,
-                     owner_class)
+        logging.log(2, '%s is whitelisted: owner is whitelisted %s', o,
+                    owner_class)
         return True
 
   if inspect_utils.isnamedtuple(o):
@@ -128,14 +132,13 @@ def is_whitelisted_for_graph(o):
     # because they don't expose source code. But we assume they are safe for
     # graph mode since they are just containers.
     if tf_inspect.isclass(o) and len(o.__bases__) > 1:
-      logging.log_first_n(
-          logging.level_warning(),
+      logging.warn_first_n(
           'Entity {} looks like a namedtuple subclass. If it has any custom'
           ' methods, they will not be converted by AutoGraph.'.format(o), 1)
-    logging.vlog(1, '%s is whitelisted: named tuple', o)
+    logging.log(2, '%s is whitelisted: named tuple', o)
     return True
 
-  logging.vlog(1, '%s is NOT whitelisted for graph', o)
+  logging.log(2, '%s is NOT whitelisted', o)
   return False
 
 
@@ -167,7 +170,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   Raises:
     ValueError: if the entity type is not supported.
   """
-  logging.vlog(logging.DEBUG, 'Converting %s', o)
+  logging.log(1, 'Converting %s', o)
 
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, program_ctx)
@@ -201,9 +204,9 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
 
   program_ctx.add_to_cache(o, node)
 
-  if logging.get_verbosity() <= logging.DEBUG:
-    logging.vlog(logging.DEBUG, 'Compiled output of %s:\n\n%s\n', o,
-                 compiler.ast_to_source(node))
+  if logging.has_verbosity(2):
+    logging.log(2, 'Compiled output of %s:\n\n%s\n', o,
+                compiler.ast_to_source(node))
 
   if program_ctx.options.recursive:
     while True:
@@ -315,10 +318,12 @@ def _add_self_references(namespace, autograph_module):
     # internal modules.
     ag_internal = imp.new_module('autograph')
     ag_internal.__dict__.update(autograph_module.__dict__)
+    ag_internal.ConversionOptions = converter.ConversionOptions
+    ag_internal.Feature = converter.Feature
     ag_internal.utils = utils
     ag_internal.function_scope = function_wrapping.function_scope
     ag_internal.rewrite_graph_construction_error = (
-        errors.rewrite_graph_construction_error)
+        ag_errors.rewrite_graph_construction_error)
     # TODO(mdan): Add safeguards against name clashes.
     # We don't want to create a submodule because we want the operators to be
     # accessible as ag__.<operator>
@@ -336,6 +341,7 @@ def function_to_graph(f,
   """Specialization of `entity_to_graph` for callable functions."""
 
   node, source = parser.parse_entity(f)
+  logging.log(3, 'Source code of %s:\n%s', f, source)
   node = node.body[0]
 
   # In general, the output of inspect.getsource is inexact because it uses
@@ -373,7 +379,12 @@ def function_to_graph(f,
       arg_types=arg_types,
       owner_type=owner_type)
   context = converter.EntityContext(namer, entity_info, program_ctx)
-  node = node_to_graph(node, context)
+  try:
+    node = node_to_graph(node, context)
+  except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
+    logging.error(1, 'Error converting %s', f, exc_info=True)
+    raise errors.InternalError('conversion', e)
+    # TODO(mdan): Catch and rethrow syntax errors.
 
   if isinstance(node, gast.Lambda):
     new_name = namer.new_symbol('tf__lambda', ())
@@ -416,9 +427,6 @@ def node_to_graph(node, context):
   # source.
   # TODO(mdan): Is it feasible to reconstruct intermediate source code?
   context.info.source_code = None
-
-  if context.program.options.uses(converter.Feature.DECORATORS):
-    node = converter.apply_(node, context, decorators)
   node = converter.apply_(node, context, arg_defaults)
   node = converter.apply_(node, context, directives)
   node = converter.apply_(node, context, break_statements)
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index 9a4fbdad8c1994d8c8cc534b6e0b4af45f5c4c80..cd893e3ff14eaa53633b60963923fe7ebd10823b 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -96,7 +96,6 @@ class ConversionTest(test.TestCase):
     f_node = program_ctx.dependency_cache[f][0]
     g_node = program_ctx.dependency_cache[g][0]
     self.assertEqual('tf__f', f_node.name)
-    self.assertEqual('tf__g', f_node.body[0].body[0].body[0].value.func.id)
     self.assertEqual('tf__g', g_node.name)
 
   def test_entity_to_graph_class_hierarchy(self):
diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD
index aedb901845b97bbee5918902875b5023a8604dcd..21a66c86b79e2116319bb240b138c6757484c6e0 100644
--- a/tensorflow/python/autograph/operators/BUILD
+++ b/tensorflow/python/autograph/operators/BUILD
@@ -38,6 +38,7 @@ py_library(
         "//tensorflow/python:list_ops",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
         "//tensorflow/python/autograph/utils",
         "//tensorflow/python/data/ops:dataset_ops",
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 7a580fe32475cbc32f20a1196c075fbf7f981d27..35f8028c295550443b98ca430d459967e03a6edf 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -52,6 +52,7 @@ from tensorflow.python.autograph.operators.logical import eq
 from tensorflow.python.autograph.operators.logical import gt
 from tensorflow.python.autograph.operators.logical import gt_e
 from tensorflow.python.autograph.operators.logical import in_
+from tensorflow.python.autograph.operators.logical import invert
 from tensorflow.python.autograph.operators.logical import is_
 from tensorflow.python.autograph.operators.logical import is_not
 from tensorflow.python.autograph.operators.logical import lt
@@ -60,6 +61,7 @@ from tensorflow.python.autograph.operators.logical import not_
 from tensorflow.python.autograph.operators.logical import not_eq
 from tensorflow.python.autograph.operators.logical import not_in
 from tensorflow.python.autograph.operators.logical import or_
+from tensorflow.python.autograph.operators.logical import u_add
 from tensorflow.python.autograph.operators.logical import u_sub
 from tensorflow.python.autograph.operators.py_builtins import float_
 from tensorflow.python.autograph.operators.py_builtins import int_
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index afa3787d4277985285d5dc8b3e1531a00460076b..f046000720cae8282218cd6bfd3853dac1cdd3dc 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -23,6 +23,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.util import nest
 
 
 def for_stmt(iter_, extra_test, body, init_state):
@@ -73,10 +74,6 @@ def _py_for_stmt(iter_, extra_test, body, init_state):
     if not extra_test(*state):
       break
     state = body(target, *state)
-
-  # TODO(mdan): Remove this special case.
-  if len(state) == 1:
-    return state[0]
   return state
 
 
@@ -87,10 +84,12 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
   def while_body(iterate_index, *state):
     iterate = iter_[iterate_index]
     new_state = body(iterate, *state)
+
+    state = (iterate_index + 1,)
     if new_state:
-      return (iterate_index + 1,) + new_state
-    else:
-      return iterate_index + 1
+      state += new_state
+
+    return state
 
   def while_cond(iterate_index, *state):
     return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
@@ -108,9 +107,6 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
     assert len(results) >= 1  # Has at least the iterate.
     if len(results) > 1:
       results = results[1:]
-    if len(results) == 1:
-      # TODO(mdan): Remove this special case.
-      results, = results
   else:
     results = ()
 
@@ -129,9 +125,6 @@ def _dataset_for_stmt(ds, extra_test, body, init_state):
 
   results = ds.reduce(init_state, reduce_body)
 
-  # TODO(mdan): Remove this special case.
-  if len(results) == 1:
-    return results[0]
   return results
 
 
@@ -160,7 +153,8 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
   # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
   # That could be something as simple as a collection of dispatch rules, with
   # some prioritization.
-  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+  if any(tensor_util.is_tensor(v)
+         for v in nest.flatten(init_state + extra_deps)):
     return _tf_while_stmt(test, body, init_state, opts)
   else:
     return _py_while_stmt(test, body, init_state, opts)
@@ -170,7 +164,13 @@ def _tf_while_stmt(test, body, init_state, opts):
   """Overload of while_stmt that stages a TF while_stmt."""
   if opts is None:
     opts = {}
-  return control_flow_ops.while_loop(test, body, init_state, **opts)
+
+  # Non-v2 while_loop unpacks the results when there is only one return value.
+  # This enforces consistency across versions.
+  opts['return_same_structure'] = True
+
+  retval = control_flow_ops.while_loop(test, body, init_state, **opts)
+  return retval
 
 
 def _py_while_stmt(test, body, init_state, opts):
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 0a7d4b64022f583bae4effc7d0f7eb04f46cc048..590b51820f831cf1ce1bb3ff9b98f66b6532f56d 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -36,7 +36,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((10,), self.evaluate(s))
 
   def test_python(self):
@@ -45,7 +45,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    self.assertEqual(10, s)
+    self.assertEqual((10,), s)
 
   @test_util.run_deprecated_v1
   def test_dataset(self):
@@ -55,7 +55,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((10,), self.evaluate(s))
 
 
@@ -65,18 +65,30 @@ class WhileLoopTest(test.TestCase):
   def test_tensor(self):
     n = constant_op.constant(5)
     results = control_flow.while_stmt(
-        test=lambda i, s: i < n,
-        body=lambda i, s: (i + 1, s + i,),
+        test=lambda i, sum: i < n,
+        body=lambda i, sum: (i + 1, sum + i,),
         init_state=(0, 0),
         extra_deps=(n,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((5, 10), self.evaluate(results))
 
+  @test_util.run_deprecated_v1
+  def test_tensor_dict_state(self):
+    n = 5
+    init_state = {'i': constant_op.constant(0), 'sum': constant_op.constant(0)}
+    results = control_flow.while_stmt(
+        test=lambda s: s['i'] < n,
+        body=lambda s: ({'i': s['i'] + 1, 'sum': s['sum'] + s['i']},),
+        init_state=(init_state,),
+        extra_deps=())
+    with self.cached_session():
+      self.assertEqual(({'i': 5, 'sum': 10},), self.evaluate(results))
+
   def test_python(self):
     n = 5
     results = control_flow.while_stmt(
-        test=lambda i, s: i < n,
-        body=lambda i, s: (i + 1, s + i),
+        test=lambda i, sum: i < n,
+        body=lambda i, sum: (i + 1, sum + i),
         init_state=(0, 0),
         extra_deps=(n,))
     self.assertEqual((5, 10), results)
@@ -93,7 +105,7 @@ class IfStmtTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_tensor(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       t = self.single_return_if_stmt(constant_op.constant(True))
       self.assertEqual(1, self.evaluate(t))
       t = self.single_return_if_stmt(constant_op.constant(False))
@@ -105,7 +117,7 @@ class IfStmtTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_tensor_multiple_returns(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       t = self.multi_return_if_stmt(constant_op.constant(True))
       self.assertAllEqual([1, 2], self.evaluate(t))
       t = self.multi_return_if_stmt(constant_op.constant(False))
diff --git a/tensorflow/python/autograph/operators/logical.py b/tensorflow/python/autograph/operators/logical.py
index 569db5b91bd7efb92ce2b8a8b8eb6eb773f4abcb..dadb0daf1ae22016d0cff2889472423149258ffb 100644
--- a/tensorflow/python/autograph/operators/logical.py
+++ b/tensorflow/python/autograph/operators/logical.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import operator
+
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -35,7 +37,7 @@ def and_(a, b):
   a_val = a()
   if tensor_util.is_tensor(a_val):
     return _tf_lazy_and(a_val, b)
-  return _py_lazy_and(a_val, b)
+  return a_val and b()
 
 
 def _tf_lazy_and(cond, b):
@@ -44,17 +46,12 @@ def _tf_lazy_and(cond, b):
   return control_flow_ops.cond(cond, b, lambda: cond)
 
 
-def _py_lazy_and(cond, b):
-  """Lazy-eval equivalent of "and" in Python."""
-  return cond and b()
-
-
 def or_(a, b):
   """Functional form of "or". Uses lazy evaluation semantics."""
   a_val = a()
   if tensor_util.is_tensor(a_val):
     return _tf_lazy_or(a_val, b)
-  return _py_lazy_or(a_val, b)
+  return a_val or b()
 
 
 def _tf_lazy_or(cond, b):
@@ -63,16 +60,11 @@ def _tf_lazy_or(cond, b):
   return control_flow_ops.cond(cond, lambda: cond, b)
 
 
-def _py_lazy_or(cond, b):
-  """Lazy-eval equivalent of "or" in Python."""
-  return cond or b()
-
-
 def eq(a, b):
   """Functional form of "equal"."""
   if tensor_util.is_tensor(a) or tensor_util.is_tensor(b):
     return _tf_equal(a, b)
-  return _py_equal(a, b)
+  return a == b
 
 
 def _tf_equal(a, b):
@@ -80,11 +72,6 @@ def _tf_equal(a, b):
   return gen_math_ops.equal(a, b)
 
 
-def _py_equal(a, b):
-  """Overload of "equal" that falls back to Python's default implementation."""
-  return a == b
-
-
 def not_eq(a, b):
   """Functional form of "not-equal"."""
   return not_(eq(a, b))
@@ -92,25 +79,8 @@ def not_eq(a, b):
 
 # Default implementation for the remainings.
 
-
-def gt(a, b):
-  """Functional form of "less-than"."""
-  return a > b
-
-
-def gt_e(a, b):
-  """Functional form of "less-than"."""
-  return a >= b
-
-
-def is_(a, b):
-  """Functional form of "less-than"."""
-  return a is b
-
-
-def is_not(a, b):
-  """Functional form of "less-than"."""
-  return a is not b
+is_ = operator.is_
+is_not = operator.is_not
 
 
 def in_(a, b):
@@ -119,21 +89,16 @@ def in_(a, b):
   return a in b
 
 
-def lt(a, b):
-  """Functional form of "less-than"."""
-  return a < b
-
-
-def lt_e(a, b):
-  """Functional form of "less-than"."""
-  return a <= b
-
-
 def not_in(a, b):
   """Functional form of "less-than"."""
   return a not in b
 
+gt = operator.gt
+gt_e = operator.ge
+lt = operator.lt
+lt_e = operator.le
+
 
-def u_sub(a):
-  """Functional form of "unary-sub"."""
-  return -a
+u_add = operator.pos
+u_sub = operator.neg
+invert = operator.invert
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index ddf05f73f37821c6ff7e246051cd82a560f370e3..fe9486ca1ed41ce55f2219b3771639eb081a6afe 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 
 
-UNDEFINED = object()
+UNSPECIFIED = object()
 
 
 def overload_of(f):
@@ -77,14 +77,14 @@ def _py_float(x):
   return float(x)
 
 
-def int_(x=0, base=UNDEFINED):
+def int_(x=0, base=UNSPECIFIED):
   if tensor_util.is_tensor(x):
     return _tf_int(x, base)
   return _py_int(x, base)
 
 
 def _tf_int(x, base):
-  if base not in (10, UNDEFINED):
+  if base not in (10, UNSPECIFIED):
     raise NotImplementedError('base {} not supported for int'.format(base))
 
   # TODO(mdan): We shouldn't assume int32.
@@ -94,7 +94,7 @@ def _tf_int(x, base):
 
 
 def _py_int(x, base):
-  if base is UNDEFINED:
+  if base is UNSPECIFIED:
     return int(x)
   return int(x, base)
 
@@ -155,19 +155,28 @@ def _py_len(s):
 
 
 def print_(*objects, **kwargs):
+  """Overload of the print builtin."""
   # Note: Python 2.6 doesn't support explicit keywords after starargs.
   unknown_kwargs = tuple(
       set(kwargs.keys()) - set(('sep', 'end', 'file', 'flush')))
   if unknown_kwargs:
     raise ValueError('invalid keyword arguments: {}'.format(unknown_kwargs))
 
-  # TODO(mdan): use logging_ops.Print when py_func is not supported.
-  return _tf_py_func_print(objects, kwargs)
+  # TODO(mdan): Use next.flatten(objects) instead?
+  if any(tensor_util.is_tensor(o) for o in objects):
+    # TODO(mdan): use tf.print instead.
+    return _tf_py_func_print(objects, kwargs)
+  else:
+    _py_print(*objects, **kwargs)
+
+
+def _py_print(*objects, **kwargs):
+  print(*objects, **kwargs)
 
 
 def _tf_py_func_print(objects, kwargs):
   """Overload of print_ as a py_func implementation."""
-  override_kwargs = {k: v for k, v in kwargs.items() if v is not UNDEFINED}
+  override_kwargs = {k: v for k, v in kwargs.items() if v is not UNSPECIFIED}
   if 'flush' not in override_kwargs:
     # Defaulting to flushing the console in graph mode, which helps reduce
     # garbled output in IPython.
@@ -187,7 +196,7 @@ def _tf_py_func_print(objects, kwargs):
       print_wrapper, None, objects, use_dummy_return=True)
 
 
-def range_(start_or_stop, stop=UNDEFINED, step=UNDEFINED):
+def range_(start_or_stop, stop=UNSPECIFIED, step=UNSPECIFIED):
   if any(tensor_util.is_tensor(s) for s in (start_or_stop, stop, step)):
     return _tf_range(start_or_stop, stop, step)
   return _py_range(start_or_stop, stop, step)
@@ -200,10 +209,10 @@ def _tf_range(start_or_stop, stop, step):
   # graph construction error aligns the semantics with Python.
 
   # TODO(mdan): We should optimize this when a full tensor is not required.
-  if step is not UNDEFINED:
+  if step is not UNSPECIFIED:
     # TODO(mdan): Add argument coercion similar to other cases.
     return math_ops.range(start_or_stop, stop, step)
-  if stop is not UNDEFINED:
+  if stop is not UNSPECIFIED:
     stop = math_ops.maximum(start_or_stop, stop)
     return math_ops.range(start_or_stop, stop)
   start_or_stop = math_ops.maximum(start_or_stop, 0)
@@ -211,9 +220,9 @@ def _tf_range(start_or_stop, stop, step):
 
 
 def _py_range(start_or_stop, stop, step):
-  if step is not UNDEFINED:
+  if step is not UNSPECIFIED:
     return range(start_or_stop, stop, step)
-  if stop is not UNDEFINED:
+  if stop is not UNSPECIFIED:
     return range(start_or_stop, stop)
   return range(start_or_stop)
 
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index ba8ec271394981ec878473205a8dbbd19d255f3b..e6bff2d719fa9921a42d0e57453af5f727a740e8 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -24,6 +24,7 @@ py_library(
         "ast_util.py",
         "cfg.py",
         "compiler.py",
+        "errors.py",
         "inspect_utils.py",
         "origin_info.py",
         "parser.py",
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf.py b/tensorflow/python/autograph/pyct/common_transformers/anf.py
index 192621b1cd329acec56c9517f3c885ee622b62e9..246c26833f0c30c757526209b710ef6df90eebf0 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf.py
@@ -36,10 +36,10 @@ from tensorflow.python.autograph.pyct import transformer
 class DummyGensym(object):
   """A dumb gensym that suffixes a stem by sequential numbers from 1000."""
 
-  def __init__(self, entity_info):
-    del entity_info
+  def __init__(self, ctx):
+    del ctx
     # A proper implementation needs to account for:
-    #   * entity_info.namespace
+    #   * ctx.info.namespace
     #   * all the symbols defined in the AST
     #   * the symbols generated so far
     self._idx = 0
@@ -68,19 +68,19 @@ class AnfTransformer(transformer.Base):
   # processing the `body` and the `orelse` need to be kept together with them,
   # and not accidentally lifted out of the `if`.
 
-  def __init__(self, entity_info, gensym_source=None):
+  def __init__(self, ctx, gensym_source=None):
     """Creates an ANF transformer.
 
     Args:
-      entity_info: transformer.EntityInfo
+      ctx: transformer.Context
       gensym_source: An optional object with the same interface as `DummyGensym`
         for generating unique names
     """
-    super(AnfTransformer, self).__init__(entity_info)
+    super(AnfTransformer, self).__init__(ctx)
     if gensym_source is None:
-      self._gensym = DummyGensym(entity_info)
+      self._gensym = DummyGensym(ctx)
     else:
-      self._gensym = gensym_source(entity_info)
+      self._gensym = gensym_source(ctx)
     self._pending_statements = []
 
   def _consume_pending_statements(self):
@@ -406,7 +406,7 @@ class AnfTransformer(transformer.Base):
     return node
 
 
-def transform(node, entity_info, gensym_source=None):
+def transform(node, ctx, gensym_source=None):
   """Converts the given node to A-normal form (ANF).
 
   The general idea of A-normal form: https://en.wikipedia.org/wiki/A-normal_form
@@ -416,9 +416,9 @@ def transform(node, entity_info, gensym_source=None):
 
   Args:
     node: The node to transform.
-    entity_info: transformer.EntityInfo.  TODO(mdan): What information does this
+    ctx: transformer.EntityInfo.  TODO(mdan): What information does this
       argument provide?
     gensym_source: An optional object with the same interface as `DummyGensym`
       for generating unique names.
   """
-  return AnfTransformer(entity_info, gensym_source=gensym_source).visit(node)
+  return AnfTransformer(ctx, gensym_source=gensym_source).visit(node)
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
index 525d4886dee37c79d4087a293fa9ce5424a74c15..58663d21ff2626a6bad9f892263b8c721d82d004 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
@@ -30,10 +30,10 @@ from tensorflow.python.platform import test
 class DummyGensym(object):
   """A dumb gensym that suffixes a stem by sequential numbers from 1000."""
 
-  def __init__(self, entity_info):
-    del entity_info
+  def __init__(self, ctx):
+    del ctx
     # A proper implementation needs to account for:
-    #   * entity_info.namespace
+    #   * ctx.info.namespace
     #   * all the symbols defined in the AST
     #   * the symbols generated so far
     self._idx = 0
@@ -68,21 +68,22 @@ def exec_expected_result():
 
 class AnfTransformerTest(test.TestCase):
 
-  def _simple_source_info(self):
-    return transformer.EntityInfo(
+  def _simple_context(self):
+    entity_info = transformer.EntityInfo(
         source_code=None,
         source_file=None,
         namespace=None,
         arg_values=None,
         arg_types=None,
         owner_type=None)
+    return transformer.Context(entity_info)
 
   def test_basic(self):
     def test_function():
       a = 0
       return a
     node, _ = parser.parse_entity(test_function)
-    node = anf.transform(node.body[0], self._simple_source_info())
+    node = anf.transform(node.body[0], self._simple_context())
     result, _ = compiler.ast_to_object(node)
     self.assertEqual(test_function(), result.test_function())
 
@@ -100,7 +101,7 @@ class AnfTransformerTest(test.TestCase):
     exp_node, _ = parser.parse_entity(expected_fn)
     node, _ = parser.parse_entity(test_fn)
     node = anf.transform(
-        node, self._simple_source_info(), gensym_source=DummyGensym)
+        node, self._simple_context(), gensym_source=DummyGensym)
     exp_name = exp_node.body[0].name
     # Ignoring the function names in the result because they can't be
     # the same (because both functions have to exist in the same scope
@@ -109,7 +110,7 @@ class AnfTransformerTest(test.TestCase):
     self.assert_same_ast(exp_node, node)
     # Check that ANF is idempotent
     node_repeated = anf.transform(
-        node, self._simple_source_info(), gensym_source=DummyGensym)
+        node, self._simple_context(), gensym_source=DummyGensym)
     self.assert_same_ast(node_repeated, node)
 
   def test_binop_basic(self):
diff --git a/tensorflow/python/autograph/pyct/errors.py b/tensorflow/python/autograph/pyct/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fc9c168c0a5a0fa05b6b6c041b41cda6da8cf78
--- /dev/null
+++ b/tensorflow/python/autograph/pyct/errors.py
@@ -0,0 +1,37 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code transformation exceptions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class AutoGraphError(Exception):
+  pass
+
+
+class InternalError(AutoGraphError):
+
+  def __init__(self, message, original_exc):
+    super(InternalError, self).__init__()
+    self.message = message
+    self.original_exc = original_exc
+
+  def __str__(self):
+    return '{} during {}: {}'.format(
+        type(self.original_exc).__name__, self.message, self.original_exc)
+
+
diff --git a/tensorflow/python/autograph/pyct/origin_info.py b/tensorflow/python/autograph/pyct/origin_info.py
index 102bd42c91ca8189355fe39d014521151c0a6377..230468eab98fcdc91a49332f4f8c878e4784b627 100644
--- a/tensorflow/python/autograph/pyct/origin_info.py
+++ b/tensorflow/python/autograph/pyct/origin_info.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import os
 import tokenize
 
 import gast
@@ -73,6 +74,11 @@ class OriginInfo(
     return (self.loc.filename, self.loc.lineno, self.function_name,
             self.source_code_line)
 
+  def __repr__(self):
+    return '{}:{}:{}'.format(
+        os.path.split(self.loc.filename)[1], self.loc.lineno,
+        self.loc.col_offset)
+
 
 # TODO(mdan): This source map should be a class - easier to refer to.
 def create_source_map(nodes, code, filename, indices_in_code):
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index d04a40157e7ef59c887b2e3af0870ab087fd93d0..a14966e470a14e1d57e0e1d9b0420bd13cad2267 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -31,9 +31,18 @@ from tensorflow.python.util import tf_inspect
 
 def parse_entity(entity):
   """Returns the AST of given entity."""
-  source = tf_inspect.getsource(entity)
-
-  def fail(comment):
+  try:
+    source = tf_inspect.getsource(entity)
+  except (IOError, OSError) as e:
+    raise ValueError(
+        'Unable to locate the source code of {}. Note that functions defined'
+        ' in certain environments, like the interactive Python shell do not'
+        ' expose their source code. If that is the case, you should to define'
+        ' them in a .py source file. If you are certain the code is'
+        ' graph-compatible, wrap the call using'
+        ' @tf.autograph.do_not_convert. Original error: {}'.format(entity, e))
+
+  def raise_parse_failure(comment):
     raise ValueError(
         'Failed to parse source code of {}, which Python reported as:\n{}\n'
         '{}'.format(entity, source, comment))
@@ -49,8 +58,9 @@ def parse_entity(entity):
   except IndentationError:
     # The text below lists the causes of this error known to us. There may
     # be more.
-    fail('This may be caused by multiline strings or comments not indented at'
-         'the same level as the code.')
+    raise_parse_failure(
+        'This may be caused by multiline strings or comments not indented at'
+        ' the same level as the code.')
 
   except SyntaxError as e:
     if not tf_inspect.isfunction(entity) or entity.__name__ != '<lambda>':
@@ -71,8 +81,9 @@ def parse_entity(entity):
 
     # Give up if there's nothing we can chip away.
     if len(lines) == lineno and len(lines[-1]) == offset:
-      fail('If this is a lambda function, the error may be avoided by creating'
-           ' the lambda in a standalone statement.')
+      raise_parse_failure(
+          'If this is a lambda function, the error may be avoided by creating'
+          ' the lambda in a standalone statement.')
 
     # Drop all lines following the error location
     # TODO(mdan): What's with the pylint errors?
@@ -84,9 +95,10 @@ def parse_entity(entity):
     try:
       return parse_str(new_source), new_source
     except SyntaxError as e:
-      fail('If this is a lambda function, the error may be avoided by creating'
-           ' the lambda in a standalone statement. Tried to strip down the'
-           ' source to:\n{}\nBut that did not work.'.format(new_source))
+      raise_parse_failure(
+          'If this is a lambda function, the error may be avoided by creating'
+          ' the lambda in a standalone statement. Tried to strip down the'
+          ' source to:\n{}\nBut that did not work.'.format(new_source))
 
 
 def parse_str(src):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
index 997d9a8aff111dfb0c223840da642ce8b2f138ce..595e95bed98f88b19a68c5ceb4ce1e2156e2b27d 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
@@ -121,7 +121,8 @@ class ActivityAnalyzerTest(test.TestCase):
         arg_types=None,
         owner_type=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     return node, entity_info
 
   def assertSymbolSetsAre(self, expected, actual, name):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values.py b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
index e8e3d229bea4bb505d58cdae24de87377b1b50e6..eca4571d38977905cc51387e47ee9a7d763f6703 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
@@ -39,7 +39,8 @@ class LiveValueResolver(transformer.Base):
 
   def visit_ClassDef(self, node):
     self.generic_visit(node)
-    anno.setanno(node, 'live_val', self.entity_info.namespace[node.name])
+    anno.setanno(
+        node, 'live_val', self.ctx.info.namespace[node.name])
     return node
 
   def visit_Name(self, node):
@@ -53,8 +54,8 @@ class LiveValueResolver(transformer.Base):
       if not is_defined:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
-        elif node.id in self.entity_info.namespace:
-          obj = self.entity_info.namespace[node.id]
+        elif node.id in self.ctx.info.namespace:
+          obj = self.ctx.info.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
           if hasattr(obj, '__name__'):
             anno.setanno(node, 'fqn', (obj.__name__,))
@@ -86,8 +87,8 @@ class LiveValueResolver(transformer.Base):
         def_, = defs
         # Note: param_of is a weakref.
         if def_.param_of and def_.param_of() is self.enclosing_entities[0]:
-          if node.id in self.entity_info.arg_values:
-            obj = self.entity_info.arg_values[node.id]
+          if node.id in self.ctx.info.arg_values:
+            obj = self.ctx.info.arg_values[node.id]
             anno.setanno(node, 'live_val', obj)
             anno.setanno(node, 'fqn', (obj.__class__.__name__,))
     return node
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
index 882c380b7888250560e0bf69ca44c3e7f4264979..a8d4e25e3c6f221ad13cb62ebadb54b8c86e665c 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
@@ -51,12 +51,13 @@ class LiveValuesResolverTest(test.TestCase):
         owner_type=None)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
-    node = activity.resolve(node, entity_info)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
-    node = live_values.resolve(node, entity_info, literals)
-    node = type_info.resolve(node, entity_info)
-    node = live_values.resolve(node, entity_info, literals)
+    node = live_values.resolve(node, ctx, literals)
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
     return node
 
   def test_literals(self):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
index 4366808d4962394b98cb3d939abed9666899a6d3..9738c6d0b4cce866dcf64f6f8937772428a8ef81 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
@@ -40,9 +40,10 @@ class LivenessTest(test.TestCase):
         arg_types=None,
         owner_type=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     graphs = cfg.build(node)
-    liveness.resolve(node, entity_info, graphs)
+    liveness.resolve(node, ctx, graphs)
     return node
 
   def assertHasLiveOut(self, node, expected):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
index 8c0d51850770e90c6755951e4ca5b01bb0987c51..fd91721c7458b4606562e183e64de88edfb62e53 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
@@ -40,9 +40,10 @@ class DefinitionInfoTest(test.TestCase):
         arg_types=None,
         owner_type=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     graphs = cfg.build(node)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
     return node
 
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info.py b/tensorflow/python/autograph/pyct/static_analysis/type_info.py
index edb2ef0e274c53136560ce508bfa862781e380b8..68a53661d3701960f56033edfb75fabc2a6d6956 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info.py
@@ -45,6 +45,7 @@ import gast
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
@@ -141,10 +142,11 @@ class TypeInfoResolver(transformer.Base):
     arg_name = str(qn)
     self.scope.setval(qn, arg_node)
     if (len(self.enclosing_entities) == 1 and
-        arg_name in self.entity_info.arg_types):
+        arg_name in self.ctx.info.arg_types):
       # Forge a node to hold the type information, so that method calls on
       # it can resolve the type.
-      type_string, type_obj = self.entity_info.arg_types[arg_name]
+      type_string, type_obj = self.ctx.info.arg_types[
+          arg_name]
       anno.setanno(arg_node, 'type', type_obj)
       anno.setanno(arg_node, 'type_fqn', tuple(type_string.split('.')))
 
@@ -177,7 +179,8 @@ class TypeInfoResolver(transformer.Base):
       func = value.func
       if anno.hasanno(func, 'live_val'):
         func_obj = anno.getanno(func, 'live_val')
-        if tf_inspect.isclass(func_obj):
+        if (tf_inspect.isclass(func_obj) and
+            not inspect_utils.isbuiltin(func_obj)):
           anno.setanno(value, 'is_constructor', True)
           anno.setanno(value, 'type', func_obj)
           anno.setanno(value, 'type_fqn', anno.getanno(func, 'fqn'))
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
index 34ba3d2f13889273ac9351b6194a46762a4ac39b..c6cf91e06207e739868282d1d0e7c2aa6cb51b62 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
@@ -72,12 +72,13 @@ class TypeInfoResolverTest(test.TestCase):
         owner_type=None)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
-    node = activity.resolve(node, entity_info)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
-    node = live_values.resolve(node, entity_info, {})
-    node = type_info.resolve(node, entity_info)
-    node = live_values.resolve(node, entity_info, {})
+    node = live_values.resolve(node, ctx, {})
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
     return node
 
   def test_constructor_detection(self):
@@ -88,11 +89,22 @@ class TypeInfoResolverTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn, {'training': training})
     call_node = node.body[0].body[0].value
+    self.assertTrue(anno.getanno(call_node, 'is_constructor'))
     self.assertEquals(training.GradientDescentOptimizer,
                       anno.getanno(call_node, 'type'))
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
                       anno.getanno(call_node, 'type_fqn'))
 
+  def test_constructor_detection_builtin_class(self):
+
+    def test_fn(x):
+      res = zip(x)
+      return res
+
+    node = self._parse_and_analyze(test_fn, {})
+    call_node = node.body[0].body[0].value
+    self.assertFalse(anno.hasanno(call_node, 'is_constructor'))
+
   def test_class_members_of_detected_constructor(self):
 
     def test_fn():
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 43279b3ca0111b8ea3860f1c467df1c602b3de74..0f0b8613ace6d733e950f4fccc55b5f0bc23ee39 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -108,6 +108,7 @@ class ReplaceTransformer(gast.NodeTransformer):
         anno.Basic.ORIGIN,
         anno.Basic.SKIP_PROCESSING,
         anno.Static.ORIG_DEFINITIONS,
+        'extra_test',
     }
 
   def _prepare_replacement(self, replaced, key):
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index b6830534b3dbf2e2815957b26d715d24dc002da7..f32d60938846b518feb2de40aaedd4a10aab2a88 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -18,10 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
 
 import gast
-import six
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import compiler
@@ -29,21 +27,48 @@ from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import templates
 
 
-class AutographParseError(SyntaxError):
-  pass
+class AutoGraphParseError(SyntaxError):
+  """Error for graph construction errors from AutoGraph generated code."""
+
+  def __init__(self, error, origin_info):
+    file_path = origin_info.loc.filename
+    line_number = origin_info.loc.lineno
+    col_offset = origin_info.loc.col_offset
+    source_line = origin_info.source_code_line
+    super(AutoGraphParseError, self).__init__(
+        error, (file_path, line_number, col_offset, source_line))
+
+
+# TODO(znado): Use namedtuple.
+class Context(object):
+  """Contains information about a source code transformation.
+
+  This object is mutable, and is updated during conversion. Not thread safe.
+
+  Attributes:
+    info: EntityInfo, immutable.
+    current_origin: origin_info.OriginInfo, holds the OriginInfo of the last
+      AST node to be processed successfully. Useful for error handling.
+  """
+
+  def __init__(self, info):
+    self.info = info
+    self.current_origin = None
 
 
 # TODO(mdan): Use namedtuple.
 class EntityInfo(object):
-  """Contains information about a Python entity. Immutable.
+  """Contains information about a Python entity.
+
+  Immutable.
 
   Examples of entities include functions and classes.
 
   Attributes:
     source_code: The entity's source code.
     source_file: The entity's source file.
-    namespace: Dict[str, ], containing symbols visible to the entity
-        (excluding parameters).
+    namespace: Dict[str, ], containing symbols visible to the entity (excluding
+      parameters).
     arg_values: dict[str->*], containing parameter values, if known.
     arg_types: dict[str->*], containing parameter types, if known.
     owner_type: The surrounding class type of the function, if present.
@@ -198,17 +223,17 @@ class Base(gast.NodeTransformer):
 
   # TODO(mdan): Document all extra features.
 
-  def __init__(self, entity_info):
-    """Initialize the transformer. Subclasses should call this.
+  def __init__(self, ctx):
+    """Initialize the transformer.
+
+    Subclasses should call this.
 
     Args:
-      entity_info: An EntityInfo object.
+      ctx: A Context object.
     """
-    self._current_origin = None
     self._lineno = 0
     self._col_offset = 0
-    # TODO(znado): remove this from the constructor of all Transformers.
-    self.entity_info = entity_info
+    self.ctx = ctx
     self._enclosing_entities = []
 
     # A stack that allows keeping mutable, scope-local state where scopes may be
@@ -232,13 +257,15 @@ class Base(gast.NodeTransformer):
     return len(self._local_scope_state)
 
   def enter_local_scope(self, inherit=None):
-    """Deprecated. Use self.state instead.
+    """Deprecated.
+
+    Use self.state instead.
 
     Marks entry into a new local scope.
 
     Args:
-      inherit: Optional enumerable of variable names to copy from the
-          parent scope.
+      inherit: Optional enumerable of variable names to copy from the parent
+        scope.
     """
     scope_entered = {}
     if inherit:
@@ -249,13 +276,15 @@ class Base(gast.NodeTransformer):
     self._local_scope_state.append(scope_entered)
 
   def exit_local_scope(self, keep=None):
-    """Deprecated. Use self.state instead.
+    """Deprecated.
+
+    Use self.state instead.
 
     Marks exit from the current local scope.
 
     Args:
-      keep: Optional enumerable of variable names to copy into the
-          parent scope.
+      keep: Optional enumerable of variable names to copy into the parent scope.
+
     Returns:
       A dict containing the scope that has just been exited.
     """
@@ -390,11 +419,11 @@ class Base(gast.NodeTransformer):
 
     Args:
       targets: list, tuple of or individual AST node. Should be used with the
-          targets field of an ast.Assign node.
+        targets field of an ast.Assign node.
       values: an AST node.
       apply_fn: a function of a single argument, which will be called with the
-          respective nodes of each single assignment. The signature is
-          apply_fn(target, value), no return value.
+        respective nodes of each single assignment. The signature is
+        apply_fn(target, value), no return value.
     """
     if not isinstance(targets, (list, tuple)):
       targets = (targets,)
@@ -429,75 +458,54 @@ class Base(gast.NodeTransformer):
       # call `visit`.  The error needs to be raised before the exception handler
       # below is installed, because said handler will mess up if `node` is not,
       # in fact, a node.
-      msg = (
-          'invalid value for "node": expected "ast.AST", got "{}"; to'
-          ' visit lists of nodes, use "visit_block" instead').format(type(node))
+      msg = ('invalid value for "node": expected "ast.AST", got "{}"; to'
+             ' visit lists of nodes, use "visit_block" instead').format(
+                 type(node))
       raise ValueError(msg)
 
     did_enter_function = False
     local_scope_size_at_entry = len(self._local_scope_state)
     processing_expr_node = False
 
-    try:
-      parent_origin = self._current_origin
-      if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
-        did_enter_function = True
-      elif isinstance(node, gast.Expr):
-        processing_expr_node = True
-
-      if did_enter_function:
-        self._enclosing_entities.append(node)
-
-      if anno.hasanno(node, anno.Basic.ORIGIN):
-        self._current_origin = anno.getanno(node, anno.Basic.ORIGIN)
-
-      if processing_expr_node:
-        entry_expr_value = node.value
-
-      if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
-        result = super(Base, self).visit(node)
-      self._current_origin = parent_origin
-
-      # Adjust for consistency: replacing the value of an Expr with
-      # an Assign node removes the need for the Expr node.
-      if processing_expr_node:
-        if isinstance(result, gast.Expr) and result.value != entry_expr_value:
-          # When the replacement is a list, it is assumed that the list came
-          # from a template that contained a number of statements, which
-          # themselves are standalone and don't require an enclosing Expr.
-          if isinstance(result.value,
-                        (list, tuple, gast.Assign, gast.AugAssign)):
-            result = result.value
-
-      # On exception, the local scope integrity is not guaranteed.
-      if did_enter_function:
-        self._enclosing_entities.pop()
-
-      if local_scope_size_at_entry != len(self._local_scope_state):
-        raise AssertionError(
-            'Inconsistent local scope stack. Before entering node %s, the'
-            ' stack had length %d, after exit it has length %d. This'
-            ' indicates enter_local_scope and exit_local_scope are not'
-            ' well paired.' % (node, local_scope_size_at_entry,
-                               len(self._local_scope_state)))
-      return result
-
-    except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
-      if not self._current_origin:
-        raise e
-      original_file_path = self._current_origin.loc.filename
-      original_line_number = self._current_origin.loc.lineno
-      original_col_offset = self._current_origin.loc.col_offset
-      original_source_line = self._current_origin.source_code_line
-      msg = '%s: %s.' % (e.__class__.__name__, str(e))
-
-      # TODO(mdan): Avoid the printing of the original exception.
-      # In other words, we need to find how to suppress the "During handling
-      # of the above exception, another exception occurred" message.
-      six.reraise(
-          AutographParseError,
-          AutographParseError(msg, (original_file_path, original_line_number,
-                                    original_col_offset, original_source_line)),
-          sys.exc_info()[2])
-    finally:
-      self._current_origin = parent_origin
+    parent_origin = self.ctx.current_origin
+    if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
+      did_enter_function = True
+    elif isinstance(node, gast.Expr):
+      processing_expr_node = True
+
+    if did_enter_function:
+      self._enclosing_entities.append(node)
+
+    if anno.hasanno(node, anno.Basic.ORIGIN):
+      self.ctx.current_origin = anno.getanno(node, anno.Basic.ORIGIN)
+
+    if processing_expr_node:
+      entry_expr_value = node.value
+
+    if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+      result = super(Base, self).visit(node)
+    self.ctx.current_origin = parent_origin
+
+    # Adjust for consistency: replacing the value of an Expr with
+    # an Assign node removes the need for the Expr node.
+    if processing_expr_node:
+      if isinstance(result, gast.Expr) and result.value != entry_expr_value:
+        # When the replacement is a list, it is assumed that the list came
+        # from a template that contained a number of statements, which
+        # themselves are standalone and don't require an enclosing Expr.
+        if isinstance(result.value,
+                      (list, tuple, gast.Assign, gast.AugAssign)):
+          result = result.value
+
+    # On exception, the local scope integrity is not guaranteed.
+    if did_enter_function:
+      self._enclosing_entities.pop()
+
+    if local_scope_size_at_entry != len(self._local_scope_state):
+      raise AssertionError(
+          'Inconsistent local scope stack. Before entering node %s, the'
+          ' stack had length %d, after exit it has length %d. This'
+          ' indicates enter_local_scope and exit_local_scope are not'
+          ' well paired.' % (node, local_scope_size_at_entry,
+                             len(self._local_scope_state)))
+    return result
diff --git a/tensorflow/python/autograph/pyct/transformer_test.py b/tensorflow/python/autograph/pyct/transformer_test.py
index 0c68d2a7648ccd3f44fb53db994bd0bb94a813eb..d97c1f0766a842f490d4874870441ad584ba22b2 100644
--- a/tensorflow/python/autograph/pyct/transformer_test.py
+++ b/tensorflow/python/autograph/pyct/transformer_test.py
@@ -28,14 +28,15 @@ from tensorflow.python.platform import test
 
 class TransformerTest(test.TestCase):
 
-  def _simple_source_info(self):
-    return transformer.EntityInfo(
+  def _simple_context(self):
+    entity_info = transformer.EntityInfo(
         source_code=None,
         source_file=None,
         namespace=None,
         arg_values=None,
         arg_types=None,
         owner_type=None)
+    return transformer.Context(entity_info)
 
   def test_entity_scope_tracking(self):
 
@@ -52,7 +53,7 @@ class TransformerTest(test.TestCase):
         anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
         return self.generic_visit(node)
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function():
       a = 0
@@ -126,7 +127,7 @@ class TransformerTest(test.TestCase):
         self.state[CondState].exit()
         return node
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function(a):
       a = 1
@@ -192,7 +193,7 @@ class TransformerTest(test.TestCase):
       def visit_For(self, node):
         return self._annotate_result(node)
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function(a):
       """Docstring."""
@@ -231,7 +232,7 @@ class TransformerTest(test.TestCase):
         self.exit_local_scope()
         return node
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def no_exit(a):
       if a > 0:
@@ -270,7 +271,7 @@ class TransformerTest(test.TestCase):
       z = y
       return z
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     node, _ = parser.parse_entity(test_function)
     node = tr.visit(node)
@@ -301,7 +302,7 @@ class TransformerTest(test.TestCase):
       if x > 0:
         return x
 
-    tr = BrokenTransformer(self._simple_source_info())
+    tr = BrokenTransformer(self._simple_context())
 
     node, _ = parser.parse_entity(test_function)
     with self.assertRaises(ValueError) as cm:
@@ -332,7 +333,7 @@ class TransformerTest(test.TestCase):
       if x > 0:
         return x
 
-    tr = BrokenTransformer(self._simple_source_info())
+    tr = BrokenTransformer(self._simple_context())
 
     node, _ = parser.parse_entity(test_function)
     with self.assertRaises(ValueError) as cm:
diff --git a/tensorflow/python/autograph/utils/BUILD b/tensorflow/python/autograph/utils/BUILD
index 790c661661dabab7c5e1d5dd097a60562c8cc358..f5e0dbf00bf5ce35ae049755b32b47d12e5c9960 100644
--- a/tensorflow/python/autograph/utils/BUILD
+++ b/tensorflow/python/autograph/utils/BUILD
@@ -20,6 +20,7 @@ py_library(
     name = "utils",
     srcs = [
         "__init__.py",
+        "ag_logging.py",
         "context_managers.py",
         "misc.py",
         "py_func.py",
@@ -33,7 +34,9 @@ py_library(
     deps = [
         "//tensorflow/python:dtypes",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/data/ops:dataset_ops",
         "@six_archive//:six",
diff --git a/tensorflow/python/autograph/utils/ag_logging.py b/tensorflow/python/autograph/utils/ag_logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc51e341f46cb74d07dff2c0d58a64966cd9661b
--- /dev/null
+++ b/tensorflow/python/autograph/utils/ag_logging.py
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logging and debugging utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# TODO(mdan): Use a custom logger class.
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+VERBOSITY_VAR_NAME = 'AUTOGRAPH_VERBOSITY'
+DEFAULT_VERBOSITY = 0
+
+verbosity_level = None  # vlog-like. Takes precedence over the env variable.
+echo_log_to_stdout = False
+
+# In interactive Python, logging echo is enabled by default.
+if hasattr(sys, 'ps1') or hasattr(sys, 'ps2'):
+  echo_log_to_stdout = True
+
+
+@tf_export('autograph.set_verbosity')
+def set_verbosity(level, alsologtostdout=False):
+  """Sets the AutoGraph verbosity level.
+
+  _Debug logging in AutoGraph_
+
+  More verbose logging is useful to enable when filing bug reports or doing
+  more in-depth debugging.
+
+  There are two controls that control the logging verbosity:
+   * The `set_verbosity` function
+   * The `AUTOGRAPH_VERBOSITY` environment variable
+  `set_verbosity` takes precedence over the environment variable.
+
+  For example:
+
+  ```python
+  import os
+  import tensorflow as tf
+
+  os.environ['AUTOGRAPH_VERBOSITY'] = 5
+  # Verbosity is now 5
+
+  tf.autograph.set_verbosity(0)
+  # Verbosity is now 0
+
+  os.environ['AUTOGRAPH_VERBOSITY'] = 1
+  # No effect, because set_verbosity was already called.
+  ```
+
+  Logs entries are output to [absl](https://abseil.io)'s default output,
+  with `INFO` level.
+  Logs can be mirrored to stdout by using the `alsologtostdout` argument.
+  Mirroring is enabled by default when Python runs in interactive mode.
+
+  Args:
+    level: int, the verbosity level; larger values specify increased verbosity;
+      0 means no logging. When reporting bugs, it is recommended to set this
+      value to a larges number, like 10.
+    alsologtostdout: bool, whether to also output log messages to `sys.stdout`.
+  """
+  global verbosity_level
+  global echo_log_to_stdout
+  verbosity_level = level
+  echo_log_to_stdout = alsologtostdout
+
+
+@tf_export('autograph.trace')
+def trace(*args):
+  """Traces argument information at compilation time.
+
+  `trace` is useful when debugging, and it always executes during the tracing
+  phase, that is, when the TF graph is constructed.
+
+  _Example usage_
+
+  ```python
+  import tensorflow as tf
+
+  for i in tf.range(10):
+    tf.autograph.trace(i)
+  # Output: <Tensor ...>
+  ```
+
+  Args:
+    *args: Arguments to print to `sys.stdout`.
+  """
+  print(*args)
+
+
+def get_verbosity():
+  global verbosity_level
+  if verbosity_level is not None:
+    return verbosity_level
+  return os.getenv(VERBOSITY_VAR_NAME, DEFAULT_VERBOSITY)
+
+
+def has_verbosity(level):
+  return get_verbosity() >= level
+
+
+def error(level, msg, *args, **kwargs):
+  if has_verbosity(level):
+    logging.error(msg, *args, **kwargs)
+    if echo_log_to_stdout:
+      print(msg % args)
+
+
+def log(level, msg, *args, **kwargs):
+  if has_verbosity(level):
+    logging.info(msg, *args, **kwargs)
+    if echo_log_to_stdout:
+      print(msg % args)
+
+
+def warn_first_n(msg, *args, **kwargs):
+  logging.log_first_n(logging.WARNING, msg, *args, **kwargs)
diff --git a/tensorflow/python/autograph/utils/misc.py b/tensorflow/python/autograph/utils/misc.py
index 1b06caf0bdeb6f4a079e33f2e887d2dca017adc2..046e6cf97dcd40cea4f1601cf8e69259559f7adf 100644
--- a/tensorflow/python/autograph/utils/misc.py
+++ b/tensorflow/python/autograph/utils/misc.py
@@ -23,7 +23,7 @@ from tensorflow.python.ops import array_ops
 
 
 def alias_tensors(*args):
-  """Wrap any Tensor arguments with an identity op.
+  """Wraps any Tensor arguments with an identity op.
 
   Any other argument, including Variables, is returned unchanged.
 
@@ -48,3 +48,10 @@ def alias_tensors(*args):
     return alias_if_tensor(args[0])
 
   raise ValueError('at least one argument required')
+
+
+def capitalize_initial(s):
+  """Capitalizes the initial of a string only."""
+  if s:
+    return s[0].upper() + s[1:]
+  return s
diff --git a/tensorflow/python/autograph/utils/misc_test.py b/tensorflow/python/autograph/utils/misc_test.py
index c78df48d6263b121076c86198670222441e7fec7..24b5753a91a035da9edd6c8cba431a063fc3c8d6 100644
--- a/tensorflow/python/autograph/utils/misc_test.py
+++ b/tensorflow/python/autograph/utils/misc_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.autograph.utils.misc import alias_tensors
+from tensorflow.python.autograph.utils import misc
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops.variables import Variable
@@ -27,11 +27,20 @@ from tensorflow.python.platform import test
 
 class MiscTest(test.TestCase):
 
+  def test_capitalize_initial(self):
+    self.assertEqual('', misc.capitalize_initial(''))
+    self.assertEqual('A', misc.capitalize_initial('A'))
+    self.assertEqual('Ab', misc.capitalize_initial('Ab'))
+    self.assertEqual('AbC', misc.capitalize_initial('AbC'))
+    self.assertEqual('A', misc.capitalize_initial('a'))
+    self.assertEqual('Ab', misc.capitalize_initial('ab'))
+    self.assertEqual('AbC', misc.capitalize_initial('abC'))
+
   @test_util.run_deprecated_v1
   def test_alias_single_tensor(self):
     a = constant(1)
 
-    new_a = alias_tensors(a)
+    new_a = misc.alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.cached_session() as sess:
       self.assertEqual(1, self.evaluate(new_a))
@@ -43,7 +52,7 @@ class MiscTest(test.TestCase):
     s = 'a'
     l = [1, 2, 3]
 
-    new_a, new_v, new_s, new_l = alias_tensors(a, v, s, l)
+    new_a, new_v, new_s, new_l = misc.alias_tensors(a, v, s, l)
 
     self.assertFalse(new_a is a)
     self.assertTrue(new_v is v)
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 87a200ed336735f4b4abd9b0ac2352e36f7b84e4..bdca7dee556eb6e2fd8f4e98a31e0a5ccbe5c83b 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -736,10 +736,11 @@ class BaseSession(SessionInterface):
     if self._session is not None:
       try:
         tf_session.TF_DeleteSession(self._session)
-      except AttributeError:
-        # At shutdown, `c_api_util` or `tf_session` may have been garbage
-        # collected, causing the above method calls to fail. In this case,
-        # silently leak since the program is about to terminate anyway.
+      except (AttributeError, TypeError):
+        # At shutdown, `c_api_util`, `tf_session`, or
+        # `tf_session.TF_DeleteSession` may have been garbage collected, causing
+        # the above method calls to fail. In this case, silently leak since the
+        # program is about to terminate anyway.
         pass
       self._session = None
 
@@ -1531,7 +1532,7 @@ class Session(BaseSession):
 
     If no `graph` argument is specified when constructing the session,
     the default graph will be launched in the session. If you are
-    using more than one graph (created with `tf.Graph()` in the same
+    using more than one graph (created with `tf.Graph()`) in the same
     process, you will have to use different sessions for each graph,
     but each graph can be used in multiple sessions. In this case, it
     is often clearer to pass the graph to be launched explicitly to
@@ -1674,7 +1675,7 @@ class InteractiveSession(BaseSession):
 
     If no `graph` argument is specified when constructing the session,
     the default graph will be launched in the session. If you are
-    using more than one graph (created with `tf.Graph()` in the same
+    using more than one graph (created with `tf.Graph()`) in the same
     process, you will have to use different sessions for each graph,
     but each graph can be used in multiple sessions. In this case, it
     is often clearer to pass the graph to be launched explicitly to
diff --git a/tensorflow/python/client/session_ref.cc b/tensorflow/python/client/session_ref.cc
index 4d361612b7624a23ff8c74de0d6d54bce8817139..6639cf506e0a2f3d53373959b47cf98e5fcb0887 100644
--- a/tensorflow/python/client/session_ref.cc
+++ b/tensorflow/python/client/session_ref.cc
@@ -109,21 +109,8 @@ class SessionLogger {
   }
 
   Status RecordNewSession(Session* session) {
-    LOG(INFO) << "New session discovered.  Capturing devices...";
     ReplayOp op;
     NewReplaySession* req = op.mutable_new_replay_session();
-
-    std::vector<DeviceAttributes> devices;
-    Status status = session->ListDevices(&devices);
-    if (status.ok()) {
-      LOG(INFO) << "Found: " << devices.size() << " devices.";
-      for (const DeviceAttributes& dev : devices) {
-        *req->mutable_devices()->add_local_device() = dev;
-      }
-    } else {
-      LOG(WARNING) << "Failed to list devices on session. Continuing.";
-    }
-
     req->set_session_handle(SessionToHandle(session));
     return Flush(op);
   }
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index c4a118a41406afc52586553b1d3f0b446005c46d..da6218663de8b02fcda3f3e67e68bb46e47e914a 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -2036,7 +2036,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.cached_session() as sess:
       a = array_ops.placeholder(dtype=dtypes.string)
       with self.assertRaisesRegexp(
-          TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+          TypeError, r'Type of feed value 1 with type <(\w+) \'int\'> is not'):
         sess.run(a, feed_dict={a: 1})
 
 
diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD
index 9f2ce8c676e77480106c525bdc9c6440c599acec..87dd5d7f669f2f1cfe8fb5068a96dbdab62897d4 100644
--- a/tensorflow/python/compat/BUILD
+++ b/tensorflow/python/compat/BUILD
@@ -4,13 +4,23 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
+py_library(
+    name = "v2_compat",
+    srcs = ["v2_compat.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:tf2",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "compat",
     srcs = ["compat.py"],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python:tf2",
         "//tensorflow/python:util",
     ],
 )
@@ -24,3 +34,14 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+tf_py_test(
+    name = "disable_v2_behavior_test",
+    size = "small",
+    srcs = ["disable_v2_behavior_test.py"],
+    additional_deps = [
+        ":v2_compat",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index db99b2e45247f112a5793cd2d94cb971d9f6688a..03147e35812bf3e498bae6fcb7471f9182e9a796 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -24,15 +24,10 @@ from __future__ import print_function
 
 import datetime
 
-from tensorflow.python import tf2
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import variable_scope
-
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 19)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 1, 28)
 
 
 @tf_export("compat.forward_compatible")
@@ -138,40 +133,3 @@ def forward_compatibility_horizon(year, month, day):
     yield
   finally:
     _FORWARD_COMPATIBILITY_HORIZON = old_compat_date
-
-
-@tf_export(v1=["enable_v2_behavior"])
-def enable_v2_behavior():
-  """Enables TensorFlow 2.x behaviors.
-
-  This function can be called at the beginning of the program (before `Tensors`,
-  `Graphs` or other structures have been created, and before devices have been
-  initialized. It switches all global behaviors that are different between
-  TensorFlow 1.x and 2.x to behave as intended for 2.x.
-
-  This function is called in the main TensorFlow `__init__.py` file, user should
-  not need to call it, except during complex migrations.
-  """
-  tf2.enable()  # Switches TensorArrayV2 and control flow V2
-  ops.enable_eager_execution()
-  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
-  variable_scope.enable_resource_variables()
-
-
-@tf_export(v1=["disable_v2_behavior"])
-def disable_v2_behavior():
-  """Disables TensorFlow 2.x behaviors.
-
-  This function can be called at the beginning of the program (before `Tensors`,
-  `Graphs` or other structures have been created, and before devices have been
-  initialized. It switches all global behaviors that are different between
-  TensorFlow 1.x and 2.x to behave as intended for 1.x.
-
-  User can call this function to disable 2.x behavior during complex migrations.
-  """
-  tf2.disable()  # Switches TensorArrayV2 and control flow V2
-  ops.disable_eager_execution()
-  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
-  variable_scope.disable_resource_variables()
-
-
diff --git a/tensorflow/python/compat/disable_v2_behavior_test.py b/tensorflow/python/compat/disable_v2_behavior_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c247eac395ec3b71c2d1840964cc351b9b78de6d
--- /dev/null
+++ b/tensorflow/python/compat/disable_v2_behavior_test.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for forward and backwards compatibility utilties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.compat import v2_compat
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class DisableV2BehaviorTest(test.TestCase):
+
+  def test_basic(self):
+    t = constant_op.constant([1, 2, 3])  # creates a hidden context
+    self.assertTrue(isinstance(t, ops.EagerTensor))
+    v2_compat.disable_v2_behavior()
+    t = constant_op.constant([1, 2, 3])
+    self.assertFalse(isinstance(t, ops.EagerTensor))
+
+
+if __name__ == '__main__':
+  v2_compat.enable_v2_behavior()
+  test.main()
diff --git a/tensorflow/python/compat/v2_compat.py b/tensorflow/python/compat/v2_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a94939ae11dbf28146ae12ab21d11990dbb2516
--- /dev/null
+++ b/tensorflow/python/compat/v2_compat.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Switching v2 features on and off."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import tf2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.layers import normalization
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export(v1=["enable_v2_behavior"])
+def enable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 2.x.
+
+  This function is called in the main TensorFlow `__init__.py` file, user should
+  not need to call it, except during complex migrations.
+  """
+  tf2.enable()  # Switches TensorArrayV2 and control flow V2
+  ops.enable_eager_execution()
+  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.enable_resource_variables()
+  normalization.enable_v2_batch_normalization()
+
+
+@tf_export(v1=["disable_v2_behavior"])
+def disable_v2_behavior():
+  """Disables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 1.x.
+
+  User can call this function to disable 2.x behavior during complex migrations.
+  """
+  tf2.disable()  # Switches TensorArrayV2 and control flow V2
+  ops.disable_eager_execution()
+  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.disable_resource_variables()
+  normalization.disable_v2_batch_normalization()
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
index e063849f70381b8244a8a916353a3cc3be15c230..0ccf5c57d1954078bea1fca02885824a796236f5 100644
--- a/tensorflow/python/data/benchmarks/batch_benchmark.py
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -42,6 +42,9 @@ class BatchBenchmark(test.Benchmark):
 
     dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
         ).batch(batch_size_placeholder)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -72,13 +75,11 @@ class BatchBenchmark(test.Benchmark):
 
         median_wall_time = np.median(deltas) / 100.0
 
-        print("Batch sparse dataset non-zeros per row: %d batch_size: %d "
-              "wall time: %f"
-              % (non_zeros_per_row, batch_size, median_wall_time))
         self.report_benchmark(
-            iters=10000, wall_time=median_wall_time,
-            name="batch_sparse_dataset_nnz_%d_batch_size_%d" % (
-                non_zeros_per_row, batch_size))
+            iters=10000,
+            wall_time=median_wall_time,
+            name="sparse_num_elements_%d_batch_size_%d" %
+            (non_zeros_per_row, batch_size))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
index a6d86fe2218aec835e4f09f0c8c708596cf511f8..e0ecf19e11f95f0f2726eb0959ddc23ac9141283 100644
--- a/tensorflow/python/data/benchmarks/filter_benchmark.py
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -36,6 +36,9 @@ class FilterBenchmark(test.Benchmark):
     with ops.Graph().as_default():
       dataset = (
           dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset.with_options(options)
       iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
 
@@ -51,12 +54,7 @@ class FilterBenchmark(test.Benchmark):
           deltas.append(end - start)
 
         median_wall_time = np.median(deltas) / 100
-        print("Filter dataset using %s. Median wall time: %f" %
-              (name, median_wall_time))
-        self.report_benchmark(
-            iters=100,
-            wall_time=median_wall_time,
-            name=name)
+        self.report_benchmark(iters=100, wall_time=median_wall_time, name=name)
 
   def benchmarkSimpleFunction(self):
     self._benchmark(array_ops.identity, "simple_function")
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
index d7f1a4e7af5b00569e71900df8f2a7486d7c813b..4e5559ddbafb2ee0501ec9c87a98b314594cdc75 100644
--- a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -41,6 +41,9 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data)
         .repeat(num_epochs + 1).batch(batch_size))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -59,9 +62,6 @@ class FromTensorSlicesBenchmark(test.Benchmark):
         pass
 
     median_wall_time = np.median(deltas)
-    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
     self.report_benchmark(
         iters=len(deltas),
         wall_time=median_wall_time,
@@ -77,6 +77,9 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data)
         .repeat(num_epochs + 1).batch(batch_size))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -96,10 +99,6 @@ class FromTensorSlicesBenchmark(test.Benchmark):
         pass
 
     median_wall_time = np.median(deltas)
-    print(
-        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
-        " wall time per element: %f" % (input_size, batch_size,
-                                        median_wall_time))
     self.report_benchmark(
         iters=len(deltas),
         wall_time=median_wall_time,
@@ -116,6 +115,9 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
         .repeat(num_epochs + 1))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -135,9 +137,6 @@ class FromTensorSlicesBenchmark(test.Benchmark):
         pass
 
     median_wall_time = np.median(deltas)
-    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
     self.report_benchmark(
         iters=len(deltas),
         wall_time=median_wall_time,
@@ -154,6 +153,9 @@ class FromTensorSlicesBenchmark(test.Benchmark):
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
         .cache().repeat(num_epochs + 1))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -173,10 +175,6 @@ class FromTensorSlicesBenchmark(test.Benchmark):
         pass
 
     median_wall_time = np.median(deltas)
-    print(
-        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
-        "Median wall time per element: %f"
-        % (input_size, batch_size, median_wall_time))
     self.report_benchmark(
         iters=len(deltas),
         wall_time=median_wall_time,
diff --git a/tensorflow/python/data/benchmarks/list_files_benchmark.py b/tensorflow/python/data/benchmarks/list_files_benchmark.py
index 0dc21471129d5ca288a68c957e424035fea9dd66..70f8eeec9e8ec66edb7da5c2c82d97c8fa8336bd 100644
--- a/tensorflow/python/data/benchmarks/list_files_benchmark.py
+++ b/tensorflow/python/data/benchmarks/list_files_benchmark.py
@@ -58,6 +58,9 @@ class ListFilesBenchmark(test.Benchmark):
     for _ in range(iters):
       with ops.Graph().as_default():
         dataset = dataset_ops.Dataset.list_files(patterns)
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        dataset = dataset.with_options(options)
         next_element = dataset.make_one_shot_iterator().get_next()
         with session.Session() as sess:
           sub_deltas = []
@@ -71,11 +74,6 @@ class ListFilesBenchmark(test.Benchmark):
               break
           deltas.append(sub_deltas)
     median_deltas = np.median(deltas, axis=0)
-    print('Nested directory size (width*depth): %d*%d Median wall time: '
-          '%fs (read first filename), %fs (read second filename), avg %fs'
-          ' (read %d more filenames)' %
-          (width, depth, median_deltas[0], median_deltas[1],
-           np.average(median_deltas[2:]), len(median_deltas) - 2))
     self.report_benchmark(
         iters=iters,
         wall_time=np.sum(median_deltas),
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
index 65d945cdae87aedad55351cfb63ad06e3521d570..b620eaaed52c5bdea4fab776442ddd6bc2801605 100644
--- a/tensorflow/python/data/benchmarks/map_benchmark.py
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -38,17 +38,14 @@ class MapBenchmark(test.Benchmark):
         if mode == "general":
           map_fn = lambda x: x + 1
           use_inter_op_parallelism = True
-          print_label = ""
           benchmark_label = ""
         if mode == "single-threaded":
           map_fn = lambda x: x + 1
           use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
           benchmark_label = "_single_threaded"
         if mode == "short-circuit":
           map_fn = lambda x: x
           use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
           benchmark_label = "_short_circuit"
 
         with ops.Graph().as_default():
@@ -58,6 +55,9 @@ class MapBenchmark(test.Benchmark):
                 dataset,
                 map_fn,
                 use_inter_op_parallelism=use_inter_op_parallelism)
+          options = dataset_ops.Options()
+          options.experimental_optimization.apply_default_optimizations = False
+          dataset = dataset.with_options(options)
           iterator = dataset_ops.make_one_shot_iterator(dataset)
           next_element = iterator.get_next()
 
@@ -73,13 +73,10 @@ class MapBenchmark(test.Benchmark):
               deltas.append(end - start)
 
             median_wall_time = np.median(deltas) / 100
-            print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (print_label, chain_length, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
-                name="map_dataset_chain_length_%d%s" % (chain_length,
-                                                        benchmark_label))
+                name="chain_length_%d%s" % (chain_length, benchmark_label))
 
   def benchmarkMapFanOut(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
@@ -88,17 +85,14 @@ class MapBenchmark(test.Benchmark):
         if mode == "general":
           map_fn = lambda *xs: [x + 1 for x in xs]
           use_inter_op_parallelism = True
-          print_label = ""
           benchmark_label = ""
         if mode == "single-threaded":
           map_fn = lambda *xs: [x + 1 for x in xs]
           use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
           benchmark_label = "_single_threaded"
         if mode == "short-circuit":
           map_fn = lambda *xs: xs
           use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
           benchmark_label = "_short_circuit"
 
         with ops.Graph().as_default():
@@ -108,6 +102,9 @@ class MapBenchmark(test.Benchmark):
               dataset,
               map_fn,
               use_inter_op_parallelism=use_inter_op_parallelism)
+          options = dataset_ops.Options()
+          options.experimental_optimization.apply_default_optimizations = False
+          dataset = dataset.with_options(options)
           iterator = dataset_ops.make_one_shot_iterator(dataset)
           next_element = iterator.get_next()
 
@@ -123,12 +120,10 @@ class MapBenchmark(test.Benchmark):
               deltas.append(end - start)
 
             median_wall_time = np.median(deltas) / 100
-            print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (print_label, fan_out, median_wall_time))
             self.report_benchmark(
                 iters=1000,
                 wall_time=median_wall_time,
-                name="map_dataset_fan_out_%d%s" % (fan_out, benchmark_label))
+                name="fan_out_%d%s" % (fan_out, benchmark_label))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index a5020e2873063ea8b01801c0889a23cb60601ec3..375ff339a82207f8c5662ecf67ac47fc8c79c2a6 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -31,14 +31,16 @@ class RangeBenchmark(test.Benchmark):
 
   def _benchmarkRangeHelper(self, modeling_enabled):
     num_elements = 10000000 if modeling_enabled else 50000000
-    options = dataset_ops.Options()
-    options.experimental_autotune = modeling_enabled
 
     # Use `Dataset.skip()` and `Dataset.take()` to perform the iteration in
     # C++, and focus on the minimal overheads (excluding Python invocation
     # costs).
     dataset = dataset_ops.Dataset.range(num_elements).skip(
-        num_elements - 1).take(1).with_options(options)
+        num_elements - 1).take(1)
+    options = dataset_ops.Options()
+    options.experimental_autotune = modeling_enabled
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -54,11 +56,10 @@ class RangeBenchmark(test.Benchmark):
       end = time.time()
 
       time_per_element = (end - start) / num_elements
-      print("Average time per element (%s modeling): %f nanoseconds" % (
-          "with" if modeling_enabled else "without", time_per_element * 1e9))
-      self.report_benchmark(iters=num_elements, wall_time=time_per_element,
-                            name="benchmark_tf_data_dataset_range%s"
-                            % ("_with_modeling" if modeling_enabled else ""))
+      self.report_benchmark(
+          iters=num_elements,
+          wall_time=time_per_element,
+          name="modeling_%s" % ("on" if modeling_enabled else "off"))
 
   def benchmarkRange(self):
     for modeling_enabled in [False, True]:
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index ffc2e5ef5fa239beada67687ec700437b2fc44ba..3c1d798bd23fec5990d6d1f3080e5a8557240aed 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -65,6 +65,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@sample_from_datasets
 @@scan
 @@shuffle_and_repeat
+@@take_while
 @@unbatch
 @@unique
 
@@ -115,6 +116,7 @@ from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repe
 from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator
 from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
 from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
+from tensorflow.python.data.experimental.ops.take_while_ops import take_while
 from tensorflow.python.data.experimental.ops.threading_options import ThreadingOptions
 from tensorflow.python.data.experimental.ops.unique import unique
 from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index 651dfd6857af319135c3ba594a48b824bc9f3b46..4f2117ec9b07a7d22391d8e856588fe34ed4086f 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -110,6 +110,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_benchmark",
+    srcs = ["choose_fastest_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "optimize_benchmark",
     srcs = ["optimize_benchmark.py"],
@@ -124,6 +138,22 @@ py_test(
     ],
 )
 
+py_test(
+    name = "parallel_interleave_benchmark",
+    srcs = ["parallel_interleave_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:sleep",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "rejection_resample_benchmark",
     srcs = ["rejection_resample_benchmark.py"],
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
index e713494b526320f2c18774c7198406521c373033..bda7d38792a4aaaff6622f32f2101ad345eaa6da 100644
--- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -33,12 +33,20 @@ class AutotuneBenchmark(test.Benchmark):
   """Benchmarks for autotuning performance knobs."""
 
   def benchmarkMap(self):
+    a = self._benchmarkMap(autotune=False)
+    b = self._benchmarkMap(autotune=True)
+    print("speedup: %f" % (a / b))
+
+  def _benchmarkMap(self, autotune):
     k = 1024 * 1024
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
                                                 np.random.rand(4 * k,
                                                                1))).repeat()
     dataset = dataset.map(
         math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
+    options = dataset_ops.Options()
+    options.experimental_autotune = autotune
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
@@ -46,23 +54,24 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(1000):
+      for _ in range(10000):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
     self.report_benchmark(
-        iters=1000, wall_time=np.median(deltas), name="map_autotune")
+        iters=10000,
+        wall_time=np.median(deltas),
+        name="map" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
   def benchmarkMapAndBatch(self):
-    self._benchmarkMapAndBatch(numa_aware=False)
-    self._benchmarkMapAndBatch(numa_aware=True)
+    a = self._benchmarkMapAndBatch(autotune=False)
+    b = self._benchmarkMapAndBatch(autotune=True)
+    print("speedup: %f" % (a / b))
 
-  def _benchmarkMapAndBatch(self, numa_aware):
+  def _benchmarkMapAndBatch(self, autotune):
     batch_size = 16
     k = 1024 * 1024
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
@@ -74,7 +83,8 @@ class AutotuneBenchmark(test.Benchmark):
             num_parallel_calls=optimization.AUTOTUNE,
             batch_size=batch_size))
     options = dataset_ops.Options()
-    options.experimental_numa_aware = numa_aware
+    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
@@ -83,22 +93,24 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(100):
+      for _ in range(1000):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
     self.report_benchmark(
-        iters=100,
+        iters=1000,
         wall_time=np.median(deltas),
-        name=("numa_" if numa_aware else "") + "map_and_batch_autotune")
+        name="map_and_batch" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
   def benchmarkInterleave(self):
+    a = self._benchmarkInterleave(autotune=False)
+    b = self._benchmarkInterleave(autotune=True)
+    print("speedup: %f" % (a / b))
+
+  def _benchmarkInterleave(self, autotune):
     k = 1024 * 1024
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
                                                 np.random.rand(4 * k,
@@ -108,6 +120,10 @@ class AutotuneBenchmark(test.Benchmark):
         lambda _: dataset,
         cycle_length=10,
         num_parallel_calls=optimization.AUTOTUNE)
+    options = dataset_ops.Options()
+    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
@@ -115,21 +131,24 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(1000):
+      for _ in range(10000):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
     self.report_benchmark(
-        iters=1000,
+        iters=10000,
         wall_time=np.median(deltas),
-        name="interleave_autotune")
+        name="interleave" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
   def benchmarkMapAndInterleave(self):
+    a = self._benchmarkMapAndInterleave(autotune=False)
+    b = self._benchmarkMapAndInterleave(autotune=True)
+    print("speedup: %f" % (a / b))
+
+  def _benchmarkMapAndInterleave(self, autotune):
     k = 1024 * 1024
     a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
     b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
@@ -161,6 +180,10 @@ class AutotuneBenchmark(test.Benchmark):
 
     dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
     dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    options = dataset_ops.Options()
+    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
@@ -168,19 +191,17 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next)
-      for _ in range(100):
+      for _ in range(1000):
         start = time.time()
         sess.run(get_next)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
     self.report_benchmark(
-        iters=100,
+        iters=1000,
         wall_time=np.median(deltas),
-        name="map_and_interleave_autotune")
+        name="map_and_interleave" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/benchmarks/choose_fastest_benchmark.py b/tensorflow/python/data/experimental/benchmarks/choose_fastest_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5a264c6f33196f882c3c2455339b7ba5a7e81c
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/choose_fastest_benchmark.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for static optimizations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks too.
+class ChooseFastestBenchmark(test.Benchmark):
+  """Benchmarks for static optimizations."""
+
+  def benchmarkChooseFastest(self):
+
+    dataset = dataset_ops.Dataset.range(1000**2).repeat()
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    map_batch_dataset = dataset.map(lambda x: x + 1).batch(100)
+    batch_map_dataset = dataset.batch(100).map(lambda x: x + 1)
+
+    merge_dataset = optimization._ChooseFastestDataset(  # pylint: disable=protected-access
+        [batch_map_dataset, map_batch_dataset])
+    self._benchmark(map_batch_dataset, "map_batch_dataset")
+    self._benchmark(batch_map_dataset, "batch_map_dataset")
+    self._benchmark(merge_dataset, "merge_dataset")
+
+  def benchmarkChooseFastestFirstNIterations(self):
+
+    dataset = dataset_ops.Dataset.range(1000**2).repeat()
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    map_batch_dataset = dataset.map(lambda x: x + 1).batch(100)
+    batch_map_dataset = dataset.batch(100).map(lambda x: x + 1)
+
+    merge_dataset = optimization._ChooseFastestDataset(  # pylint: disable=protected-access
+        [batch_map_dataset, map_batch_dataset])
+
+    self._benchmarkFirstN(map_batch_dataset, "map_batch_dataset")
+    self._benchmarkFirstN(batch_map_dataset, "batch_map_dataset")
+    self._benchmarkFirstN(merge_dataset, "merge_dataset")
+
+  def _benchmarkFirstN(self, dataset, name):
+    n = 10  # The default num_experiments for ChooseFastestDataset
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    next_element = iterator.get_next()
+
+    deltas = []
+    for _ in range(100):
+      with session.Session() as sess:
+        start = time.time()
+        for _ in range(n):
+          sess.run(next_element.op)
+        end = time.time()
+        deltas.append(end - start)
+    median_wall_time = np.median(deltas) / n
+    self.report_benchmark(
+        iters=n, wall_time=median_wall_time, name=name + "_first_%d" % n)
+
+  def _benchmark(self, dataset, name):
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      # Run 10 steps to warm up the session caches before taking the first
+      # measurement. Additionally, 10 is the default num_experiments for
+      # ChooseFastestDataset.
+      for _ in range(10):
+        sess.run(next_element.op)
+      deltas = []
+      for _ in range(50):
+        start = time.time()
+        for _ in range(50):
+          sess.run(next_element.op)
+        end = time.time()
+        deltas.append(end - start)
+
+      median_wall_time = np.median(deltas) / 100
+      self.report_benchmark(iters=100, wall_time=median_wall_time, name=name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
index 03345ce4e6648fecf47348806c55adba10aeed5a..2e91e08c79f2fcd990b6e3850f4539ea616c65fe 100644
--- a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
@@ -63,6 +63,9 @@ class CsvDatasetBenchmark(test.Benchmark):
 
   def _runBenchmark(self, dataset, num_cols, prefix):
     dataset = dataset.skip(self._num_per_iter - 1)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     deltas = []
     for _ in range(10):
       next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
@@ -79,8 +82,6 @@ class CsvDatasetBenchmark(test.Benchmark):
       deltas.append(end - start)
     # Median wall time per CSV record read and decoded
     median_wall_time = np.median(deltas) / self._num_per_iter
-    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
-                                                    median_wall_time))
     self.report_benchmark(
         iters=self._num_per_iter,
         wall_time=median_wall_time,
diff --git a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
index b17f2bcd12b2b78c97e7c390d919331ac4ef5386..4b7c1737863d040763b8dc94952d0742c2c1027c 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
@@ -26,7 +26,6 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -41,7 +40,7 @@ _NUMPY_RANDOM_SEED = 42
 class MapAndBatchBenchmark(test.Benchmark):
   """Benchmarks for `tf.data.experimental.map_and_batch()`."""
 
-  def benchmarkMapAndBatchDense(self):
+  def benchmarkMapAndBatch(self):
     """Measures the performance of parallelized batching."""
     shapes = [(), (10,), (10, 10), (10, 10, 10), (224, 224, 3)]
     batch_size_values = [1, 32, 64, 128, 1024]
@@ -55,6 +54,9 @@ class MapAndBatchBenchmark(test.Benchmark):
 
     dataset = dataset.apply(batching.map_and_batch(
         lambda _: dense_value, batch_size_placeholder))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -88,13 +90,9 @@ class MapAndBatchBenchmark(test.Benchmark):
         median_wall_time = np.median(deltas) / 100.0
         iters = len(deltas) * 100
 
-        print("Map and batch dense dataset shape: %r batch_size: %d "
-              "wall time: %f (%d iters)"
-              % (shape, batch_size, median_wall_time, iters))
         self.report_benchmark(
             iters=iters, wall_time=median_wall_time,
-            name="benchmark_batch_dense_dataset_nnz_%d_batch_size_%d" % (
-                np.prod(shape), batch_size))
+            name="num_elements_%d_batch_size_%d" % (np.prod(shape), batch_size))
 
   def benchmarkMapAndBatchChainingVersusFusing(self):
     """Compares the performance of chaining and fusing map and batch.
@@ -128,49 +126,25 @@ class MapAndBatchBenchmark(test.Benchmark):
     def benchmark(label, series):
       """Runs benchmark the given series."""
 
-      print("%s:" % label)
-
-      def make_base_dataset(element_size):
+      def make_dataset(element_size, num_calls, batch_size):  # pylint: disable=missing-docstring
         k = 1024 * 1024
         x = constant_op.constant(np.random.rand(element_size, 4 * k))
         y = constant_op.constant(np.random.rand(4 * k, 1))
-        return dataset_ops.Dataset.range(1000000000000).map(lambda _: (x, y))
+        dataset = dataset_ops.Dataset.range(1000000000000).map(lambda _: (x, y))
+        dataset = dataset.map(
+            math_ops.matmul,
+            num_parallel_calls=num_calls).batch(batch_size=batch_size)
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        return dataset.with_options(options)
 
       for num_calls, inter_op, element_size, batch_size in series:
-
         num_iters = 1024 // (
             (element_size * batch_size) // min(num_calls, inter_op))
-        fused_dataset = make_base_dataset(element_size)
-        fused_dataset = fused_dataset.map(
-            math_ops.matmul,
-            num_parallel_calls=num_calls).batch(batch_size=batch_size)
-
-        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
-        fused_get_next = fused_iterator.get_next()
-
-        fused_deltas = []
-        with session.Session(
-            config=config_pb2.ConfigProto(
-                inter_op_parallelism_threads=inter_op,
-                use_per_session_threads=True)) as sess:
-
-          for _ in range(5):
-            sess.run(fused_get_next.op)
-          for _ in range(num_iters):
-            start = time.time()
-            sess.run(fused_get_next.op)
-            end = time.time()
-            fused_deltas.append(end - start)
-
-        # `map_and_batch_fusion` is optimized by default. To get the chained
-        # dataset, with have to disable it.
-        options = dataset_ops.Options()
-        options.experimental_optimization = OptimizationOptions()
-        options.experimental_optimization.map_and_batch_fusion = False
-        chained_dataset = fused_dataset.with_options(options)
+        # By default the chained map().batch() calls will not be fused.
+        chained_dataset = make_dataset(element_size, num_calls, batch_size)
         chained_iterator = dataset_ops.make_one_shot_iterator(chained_dataset)
         chained_get_next = chained_iterator.get_next()
-
         chained_deltas = []
         with session.Session(
             config=config_pb2.ConfigProto(
@@ -184,27 +158,32 @@ class MapAndBatchBenchmark(test.Benchmark):
             end = time.time()
             chained_deltas.append(end - start)
 
-        print(
-            "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
-            "element size: %d, num iters: %d\nchained wall time: %f (median), "
-            "%f (mean), %f (stddev), %f (min), %f (max)\n  fused wall time: "
-            "%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n    "
-            "chained/fused:    %.2fx (median),    %.2fx (mean)" %
-            (batch_size, num_calls, inter_op, element_size, num_iters,
-             np.median(chained_deltas), np.mean(chained_deltas),
-             np.std(chained_deltas), np.min(chained_deltas),
-             np.max(chained_deltas), np.median(fused_deltas),
-             np.mean(fused_deltas), np.std(fused_deltas), np.min(fused_deltas),
-             np.max(fused_deltas),
-             np.median(chained_deltas) / np.median(fused_deltas),
-             np.mean(chained_deltas) / np.mean(fused_deltas)))
-
         self.report_benchmark(
             iters=num_iters,
             wall_time=np.median(chained_deltas),
             name=name("chained", label, num_calls, inter_op, element_size,
                       batch_size))
 
+        # Apply an option to the default dataset that will fuse map().batch().
+        options = dataset_ops.Options()
+        options.experimental_optimization.map_and_batch_fusion = True
+        fused_dataset = chained_dataset.with_options(options)
+        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
+        fused_get_next = fused_iterator.get_next()
+        fused_deltas = []
+        with session.Session(
+            config=config_pb2.ConfigProto(
+                inter_op_parallelism_threads=inter_op,
+                use_per_session_threads=True)) as sess:
+
+          for _ in range(5):
+            sess.run(fused_get_next.op)
+          for _ in range(num_iters):
+            start = time.time()
+            sess.run(fused_get_next.op)
+            end = time.time()
+            fused_deltas.append(end - start)
+
         self.report_benchmark(
             iters=num_iters,
             wall_time=np.median(fused_deltas),
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
index a60ba0a857ee18e88e912fc25000a479e4a86e72..50e3a5c469232e2ff3ea8f0bd74866d829c31770 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -24,7 +24,6 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
@@ -115,31 +114,27 @@ class MapVectorizationBenchmark(test.Benchmark):
 
   def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
     num_elems = int(np.sum([np.prod(x) for x in input_size]))
-    name_template = "{}__batch_size_{}_input_element_size_{}_{}"
+    name_template = "{}_batch_size_{}_input_element_size_{}_{}"
 
-    base_dataset = input_dataset.map(map_fn).batch(batch_size)
+    unoptimized_dataset = input_dataset.map(map_fn).batch(batch_size)
 
     options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    # Disable default map_and_batch_fusion optimization
-    opt_options.map_and_batch_fusion = False
-    options.experimental_optimization = opt_options
-    base_dataset = base_dataset.with_options(options)
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized_dataset = unoptimized_dataset.with_options(options)
+    unoptimized_next = dataset_ops.make_one_shot_iterator(
+        unoptimized_dataset).get_next()
 
-    unoptimized_op = dataset_ops.make_one_shot_iterator(base_dataset).get_next()
-
-    optimized_options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.map_vectorization = True
-    optimized_options.experimental_optimization = opt_options
-    optimized = base_dataset.with_options(optimized_options)
-    optimized_op = dataset_ops.make_one_shot_iterator(optimized).get_next()
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized_dataset = unoptimized_dataset.with_options(options)
+    optimized_next = dataset_ops.make_one_shot_iterator(
+        optimized_dataset).get_next()
 
     unoptimized_time = self._run(
-        unoptimized_op,
+        unoptimized_next,
         name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
     optimized_time = self._run(
-        optimized_op,
+        optimized_next,
         name=name_template.format(str_id, batch_size, num_elems, "optimized"))
 
     print("Batch size: {}\n"
diff --git a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
index c53f8dd7c537fecbfcd551e2a4809aaf5447ff46..cb5bf2946d5d7dc8b802a9d32db4ec49e78a5e14 100644
--- a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
@@ -60,6 +60,9 @@ class MatchingFilesBenchmark(test.Benchmark):
     for _ in range(iters):
       with ops.Graph().as_default():
         dataset = matching_files.MatchingFilesDataset(patterns)
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        dataset = dataset.with_options(options)
         next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
         with session.Session() as sess:
@@ -75,11 +78,6 @@ class MatchingFilesBenchmark(test.Benchmark):
           deltas.append(sub_deltas)
 
     median_deltas = np.median(deltas, axis=0)
-    print('Nested directory size (width*depth): %d*%d Median wall time: '
-          '%fs (read first filename), %fs (read second filename), avg %fs'
-          ' (read %d more filenames)' %
-          (width, depth, median_deltas[0], median_deltas[1],
-           np.average(median_deltas[2:]), len(median_deltas) - 2))
     self.report_benchmark(
         iters=iters,
         wall_time=np.sum(median_deltas),
@@ -92,7 +90,7 @@ class MatchingFilesBenchmark(test.Benchmark):
             (len(median_deltas) - 2):
                 np.average(median_deltas[2:])
         },
-        name='dataset_nested_directory(%d*%d)' %
+        name='nested_directory(%d*%d)' %
         (width, depth))
 
     shutil.rmtree(tmp_dir, ignore_errors=True)
diff --git a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
index 1bbee5e7a3ff61a2d7c8d418cc6bdd360595dbe7..395a529f853e17909fd3f094174cc8d82393d6da 100644
--- a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
@@ -47,6 +47,7 @@ class OptimizationBenchmark(test.Benchmark):
         dataset = dataset.map(lambda x: x)
       if optimize_dataset:
         options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
         options.experimental_optimization.map_fusion = True
         dataset = dataset.with_options(options)
 
@@ -66,8 +67,6 @@ class OptimizationBenchmark(test.Benchmark):
 
         median_wall_time = np.median(deltas) / 100
         opt_mark = "opt" if optimize_dataset else "noopt"
-        print("Map dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
         self.report_benchmark(
             iters=100,
             wall_time=median_wall_time,
@@ -90,6 +89,7 @@ class OptimizationBenchmark(test.Benchmark):
             lambda x: math_ops.greater_equal(x - 5, 0))
       if optimize_dataset:
         options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
         options.experimental_optimization.map_and_filter_fusion = True
         dataset = dataset.with_options(options)
       iterator = dataset_ops.make_one_shot_iterator(dataset)
@@ -108,8 +108,6 @@ class OptimizationBenchmark(test.Benchmark):
 
         median_wall_time = np.median(deltas) / 100
         opt_mark = "opt" if optimize_dataset else "noopt"
-        print("Map and filter dataset {} chain length: {} Median wall time: {}"
-              .format(opt_mark, chain_length, median_wall_time))
         self.report_benchmark(
             iters=100,
             wall_time=median_wall_time,
@@ -131,6 +129,7 @@ class OptimizationBenchmark(test.Benchmark):
         dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0))
       if optimize_dataset:
         options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
         options.experimental_optimization.filter_fusion = True
         dataset = dataset.with_options(options)
 
@@ -150,8 +149,6 @@ class OptimizationBenchmark(test.Benchmark):
 
         median_wall_time = np.median(deltas) / 100
         opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Filter dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
         self.report_benchmark(
             iters=1000,
             wall_time=median_wall_time,
diff --git a/tensorflow/python/data/experimental/benchmarks/parallel_interleave_benchmark.py b/tensorflow/python/data/experimental/benchmarks/parallel_interleave_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..37375af27f4359764ec24aa0e5810a8b2a5b1ea7
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/parallel_interleave_benchmark.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.parallel_interleave()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import sleep
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+def _make_fake_dataset_fn():
+  """Returns a dataset that emulates a remote storage data source.
+
+  Returns a dataset factory which creates a dataset with 100 elements that
+  emulates the performance characteristic of a file-based dataset stored in a
+  remote storage. In particular, the first element will take an order of
+  magnitude longer to produce than the remaining elements (1s vs. 1ms).
+  """
+
+  def fake_dataset_fn(unused):
+    del unused
+
+    def make_dataset(time_us, num_elements):
+      return dataset_ops.Dataset.range(num_elements).apply(sleep.sleep(time_us))
+
+    return make_dataset(1000 * 1000, 0).concatenate(make_dataset(1000,
+                                                                 100)).take(100)
+
+  return fake_dataset_fn
+
+
+class ParallelInterleaveBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.parallel_interleave()`."""
+
+  def _benchmark(self, dataset_fn, iters, num_elements):
+    with ops.Graph().as_default():
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset_fn().with_options(options)
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      with session.Session() as sess:
+        deltas = []
+        for _ in range(iters):
+          start = time.time()
+          for _ in range(num_elements):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+    mean_wall_time = np.mean(deltas) / num_elements
+    self.report_benchmark(iters=iters, wall_time=mean_wall_time)
+
+  def benchmark_sequential_interleave(self):
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).repeat().interleave(
+          _make_fake_dataset_fn(), cycle_length=10)
+
+    self._benchmark(dataset_fn=dataset_fn, iters=10, num_elements=100)
+
+  def benchmark_parallel_interleave_v1(self):
+    """Benchmark for parallel interleave that does not support autotuning."""
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).repeat().apply(
+          interleave_ops.parallel_interleave(
+              _make_fake_dataset_fn(), cycle_length=10))
+
+    self._benchmark(dataset_fn=dataset_fn, iters=100, num_elements=1000)
+
+  def benchmark_parallel_interleave_v2(self):
+    """Benchmark for parallel interleave that supports autotuning."""
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).repeat().interleave(
+          _make_fake_dataset_fn(),
+          cycle_length=10, num_parallel_calls=optimization.AUTOTUNE)
+
+    self._benchmark(dataset_fn=dataset_fn, iters=100, num_elements=1000)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py b/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
index a64f7ecb00b4c2c02b1a579562cbf0afcf50f10e..9a8ac7ef655d56ebc11c1467b6ed82b5f943277c 100644
--- a/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
@@ -39,6 +39,9 @@ def _time_resampling(data_np, target_dist, init_dist, num_to_sample):  # pylint:
           initial_dist=init_dist,
           seed=142))
 
+  options = dataset_ops.Options()
+  options.experimental_optimization.apply_default_optimizations = False
+  dataset = dataset.with_options(options)
   get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
   with session.Session() as sess:
diff --git a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
index 6f80df50b847c4e93c16603061b63399a1a4ff2d..3f5b9b91307f423ca78489b5f3ef824974a0a6fe 100644
--- a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
@@ -42,6 +42,9 @@ class UnbatchBenchmark(test.Benchmark):
       dataset = dataset.batch(batch_size_placeholder)
       dataset = dataset.apply(batching.unbatch())
       dataset = dataset.skip(elems_per_trial)
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset.with_options(options)
       iterator = dataset_ops.make_initializable_iterator(dataset)
       next_element = iterator.get_next()
 
@@ -58,8 +61,6 @@ class UnbatchBenchmark(test.Benchmark):
             deltas.append((end - start) / elems_per_trial)
 
           median_wall_time = np.median(deltas)
-          print("Unbatch (native) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
           self.report_benchmark(
               iters=10000,
               wall_time=median_wall_time,
@@ -78,6 +79,9 @@ class UnbatchBenchmark(test.Benchmark):
       dataset = dataset.batch(batch_size_placeholder)
       dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
       dataset = dataset.skip(elems_per_trial)
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset.with_options(options)
       iterator = dataset_ops.make_initializable_iterator(dataset)
       next_element = iterator.get_next()
 
@@ -94,8 +98,6 @@ class UnbatchBenchmark(test.Benchmark):
             deltas.append((end - start) / elems_per_trial)
 
           median_wall_time = np.median(deltas)
-          print("Unbatch (unfused) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
           self.report_benchmark(
               iters=10000,
               wall_time=median_wall_time,
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 9362a3e8eb8c23643fc83bf821cbf6ea2ec8eaad..04819130642d9558d5fbe247524b8a32bddefaf2 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -23,6 +23,7 @@ py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -638,6 +639,28 @@ py_library(
     ],
 )
 
+py_test(
+    name = "take_while_test",
+    size = "small",
+    srcs = ["take_while_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:take_while_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "tf_record_writer_test",
     size = "small",
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 3324243c54351e297ae15c36bb56fcb5342e5ce5..0bbf0e9a12ba3170bd3c69e43824322b8b1eb059 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import random
 
+from absl.testing import parameterized
+
 from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
@@ -72,10 +74,138 @@ def _get_record_shape(sparse):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class BucketBySequenceLengthTest(test_base.DatasetTestBase):
+class BucketBySequenceLengthTest(test_base.DatasetTestBase,
+                                 parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testBucketDropReminder(self, param_no_padding):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25, 35]
+
+    n_bucket_elements = [28, 7, 6, 5]
+    n_expected_batches = 5
+
+    # Expected sequence lengths of the individual batches.
+    expected_lengths = []
+
+    # Expected sum of all batches with an equal sequence length.
+    # <seq-length>: <expected-total-sum>
+    expected_sums = dict()
+
+    # Expected batch sizes of batches depending on the sequence length.
+    # <seq-length>: [batch1_size, ..., batchN_size]
+    expected_batch_sizes = dict()
+
+    for length, batch_size, bucket_elements in zip(lengths, batch_sizes,
+                                                   n_bucket_elements):
+      # Calculate the expected sum across all batches of a specific sequence length.
+      expected_sums[length] = \
+          (bucket_elements - bucket_elements % batch_size) * length
+      # Calculate the expected occurrence of individual batch sizes.
+      expected_batch_sizes[length] = \
+          [batch_size] * (bucket_elements // batch_size)
+      # Calculate the expected occurence of individual sequence lengths.
+      expected_lengths.extend([length] * (bucket_elements // batch_size))
+
+    def build_dataset(sparse):
+
+      def _generator():
+        # Produce 1 batch for each bucket
+        elements = []
+        for bucket_elements, length in zip(n_bucket_elements, lengths):
+          # Using only full sequences (opposed to the strategy employed in `testBucket`) makes
+          # checking the sum a lot easier.
+          record_len = length
+          for _ in range(bucket_elements):
+            elements.append([1] * record_len)
+        random.shuffle(elements)
+        for el in elements:
+          yield (_format_record(el, sparse),)
+
+      dataset = dataset_ops.Dataset.from_generator(
+          _generator, (_get_record_type(sparse),), (_get_record_shape(sparse),))
+      if sparse:
+        dataset = dataset.map(lambda x: (_to_sparse_tensor(x),))
+      return dataset
 
-  # TODO(b/117581999): add eager coverage.
-  def testSkipEagerBucket(self):
+    def _test_bucket_by_padding(no_padding):
+      dataset = build_dataset(sparse=no_padding)
+      dataset = dataset.apply(
+          grouping.bucket_by_sequence_length(
+              _element_length_fn,
+              boundaries,
+              batch_sizes,
+              no_padding=no_padding,
+              drop_remainder=True))
+
+      get_next = self.getNext(dataset)
+      batches = []
+      for _ in range(n_expected_batches):
+        batch, = self.evaluate(get_next())
+        batches.append(batch)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+      generated_lengths = []
+
+      # <seq-length>: <total-sum>
+      generated_sums = dict()
+
+      # <seq-length>: [<batch_size>, ...]
+      generated_batch_sizes = dict()
+
+      for length, batch_size, bucket_elements in zip(lengths, batch_sizes,
+                                                     n_bucket_elements):
+        # Initialize the sum across all batches.
+        generated_sums[length] = 0
+        # Initialize the individual batch sizes.
+        generated_batch_sizes[length] = []
+
+      for batch in batches:
+        shape = batch.dense_shape if no_padding else batch.shape
+        length = shape[1]
+        generated_lengths.append(length)
+
+        batch_size = shape[0]
+        generated_batch_sizes[length].append(batch_size)
+
+        batch_sum = batch.values.sum() if no_padding else batch.sum()
+        generated_sums[length] += batch_sum
+
+      for l in lengths:
+        # Make sure the sum of the batch contents is correct for the individual sequence lengths.
+        self.assertEqual(
+            generated_sums[l], expected_sums[l], "Tensor sums did not match! "
+            "expected: {}, generated: {}".format(expected_sums, generated_sums))
+
+        # Make sure the individual batch sizes are generated as expected.
+        self.assertEqual(
+            sorted(generated_batch_sizes[l]), sorted(expected_batch_sizes[l]),
+            "Batch-sizes did not match! "
+            "expected: {}, generated: {}".format(
+                sorted(expected_batch_sizes[l]),
+                sorted(generated_batch_sizes[l])))
+
+      # Make sure the generated sequence lengths appear as often as expected.
+      self.assertEqual(
+          sorted(generated_lengths), sorted(expected_lengths),
+          "The generated sequence lengths did not match! "
+          "expected: {}, generated: {}".format(
+              sorted(expected_lengths), sorted(generated_lengths)))
+
+    _test_bucket_by_padding(param_no_padding)
+
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testBucket(self, param_no_padding):
 
     boundaries = [10, 20, 30]
     batch_sizes = [10, 8, 4, 2]
@@ -132,8 +262,7 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
       self.assertEqual(sorted(lengths), sorted(lengths_val))
 
-    for no_padding in (True, False):
-      _test_bucket_by_padding(no_padding)
+    _test_bucket_by_padding(param_no_padding)
 
   def testPadToBoundary(self):
 
@@ -218,7 +347,11 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
     self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
                                      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
 
-  def testTupleElements(self):
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testTupleElements(self, param_no_padding):
 
     def build_dataset(sparse):
       def _generator():
@@ -246,11 +379,13 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       self.assertEqual([None, None], shapes[0].as_list())
       self.assertEqual([None], shapes[1].as_list())
 
-    for no_padding in (True, False):
-      _test_tuple_elements_by_padding(no_padding)
+    _test_tuple_elements_by_padding(param_no_padding)
 
-  # TODO(b/117581999): add eager coverage
-  def testSkipEagerBucketSparse(self):
+  @parameterized.named_parameters(
+      ("DoDropRemainder", True),
+      ("DoNotDropRemainder", False),
+  )
+  def testBucketSparse(self, param_drop_remainder):  # pylint: disable=g-doc-args
     """Tests bucketing of sparse tensors (case where `no_padding` == True).
 
     Test runs on following dataset:
@@ -281,11 +416,16 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       dataset = dataset.map(_to_sparse_tensor)
       return dataset
 
-    def _compute_expected_batches():
+    def _compute_expected_batches(drop_remainder):
       """Computes expected batch outputs and stores in a set."""
       all_expected_sparse_tensors = set()
       for bucket_start_len in range(min_len, max_len, bucket_size):
-        for batch_offset in range(0, bucket_size, batch_size):
+        if drop_remainder:
+          batch_offsets = [0]
+        else:
+          batch_offsets = range(0, bucket_size, batch_size)
+
+        for batch_offset in batch_offsets:
           batch_start_len = bucket_start_len + batch_offset
           batch_end_len = min(batch_start_len + batch_size,
                               bucket_start_len + bucket_size)
@@ -314,13 +454,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       return all_sparse_tensors
     dataset = _build_dataset()
     boundaries = range(min_len + bucket_size + 1, max_len, bucket_size)
-    dataset = dataset.apply(grouping.bucket_by_sequence_length(
-        _element_length_fn,
-        boundaries,
-        [batch_size] * (len(boundaries) + 1),
-        no_padding=True))
+    dataset = dataset.apply(
+        grouping.bucket_by_sequence_length(
+            _element_length_fn,
+            boundaries, [batch_size] * (len(boundaries) + 1),
+            no_padding=True,
+            drop_remainder=param_drop_remainder))
     batches = _compute_batches(dataset)
-    expected_batches = _compute_expected_batches()
+    expected_batches = _compute_expected_batches(param_drop_remainder)
     self.assertEqual(batches, expected_batches)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index 3b7b335e7066175fba6ef190b977362bc461ca1d..3f371434c047a32481ce38668ece1b1af0f00b1c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -449,6 +449,28 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         header=True,
     )
 
+  def testMakeCSVDataset_withNAValuesAndFieldDelim(self):
+    """Tests that datasets can be created from different delim and na_value."""
+    column_names = ["col%d" % i for i in range(5)]
+    inputs = [["0 1 2 3 4", "5 6 7 8 9"], ["10 11 12 13 14", "15 16 17 ? 19"]]
+    expected_output = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14],
+                       [15, 16, 17, 0, 19]]
+    label = "col0"
+
+    self._test_dataset(
+        inputs,
+        expected_output=expected_output,
+        expected_keys=column_names,
+        column_names=column_names,
+        label_name=label,
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False,
+        header=False,
+        na_value="?",
+        field_delim=" ",
+    )
+
   def testMakeCSVDataset_withSelectCols(self):
     record_defaults = [
         constant_op.constant([], dtypes.int32),
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index bf868ebe79339e3c36473711ece064210db5f47f..3bfe55244e575066356fa3f3dfcec16076fbadb6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -190,6 +190,7 @@ py_test(
     ],
     deps = [
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -206,7 +207,7 @@ py_test(
 
 py_test(
     name = "map_vectorization_test",
-    size = "medium",
+    size = "small",
     srcs = ["map_vectorization_test.py"],
     shard_count = 8,
     srcs_version = "PY2AND3",
@@ -232,6 +233,7 @@ py_test(
         "//tensorflow/python:nn",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -241,6 +243,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_dataset_test",
+    size = "small",
+    srcs = ["choose_fastest_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "model_dataset_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
index 9b8248a78da11d99e3cf6cd87ab69d30d4d369d6..e05dcbd9d582da05a4049e76d4f8c057a53b3161 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
@@ -31,11 +31,17 @@ class AssertNextDatasetTest(test_base.DatasetTestBase):
   def testAssertNext(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Map"])).map(lambda x: x)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testAssertNextInvalid(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Whoops"])).map(lambda x: x)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
         expected_error=(
@@ -48,6 +54,7 @@ class AssertNextDatasetTest(test_base.DatasetTestBase):
         optimization.assert_next(["Map", "Whoops"])).map(lambda x: x)
     options = dataset_ops.Options()
     options.experimental_autotune = False
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec7a85ae113d0d517434827e5dae64804861070a
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
@@ -0,0 +1,85 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental._ChooseFastestDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ChooseFastestDatasetTest(test_base.DatasetTestBase,
+                               parameterized.TestCase):
+
+  def testChooseFastestSimple(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3, 4])
+    merge = optimization._ChooseFastestDataset([dataset, dataset])
+    self.assertDatasetProduces(
+        merge,
+        expected_output=[0, 1, 2, 3, 4],
+        expected_shapes=dataset.output_shapes)
+
+  def testChooseFastestManyInputs(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3, 4])
+    merge = optimization._ChooseFastestDataset([dataset for _ in range(5)])
+    self.assertDatasetProduces(
+        merge,
+        expected_output=[0, 1, 2, 3, 4],
+        expected_shapes=dataset.output_shapes)
+
+  def testChooseFastest(self):
+    dataset = dataset_ops.Dataset.range(600)
+    f = lambda x: 2 * x
+    dataset_a = dataset.batch(50).map(f)
+    dataset_b = dataset.map(f).batch(50)
+    merge = optimization._ChooseFastestDataset([dataset_a, dataset_b])
+    self.assertDatasetProduces(
+        merge,
+        expected_output=[
+            [i * 2 for i in range(j * 50, (j + 1) * 50)] for j in range(12)
+        ],
+        expected_shapes=dataset_a.output_shapes)
+
+  @parameterized.named_parameters(
+      ("Shapes", [0], [[1, 2, 3]], "must have compatible output shapes."),
+      ("Types", [0], [0.0], "must have the same output types."),
+      ("NumComponents", [0], ([0], [1]), "must have the same output types."),
+      ("Cardinality", [1, 2, 3], [1], "must have compatible cardinalities."))
+  def testChooseFastestErrorWithIncompatibleInput(self, slices_a, slices_b,
+                                                  error_msg):
+    dataset_a = dataset_ops.Dataset.from_tensor_slices(slices_a)
+    dataset_b = dataset_ops.Dataset.from_tensor_slices(slices_b)
+
+    # The error is raised at dataset creation time.
+    if context.executing_eagerly():
+      with self.assertRaises(errors.InvalidArgumentError):
+        merge = optimization._ChooseFastestDataset([dataset_a, dataset_b])
+    else:
+      merge = optimization._ChooseFastestDataset([dataset_a, dataset_b])
+      self.assertDatasetProduces(
+          merge, expected_error=(errors.InvalidArgumentError, error_msg))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
index 3ce921b5efe9e870fe1c5fb6406736f8bbb9c09f..525ae2c54e41e68869964de9d2997b41c3ca8585 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
@@ -71,6 +71,7 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.filter_fusion = True
     dataset = dataset.with_options(options)
     expected_output = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index f080891f2e783f0cbe2f6f6f8fb4bfa1ff726745..08a44e572b899c7f79af09c5a17448c9cd75a8b7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -22,7 +22,6 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -65,12 +64,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _testDataset(self, dataset):
     previous_result = 0
-    if context.executing_eagerly():
-      iterator = dataset.__iter__()
-      get_next = iterator._next_internal  # pylint: disable=protected-access
-    else:
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      get_next = iterator.get_next
+    get_next = self.getNext(dataset)
     for _ in range(5):
       result = self.evaluate(get_next())
       self.assertLessEqual(1, result)
@@ -91,6 +85,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
             ["Zip[0]", "Map"] if will_optimize else ["Map"])).map(function)
 
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
@@ -107,6 +102,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(["Zip[0]", "Map"])).map(random_with_capture)
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index 8af86da852169eae992c0bad92ae8acbbdff5bb6..4fd982d12278232eaa65e8269f49c816823566ba 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -35,6 +35,7 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
              "LatencyStats"])).map(lambda x: x * x).prefetch(1)
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_stats.latency_all_edges = True
     options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
index 2386dd5f116d660eb93213c935b662c05d90011d..d79ae4387c868d4821ac65787ba0bc04d47cc7d3 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
@@ -34,6 +34,7 @@ class MakeNumaAwareTest(test_base.DatasetTestBase):
             batching.map_and_batch(lambda x: x * x, 10))
     options = dataset_ops.Options()
     options.experimental_numa_aware = True
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset, expected_output=[[x * x for x in range(10)]])
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
index e2ff3116eccf2ccfb7ed72085f4727a1e0262164..dc7bb9d6a37b8da3bfe983ea3cf8c74dbe16ee86 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
@@ -31,6 +31,10 @@ class MapAndBatchFusionTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
             ["MapAndBatch"])).map(lambda x: x * x).batch(10)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset, expected_output=[[x * x for x in range(10)]])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index fa1d673065d6b5e8e473fd72680a92f0f07e7d65..7b0cc569734b9bf14b210e3a637334bdb950c503 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -83,6 +83,7 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(
             ["Map", "FilterByLastComponent"])).map(function).filter(predicate)
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
@@ -101,6 +102,7 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(["Map",
                                   "Filter"])).map(function).filter(predicate)
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
index defdaf044001ec4b6129987c82c0c626825fce95..b3a7304b4e498fbcae01efc85281d3437061155e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
@@ -74,6 +74,7 @@ class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_fusion = True
     dataset = dataset.with_options(options)
     expected_output = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index d8dd31fee8b0bc66bcaf92dffe6b0a89d29d668f..60649cd3ede8ed5f5d13857c9182f6fc912325c5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -22,12 +22,13 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -42,36 +43,58 @@ def _map_parallelization_test_cases():
     with ops.control_dependencies([assert_op]):
       return x
 
-  def random(_):
-    return random_ops.random_uniform([],
-                                     minval=0,
-                                     maxval=10,
-                                     dtype=dtypes.int64,
-                                     seed=42)
-
-  def assert_with_random(x):
-    x = assert_greater(x)
-    return random(x)
-
-  return (("Identity", identity, True), ("Increment", increment, True),
-          ("AssertGreater", assert_greater, True), ("Random", random, False),
-          ("AssertWithRandom", assert_with_random, False))
+  return (("Identity", identity, True),
+          ("Increment", increment, True),
+          ("AssertGreater", assert_greater, True))
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_map_parallelization_test_cases())
-  def testMapParallelization(self, function, should_optimize):
-    next_nodes = ["ParallelMap"] if should_optimize else ["Map"]
+  def testMapParallelization(self, function, should_be_parallel):
+    next_nodes = ["ParallelMap"] if should_be_parallel else ["Map"]
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(next_nodes)).map(function)
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_parallelization = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset, expected_output=[function(x) for x in range(5)])
+
+  def testMapParallelizationWithCapturedConstant(self):
+    """Tests that functions with captured constants are parallelized."""
+
+    captured_t = constant_op.constant(42, dtype=dtypes.int64)
+    def fn(x):
+      return x + captured_t
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["ParallelMap"])).map(fn)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_parallelization = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset, expected_output=[x + 42 for x in range(5)])
+
+  def testMapParallelizationWithCapturedVariable(self):
+    """Tests that functions with captured variables are not parallelized."""
+
+    captured_t = variables.Variable(42, dtype=dtypes.int64)
+    def fn(x):
+      return x + captured_t
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map"])).map(fn)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_parallelization = True
     dataset = dataset.with_options(options)
-    if should_optimize:
-      self.assertDatasetProduces(
-          dataset, expected_output=[function(x) for x in range(5)])
+    self.evaluate(variables.global_variables_initializer())
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[x + 42 for x in range(5)],
+        requires_initialization=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index 65fa2bac171e87eba0f5c61bb1c7d11966572e11..5e748a78ef4cdbce9b9cf6570b8954b4cf94b684 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
@@ -225,6 +226,10 @@ def _generate_csv_test_case():
 
 
 def _generate_parse_single_example_test_case():
+  # When sparse tensors are used, map_vectorization is not
+  # attempted because the output_shapes of the map dataset are not defined.
+  # TODO(rachelim): Consider being more lax with checking the output_shapes of
+  # the map node.
 
   def parse_example_factory():
 
@@ -243,8 +248,6 @@ def _generate_parse_single_example_test_case():
                     feature={
                         "dense_int": _int64_feature(i),
                         "dense_str": _bytes_feature(str(i)),
-                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
-                        "sparse_str": _bytes_feature(*["abc"] * i)
                     })).SerializeToString() for i in range(10)
         ]))
 
@@ -252,8 +255,6 @@ def _generate_parse_single_example_test_case():
     features = {
         "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64, 0),
         "dense_str": parsing_ops.FixedLenFeature((), dtypes.string, ""),
-        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
-        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
     }
     return parsing_ops.parse_single_example(x, features)
 
@@ -349,6 +350,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       dataset = dataset.map(map_fn, num_parallel_calls)
       dataset = dataset.batch(100)
       options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
       options.experimental_optimization.map_and_batch_fusion = False
       dataset = dataset.with_options(options)
       return dataset
@@ -357,6 +359,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     optimized = _make_dataset(["Batch", map_node_name]
                               if expect_optimized else [map_node_name, "Batch"])
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_vectorization = True
     optimized = optimized.with_options(options)
     return unoptimized, optimized
@@ -368,23 +371,21 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                      num_parallel_calls)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationBadMapFn(self):
+  def testOptimizationBadMapFn(self):
     # Test map functions that give an error
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
       return array_ops.gather(x, 10)
-
-    base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
-        5, drop_remainder=True)
-    _, optimized = self._get_test_datasets(base_dataset, map_fn)
-    nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
+      base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
+          5, drop_remainder=True)
+      _, optimized = self._get_test_datasets(base_dataset, map_fn)
+      nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
       self.evaluate(nxt)
 
   def testOptimizationWithCapturedInputs(self):
-    # Tests that vectorization works with captured inputs
+    # Tests that vectorization works with captured inputs.
     y = constant_op.constant(1, shape=(2,))
     z = constant_op.constant(2, shape=(2,))
 
@@ -397,6 +398,84 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=True)
     self.assertDatasetsEqual(optimized, unoptimized)
 
+  def testOptimizationWithMapAndBatchFusion(self):
+    # Tests that vectorization works on fused map and batch.
+    y = constant_op.constant(1, shape=(2,))
+    z = constant_op.constant(2, shape=(2,))
+
+    def map_fn(x):
+      return x, y, z
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
+                                                           [3, 4]]).repeat(5)
+    base_dataset = base_dataset.with_options(options)
+
+    def _make_dataset(node_names):
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = dataset.apply(batching.map_and_batch(map_fn, 100))
+      return dataset
+
+    unoptimized = _make_dataset(["MapAndBatch"])
+    optimized = _make_dataset(["Batch", "ParallelMap"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = optimized.with_options(options)
+    self.assertDatasetsEqual(optimized, unoptimized)
+
+  @parameterized.named_parameters(
+      ("1", True, True),
+      ("2", True, False),
+      ("3", False, True),
+      ("4", False, False),
+  )
+  def testOptimizationWithChainedMapAndBatch(self, fuse_first, fuse_second):
+    # Tests that vectorization works on chained map and batch functions.
+    def map_fn(x):
+      return x * 2
+
+    unoptimized_seq = []
+
+    def make_apply_fn(is_fused):
+      if is_fused:
+        unoptimized_seq.append("MapAndBatch")
+
+        def apply_fn(dataset):
+          return dataset.apply(
+              batching.map_and_batch(map_fn, 2, 12, drop_remainder=True))
+
+        return apply_fn
+      else:
+        unoptimized_seq.extend(["ParallelMap", "Batch"])
+
+        def apply_fn(dataset):
+          return dataset.map(map_fn, 12).batch(2, drop_remainder=True)
+
+        return apply_fn
+
+    base_dataset = dataset_ops.Dataset.range(1000)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    base_dataset = base_dataset.with_options(options)
+
+    apply_fn_1 = make_apply_fn(fuse_first)
+    apply_fn_2 = make_apply_fn(fuse_second)
+
+    def make_dataset(node_names):
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = apply_fn_1(dataset)
+      dataset = apply_fn_2(dataset)
+      return dataset
+
+    unoptimized = make_dataset(unoptimized_seq)
+    optimized = make_dataset(["Batch", "ParallelMap", "Batch", "ParallelMap"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = optimized.with_options(options)
+
+    self.assertDatasetsEqual(optimized, unoptimized)
+
   # TODO(b/117581999): Add eager coverage for the following tests.
   def testSkipEagerOptimizationIgnoreStateful(self):
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
index 0f0274b41f2da1add8b2361b54e5c32a5974da41..5c1ae7a98a2326f61518b1550d0678da50e78401 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -23,10 +23,11 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-# TODO(b/117581999): Add eager coverage for the following tests.
+@test_util.run_all_in_graph_and_eager_modes
 class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testAutotuneOption(self):
@@ -35,15 +36,13 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(["Model"]))
     options = dataset_ops.Options()
     options.experimental_autotune = True
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
+    get_next = self.getNext(dataset)
 
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    self.assertEqual(0, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index 8058f53eea240831545444286fb2c6aa404e240a..74f620e37d5659bf4d2989d7a8a0b5d8359a91af 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -41,6 +41,10 @@ class NoopEliminationTest(test_base.DatasetTestBase):
             ["FiniteRepeat", "FiniteSkip", "Prefetch", "MemoryCacheImpl"]))
     dataset = dataset.repeat(some_tensor).skip(5).take(-1).skip(0).repeat(
         1).prefetch(0).prefetch(1).cache()
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=range(5))
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index c111567c1c5ed5c0bc1cbadfb06eead1e1a49350..bcd027ebbbc07fa4fd9458cc979293c9c417c9cf 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -106,15 +106,19 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testOptimizationStatefulFunction(self):
     dataset = dataset_ops.Dataset.range(
         10).map(lambda _: random_ops.random_uniform([])).batch(10)
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
     self.evaluate(get_next())
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
+  # TODO(b/123300735): Add eager coverage for the following tests.
   def testSkipEagerOptimizationLargeInputFromTensor(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
     dataset = dataset_ops.Dataset.from_tensors(input_t)
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -127,7 +131,9 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
     dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -147,7 +153,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset_ops.Dataset.range(1)
     dataset = dataset.flat_map(flat_map_fn)
-    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationNestedDatasetWithModifiedRetval(self):
@@ -163,10 +172,8 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(1)
     dataset = dataset.flat_map(flat_map_fn)
 
-    # TODO(b/120558523): We use Options instead of _OptimizeDataset directly
-    # here because of a bug with chaining _OptimizeDatasets when there are
-    # nested dataset functions
     options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_and_batch_fusion = True
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[[0]])
@@ -179,7 +186,9 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         threadpool.PrivateThreadPool(
             2, display_name="private_thread_pool_%d" % 2))
 
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
         expected_output=[list(range(10))],
@@ -193,14 +202,20 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
     dataset = dataset.skip(0)  # Should be removed by noop elimination
     dataset = dataset.cache()
-    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationNonSerializableAsDirectInput(self):
     """Tests that non-serializable dataset can be OptimizeDataset's input."""
     dataset = dataset_ops.Dataset.from_tensors(0)
     dataset = dataset.apply(optimization.non_serializable())
-    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   @parameterized.named_parameters(_generate_captured_refvar_test_cases())
@@ -217,6 +232,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       unoptimized_dataset = dataset_fn(variable)
 
       options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
       options.experimental_optimization.noop_elimination = True
       options.experimental_optimization.map_and_batch_fusion = True
       optimized_dataset = unoptimized_dataset.with_options(options)
@@ -265,10 +281,15 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     only explicitly enabled optimizations will be applied.
     """
     options = dataset_ops.Options()
-    options.experimental_optimization.hoist_random_uniform = True
     options.experimental_optimization.apply_default_optimizations = False
-    expected_optimizations = ["hoist_random_uniform"]
-    self.assertEqual(options._static_optimizations(), expected_optimizations)
+    options.experimental_optimization.hoist_random_uniform = True
+    options.experimental_optimization.noop_elimination = True
+    expected_optimizations = [
+        "hoist_random_uniform",
+        "noop_elimination",
+    ]
+    self.assertEqual(
+        set(options._static_optimizations()), set(expected_optimizations))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
index 594b59375febbba6c939dc5429ff59fe9c971a5f..824cc680abb9e574f77a544edb6e7fffa9a064c7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
@@ -31,6 +31,10 @@ class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
   def testShuffleAndRepeatFusion(self):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(["ShuffleAndRepeat"])).shuffle(10).repeat(2)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.shuffle_and_repeat_fusion = True
+    dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
 
     for _ in range(2):
diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index 76e0d4d72a6d22f24da9c762770d1592ba67b737..4dbb188f2cffa08ff47cb4bd85ea6d3672edd222 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -27,6 +27,7 @@ from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
@@ -671,8 +672,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
-  @test_util.run_deprecated_v1
-  def testSkipEagerSerializedShapeMismatch(self):
+  def testSerializedShapeMismatch(self):
     aname = "a"
     bname = "b"
     cname = "c"
@@ -695,19 +695,34 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     ]
 
     serialized = [m.SerializeToString() for m in original]
-    self._test(
-        ops.convert_to_tensor(serialized), {
-            aname:
-                parsing_ops.FixedLenSequenceFeature((2, 1),
-                                                    dtype=dtypes.float32,
-                                                    allow_missing=True,
-                                                    default_value=[]),
-            bname:
-                parsing_ops.FixedLenSequenceFeature(
-                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
-        },
-        expected_err=(ValueError,
-                      "Cannot reshape a tensor with 0 elements to shape"))
+    if context.executing_eagerly():
+      self._test(
+          ops.convert_to_tensor(serialized), {
+              aname:
+                  parsing_ops.FixedLenSequenceFeature((2, 1),
+                                                      dtype=dtypes.float32,
+                                                      allow_missing=True,
+                                                      default_value=[]),
+              bname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+          },
+          expected_err=(errors_impl.InvalidArgumentError,
+                        "Input to reshape is a tensor with 0 values"))
+    else:
+      self._test(
+          ops.convert_to_tensor(serialized), {
+              aname:
+                  parsing_ops.FixedLenSequenceFeature((2, 1),
+                                                      dtype=dtypes.float32,
+                                                      allow_missing=True,
+                                                      default_value=[]),
+              bname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+          },
+          expected_err=(ValueError,
+                        "Cannot reshape a tensor with 0 elements to shape"))
 
   @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index 87a91415b08097c40a60937b4d970cc63183c23e..ddac02b9e29fc54efd962d9697be66cd7e756354 100644
--- a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -27,7 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-# TODO(b/117581999): Add eager coverage
+# TODO(b/117581999): Add eager specific test.
 class RestructuredDatasetTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index f5ac0f500746f69f0d91eda5d93f9a967c429aa1..38e9b1e128157e4ff284ae0065ee474b20bad86c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -70,9 +70,7 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertEqual(5, self.evaluate(next_element()))
     self.assertEqual(8, self.evaluate(next_element()))
 
-  # TODO(b/117581999): Add coverage for eager.
-  @test_util.run_deprecated_v1
-  def testSkipEagerSparseCount(self):
+  def testSparseCount(self):
 
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 4a2e28f49649ea698e9d426d86dae4bb42cdebf9..4fd2a2ec4bfc4ca44b1b421bbe00ebf16bc55936 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -93,6 +93,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_dataset_serialization_test",
+    size = "small",
+    srcs = ["choose_fastest_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "concatenate_dataset_serialization_test",
     size = "small",
@@ -666,6 +684,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "take_while_dataset_serialization_test",
+    size = "small",
+    srcs = ["take_while_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:take_while_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "textline_dataset_serialization_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..936dc2221490d32eb978cf3fe96de13b53b57f99
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_dataset_serialization_test.py
@@ -0,0 +1,45 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ZipDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class ChooseFastestDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+    num_outputs = 10
+    batch_size = 2
+
+    def build_ds():
+      dataset = dataset_ops.Dataset.range(num_outputs)
+      map_fn = lambda x: x * 2
+      return optimization._ChooseFastestDataset([  # pylint: disable=protected-access
+          dataset.map(map_fn).batch(batch_size),
+          dataset.batch(batch_size).map(map_fn)
+      ])
+
+    self.run_core_tests(build_ds, None, num_outputs // 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/take_while_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/take_while_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..47899eab68cbe41ad0dcb7f4daddabda0071d488
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/take_while_dataset_serialization_test.py
@@ -0,0 +1,44 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the TakeWhileDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import take_while_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class TakeWhileDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase,
+    parameterized.TestCase):
+
+  def _build_dataset(self, num_elements, upper_bound):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        take_while_ops.take_while(lambda x: x < upper_bound))
+
+  @parameterized.parameters((23, 10, 7), (10, 50, 0), (25, 30, 25))
+  def testCore(self, num_elem1, num_elem2, upper_bound):
+    self.run_core_tests(lambda: self._build_dataset(num_elem1, upper_bound),
+                        lambda: self._build_dataset(num_elem2, upper_bound),
+                        upper_bound)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index b80aab994e1754faccde5653de9149f32a5f862c..f5a15f4c848c536ac07636469ea1f8b762bd317e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -104,9 +104,7 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
         self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
             summary_str, "::execution_time", float(i + 1))
       self._assertSummaryContains(summary_str,
-                                  dataset_name + "::num_parallel_calls")
-      self._assertSummaryContains(summary_str,
-                                  dataset_name + "::active_parallel_calls")
+                                  dataset_name + "::thread_utilization")
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
     if function_processing_time:
diff --git a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..25ad6c7750e75ab92f8bb81c31ad4d60fea9a871
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py
@@ -0,0 +1,103 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.take_while()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import take_while_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.parameters((14, 2), (15, 2), (100, 3))
+  def testTakeWhileDataset(self, num_elements, window_size):
+
+    def _predicate_func(elem):
+      return array_ops.shape(elem)[0] > (window_size - 1)
+
+    take_while = take_while_ops.take_while(_predicate_func)
+
+    dataset = dataset_ops.Dataset.range(num_elements).batch(window_size)
+    dataset = dataset.apply(take_while).flat_map(
+        dataset_ops.Dataset.from_tensor_slices)
+
+    expected_num_elements = int(num_elements / window_size) * window_size
+    self.assertDatasetProduces(dataset, np.arange(expected_num_elements))
+
+  @parameterized.parameters((10, 2, False), (16, 7, False), (100, 99, False),
+                            (100, 101, True), (0, 1, True))
+  def testTakeWhileDatasetRange(self, num_elements, upper_bound, out_of_bounds):
+    dataset = dataset_ops.Dataset.range(num_elements).apply(
+        take_while_ops.take_while(lambda x: x < upper_bound))
+
+    if out_of_bounds:
+      with self.assertRaises(errors.OutOfRangeError):
+        self.assertDatasetProduces(dataset, np.arange(upper_bound))
+
+    else:
+      self.assertDatasetProduces(dataset, np.arange(upper_bound))
+
+  def testTakeWhileDatasetString(self):
+
+    def not_equal(string):
+      return lambda x: math_ops.not_equal(x, constant_op.constant(string))
+
+    string = ["this", "is", "the", "test", "for", "strings"]
+    dataset = dataset_ops.Dataset.from_tensor_slices(string).apply(
+        take_while_ops.take_while(not_equal("test")))
+
+    next_element = self.getNext(dataset)
+    self.assertEqual(b"this", self.evaluate(next_element()))
+    self.assertEqual(b"is", self.evaluate(next_element()))
+    self.assertEqual(b"the", self.evaluate(next_element()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.assertEqual(b"test", self.evaluate(next_element()))
+
+  @parameterized.parameters((5, 3), (10, 0), (100, 5), (8, 7))
+  def testTakewhileDatasetShortCircuit(self, size, index):
+
+    def _predicate_func(data_elem):
+      return data_elem
+
+    boolean_array = [True] * size
+    boolean_array[index] = False
+    dataset = dataset_ops.Dataset.from_tensor_slices(boolean_array).apply(
+        take_while_ops.take_while(_predicate_func))
+
+    next_element = self.getNext(dataset)
+
+    for _ in range(index):
+      self.assertTrue(self.evaluate(next_element()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index e4034cc43a0cbc6cd0c35595a8a4ca944ca4d07e..613fe0da6b3d3db81a969a3cea261f238951fab4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -68,9 +68,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertDatasetProduces(
         data, [(i, compat.as_bytes(str(i)), i) for i in range(10)])
 
-  # TODO(b/117581999): Add eager coverage.
-  @test_util.run_deprecated_v1
-  def testSkipEagerUnbatchDatasetWithSparseTensor(self):
+  def testUnbatchDatasetWithSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
         values=list(range(10)),
@@ -79,20 +77,12 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    next_element = iterator.get_next()
-
-    for i in range(10):
-      st_row = self.evaluate(next_element)
-      self.assertEqual([i], st_row.indices)
-      self.assertEqual([i], st_row.values)
-      self.assertEqual([10], st_row.dense_shape)
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(next_element)
+    expected_output = [
+        sparse_tensor.SparseTensorValue([[i]], [i], [10]) for i in range(10)
+    ]
+    self.assertDatasetProduces(data, expected_output=expected_output)
 
-  # TODO(b/117581999): Add eager coverage.
-  @test_util.run_deprecated_v1
-  def testSkipEagerUnbatchDatasetWithDenseAndSparseTensor(self):
+  def testUnbatchDatasetWithDenseAndSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
         values=list(range(10)),
@@ -101,16 +91,9 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    next_element = self.getNext(data)
-
-    for i in range(10):
-      dense_elem, st_row = self.evaluate(next_element())
-      self.assertEqual(i, dense_elem)
-      self.assertEqual([i], st_row.indices)
-      self.assertEqual([i], st_row.values)
-      self.assertEqual([10], st_row.dense_shape)
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(next_element())
+    expected_output = [(i, sparse_tensor.SparseTensorValue([[i]], [i], [10]))
+                       for i in range(10)]
+    self.assertDatasetProduces(data, expected_output=expected_output)
 
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
@@ -150,7 +133,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       data.apply(batching.unbatch())
 
-  # TODO(b/117581999): eager mode doesnt capture raised error, debug.
+  # Note: dynamic shape mismatch is graph specific test.
   @test_util.run_deprecated_v1
   def testSkipEagerUnbatchDynamicShapeMismatch(self):
     ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 60c20e0bcf2d875a15ffcc4c42d10cb6e0cc25ea..56bf59344f8881d96525c197268ad9dac988166a 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -354,6 +354,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "take_while_ops",
+    srcs = ["take_while_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_library(
     name = "threading_options",
     srcs = ["threading_options.py"],
@@ -454,6 +467,7 @@ py_library(
         ":shuffle_ops",
         ":sleep",
         ":stats_ops",
+        ":take_while_ops",
         ":threadpool",
         ":unique",
         ":writers",
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 2435f0cfdb77ba607c90db66af499780288c324b..4e83acf6bbadc065adae1a6fe3da81bc6ff19d0e 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -130,7 +130,8 @@ def bucket_by_sequence_length(element_length_func,
                               padded_shapes=None,
                               padding_values=None,
                               pad_to_bucket_boundary=False,
-                              no_padding=False):
+                              no_padding=False,
+                              drop_remainder=False):
   """A transformation that buckets elements in a `Dataset` by length.
 
   Elements of the `Dataset` are grouped together by length and then are padded
@@ -160,6 +161,10 @@ def bucket_by_sequence_length(element_length_func,
       any elements with length longer than `max(bucket_boundaries)`.
     no_padding: `bool`, indicates whether to pad the batch features (features
       need to be either of type `tf.SparseTensor` or of same shape).
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in the case it has fewer than
+      `batch_size` elements; the default behavior is not to drop the smaller
+      batch.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -209,7 +214,7 @@ def bucket_by_sequence_length(element_length_func,
       """Batch elements in dataset."""
       batch_size = window_size_fn(bucket_id)
       if no_padding:
-        return grouped_dataset.batch(batch_size)
+        return grouped_dataset.batch(batch_size, drop_remainder=drop_remainder)
       none_filler = None
       if pad_to_bucket_boundary:
         err_msg = ("When pad_to_bucket_boundary=True, elements must have "
@@ -227,7 +232,8 @@ def bucket_by_sequence_length(element_length_func,
       shapes = make_padded_shapes(
           padded_shapes or grouped_dataset.output_shapes,
           none_filler=none_filler)
-      return grouped_dataset.padded_batch(batch_size, shapes, padding_values)
+      return grouped_dataset.padded_batch(
+          batch_size, shapes, padding_values, drop_remainder=drop_remainder)
 
     def _apply_fn(dataset):
       return dataset.apply(
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index 22a36646ea486fe8fe6e76b2e8ce262aa058e1c8..984c820b17fcb2743b955f3fd3f6bbd0b1ba0860 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -129,3 +129,50 @@ class _NonSerializableDataset(dataset_ops.UnaryUnchangedStructureDataset):
             self._input_dataset._variant_tensor,  # pylint: disable=protected-access
             **dataset_ops.flat_structure(self)))
     super(_NonSerializableDataset, self).__init__(input_dataset, variant_tensor)
+
+
+class _ChooseFastestDataset(dataset_ops.DatasetV2):
+  """A `Dataset` that merges two input datasets."""
+
+  def __init__(self, datasets, num_experiments=10):
+    """Chooses the fastest of some input datasets.
+
+    Given input datasets, produces elements as quickly as the fastest of the
+    inputs. Note that this dataset assumes that input datasets have the same
+    elements in the same order, though this is not enforced besides checking
+    that the input datasets have compatible output types, output shapes, and
+    cardinality at runtime. The resulting dataset produces elements that are
+    identical to the input elements, and in the same order.
+
+    Note that the time to first iteration is longer when this dataset is used
+    due to the overhead of dynamically picking the faster dataset. Namely,
+    for the first num_experiments iterations, this dataset will pull from all
+    of its inputs simultaneously in order to determine which input is the
+    fastest. For all subsequent iterations, that input will be used.
+
+    Args:
+      datasets: A list of `Datasets` that all have the same elements in the same
+        order.
+      num_experiments: The number of experiments to run before deciding which
+        dataset is fastest. In each "experiment" iteration, the dataset will
+        call from all its inputs simultaneously, and update its knowledge of
+        which input is the fastest.
+
+    Returns:
+      A `Dataset` that has the same elements the inputs.
+    """
+    self._datasets = list(datasets)
+    self._structure = self._datasets[0]._element_structure  # pylint: disable=protected-access
+    variant_tensor = (
+        gen_experimental_dataset_ops.experimental_choose_fastest_dataset(
+            [dataset._variant_tensor for dataset in self._datasets],  # pylint: disable=protected-access
+            num_experiments=num_experiments,
+            **dataset_ops.flat_structure(self)))
+    super(_ChooseFastestDataset, self).__init__(variant_tensor)
+
+  def _inputs(self):
+    return self._datasets
+
+  @property
+  def _element_structure(self):
+    return self._datasets[0]._element_structure  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
index 41a819d94bb88384c89cbc9b3eb0d4dc59575e0e..be1fb4c7cacdbfffa43fa801e2f30d9e1d16ade9 100644
--- a/tensorflow/python/data/experimental/ops/optimization_options.py
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -33,7 +33,7 @@ class OptimizationOptions(options.OptionsBase):
   ```python
   options = tf.data.Options()
   options.experimental_optimization.map_vectorization = True
-  options.apply_default_optimizations = False
+  options.experimental_optimization.apply_default_optimizations = False
   dataset = dataset.with_options(options)
   ```
   """
@@ -104,18 +104,21 @@ class OptimizationOptions(options.OptionsBase):
 
   def _static_optimizations(self):
     """Produces the list of enabled static optimizations."""
-    result = []
-    optimizations_to_enable = [
+    result = set()
+    all_optimizations = [
         "filter_fusion",
         "hoist_random_uniform",
+        "map_and_batch_fusion",
         "map_and_filter_fusion",
-        "map_fusion",
         "map_parallelization",
+        "map_fusion",
         "map_vectorization",
+        "noop_elimination",
+        "shuffle_and_repeat_fusion",
     ]
-    for optimization in optimizations_to_enable:
+    for optimization in all_optimizations:
       if getattr(self, optimization):
-        result.append(optimization)
+        result.add(optimization)
 
     if self.apply_default_optimizations is not False:
       # The following optimizations are turned on by default, unless the
@@ -127,5 +130,5 @@ class OptimizationOptions(options.OptionsBase):
       ]
       for optimization in optimizations_to_disable:
         if getattr(self, optimization) is not False:
-          result.append(optimization)
-    return result
+          result.add(optimization)
+    return sorted(list(result))
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index d5fcc033ab7df34369e0680275df744c431ed069..3e4c66be27018d25d4877d26ac565b4500633d0d 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -45,7 +45,7 @@ class StatsAggregator(object):
 
   # Apply `StatsOptions` to associate `dataset` with `aggregator`.
   options = dataset_ops.Options()
-  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  options.experimental_stats.aggregator = aggregator
   dataset = dataset.with_options(options)
   ```
 
diff --git a/tensorflow/python/data/experimental/ops/take_while_ops.py b/tensorflow/python/data/experimental/ops/take_while_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f27a84edcc7306f5b8c7ec6866315c2490118a6
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/take_while_ops.py
@@ -0,0 +1,72 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""take-while dataset transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure as structure_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class _TakeWhileDataset(dataset_ops.UnaryUnchangedStructureDataset):
+  """A dataset that stops iteration when `predicate` returns false."""
+
+  def __init__(self, input_dataset, predicate):
+    """See `take_while()` for details."""
+
+    self._input_dataset = input_dataset
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        predicate,
+        "tf.data.experimental.take_while()",
+        dataset=self._input_dataset)
+
+    if not wrapped_func.output_structure.is_compatible_with(
+        structure_lib.TensorStructure(dtypes.bool, [])):
+      raise ValueError("`predicate` must return a scalar boolean tensor.")
+
+    self._predicate = wrapped_func
+    var_tensor = gen_experimental_dataset_ops.experimental_take_while_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        other_arguments=self._predicate.function.captured_inputs,
+        predicate=self._predicate.function,
+        **dataset_ops.flat_structure(self))
+    super(_TakeWhileDataset, self).__init__(input_dataset, var_tensor)
+
+  def _functions(self):
+    return [self._predicate]
+
+
+@tf_export("data.experimental.take_while")
+def take_while(predicate):
+  """A transformation that stops dataset iteration based on a `predicate`.
+
+  Args:
+    predicate: A function that maps a nested structure of tensors (having shapes
+      and types defined by `self.output_shapes` and `self.output_types`) to a
+      scalar `tf.bool` tensor.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _TakeWhileDataset(dataset, predicate)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 737ba28cebd6b885c71ba53da3130b2d6abf6ee1..b134a8deaef4463c7dd38e4aafedc7f92bf306d6 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -287,6 +287,7 @@ tf_py_test(
     size = "small",
     srcs = ["iterator_cluster_test.py"],
     additional_deps = [
+        "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -405,6 +406,7 @@ cuda_py_test(
     srcs = ["multi_device_iterator_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:multi_device_iterator_ops",
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
index 8193dffc7d2b467086c6c0f003a50a4345d8c7a8..f319b24bee87d127cda11f84c75fa295a1cb67c3 100644
--- a/tensorflow/python/data/kernel_tests/dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -209,7 +209,6 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertEqual(2, inputs.count(ds2))
     self.assertEqual(1, inputs.count(ds3))
 
-  # TODO(b/119882922): use-after-free bug in eager mode.
   # pylint: disable=g-long-lambda
   @parameterized.named_parameters(
       ("Tensor", lambda: constant_op.constant(37.0),
@@ -233,8 +232,7 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
        optional_ops.OptionalStructure(
            structure.TensorStructure(dtypes.float32, []))),
   )
-  def testSkipEagerDatasetStructure(self, tf_value_fn,
-                                    expected_element_structure):
+  def testDatasetStructure(self, tf_value_fn, expected_element_structure):
     dataset = dataset_ops.Dataset.from_tensors(0).map(lambda _: tf_value_fn())
     dataset_structure = structure.Structure.from_value(dataset)
     self.assertIsInstance(dataset_structure, dataset_ops.DatasetStructure)
@@ -268,6 +266,7 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           round_trip_dataset, [self.evaluate(tf_value_fn())],
           requires_initialization=True)
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerSameGraphErrorOneShot(self):
     dataset = dataset_ops.Dataset.range(10)
@@ -275,6 +274,7 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
         dataset = dataset.batch(2)
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerSameGraphErrorOneShotSimple(self):
     dataset = dataset_ops.Dataset.range(10)
@@ -285,6 +285,7 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
             str(mock_log.call_args), "Please ensure that all datasets in the "
             "pipeline are created in the same graph as the iterator.")
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerSameGraphErrorInitializable(self):
     dataset = dataset_ops.Dataset.range(10)
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
index ef608ebb67007c7605e7bea36058d0cd5c5d146f..546c2fb2ed3f7584001e9aa2dbeb93ac82ca7709 100644
--- a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -29,11 +29,11 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
+# NOTE: deprecated method in V2, no eager coverage added.
 class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
-  def testSkipEagerFromSparseTensorSlices(self):
+  def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
     iterator = dataset_ops.make_initializable_iterator(
diff --git a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
index 9a480e56789aee9198fc88201f0eecb2c2eaab52..72db6387718712b97442eb3f7ddc3befcbbf6a12 100644
--- a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
@@ -53,7 +53,7 @@ class FromTensorSlicesTest(test_base.DatasetTestBase):
     with self.assertRaises(errors.OutOfRangeError):
       results = self.evaluate(get_next())
 
-  def testSkipEagerFromTensorSlicesSparse(self):
+  def testFromTensorSlicesSparse(self):
     """Test a dataset that represents the slices from a tuple of tensors."""
     components = (sparse_tensor.SparseTensorValue(
         indices=np.array([[0, 0], [1, 0], [2, 0]]),
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
index ab3c15263fdaa0829686f90450e0e79081299a2e..82ccdebc7ff7adec439791f205c30e3011afa996 100644
--- a/tensorflow/python/data/kernel_tests/from_tensors_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -50,7 +50,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
 
     self.assertDatasetProduces(dataset, expected_output=[components])
 
-  def testSkipEagerFromTensorsSparse(self):
+  def testFromTensorsSparse(self):
     """Test a dataset that represents a single tuple of tensors."""
     components = (sparse_tensor.SparseTensorValue(
         indices=np.array([[0]]),
@@ -224,6 +224,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
     self.assertEquals(dtypes.int64, get_next().dtype)
     self.assertEquals([3], get_next().shape)
 
+  # TODO(b/121264236): needs mechanism for multiple device in eager mode.
   def testSkipEagerSplitPipelineFailsWithPlacementError(self):
     with session.Session(
         target="",
diff --git a/tensorflow/python/data/kernel_tests/interleave_test.py b/tensorflow/python/data/kernel_tests/interleave_test.py
index 4fb61b2daf125ce08a3ba99d81c7721a7fb6dc0a..4b427ff5a4173d73171400a2d3f36cbdfd416cdd 100644
--- a/tensorflow/python/data/kernel_tests/interleave_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_test.py
@@ -17,19 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
@@ -78,47 +74,6 @@ def _interleave(lists, cycle_length, block_length):
           break
 
 
-def _make_coordinated_sloppy_dataset(input_values, cycle_length, block_length,
-                                     num_parallel_calls):
-  """Produces a dataset iterator and events to control the order of elements.
-
-  Args:
-    input_values: the values to generate lists to interleave from
-    cycle_length: the length of the interleave cycle
-    block_length: the length of the interleave block
-    num_parallel_calls: the degree of interleave parallelism
-
-  Returns:
-    A dataset iterator (represented as `get_next` op) and events that can be
-    used to control the order of output elements.
-  """
-
-  # Set up threading events used to sequence when items are produced that
-  # are subsequently interleaved. These events allow us to deterministically
-  # simulate slowdowns and force sloppiness.
-  coordination_events = {i: threading.Event() for i in input_values}
-
-  def map_py_fn(x):
-    coordination_events[x].wait()
-    coordination_events[x].clear()
-    return x * x
-
-  def map_fn(x):
-    return script_ops.py_func(map_py_fn, [x], x.dtype)
-
-  def interleave_fn(x):
-    dataset = dataset_ops.Dataset.from_tensors(x)
-    dataset = dataset.repeat(x)
-    return dataset.map(map_fn)
-
-  options = dataset_ops.Options()
-  options.experimental_deterministic = False
-  dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
-      2).interleave(interleave_fn, cycle_length, block_length,
-                    num_parallel_calls).with_options(options)
-  return dataset, coordination_events
-
-
 def _repeat(values, count):
   """Produces a list of lists suitable for testing interleave.
 
@@ -252,63 +207,37 @@ class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.evaluate(get_next())
 
   @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 2, 1, 1),
-      ("2", np.int64([4, 5, 6]), 2, 1, 2),
-      ("3", np.int64([4, 5, 6]), 2, 3, 1),
-      ("4", np.int64([4, 5, 6]), 2, 3, 2),
-      ("5", np.int64([4, 5, 6]), 3, 2, 1),
-      ("6", np.int64([4, 5, 6]), 3, 2, 2),
-      ("7", np.int64([4, 5, 6]), 3, 2, 3),
-      ("8", np.int64([4, 0, 6]), 2, 3, 1),
-      ("9", np.int64([4, 0, 6]), 2, 3, 2),
+      ("1", np.int64([4, 5, 6]), 1, 3, 1),
+      ("2", np.int64([4, 5, 6]), 2, 1, 1),
+      ("3", np.int64([4, 5, 6]), 2, 1, 2),
+      ("4", np.int64([4, 5, 6]), 2, 3, 1),
+      ("5", np.int64([4, 5, 6]), 2, 3, 2),
+      ("6", np.int64([4, 5, 6]), 7, 2, 1),
+      ("7", np.int64([4, 5, 6]), 7, 2, 3),
+      ("8", np.int64([4, 5, 6]), 7, 2, 5),
+      ("9", np.int64([4, 5, 6]), 7, 2, 7),
+      ("10", np.int64([4, 0, 6]), 2, 3, 1),
+      ("11", np.int64([4, 0, 6]), 2, 3, 2),
   )
-  def testSloppyInterleaveInOrder(self, input_values, cycle_length,
+  def testSloppyInterleaveDataset(self, input_values, cycle_length,
                                   block_length, num_parallel_calls):
-    dataset, coordination_events = _make_coordinated_sloppy_dataset(
-        input_values, cycle_length, block_length, num_parallel_calls)
-    options = dataset_ops.Options()
-    options.experimental_threading = threading_options.ThreadingOptions()
-    options.experimental_threading.private_threadpool_size = (
-        num_parallel_calls + 1)
-    dataset = dataset.with_options(options)
-
-    get_next = self.getNext(dataset, requires_initialization=True)
-    for expected_element in _interleave(
-        _repeat(input_values, 2), cycle_length, block_length):
-      coordination_events[expected_element].set()
-      self.assertEqual(expected_element * expected_element,
-                       self.evaluate(get_next()))
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next())
-
-  @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 2, 1, 2),
-      ("2", np.int64([4, 5, 6]), 2, 3, 2),
-      ("3", np.int64([4, 5, 6]), 3, 2, 3),
-      ("4", np.int64([4, 0, 6]), 2, 3, 2),
-  )
-  def testSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
-                                     block_length, num_parallel_calls):
-    dataset, coordination_events = _make_coordinated_sloppy_dataset(
-        input_values, cycle_length, block_length, num_parallel_calls)
+    count = 2
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        count).interleave(
+            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+            cycle_length, block_length, num_parallel_calls)
     options = dataset_ops.Options()
-    options.experimental_threading = threading_options.ThreadingOptions()
-    options.experimental_threading.private_threadpool_size = (
-        num_parallel_calls + 1)
+    options.experimental_deterministic = False
     dataset = dataset.with_options(options)
-    get_next = self.getNext(dataset, requires_initialization=True)
-    elements = [
-        x for x in _interleave(
-            _repeat(input_values, 2), cycle_length, block_length)
+    expected_output = [
+        element for element in _interleave(
+            _repeat(input_values, count), cycle_length, block_length)
     ]
-    for i in [1, 4, 7]:
-      elements[i], elements[i + 1] = elements[i + 1], elements[i]
-
-    for element in elements:
-      coordination_events[element].set()
-      self.assertEqual(element * element, self.evaluate(get_next()))
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next())
+    get_next = self.getNext(dataset)
+    actual_output = []
+    for _ in range(len(expected_output)):
+      actual_output.append(self.evaluate(get_next()))
+    self.assertAllEqual(expected_output.sort(), actual_output.sort())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
index 20088234953b1cdc8f85381ded45cf22aa93c75a..23d3b6a439857e229ebd1b3298db1c29e2b09849 100644
--- a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib import lookup as lookup_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
@@ -31,7 +32,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index e84391946f0c322f90221aad800590c7b0317b36..97ab6b21bc27283d0e3630690b9d7cbf20b09b47 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -84,7 +84,7 @@ def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _buildMapDataset(self, components, count):
 
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 433ea620e1698d22a63716b18a6d5dadf1f06dff..2bc0d8f90df2c4ca2b3a43e4f206c4a39916052a 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -18,11 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -31,97 +34,109 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-# TODO(b/117581999): Add eager coverage.
-class MultiDeviceIteratorTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class MultiDeviceIteratorTest(test_base.DatasetTestBase,
+                              parameterized.TestCase):
 
-  @test_util.run_v1_only("b/120545219")
-  def testNoGetNext(self):
+  @parameterized.parameters(0, 1, 42,)
+  @test_util.run_v1_only("b/121264236")
+  def testInitOnly(self, num_inits):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
-      self.evaluate(multi_device_iterator.initializer)
+    with self.test_session(config=config):
+      for _ in range(num_inits):
+        self.evaluate(multi_device_iterator.initializer)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testOneOnSameDevice(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:0", "/cpu:1"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testRepeatDevices(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(20)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2", "/cpu:1", "/cpu:2"])
-    elements = multi_device_iterator.get_next()
-    elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 20, 4):
+        elements = multi_device_iterator.get_next()
+        elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
         self.assertEqual(i + 2, self.evaluate(elem_on_3))
         self.assertEqual(i + 3, self.evaluate(elem_on_4))
       with self.assertRaises(errors.OutOfRangeError):
+        elements = multi_device_iterator.get_next()
+        elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
         self.evaluate(elem_on_3)
         self.evaluate(elem_on_4)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testNotFullyDivisible(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
+      elem_on_1 = multi_device_iterator.get_next("/cpu:1")
       self.assertEqual(8, self.evaluate(elem_on_1))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testGetNextAsOptional(self):
+    if context.executing_eagerly():
+      return
+
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
@@ -154,26 +169,31 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"], max_buffer_size=4)
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1 = multi_device_iterator.get_next("/cpu:1")
         self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
+        elem_on_2 = multi_device_iterator.get_next("/cpu:2")
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
-  def testMultipleInitializations(self):
+  @test_util.run_v1_only("b/121264236")
+  def testMultipleInitializationsGraph(self):
+    if context.executing_eagerly():
+      return
+
     with ops.device("/cpu:0"):
       epoch = array_ops.placeholder(dtypes.int64, shape=[])
       dataset1 = dataset_ops.Dataset.from_tensors(epoch).repeat(1000)
@@ -191,6 +211,24 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.assertEqual([(i, 0), (i, 1)], self.evaluate([elem_on_1,
                                                           elem_on_2]))
 
+  @test_util.run_v1_only("b/121264236")
+  def testMultipleInitializationsEager(self):
+    self.skipTest("b/123023614")
+    if not context.executing_eagerly():
+      return
+
+    with ops.device("/cpu:0"):
+      dataset1 = dataset_ops.Dataset.range(1000)
+      dataset2 = dataset_ops.Dataset.range(1000)
+      dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    for _ in range(1000):
+      multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          dataset, ["/cpu:1", "/cpu:2"], prefetch_buffer_size=4)
+      elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+      self.assertEqual([(0, 0), (1, 1)], self.evaluate([elem_on_1, elem_on_2]))
+
+  @test_util.run_v1_only("b/121264236")
   def testBasicGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -198,18 +236,20 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/gpu:0"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/121264236")
   def testUnevenGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -217,21 +257,24 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/gpu:0"], max_buffer_size=4)
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1 = multi_device_iterator.get_next("/cpu:1")
         self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
+        elem_on_2 = multi_device_iterator.get_next("/gpu:0")
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/121264236")
   def testGetNextAsOptionalGpu(self):
-    if not test_util.is_gpu_available():
+    if not test_util.is_gpu_available() or context.executing_eagerly():
       self.skipTest("No GPU available")
 
     dataset = dataset_ops.Dataset.range(9)
@@ -266,7 +309,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testOptimization(self):
     dataset = dataset_ops.Dataset.range(10)
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
@@ -279,18 +322,21 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
 
 if __name__ == "__main__":
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={"CPU": 3, "GPU": 1}))
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
index ba5ee9b6613a1a82000ed41f90f595b3975bedb8..2269bb8724dba40f73ceef8797206adb513a2f60 100644
--- a/tensorflow/python/data/kernel_tests/optional_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -75,7 +75,6 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertAllEqual(expected.dense_shape,
                           self.evaluate(actual.dense_shape))
 
-  @test_util.run_deprecated_v1
   def testFromNone(self):
     value_structure = structure.TensorStructure(dtypes.float32, [])
     opt = optional_ops.Optional.none_from_structure(value_structure)
@@ -269,9 +268,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
        optional_ops.OptionalStructure(
            structure.TensorStructure(dtypes.float32, []))),
   )
-  @test_util.run_deprecated_v1
-  def testSkipEagerOptionalStructure(self, tf_value_fn,
-                                     expected_value_structure):
+  def testOptionalStructure(self, tf_value_fn, expected_value_structure):
     tf_value = tf_value_fn()
     opt = optional_ops.Optional.from_value(tf_value)
 
@@ -306,6 +303,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(
           self.evaluate(tf_value), self.evaluate(round_trip_opt.get_value()))
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @parameterized.named_parameters(
       ("Tensor", np.array([1, 2, 3], dtype=np.int32),
        lambda: constant_op.constant([4, 5, 6], dtype=dtypes.int32), True),
diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py
index dcfb2f507bf1a7d91041eb5f24c95c6de2c18362..042af7a6f9fb19b25fd9b01c509ed267833720f9 100644
--- a/tensorflow/python/data/kernel_tests/padded_batch_test.py
+++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py
@@ -156,6 +156,7 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     next_element = self.getNext(padded_dataset)
     self.evaluate(next_element())
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerPaddedBatchDatasetShapeSpecifications(self):
     int_placeholder = array_ops.placeholder(dtypes.int32)
@@ -228,6 +229,7 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       _ = dataset_ops.Dataset.range(10).padded_batch(
           5, padded_shapes=shape_as_tensor)
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerPaddedBatchShapeError(self):
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/data/kernel_tests/reduce_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
index 14bbc0bf72caa07445ca7d077845e2bc4569cc01..93acc1565fd34beb3b0be1eaf0408272c81effed 100644
--- a/tensorflow/python/data/kernel_tests/reduce_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -68,6 +68,7 @@ class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(((i + 1) * i) // 2, s)
       self.assertEqual(i, c)
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerSquareUsingPlaceholder(self):
     delta = array_ops.placeholder(dtype=dtypes.int64)
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 7fa9ea59e888a2a0e0afa02104276d9d92d4a371..104f99a3d4faba9ecd4340384cbe1d75b3e5ba5b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -970,8 +970,8 @@ class DatasetV2(object):
         shapes and types defined by `self.output_shapes` and
        `self.output_types`) to another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially. If the value
+        representing the number elements to process asynchronously in parallel.
+        If not specified, elements will be processed sequentially. If the value
         `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
         calls is set dynamically based on available CPU.
 
@@ -1733,7 +1733,7 @@ def make_one_shot_iterator(dataset):
 
 
 @tf_export(v1=["data.make_initializable_iterator"])
-def make_initializable_iterator(dataset):
+def make_initializable_iterator(dataset, shared_name=None):
   """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
 
   Note: The returned iterator will be in an uninitialized state,
@@ -1741,13 +1741,16 @@ def make_initializable_iterator(dataset):
 
   ```python
   dataset = ...
-  iterator = dataset.make_initializable_iterator()
+  iterator = tf.data.make_initializable_iterator(dataset)
   # ...
   sess.run(iterator.initializer)
   ```
 
   Args:
     dataset: A `tf.data.Dataset`.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+      shared under the given name across multiple sessions that share the
+      same devices (e.g. when using a remote server).
 
   Returns:
     A `tf.data.Iterator` over the elements of `dataset`.
@@ -1756,11 +1759,11 @@ def make_initializable_iterator(dataset):
     RuntimeError: If eager execution is enabled.
   """
   try:
-    # Call the defined `make_one_shot_iterator()` if there is one, because some
-    # datasets (e.g. for prefetching) override its behavior.
-    return dataset.make_initializable_iterator()
+    # Call the defined `make_initializable_iterator()` if there is one, because
+    # some datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_initializable_iterator(shared_name)
   except AttributeError:
-    return DatasetV1Adapter(dataset).make_initializable_iterator()
+    return DatasetV1Adapter(dataset).make_initializable_iterator(shared_name)
 
 
 @tf_export("data.Options")
@@ -3017,7 +3020,6 @@ class _ModelDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and models performance."""
 
   def __init__(self, input_dataset):
-    """See `optimize()` for details."""
     self._input_dataset = input_dataset
     variant_tensor = gen_dataset_ops.model_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -3029,7 +3031,6 @@ class _OptimizeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and applies optimizations."""
 
   def __init__(self, input_dataset, optimizations):
-    """See `optimize()` for details."""
     self._input_dataset = input_dataset
     if optimizations is None:
       optimizations = []
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index bfa256f8d77356d4d915e82b95786892bdc814dc..d6fb73813cd06e440d69f900e6b1076606a068c0 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -68,7 +68,7 @@ def _device_stack_is_empty():
 
 
 @tf_export(v1=["data.Iterator"])
-class Iterator(checkpointable.CheckpointableBase):
+class Iterator(checkpointable.Checkpointable):
   """Represents the state of iterating through a `Dataset`."""
 
   def __init__(self, iterator_resource, initializer, output_types,
@@ -491,7 +491,7 @@ def _generate_shared_name(prefix):
   return "{}{}".format(prefix, uid)
 
 
-class EagerIterator(checkpointable.CheckpointableBase):
+class EagerIterator(checkpointable.Checkpointable):
   """An iterator producing tf.Tensor objects from a tf.data.Dataset."""
 
   def __init__(self, dataset):
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 8192d5389175db5ffd94cb1986dfff15a1887a8d..e9a26807aebc373d059e3361621755f491ba95d1 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -28,14 +28,14 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 class _PerDeviceGenerator(dataset_ops.DatasetV2):
   """A `dummy` generator dataset."""
 
   def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
-               source_device, target_device, element_structure):
-    self._target_device = target_device
+               source_device, element_structure):
     self._structure = element_structure
 
     multi_device_iterator_string_handle = (
@@ -107,15 +107,14 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
     )
     self._finalize_captured_args = self._finalize_func.captured_inputs
 
-    with ops.device(self._target_device):
-      variant_tensor = gen_dataset_ops.generator_dataset(
-          self._init_captured_args,
-          self._next_captured_args,
-          self._finalize_captured_args,
-          init_func=self._init_func,
-          next_func=self._next_func,
-          finalize_func=self._finalize_func,
-          **dataset_ops.flat_structure(self))
+    variant_tensor = gen_dataset_ops.generator_dataset(
+        self._init_captured_args,
+        self._next_captured_args,
+        self._finalize_captured_args,
+        init_func=self._init_func,
+        next_func=self._next_func,
+        finalize_func=self._finalize_func,
+        **dataset_ops.flat_structure(self))
     super(_PerDeviceGenerator, self).__init__(variant_tensor)
 
   def _inputs(self):
@@ -128,13 +127,7 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
 
 
 class MultiDeviceIterator(object):
-  """An iterator over multiple devices.
-
-  @compatibility(eager)
-  MultiDeviceIterator isn't currently supported in Eager mode but support is
-  coming soon.
-  @end_compatibility
-  """
+  """An iterator over multiple devices."""
 
   def __init__(self,
                dataset,
@@ -155,10 +148,6 @@ class MultiDeviceIterator(object):
     Raises:
       RuntimeError: If run in Eager mode.
     """
-    if context.executing_eagerly():
-      # TODO(rohanj): Fix this. Tracking bug: b/116467184
-      raise RuntimeError("MultiDeviceIterator is not currently supported in "
-                         "Eager mode.")
     self._dataset = dataset._apply_options()  # pylint: disable=protected-access
     self._devices = devices
     self._source_device = source_device
@@ -166,12 +155,23 @@ class MultiDeviceIterator(object):
 
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
+      # TODO(b/121378567): Get rid of this shared_name hack.
+      shared_name = ""
+      if context.executing_eagerly():
+        # Ensure a unique name when eager execution is enabled to avoid spurious
+        # sharing issues.
+        shared_name += str(ops.uid())
       self._multi_device_iterator_resource = (
           gen_dataset_ops.multi_device_iterator(
               devices=self._devices,
-              shared_name="",
+              shared_name=shared_name,
               container="",
               **dataset_ops.flat_structure(dataset)))
+      if context.executing_eagerly():
+        # Delete the resource when this object is deleted
+        self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+            handle=self._multi_device_iterator_resource,
+            handle_device=self._source_device)
 
       # The incarnation ID is used to ensure consistency between the per-device
       # iterators and the multi-device iterator.
@@ -187,27 +187,36 @@ class MultiDeviceIterator(object):
     # Create the per device iterators.
     self._device_iterators = []
     for i, device in enumerate(self._devices):
-      ds = _PerDeviceGenerator(
-          i, self._multi_device_iterator_resource, self._incarnation_id,
-          self._source_device_tensor, device, dataset._element_structure)  # pylint: disable=protected-access
-      if prefetch_buffer_size > 0:
-        ds = ds.prefetch(prefetch_buffer_size)
-      # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
-      # non-CPU devices.
-      options = dataset_ops.Options()
-      options.experimental_autotune = False
-      options.experimental_optimization.apply_default_optimizations = False
-      ds = ds.with_options(options)
       with ops.device(device):
-        self._device_iterators.append(
-            dataset_ops.make_initializable_iterator(ds))
+        ds = _PerDeviceGenerator(
+            i, self._multi_device_iterator_resource, self._incarnation_id,
+            self._source_device_tensor, dataset._element_structure)  # pylint: disable=protected-access
+        if prefetch_buffer_size > 0:
+          ds = ds.prefetch(prefetch_buffer_size)
+        # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+        # non-CPU devices.
+        options = dataset_ops.Options()
+        options.experimental_autotune = False
+        options.experimental_optimization.apply_default_optimizations = False
+        ds = ds.with_options(options)
+        if context.executing_eagerly():
+          self._device_iterators.append(dataset_ops.make_one_shot_iterator(ds))
+        else:
+          self._device_iterators.append(
+              dataset_ops.make_initializable_iterator(ds))
+
+    if not context.executing_eagerly():
+      device_iterator_initializers = [
+          iterator.initializer for iterator in self._device_iterators
+      ]
+      self._initializer = control_flow_ops.group(*device_iterator_initializers)
+
+  def get_next(self, device=None):
+    """Returns the next element given a `device`, else returns all in a list."""
+    if device is not None:
+      index = self._devices.index(device)
+      return self._device_iterators[index].get_next()
 
-    device_iterator_initializers = [
-        iterator.initializer for iterator in self._device_iterators
-    ]
-    self._initializer = control_flow_ops.group(*device_iterator_initializers)
-
-  def get_next(self):
     result = []
     for i, device in enumerate(self._devices):
       with ops.device(device):
@@ -224,6 +233,8 @@ class MultiDeviceIterator(object):
 
   @property
   def initializer(self):
+    if context.executing_eagerly():
+      return control_flow_ops.no_op()
     return self._initializer
 
   @property
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 1dcdb880f553422c53cd8323ff888dc2e1c60719..27a700f813cf0fd3896a85fd799b02776672795c 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -19,6 +19,7 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "py_binary")
+load("//tensorflow:tensorflow.bzl", "if_not_v2")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
@@ -406,9 +407,10 @@ py_library(
         ":debug_errors",
         ":debug_fibonacci",
         ":debug_keras",
+    ] + if_not_v2([
         ":debug_mnist",
         ":debug_tflearn_iris",
-    ],
+    ]),
 )
 
 py_binary(
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 02957b2fefbc0e58a9338a16e641ccb729e14ecc..36e85571267913ca462ff063a5ad42a2034f15d2 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -124,6 +124,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":device_util",
+        ":numpy_dataset",
         ":reduce_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -184,9 +185,6 @@ py_test(
     name = "distribute_coordinator_test",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":distribute_coordinator",
         "//tensorflow/core:protos_all_py",
@@ -219,7 +217,9 @@ py_library(
         ":cross_device_ops",
         ":device_util",
         ":distribute_lib",
+        ":input_lib",
         ":multi_worker_util",
+        ":numpy_dataset",
         ":reduce_util",
         ":shared_variable_creator",
         ":values",
@@ -241,6 +241,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "parameter_server_strategy",
+    srcs = ["parameter_server_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":input_lib",
+        ":mirrored_strategy",
+        ":numpy_dataset",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -253,6 +276,49 @@ py_library(
     ],
 )
 
+py_library(
+    name = "numpy_dataset",
+    srcs = ["numpy_dataset.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "numpy_dataset_test",
+    size = "small",
+    srcs = ["numpy_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":numpy_dataset",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "input_lib",
+    srcs = ["input_lib.py"],
+    deps = [
+        ":device_util",
+        ":distribute_lib",
+        ":input_ops",
+        ":values",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "input_ops",
     srcs = ["input_ops.py"],
@@ -278,16 +344,12 @@ cuda_py_test(
         "//tensorflow/python:io_ops",
         "//tensorflow/python:util",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 py_test(
     name = "multi_worker_util_test",
     srcs = ["multi_worker_util_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
     deps = [
         ":multi_worker_util",
         "//tensorflow/core:protos_all_py",
@@ -348,14 +410,12 @@ py_library(
     deps = [
         ":device_util",
         ":distribute_lib",
-        ":input_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:base",
         "@six_archive//:six",
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
index 73188bd7caaeb8f60e1e19dc11ce20e0a4349433..8cc7cff6394d28236e6594f30bf53f698f3fa950 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
@@ -56,13 +56,13 @@ class ClusterResolver(object):
   underlying machine failures and scale TensorFlow worker clusters up and down.
 
   Note to Implementors: In addition to these abstract methods, you must also
-  implement the task_type, task_index, and rpc_layer attributes. You may choose
+  implement the task_type, task_id, and rpc_layer attributes. You may choose
   to implement them either as properties with getters or setters or directly
   set the attributes.
 
   - task_type is the name of the server's current named job (e.g. 'worker',
      'ps' in a distributed parameterized training job).
-  - task_index is the ordinal index of the server within the task type.
+  - task_id is the ordinal index of the server within the task type.
   - rpc_layer is the protocol used by TensorFlow to communicate with other
       TensorFlow servers in a distributed environment.
   """
@@ -84,12 +84,12 @@ class ClusterResolver(object):
     raise NotImplementedError()
 
   @abc.abstractmethod
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Retrieves the name or URL of the session master.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
@@ -103,7 +103,7 @@ class ClusterResolver(object):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     """Returns the number of accelerator cores per worker.
@@ -113,7 +113,7 @@ class ClusterResolver(object):
     should return 0. This method will query the master for this information
     if it is not otherwise known.
 
-    Optionally, we allow callers to specify the task_type, task_index, and
+    Optionally, we allow callers to specify the task_type, task_id, and
     rpc_layer, if they want to target a specific TensorFlow process to query
     the number of accelerators. This is to support heterogenous environments,
     where the number of accelerators cores per host is different.
@@ -121,14 +121,14 @@ class ClusterResolver(object):
     Args:
       task_type: (Optional) The type of the TensorFlow task of the machine we
         want to query.
-      task_index: (Optional) The index of the TensorFlow task of the machine we
+      task_id: (Optional) The index of the TensorFlow task of the machine we
         want to query.
       accelerator_type: (Optional) The type of accelerator we are trying to
         query (defaults to 'GPU').
       config_proto: (Optional) Configuration for starting a new session to
         query how many accelerator cores it has.
     """
-    master = self.master(task_type, task_index)
+    master = self.master(task_type, task_id)
     devices = get_accelerator_devices(master, config_proto)
     return sum(1 for d in devices if d.device_type == accelerator_type)
 
@@ -141,14 +141,14 @@ class ClusterResolver(object):
 class SimpleClusterResolver(ClusterResolver):
   """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
 
-  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
+  def __init__(self, cluster_spec, master='', task_type=None, task_id=None,
                environment='', num_accelerators=0,
                rpc_layer=None):
     """Creates a SimpleClusterResolver from a ClusterSpec."""
     super(SimpleClusterResolver, self).__init__()
 
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._environment = environment
     self._num_accelerators = num_accelerators
     self._rpc_layer = rpc_layer
@@ -165,22 +165,22 @@ class SimpleClusterResolver(ClusterResolver):
     """Returns the ClusterSpec passed into the constructor."""
     return self._cluster_spec
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC used by distributed TensorFlow.
 
     Returns:
       The name or URL of the session master.
 
-    If a task_type and task_index is given, this will override the `master`
+    If a task_type and task_id is given, this will override the `master`
     string passed into the initialization function.
     """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
     else:
       master = self._master
 
@@ -191,16 +191,16 @@ class SimpleClusterResolver(ClusterResolver):
     return self._task_type
 
   @property
-  def task_index(self):
-    return self._task_index
+  def task_id(self):
+    return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -208,7 +208,7 @@ class SimpleClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     """Returns the number of accelerator cores per worker.
@@ -220,12 +220,12 @@ class SimpleClusterResolver(ClusterResolver):
 
     Args:
       task_type: Unused.
-      task_index: Unused.
+      task_id: Unused.
       accelerator_type: Unused.
       config_proto: Unused.
     """
     # Unused
-    del task_type, task_index, accelerator_type, config_proto
+    del task_type, task_id, accelerator_type, config_proto
     return self._num_accelerators
 
   @property
@@ -259,7 +259,7 @@ class UnionClusterResolver(ClusterResolver):
         rpc_layer - (Optional) Override value for the RPC layer used by
           TensorFlow.
         task_type - (Optional) Override value for the current task type.
-        task_index - (Optional) Override value for the current task index.
+        task_id - (Optional) Override value for the current task index.
 
     Raises:
       TypeError: If any argument is not a subclass of `ClusterResolvers`.
@@ -269,7 +269,7 @@ class UnionClusterResolver(ClusterResolver):
 
     self._rpc_layer = kwargs.pop('rpc_layer', None)
     self._task_type = kwargs.pop('task_type', None)
-    self._task_index = kwargs.pop('task_index', None)
+    self._task_id = kwargs.pop('task_id', None)
 
     if kwargs:
       raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
@@ -357,22 +357,22 @@ class UnionClusterResolver(ClusterResolver):
 
     return ClusterSpec(merged_cluster)
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     This usually returns the master from the first ClusterResolver passed in,
-    but you can override this by specifying the task_type and task_index.
+    but you can override this by specifying the task_type and task_id.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
       return format_master_url(master, rpc_layer or self._rpc_layer)
 
     return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
@@ -382,16 +382,16 @@ class UnionClusterResolver(ClusterResolver):
     return self._task_type or self._cluster_resolvers[0].task_type
 
   @property
-  def task_index(self):
-    return self._task_index or self._cluster_resolvers[0].task_index
+  def task_id(self):
+    return self._task_id or self._cluster_resolvers[0].task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -399,11 +399,11 @@ class UnionClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     return self._cluster_resolvers[0].num_accelerators(
-        task_type, task_index, accelerator_type, config_proto)
+        task_type, task_id, accelerator_type, config_proto)
 
   @property
   def rpc_layer(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
index 0ff6b6be62122b3a7b71124613a694d9bb5fd357..c1eb29e2fc9bbb85e9580ce0f951412d0c09fd2c 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
@@ -33,7 +33,7 @@ class MockBaseClusterResolver(ClusterResolver):
   def cluster_spec(self):
     return None
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     return ""
 
   def environment(self):
@@ -117,12 +117,12 @@ class UnionClusterResolverTest(test.TestCase):
     })
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
-                                            task_index=1, environment="cloud",
+                                            task_id=1, environment="cloud",
                                             num_accelerators=8,
                                             rpc_layer="grpc")
 
     self.assertEqual(simple_resolver.task_type, "ps")
-    self.assertEqual(simple_resolver.task_index, 1)
+    self.assertEqual(simple_resolver.task_id, 1)
     self.assertEqual(simple_resolver.environment, "cloud")
     self.assertEqual(simple_resolver.num_accelerators(), 8)
     self.assertEqual(simple_resolver.rpc_layer, "grpc")
@@ -134,16 +134,16 @@ class UnionClusterResolverTest(test.TestCase):
     })
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
-                                            task_index=1, environment="cloud",
+                                            task_id=1, environment="cloud",
                                             num_accelerators=8,
                                             rpc_layer="grpc")
 
     simple_resolver.task_type = "worker"
-    simple_resolver.task_index = 2
+    simple_resolver.task_id = 2
     simple_resolver.rpc_layer = "http"
 
     self.assertEqual(simple_resolver.task_type, "worker")
-    self.assertEqual(simple_resolver.task_index, 2)
+    self.assertEqual(simple_resolver.task_id, 2)
     self.assertEqual(simple_resolver.rpc_layer, "http")
 
   def testSimpleOverrideMasterWithTaskIndexZero(self):
@@ -182,7 +182,7 @@ class UnionClusterResolverTest(test.TestCase):
         "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
     resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
-                                      task_index=1, environment="cloud",
+                                      task_id=1, environment="cloud",
                                       num_accelerators=8,
                                       rpc_layer="grpc")
 
@@ -191,24 +191,24 @@ class UnionClusterResolverTest(test.TestCase):
         "worker": ["worker3:2222", "worker4:2222", "worker5:2222"]
     })
     resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
-                                      task_index=2, environment="local",
+                                      task_id=2, environment="local",
                                       num_accelerators=16,
                                       rpc_layer="http")
 
     union_resolver = UnionClusterResolver(resolver1, resolver2)
 
     self.assertEqual(union_resolver.task_type, "ps")
-    self.assertEqual(union_resolver.task_index, 1)
+    self.assertEqual(union_resolver.task_id, 1)
     self.assertEqual(union_resolver.environment, "cloud")
     self.assertEqual(union_resolver.num_accelerators(), 8)
     self.assertEqual(union_resolver.rpc_layer, "grpc")
 
     union_resolver.task_type = "worker"
-    union_resolver.task_index = 2
+    union_resolver.task_id = 2
     union_resolver.rpc_layer = "http"
 
     self.assertEqual(union_resolver.task_type, "worker")
-    self.assertEqual(union_resolver.task_index, 2)
+    self.assertEqual(union_resolver.task_id, 2)
     self.assertEqual(union_resolver.rpc_layer, "http")
 
   def testTwoNonOverlappingJobMergedClusterResolver(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
index 06512613cbe34b09730dd7c6914ea9d7098204d5..305c53870de3f6f630e7285b6becd0dd0fcf6621 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
@@ -49,7 +49,7 @@ class GceClusterResolver(ClusterResolver):
                instance_group,
                port,
                task_type='worker',
-               task_index=0,
+               task_id=0,
                rpc_layer='grpc',
                credentials='default',
                service=None):
@@ -66,7 +66,7 @@ class GceClusterResolver(ClusterResolver):
       port: Port of the listening TensorFlow server (default: 8470)
       task_type: Name of the TensorFlow job this GCE instance group of VM
         instances belong to.
-      task_index: The task index for this particular VM, within the GCE
+      task_id: The task index for this particular VM, within the GCE
         instance group. In particular, every single instance should be assigned
         a unique ordinal index within an instance group manually so that they
         can be distinguished from each other.
@@ -85,7 +85,7 @@ class GceClusterResolver(ClusterResolver):
     self._zone = zone
     self._instance_group = instance_group
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._rpc_layer = rpc_layer
     self._port = port
     self._credentials = credentials
@@ -149,12 +149,12 @@ class GceClusterResolver(ClusterResolver):
     worker_list.sort()
     return ClusterSpec({self._task_type: worker_list})
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     task_type = task_type if task_type is not None else self._task_type
-    task_index = task_index if task_index is not None else self._task_index
+    task_id = task_id if task_id is not None else self._task_id
 
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
       if rpc_layer or self._rpc_layer:
         return '%s://%s' % (rpc_layer or self._rpc_layer, master)
       else:
@@ -167,8 +167,8 @@ class GceClusterResolver(ClusterResolver):
     return self._task_type
 
   @property
-  def task_index(self):
-    return self._task_index
+  def task_id(self):
+    return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
@@ -176,9 +176,9 @@ class GceClusterResolver(ClusterResolver):
         'You cannot reset the task_type of the GceClusterResolver after it has '
         'been created.')
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
index d4f0660c922d593d81c0927dea0d6271e89c53e1..07b9eeb08efac3e75dacf1f49adc456b8d354f7d 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
@@ -140,7 +140,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        task_index=0,
+        task_id=0,
         port=8470,
         credentials=None,
         service=self.standard_mock_service_client())
@@ -181,11 +181,11 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(name_to_ip))
 
-    gce_cluster_resolver.task_index = 1
+    gce_cluster_resolver.task_id = 1
     gce_cluster_resolver.rpc_layer = 'test'
 
     self.assertEqual(gce_cluster_resolver.task_type, 'testworker')
-    self.assertEqual(gce_cluster_resolver.task_index, 1)
+    self.assertEqual(gce_cluster_resolver.task_id, 1)
     self.assertEqual(gce_cluster_resolver.rpc_layer, 'test')
     self.assertEqual(gce_cluster_resolver.master(), 'test://10.2.3.4:8470')
 
@@ -201,13 +201,13 @@ class GceClusterResolverTest(test.TestCase):
         zone='us-east1-d',
         instance_group='test-instance-group',
         task_type='',
-        task_index=1,
+        task_id=1,
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(name_to_ip))
 
     self.assertEqual(gce_cluster_resolver.master(
-        task_type='', task_index=0), 'grpc://10.1.2.3:8470')
+        task_type='', task_id=0), 'grpc://10.1.2.3:8470')
 
   def testCustomJobNameAndPortRetrieval(self):
     gce_cluster_resolver = GceClusterResolver(
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
index 7ff6ec0f2d5c6f6d2315e98cf5e7250b118fbadd..71d48ed2b94370dfe9c80c5b698f987c83730273 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
@@ -88,31 +88,31 @@ class KubernetesClusterResolver(ClusterResolver):
     self._override_client = override_client
 
     self.task_type = None
-    self.task_index = None
+    self.task_id = None
     self.rpc_layer = rpc_layer
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
-    You must have set the task_type and task_index object properties before
-    calling this function, or pass in the `task_type` and `task_index`
+    You must have set the task_type and task_id object properties before
+    calling this function, or pass in the `task_type` and `task_id`
     parameters when using this function. If you do both, the function parameters
     will override the object properties.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    if task_type is not None and task_index is not None:
+    if task_type is not None and task_id is not None:
       return format_master_url(
-          self.cluster_spec().task_address(task_type, task_index),
+          self.cluster_spec().task_address(task_type, task_id),
           rpc_layer or self.rpc_layer)
 
     return ''
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
index a9750fa60b993a3504bbd01f0663cfdf868a2f01..f4e4cd82129a807cc62b81e7b7ac07d6b7c8d92c 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
@@ -119,9 +119,9 @@ class KubernetesClusterResolverTest(test.TestCase):
         override_client=_mock_kubernetes_client(
             {'job-name=tensorflow': ret}))
     cluster_resolver.task_type = 'worker'
-    cluster_resolver.task_index = 0
+    cluster_resolver.task_id = 0
     self.assertEqual(cluster_resolver.task_type, 'worker')
-    self.assertEqual(cluster_resolver.task_index, 0)
+    self.assertEqual(cluster_resolver.task_id, 0)
     self.assertEqual(cluster_resolver.master(), 'grpc://10.1.2.3:8470')
     self.assertEqual(cluster_resolver.master('worker', 2),
                      'grpc://10.1.2.5:8470')
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
index 9dbe25b613447fde2140585742d005dab82fb018..0ec566c670f65ef93bca12ce3f302b33afc0e297 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
@@ -112,7 +112,7 @@ class SlurmClusterResolver(ClusterResolver):
 
     self._auto_set_gpu = auto_set_gpu
     self.task_type = None
-    self.task_index = None
+    self.task_id = None
     self.rpc_layer = rpc_layer
 
     self._gpu_allocation = []
@@ -170,7 +170,7 @@ class SlurmClusterResolver(ClusterResolver):
 
       if cluster_rank_offset_start <= self._rank < cluster_rank_offset_end:
         self.task_type = task_type
-        self.task_index = self._rank - cluster_rank_offset_start
+        self.task_id = self._rank - cluster_rank_offset_start
 
       cluster_rank_offset_start = cluster_rank_offset_end
 
@@ -180,7 +180,7 @@ class SlurmClusterResolver(ClusterResolver):
     return ClusterSpec(self._cluster_allocation)
 
   def get_task_info(self):
-    """Returns job name and task_index for the process which calls this.
+    """Returns job name and task_id for the process which calls this.
 
     This returns the job name and task index for the process which calls this
     function according to its rank and cluster specification. The job name and
@@ -191,14 +191,14 @@ class SlurmClusterResolver(ClusterResolver):
       A string specifying job name the process belongs to and an integner
         specifying the task index the process belongs to in that job.
     """
-    return self.task_type, self.task_index
+    return self.task_type, self.task_id
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master string for connecting to a TensorFlow master.
 
     Args:
       task_type: (Optional) Overrides the default auto-selected task type.
-      task_index: (Optional) Overrides the default auto-slected task index.
+      task_id: (Optional) Overrides the default auto-slected task index.
       rpc_layer: (Optional) Overrides the default RPC protocol TensorFlow uses
         to communicate across nodes.
 
@@ -206,11 +206,11 @@ class SlurmClusterResolver(ClusterResolver):
       A connection string for connecting to a TensorFlow master.
     """
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    if task_type is not None and task_index is not None:
+    if task_type is not None and task_id is not None:
       return format_master_url(
-          self.cluster_spec().task_address(task_type, task_index),
+          self.cluster_spec().task_address(task_type, task_id),
           rpc_layer or self.rpc_layer)
 
     return ''
@@ -227,9 +227,9 @@ class SlurmClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     # Unused, since this is set in __init__ manually.
-    del task_type, task_index, accelerator_type, config_proto
+    del task_type, task_id, accelerator_type, config_proto
     return self._gpus_per_node
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
index 076539d16f17d64a9a28052960b61a5b99a7c9c6..c641fe60853a4b131cb6035c48e3d9f6ef9ddadf 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
@@ -83,7 +83,7 @@ class SlurmClusterResolverTest(test.TestCase):
         auto_set_gpu=False)
 
     slurm_cluster_resolver.task_type = 'worker'
-    slurm_cluster_resolver.task_index = 1
+    slurm_cluster_resolver.task_id = 1
     self.assertEqual(slurm_cluster_resolver.master(), 'grpc://t02n43:8888')
 
     slurm_cluster_resolver.rpc_layer = 'ab'
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
index 8d530cc15a035afcf2d3356599ed06e0b9d9a4cd..6dcbafbc5047e06d517754436a6ee95c1e69adc9 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -52,7 +52,7 @@ class TFConfigClusterResolver(ClusterResolver):
 
   def __init__(self,
                task_type=None,
-               task_index=None,
+               task_id=None,
                rpc_layer=None,
                environment=None):
     """Creates a new TFConfigClusterResolver.
@@ -60,14 +60,14 @@ class TFConfigClusterResolver(ClusterResolver):
     Args:
       task_type: (String, optional) Overrides the task type specified in the
         TF_CONFIG environment variable.
-      task_index: (Integer, optional) Overrides the task index specified in the
+      task_id: (Integer, optional) Overrides the task index specified in the
         TF_CONFIG environment variable.
       rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
       environment: (String, optional) Overrides the environment TensorFlow
         operates in.
     """
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._rpc_layer = rpc_layer
     self._environment = environment
 
@@ -80,20 +80,20 @@ class TFConfigClusterResolver(ClusterResolver):
       return self._task_type
 
   @property
-  def task_index(self):
+  def task_id(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
       return task_info['index'] if 'index' in task_info else None
     else:
-      return self._task_index
+      return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -121,13 +121,13 @@ class TFConfigClusterResolver(ClusterResolver):
       return ClusterSpec({})
     return ClusterSpec(tf_config['cluster'])
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a TensorFlow session.
 
     Args:
       task_type: (String, optional) Overrides and sets the task_type of the
         master.
-      task_index: (Integer, optional) Overrides and sets the task id of the
+      task_id: (Integer, optional) Overrides and sets the task id of the
         master.
       rpc_layer: (String, optional) Overrides and sets the protocol over which
         TensorFlow nodes communicate with each other.
@@ -155,7 +155,7 @@ class TFConfigClusterResolver(ClusterResolver):
     # We try to auto-detect the task type and id, but uses the user-supplied one
     # where available
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    return format_master_url(cluster_spec.task_address(task_type, task_index),
+    return format_master_url(cluster_spec.task_address(task_type, task_id),
                              self.rpc_layer)
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
index 36b3bb9c1e1a32960525f8cff7f852e204c72211..97a5eb685fa633f107aa8c8cafec9944f7812300 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -150,7 +150,7 @@ class TFConfigClusterResolverTest(test.TestCase):
 
     cluster_resolver = TFConfigClusterResolver()
     self.assertEqual('ps', cluster_resolver.task_type)
-    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(0, cluster_resolver.task_id)
     self.assertEqual('grpc', cluster_resolver.rpc_layer)
 
   def testParameterOverrides(self):
@@ -168,19 +168,19 @@ class TFConfigClusterResolverTest(test.TestCase):
     }
     """
 
-    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0)
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_id=0)
 
     self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
     self.assertEqual('ps', cluster_resolver.task_type)
-    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(0, cluster_resolver.task_id)
 
     cluster_resolver.task_type = 'worker'
-    cluster_resolver.task_index = 1
+    cluster_resolver.task_id = 1
     cluster_resolver.rpc_layer = 'test'
 
     self.assertEqual('test://worker1:2222', cluster_resolver.master())
     self.assertEqual('worker', cluster_resolver.task_type)
-    self.assertEqual(1, cluster_resolver.task_index)
+    self.assertEqual(1, cluster_resolver.task_id)
     self.assertEqual('test', cluster_resolver.rpc_layer)
 
   def testZeroItemsInClusterSpecMasterRead(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
index 529a4434127f35a2e5b88468af23cd12608a301e..d02e9b8fe8135f7aa8ecde59be901cb468a311fb 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -192,11 +192,12 @@ class TPUClusterResolver(ClusterResolver):
     for the IP addresses and ports of each Cloud TPU listed.
 
     Args:
-      tpu: Either a string, or a list of strings corresponding to the TPUs to
-        use. If the single string is the empty string, the string 'local', or a
-        string that begins with 'grpc://' or '/bns', then it is assumed to not
-        correspond with a Cloud TPU and will instead be passed as the session
-        master and no ClusterSpec propagation will be done.
+      tpu: A string corresponding to the TPU to use. If the string is the empty
+        string, the string 'local', or a string that begins with 'grpc://' or
+        '/bns', then it is assumed to not correspond with a Cloud TPU and will
+        instead be passed as the session master and no ClusterSpec propagation
+        will be done. In the future, this may also support a list of strings
+        when multiple Cloud TPUs are used.
       zone: Zone where the TPUs are located. If omitted or empty, we will assume
         that the zone of the TPU is the same as the zone of the GCE VM, which we
         will try to discover from the GCE metadata service.
@@ -253,10 +254,10 @@ class TPUClusterResolver(ClusterResolver):
       raise RuntimeError('You need to specify a TPU Name if you are running in '
                          'the Google Cloud environment.')
 
-    # By default the task_type is 'worker` and the task_index is 0 (which is the
+    # By default the task_type is 'worker` and the task_id is 0 (which is the
     # first worker in the task).
     self.task_type = job_name
-    self.task_index = 0
+    self.task_id = 0
 
     if tpu.startswith('grpc://'):
       # Cloud environment, where we are using GRPC to communicate to TPUs.
@@ -284,7 +285,7 @@ class TPUClusterResolver(ClusterResolver):
     # in later in self.master().
     if self.rpc_layer is not None and tpu.startswith(self.rpc_layer + '://'):
       tpu = tpu[len(self.rpc_layer + '://'):]
-      self._tpu = tpu
+      self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
       self._should_resolve_override = False
 
     # Whether we should actually attempt to contact Cloud APIs
@@ -326,7 +327,7 @@ class TPUClusterResolver(ClusterResolver):
     else:
       self._coordinator_address = coordinator_address
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Get the Master string to be used for the session.
 
     In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
@@ -340,7 +341,7 @@ class TPUClusterResolver(ClusterResolver):
     Args:
       task_type: (Optional, string) The type of the TensorFlow task of the
         master.
-      task_index: (Optional, integer) The index of the TensorFlow task of the
+      task_id: (Optional, integer) The index of the TensorFlow task of the
         master.
       rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to
         communicate with TPUs.
@@ -354,12 +355,12 @@ class TPUClusterResolver(ClusterResolver):
     if self._shouldResolve():
       # We are going to communicate with the Cloud TPU APIs to get a Cluster.
       cluster_spec = self.cluster_spec()
-      if task_type is not None and task_index is not None:
-        # task_type and task_index is from the function parameter
-        master = cluster_spec.task_address(task_type, task_index)
-      elif self.task_type is not None and self.task_index is not None:
-        # task_type and task_index is from the object
-        master = cluster_spec.task_address(self.task_type, self.task_index)
+      if task_type is not None and task_id is not None:
+        # task_type and task_id is from the function parameter
+        master = cluster_spec.task_address(task_type, task_id)
+      elif self.task_type is not None and self.task_id is not None:
+        # task_type and task_id is from the object
+        master = cluster_spec.task_address(self.task_type, self.task_id)
       else:
         # by default we take the first item in the cluster with the right name
         job_tasks = cluster_spec.job_tasks(self.task_type)
@@ -368,7 +369,7 @@ class TPUClusterResolver(ClusterResolver):
         master = job_tasks[0]
     else:
       if isinstance(self._tpu, (bytes, bytearray)):
-        master = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
+        master = compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR)[0]
       else:
         master = self._tpu.split(_ENDPOINTS_SEPARATOR)[0]
     return format_master_url(master, rpc_layer or self.rpc_layer)
@@ -377,7 +378,8 @@ class TPUClusterResolver(ClusterResolver):
     return self.master()
 
   def get_job_name(self):
-    if self._shouldResolve():
+    if (self._shouldResolve() or
+        self._tpu.startswith(compat.as_bytes('grpc://'))):
       return self.task_type
 
   def cluster_spec(self):
@@ -437,7 +439,7 @@ class TPUClusterResolver(ClusterResolver):
         return None
       # Case 2.
       tpus = []
-      for tpu in self._tpu.split(_ENDPOINTS_SEPARATOR):
+      for tpu in compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR):
         # We are working around the fact that GKE environment variable that is
         # supplied to us has the protocol string embedded in it, but we want
         # to strip it out for the ClusterSpec.
@@ -456,7 +458,7 @@ class TPUClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='TPU',
                        config_proto=None):
     """Returns the number of TPU cores per worker.
@@ -467,7 +469,7 @@ class TPUClusterResolver(ClusterResolver):
 
     Args:
       task_type: Unused.
-      task_index: Unused.
+      task_id: Unused.
       accelerator_type: Unused.
       config_proto: Used to create a connection to a TPU master in order to
         retrieve the system metadata.
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
index 58c332a5098d34cca361e0920ce0a22d12cc0ffd..0cb0dc607fba442e4b6742d5e383b4d0ea29e8cf 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
@@ -459,8 +459,7 @@ class TPUClusterResolverTest(test.TestCase):
   def testNoCallComputeMetadata(self):
     resolver = cluster_resolver.TPUClusterResolver(
         tpu='/bns/foo/bar')
-    self.assertEqual(
-        compat.as_bytes('/bns/foo/bar'), resolver.master())
+    self.assertEqual('/bns/foo/bar', resolver.master())
     self.assertEqual(None, resolver.cluster_spec())
 
   def testGkeEnvironmentForDonut(self):
@@ -576,12 +575,12 @@ class TPUClusterResolverTest(test.TestCase):
     self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
     resolver.task_type = 'worker'
-    resolver.task_index = 3
+    resolver.task_id = 3
     self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')
 
     self.assertEqual(
         resolver.master(
-            task_type='worker', task_index=2, rpc_layer='test'),
+            task_type='worker', task_id=2, rpc_layer='test'),
         'test://10.2.3.6:8470')
 
   def testGetDeviceDictAndCoresWithTPUs(self):
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index 9575301d975e8ab797a0a9a79575b7f9bcbbb314..9729302c6dc1e22772c1a80a25eff17720c50994 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -323,6 +323,9 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
     assert check_destinations(destinations)
     devices = get_devices_from(destinations)
     reduce_to_device = self.reduce_to_device or devices[0]
+    logging.log_first_n(
+        logging.INFO,
+        "Reduce to %s then broadcast to %r." % (reduce_to_device, devices), 10)
     reduced = _simple_reduce(per_replica_value, reduce_to_device,
                              self.accumulation_fn, reduce_op)
     return self.broadcast(reduced, destinations)
@@ -839,9 +842,6 @@ class CollectiveAllReduce(CrossDeviceOps):
     if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
 
     all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     device_map, logical_device = get_device_map_from(destinations)
@@ -865,9 +865,6 @@ class CollectiveAllReduce(CrossDeviceOps):
     if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
 
     all_devices_match = _all_devices_match(value_destination_pairs)
     if all_devices_match:
@@ -886,9 +883,6 @@ class CollectiveAllReduce(CrossDeviceOps):
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All-reduce across all workers in a batch."""
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution with collective ops is not supported yet.")
 
     logging.log_first_n(
         logging.INFO, "Collective All-reduce invoked with batches size = %d, "
@@ -949,12 +943,9 @@ def _has_dgx1_like_links(gpu_links):
 
 def _choose_all_reduce_algorithm(device_links):
   if _has_dgx1_like_links(device_links):
-    logging.info("Configured hierarchical_copy with num_packs=%d",
-                 len(device_links))
     return AllReduceCrossDeviceOps(
         "hierarchical_copy", num_packs=len(device_links))
   else:
-    logging.info("Configured nccl all-reduce.")
     return AllReduceCrossDeviceOps("nccl", num_packs=1)
 
 
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index 5b4b3a6f978c0fd15bd5a67c3b47a46e5ae0e357..e8066dd467c285c50cb39b98450f5150756d6db9 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -23,6 +23,8 @@ import threading
 
 from tensorflow.python.distribute import all_reduce
 from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -353,15 +355,25 @@ def build_collective_reduce(input_tensors,
   num_devices = len(devices)
   group_key = collective_keys.get_group_key(devices)
   instance_key = collective_keys.get_instance_key()
-  out_tensors = []
   subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
-  for d in range(num_devices):
-    with ops.device(devices[d]):
-      reduce_op = collective_ops.all_reduce(
-          input_tensors[d], group_size, group_key, instance_key, reduction_op,
-          unary_op, subdiv_offsets)
-      out_tensors.append(reduce_op)
-  return out_tensors
+
+  def collective_all_reduce():
+    """Call collective allreduce."""
+    assert not context.executing_eagerly()
+    out_tensors = []
+    for d in range(num_devices):
+      with ops.device(devices[d]):
+        reduce_op = collective_ops.all_reduce(
+            input_tensors[d], group_size, group_key, instance_key, reduction_op,
+            unary_op, subdiv_offsets)
+        out_tensors.append(reduce_op)
+    return out_tensors
+
+  if context.executing_eagerly():
+    # Collective ops will block unless they are executed concurrently such as in
+    # a graph or a defun.
+    collective_all_reduce = def_function.function(collective_all_reduce)
+  return collective_all_reduce()
 
 
 def sum_grad_and_var_all_reduce(grad_and_vars,
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index 78c995a57823c5ad274eebd52f39dcad81a67e19..70c6409a221e4228707abfe4fac45e24f5bcb967 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -210,8 +210,8 @@ class _WorkerContext(object):
       ValueError: if `worker_barrier` is not passed to the __init__ method.
     """
     if not self._worker_barrier:
-      raise ValueError("`worker_barrier is not set in the worker context.` \t" +
-                       self._debug_message())
+      # TODO(yuefengz): we should throw an error in independent worker mode.
+      return
     self._worker_barrier.wait()
 
   def session_creator(self,
@@ -721,7 +721,8 @@ def run_distribute_coordinator(worker_fn,
 
   Returns:
     In the client job, return the value returned by `worker_fn` if
-    it is in-graph replication; return None otherwise.
+    it is in-graph replication or INDEPENDENT_WORKER mode; return None
+    otherwise.
   """
   tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
   if not cluster_spec:
@@ -819,19 +820,19 @@ def run_distribute_coordinator(worker_fn,
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
       if strategy.extended.experimental_between_graph:
         # All jobs run `worker_fn` if between-graph.
-        _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
-                           task_id, session_config, rpc_layer)
+        return _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
+                                  task_id, session_config, rpc_layer)
       else:
         # Only one node runs `worker_fn` if in-graph.
         context = _WorkerContext(strategy, cluster_spec, task_type, task_id)
         if context.is_chief:
-          _run_single_worker(worker_fn, strategy, cluster_spec, None, None,
-                             session_config, rpc_layer)
+          return _run_single_worker(worker_fn, strategy, cluster_spec, None,
+                                    None, session_config, rpc_layer)
         else:
           server.join()
     elif task_type == _TaskType.EVALUATOR:
-      _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type,
-                         task_id, session_config, rpc_layer)
+      return _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type,
+                                task_id, session_config, rpc_layer)
     else:
       if task_type != _TaskType.PS:
         raise ValueError("Unexpected task_type: %r" % task_type)
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 9c6bcea4cdface0ee596b593c71a310a1c845d40..2cc99b3db4619d86d18507d1beb1adb6d42c528b 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -26,6 +26,7 @@ import enum
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context as eager_context
 from tensorflow.python.framework import constant_op
@@ -33,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses_impl
@@ -208,12 +210,14 @@ class _SameScopeAgainContext(object):
 # TODO(yuefengz): add more replication modes.
 @tf_export("distribute.InputReplicationMode")
 class InputReplicationMode(enum.Enum):
-  """Replication mode for input function."""
+  """Replication mode for input function.
 
-  # The input function will be called on each worker independently, creating as
-  # many input pipelines as number of workers. Replicas will dequeue from the
-  # local Dataset on their worker. Distribution Strategy doesn't manage any
-  # state sharing between such separate input pipelines.
+  * `PER_WORKER`: The input function will be called on each worker
+    independently, creating as many input pipelines as number of workers.
+    Replicas will dequeue from the local Dataset on their worker.
+    `tf.distribute.Strategy` doesn't manage any state sharing between such
+    separate input pipelines.
+  """
   PER_WORKER = "PER_WORKER"
 
 
@@ -329,37 +333,8 @@ class DistributionStrategy(object):
     """DEPRECATED: use extended.colocate_vars_with() instead."""
     return self._extended.colocate_vars_with(colocate_with_variable)
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED
-  def distribute_dataset(self, dataset_fn):
-    """Return a `dataset` split across all replicas.  DEPRECATED.
-
-    DEPRECATED: Please use `make_dataset_iterator` or
-    `make_input_fn_iterator` instead.
-
-    Suitable for providing input to `extended.call_for_each_replica()` by
-    creating an iterator:
-
-    ```
-    def dataset_fn():
-      return tf.data.Dataset.from_tensors([[1.]]).repeat()
-
-    with strategy.scope():
-      distributed_dataset = strategy.distribute_dataset(dataset_fn)
-      iterator = distributed_dataset.make_initializable_iterator()
-      replica_results = strategy.extended.call_for_each_replica(
-          replica_fn, args=(iterator.get_next(),))
-    ```
-
-    Args:
-      dataset_fn: A function that returns a `tf.data.Dataset`.
-
-    Returns:
-      A `PerReplicaDataset` that will produce data for each replica.
-    """
-    return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
-
   def make_dataset_iterator(self, dataset):
-    """Makes an iterator for input provided via input_dataset.
+    """Makes an iterator for input provided via `dataset`.
 
     Data from the given dataset will be distributed evenly across all the
     compute replicas. We will assume that the input dataset is batched by the
@@ -388,28 +363,36 @@ class DistributionStrategy(object):
     """Returns an iterator split across replicas created from an input function.
 
     The `input_fn` should take an `tf.distribute.InputContext` object where
-    information about input sharding can be accessed:
+    information about batching and input sharding can be accessed:
 
     ```
     def input_fn(input_context):
-      d = tf.data.Dataset.from_tensors([[1.]]).repeat()
+      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
+      d = tf.data.Dataset.from_tensors([[1.]]).repeat().batch(batch_size)
       return d.shard(input_context.num_input_pipelines,
                      input_context.input_pipeline_id)
     with strategy.scope():
-      iterator = strategy.make_input_fn_iterator(
-          input_fn)
-      replica_results = strategy.extended.call_for_each_replica(
-          replica_fn, iterator.get_next())
+      iterator = strategy.make_input_fn_iterator(input_fn)
+      replica_results = strategy.experimental_run(replica_fn, iterator)
     ```
 
+    The `tf.data.Dataset` returned by `input_fn` should have a per-replica
+    batch size, which may be computed using
+    `input_context.get_per_replica_batch_size`.
+
     Args:
-      input_fn: A function that returns a `tf.data.Dataset`. This function is
-        expected to take an `tf.distribute.InputContext` object.
+      input_fn: A function taking a `tf.distribute.InputContext` object and
+        returning a `tf.data.Dataset`.
       replication_mode: an enum value of `tf.distribute.InputReplicationMode`.
-        Only `PER_WORKER` is supported currently.
+        Only `PER_WORKER` is supported currently, which means there will be
+        a single call to `input_fn` per worker. Replicas will dequeue from the
+        local `tf.data.Dataset` on their worker.
 
     Returns:
-      An iterator object that can be initialized and fetched next element.
+      An iterator object that should first be `.initialize()`-ed. It may then
+      either be passed to `strategy.experimental_run()` or you can
+      `iterator.get_next()` to get the next value to pass to
+      `strategy.extended.call_for_each_replica()`.
     """
     if replication_mode != InputReplicationMode.PER_WORKER:
       raise ValueError(
@@ -417,6 +400,40 @@ class DistributionStrategy(object):
     return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
         input_fn, replication_mode=replication_mode)
 
+  def experimental_make_numpy_iterator(
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas. Note that lists of Numpy arrays are stacked,
+        as that is normal `tf.data.Dataset` behavior.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the global batch
+        size. It should be divisible by `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    ds = self.extended.experimental_make_numpy_dataset(
+        numpy_input, session=session)
+    if shuffle:
+      ds = ds.shuffle(shuffle)
+    if num_epochs != 1:
+      ds = ds.repeat(num_epochs)
+    # We need to use the drop_remainder argument to get a known static
+    # input shape which is required for TPUs.
+    drop_remainder = self.extended.experimental_require_static_shapes
+    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
+    return self.make_dataset_iterator(ds)
+
   def experimental_run(self, fn, input_iterator=None):
     """Runs ops in `fn` on each replica, with inputs from `input_iterator`.
 
@@ -453,73 +470,11 @@ class DistributionStrategy(object):
         inputs = input_iterator.get_next()
         return self._extended.call_for_each_replica(fn, args=(inputs,))
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def broadcast(self, tensor, destinations=None):
-    """DEPRECATED: use extended.broadcast_to() instead."""
-    return self._extended.broadcast_to(tensor, destinations)
-
-  @doc_controls.do_not_generate_docs  # Use experimental_initialize() instead.
-  def initialize(self):
-    """DEPRECATED: Use `experimental_initialize()` instead."""
-    return self._extended._initialize()  # pylint: disable=protected-access
-
-  def experimental_initialize(self):
-    """Any initialization to be done before running any computations.
-
-    In eager mode, it executes any initialization as a side effect.
-    In graph mode, it creates the initialization ops and returns them.
-
-    For example, TPU initialize_system ops.
-
-    Returns:
-      A list of ops to execute.
-    """
-    return self._extended._initialize()  # pylint: disable=protected-access
-
-  @doc_controls.do_not_generate_docs  # Use experimental_finalize() instead.
-  def finalize(self):
-    """DEPRECATED: Use `experimental_finalize()` instead."""
-    return self._extended._finalize()  # pylint: disable=protected-access
-
-  def experimental_finalize(self):
-    """Any final actions to be done at the end of all computations.
-
-    In eager mode, it executes any finalize actions as a side effect.
-    In graph mode, it creates the finalize ops and returns them.
-
-    For example, TPU shutdown ops.
-
-    Returns:
-      A list of ops to execute.
-    """
-    return self._extended._finalize()  # pylint: disable=protected-access
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def run_steps_on_dataset(self, fn, iterator, iterations=1,
-                           initial_loop_values=None):
-    """DEPRECATED: use extended.experimental_run_steps_on_iterator() instead."""
-    return self._extended.experimental_run_steps_on_iterator(
-        fn, iterator, iterations, initial_loop_values)
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def call_for_each_replica(self, fn, *args, **kwargs):
-    """DEPRECATED: use extended.call_for_each_replica() instead."""
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to call_for_each_replica")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to call_for_each_replica")
-      kwargs = k
-    kwargs.pop("run_concurrently", None)  # Ignore old option.
-    return self._extended.call_for_each_replica(fn, args, kwargs)
+  # TODO(b/121296772,b/121300973): Add logical_device argument (default of 0).
+  def broadcast(self, tensor):
+    """Broadcasts `tensor` to all replicas, returning a per-replica value."""
+    _require_cross_replica_context_extended(self._extended)
+    return self._extended._broadcast(tensor)  # pylint: disable=protected-access
 
   def reduce(self, reduce_op, value):
     """Reduce `value` across replicas.
@@ -535,58 +490,6 @@ class DistributionStrategy(object):
     _require_cross_replica_context_extended(self._extended)
     return self._extended._reduce(reduce_op, value)  # pylint: disable=protected-access
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """DEPRECATED: use extended.batch_reduce_to() instead."""
-    return self._extended.batch_reduce_to(aggregation, value_destination_pairs)
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def update(self, var, fn, *args, **kwargs):
-    """DEPRECATED: use extended.update() instead."""
-    group = kwargs.pop("group", True)
-    # We temporarily support "grouped" in addition to "group" for backward-
-    # compatibility.
-    group = kwargs.pop("grouped", True) and group
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to update")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to update")
-      kwargs = k
-    return self._extended.update(var, fn, args, kwargs, group)
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
-    """DEPRECATED: use extended.update_non_slot() instead."""
-    group = kwargs.pop("group", True)
-    # We temporarily support "grouped" in addition to "group" for backward-
-    # compatibility.
-    group = kwargs.pop("grouped", True) and group
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to update_non_slot")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to update_non_slot")
-      kwargs = k
-    return self._extended.update_non_slot(
-        colocate_with, fn, args, kwargs, group)
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
   def unwrap(self, value):
     """Returns the list of all per-replica values contained in `value`.
@@ -601,50 +504,16 @@ class DistributionStrategy(object):
     """
     return self._extended._unwrap(value)  # pylint: disable=protected-access
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def value_container(self, value):
-    """DEPRECATED: use extended.value_container() instead."""
-    return self._extended.value_container(value)
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
   def group(self, value, name=None):
     """Shortcut for `tf.group(self.unwrap(value))`."""
     return self._extended._group(value, name)  # pylint: disable=protected-access
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def require_static_shapes(self):
-    """DEPRECATED: use extended.require_static_shapes instead."""
-    return self._extended.experimental_require_static_shapes
-
   @property
   def num_replicas_in_sync(self):
     """Returns number of replicas over which gradients are aggregated."""
     return self._extended._num_replicas_in_sync  # pylint: disable=protected-access
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def worker_devices(self):
-    """DEPRECATED: use extended.worker_devices instead."""
-    return self._extended.worker_devices
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def parameter_devices(self):
-    """DEPRECATED: use extended.parameter_devices instead."""
-    return self._extended.parameter_devices
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def non_slot_devices(self, var_list):
-    """DEPRECATED: use extended.non_slot_devices instead."""
-    return self._extended.non_slot_devices(var_list)
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def between_graph(self):
-    """DEPRECATED: use extended.experimental_between_graph instead."""
-    return self._extended.experimental_between_graph
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, being replaced by a new API.
   def configure(self,
                 session_config=None,
@@ -680,24 +549,6 @@ class DistributionStrategy(object):
     """
     return self._extended._update_config_proto(config_proto)  # pylint: disable=protected-access
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def should_init(self):
-    """DEPRECATED: use extended.should_init instead."""
-    return self._extended.experimental_should_init
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def should_checkpoint(self):
-    """DEPRECATED: use extended.should_checkpoint instead."""
-    return self._extended.should_checkpoint
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def should_save_summary(self):
-    """DEPRECATED: use extended.should_save_summary instead."""
-    return self._extended.should_save_summary
-
   def __deepcopy__(self, memo):
     # First do a regular deepcopy of `self`.
     cls = self.__class__
@@ -875,11 +726,9 @@ class DistributionStrategyExtended(object):
     a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
-  * `d.make_dataset_iterator(dataset)` (or the deprecated
-    `d.distribute_dataset(dataset).make_one_shot_iterator()`): in cross-replica
+  * `d.make_dataset_iterator(dataset)`: in cross-replica
     context, produces an iterator with locality T
-  * `d.extended.broadcast_to(t)`: in cross-replica context, produces a value
-    with locality M
+  * `d.broadcast(t)`: in cross-replica context, produces a value with locality M
   * `d.extended.broadcast_to(t, v)`: in cross-replica context, produces a value
     with locality V(`v`)
   * `d.extended.call_for_each_replica(fn, ...)`: in cross-replica context, runs
@@ -971,6 +820,7 @@ class DistributionStrategyExtended(object):
     def creator_with_resource_vars(*args, **kwargs):
       _require_strategy_scope_extended(self)
       kwargs["use_resource"] = True
+      kwargs["distribute_strategy"] = strategy
       return self._create_variable(*args, **kwargs)
 
     def distributed_getter(getter, *args, **kwargs):
@@ -995,6 +845,30 @@ class DistributionStrategyExtended(object):
     # Note: should support "colocate_with" argument.
     raise NotImplementedError("must be implemented in descendants")
 
+  def variable_created_in_scope(self, v):
+    """Tests whether `v` was created while this strategy scope was active.
+
+    Variables created inside the strategy scope are "owned" by it:
+
+    >>> with strategy.scope():
+    ...   v = tf.Variable(1.)
+    >>> strategy.variable_created_in_scope(v)
+    True
+
+    Variables created outside the strategy are not owned by it:
+
+    >>> v = tf.Variable(1.)
+    >>> strategy.variable_created_in_scope(v)
+    False
+
+    Args:
+      v: A `tf.Variable` instance.
+
+    Returns:
+      True if `v` was created inside the scope, False if not.
+    """
+    return v._distribute_strategy == self._container_strategy_weakref()  # pylint: disable=protected-access
+
   def read_var(self, v):
     """Reads the value of a variable.
 
@@ -1025,7 +899,7 @@ class DistributionStrategyExtended(object):
     ```
     with strategy.scope():
       var1 = tf.get_variable(...)
-      with strategy.extended.colocate_vars_with(v1):
+      with strategy.extended.colocate_vars_with(var1):
         # var2 and var3 will be created on the same device(s) as var1
         var2 = tf.get_variable(...)
         var3 = tf.get_variable(...)
@@ -1033,8 +907,9 @@ class DistributionStrategyExtended(object):
       def fn(v1, v2, v3):
         # operates on v1 from var1, v2 from var2, and v3 from var3
 
-      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
-      strategy.extended.update(v1, fn, args=(v2, v3))
+      # `fn` runs on every device `var1` is on, `var2` and `var3` will be there
+      # too.
+      strategy.extended.update(var1, fn, args=(var2, var3))
     ```
 
     Args:
@@ -1052,22 +927,12 @@ class DistributionStrategyExtended(object):
       return next_creator(*args, **kwargs)
 
     _require_strategy_scope_extended(self)
+    self._validate_colocate_with_variable(colocate_with_variable)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
-  def _call_dataset_fn(self, dataset_fn):
-    """Call the `dataset_fn` with `input_context` as argument."""
-    result = dataset_fn()
-    if not isinstance(result, dataset_ops.DatasetV2):
-      raise ValueError(
-          "dataset_fn() must return a tf.data.Dataset when using a "
-          "tf.distribute.Strategy.")
-    return result
-
-  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
-  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
-  # Extend to implement more functionality of datasets.
-  def _distribute_dataset(self, dataset_fn):
-    raise NotImplementedError("must be implemented in descendants")
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    """Validate `colocate_with_variable` argument to `colocate_vars_with`."""
+    pass
 
   def _make_dataset_iterator(self, dataset):
     raise NotImplementedError("must be implemented in descendants")
@@ -1075,6 +940,29 @@ class DistributionStrategyExtended(object):
   def _make_input_fn_iterator(self, input_fn, replication_mode):
     raise NotImplementedError("must be implemented in descendants")
 
+  def experimental_make_numpy_dataset(self, numpy_input, session=None):
+    """Makes a dataset for input provided via a numpy array.
+
+    This avoids adding `numpy_input` as a large constant in the graph,
+    and copies the data to the machine or machines that will be processing
+    the input.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas. Note that lists of Numpy arrays are stacked,
+        as that is normal `tf.data.Dataset` behavior.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      A `tf.data.Dataset` representing `numpy_input`.
+    """
+    _require_cross_replica_context_extended(self)
+    return self._experimental_make_numpy_dataset(numpy_input, session=session)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    raise NotImplementedError("must be implemented in descendants")
+
   def broadcast_to(self, tensor, destinations):
     """Mirror a tensor on one device to all worker devices.
 
@@ -1091,15 +979,12 @@ class DistributionStrategyExtended(object):
     assert not isinstance(destinations, (list, tuple))
     return self._broadcast_to(tensor, destinations)
 
+  def _broadcast(self, tensor):
+    return self._broadcast_to(tensor, None)  # Default implementation
+
   def _broadcast_to(self, tensor, destinations):
     raise NotImplementedError("must be implemented in descendants")
 
-  def _initialize(self):
-    return []
-
-  def _finalize(self):
-    return []
-
   def experimental_run_steps_on_iterator(self, fn, iterator, iterations=1,
                                          initial_loop_values=None):
     """Run `fn` with input from `iterator` for `iterations` times.
@@ -1206,9 +1091,6 @@ class DistributionStrategyExtended(object):
 
     Args:
       reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
-        DEPRECATED but still accepted values:
-        `tf.VariableAggregation.SUM`,
-        `tf.VariableAggregation.MEAN`,
       value: A per-replica value with one value per replica.
       destinations: A mirrored variable, a per-replica tensor, or a device
         string. The return value will be copied to all destination devices (or
@@ -1221,14 +1103,7 @@ class DistributionStrategyExtended(object):
     # TODO(josh11b): More docstring
     _require_cross_replica_context_extended(self)
     assert not isinstance(destinations, (list, tuple))
-
-    # TODO(priyag): Remove this when all callers have been updated.
-    if isinstance(reduce_op, variable_scope.VariableAggregation):
-      assert reduce_op in (
-          variable_scope.VariableAggregation.SUM,
-          variable_scope.VariableAggregation.MEAN,
-      )
-      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    assert not isinstance(reduce_op, variable_scope.VariableAggregation)
     assert (reduce_op == reduce_util.ReduceOp.SUM or
             reduce_op == reduce_util.ReduceOp.MEAN)
     return self._reduce_to(reduce_op, value, destinations)
@@ -1241,9 +1116,6 @@ class DistributionStrategyExtended(object):
 
     Args:
       reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
-        DEPRECATED but still accepted values:
-        `tf.VariableAggregation.SUM`,
-        `tf.VariableAggregation.MEAN`,
       value_destination_pairs: A sequence of (value, destinations)
         pairs. See `reduce_to()` for a description.
 
@@ -1252,14 +1124,7 @@ class DistributionStrategyExtended(object):
     """
     # TODO(josh11b): More docstring
     _require_cross_replica_context_extended(self)
-
-    # TODO(priyag): Remove this when all callers have been updated.
-    if isinstance(reduce_op, variable_scope.VariableAggregation):
-      assert reduce_op in [
-          variable_scope.VariableAggregation.SUM,
-          variable_scope.VariableAggregation.MEAN,
-      ]
-      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    assert not isinstance(reduce_op, variable_scope.VariableAggregation)
     return self._batch_reduce_to(reduce_op, value_destination_pairs)
 
   def _batch_reduce_to(self, reduce_op, value_destination_pairs):
@@ -1530,12 +1395,6 @@ class ReplicaContext(object):
     require_replica_context(self)
     return self._replica_id_in_sync_group
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, use `strategy`
-  def distribution_strategy(self):
-    """DEPRECATED: use `self.strategy` instead."""
-    return self._strategy
-
   @property
   def strategy(self):
     """The current `tf.distribute.Strategy` object."""
@@ -1547,6 +1406,50 @@ class ReplicaContext(object):
     require_replica_context(self)
     return (device_util.current(),)
 
+  def all_reduce(self, reduce_op, value):
+    """All-reduces the given `Tensor` nest across replicas.
+
+    If `all_reduce` is called in any replica, it must be called in all replicas.
+    The nested structure and `Tensor` shapes must be identical in all replicas.
+
+    IMPORTANT: The ordering of communications must be identical in all replicas.
+
+    Example with two replicas:
+      Replica 0 `value`: {'a': 1, 'b': [40,  1]}
+      Replica 1 `value`: {'a': 3, 'b': [ 2, 98]}
+
+      If `reduce_op` == `SUM`:
+        Result (on all replicas): {'a': 4, 'b': [42, 99]}
+
+      If `reduce_op` == `MEAN`:
+        Result (on all replicas): {'a': 2, 'b': [21, 49.5]}
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+      value: The nested structure of `Tensor`s to all-reduced.
+        The structure must be compatible with `tf.nest`.
+
+    Returns:
+       A `Tensor` nest with the reduced `value`s from each replica.
+    """
+    def batch_all_reduce(strategy, *value_flat):
+      return strategy.extended.batch_reduce_to(
+          reduce_op, [(v, _batch_reduce_destination(v)) for v in value_flat])
+
+    if reduce_op in [reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN]:
+      # TODO(cjfj): Work out why `batch_reduce` doesn't return the correct grad.
+      @custom_gradient.custom_gradient
+      def grad_wrapper(*xs):
+        ys = self.merge_call(batch_all_reduce, args=xs)
+        # The gradient of an all-sum is itself an all-sum (all-mean, likewise).
+        return ys, lambda *dy_s: self.all_reduce(reduce_op, dy_s)
+      return nest.pack_sequence_as(value, grad_wrapper(*nest.flatten(value)))
+    else:
+      # TODO(cjfj): Implement gradients for other reductions.
+      reduced = nest.pack_sequence_as(
+          value, self.merge_call(batch_all_reduce, args=nest.flatten(value)))
+      return nest.map_structure(array_ops.prevent_gradient, reduced)
+
   # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
   # all-reduce. It would return a function returning the result of reducing `t`
   # across all replicas. The caller would wait to call this function until they
@@ -1557,6 +1460,15 @@ class ReplicaContext(object):
   #   to that point that the first result is needed. Most likely this can be
   #   implemented in terms of `merge_call()` and `batch_reduce_to()`.
 
+
+def _batch_reduce_destination(x):
+  """Returns the destinations for batch all-reduce."""
+  if isinstance(x, ops.Tensor):  # One device strategies.
+    return x.device
+  else:
+    return x
+
+
 # ------------------------------------------------------------------------------
 
 
@@ -1588,8 +1500,8 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
     _require_strategy_scope_extended(self)
     return ops.colocate_with(colocate_with_variable)
 
-  def _distribute_dataset(self, dataset_fn):
-    return self._call_dataset_fn(dataset_fn)
+  def variable_created_in_scope(self, v):
+    return v._distribute_strategy is None  # pylint: disable=protected-access
 
   def _make_dataset_iterator(self, dataset):
     return _DefaultDistributionExtended.DefaultInputIterator(dataset)
@@ -1597,7 +1509,20 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
   def _make_input_fn_iterator(self,
                               input_fn,
                               replication_mode=InputReplicationMode.PER_WORKER):
-    return input_fn(InputContext()).make_initializable_iterator()
+    dataset = input_fn(InputContext())
+    return _DefaultDistributionExtended.DefaultInputIterator(dataset)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    numpy_flat = nest.flatten(numpy_input)
+    vars_flat = tuple(
+        variable_scope.variable(array_ops.zeros(i.shape, i.dtype),
+                                trainable=False, use_resource=True)
+        for i in numpy_flat
+    )
+    for v, i in zip(vars_flat, numpy_flat):
+      numpy_dataset.init_var_from_numpy(v, i, session)
+    vars_nested = nest.pack_sequence_as(numpy_input, vars_flat)
+    return dataset_ops.Dataset.from_tensor_slices(vars_nested)
 
   def _broadcast_to(self, tensor, destinations):
     if destinations is None:
@@ -1682,6 +1607,7 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for this strategy."""
     return True
 
 
diff --git a/tensorflow/python/distribute/distribution_strategy_context.py b/tensorflow/python/distribute/distribution_strategy_context.py
index e6648bf7c46997a729e80651073cabd82a17e401..6c1e250f9651412067d32291560b9d1135676067 100644
--- a/tensorflow/python/distribute/distribution_strategy_context.py
+++ b/tensorflow/python/distribute/distribution_strategy_context.py
@@ -201,6 +201,11 @@ def has_strategy():
   return get_strategy() is not _get_default_strategy()
 
 
+def get_strategy_and_replica_context():
+  per_thread_mode = _get_per_thread_mode()
+  return (per_thread_mode.strategy, per_thread_mode.replica_context)
+
+
 # ------------------------------------------------------------------------------
 # Defaults that are used when no tf.distribute.Strategy is explicitly created.
 # We create them lazily in a function so that we can workaround the circular
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..43915917d635cc9be63fa8d5fd0860ed99fd0f98
--- /dev/null
+++ b/tensorflow/python/distribute/input_lib.py
@@ -0,0 +1,473 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed inputs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import input_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+
+
+class InputWorkers(object):
+  """A 1-to-many mapping from input worker devices to compute devices."""
+
+  def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
+    """Initialize an `InputWorkers` object.
+
+    Args:
+      device_map: A `DeviceMap` with the computation devices fed by the
+        input workers.
+      worker_device_pairs: A sequence of pairs:
+        `(input device, a tuple of compute devices fed by that input device)`.
+      logical_device: The logical device of `device_map` to feed.
+    """
+    self._device_map = device_map
+    self._logical_device = logical_device
+    if worker_device_pairs is None:
+      worker_device_pairs = ((
+          device_util.canonicalize("/device:CPU:0"),
+          device_map.logical_to_actual_devices(logical_device)),)
+    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
+    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
+                              for _, f in worker_device_pairs)
+    flattened = tuple(d for l in self._fed_devices for d in l)
+    assert (flattened ==
+            device_map.logical_to_actual_devices(logical_device)), (
+                "flattened: %s logical device %d: %s" %
+                (flattened, logical_device,
+                 device_map.logical_to_actual_devices(logical_device)))
+
+  @property
+  def device_map(self):
+    return self._device_map
+
+  @property
+  def logical_device(self):
+    return self._logical_device
+
+  @property
+  def num_workers(self):
+    return len(self._input_worker_devices)
+
+  @property
+  def worker_devices(self):
+    return self._input_worker_devices
+
+  def compute_devices_for_worker(self, worker_index):
+    return self._fed_devices[worker_index]
+
+  def __repr__(self):
+    devices = self.worker_devices
+    debug_repr = ",\n".join("  %d %s: %s" %
+                            (i, devices[i], self._fed_devices[i])
+                            for i in range(len(devices)))
+    return "%s:{\n%s\n  device_map: %s}" % (
+        self.__class__.__name__, debug_repr, self._device_map)
+
+
+class InputIterator(object):
+  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
+
+  def get_next(self):
+    """Returns the next inputs for all replicas."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def initialize(self):
+    """Initialize the underlying input dataset, when applicable.
+
+    In eager mode, this will create a new iterator and return it.
+    In graph mode, this will initialize the same underlying iterator(s).
+
+    Users are required to call this if
+    - This iterator was returned from a call to `make_input_fn_iterator` with an
+      input function that returns a dataset.
+    - Or this iterator was returned from a call to `make_dataset_iterator`.
+
+    Returns:
+      A list of initialization ops to be executed.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class InputIteratorImpl(InputIterator):
+  """Common implementation for all input iterators."""
+
+  def __init__(self, input_workers, iterators):
+    assert isinstance(input_workers, InputWorkers)
+    if not input_workers.worker_devices:
+      raise ValueError("Should have at least one worker for input iterator.")
+
+    self._iterators = iterators
+    self._input_workers = input_workers
+
+  def get_next(self, name=None):
+    """Returns the next input from the iterator for all replicas."""
+    replicas = []
+    for i, worker in enumerate(self._input_workers.worker_devices):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        # Make `replicas` a flat list of values across all replicas.
+        replicas.extend(self._iterators[i].get_next_as_list(new_name))
+
+    return values.regroup(self._input_workers.device_map, replicas)
+
+  def initialize(self):
+    """Initialze underlying iterators.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    init_ops = []
+    for it in self._iterators:
+      init_ops.extend(it.initialize())
+    return init_ops
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_classes(self):
+    return self._iterators[0].output_classes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  def get_iterator(self, worker):
+    for i, w in enumerate(self._input_workers.worker_devices):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+
+class InputFunctionIterator(InputIteratorImpl):
+  """Iterator created from input function."""
+
+  def __init__(self, input_fn, input_workers, input_contexts):
+    """Make an iterator for input provided via an input function.
+
+    Currently implements PER_WORKER mode, in which the `input_fn` is called
+    once on each worker.
+
+    TODO(priyag): Add other replication modes.
+    TODO(priyag): Allow taking input function that returns a callable that
+    returns nest of tensors.
+
+    Args:
+      input_fn: Input function that returns a `tf.data.Dataset` object.
+      input_workers: an `InputWorkers` object.
+      input_contexts: A list of `InputContext` instances to be passed to call(s)
+        to `input_fn`. Length and order should match worker order in
+        `worker_device_pairs`.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if input_workers.num_workers != len(input_contexts):
+      raise ValueError(
+          "Number of input workers (%d) is not same as number of "
+          "input_contexts (%d)" %
+          (input_workers.num_workers, len(input_contexts)))
+
+    iterators = []
+    for i, ctx in enumerate(input_contexts):
+      worker = input_workers.worker_devices[i]
+      with ops.device(worker):
+        result = input_fn(ctx)
+        if not isinstance(result, dataset_ops.DatasetV2):
+          raise ValueError("input_fn must return a tf.data.Dataset.")
+        devices = input_workers.compute_devices_for_worker(i)
+        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        iterators.append(iterator)
+
+    super(InputFunctionIterator, self).__init__(input_workers, iterators)
+
+
+class DatasetIterator(InputIteratorImpl):
+  """Iterator created from input dataset."""
+
+  def __init__(self, dataset, input_workers, split_batch_by=None):
+    """Make an iterator for the dataset on given devices.
+
+    If `split_batch_by` is not None, we "split" each batch of the
+    dataset by `split_batch_by` value. To achieve this, we first unbatch the
+    input dataset and then rebatch it with the per replica batch size that is
+    calculated using `global_batch_size // split_batch_by`.
+    The currently supported datasets are as follows:
+    `dataset.batch()` is the last operation on the dataset OR
+    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
+    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
+    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
+
+    TODO(priyag): Support multi worker / host cases properly by cloning
+    and sharding the dataset on each worker. Current setup will only work in
+    some cases, such as in-graph multi worker GPU case. If the input pipeline
+    has random shuffling (with a different seed on each worker), each worker
+    will see random input from the same overall dataset in each step. Otherwise,
+    each worker will see the same input in each step.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be used as the input source.
+      input_workers: an `InputWorkers` object.
+      split_batch_by: Optional integer. If present, we "split" each batch of the
+        dataset by `split_batch_by` value.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if split_batch_by:
+      dataset = _split_dataset_batch(dataset, split_batch_by)
+
+    iterators = []
+    for i, worker in enumerate(input_workers.worker_devices):
+      with ops.device(worker):
+        worker_devices = input_workers.compute_devices_for_worker(i)
+        cloned_dataset = dataset
+        if not context.executing_eagerly():
+          cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
+        iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
+                                                worker_devices)
+        iterators.append(iterator)
+
+    super(DatasetIterator, self).__init__(input_workers, iterators)
+
+
+class _SingleWorkerDatasetIterator(object):
+  """Iterator for a single `tf.data.Dataset`."""
+
+  def __init__(self, dataset, worker, devices):
+    """Create iterator for the `dataset` to fetch data to worker's `devices` .
+
+    `MultiDeviceIterator` is used to prefetch input to the devices on the
+    given worker.
+
+    Args:
+      dataset: A `tf.data.Dataset` instance.
+      worker: Worker on which ops should be created.
+      devices: Distribute data from `dataset` to these devices.
+    """
+    self._dataset = dataset
+    self._worker = worker
+    self._devices = devices
+    self._make_iterator()
+
+  def _make_iterator(self):
+    """Make appropriate iterator on the dataset."""
+    with ops.device(self._worker):
+      self._iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          self._dataset, self._devices)
+
+  def get_next_as_list(self, name=None):
+    """Get next element from the underlying iterator."""
+    del name
+    with ops.device(self._worker):
+      data_list = self._iterator.get_next()
+      return data_list
+
+  def initialize(self):
+    """Initialze underlying iterator.
+
+    In eager execution, this simply recreates the underlying iterator.
+    In graph execution, it returns the initializer ops for the underlying
+    iterator.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    if context.executing_eagerly():
+      self._make_iterator()
+      return []
+    else:
+      return [self._iterator.initializer]
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+def _split_dataset_batch(dataset, split_batch_by):
+  """Divide a batch-ed dataset's batches into smaller batches."""
+  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+  # pylint: disable=protected-access
+  def _get_batch_dataset(d):
+    """Get the underlying batch dataset from the dataset object."""
+    if isinstance(d, dataset_ops.DatasetV1Adapter):
+      d = d._dataset
+
+    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+      return d
+    elif isinstance(d, dataset_ops.PrefetchDataset):
+      return _get_batch_dataset(d._input_dataset)
+    raise ValueError(
+        "Unable to get batched dataset from the input dataset. `batch` "
+        "`map_and_batch` need to be the last operations on the dataset. "
+        "The batch operations can be followed by a prefetch.")
+
+  batched_dataset = _get_batch_dataset(dataset)
+  if isinstance(batched_dataset, dataset_ops.BatchDataset):
+    batch_size = batched_dataset._batch_size
+    drop_remainder = batched_dataset._drop_remainder
+  elif isinstance(batched_dataset, batching._MapAndBatchDataset):
+    batch_size = batched_dataset._batch_size_t
+    drop_remainder = batched_dataset._drop_remainder_t
+
+  prefetch_buffer = None
+  if isinstance(dataset, dataset_ops.PrefetchDataset):
+    prefetch_buffer = dataset._buffer_size
+  elif (isinstance(dataset, dataset_ops.DatasetV1Adapter)
+        and isinstance(dataset._dataset, dataset_ops.PrefetchDataset)):
+    prefetch_buffer = dataset._dataset._buffer_size
+  # pylint: enable=protected-access
+
+  if tensor_util.is_tensor(batch_size):
+    batch_size = tensor_util.constant_value(batch_size)
+
+  if tensor_util.is_tensor(drop_remainder):
+    drop_remainder = tensor_util.constant_value(drop_remainder)
+
+  if batch_size % split_batch_by:
+    raise ValueError(
+        "Batch size %s cannot be sharded evenly across replicas %s" % (
+            batch_size, split_batch_by))
+  new_batch_size = batch_size // split_batch_by
+
+  dataset = dataset.apply(batching.unbatch())
+  dataset = dataset.batch(new_batch_size, drop_remainder=drop_remainder)
+  if prefetch_buffer is not None:
+    dataset = dataset.prefetch(prefetch_buffer)
+  return dataset
+
+
+class MultiStepContext(object):
+  """A context object that can be used to capture things when running steps.
+
+  This context object is useful when running multiple steps at a time using the
+  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
+  function to specify which outputs to emit at what frequency. Currently it
+  supports capturing output from the last step, as well as capturing non tensor
+  outputs.  In the future it will be augmented to support other use cases such
+  as output each N steps.
+  """
+
+  def __init__(self):
+    """Initialize an output context.
+
+    Returns:
+      A context object.
+    """
+    self._last_step_outputs = {}
+    self._last_step_outputs_reduce_ops = {}
+    self._non_tensor_outputs = {}
+
+  @property
+  def last_step_outputs(self):
+    """A dictionary consisting of outputs to be captured on last step.
+
+    Keys in the dictionary are names of tensors to be captured, as specified
+    when `set_last_step_output` is called.
+    Values in the dictionary are the tensors themselves. If
+    `set_last_step_output` was called with a `reduce_op` for this output,
+    then the value is the reduced value.
+
+    Returns:
+      A dictionary with last step outputs.
+    """
+    return self._last_step_outputs
+
+  def _set_last_step_outputs(self, outputs):
+    """Replace the entire dictionary of last step outputs."""
+    if not isinstance(outputs, dict):
+      raise ValueError("Need a dictionary to set last_step_outputs.")
+    self._last_step_outputs = outputs
+
+  def set_last_step_output(self, name, output, reduce_op=None):
+    """Set `output` with `name` to be outputted from the last step.
+
+    Args:
+      name: String, name to identify the output. Doesn't need to match tensor
+        name.
+      output: The tensors that should be outputted with `name`. See below for
+        actual types supported.
+      reduce_op: Reduction method to use to reduce outputs from multiple
+        replicas. Required if `set_last_step_output` is called in a replica
+        context. Optional in cross_replica_context.
+        When present, the outputs from all the replicas are reduced using the
+        current distribution strategy's `reduce` method. Hence, the type of
+        `output` must be what's supported by the corresponding `reduce` method.
+        For e.g. if using MirroredStrategy and reduction is set, output
+        must be a `PerReplica` value.
+        The reduce method is also recorded in a dictionary
+        `_last_step_outputs_reduce_ops` for later interpreting of the
+        outputs as already reduced or not.
+    """
+    if distribution_strategy_context.in_cross_replica_context():
+      self._last_step_outputs_reduce_ops[name] = reduce_op
+      if reduce_op is None:
+        self._last_step_outputs[name] = output
+      else:
+        distribution = distribution_strategy_context.get_strategy()
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
+    else:
+      assert reduce_op is not None
+      def merge_fn(distribution, value):
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
+        # Setting this inside the `merge_fn` because all replicas share the same
+        # context object, so it's more robust to set it only once (even if all
+        # the replicas are trying to set the same value).
+        self._last_step_outputs_reduce_ops[name] = reduce_op
+
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
+
+  @property
+  def non_tensor_outputs(self):
+    """A dictionary consisting of any non tensor outputs to be captured."""
+    return self._non_tensor_outputs
+
+  def set_non_tensor_output(self, name, output):
+    """Set `output` with `name` to be captured as a non tensor output."""
+    if distribution_strategy_context.in_cross_replica_context():
+      self._non_tensor_outputs[name] = output
+    else:
+      def merge_fn(distribution, value):
+        # NOTE(priyag): For non tensor outputs, we simply return all the values
+        # in a list as reduction doesn't make sense on non tensors.
+        self._non_tensor_outputs[name] = distribution.unwrap(value)
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index 4f29d916c40dd116d1e8b06f26c8e6088594d613..df96f83a58a5ce5a34fb3d7ecffd5839ca79bda9 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -20,14 +20,15 @@ from __future__ import print_function
 
 import contextlib
 import copy
-import functools
 import threading
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.distribute import values
@@ -193,8 +194,8 @@ def _call_for_each_replica(distribution, device_map, fn, args, kwargs):
   return values.regroup(device_map, tuple(t.main_result for t in threads))
 
 
-def _create_mirrored_variable(device_map, logical_device, real_mirrored_creator,
-                              *args, **kwargs):  # pylint: disable=g-missing-docstring
+def _create_mirrored_variable(strategy, device_map, logical_device,  # pylint: disable=missing-docstring
+                              real_mirrored_creator, *args, **kwargs):
   # Figure out what collections this variable should be added to.
   # We'll add the MirroredVariable to those collections instead.
   collections = kwargs.pop("collections", None)
@@ -245,11 +246,13 @@ def _create_mirrored_variable(device_map, logical_device, real_mirrored_creator,
     value_list = real_mirrored_creator(devices, *args, **kwargs)
 
     if is_replica_local:
-      result = values.ReplicaLocalVariable(device_map, value_list, aggregation,
-                                           logical_device=logical_device)
+      result = values.ReplicaLocalVariable(
+          strategy, device_map, value_list, aggregation,
+          logical_device=logical_device)
     else:
-      result = values.MirroredVariable(device_map, value_list, aggregation,
-                                       logical_device=logical_device)
+      result = values.MirroredVariable(
+          strategy, device_map, value_list, aggregation,
+          logical_device=logical_device)
 
   # Add the wrapped variable to the requested collections.
   # The handling of eager mode and the global step matches
@@ -410,7 +413,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   This strategy uses one replica per device and sync replication for its
   multi-GPU version.
 
-  The multi-worker version will be added in the fture.
+  The multi-worker version will be added in the future.
 
   Args:
     devices: a list of device strings.
@@ -454,9 +457,10 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
         "No duplicates allowed in `devices` argument: %s" % devices)
     # TODO(josh11b): Require at least 2 devices?
     self._device_map = values.ReplicaDeviceMap(devices)
-    self._input_workers = values.InputWorkers(self._device_map)
+    self._input_workers = input_lib.InputWorkers(self._device_map)
     self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
         devices)
+    self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")
 
   def _initialize_multi_worker(self, devices):
     """Initializes the object for multi-worker training."""
@@ -485,9 +489,11 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     # their ops will end up on the cpu device of its first worker, e.g.
     # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
     self._default_device = workers[0]
+    self._host_input_device = numpy_dataset.SingleDevice(workers[0])
 
     self._device_map = values.ReplicaDeviceMap(devices)
-    self._input_workers = values.InputWorkers(self._device_map, worker_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, worker_devices)
     self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
         workers, _infer_num_gpus_per_worker(devices))
 
@@ -497,6 +503,9 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     if colocate_with is None:
       device_map = self._device_map
       logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
     else:
       device_map = colocate_with.device_map
       logical_device = colocate_with.logical_device
@@ -531,22 +540,15 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
           value_list.append(v)
       return value_list
 
-    return _create_mirrored_variable(device_map, logical_device,
-                                     _real_mirrored_creator, *args, **kwargs)
+    return _create_mirrored_variable(
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
 
-  def _distribute_dataset(self, dataset_fn):
-    if self._local_mode:
-      worker_index = 0
-      return values.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._input_workers, worker_index)
-    else:
-      return values.MultiWorkerDataset(
-          functools.partial(self._call_dataset_fn, dataset_fn),
-          self._input_workers,
-          auto_shard=False)
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate_distributed_variable(colocate_with_variable, self)
 
   def _make_dataset_iterator(self, dataset):
-    return values.DatasetIterator(
+    return input_lib.DatasetIterator(
         dataset, self._input_workers, self._num_replicas_in_sync)
 
   def _make_input_fn_iterator(
@@ -560,9 +562,13 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
           num_input_pipelines=num_workers,
           input_pipeline_id=i,
           num_replicas_in_sync=self._num_replicas_in_sync))
-    return values.InputFunctionIterator(
+    return input_lib.InputFunctionIterator(
         input_fn, self._input_workers, input_contexts)
 
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._host_input_device, session)
+
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
                                           initial_loop_values=None):
@@ -570,7 +576,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
 
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
@@ -764,6 +770,13 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
     return True
 
 
@@ -854,13 +867,13 @@ class _MirroredReplicaThread(threading.Thread):
 
 
 class MirroredReplicaContext(distribute_lib.ReplicaContext):
-  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
+  """ReplicaContext used in MirroredStrategy.extended.call_for_each_replica().
 
   Opened in `_MirroredReplicaThread`, to allow the user to invoke
   `MirroredStrategy`'s specific implementation of `merge_call()`,
   which works by delegating the function and its arguments to
   the main thread (the one that invoked
-  `MirroredStrategy.call_for_each_replica()`).
+  `MirroredStrategy.extended.call_for_each_replica()`).
   """
 
   def _merge_call(self, fn, args, kwargs):
diff --git a/tensorflow/python/distribute/numpy_dataset.py b/tensorflow/python/distribute/numpy_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5881e4cd59e75ac5184e400bd0ac90443084635e
--- /dev/null
+++ b/tensorflow/python/distribute/numpy_dataset.py
@@ -0,0 +1,97 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code for creating a dataset out of a NumPy array."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def init_var_from_numpy(input_var, numpy_input, session):
+  """Initialize `input_var` to `numpy_input` using `session` in graph mode."""
+  with ops.init_scope():
+    if context.executing_eagerly():
+      input_var.assign(numpy_input)
+      return
+
+    assert session is not None
+    session.run(input_var.initializer)
+
+    start_placeholder = array_ops.placeholder(dtypes.int64, ())
+    end_placeholder = array_ops.placeholder(dtypes.int64, ())
+    slice_placeholder = array_ops.placeholder(input_var.dtype)
+    assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
+        slice_placeholder)
+
+    # If each batch element is > 64 MB, then we copy each batch element
+    # individually. Otherwise, the slices will be < 128 MB. There might be
+    # padding which might mean that the slices are 128 MB even if the size of
+    # the tensor allocated is less than 128 MB.  This formula gives slices with
+    # size: ceil(64 MB / byte size per batch element) bytes.  Using ceil()
+    # guarantees we get a number >= 1.
+
+    # Calculate the size of each batch element.
+    byte_size_per_batch_element = (
+        np.prod(numpy_input.shape[1:]) * input_var.dtype.size)
+
+    # Calculate number of elements we want to copy per slice.
+    batch_size_per_slice = int(
+        np.ceil((64 << 20) / byte_size_per_batch_element))
+
+    # Copy slices of the above size starting at 0, except the last slice will be
+    # smaller.
+    start = 0
+    limit = numpy_input.shape[0]
+    while start < limit:
+      end = min(start + batch_size_per_slice, limit)
+      session.run(assign_slice_op, feed_dict={
+          start_placeholder: start,
+          end_placeholder: end,
+          slice_placeholder: numpy_input[start:end]})
+      start = end
+
+
+def one_host_numpy_dataset(numpy_input, colocate_with, session):
+  """Create a dataset on `colocate_with` from `numpy_input`."""
+  def create_colocated_variable(next_creator, *args, **kwargs):
+    kwargs["colocate_with"] = colocate_with
+    return next_creator(*args, **kwargs)
+
+  numpy_flat = nest.flatten(numpy_input)
+  with variable_scope.variable_creator_scope(create_colocated_variable):
+    vars_flat = tuple(variable_scope.variable(array_ops.zeros(i.shape, i.dtype),
+                                              trainable=False)
+                      for i in numpy_flat)
+  for v, i in zip(vars_flat, numpy_flat):
+    init_var_from_numpy(v, i, session)
+  vars_nested = nest.pack_sequence_as(numpy_input, vars_flat)
+  return dataset_ops.Dataset.from_tensor_slices(vars_nested)
+
+
+class SingleDevice(object):
+  """Used with `colocate_with` to create a non-mirrored variable."""
+
+  def __init__(self, device):
+    self.device = device
diff --git a/tensorflow/python/distribute/numpy_dataset_test.py b/tensorflow/python/distribute/numpy_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..04eae1daa2ee83040f4d9acb3a79baa6be16f402
--- /dev/null
+++ b/tensorflow/python/distribute/numpy_dataset_test.py
@@ -0,0 +1,44 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numpy_dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+
+
+class InitVarFromNumpyTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_creating_var_with_numpy_arrays(self):
+    with self.cached_session() as session:
+      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      initial = np.zeros_like(x)
+      var_x = variable_scope.variable(initial)
+      numpy_dataset.init_var_from_numpy(var_x, x, session)
+      val = self.evaluate(var_x.value())
+      # Verify that the numpy value is copied to the variable.
+      self.assertAllEqual(x, val)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..269c7417d16cc903d32a65f28b90fbaa2773209e
--- /dev/null
+++ b/tensorflow/python/distribute/parameter_server_strategy.py
@@ -0,0 +1,538 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes implementing a multi-worker ps DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import device_setter
+from tensorflow.python.util import nest
+
+_LOCAL_CPU = "/device:CPU:0"
+_LOCAL_GPU_0 = "/device:GPU:0"
+
+
+# TODO(yuefengz): maybe cache variables on local CPU.
+class ParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """A parameter server DistributionStrategy.
+
+  This strategy class works for both local training and between-graph replicated
+  training for multiple workers. It uses `TFConfigClusterResolver` to detect
+  configurations for multi-worker training. In multi-worker training mode, i.e.
+  `TFConfigClusterResolver` has detected 'TF_CONFIG' environment variable and
+  'TF_CONFIG' has a cluster spec, variables and updates to those variables are
+  assigned to parameter servers and other operations are assigned to workers.
+  In local training mode, variables are assigned to local CPU or the only GPU.
+  When each worker has more than one GPU, operations will be replicated on these
+  GPUs. In both cases, operations are replicated but variables are not and these
+  workers share a common view for which paramater server a variable is assigned
+  to.
+
+  This class assumes between-graph replication will be used and works on a graph
+  for a particular worker. Note that each graph and worker is independent.
+  This means that while each worker will synchronously compute a single gradient
+  update across all GPUs, updates between workers proceed asynchronously.
+  Operations that occur only on the first replica (such as incrementing the
+  global step), will occur on the first replica *of every worker*.
+
+  It is expected to call `call_for_each_replica(fn, ...)` for any
+  operations which potentially can be replicated across replicas (i.e. multiple
+  GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
+  caution needs to be taken:
+
+  1) It is generally not recommended to open a device scope under the strategy's
+  scope. A device scope (i.e. calling `tf.device`) will be merged with or
+  override the device for operations but will not change the device for
+  variables.
+
+  2) It is also not recommended to open a colocation scope (i.e. calling
+  `tf.colocate_with`) under the strategy's scope. For colocating variables, use
+  `strategy.extended.colocate_vars_with` instead. Colocation of ops will
+  possibly create conflicts of device assignment.
+  """
+
+  def __init__(self):
+    """Initializes this strategy with default TFConfigClusterResolver."""
+    super(ParameterServerStrategy, self).__init__(
+        ParameterServerStrategyExtended(self))
+
+
+class ParameterServerStrategyExtended(
+    distribute_lib.DistributionStrategyExtended):
+  """Implementation of ParameterServerStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               cluster_resolver=TFConfigClusterResolver()):
+    super(ParameterServerStrategyExtended, self).__init__(container_strategy)
+    self._initialize_strategy(cluster_resolver)
+
+    # We typically don't need to do all-reduce in this strategy.
+    self._cross_device_ops = (
+        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+            reduce_to_device=_LOCAL_CPU))
+
+  def _initialize_strategy(self, cluster_resolver):
+    if cluster_resolver.cluster_spec().as_dict():
+      self._initialize_multi_worker(cluster_resolver)
+    else:
+      self._initialize_local(cluster_resolver)
+    # Save the num_gpus_per_worker for configure method.
+    self._num_gpus_per_worker = cluster_resolver.num_accelerators()
+
+  def _initialize_multi_worker(self, cluster_resolver):
+    """Initialize devices for multiple workers.
+
+    It creates variable devices and compute devices. Variables and operations
+    will be assigned to them respectively. We have one compute device per
+    replica. The variable device is a device function or device string. The
+    default variable device assigns variables to parameter servers in a
+    round-robin fashion.
+
+    Args:
+      cluster_resolver: a descendant of `ClusterResolver` object.
+
+    Raises:
+      ValueError: if the cluster doesn't have ps jobs.
+    """
+    num_gpus = cluster_resolver.num_accelerators()
+    cluster_spec = cluster_resolver.cluster_spec()
+    task_type = cluster_resolver.task_type
+    task_id = cluster_resolver.task_id
+    if not task_type or task_id is None:
+      raise ValueError("When `cluster_spec` is given, you must also specify "
+                       "`task_type` and `task_id`")
+    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+    assert cluster_spec.as_dict()
+
+    worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
+
+    # Define compute devices which is a list of device strings and one for each
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
+    if num_gpus > 0:
+      compute_devices = tuple(
+          "%s/device:GPU:%d" % (worker_device, i) for i in range(num_gpus))
+    else:
+      compute_devices = (worker_device,)
+
+    self._device_map = values.ReplicaDeviceMap(compute_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(worker_device, compute_devices)])
+
+    # In distributed mode, place variables on ps jobs in a round-robin fashion.
+    # Note that devices returned from `replica_device_setter` are not
+    # canonical and therefore we don't canonicalize all variable devices to
+    # make them consistent.
+    # TODO(yuefengz): support passing a strategy object to control variable
+    # assignment.
+    # TODO(yuefengz): merge the logic of replica_device_setter into this
+    # class.
+    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
+    if num_ps_replicas == 0:
+      raise ValueError("The cluster spec needs to have `ps` jobs.")
+    self._variable_device = device_setter.replica_device_setter(
+        ps_tasks=num_ps_replicas,
+        worker_device=worker_device,
+        merge_devices=True,
+        cluster=cluster_spec)
+
+    # The `_parameter_devices` is needed for the `parameter_devices` property
+    # and is a list of all variable devices. Here parameter devices are all
+    # tasks of the "ps" job.
+    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
+                                        range(num_ps_replicas)))
+
+    # Add a default device so that ops without specified devices will not end up
+    # on other workers.
+    self._default_device = worker_device
+
+    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
+                                                task_id)
+    self._cluster_spec = cluster_spec
+    self._task_type = task_type
+    self._task_id = task_id
+
+    logging.info(
+        "Multi-worker ParameterServerStrategy with "
+        "cluster_spec = %r, task_type = %r, task_id = %r, "
+        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
+        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
+        num_ps_replicas, self._is_chief, self._device_map,
+        self._variable_device)
+
+  def _initialize_local(self, cluster_resolver):
+    """Initialize internal devices for local training."""
+    worker_device = device_util.canonicalize("/device:CPU:0")
+    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
+    num_gpus = cluster_resolver.num_accelerators()
+    # Define compute devices which is a list of device strings and one for each
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
+    if num_gpus > 0:
+      compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
+    else:
+      compute_devices = (_LOCAL_CPU,)
+
+    self._device_map = values.ReplicaDeviceMap(compute_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(worker_device, compute_devices)])
+
+    # If there is only one GPU, put everything on that GPU. Otherwise, place
+    # variables on CPU.
+    if num_gpus == 1:
+      assert len(compute_devices) == 1
+      self._variable_device = _LOCAL_GPU_0
+      self._parameter_devices = (_LOCAL_GPU_0,)
+    else:
+      self._variable_device = _LOCAL_CPU
+      self._parameter_devices = (_LOCAL_CPU,)
+
+    self._is_chief = True
+    self._cluster_spec = None
+    self._task_type = None
+    self._task_id = None
+
+    logging.info(
+        "ParameterServerStrategy with compute_devices = %r, "
+        "variable_device = %r", compute_devices, self._variable_device)
+
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate(colocate_with_variable, self)
+
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+      num_input_pipelines = multi_worker_util.worker_count(
+          self._cluster_spec, self._task_type)
+    else:
+      input_pipeline_id = 0
+      num_input_pipelines = 1
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=num_input_pipelines,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+    return input_lib.InputFunctionIterator(input_fn, self._input_workers,
+                                           [input_context])
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._input_host_device, session)
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    if not cross_device_ops_lib.check_destinations(destinations):
+      # TODO(josh11b): Use current logical device instead of 0 here.
+      destinations = values.LogicalDeviceSpec(
+          device_map=self._device_map, logical_device=0)
+    return self._cross_device_ops.broadcast(tensor, destinations)
+
+  def _allow_variable_partition(self):
+    return not context.executing_eagerly()
+
+  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
+  # this creator, such as "MutableHashTable".
+  def _create_variable(self, next_creator, *args, **kwargs):
+    if self._num_replicas_in_sync > 1:
+      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
+      if aggregation not in (
+          vs.VariableAggregation.NONE,
+          vs.VariableAggregation.SUM,
+          vs.VariableAggregation.MEAN,
+          vs.VariableAggregation.ONLY_FIRST_REPLICA
+      ):
+        raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                         " for variable: " + kwargs["name"])
+
+      def var_creator(*args, **kwargs):
+        """Create an AggregatingVariable and fix up collections."""
+        # Record what collections this variable should be added to.
+        collections = kwargs.pop("collections", None)
+        if collections is None:
+          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+        kwargs["collections"] = []
+
+        # Create and wrap the variable.
+        v = next_creator(*args, **kwargs)
+        wrapped = values.AggregatingVariable(
+            self._container_strategy(), v, aggregation)
+
+        # Add the wrapped variable to the requested collections.
+        # The handling of eager mode and the global step matches
+        # ResourceVariable._init_from_args().
+        if not context.executing_eagerly():
+          g = ops.get_default_graph()
+          # If "trainable" is True, next_creator() will add the contained
+          # variable to the TRAINABLE_VARIABLES collection, so we manually
+          # remove it and replace with the wrapper. We can't set "trainable"
+          # to False for next_creator() since that causes functions like
+          # implicit_gradients to skip those variables.
+          if kwargs.get("trainable", True):
+            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l.remove(v)
+          g.add_to_collections(collections, wrapped)
+        elif ops.GraphKeys.GLOBAL_STEP in collections:
+          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
+
+        return wrapped
+    else:
+      var_creator = next_creator
+
+    if "colocate_with" in kwargs:
+      colocate_with = kwargs["colocate_with"]
+      if isinstance(colocate_with, numpy_dataset.SingleDevice):
+        with ops.device(colocate_with.device):
+          return var_creator(*args, **kwargs)
+      with ops.device(None):
+        with ops.colocate_with(colocate_with):
+          return var_creator(*args, **kwargs)
+
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(self._variable_device):
+        return var_creator(*args, **kwargs)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    # pylint: disable=protected-access
+    return mirrored_strategy._call_for_each_replica(
+        self._container_strategy(), self._device_map, fn, args, kwargs)
+
+  def _verify_destinations_not_different_worker(self, destinations):
+    if not self._cluster_spec:
+      return
+    if destinations is None:
+      return
+    for d in cross_device_ops_lib.get_devices_from(destinations):
+      d_spec = tf_device.DeviceSpec.from_string(d)
+      if d_spec.job == self._task_type and d_spec.task != self._task_id:
+        raise ValueError(
+            "Cannot reduce to another worker: %r, current worker is %r" %
+            (d, self._input_workers.worker_devices[0]))
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    self._verify_destinations_not_different_worker(destinations)
+    if not isinstance(value, values.DistributedValues):
+      # pylint: disable=protected-access
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+    return self._cross_device_ops.reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    for _, destinations in value_destination_pairs:
+      self._verify_destinations_not_different_worker(destinations)
+    return self._cross_device_ops.batch_reduce(reduce_op,
+                                               value_destination_pairs)
+
+  def _select_single_value(self, structured):
+    """Select any single values in `structured`."""
+
+    def _select_fn(x):  # pylint: disable=g-missing-docstring
+      if isinstance(x, values.Mirrored):
+        if len(x.devices) == 1:
+          return x.primary
+        else:
+          raise ValueError(
+              "You cannot update variable with a Mirrored object with multiple "
+              "components %r when using ParameterServerStrategy. You must "
+              "specify a single value or a Mirrored with a single value." % x)
+      elif isinstance(x, values.PerReplica):
+        raise ValueError(
+            "You cannot update variable with a PerReplica object %r when using "
+            "ParameterServerStrategy. You must specify a single value or a "
+            "Mirrored with a single value" % x)
+      else:
+        return x
+
+    return nest.map_structure(_select_fn, structured)
+
+  def _update(self, var, fn, args, kwargs, group):
+    if isinstance(var, values.AggregatingVariable):
+      var = var.get()
+    if not isinstance(var, resource_variable_ops.ResourceVariable):
+      raise ValueError(
+          "You can not update `var` %r. It must be a Variable." % var)
+    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
+      result = fn(var, *self._select_single_value(args),
+                  **self._select_single_value(kwargs))
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  # TODO(yuefengz): does it need to call _select_single_value?
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    with ops.device(
+        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      return val.values
+    return (val,)
+
+  def value_container(self, val):
+    if (hasattr(val, "_aggregating_container") and
+        not isinstance(val, values.AggregatingVariable)):
+      wrapper = val._aggregating_container()  # pylint: disable=protected-access
+      if wrapper is not None:
+        return wrapper
+    return val
+
+  def read_var(self, var):
+    # No need to distinguish between normal variables and replica-local
+    # variables.
+    return array_ops.identity(var)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the strategy class.
+
+    The strategy object will be re-initialized if `cluster_spec` is given but
+    was not passed in the constructor.
+
+    Args:
+      session_config: not used currently.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
+        not.
+    """
+    if cluster_spec:
+      # Use the num_gpus_per_worker recorded in constructor since _configure
+      # doesn't take num_gpus.
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators=self._num_gpus_per_worker)
+      self._initialize_multi_worker(cluster_resolver)
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    if not self._cluster_spec:
+      updated_config.isolate_session_state = True
+      return updated_config
+
+    updated_config.isolate_session_state = False
+
+    assert self._task_type
+    assert self._task_id is not None
+
+    # The device filters prevent communication between workers.
+    if self._task_type not in ["chief", "worker"]:
+      return updated_config
+    del updated_config.device_filters[:]
+    updated_config.device_filters.extend(
+        ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
+    return updated_config
+
+  @property
+  def _num_replicas_in_sync(self):
+    return self._device_map.num_replicas_in_graph
+
+  @property
+  def worker_devices(self):
+    return self._device_map.all_devices
+
+  @property
+  def worker_devices_by_replica(self):
+    return self._device_map.devices_by_replica
+
+  @property
+  def parameter_devices(self):
+    return self._parameter_devices
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  @property
+  def experimental_between_graph(self):
+    # TODO(yuefengz): Should this return False in the local case?
+    return True
+
+  @property
+  def experimental_should_init(self):
+    return self._is_chief
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index e0c575b01cff3a31709391941c79ebb7d28871b5..585ae1bd6c627fc270a9617c37f03fd97d9040e5 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -20,21 +20,15 @@ from __future__ import print_function
 
 import collections
 import contextlib
-import operator
 import weakref
 import six
 
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import multi_device_iterator_ops
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context
-from tensorflow.python.distribute import input_ops
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
-from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -331,7 +325,10 @@ class DistributedDelegate(DistributedValues):
   def __rmul__(self, o): return o * self.get()
   def __truediv__(self, o): return self.get() / o
   def __rtruediv__(self, o): return o / self.get()
-  def __floordiv__(self, o): return self.get() // o
+
+  def __floordiv__(self, o):
+    return self.get() // o
+
   def __rfloordiv__(self, o): return o // self.get()
   def __mod__(self, o): return self.get() % o
   def __rmod__(self, o): return o % self.get()
@@ -413,6 +410,18 @@ def _assign_on_device(device, variable, tensor):
     return variable.assign(array_ops.identity(tensor))
 
 
+def _assert_strategy(strategy):
+  if not distribution_strategy_context.has_strategy():
+    raise RuntimeError(
+        'Need to be inside "with strategy.scope()" for %s' %
+        (strategy,))
+  current_strategy = distribution_strategy_context.get_strategy()
+  if current_strategy is not strategy:
+    raise RuntimeError(
+        "Mixing different tf.distribute.Strategy objects: %s is not %s" %
+        (current_strategy, strategy))
+
+
 DistributedVarOp = collections.namedtuple(
     "DistributedVarOp", ["name", "graph", "type"])
 
@@ -422,7 +431,8 @@ class DistributedVariable(DistributedDelegate):
   # TODO(josh11b): Support changing the set of variables if e.g. if new
   # devices are joining or a device is to leave.
 
-  def __init__(self, device_map, values, logical_device=None):
+  def __init__(self, strategy, device_map, values, logical_device=None):
+    self._distribute_strategy = strategy
     super(DistributedVariable, self).__init__(
         device_map, values, logical_device=logical_device)
     self._common_name = self.primary.name.split(":")[0]
@@ -507,6 +517,11 @@ class DistributedVariable(DistributedDelegate):
   def _unique_id(self):
     return self.primary._unique_id   # pylint: disable=protected-access
 
+  @property
+  def _graph_key(self):
+    """Lets Optimizers know which graph this variable is from."""
+    return self.primary._graph_key  # pylint: disable=protected-access
+
   @property
   def name(self):
     return self.primary.name
@@ -519,6 +534,10 @@ class DistributedVariable(DistributedDelegate):
   def shape(self):
     return self.primary.shape
 
+  @property
+  def distribute_strategy(self):
+    return self._distribute_strategy
+
   def get_shape(self):
     return self.primary.get_shape()
 
@@ -530,7 +549,7 @@ class DistributedVariable(DistributedDelegate):
     # We want cross-replica code that does some var.op.X calls
     # to work (even if the current device isn't in self.devices), but
     # other uses of var.op in a cross-replica context to fail.
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       return DistributedVarOp(self.primary.op.name,
                               self.primary.op.graph,
                               self.primary.op.type)
@@ -541,8 +560,10 @@ class DistributedVariable(DistributedDelegate):
     return self.primary._in_graph_mode   # pylint: disable=protected-access
 
   def read_value(self):
-    strategy = distribution_strategy_context.get_distribution_strategy()
-    return strategy.extended.read_var(self)
+    return self._distribute_strategy.extended.read_var(self)
+
+  def value(self):
+    return self._get_closest().value()
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -552,10 +573,43 @@ class DistributedVariable(DistributedDelegate):
 ops.register_dense_tensor_like_type(DistributedVariable)
 
 
+def _validate_colocate_extended(v, extended):
+  variable_strategy = v._distribute_strategy  # pylint: disable=protected-access
+  if variable_strategy.extended is not extended:
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not %s created in scope: %s" %
+        (v, variable_strategy))
+
+
+def validate_colocate_distributed_variable(v, extended):
+  if not isinstance(v, DistributedVariable):
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not: %r" % (v,))
+  _validate_colocate_extended(v, extended)
+
+
+def validate_colocate_tpu_variable(v, extended):
+  if not isinstance(v, TPUMirroredVariable):
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not: %r" % (v,))
+  _validate_colocate_extended(v, extended)
+
+
+def validate_colocate(v, extended):
+  if not hasattr(v, "_distribute_strategy"):
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not: %r" % (v,))
+  _validate_colocate_extended(v, extended)
+
+
 def _apply_aggregation(strategy, value, aggregation, destinations):
   if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-    return strategy.broadcast(strategy.unwrap(value)[0],
-                              destinations=destinations)
+    return strategy.extended.broadcast_to(strategy.unwrap(value)[0],
+                                          destinations=destinations)
   reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation)
   return strategy.extended.reduce_to(reduce_op, value, destinations)
 
@@ -576,12 +630,13 @@ class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
 
 
 class MirroredVariable(DistributedVariable, Mirrored,
-                       checkpointable.CheckpointableBase):
+                       checkpointable.Checkpointable):
   """Holds a map from device to variables whose values are kept in sync."""
 
-  def __init__(self, device_map, values, aggregation, logical_device=None):
+  def __init__(
+      self, strategy, device_map, values, aggregation, logical_device=None):
     super(MirroredVariable, self).__init__(
-        device_map, values, logical_device=logical_device)
+        strategy, device_map, values, logical_device=logical_device)
     self._aggregation = aggregation
 
   # The arguments to update() are automatically unwrapped so the update()
@@ -591,8 +646,9 @@ class MirroredVariable(DistributedVariable, Mirrored,
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
+    _assert_strategy(self._distribute_strategy)
     f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       update_device = distribute_lib.get_update_device()
       if update_device is not None:
         # We are calling an assign function on the mirrored variable in an
@@ -601,11 +657,11 @@ class MirroredVariable(DistributedVariable, Mirrored,
         return f(v, *args, **kwargs)
 
       # We are calling assign on the mirrored variable in cross replica context,
-      # use `strategy.update()` to update the variable.
-      strategy = distribution_strategy_context.get_distribution_strategy()
-      return strategy.update(self, f, *args, **kwargs)
+      # use `strategy.extended.update()` to update the variable.
+      return self._distribute_strategy.extended.update(
+          self, f, args=args, kwargs=kwargs)
     else:
-      _assert_replica_context()
+      _assert_replica_context(self._distribute_strategy)
       # We are calling an assign function on the mirrored variable in replica
       # context.
       # We reduce the value we want to assign/add/sub. More details about how we
@@ -618,7 +674,8 @@ class MirroredVariable(DistributedVariable, Mirrored,
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
         v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.update(self, f, v, *other_args, **other_kwargs)
+        return strategy.extended.update(
+            self, f, args=(v,) + other_args, kwargs=other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
           merge_fn, args=args, kwargs=kwargs)
@@ -648,7 +705,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       return self.primary._as_graph_element()
     return self.get()._as_graph_element()
 
@@ -670,7 +727,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
 # allowing instances of the class to be used as tensors.
 def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False):
   # Try to avoid assignments to and other mutations of MirroredVariable
-  # state except through a DistributionStrategy.update() call.
+  # state except through a DistributionStrategy.extended.update() call.
   assert not as_ref
   return ops.internal_convert_to_tensor(
       var.get(), dtype=dtype, name=name, as_ref=as_ref)
@@ -695,11 +752,13 @@ def _enclosing_tpu_context():
 # tpu.replicate() because it assumes that you're in a device context where you
 # can operate on a single version of the variable, but a tpu.replicate()
 # operates on all variables and is replicated during a rewrite pass.
-class TPUMirroredVariable(checkpointable.CheckpointableBase):
+class TPUMirroredVariable(checkpointable.Checkpointable):
   """Holds a map from device to TPU variables whose values are kept in sync."""
 
-  def __init__(self, device_map, values, aggregation, logical_device=None):
+  def __init__(
+      self, strategy, device_map, values, aggregation, logical_device=None):
     assert isinstance(device_map, DeviceMap)
+    self._distribute_strategy = strategy
     self._device_map = device_map
     self._values = tuple(values)
     if logical_device is None:
@@ -756,6 +815,10 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
   def values(self):
     return self._values
 
+  @property
+  def distribute_strategy(self):
+    return self._distribute_strategy
+
   # pylint: disable=multiple-statements
   def __add__(self, o): return self.read_value() + o
   def __radd__(self, o): return o + self.read_value()
@@ -853,15 +916,12 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
-    strategy = distribution_strategy_context.get_distribution_strategy()
-    if strategy.__class__.__name__ != "TPUStrategy":
-      raise ValueError("You may only assign to a TPUMirroredVariable within a "
-                       "TPUStrategy.")
+    _assert_strategy(self._distribute_strategy)
     f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       if _enclosing_tpu_context() is not None:
-        return distribution_strategy_context.get_distribution_strategy().update(
-            self, f, *args, **kwargs)
+        return self._distribute_strategy.extended.update(
+            self, f, args=args, kwargs=kwargs)
 
       update_device = distribute_lib.get_update_device()
       # We are calling update on the mirrored variable in cross replica context.
@@ -871,10 +931,10 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
         v = self._get(device=update_device)
         return f(v, *args, **kwargs)
 
-      return distribution_strategy_context.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
+      return self._distribute_strategy.extended.update(
+          self, f, args=args, kwargs=kwargs)
     else:
-      _assert_replica_context()
+      _assert_replica_context(self._distribute_strategy)
       # We are calling an assign function on the mirrored variable in replica
       # context.
       # We reduce the value we want to assign/add/sub. More details about how we
@@ -887,7 +947,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
         v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.update(self, f, v, *other_args, **other_kwargs)
+        return strategy.extended.update(
+            self, f, args=(v,) + other_args, kwargs=other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
           merge_fn, args=args, kwargs=kwargs)
@@ -1019,7 +1080,7 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       return self.primary._as_graph_element()
     return self._read_variable_op()
 
@@ -1117,7 +1178,7 @@ class _ReplicaLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
     # We use a callable so that we don't have to evaluate this expression
     # in the case where we are trying to restore instead of save.
     def tensor():
-      strategy = distribution_strategy_context.get_distribution_strategy()
+      strategy = replica_local_variable._distribute_strategy  # pylint: disable=protected-access
       return strategy.extended.read_var(replica_local_variable)
 
     spec = saver.BaseSaverBuilder.SaveSpec(
@@ -1133,31 +1194,36 @@ class _ReplicaLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
     return self._replica_local_variable.assign(tensor)
 
 
-def _assert_replica_context():
-  if not distribution_strategy_context.get_replica_context():
+def _assert_replica_context(strategy):
+  replica_context = distribution_strategy_context.get_replica_context()
+  if not replica_context:
+    raise RuntimeError(
+        "Replica-local variables may only be assigned in a replica context.")
+  if replica_context.strategy is not strategy:
     raise RuntimeError(
         "Replica-local variables may only be assigned in a replica context.")
 
 
 class ReplicaLocalVariable(DistributedVariable, PerReplica,
-                           checkpointable.CheckpointableBase):
+                           checkpointable.Checkpointable):
   """Holds a map from device to variables whose values are reduced on save."""
 
-  def __init__(self, device_map, values, aggregation, logical_device=None):
+  def __init__(
+      self, strategy, device_map, values, aggregation, logical_device=None):
     self._aggregation = aggregation
     super(ReplicaLocalVariable, self).__init__(
-        device_map, values, logical_device=logical_device)
+        strategy, device_map, values, logical_device=logical_device)
 
   def assign_sub(self, *args, **kwargs):
-    _assert_replica_context()
+    _assert_replica_context(self._distribute_strategy)
     return self.get().assign_sub(*args, **kwargs)
 
   def assign_add(self, *args, **kwargs):
-    _assert_replica_context()
+    _assert_replica_context(self._distribute_strategy)
     return self.get().assign_add(*args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       # To preserve the sum across save and restore, we have to divide the
       # total across all devices when restoring a variable that was summed
       # when saving.
@@ -1167,7 +1233,7 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
       return control_flow_ops.group(tuple(
           _assign_on_device(v.device, v, tensor) for v in self._values))
     else:
-      _assert_replica_context()
+      _assert_replica_context(self._distribute_strategy)
       return self.get().assign(*args, **kwargs)
 
   @property
@@ -1185,7 +1251,7 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       return self._get_cross_replica()
     return self.get()._as_graph_element()
 
@@ -1347,694 +1413,6 @@ def update_regroup(extended, device_map, updates, group):
   return nest.pack_sequence_as(regrouped, grouped_flat)
 
 
-class InputWorkers(object):
-  """A 1-to-many mapping from input worker devices to compute devices."""
-
-  def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
-    """Initialize an `InputWorkers` object.
-
-    Args:
-      device_map: A `DeviceMap` with the computation devices fed by the
-        input workers.
-      worker_device_pairs: A sequence of pairs:
-        `(input device, a tuple of compute devices fed by that input device)`.
-      logical_device: The logical device of `device_map` to feed.
-    """
-    self._device_map = device_map
-    self._logical_device = logical_device
-    if worker_device_pairs is None:
-      worker_device_pairs = ((
-          device_util.canonicalize("/device:CPU:0"),
-          device_map.logical_to_actual_devices(logical_device)),)
-    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
-    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
-                              for _, f in worker_device_pairs)
-    flattened = tuple(d for l in self._fed_devices for d in l)
-    assert (flattened ==
-            device_map.logical_to_actual_devices(logical_device)), (
-                "flattened: %s logical device %d: %s" %
-                (flattened, logical_device,
-                 device_map.logical_to_actual_devices(logical_device)))
-
-  @property
-  def device_map(self):
-    return self._device_map
-
-  @property
-  def logical_device(self):
-    return self._logical_device
-
-  @property
-  def num_workers(self):
-    return len(self._input_worker_devices)
-
-  @property
-  def worker_devices(self):
-    return self._input_worker_devices
-
-  def compute_devices_for_worker(self, worker_index):
-    return self._fed_devices[worker_index]
-
-  def __repr__(self):
-    devices = self.worker_devices
-    debug_repr = ",\n".join("  %d %s: %s" %
-                            (i, devices[i], self._fed_devices[i])
-                            for i in range(len(devices)))
-    return "%s:{\n%s\n  device_map: %s}" % (
-        self.__class__.__name__, debug_repr, self._device_map)
-
-
-class PerReplicaDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
-
-  def __init__(self, iterator, input_workers, worker_index, prefetch_on_device):
-    assert isinstance(input_workers, InputWorkers)
-    self._iterator = iterator
-    self._input_workers = input_workers
-    self._worker_index = worker_index
-    self._prefetch_on_device = prefetch_on_device
-
-  @property
-  def initializer(self):
-    return self._iterator.initializer
-
-  def get_next_as_list(self, name=None):
-    """Scatter the input across devices."""
-    if self._prefetch_on_device:
-      data_list = self._iterator.get_next()
-    else:
-      batch = self._iterator.get_next(name=name)
-      data_list = []
-      def get_ith(i):
-        return lambda x: x[i]
-
-      devices = self._input_workers.compute_devices_for_worker(
-          self._worker_index)
-      for i, d in enumerate(devices):
-        v = nest.map_structure(get_ith(i), batch)
-        if context.executing_eagerly():
-          with ops.device(d):
-            v = nest.map_structure(array_ops.identity, v)
-        data_list.append(v)
-
-    return data_list
-
-  def get_next(self, name=None):
-    assert self._input_workers.num_workers == 1
-    data_list = self.get_next_as_list(name)
-    return regroup(self._input_workers.device_map, data_list)
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-class PerReplicaDataset(object):
-  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
-
-  def __init__(self, dataset, input_workers, worker_index,
-               prefetch_on_device=None):
-    assert isinstance(input_workers, InputWorkers)
-    assert worker_index is not None
-    assert worker_index is not True
-    assert worker_index is not False
-    self._input_workers = input_workers
-    self._worker_index = worker_index
-
-    # Default to using prefetching in graph mode, unless specified.
-    # TODO(rohanj): Enable prefetching in eager mode.
-    self._prefetch_on_device = prefetch_on_device
-    if self._prefetch_on_device is None:
-      self._prefetch_on_device = not context.executing_eagerly()
-    assert not (self._prefetch_on_device and context.executing_eagerly()), (
-        "Prefetching is only supported in graph mode currently")
-
-    self._dataset = dataset
-    if not self._prefetch_on_device:
-      # TODO(priyag): If dropping remainder is not appropriate, find another
-      # approach to distributing the dataset when not possible to divide evenly.
-      # Possibly not an issue when we start using PartitionedDataset.
-      num_replicas = len(input_workers.compute_devices_for_worker(worker_index))
-      self._dataset = dataset.batch(num_replicas, drop_remainder=True)
-
-  def make_one_shot_iterator(self):
-    """Get a one time use iterator for the distributed PerReplicaDataset."""
-    # Graph mode with one shot iterator is disabled.
-    if not context.executing_eagerly():
-      raise ValueError("Cannot create a one shot iterator. Please use "
-                       "`make_initializable_iterator()` instead.")
-    # Eager mode prefetching would error out in constructor. Only remaining
-    # case is non-prefetching in eager mode. We delegate to
-    # PerReplicaDataIterator to handle that case.
-    dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator, self._input_workers, self._worker_index,
-        prefetch_on_device=False)
-
-  def make_initializable_iterator(self):
-    """Get an initializable iterator for the distributed PerReplicaDataset."""
-    # Eager mode generates already initialized iterators. Hence we cannot create
-    # an initializable iterator.
-    if context.executing_eagerly():
-      raise ValueError("Cannot create initializable iterator in Eager mode. "
-                       "Please use `make_one_shot_iterator` instead.")
-    if self._prefetch_on_device:
-      replica_devices = self._input_workers.compute_devices_for_worker(
-          self._worker_index)
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, replica_devices)
-    else:
-      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator, self._input_workers, self._worker_index,
-        prefetch_on_device=self._prefetch_on_device)
-
-
-class MultiWorkerDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
-
-  def __init__(self, iterators, input_workers):
-    """Initialize the `MultiWorkerDataIterator` object.
-
-    Args:
-      iterators: a list of worker, iterator pairs.
-      input_workers: an `InputWorkers` object.
-
-    Raises:
-      ValueError: if iterators and input_workers are not compatible.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    workers = tuple(d for d, _ in iterators)
-    if workers != input_workers.worker_devices:
-      raise ValueError("iterators and input_workers are not compatible. "
-                       "iterator workers: %r input_workers devices: %r" %
-                       (workers, input_workers.worker_devices))
-    self._iterators = tuple(i for _, i in iterators)
-    self._input_workers = input_workers
-
-  @property
-  def initializer(self):
-    return control_flow_ops.group(
-        tuple(iterator.initializer for iterator in self._iterators))
-
-  def get_iterator(self, worker):
-    for i, w in enumerate(self._input_workers.worker_devices):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  def get_next(self, name=None):
-    """Scatter the input across hosts and devices."""
-    replicas = []
-    for worker, iterator in zip(self._input_workers.worker_devices,
-                                self._iterators):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        data_per_worker = iterator.get_next_as_list(name=new_name)
-        # Append to replicas to get a flat list of values indexed by replica.
-        replicas.extend(data_per_worker)
-
-    return regroup(self._input_workers.device_map, replicas)
-
-
-class MultiWorkerDataset(object):
-  """Like a `tf.data.Dataset` that distributes data to different workers.
-
-  Each worker gets one shard of the input dataset. This currently does not work
-  in eager mode.
-  """
-
-  def __init__(self, dataset_fn, input_workers, prefetch_on_device=None,
-               auto_shard=False):
-    """Initialize the MultiWorkerDataset object.
-
-    Args:
-      dataset_fn: a function or a list of functions that returns a
-        `tf.data.Dataset`.
-      input_workers: an `InputWorkers` object.
-      prefetch_on_device: whether to prefetch to devices.
-      auto_shard: whether to auto-shard the dataset.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if isinstance(dataset_fn, (list, tuple)):
-      if len(dataset_fn) != input_workers.num_workers:
-        raise ValueError("If `dataset_fn` is a list, it must have one entry "
-                         "per worker")
-    # TODO(rohanj): b/120673685 to track re-enabling auto sharding.
-    if auto_shard:
-      raise ValueError("Currently autosharding is not supported.")
-    self._input_workers = input_workers
-    self._datasets = []
-    # TODO(yuefengz, priyag): support different set of jobs for input
-    # processing.
-    for i, worker in enumerate(input_workers.worker_devices):
-      with ops.device(worker):
-        if isinstance(dataset_fn, (list, tuple)):
-          worker_input = dataset_fn[i]()
-        else:
-          worker_input = dataset_fn()
-        dataset = PerReplicaDataset(worker_input, input_workers, i,
-                                    prefetch_on_device=prefetch_on_device)
-        self._datasets.append((worker, dataset))
-
-  def make_one_shot_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._input_workers)
-
-  def make_initializable_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append(
-            (worker, dataset_ops.make_initializable_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._input_workers)
-
-
-class InputIterator(object):
-  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
-
-  def get_next(self):
-    """Returns the next inputs for all replicas."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  def initialize(self):
-    """Initialize the underlying input dataset, when applicable.
-
-    In eager mode, this will create a new iterator and return it.
-    In graph mode, this will initialize the same underlying iterator(s).
-
-    Users are required to call this if
-    - This iterator was returned from a call to `make_input_fn_iterator` with an
-      input function that returns a dataset.
-    - Or this iterator was returned from a call to `make_dataset_iterator`.
-
-    Returns:
-      A list of initialization ops to be executed.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-
-class InputIteratorImpl(InputIterator):
-  """Common implementation for all input iterators."""
-
-  def __init__(self, input_workers, iterators):
-    assert isinstance(input_workers, InputWorkers)
-    if not input_workers.worker_devices:
-      raise ValueError("Should have at least one worker for input iterator.")
-
-    self._iterators = iterators
-    self._input_workers = input_workers
-    self._is_eager = context.executing_eagerly()
-
-  def get_next(self, name=None):
-    """Returns the next input from the iterator for all replicas."""
-    assert self._is_eager == context.executing_eagerly(), (
-        "Iterator should be created and used in same execution mode.")
-
-    replicas = []
-    for i, worker in enumerate(self._input_workers.worker_devices):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        # Make `replicas` a flat list of values across all replicas.
-        replicas.extend(self._iterators[i].get_next_as_list(new_name))
-
-    return regroup(self._input_workers.device_map, replicas)
-
-  def initialize(self):
-    """Initialze underlying iterators.
-
-    Returns:
-      A list of any initializer ops that should be run.
-    """
-    assert self._is_eager == context.executing_eagerly(), (
-        "Iterator should be created and used in same execution mode.")
-
-    init_ops = []
-    for it in self._iterators:
-      init_ops.extend(it.initialize())
-    return init_ops
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_classes(self):
-    return self._iterators[0].output_classes
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  def get_iterator(self, worker):
-    for i, w in enumerate(self._input_workers.worker_devices):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-
-class InputFunctionIterator(InputIteratorImpl):
-  """Iterator created from input function."""
-
-  def __init__(self, input_fn, input_workers, input_contexts):
-    """Make an iterator for input provided via an input function.
-
-    Currently implements PER_WORKER mode, in which the `input_fn` is called
-    once on each worker.
-
-    TODO(priyag): Add other replication modes.
-    TODO(priyag): Allow taking input function that returns a callable that
-    returns nest of tensors.
-
-    Args:
-      input_fn: Input function that returns a `tf.data.Dataset` object.
-      input_workers: an `InputWorkers` object.
-      input_contexts: A list of `InputContext` instances to be passed to call(s)
-        to `input_fn`. Length and order should match worker order in
-        `worker_device_pairs`.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if input_workers.num_workers != len(input_contexts):
-      raise ValueError(
-          "Number of input workers (%d) is not same as number of "
-          "input_contexts (%d)" %
-          (input_workers.num_workers, len(input_contexts)))
-
-    iterators = []
-    for i, ctx in enumerate(input_contexts):
-      worker = input_workers.worker_devices[i]
-      with ops.device(worker):
-        result = input_fn(ctx)
-        if not isinstance(result, dataset_ops.DatasetV2):
-          raise ValueError("input_fn must return a tf.data.Dataset.")
-        devices = input_workers.compute_devices_for_worker(i)
-        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
-        iterators.append(iterator)
-
-    super(InputFunctionIterator, self).__init__(input_workers, iterators)
-
-
-class DatasetIterator(InputIteratorImpl):
-  """Iterator created from input dataset."""
-
-  def __init__(self, dataset, input_workers, split_batch_by=None):
-    """Make an iterator for the dataset on given devices.
-
-    If `split_batch_by` is not None, we "split" each batch of the
-    dataset by `split_batch_by` value. To achieve this, we first unbatch the
-    input dataset and then rebatch it with the per replica batch size that is
-    calculated using `global_batch_size // split_batch_by`.
-    The currently supported datasets are as follows:
-    `dataset.batch()` is the last operation on the dataset OR
-    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
-    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
-    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
-
-    TODO(priyag): Support multi worker / host cases properly by cloning
-    and sharding the dataset on each worker. Current setup will only work in
-    some cases, such as in-graph multi worker GPU case. If the input pipeline
-    has random shuffling (with a different seed on each worker), each worker
-    will see random input from the same overall dataset in each step. Otherwise,
-    each worker will see the same input in each step.
-
-    Args:
-      dataset: `tf.data.Dataset` that will be used as the input source.
-      input_workers: an `InputWorkers` object.
-      split_batch_by: Optional integer. If present, we "split" each batch of the
-        dataset by `split_batch_by` value.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if split_batch_by:
-      dataset = _split_dataset_batch(dataset, split_batch_by)
-
-    iterators = []
-    for i, worker in enumerate(input_workers.worker_devices):
-      with ops.device(worker):
-        worker_devices = input_workers.compute_devices_for_worker(i)
-        cloned_dataset = dataset
-        if not context.executing_eagerly():
-          cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
-        iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
-                                                worker_devices)
-        iterators.append(iterator)
-
-    super(DatasetIterator, self).__init__(input_workers, iterators)
-
-
-class _SingleWorkerDatasetIterator(object):
-  """Iterator for a single `tf.data.Dataset`."""
-
-  def __init__(self, dataset, worker, devices):
-    """Create iterator for the `dataset` to fetch data to worker's `devices` .
-
-    `MultiDeviceIterator` is used to prefetch input to the devices on the
-    given worker. `MultiDeviceIterator` doesn't work in eager mode yet.
-
-    Args:
-      dataset: A `tf.data.Dataset` instance.
-      worker: Worker on which ops should be created.
-      devices: Distribute data from `dataset` to these devices.
-    """
-    self._dataset = dataset
-    self._worker = worker
-    self._devices = devices
-    self._is_eager = context.executing_eagerly()
-    self._make_iterator()
-
-  def _make_iterator(self):
-    """Make appropriate iterator on the dataset."""
-    with ops.device(self._worker):
-      if self._is_eager:
-        # TODO(rohanj): Enable prefetching in eager mode.
-        # TODO(priyag): Measure the performance of this approach vs calling
-        # get_next on the original dataset N times.
-        dataset = self._dataset.batch(len(self._devices), drop_remainder=True)
-        iterator = dataset_ops.make_one_shot_iterator(dataset)
-      else:
-        iterator = multi_device_iterator_ops.MultiDeviceIterator(
-            self._dataset, self._devices)
-    self._iterator = iterator
-
-  def get_next_as_list(self, name=None):
-    """Get next element from the underlying iterator."""
-    with ops.device(self._worker):
-      if self._is_eager:
-        # Batched dataset case.
-        batch = self._iterator.get_next(name=name)
-        data_list = []
-        for i, d in enumerate(self._devices):
-          v = nest.map_structure(operator.itemgetter(i), batch)
-          with ops.device(d):
-            v = nest.map_structure(array_ops.identity, v)
-          data_list.append(v)
-      else:
-        # MultiDeviceIterator case.
-        data_list = self._iterator.get_next()
-
-      return data_list
-
-  def initialize(self):
-    """Initialze underlying iterator.
-
-    In eager execution, this simply recreates the underlying iterator.
-    In graph execution, it returns the initializer ops for the underlying
-    iterator.
-
-    Returns:
-      A list of any initializer ops that should be run.
-    """
-    if self._is_eager:
-      self._make_iterator()
-      return []
-    else:
-      return [self._iterator.initializer]
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-def _split_dataset_batch(dataset, split_batch_by):
-  """Divide a batch-ed dataset's batches into smaller batches."""
-  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
-  # pylint: disable=protected-access
-  def _get_batch_dataset(d):
-    """Get the underlying batch dataset from the dataset object."""
-    if isinstance(d, dataset_ops.DatasetV1Adapter):
-      d = d._dataset
-
-    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
-      return d
-    elif isinstance(d, dataset_ops.PrefetchDataset):
-      return _get_batch_dataset(d._input_dataset)
-    raise ValueError(
-        "Unable to get batched dataset from the input dataset. `batch` "
-        "`map_and_batch` need to be the last operations on the dataset. "
-        "The batch operations can be followed by a prefetch.")
-
-  batched_dataset = _get_batch_dataset(dataset)
-  if isinstance(batched_dataset, dataset_ops.BatchDataset):
-    batch_size = batched_dataset._batch_size
-    drop_remainder = batched_dataset._drop_remainder
-  elif isinstance(batched_dataset, batching._MapAndBatchDataset):
-    batch_size = batched_dataset._batch_size_t
-    drop_remainder = batched_dataset._drop_remainder_t
-  # pylint: enable=protected-access
-
-  if tensor_util.is_tensor(batch_size):
-    batch_size = tensor_util.constant_value(batch_size)
-
-  if tensor_util.is_tensor(drop_remainder):
-    drop_remainder = tensor_util.constant_value(drop_remainder)
-
-  if batch_size % split_batch_by:
-    raise ValueError(
-        "Batch size %s cannot be sharded evenly across replicas %s" % (
-            batch_size, split_batch_by))
-  new_batch_size = batch_size // split_batch_by
-
-  dataset = dataset.apply(batching.unbatch())
-  return dataset.batch(new_batch_size, drop_remainder=drop_remainder)
-
-
-class MultiStepContext(object):
-  """A context object that can be used to capture things when running steps.
-
-  This context object is useful when running multiple steps at a time using the
-  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
-  function to specify which outputs to emit at what frequency. Currently it
-  supports capturing output from the last step, as well as capturing non tensor
-  outputs.  In the future it will be augmented to support other use cases such
-  as output each N steps.
-  """
-
-  def __init__(self):
-    """Initialize an output context.
-
-    Returns:
-      A context object.
-    """
-    self._last_step_outputs = {}
-    self._last_step_outputs_reduce_ops = {}
-    self._non_tensor_outputs = {}
-
-  @property
-  def last_step_outputs(self):
-    """A dictionary consisting of outputs to be captured on last step.
-
-    Keys in the dictionary are names of tensors to be captured, as specified
-    when `set_last_step_output` is called.
-    Values in the dictionary are the tensors themselves. If
-    `set_last_step_output` was called with a `reduce_op` for this output,
-    then the value is the reduced value.
-
-    Returns:
-      A dictionary with last step outputs.
-    """
-    return self._last_step_outputs
-
-  def _set_last_step_outputs(self, outputs):
-    """Replace the entire dictionary of last step outputs."""
-    if not isinstance(outputs, dict):
-      raise ValueError("Need a dictionary to set last_step_outputs.")
-    self._last_step_outputs = outputs
-
-  def set_last_step_output(self, name, output, reduce_op=None):
-    """Set `output` with `name` to be outputted from the last step.
-
-    Args:
-      name: String, name to identify the output. Doesn't need to match tensor
-        name.
-      output: The tensors that should be outputted with `name`. See below for
-        actual types supported.
-      reduce_op: Reduction method to use to reduce outputs from multiple
-        replicas. Required if `set_last_step_output` is called in a replica
-        context. Optional in cross_replica_context.
-        When present, the outputs from all the replicas are reduced using the
-        current distribution strategy's `reduce` method. Hence, the type of
-        `output` must be what's supported by the corresponding `reduce` method.
-        For e.g. if using MirroredStrategy and reduction is set, output
-        must be a `PerReplica` value.
-        The reduce method is also recorded in a dictionary
-        `_last_step_outputs_reduce_ops` for later interpreting of the
-        outputs as already reduced or not.
-    """
-    if distribution_strategy_context.get_cross_replica_context():
-      self._last_step_outputs_reduce_ops[name] = reduce_op
-      if reduce_op is None:
-        self._last_step_outputs[name] = output
-      else:
-        distribution = distribution_strategy_context.get_distribution_strategy()
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
-    else:
-      assert reduce_op is not None
-      def merge_fn(distribution, value):
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
-        # Setting this inside the `merge_fn` because all replicas share the same
-        # context object, so it's more robust to set it only once (even if all
-        # the replicas are trying to set the same value).
-        self._last_step_outputs_reduce_ops[name] = reduce_op
-
-      distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=(output,))
-
-  @property
-  def non_tensor_outputs(self):
-    """A dictionary consisting of any non tensor outputs to be captured."""
-    return self._non_tensor_outputs
-
-  def set_non_tensor_output(self, name, output):
-    """Set `output` with `name` to be captured as a non tensor output."""
-    if distribution_strategy_context.get_cross_replica_context():
-      self._non_tensor_outputs[name] = output
-    else:
-      def merge_fn(distribution, value):
-        # NOTE(priyag): For non tensor outputs, we simply return all the values
-        # in a list as reduction doesn't make sense on non tensors.
-        self._non_tensor_outputs[name] = distribution.unwrap(value)
-      distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=(output,))
-
-
 def value_container(val):
   """Returns the container that this per-replica `value` belongs to.
 
@@ -2058,10 +1436,11 @@ def value_container(val):
 
 
 # TODO(josh11b): Descend from Variable.
-class AggregatingVariable(checkpointable.CheckpointableBase):
+class AggregatingVariable(checkpointable.Checkpointable):
   """A wrapper around a variable that aggregates updates across replicas."""
 
-  def __init__(self, v, aggregation):
+  def __init__(self, strategy, v, aggregation):
+    self._distribute_strategy = strategy
     self._v = v
     # NOTE: We don't use "_distributed_container" here because we don't want
     # to trigger that code path in regroup().
@@ -2071,12 +1450,17 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
   def get(self):
     return self._v
 
+  @property
+  def distribute_strategy(self):
+    return self._distribute_strategy
+
   def __getattr__(self, name):
     return getattr(self._v, name)
 
   def _assign_func(self, *args, **kwargs):
+    _assert_strategy(self._distribute_strategy)
     f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       update_device = distribute_lib.get_update_device()
       if update_device is not None:
         # We are calling an assign function in an update context.
@@ -2084,24 +1468,25 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
 
       # We are calling an assign function in cross replica context, wrap it in
       # an update call.
-      return distribution_strategy_context.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
+      return self._distribute_strategy.extended.update(
+          self, f, args=args, kwargs=kwargs)
     else:
-      assert distribution_strategy_context.get_replica_context()
+      replica_context = distribution_strategy_context.get_replica_context()
+      assert replica_context
       # We are calling an assign function in replica context.
       # We reduce the value we want to assign/add/sub. More details about how we
       # handle the different use cases can be found in the _reduce method.
       # We call the function with the reduced value.
       if self._aggregation == vs.VariableAggregation.NONE:
         raise ValueError("You must specify an aggregation method to update a "
-                         "a variable in Replica Context.")
+                         "a variable in replica context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
         v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.update(self, f, v, *other_args, **other_kwargs)
+        return strategy.extended.update(
+            self, f, args=(v,) + other_args, kwargs=other_kwargs)
 
-      return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=args, kwargs=kwargs)
+      return replica_context.merge_call(merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index cd5c0be283eea729574614032817632de6b86fff..6f798fcdb06da2c513f570935e3b6ad78f04dda6 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_py_test", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load(
     "//tensorflow/tools/test:performance.bzl",
@@ -25,6 +25,7 @@ cc_library(
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
         "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/c/eager:tape",
         "//tensorflow/core:framework",
@@ -55,6 +56,7 @@ py_library(
         ":execute",
         ":function",
         ":graph_only_ops",
+        ":profiler",
         ":tape",
         ":test",
         ":wrap_function",
@@ -89,6 +91,28 @@ py_library(
     ],
 )
 
+py_library(
+    name = "profiler",
+    srcs = ["profiler.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
+cuda_py_test(
+    name = "profiler_test",
+    srcs = ["profiler_test.py"],
+    additional_deps = [
+        ":profiler",
+        ":test",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
+    ],
+)
+
 py_library(
     name = "tape",
     srcs = ["tape.py"],
@@ -255,11 +279,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "execution_callbacks_test",
     srcs = ["execution_callbacks_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":execution_callbacks",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -373,11 +396,10 @@ tf_py_logged_benchmark(
     target = "//tensorflow/python/eager:benchmarks_test",
 )
 
-py_test(
+tf_py_test(
     name = "tape_test",
     srcs = ["tape_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":test",
@@ -414,20 +436,19 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "pywrap_tfe_test",
     srcs = ["pywrap_tfe_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":core",
         ":test",
+        "//third_party/py/numpy",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -491,11 +512,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "def_function_test",
     srcs = ["def_function_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":def_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -519,11 +539,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "wrap_function_test",
     srcs = ["wrap_function_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":wrap_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 2c0f68365ccae559c02f6b36b3eadf2971418ade..6117d8a4ea154fc09acc77f8dbd5daa5afea81e0 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -465,14 +465,16 @@ def val_and_grad_function(f, params=None):
 
 
 def make_vjp(f, params=None, persistent=True):
-  """Returns a function that computes f and is vjp w.r.t. params.
+  """Returns a function that computes f and its vjp w.r.t.
+
+  params.
 
   The term "vjp" here is an abbreviation for vector-jacobian product.
 
   Args:
     f: the function to be differentiated.
     params: the parameters (numbers or names) to differentiate with respect to.
-       A value of None will differentiate with respect to all parameters.
+      A value of None will differentiate with respect to all parameters.
     persistent: Boolean controlling whether the VJP function can be re-used.
       Must be True or False.
 
@@ -595,7 +597,9 @@ def _fast_fill(value, shape, dtype):
 
 def _zeros(shape, dtype):
   """Helper to return (possibly cached) zero tensors in eager mode."""
-  if dtype == dtypes.variant or dtype == dtypes.string:
+  if (dtype == dtypes.variant
+      or dtype == dtypes.string
+      or dtype == dtypes.resource):
     # TODO(apassos): need to save enough information about variant tensors to do
     # a zeros
     return None
@@ -928,11 +932,12 @@ class GradientTape(object):
                             "gradient in order to compute higher order "
                             "derrivatives.", 1)
 
-    flat_targets = nest.flatten(target)
-    for t in flat_targets:
+    flat_targets = []
+    for t in nest.flatten(target):
       if resource_variable_ops.is_resource_variable(t):
-        raise ValueError("GradientTape.gradient is not supported for variable "
-                         "targets.")
+        with self:
+          t = ops.convert_to_tensor(t)
+      flat_targets.append(t)
 
     flat_sources = nest.flatten(sources)
     flat_sources = [_handle_or_self(x) for x in flat_sources]
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 22ae6f74cb6a5fa0a3a9ab16b516b8798291f4b8..5f4fda8897b3913ffeb165819a4b7859821ec3b8 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -354,6 +354,16 @@ class BackpropTest(test.TestCase):
       loss += v * v
     self.assertAllEqual(t.gradient(loss, v), 2.0)
 
+  def testPythonMax(self):
+    x = [resource_variable_ops.ResourceVariable(2.),
+         resource_variable_ops.ResourceVariable(3.),
+         resource_variable_ops.ResourceVariable(5.)]
+    with backprop.GradientTape() as t:
+      f = max(x)
+    grad = t.gradient(f, x)
+    self.assertAllEqual(self.evaluate(f), 5.)
+    self.assertAllEqual(self.evaluate(grad), [None, None, 1.0])
+
   def testAutomaticWatchedVariables(self):
     with backprop.GradientTape() as t:
       self.assertEqual(0, len(t.watched_variables()))
@@ -674,10 +684,8 @@ class BackpropTest(test.TestCase):
     with backprop.GradientTape() as g:
       x = variables.Variable([3.0])
       y = variables.Variable([2.0])
-    with self.assertRaisesRegexp(
-        ValueError,
-        'GradientTape.gradient is not supported for variable targets.'):
-      g.gradient(x, y)
+    grad = g.gradient(x, y)
+    self.assertAllEqual(grad, None)
 
   @test_util.run_in_graph_and_eager_modes
   @test_util.run_v1_only('b/120545219')
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 31a7efca82b016bc193ab9985ea7603897edc7ac..16b52c9baa6fda290bf92889066836a000970ebf 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -140,7 +140,7 @@ class MicroBenchmarks(test.Benchmark):
     self._m_2_by_2 = random_ops.random_uniform((2, 2))
     self._m_100_by_784 = random_ops.random_uniform((100, 784))
     self._num_iters_2_by_2 = 30000
-    self._num_iters_100_by_784 = 1000
+    self._num_iters_100_by_784 = 30000
 
   def _run(self, func, num_iters, execution_mode=None):
     # call func to maybe warm up the GPU
@@ -370,6 +370,19 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: f(m, m, transpose_b=transpose_b)
     self._run(func, num_iters, execution_mode=execution_mode)
 
+  def _benchmark_nested_defun_matmul(self, m, transpose_b, num_iters):
+    inner = function.defun(math_ops.matmul)
+
+    @function.defun
+    def outer(a, b, c, transpose_b):
+      return math_ops.matmul(inner(a, b, transpose_b=transpose_b), c)
+
+    func = lambda: outer(m, m, m, transpose_b=transpose_b)
+    # Warmup before benchmark
+    for _ in range(1000):
+      func()
+    self._run(func, num_iters)
+
   def _benchmark_defun_matmul_forward_backward(self,
                                                m,
                                                transpose_b,
@@ -525,6 +538,11 @@ class MicroBenchmarks(test.Benchmark):
           num_iters=self._num_iters_2_by_2,
           execution_mode=context.ASYNC)
 
+  def benchmark_nested_defun_matmul_2_by_2(self):
+    m = self._m_2_by_2.cpu()
+    self._benchmark_nested_defun_matmul(
+        m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
   # Benchmarks for AA.T, A of dimension 100 by 784.
   def benchmark_np_matmul_100_by_784(self):
     self._benchmark_np_matmul(
@@ -614,6 +632,11 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_nested_defun_matmul_100_by_784(self):
+    m = self._m_100_by_784.gpu()
+    self._benchmark_nested_defun_matmul(
+        m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
   def benchmark_defun_without_signature(self):
 
     def func(t1, t2, t3, t4, t5, t6, t7, t8):
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index cd43dc7ab298bb3bed6128799bf22804f0cdc3d3..33551fccd509a43416026358c0876ff22fa36bc2 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -180,8 +180,8 @@ class _ContextSwitchStack(threading.local):
   def push(self, is_building_function, enter_context_fn):
     """Push metadata about a context switch onto the stack.
 
-    A context switch can take one of two forms: installing a graph as the
-    default graph, or entering the eager context. For each context switch,
+    A context switch can take any one of the two forms: installing a graph as
+    the default graph, or entering the eager context. For each context switch,
     we record whether or not the entered context is building a function.
 
     Args:
@@ -643,6 +643,10 @@ class Context(object):
     pywrap_tensorflow.TFE_ContextAddFunctionDef(
         self._handle, fdef_string, len(fdef_string))
 
+  def has_function(self, name):
+    """Check if a function `name` is registered."""
+    return bool(pywrap_tensorflow.TFE_ContextHasFunction(self._handle, name))
+
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
 
@@ -788,6 +792,27 @@ def in_eager_mode():
   return executing_eagerly()
 
 
+def shared_name(name=None):
+  """Returns the anonymous shared name GUID if no shared name is specified.
+
+  In eager mode we need to use a unique shared name to avoid spurious sharing
+  issues. The runtime generates a unique name on our behalf when the reserved
+  GUID is used as a shared name.
+
+  Args:
+    name: Optional shared name
+
+  Returns:
+    Eager compatible shared name.
+  """
+  if name or not executing_eagerly():
+    return name
+
+  # Ensure a unique name when eager execution is enabled to avoid spurious
+  # sharing issues.
+  return "cd2c89b7-88b7-44c8-ad83-06c2a9158347"
+
+
 def graph_mode():
   """Context-manager to disable eager execution for the current thread."""
   return context()._mode(GRAPH_MODE)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index ebc47d156691abf6cb3d0894ca11647fb912cda7..1175052530652984cd6023859ddc14fd7b4998fa 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -25,12 +25,16 @@ import weakref
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as function_lib
 from tensorflow.python.eager import lift_to_graph
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 
@@ -53,6 +57,9 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                dtype=None,
                constraint=None,
                add_initializers_to=None,
+               lifted_initializer_graph=None,
+               lifted_all_initializers=None,
+               lifted_placeholders=None,
                **unused_kwargs):
     """Creates a variable.
 
@@ -86,6 +93,11 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       add_initializers_to: if not None and not in legacy graph mode, the
         initializer tensor will be added to this map instead of adding the
         assignment to the function.
+      lifted_initializer_graph: FuncGraph to try to lift initializers to.
+      lifted_all_initializers: list with one boolean element, which will be
+        set to False if we cannot lift this initializer to the above graph.
+      lifted_placeholders: placeholders for resource handles lifted out of
+        this graph.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
@@ -130,22 +142,22 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                         if init_from_fn else [initial_value]) as name:
       # pylint: disable=protected-access
       with ops.init_scope():
-        shared_name = ops._name_from_scope_name(name)
-        shared_name = "%s_%d" % (shared_name, ops.uid())
+        handle_name = ops._name_from_scope_name(name)
+        unique_id = "%s_%d" % (handle_name, ops.uid())
+        shared_name = context.shared_name(unique_id)
       with ops.name_scope("Initializer"), ops.device(None):
         initial_value = ops.convert_to_tensor(
             initial_value() if init_from_fn else initial_value,
             name="initial_value", dtype=dtype)
       with ops.init_scope():
         self._handle = resource_variable_ops.eager_safe_variable_handle(
-            shape=initial_value.get_shape(),
-            dtype=initial_value.dtype.base_dtype,
+            initial_value=initial_value,
             shared_name=shared_name,
             name=name,
             graph_mode=self._in_graph_mode)
       self._shape = initial_value.shape
-      self._unique_id = shared_name
-      self._handle_name = shared_name + ":0"
+      self._unique_id = unique_id
+      self._handle_name = handle_name + ":0"
       self._dtype = initial_value.dtype.base_dtype
       self._constraint = constraint
       assert initial_value is not None
@@ -163,6 +175,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
             with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
               self._initializer_op = resource_variable_ops.assign_variable_op(
                   self._handle, lifted_initializer, name=n)
+              assign = self._initializer_op
           with ops.name_scope("Read"), ops.colocate_with(self._handle):
             # Manually assign reads to the handle's device to avoid log
             # messages.
@@ -173,6 +186,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       else:
         if add_initializers_to is not None:
           add_initializers_to[self] = initial_value
+          assign = None
         else:
           def assign_fn():
             with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
@@ -186,9 +200,18 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
             return ops.convert_to_tensor(0)
           # Note: this cond is always guaranteed to run because we're inside a
           # defun which will insert automatic control dependencies.
-          control_flow_ops.cond(
+          assign = control_flow_ops.cond(
               resource_variable_ops.var_is_initialized_op(self._handle),
               not_assign_fn, assign_fn)
+      if lifted_initializer_graph is not None and assign is not None:
+        try:
+          handle_placeholder = ops.convert_to_tensor(self._handle)
+          op_map = lift_to_graph.lift_to_graph(
+              assign, lifted_initializer_graph,
+              sources=[handle_placeholder])
+          lifted_placeholders.append((self._handle, op_map[handle_placeholder]))
+        except ValueError:
+          lifted_all_initializers[0] = False
 
     # After the handle has been created, set up a way to clean it up when
     # executing eagerly. We'll hold the only reference to the deleter, so that
@@ -201,13 +224,26 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     self._cached_shape_as_list = None
 
 
-class PolymorphicFunction(object):
+class FunctionDeleter(object):
+
+  def __init__(self, func_graph):
+    self.func_graph = func_graph
+
+  def __del__(self):
+    try:
+      func_graph_module.dismantle_func_graph(self.func_graph)
+    except:  # pylint: disable=bare-except
+      # Note: bare except here because this can be noisy at shutdown time.
+      pass
+
+
+class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
   See the documentation for `tf.function` for more information on the semantics
   of defined functions.
 
-  PolymorphicFunction is thread-compatible.
+  `Function` is thread-compatible.
   """
 
   def __init__(self,
@@ -216,7 +252,7 @@ class PolymorphicFunction(object):
                input_signature=None,
                autograph=True,
                experimental_autograph_options=None):
-    """Initializes a polymorphic function.
+    """Initializes a `Function`.
 
     Args:
       python_function: the function to be wrapped.
@@ -279,10 +315,10 @@ class PolymorphicFunction(object):
   def _initialize(self, args, kwds, add_initializers_to=None):
     """Initializes, on the first call.
 
-    Creates two polymorphic functions, one that will allow creation of variables
+    Creates two `Function`s, one that will allow creation of variables
     and one that won't.
 
-    Additionally runs a trace for the polymorphic function that allows creation
+    Additionally runs a trace for the `Function` that allows creation
     of variables.
 
     Args:
@@ -292,11 +328,17 @@ class PolymorphicFunction(object):
     """
 
     created_variables = []
+    lifted_initializer_graph = func_graph_module.FuncGraph("initializer")
+    lifted_all_initializers = [True]
+    lifted_placeholders = []
 
     def variable_capturing_scope(unused_next_creator, **kwds):
       """Creates UnliftedInitializerVariables and saves references to them."""
       v = UnliftedInitializerVariable(
-          add_initializers_to=add_initializers_to, **kwds)
+          add_initializers_to=add_initializers_to,
+          lifted_initializer_graph=lifted_initializer_graph,
+          lifted_all_initializers=lifted_all_initializers,
+          lifted_placeholders=lifted_placeholders, **kwds)
       created_variables.append(weakref.ref(v))
       return v
 
@@ -304,9 +346,13 @@ class PolymorphicFunction(object):
     self._stateful_fn = self._defun_with_scope(variable_capturing_scope)
     self._stateful_fn._name = self._name  # pylint: disable=protected-access
     # Force the definition of the function for these arguments
+    self._lifted_initializer_graph = lifted_initializer_graph
+    self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
+    self._lifted_placeholders = lifted_placeholders
     self._concrete_stateful_fn = (
         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
             *args, **kwds))
+    self._lifted_all_initializers = lifted_all_initializers[0]
 
     def invalid_creator_scope(*unused_args, **unused_kwds):
       """Disables variable creation."""
@@ -334,6 +380,17 @@ class PolymorphicFunction(object):
 
     # This is the first call of __call__, so we have to initialize.
     self._initialize(args, kwds)
+    if self._lifted_all_initializers and self._lifted_placeholders:
+      with ops.init_scope():
+        handles, placeholders = zip(*self._lifted_placeholders)
+        if context.executing_eagerly():
+          lifted_fn = function_lib._EagerDefinedFunction(  # pylint: disable=protected-access
+              "initializer" + str(ops.uid()),
+              self._lifted_initializer_graph,
+              placeholders, [], {})
+          with tape.stop_recording():
+            lifted_fn.call(context.context(), list(handles))
+      return self._stateless_fn(*args, **kwds)
     canon_args, canon_kwds = self._canonicalize_function_inputs(args, kwds)
 
     if not self._created_variables:
@@ -407,7 +464,7 @@ class PolymorphicFunction(object):
     return self._function_spec
 
   def get_initialization_function(self, *args, **kwargs):
-    """Returns a `Function` object which initializes this function's variables.
+    """Returns a `ConcreteFunction` which initializes this function's variables.
 
     Requires that this function hasn't been accessed yet through either calling
     it or calling get_concrete_function. Fails if we cannot build an initializer
@@ -419,7 +476,8 @@ class PolymorphicFunction(object):
       **kwargs: keyword arguments to the python callable.
 
     Returns:
-      A `Function` object which initializes the variables of this function.
+      A `ConcreteFunction` object which initializes the variables of this
+      function.
 
     Raises:
       RuntimeError: if called after the variables have been initialized.
@@ -442,38 +500,55 @@ class PolymorphicFunction(object):
 
     return initialize_variables.get_concrete_function()
 
-  @property
-  def _cached_input_signatures(self):
-    """All input signatures used to call this PolymorphicFunction."""
-    seen = set()
-    # Preserves signature ordering rather than returning a set() so that we
-    # don't need to re-sort signatures later to work around Python 2's set
-    # nondeterminism.
-    # pylint: disable=protected-access
+  def _list_all_concrete_functions_for_serialization(self):
+    """Returns all concrete functions for serialization.
+
+    Returns:
+      A list of instances of `Function`.
+    """
+    if self._input_signature is not None:
+      self.get_concrete_function()
     concrete_functions = []
+    # pylint: disable=protected-access
     if self._stateful_fn:
       concrete_functions.extend(self._stateful_fn._function_cache.values())
     if self._stateless_fn:
       concrete_functions.extend(self._stateless_fn._function_cache.values())
-    for concrete_function in concrete_functions:
-      signature = concrete_function._python_call_signature
-      if signature not in seen:
-        yield signature
-        seen.add(signature)
     # pylint: enable=protected-access
+    deduplicated_concrete_functions = list()
+    seen_signatures = list()
+    # We are using a list so that:
+    #  - the returned collection is deterministic, and
+    #  - we can use a custom equality operator (is_same_structure).
+    # This is run only at serialization time on likely very small inputs so we
+    # are not concerned about O(n^2) runtime.
+    for concrete_function in concrete_functions:
+      signature, _ = concrete_function.structured_input_signature
+      flattened = nest.flatten(signature)
+      if any(
+          isinstance(arg, func_graph_module.UnknownArgument)
+          for arg in flattened):
+        logging.info("Unsupported signature for serialization: %s.", signature)
+        continue
+      equal_to_signature = functools.partial(
+          function_lib.is_same_structure, signature, check_values=True)
+      if not any(equal_to_signature(s) for s in seen_signatures):
+        deduplicated_concrete_functions.append(concrete_function)
+        seen_signatures.append(signature)
+    return deduplicated_concrete_functions
 
   def get_concrete_function(self, *args, **kwargs):
-    """Returns a `Function` object specialized to inputs and execution context.
+    """Returns a `ConcreteFunction` specialized to inputs and execution context.
 
-    If this `PolymorphicFunction` was created with an `input_signature`, `args`
-    and `kwargs` may be omitted. With an input signature there is only one
-    concrete function associated with this `PolymorphicFunction`.
+    If this `Function` was created with an `input_signature`, `args` and
+    `kwargs` may be omitted. With an input signature there is only one
+    concrete function associated with this `Function`.
 
     If there is no fixed `input_signature` associated with this
-    `PolymorphicFunction`, positional and keyword arguments to
-    `get_concrete_function` follow the same rules as input signature
-    specification, with `tf.TensorSpec` objects describing `tf.Tensor`s which
-    will be passed to the concrete function.
+    `Function`, positional and keyword arguments to `get_concrete_function`
+    follow the same rules as input signature specification, with `tf.TensorSpec`
+    objects describing `tf.Tensor`s which will be passed to the concrete
+    function.
 
     Each `tf.Tensor` argument to the concrete function must have a unique name,
     either because it is the only one associated with a named argument of the
@@ -558,8 +633,8 @@ class PolymorphicFunction(object):
   def __get__(self, instance, owner):
     """Makes it possible to defun instance methods."""
     del owner
-    # `instance` here is the instance that this `PolymorphicFunction` was
-    # accessed through; e.g., for
+    # `instance` here is the instance that this `Function` was accessed through
+    # e.g., for
     #
     #   class Foo(object):
     #
@@ -568,10 +643,10 @@ class PolymorphicFunction(object):
     #       ...
     #
     #   foo = Foo()
-    #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
+    #   foo.bar()  # `foo.bar` is a `Function` instance
     #
     # then `instance` will be `foo` (and `owner` will be `Foo`).  We create a
-    # new instance of PolymorphicFunction here to allow different instances each
+    # new instance of `Function` here to allow different instances each
     # to create variables once, thereby allowing methods to be decorated with
     # tf.function. Keeps a cache to avoid retracing the function every time the
     # descriptor is accessed.
@@ -765,7 +840,7 @@ def function(func=None,
   ```
 
   `add_noise()` will return a different output every time it is invoked.
-  However, `add_noise` will return the same value every time it is called,
+  However, `traced()` will return the same value every time it is called,
   since a particular random value generated by the `np.random.randn` call will
   be inserted in the traced/staged TensorFlow graph as a constant. In this
   particular example, replacing `np.random.randn(5, 5)` with
@@ -827,7 +902,7 @@ def function(func=None,
       name = "function"
     return tf_decorator.make_decorator(
         inner_function,
-        PolymorphicFunction(
+        Function(
             inner_function,
             name,
             input_signature=input_signature,
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 77cc8ee981a176f9f57028832039fa9bfe1f47a1..b777ead91301a9c811c00bcbeb5bdca8d807573e 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -61,6 +61,21 @@ class _HasDecoratedMethod(object):
   def f(self, x):
     return x * 3.
 
+# pylint: disable=bad-continuation,anomalous-backslash-in-string
+MIXING_GRAPH_EAGER_TENSORS_ERROR = (
+"""An op outside of the function building code is being passed
+a "Graph" tensor. It is possible to have Graph tensors
+leak out of the function building context by including a
+tf.init_scope in your function building code.
+For example, the following function will fail:
+  @tf.function
+  def has_init_scope\(\):
+    my_constant = tf.constant\(1.\)
+    with tf.init_scope\(\):
+      added = my_constant \* 2
+The graph tensor has name: Const:0""")
+# pylint: enable=bad-continuation,anomalous-backslash-in-string
+
 
 class DefFunctionTest(test.TestCase):
 
@@ -247,10 +262,9 @@ class DefFunctionTest(test.TestCase):
     concrete = compute.get_concrete_function(
         tensor_spec.TensorSpec(None, dtypes.float32))
     self.assertAllClose(4., concrete(constant_op.constant(2.)))
-    input_signature, = compute._cached_input_signatures
-    self.assertEqual(
-        tuple(input_signature),
-        (tensor_spec.TensorSpec(None, dtypes.float32),))
+    signature_args, _ = concrete.structured_input_signature
+    self.assertEqual(signature_args,
+                     (tensor_spec.TensorSpec(None, dtypes.float32),))
 
   def test_serialization_signature_cache(self):
 
@@ -260,8 +274,16 @@ class DefFunctionTest(test.TestCase):
 
     f(constant_op.constant([[3., 4.]]), constant_op.constant([2.]))
     f(constant_op.constant([[3, 4, 5]]), constant_op.constant([2]))
+
+    signatures_args = set()
+    concrete_functions = f._list_all_concrete_functions_for_serialization()
+    for concrete_function in concrete_functions:
+      args, kwargs = concrete_function.structured_input_signature
+      signatures_args.add(args)
+      self.assertEqual(dict(), kwargs)
+
     self.assertEqual(
-        set(f._cached_input_signatures),
+        signatures_args,
         set(((tensor_spec.TensorSpec([1, 2], dtypes.float32),
               tensor_spec.TensorSpec([1], dtypes.float32)),
              (tensor_spec.TensorSpec([1, 3], dtypes.int32),
@@ -289,6 +311,18 @@ class DefFunctionTest(test.TestCase):
     # function itself is not involved in a reference cycle.
     self.assertIs(None, weak_fn())
 
+  def testErrorMessageWhenGraphTensorIsPassedToEager(self):
+
+    @def_function.function
+    def failing_function():
+      a = constant_op.constant(1.)
+
+      with ops.init_scope():
+        _ = a + a
+
+    with self.assertRaisesRegexp(TypeError, MIXING_GRAPH_EAGER_TENSORS_ERROR):
+      failing_function()
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index af1afa3454655df233d8530bb89ae31c840de052..34fa3da39abc2a7311723a38011785145f23792c 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -64,7 +64,7 @@ class InfOrNanError(Exception):
     """Constructor of InfOrNanError.
 
     Args:
-      op_type: Type name of the op that generated the tensor that generated the
+      op_type: Type name of the op that generated the tensor with
         `inf`(s) or `nan`(s) (e.g., `Div`).
       op_name: Name of the op that generated the tensor with `inf`(s) or
         `nan`(s). This name is set by client and can be `None` if it is unset.
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 8b8f6af93b07accc209d3d304436096d2a768fea..ac596654d5a4a3960501761a1baeec056562873b 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -41,6 +41,8 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import error_interpolation
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
@@ -50,6 +52,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import memory
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -74,6 +77,24 @@ CacheKey = collections.namedtuple("CacheKey", [
 ])
 
 
+def is_same_structure(structure1,
+                      structure2,
+                      check_values=False):
+  """Check two structures for equality, optionally of types and of values."""
+  try:
+    nest.assert_same_structure(structure1, structure2)
+  except (ValueError, TypeError):
+    return False
+  if check_values:
+    flattened1 = nest.flatten(structure1)
+    flattened2 = nest.flatten(structure2)
+    # First check the types to avoid AttributeErrors.
+    if any(type(f1) != type(f2) for f1, f2 in zip(flattened1, flattened2)):
+      return False
+    return flattened1 == flattened2
+  return True
+
+
 def _parse_func_attrs(attributes):
   """Convert the keyword arguments into function_def attributes.
 
@@ -105,7 +126,7 @@ def _parse_func_attrs(attributes):
       attrs[key] = attr_value_pb2.AttrValue(i=value)
     elif isinstance(value, float):
       attrs[key] = attr_value_pb2.AttrValue(f=value)
-    elif isinstance(value, (str, bytes)):
+    elif isinstance(value, (str, bytes, six.text_type)):
       attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(value))
     else:
       raise ValueError("Unsupported attribute type for %s with type %s" %
@@ -113,6 +134,46 @@ def _parse_func_attrs(attributes):
   return attrs
 
 
+class _InterpolateFunctionError(object):
+  """Context Manager that interpolates the exception from 'top_level_func'."""
+
+  def __init__(self, top_level_func):
+    self._func = top_level_func
+
+  def __enter__(self):
+    pass
+
+  def __exit__(self, typ, exc, tb):
+    if not exc or not isinstance(exc, errors.OpError):
+      return False
+    message = compat.as_text(exc.message)
+    _, tags = error_interpolation.parse_message(message)
+    g = None
+    func_stack = []
+    # pylint: disable=protected-access
+    for t in tags:
+      if t.type == "function_node":
+        if t.name == compat.as_str(self._func.name):
+          g = self._func._graph
+        elif g:
+          next_func = g._get_function(t.name)
+          if next_func is not None and isinstance(next_func,
+                                                  _EagerDefinedFunction):
+            g = next_func._graph
+        if g:
+          func_stack.append(g.name)
+        else:
+          func_stack.append("<unknown>")
+    # pylint: enable=protected-access
+    if g:
+      message = error_interpolation.interpolate(message, g)
+      message += "\n\nFunction call stack:\n"
+      message += " -> ".join(func_stack)
+      message += "\n"
+      exc._message = message  # pylint: disable=protected-access
+    return False
+
+
 def _forward_name(n):
   """The name of a generated forward defun named n."""
   return "__forward_%s_%s" % (n, ops.uid())
@@ -132,7 +193,7 @@ def _inference_name(n):
 # so it doesn't have the definition-generating logic and is just a container for
 # an already-defined function.
 class _EagerDefinedFunction(object):
-  """Callable with the interface of `framework.function._DefinedFunction.`
+  """Callable with the interface of `framework.function._DefinedFunction`.
 
   `_EagerDefinedFunction` encapsulates a function definition and its properties,
   and it provides a method for calling the encapsulated function. Some Ops
@@ -195,13 +256,16 @@ class _EagerDefinedFunction(object):
     self._graph = graph
     self._stateful_ops = tuple(op for op in operations if op.op_def.is_stateful)
 
-  def add_to_graph(self, g):
+  def add_to_graph(self, g=None):
     # pylint: disable=protected-access
-    if self.name not in g._functions:
-      g._add_function(self)
-    for f in self._graph._functions.values():
-      if f.name not in g._functions:
-        g._add_function(f)
+    if not g and context.executing_eagerly():
+      context.context().add_function_def(self.definition)
+    else:
+      if self.name not in g._functions:
+        g._add_function(self)
+      for f in self._graph._functions.values():
+        if f.name not in g._functions:
+          g._add_function(f)
     # pylint: enable=protected-access
 
   @property
@@ -211,8 +275,8 @@ class _EagerDefinedFunction(object):
   def call(self, ctx, args):
     """Calls this function with `args` as inputs.
 
-    Function execution respects device annotations only if the function won't
-    be compiled with xla.
+    `ConcreteFunction` execution respects device annotations only if the
+    function won't be compiled with xla.
 
     Args:
       ctx: a Context object
@@ -224,50 +288,60 @@ class _EagerDefinedFunction(object):
     Raises:
       ValueError: if the number of arguments is incorrect.
     """
+    if len(args) != len(self.signature.input_arg):
+      raise ValueError(
+          "Arguments and signature arguments do not match: %s %s " %
+          (len(args), len(list(self.signature.input_arg))))
+
+    function_call_options = ctx.get_function_call_options()
+    if function_call_options.config_proto_serialized is None:
+      config = function_utils.get_disabled_rewriter_config()
+    else:
+      config = function_call_options.config_proto_serialized
+    executor_type = function_call_options.executor_type or ""
 
     executing_eagerly = ctx.executing_eagerly()
-
-    if self._graph._xla_compile:  # pylint: disable=protected-access
-      # XLA compilation relies upon a custom kernel creator to run functions.
-      signature = self.signature
-      if executing_eagerly:
+    if executing_eagerly:
+      with _InterpolateFunctionError(self):
         outputs = execute.execute(
-            str(signature.name),
+            str(self.signature.name),
             num_outputs=self._num_outputs,
             inputs=args,
-            attrs=None,
+            attrs=("executor_type", executor_type,
+                   "config_proto", config),
             ctx=ctx)
+      # Replace empty list with None
+      outputs = outputs or None
+    elif self._graph._xla_compile:  # pylint: disable=protected-access
+      g = ops.get_default_graph()
+      self.add_to_graph(g)
+      signature = self.signature
+      op = g.create_op(
+          signature.name,
+          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
+          op_def=signature,
+          name="FunctionCall",
+          compute_shapes=False)
+      outputs = op.outputs
+      if not outputs:
+        return op
+      if isinstance(outputs, (ops.Tensor, type(None))):
+        outputs = [outputs]
       else:
-        g = ops.get_default_graph()
-        self.add_to_graph(g)
-        op = g.create_op(
-            signature.name,
-            [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
-            tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
-            op_def=signature,
-            name="FunctionCall",
-            compute_shapes=False)
-        outputs = op.outputs
-        if not outputs:
-          return op
-        outputs = [outputs] if isinstance(
-            outputs, (ops.Tensor, type(None))) else list(outputs)
+        outputs = list(outputs)
     else:
       # TODO(akshayka): Either remove this if the FunctionLibraryRuntime
       # creates `PartitionedCallOp` kernels by default, or remove the previous
       # branch if a TPU kernel is registered for `PartitionedCall`.
-      if len(args) != len(self.signature.input_arg):
-        raise ValueError(
-            "Arguments and signature arguments do not match: %s %s " %
-            (len(args), len(list(self.signature.input_arg))))
-      function_call_options = ctx.get_function_call_options()
-      outputs = functional_ops.partitioned_call(
-          args=args,
-          f=self,
-          tout=self._output_types,
-          executing_eagerly=executing_eagerly,
-          config=function_call_options.config_proto_serialized,
-          executor_type=function_call_options.executor_type)
+      with _InterpolateFunctionError(self):
+        outputs = functional_ops.partitioned_call(
+            args=args,
+            f=self,
+            tout=self._output_types,
+            executing_eagerly=executing_eagerly,
+            config=config,
+            executor_type=executor_type)
 
     if executing_eagerly:
       return outputs
@@ -279,15 +353,15 @@ class _EagerDefinedFunction(object):
       return outputs
 
 
-class Function(object):
+class ConcreteFunction(object):
   """Callable object encapsulating a function definition and its gradient.
 
-  `Function` is a callable that encapsulates a function definition and
+  `ConcreteFunction` is a callable that encapsulates a function definition and
   is differentiable under `tf.GradientTape` objects.
   """
 
   def __init__(self, func_graph, attrs=None, signature=None):
-    """Initialize a Function.
+    """Initialize a `ConcreteFunction`.
 
     Args:
       func_graph: An instance of FuncGraph: the function body to wrap.
@@ -296,6 +370,7 @@ class Function(object):
         definition.
      signature: a nested sequence of `TensorSpec` objects specifying the input
        signature of this function.
+
     Raises:
       ValueError: If number of input_placeholders is not equal to the number
         of function inputs.
@@ -323,8 +398,8 @@ class Function(object):
       *args: Tensors or Variables. Positional arguments are only accepted when
         they correspond one-to-one with arguments of the traced Python function.
       **kwargs: Tensors or Variables specified by name. When
-        `get_concrete_function` was called to create this `Function`, each
-        Tensor input was given a name, defaulting to the name of the Python
+        `get_concrete_function` was called to create this `ConcreteFunction`,
+        each Tensor input was given a name, defaulting to the name of the Python
         function's argument but possibly overridden by the `name=` argument to
         `tf.TensorSpec`. These names become the argument names for the concrete
         function.
@@ -333,7 +408,7 @@ class Function(object):
       The result of applying the TF function on the given Tensors.
 
     Raises:
-      AssertionError: If this `Function` was not created through
+      AssertionError: If this `ConcreteFunction` was not created through
         `get_concrete_function`.
       ValueError: If arguments contains anything other than Tensors or
         Variables.
@@ -383,8 +458,8 @@ class Function(object):
     """
     return self._call_flat(
         (t for t in nest.flatten((args, kwargs))
-         if isinstance(
-             t, (ops.Tensor, resource_variable_ops.ResourceVariable))))
+         if isinstance(t, (ops.Tensor,
+                           resource_variable_ops.ResourceVariable))))
 
   def _call_flat(self, args):
     """Executes the wrapped function.
@@ -400,9 +475,7 @@ class Function(object):
     """
     ctx = context.context()
 
-    for v in self._func_graph.variables:
-      if v.trainable:
-        tape.variable_accessed(v)
+    tape.variables_accessed(self._func_graph.variables)
 
     tensor_inputs = []
     for i, arg in enumerate(args):
@@ -417,7 +490,7 @@ class Function(object):
         tensor_inputs.append(
             ops.convert_to_tensor(arg, self._signature[i].dtype))
       else:
-        raise ValueError("All inputs to `Function`s must be Tensors; "
+        raise ValueError("All inputs to `ConcreteFunction`s must be Tensors; "
                          "on invocation of %s, the %d-th input (%s) was not a "
                          "Tensor." % (self._func_graph.name, i, str(arg)))
     args = tensor_inputs + self._captured_inputs
@@ -433,26 +506,25 @@ class Function(object):
     if context.executing_eagerly() or not self.outputs:
       outputs = self._inference_function.call(ctx, args)
     else:
-      if not self._gradient_name:
-        self._gradient_name = "PartitionedCall-%s" % ops.uid()
-        self._register_gradient(self._gradient_name)
+      self._register_gradient()
       with ops.get_default_graph().gradient_override_map(
           {"PartitionedCall": self._gradient_name,
            "StatefulPartitionedCall": self._gradient_name}):
         outputs = self._inference_function.call(ctx, args)
     return self._build_call_outputs(outputs)
 
-  def _register_gradient(self, name):
-    """Registers the gradient for the current Function under the given name.
+  def _register_gradient(self):
+    """Registers the gradient for this `ConcreteFunction`.
 
     The gradient rewrites an inference call op to a forward call op, but does
     not modify a pre-existing forward call op. It then computes the gradient
     from the output's gradients and the side outputs of the forward op.
-
-    Args:
-      name: The name to register the gradient as.
     """
-    @ops.RegisterGradient(name)
+    if self._gradient_name:
+      return
+    self._gradient_name = "PartitionedCall-%s" % ops.uid()
+
+    @ops.RegisterGradient(self._gradient_name)
     def _registered_grad_fn(op, *doutputs):  # pylint: disable=unused-variable
       return self._grad_fn(op, *doutputs)
 
@@ -484,7 +556,7 @@ class Function(object):
 
   @property
   def name(self):
-    """Function name."""
+    """`ConcreteFunction` name."""
     return self._inference_function.name
 
   @property
@@ -497,11 +569,21 @@ class Function(object):
     """Returns tensors in `self.graph` corresponding to arguments."""
     return self._func_graph.inputs
 
+  @property
+  def structured_input_signature(self):
+    """Returns structured signature of the original function."""
+    return self._func_graph.structured_input_signature
+
   @property
   def outputs(self):
-    """Returns tensors in `self.graph` corresponding to return values."""
+    """Returns tensors in `self.graph` corresponding to returned tensors."""
     return self._func_graph.outputs
 
+  @property
+  def structured_outputs(self):
+    """Returns outputs in `self.graph` as returned by the original function."""
+    return self._func_graph.structured_outputs
+
   @property
   def captured_inputs(self):
     """Returns external Tensors captured by this function.
@@ -555,27 +637,26 @@ class Function(object):
     # method's functionality better. Remove register_gradient_functions argument
     # and figure out if these needs to be registered.
 
-    if not context.executing_eagerly() or g:
-      if not g:
-        g = ops.get_default_graph()
-      self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
-
-      # pylint: disable=protected-access
-      if register_gradient_functions:
-        # There are two situations for the actual call of a defun:
-        # 1. If none of the input args are resource variables or watch by any
-        #   tape, and it will run the _inference_function of concrete_func for
-        #   forward pass, the gradient will be generated by standard mechanism.
-        # 2. Otherwise, defun will create two functions, one for forward pass,
-        #   and the backward pass will be created via tape.
-        #   When registering the function, we register both cases.
-        if self._backward_graph_function is None:
-          self._construct_backprop_function()
-        forward_function = self._forward_function
-        backward_function = self._backward_graph_function._inference_function
-        # pylint: enable=protected-access
-        forward_function.add_to_graph(g)
-        backward_function.add_to_graph(g)
+    if not context.executing_eagerly() and not g:
+      g = ops.get_default_graph()
+    self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
+
+    # pylint: disable=protected-access
+    if register_gradient_functions:
+      # There are two situations for the actual call of a defun:
+      # 1. If none of the input args are resource variables or watch by any
+      #   tape, and it will run the _inference_function of concrete_func for
+      #   forward pass, the gradient will be generated by standard mechanism.
+      # 2. Otherwise, defun will create two functions, one for forward pass,
+      #   and the backward pass will be created via tape.
+      #   When registering the function, we register both cases.
+      if self._backward_graph_function is None:
+        self._construct_backprop_function()
+      forward_function = self._forward_function
+      backward_function = self._backward_graph_function._inference_function
+      # pylint: enable=protected-access
+      forward_function.add_to_graph(g)
+      backward_function.add_to_graph(g)
 
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
@@ -608,10 +689,11 @@ class Function(object):
     # Clear captures, since we pass them in as inputs.
     backwards_graph.captures = {}
     backwards_graph.outputs.extend(
-        grad for grad in func_graph_module.flatten(gradients_wrt_inputs)
+        grad
+        for grad in nest.flatten(gradients_wrt_inputs, expand_composites=True)
         if grad is not None)
     backwards_graph.structured_outputs = gradients_wrt_inputs
-    self._backward_graph_function = Function(
+    self._backward_graph_function = ConcreteFunction(
         backwards_graph, attrs=backward_function_attr)
 
     forward_function_attr = _parse_func_attrs({
@@ -642,9 +724,7 @@ class Function(object):
 
     ctx = context.context()
 
-    if not self._gradient_name:
-      self._gradient_name = "PartitionedCall-%s" % ops.uid()
-      self._register_gradient(self._gradient_name)
+    self._register_gradient()
     with ops.get_default_graph().gradient_override_map(
         {"PartitionedCall": self._gradient_name,
          "StatefulPartitionedCall": self._gradient_name}):
@@ -691,9 +771,7 @@ class Function(object):
     """
     ctx = context.context()
 
-    if not self._gradient_name:
-      self._gradient_name = "PartitionedCall-%s" % ops.uid()
-      self._register_gradient(self._gradient_name)
+    self._register_gradient()
     with ops.get_default_graph().gradient_override_map(
         {"PartitionedCall": self._gradient_name,
          "StatefulPartitionedCall": self._gradient_name}):
@@ -748,24 +826,6 @@ class Function(object):
     return ret
 
 
-class UnknownArgument(object):
-  """Signifies an argument which is not currently handled."""
-  pass
-
-
-def _encode_arg_for_serialization(arg):
-  """A representation for this argument, for serializing signatures."""
-  if isinstance(arg, ops.Tensor):
-    return tensor_spec.TensorSpec(arg.shape, arg.dtype)
-  if isinstance(arg, int):
-    return arg
-  if isinstance(arg, float):
-    return arg
-  if isinstance(arg, bool):
-    return arg
-  return UnknownArgument()
-
-
 pywrap_tensorflow.RegisterType("Tensor", ops.Tensor)
 pywrap_tensorflow.RegisterType("IndexedSlices", ops.IndexedSlices)
 
@@ -777,14 +837,6 @@ def _deterministic_dict_values(dictionary):
 class FunctionSpec(object):
   """Specification of how to bind arguments to a function."""
 
-  def as_tuple(self):
-    return (self._fullargspec, self._is_method, self._args_to_prepend,
-            self._kwargs_to_include, self.input_signature)
-
-  @staticmethod
-  def from_tuple(spec_tuple):
-    return FunctionSpec(*spec_tuple)
-
   @staticmethod
   def from_function_and_signature(python_function, input_signature):
     """Create a FunctionSpec instance given a python function and signature."""
@@ -831,7 +883,7 @@ class FunctionSpec(object):
     }
     self._default_values_start_index = offset
     if input_signature is None:
-      self.input_signature = None
+      self._input_signature = None
     else:
       if fullargspec.varkw is not None or fullargspec.kwonlyargs:
         raise ValueError("Cannot define a TensorFlow function from a Python "
@@ -842,8 +894,32 @@ class FunctionSpec(object):
         raise TypeError("input_signature must be either a tuple or a "
                         "list, received " + str(type(input_signature)))
 
-      self.input_signature = tuple(input_signature)
-      self.flat_input_signature = tuple(nest.flatten(input_signature))
+      self._input_signature = tuple(input_signature)
+      self._flat_input_signature = tuple(nest.flatten(input_signature))
+
+  @property
+  def fullargspec(self):
+    return self._fullargspec
+
+  @property
+  def is_method(self):
+    return self._is_method
+
+  @property
+  def args_to_prepend(self):
+    return self._args_to_prepend
+
+  @property
+  def kwargs_to_include(self):
+    return self._kwargs_to_include
+
+  @property
+  def input_signature(self):
+    return self._input_signature
+
+  @property
+  def flat_input_signature(self):
+    return self._flat_input_signature
 
   def canonicalize_function_inputs(self, *args, **kwargs):
     """Canonicalizes `args` and `kwargs`.
@@ -891,7 +967,7 @@ class FunctionSpec(object):
         if index is not None:
           arg_indices_to_values[index] = value
           consumed_args.append(arg)
-        elif self.input_signature is not None:
+        elif self._input_signature is not None:
           raise ValueError("Cannot define a TensorFlow function from a Python "
                            "function with keyword arguments when "
                            "input_signature is provided.")
@@ -914,15 +990,13 @@ class FunctionSpec(object):
     if need_packing:
       inputs = nest.pack_sequence_as(
           structure=inputs, flat_sequence=flat_inputs)
-    if self.input_signature is None:
+    if self._input_signature is None:
       return inputs, kwargs
     else:
       assert not kwargs
-      signature_relevant_inputs = inputs[:len(self.input_signature)]
-      try:
-        nest.assert_same_structure(self.input_signature,
-                                   signature_relevant_inputs)
-      except (ValueError, TypeError):
+      signature_relevant_inputs = inputs[:len(self._input_signature)]
+      if not is_same_structure(self._input_signature,
+                               signature_relevant_inputs):
         raise ValueError("Structure of Python function inputs does not match "
                          "input_signature.")
       signature_inputs_flat = nest.flatten(signature_relevant_inputs)
@@ -931,23 +1005,23 @@ class FunctionSpec(object):
         raise ValueError("When input_signature is provided, all inputs to "
                          "the Python function must be Tensors.")
       if any(not spec.is_compatible_with(other) for spec, other in zip(
-          self.flat_input_signature, signature_inputs_flat)):
+          self._flat_input_signature, signature_inputs_flat)):
         raise ValueError("Python inputs incompatible with input_signature: "
                          "inputs (%s), input_signature (%s)" %
-                         (str(inputs), str(self.input_signature)))
+                         (str(inputs), str(self._input_signature)))
       return inputs, {}
 
 
-class PolymorphicFunction(object):
+class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
   See the documentation for `defun` for more information on the semantics of
   defined functions.
 
-  PolymorphicFunction class is thread-compatible meaning that minimal
-  usage of defuns (defining and calling) is thread-safe, but if users call other
-  methods or invoke the base `python_function` themselves, external
-  synchronization is necessary.
+  `Function` class is thread-compatible meaning that minimal usage of defuns
+  (defining and calling) is thread-safe, but if users call other methods or
+  invoke the base `python_function` themselves, external synchronization is
+  necessary.
   """
 
   def __init__(self,
@@ -956,7 +1030,7 @@ class PolymorphicFunction(object):
                input_signature=None,
                attributes=None,
                autograph=True):
-    """Initializes a polymorphic function.
+    """Initializes a `Function`.
 
     Args:
       python_function: the function to be wrapped.
@@ -968,7 +1042,7 @@ class PolymorphicFunction(object):
         of the function.
       autograph: whether to use autograph to compile
         `python_function`. See https://www.tensorflow.org/guide/autograph for
-        more information.
+          more information.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
@@ -983,14 +1057,13 @@ class PolymorphicFunction(object):
     self._name = name
     self._autograph = autograph
     self._function_cache = collections.OrderedDict()
-    self._garbage_collector = _PolymorphicFunctionGarbageCollector(
-        self._function_cache)
+    self._garbage_collector = _FunctionGarbageCollector(self._function_cache)
     self._function_attributes = attributes or {}
 
     self._lock = threading.Lock()
     # _descriptor_cache is a of instance of a class to an instance-specific
-    # PolymorphicFunction, used to make sure defun-decorated methods create
-    # different functions for each instance.
+    # `Function`, used to make sure defun-decorated methods create different
+    # functions for each instance.
     self._descriptor_cache = weakref.WeakKeyDictionary()
 
   def __call__(self, *args, **kwargs):
@@ -1009,12 +1082,12 @@ class PolymorphicFunction(object):
 
   @property
   def _input_signature(self):
-    """Returns the wrapped Python function."""
+    """Returns the input signature."""
     return self._function_spec.input_signature  # pylint: disable=protected-access
 
   @property
   def _flat_input_signature(self):
-    """Returns the wrapped Python function."""
+    """Returns the flattened input signature."""
     return self._function_spec.flat_input_signature  # pylint: disable=protected-access
 
   def _get_concrete_function_internal_garbage_collected(self, *args, **kwargs):
@@ -1029,14 +1102,14 @@ class PolymorphicFunction(object):
     graph_function = self._get_concrete_function_internal_garbage_collected(
         *args, **kwargs)
     # We're returning this concrete function to someone, and they may keep a
-    # reference to the FuncGraph without keeping a reference to the Function
-    # object. So we won't clean up the reference cycles manually and instead
-    # will leave them to Python's garbage collector.
+    # reference to the FuncGraph without keeping a reference to the
+    # ConcreteFunction object. So we won't clean up the reference cycles
+    # manually and instead will leave them to Python's garbage collector.
     graph_function._garbage_collector.release()  # pylint: disable=protected-access
     return graph_function
 
   def get_concrete_function(self, *args, **kwargs):
-    """Returns a `Function` object specialized to inputs and execution context.
+    """Returns a `ConcreteFunction` specialized to inputs and execution context.
 
     Args:
       *args: inputs to specialize on.
@@ -1049,9 +1122,7 @@ class PolymorphicFunction(object):
                          "input_signature is provided.")
       if args:
         # If args are provided, they must match the input signature.
-        try:
-          nest.assert_same_structure(self._input_signature, args)
-        except (ValueError, TypeError):
+        if not is_same_structure(self._input_signature, args):
           raise ValueError("Structure of Python function inputs does not match "
                            "input_signature.")
         flat_inputs = nest.flatten(args)
@@ -1111,8 +1182,8 @@ class PolymorphicFunction(object):
   def __get__(self, instance, owner):
     """Makes it possible to defun instance methods."""
     del owner
-    # `instance` here is the instance that this `PolymorphicFunction` was
-    # accessed through; e.g., for
+    # `instance` here is the instance that this `Function` was accessed through
+    # e.g., for
     #
     #   class Foo(object):
     #
@@ -1121,26 +1192,25 @@ class PolymorphicFunction(object):
     #       ...
     #
     #   foo = Foo()
-    #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
+    #   foo.bar()  # `foo.bar` is a `Function` instance
     #
     # then `instance` will be `foo` (and `owner` will be `Foo`).  We create a
-    # new instance of PolymorphicFunction here to allow different instances each
+    # new instance of `Function` here to allow different instances each
     # to create variables once, thereby allowing methods to be decorated with
     # defun. Keeps a cache to avoid retracing the function every time the
     # descriptor is accessed.
     if instance not in self._descriptor_cache:
       if instance is None:
         return self
-      # If there is no instance-specific polymorphic func in the cache,
-      # we construct an instance-specific polymorphic function
-      # that uses a weak reference to the instance (so that the instance will
-      # be correctly gc'd).
+      # If there is no instance-specific `Function` in the cache, we construct
+      # an instance-specific `Function` that uses a weak reference to the
+      # instance (so that the instance will be correctly gc'd).
 
       # And finally add the wrapped function to the description cache
       self._descriptor_cache[instance] = class_method_to_instance_method(
           self, instance)
 
-    # Return the cached polymorphic function for the instance
+    # Return the cached `Function` for the instance
     return self._descriptor_cache[instance]
 
   def _cache_key(self, args, kwargs):
@@ -1198,8 +1268,8 @@ class PolymorphicFunction(object):
   def _maybe_define_function(self, args, kwargs):
     """Gets a function for these inputs, defining it if necessary.
 
-    `args` and `kwargs` can be None if this `PolymorphicFunction` was created
-    with an `input_signature`.
+    `args` and `kwargs` can be None if this `Function` was created with an
+    `input_signature`.
 
     Args:
       args: The varargs for the Python function.
@@ -1232,11 +1302,16 @@ class PolymorphicFunction(object):
           arglen = len(args)
         else:
           arglen = len(self._input_signature)
-        arg_names = (
-            self._function_spec.arg_names[:arglen]
-            + [self._function_spec.vararg_name] *
-            (arglen - len(self._function_spec.arg_names)))
-        graph_function = Function(
+        base_arg_names = self._function_spec.arg_names[:arglen]
+        num_missing_args = arglen - len(self._function_spec.arg_names)
+        missing_arg_names = [self._function_spec.vararg_name] * num_missing_args
+        # Produce a list of missing args of the form ["arg_0", "arg_1", ...],
+        # where arg is based on the self._function_spec.vararg_name.
+        missing_arg_names = [
+            "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
+        ]
+        arg_names = base_arg_names + missing_arg_names
+        graph_function = ConcreteFunction(
             func_graph_module.func_graph_from_py_func(
                 self._name,
                 self._python_function,
@@ -1244,22 +1319,13 @@ class PolymorphicFunction(object):
                 kwargs,
                 self._input_signature,
                 autograph=self._autograph,
-                arg_names=arg_names),
-            self._function_attributes)
-        if self._input_signature:
-          python_call_signature = self._input_signature
-        else:
-          python_call_signature = tuple(
-              _encode_arg_for_serialization(arg) for arg in args)
+                arg_names=arg_names), self._function_attributes)
         # pylint: disable=protected-access
-        # Save information about non-Tensor arguments with the concrete
-        # function. Used to serialize PolymorphicFunctions.
-        graph_function._python_call_signature = python_call_signature
-        # Tell the Function to clean up its graph once it goes out of
-        # scope. Function does not do this in its constructor since it gets used
-        # in some places (like Keras) where the FuncGraph lives longer than the
-        # Function.
-        graph_function._garbage_collector = _FunctionGarbageCollector(
+        # Tell the ConcreteFunction to clean up its graph once it goes out of
+        # scope. ConcreteFunction does not do this in its constructor since it
+        # gets used in some places (like Keras) where the FuncGraph lives
+        # longer than the ConcreteFunction.
+        graph_function._garbage_collector = _ConcreteFunctionGarbageCollector(
             graph_function.graph)
         # pylint: enable=protected-access
         self._function_cache[cache_key] = graph_function
@@ -1267,24 +1333,24 @@ class PolymorphicFunction(object):
 
 
 def register(func, *args, **kwargs):
-  """Register a specialization of a PolymorphicFunction into the graph.
+  """Register a specialization of a `Function` into the graph.
 
   This won't actually call the function with the inputs, and only put the
   function definition into graph. Register function with different input param
   will result into multiple version of functions registered in graph.
 
   Args:
-    func: the PolymorphicFunction instance that generated by a @defun
+    func: the `Function` instance that generated by a @defun
     *args: input arguments for the Python function.
     **kwargs: input keyword arguments for the Python function.
 
   Returns:
-    a `Function` object specialized to inputs and execution context.
+    a `ConcreteFunction` object specialized to inputs and execution context.
 
   Raises:
     ValueError: When the input function is not a defun wrapped python function.
   """
-  if not isinstance(func, PolymorphicFunction):
+  if not isinstance(func, Function):
     raise ValueError("Only defun function is allowed to be registered. "
                      "Got type: %s" % type(func))
   concrete_func = func.get_concrete_function(*args, **kwargs)
@@ -1650,7 +1716,7 @@ def defun_with_attributes(func=None,
       whitelisted attribute name is allowed. Unwhitelisted attribute name or
       unsupported value will result into ValueError. `func_name` is also one of
       the whitelisted argument which is a python string, and sets the name for
-      this `Function` in the graph.
+      this `ConcreteFunction` in the graph.
     autograph: same as defun()'s autograph.
 
   Returns:
@@ -1671,7 +1737,7 @@ def defun_with_attributes(func=None,
       name = "function"
     return tf_decorator.make_decorator(
         function,
-        PolymorphicFunction(
+        Function(
             function,
             name,
             input_signature=input_signature,
@@ -1703,7 +1769,7 @@ class _WeakrefSelf(object):
 
 
 def class_method_to_instance_method(original_function, instance):
-  """Constructs a new PolymorphicFunction with `self` bound."""
+  """Constructs a new `Function` with `self` bound."""
   weak_instance = weakref.ref(instance)
 
   # Note: while we could bind to a weakref proxy instead, that causes the
@@ -1711,8 +1777,8 @@ def class_method_to_instance_method(original_function, instance):
   bound_method = types_lib.MethodType(original_function.python_function,
                                       _WeakrefSelf(weak_instance))
 
-  # original_function is expected to be of one of the two PolymorphicFunction
-  # types (defined either in function.py or def_function.py).
+  # original_function is expected to be of one of the two `Function` types
+  # (defined either in function.py or def_function.py).
   assert hasattr(original_function, "_name")
   assert hasattr(original_function, "_autograph")
   assert hasattr(original_function, "_input_signature")
@@ -1754,7 +1820,7 @@ def class_method_to_instance_method(original_function, instance):
   return wrapped_instance_func
 
 
-class _PolymorphicFunctionGarbageCollector(object):
+class _FunctionGarbageCollector(object):
   """Cleans up cycles when a defun goes out of scope."""
 
   def __init__(self, cache):
@@ -1763,13 +1829,16 @@ class _PolymorphicFunctionGarbageCollector(object):
   def __del__(self):
     if func_graph_module is None or memory is None:
       return
-    while self._cache:
-      self._cache.popitem()
-    memory.dismantle_ordered_dict(self._cache)
+    try:
+      while self._cache:
+        self._cache.popitem()
+      memory.dismantle_ordered_dict(self._cache)
+    except:  # pylint: disable=bare-except
+      pass
 
 
-class _FunctionGarbageCollector(object):
-  """Cleans up reference cycles when a Function goes out of scope."""
+class _ConcreteFunctionGarbageCollector(object):
+  """Cleans up reference cycles when a `ConcreteFunction` goes out of scope."""
 
   def __init__(self, func_graph):
     self._func_graph = func_graph
@@ -1781,4 +1850,7 @@ class _FunctionGarbageCollector(object):
   def __del__(self):
     if func_graph_module is None or memory is None or self._func_graph is None:
       return
-    func_graph_module.dismantle_func_graph(self._func_graph)
+    try:
+      func_graph_module.dismantle_func_graph(self._func_graph)
+    except:  # pylint: disable=bare-except
+      pass
diff --git a/tensorflow/python/eager/function_argument_naming_test.py b/tensorflow/python/eager/function_argument_naming_test.py
index 9358c4fd07111f7adfbf60241727215f978b2a36..08a50a8f513425ff395b4b83de7a44183c12c757 100644
--- a/tensorflow/python/eager/function_argument_naming_test.py
+++ b/tensorflow/python/eager/function_argument_naming_test.py
@@ -220,10 +220,10 @@ class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
         z=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
         zz=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='cust'))
     self.assertEqual(
-        ['x', 'y', 'args', 'second_variadic', 'z', 'cust'],
+        ['x', 'y', 'args_1', 'second_variadic', 'z', 'cust'],
         [inp.op.name for inp in variadic_op.inputs])
     self.assertEqual(
-        [b'x', b'y', b'args', b'second_variadic', b'z', b'cust'],
+        [b'x', b'y', b'args_1', b'second_variadic', b'z', b'cust'],
         [inp.op.get_attr('_user_specified_name')
          for inp in variadic_op.inputs])
 
@@ -244,10 +244,10 @@ class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
     variadic_op = variadic_fn.get_concrete_function()
     self.assertIn(b'variadic_fn', variadic_op.name)
     self.assertEqual(
-        ['x', 'y', 'args', 'z'],
+        ['x', 'y', 'args_1', 'z'],
         [inp.op.name for inp in variadic_op.inputs])
     self.assertEqual(
-        [b'x', b'y', b'args', b'z'],
+        [b'x', b'y', b'args_1', b'z'],
         [inp.op.get_attr('_user_specified_name')
          for inp in variadic_op.inputs])
 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 55a9cc4e92336452260d0de1991e68ee67dd22e2..a2da088d639c7ad447095fe21903777ad44c0991 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 import functools
+import itertools
 from multiprocessing.pool import ThreadPool
 import sys
 import weakref
@@ -45,8 +46,10 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -965,8 +968,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual([[3.0]], self.evaluate(y))
 
     # Break the reference cycle between the MiniModel and the defun:
-    # MiniModel --(through its `call` method)--> PolymorphicFunction
-    # PolymorphicFunction --(instancemethod on MiniModel)--> MiniModel
+    # `MiniModel` --(through its `call` method)--> `Function`
+    # `Function` --(instancemethod on `MiniModel`)--> `MiniModel`
     del model.call
 
   # Note: The ConfigProto below unfortunately only configures graph
@@ -1534,7 +1537,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
           t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
           add(t, t)
 
-  def testRegisterPolymorphicFunction(self):
+  def testRegisterFunction(self):
+
     @function.defun
     def add(x, y):
       return math_ops.add(x, y)
@@ -1728,8 +1732,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       return x
 
     graph_function = foo.get_concrete_function(constant_op.constant(1.0))
-    with self.assertRaisesRegexp(ValueError, 'All inputs to `Function`s must '
-                                 'be Tensors;.*'):
+    with self.assertRaisesRegexp(
+        ValueError, 'All inputs to `ConcreteFunction`s must be Tensors;.*'):
       graph_function('Not a Tensor.')
 
   def testSwapImplementationWithGrapplerPlugin(self):
@@ -1817,7 +1821,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(instance_call_one, instance_call_two)
     self.assertAllEqual(instance_call_one, class_call)
 
-  def testDecoratedMethodUniquePolymorphicFuncPerInstance(self):
+  def testDecoratedMethodUniqueFunctionPerInstance(self):
     m = DefunnedMiniModel()
     n = DefunnedMiniModel()
 
@@ -2063,6 +2067,292 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     # function itself is not involved in a reference cycle.
     self.assertIs(None, weak_fn())
 
+  def testFunctionStackInErrorMessage(self):
+    if context.executing_eagerly():
+      # TODO(b/122736651): Remove this skipTest once fixed.
+      self.skipTest('Error interpolation is not working when function is '
+                    'invoked without PartitionedCallOp.')
+
+    @def_function.function()
+    def fn3(x):
+      return x + 2
+
+    @def_function.function()
+    def fn2(x):
+      check_ops.assert_equal(fn3(x), 3)
+      return 2
+
+    @def_function.function()
+    def fn(x):
+      return fn2(x)
+
+    try:
+      fn(2)
+      self.assertFail()
+    except errors.InvalidArgumentError as e:
+      self.assertIn('fn -> fn2', e.message)
+      self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
+      self.assertNotIn('fn3', e.message)
+
+
+class MultiDeviceTest(test.TestCase, parameterized.TestCase):
+
+  def testMultiDeviceOutput(self):
+    """Tests that functions can produce outputs on multiple devices."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    @function.defun
+    def func(a, b, transpose_a):
+      with ops.device('/device:CPU:0'):
+        m1 = math_ops.matmul(a, b, transpose_a=transpose_a)
+      with ops.device('/device:GPU:0'):
+        m2 = math_ops.matmul(a, b, transpose_a=transpose_a)
+      return m1, m2
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    m1, m2 = func(t, t, transpose_a=True)
+    self.assertAllEqual(m1.numpy(), [[10, 14], [14, 20]])
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertAllEqual(m2.numpy(), [[10, 14], [14, 20]])
+    self.assertRegexpMatches(m2.backing_device, 'GPU')
+
+  def testEmptyBody(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    @function.defun
+    def func(a, b):
+      return b, a
+
+    with ops.device('/device:CPU:0'):
+      a = constant_op.constant(3.0)
+    with ops.device('/device:GPU:0'):
+      b = constant_op.constant(5.0)
+
+    m1, m2 = func(a, b)
+    self.assertAllEqual(m1.numpy(), 5.0)
+    self.assertRegexpMatches(m1.backing_device, 'GPU')
+    self.assertAllEqual(m2.numpy(), 3.0)
+    self.assertRegexpMatches(m2.backing_device, 'CPU')
+
+  def testMultiDeviceInt32(self):
+    """Tests that multi-device functions can take and output INT32s.
+
+    When an INT32 device tensor is fed into a function, it is copied to CPU
+    by the eager runtime. The function sees all INT32 inputs on CPU.
+
+    We set allocator attribute 'on_host' for INT32 outputs. They can be
+    partitioned into the GPU component function, but will be allocated on
+    CPU nevertheless.
+
+    There is experimental support for `ints_on_device` in
+    FunctionLibraryRuntime now. We can try that.
+
+    """
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      int_cpu = constant_op.constant(3, dtype=dtypes.int32)
+      resource = resource_variable_ops.ResourceVariable(5, dtype=dtypes.int32)
+    with ops.device('/device:GPU:0'):
+      int_gpu = constant_op.constant(7, dtype=dtypes.int32)
+
+    @function.defun
+    def func(int_cpu, resource, int_gpu):
+      with ops.device('/device:CPU:0'):
+        m1 = int_cpu * resource + int_gpu
+      with ops.device('/device:GPU:0'):
+        # This computation will happen on GPU but m2 will be copied to CPU.
+        m2 = int_gpu * resource + int_cpu + 1
+      return m1, m2
+
+    m1, m2 = func(int_cpu, resource, int_gpu)
+    self.assertAllEqual(m1.numpy(), 22)
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertAllEqual(m2.numpy(), 39)
+    self.assertRegexpMatches(m2.backing_device, 'CPU')
+
+    # flip arguments
+    m1, m2 = func(int_gpu, resource, int_cpu)
+    self.assertAllEqual(m1.numpy(), 38)
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertAllEqual(m2.numpy(), 23)
+    self.assertRegexpMatches(m2.backing_device, 'CPU')
+
+  def testMultiDeviceColocateWith(self):
+    """Tests that function's outputs respect colocation constraints."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    @function.defun
+    def func(a, b):
+      with ops.colocate_with(a):
+        ra = 2 * a
+      with ops.colocate_with(b):
+        rb = 3 * b
+      return ra, rb
+
+    devices = ['/device:CPU:0', '/device:GPU:0']
+    for dev1, dev2 in itertools.product(devices, devices):
+      with ops.device(dev1):
+        a = constant_op.constant(1.0)
+      with ops.device(dev2):
+        b = constant_op.constant(10.0)
+
+      ra, rb = func(a, b)
+      self.assertEqual(ra.numpy(), 2.0)
+      self.assertRegexpMatches(ra.backing_device, dev1)
+      self.assertEqual(rb.numpy(), 30.0)
+      self.assertRegexpMatches(rb.backing_device, dev2)
+
+  def testMultiDeviceResources(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      c1 = resource_variable_ops.ResourceVariable(2.0)
+      c2 = resource_variable_ops.ResourceVariable(7.0)
+    with ops.device('/device:GPU:0'):
+      g1 = resource_variable_ops.ResourceVariable(3.0)
+      g2 = resource_variable_ops.ResourceVariable(5.0)
+
+    @function.defun
+    def func(resource1, resource2):
+      with ops.device('/device:CPU:0'):
+        result1 = resource1 * g2
+      with ops.device('/device:GPU:0'):
+        result2 = resource2 * c2
+      return result1, result2
+
+    r1, r2 = func(c1, g1)
+    self.assertEqual(r1.numpy(), 10.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 21.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+
+    # Call with flipped inputs. Check that we look at resource's
+    # device and reinstantiates the function when inputs' devices change.
+    r1, r2 = func(g1, c1)
+    self.assertEqual(r1.numpy(), 15.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 14.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+
+  def testOutputResources(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      c1 = resource_variable_ops.ResourceVariable(2.0)
+    with ops.device('/device:GPU:0'):
+      g1 = resource_variable_ops.ResourceVariable(3.0)
+
+    @function.defun
+    def func(resource1, resource2):
+      with ops.device('/device:CPU:0'):
+        result1 = resource1 * 5
+      with ops.device('/device:GPU:0'):
+        result2 = resource2 * 7
+      return result1, resource1.handle, result2, resource2.handle
+
+    r1, res1, r2, res2 = func(c1, g1)
+    self.assertEqual(r1.numpy(), 10.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 21.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+
+    def check_handle(handle, expected_value):
+      self.assertRegexpMatches(handle.backing_device, 'CPU')
+      tensor = gen_resource_variable_ops.read_variable_op(
+          handle, dtypes.float32)
+      self.assertEqual(tensor.numpy(), expected_value)
+
+    # Check that handles returned from functions are on CPU and an op using
+    # the resource handle is correctly placed on the device backing the
+    # resource.
+    check_handle(res1, 2.0)
+    check_handle(res2, 3.0)
+
+    # Call with flipped inputs to make sure the same the function is
+    # reinstantiated and eager runtime does not mess up the device assignment
+    # for ops consuming handles returned from defuns.
+    r1, res1, r2, res2 = func(g1, c1)
+    self.assertEqual(r1.numpy(), 15.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 14.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+    check_handle(res1, 3.0)
+    check_handle(res2, 2.0)
+
+  def testComplexInputOutputDevicePattern(self):
+    """Tests input/output mapping logic in partitioning."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      rc0 = resource_variable_ops.ResourceVariable(2.0)
+      rc1 = resource_variable_ops.ResourceVariable(3.0)
+      cc0 = constant_op.constant(5.0)
+      cc1 = constant_op.constant(7.0)
+    with ops.device('/device:GPU:0'):
+      rg0 = resource_variable_ops.ResourceVariable(11.0)
+      rg1 = resource_variable_ops.ResourceVariable(13.0)
+      cg0 = constant_op.constant(17.0)
+      cg1 = constant_op.constant(19.0)
+
+    # Make sure tensors are on expected devices.
+    for tensor in [cc0, cc1]:
+      self.assertRegexpMatches(tensor.backing_device, 'CPU:0')
+    for tensor in [cg0, cg1]:
+      self.assertRegexpMatches(tensor.backing_device, 'GPU:0')
+
+    @function.defun
+    def func(rc0, cc0, cg0, rc1, cg1, rg0, rg1, cc1):
+      with ops.device('/device:CPU:0'):
+        m1 = rc0 * cg0
+      with ops.device('/device:GPU:0'):
+        m2 = rg0 * cc0
+
+      with ops.device('/device:CPU:0'):
+        r1 = 1000.0 * m2 + rc1 * cg1
+      with ops.device('/device:GPU:0'):
+        r2 = 1000.0 * m1 + rg1 * cc1
+
+      return r1, r2, m2, m1
+
+    r1, r2, m2, m1 = func(rc0, cc0, cg0, rc1, cg1, rg0, rg1, cc1)
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertRegexpMatches(m2.backing_device, 'GPU')
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+    self.assertEqual(m1.numpy(), 34.0)
+    self.assertEqual(r1.numpy(), 55000.0 + 3.0 * 19.0)
+    self.assertEqual(m2.numpy(), 55.0)
+    self.assertEqual(r2.numpy(), 34000.0 + 13.0 * 7.0)
+
+  def testArgumentPrunning(self):
+    """Tests functions taking unnecessary arguments."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      c1 = constant_op.constant(5.0)
+      c2 = constant_op.constant(7.0)
+
+    with ops.device('/device:GPU:0'):
+      g1 = constant_op.constant(11.0)
+      g2 = constant_op.constant(13.0)
+      g3 = constant_op.constant(17.0)
+
+    @function.defun
+    def func(g1, g2, c1, g3, c2):  # pylint: disable=unused-argument
+      # arguments g1 and g2 are unused and can be pruned by grappler.
+      return c1 * g3 * c2
+
+    result = func(g1, g2, c1, g3, c2)
+    self.assertEqual(result.numpy(), 5.0 * 7.0 * 17.0)
 
 if __name__ == '__main__':
   ops.enable_eager_execution(
diff --git a/tensorflow/python/eager/memory_test.py b/tensorflow/python/eager/memory_test.py
index a1a59d511fdd4b831ea853b1f1cb3212322a3b84..9d29180379bd5bc48472f5c8638f01f667763111 100644
--- a/tensorflow/python/eager/memory_test.py
+++ b/tensorflow/python/eager/memory_test.py
@@ -24,6 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
 import six
 
 from tensorflow.python import keras
@@ -32,6 +33,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.variables import Variable
 
 # memory_profiler might not be available in the OSS version of TensorFlow.
 try:
@@ -63,6 +65,11 @@ class MemoryTest(test.TestCase):
       # Warm up.
       f()
 
+      # Wait for background threads to start up and take over memory.
+      # FIXME: The nature of this test leaves few other options. Maybe there
+      # is a better way to do this.
+      time.sleep(4)
+
       initial = memory_profiler.memory_usage(-1)[0]
 
       for _ in six.moves.range(num_iters):
@@ -75,6 +82,16 @@ class MemoryTest(test.TestCase):
           "Maximum allowed increase: %f") % (initial, increase,
                                              increase_threshold_absolute_mb)
 
+  def testMemoryLeakAnonymousVariable(self):
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    def f():
+      inputs = Variable(array_ops.zeros([32, 100], dtypes.float32))
+      del inputs
+
+    self.assertNotIncreasingMemory(f, num_iters=10000)
+
   def testMemoryLeakInSimpleModelForwardOnly(self):
     if memory_profiler is None:
       self.skipTest("memory_profiler required to run this test")
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index 17a090d5262f790c92dfa1a92d47f9b5ac6c07d9..ab4bdaa601d94bee077dd9567fef0415164eb821 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import threading
+import weakref
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -397,6 +399,32 @@ class OpsTest(test_util.TensorFlowTestCase):
     t1.start()
     t1.join()
 
+  def testWeakrefEagerTensor(self):
+    x = constant_op.constant([[1.]])
+    x.at1 = constant_op.constant([[2.]])
+    x.at2 = 3.
+    weak_x = weakref.ref(x)
+    weak_xat1 = weakref.ref(x.at1)
+    del x
+    self.assertIs(weak_x(), None)
+    self.assertIs(weak_xat1(), None)
+
+  def testWeakKeyDictionaryTensor(self):
+    weak_key_dict = weakref.WeakKeyDictionary()
+    strong_x = constant_op.constant([[1.]])
+    strong_y = constant_op.constant([[2.]])
+    weak_key_dict[strong_x] = constant_op.constant([[3.]])
+    weak_key_dict[strong_y] = constant_op.constant([[4.]])
+    strong_y.a = constant_op.constant([[5.]])
+    weak_x = weakref.ref(strong_x)
+    del strong_x
+    self.assertIs(weak_x(), None)
+    self.assertEqual([strong_y], list(weak_key_dict))
+    self.assertEqual(1, len(list(weak_key_dict)))
+    self.assertEqual(1, len(weak_key_dict))
+    del strong_y
+    self.assertEqual([], list(weak_key_dict))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/profiler.py b/tensorflow/python/eager/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a88312264357546496c07b5824aede0178e880ae
--- /dev/null
+++ b/tensorflow/python/eager/profiler.py
@@ -0,0 +1,117 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler for eager mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.platform import gfile
+
+LOGDIR_PLUGIN = 'plugins/profile'
+
+_profiler = None
+_profiler_lock = threading.Lock()
+_run_num = 0
+
+
+def start():
+  """Start profiling.
+
+  Only one active profiling session is allowed.
+
+  Raises:
+    AssertionError: If another profiling session is running.
+  """
+  global _profiler
+  if _profiler is not None:
+    raise AssertionError('Another profiler is running.')
+  with _profiler_lock:
+    _profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle)  # pylint: disable=protected-access
+
+
+def stop():
+  """Stop current profiling session and return its result.
+
+  Returns:
+    A binary string of tensorflow.tpu.Trace. User can write the string
+    to file for offline analysis by tensorboard.
+
+  Raises:
+    AssertionError: If there is no active profiling session.
+  """
+  global _profiler
+  global _run_num
+  if _profiler is None:
+    raise AssertionError('Cannot stop profiling. No profiler is running.')
+  with c_api_util.tf_buffer() as buffer_:
+    pywrap_tensorflow.TFE_ProfilerSerializeToString(
+        context.context()._handle,  # pylint: disable=protected-access
+        _profiler,
+        buffer_)
+    result = pywrap_tensorflow.TF_GetBuffer(buffer_)
+  with _profiler_lock:
+    pywrap_tensorflow.TFE_DeleteProfiler(_profiler)
+    _profiler = None
+    _run_num += 1
+  return result
+
+
+def start_profiler_server(port):
+  """Start a profiler grpc server that listens to given port.
+
+  The profiler server will keep the program running even the training finishes.
+  Please shutdown the server with CTRL-C. The service defined in
+  tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
+  tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
+  file following https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace
+
+  Args:
+    port: port profiler server listens to.
+  """
+  pywrap_tensorflow.TFE_StartProfilerServer(
+      context.context()._handle,  # pylint: disable=protected-access
+      port)
+
+
+class Profiler(object):
+  """Context-manager eager profiler api.
+
+  Example usage:
+  ```python
+  with Profiler("/path/to/logdir"):
+    # do some work
+  ```
+  """
+
+  def __init__(self, logdir):
+    self._logdir = logdir
+
+  def __enter__(self):
+    start()
+
+  def __exit__(self, typ, value, tb):
+    result = stop()
+    plugin_dir = os.path.join(self._logdir, LOGDIR_PLUGIN,
+                              'run{}'.format(_run_num))
+    gfile.MakeDirs(plugin_dir)
+    with gfile.Open(os.path.join(plugin_dir, 'local.trace'), 'wb') as f:
+      f.write(result)
diff --git a/tensorflow/python/eager/profiler_test.py b/tensorflow/python/eager/profiler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6940f56a98190904f63bcc7ae9a7e641da69c872
--- /dev/null
+++ b/tensorflow/python/eager/profiler_test.py
@@ -0,0 +1,49 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for eager profiler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.profiler import trace_events_pb2
+from tensorflow.python.eager import profiler
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+
+
+class ProfilerTest(test_util.TensorFlowTestCase):
+
+  def test_profile(self):
+    profiler.start()
+    three = constant_op.constant(3)
+    five = constant_op.constant(5)
+    product = three * five
+    self.assertAllEqual(15, product)
+    with self.assertRaises(AssertionError):
+      profiler.start()
+
+    profile_result = profiler.stop()
+    profile_pb = trace_events_pb2.Trace()
+    profile_pb.ParseFromString(profile_result)
+    profile_pb_str = '%s' % profile_pb
+    self.assertTrue('Mul' in profile_pb_str)
+    with self.assertRaises(AssertionError):
+      profiler.stop()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 30a93fb0e421e0b26f517a03302d2e96913d8b9a..35040c5d5652c856f6b72062bb5d4d009c48aa7f 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -501,9 +501,7 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
 void EagerTensor_dealloc(EagerTensor* self) {
   // Clear weak references to self.
   // Needs to happen before any actual destruction.
-  if (self->weakreflist != nullptr) {
-    PyObject_ClearWeakRefs((PyObject*)self);
-  }
+  PyObject_ClearWeakRefs((PyObject*)self);
 
   TF_DeleteStatus(self->status);
   Py_DECREF(self->handle_data);
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 9ce500bc08e478815f2dbe1d5d5353eefa4f17a8..da1bb24fc739d9919329934fee04569ad53d9386 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -835,15 +835,15 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status,
 }
 
 const char* TFE_GetPythonString(PyObject* o) {
+#if PY_MAJOR_VERSION >= 3
   if (PyBytes_Check(o)) {
     return PyBytes_AsString(o);
-  }
-#if PY_MAJOR_VERSION >= 3
-  if (PyUnicode_Check(o)) {
+  } else {
     return PyUnicode_AsUTF8(o);
   }
+#else
+  return PyBytes_AsString(o);
 #endif
-  return nullptr;
 }
 
 int64_t get_uid() {
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 669fa084888a52da1601984fa11791f84add6170..445ffbc532d23bfe5fdd0aa5c31e941ee6eca527 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -22,6 +22,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -254,6 +255,21 @@ class Tests(test.TestCase):
         "Value for attr 'num_split' of -1 must be at least minimum 1"):
       array_ops.split(value=[1, 2, 3], num_or_size_splits=-1)
 
+    with self.assertRaisesRegexp(
+        Exception,
+        "Value for attr 'num_split' of 0 must be at least minimum 1"):
+      array_ops.split(value=[1, 2, 3], num_or_size_splits=0)
+
+  def testIsFunction(self):
+    ctx = context.context()
+    self.assertFalse(ctx.has_function("not_a_function"))
+
+    @def_function.function
+    def f():
+      return 1.
+
+    self.assertTrue(ctx.has_function(f.get_concrete_function().name))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index e501b403a39144a673e8ac5155edf0498425bcd6..e5d6007b4892a739ed12e072738208880736ff23 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -27,8 +27,8 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 # distribution_strategy_context.
 # TODO(b/117329403): Remove this circular dependency.
 distribution_strategy_context = LazyLoader(
-    "distribute_lib", globals(),
-    "tensorflow.python.training."
+    "distribution_strategy_context", globals(),
+    "tensorflow.python.distribute."
     "distribution_strategy_context")
 
 
@@ -61,8 +61,9 @@ def watch(tape, tensor):
 
 def watch_variable(tape, variable):
   """Marks this variable to be watched by the given tape."""
-  strategy = distribution_strategy_context.get_distribution_strategy()
-  if distribution_strategy_context.get_replica_context():
+  strategy, context = (
+      distribution_strategy_context.get_strategy_and_replica_context())
+  if context:
     variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
@@ -76,8 +77,9 @@ def variable_accessed(variable):
   Args:
     variable: variable to be watched.
   """
-  strategy = distribution_strategy_context.get_distribution_strategy()
-  if distribution_strategy_context.get_replica_context():
+  strategy, context = (
+      distribution_strategy_context.get_strategy_and_replica_context())
+  if context:
     variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
@@ -85,6 +87,29 @@ def variable_accessed(variable):
     pywrap_tensorflow.TFE_Py_TapeVariableAccessed(var)
 
 
+def variables_accessed(variables):
+  """Notifies all tapes in the stack that variables have been accessed.
+
+  Only trainable variables are marked as accessed.
+
+  Args:
+    variables: iterable of variables to mark as accessed.
+  """
+  strategy, context = (
+      distribution_strategy_context.get_strategy_and_replica_context())
+  accessed = []
+  if context:
+    accessed = [strategy.extended.value_container(variable)
+                for variable in variables if variable.trainable]
+  else:
+    for variable in variables:
+      if variable.trainable:
+        accessed.extend(strategy.unwrap(variable))
+
+  for var in accessed:
+    pywrap_tensorflow.TFE_Py_TapeVariableAccessed(var)
+
+
 def pop_tape(tape):
   """Pops the top tape in the stack, if any."""
   pywrap_tensorflow.TFE_Py_TapeSetRemove(tape._tape)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 0930b6116d5bef9bc91d999ebbd4462e021fbbe3..f8fbda861cad59daa409aa91f2bf7d0a16d9003e 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -38,8 +38,20 @@ class VariableHolder(object):
     self._variables = []
 
   def variable_creator_scope(self, next_creator, **kwargs):
+    """Creates variables & adds them to collections to match legacy code."""
     v = next_creator(**kwargs)
     self._variables.append(v)
+
+    collections = kwargs.get("collections")
+    trainable = v.trainable
+
+    if collections is None:
+      collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+    if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
+      collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
+
+    ops.add_to_collections(collections, v)
+
     return v
 
   def __call__(self, *args, **kwargs):
@@ -48,7 +60,7 @@ class VariableHolder(object):
 
 
 # TODO(allenl): make this checkpointable
-class WrappedFunction(function.Function):
+class WrappedFunction(function.ConcreteFunction):
   """Wraps a tf V1 piece of code in a function."""
 
   def __init__(self, fn_graph, variable_holder, attrs=None, signature=None):
@@ -61,6 +73,13 @@ class WrappedFunction(function.Function):
     for f in flat_feeds:
       if not isinstance(f, ops.Tensor):
         raise ValueError("Feeds must be tensors.")
+
+    # Ignoring all feeds that are captures allows prune to be called
+    # using wrapped_func.inputs even when it uses variables
+    internal_captures = self.graph.internal_captures
+    flat_feeds = [f for f in flat_feeds
+                  if f not in internal_captures]
+
     tensor_fetches = []
     operation_fetches = []
     for f in flat_fetches:
@@ -87,7 +106,7 @@ class WrappedFunction(function.Function):
           sink_tensor = control_flow_ops.no_op()
     lift_map = lift_to_graph.lift_to_graph(
         sink_tensor, pruned_graph,
-        sources=flat_feeds + self.graph.internal_captures)
+        sources=flat_feeds + internal_captures)
     for original_fetch, identity_fetch in zip(
         tensor_fetches, identity_fetches):
       lift_map[original_fetch] = lift_map[identity_fetch]
@@ -98,6 +117,8 @@ class WrappedFunction(function.Function):
     pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
     pruned_graph.inputs.extend(pruned_graph.captures.values())
 
+    pruned_graph.variables = self.graph.variables
+
     def _structured_output_mapping(fetched):
       lifted = lift_map[fetched]
       if isinstance(lifted, ops.Operation):
@@ -171,11 +192,15 @@ def wrap_function(fn, signature, name=None):
     the wrapped graph function.
   """
   holder = VariableHolder(fn)
+  func_graph_name = "wrapped_function"
+  if name is not None:
+    func_graph_name = "wrapped_function_" + name
   return WrappedFunction(
       func_graph.func_graph_from_py_func(
-          name,
+          func_graph_name,
           holder,
           args=None, kwargs=None, signature=signature,
-          add_control_dependencies=False),
+          add_control_dependencies=False,
+          collections={}),
       variable_holder=holder,
       signature=signature)
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index 65dd73aafca8cb0f6930c334a62083c4d5cd6677..bb2b28861f8268597f20df223a621efb02d3e22d 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -90,11 +91,93 @@ class WrapFunctionTest(test.TestCase):
 
     f_wrapped = wrap_function.wrap_function(f, [])
     self.assertAllEqual(6.0, f_wrapped())
+
+    # Test pruning directly on the inputs
+    pruned = f_wrapped.prune(
+        feeds=f_wrapped.inputs,
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+    self.assertAllEqual(6.0, pruned())
+
+    # Test pruning with no inputs
     pruned = f_wrapped.prune(
         feeds=(),
         fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
     self.assertAllEqual(6.0, pruned())
 
+  def testCollectionsIsolation(self):
+
+    v1 = variables.Variable(2.)
+    v2_holder = []
+    def f():
+      v2 = variables.Variable(3.)
+      v2_holder.append(v2)
+      ops.add_to_collection(ops.GraphKeys.LOSSES, v2 * constant_op.constant(3.))
+      return array_ops.identity(v1 * v2 * constant_op.constant(1.), 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    self.assertAllEqual(6.0, f_wrapped())
+    self.assertEqual(
+        len(f_wrapped.graph.get_collection(ops.GraphKeys.LOSSES)), 1)
+    f_var_collection = f_wrapped.graph.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertEqual(len(f_var_collection), 1)
+    self.assertIs(f_var_collection[0], v2_holder[0])
+
+    v3_holder = []
+    def g():
+      v3 = variables.Variable(4.)
+      v3_holder.append(v3)
+      ops.add_to_collection(ops.GraphKeys.LOSSES, v3 * constant_op.constant(3.))
+      return array_ops.identity(v1 * v3 * constant_op.constant(1.), 'fetch')
+
+    g_wrapped = wrap_function.wrap_function(g, [])
+    self.assertAllEqual(8.0, g_wrapped())
+    self.assertEqual(
+        len(g_wrapped.graph.get_collection(ops.GraphKeys.LOSSES)), 1)
+    g_var_collection = g_wrapped.graph.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertEqual(len(g_var_collection), 1)
+    self.assertIs(g_var_collection[0], v3_holder[0])
+
+    # Both have only one value, and their values aren't equal. So no sharing.
+    self.assertNotEqual(g_wrapped.graph.get_collection(ops.GraphKeys.LOSSES),
+                        f_wrapped.graph.get_collection(ops.GraphKeys.LOSSES))
+
+  def testGradientsOfPrune(self):
+
+    v1 = variables.Variable(2.)
+    v2_holder = []
+
+    def f(z):
+      v2 = variables.Variable(3.)
+      v2_holder.append(v2)
+      return array_ops.identity(v1 * v2 * z, 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtype=dtypes.float32)])
+
+    x = constant_op.constant(1.)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      out = f_wrapped(x)
+    grads = tape.gradient(out, [x, v1, v2_holder[0]])
+
+    self.assertAllEqual(6.0, out)
+    self.assertAllEqual([6.0, 3.0, 2.0], grads)
+
+    pruned = f_wrapped.prune(
+        feeds=f_wrapped.inputs,
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+
+    x = constant_op.constant(1.)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      out = pruned(x)
+    grads = tape.gradient(out, [x, v1, v2_holder[0]])
+
+    self.assertAllEqual(6.0, out)
+    self.assertAllEqual([6.0, 3.0, 2.0], grads)
+
   def testPruneOperations(self):
 
     v = variables.Variable(0)
@@ -134,6 +217,31 @@ class WrapFunctionTest(test.TestCase):
     self.assertEqual(1, does_not_increment(constant_op.constant(3)).numpy())
     self.assertEqual(3, v.numpy())
 
+  def testPruneStatefulOpsFromWrappedFunc(self):
+
+    v0 = variables.Variable(0)
+    v1 = variables.Variable(0)
+
+    # When we wrap a function, we expect it to be executed with 'tf.Graph`
+    # rules: it's allowed to prune all ops that are not in transitive fanin of
+    # the fetches.
+    def f(x):
+      v0.assign_add(1, name='increment_v0')
+      v1.assign_add(1, name='increment_v1')
+      return x
+
+    f_wrapped = wrap_function.wrap_function(f, [1])
+
+    self.assertEqual(1, f_wrapped().numpy())
+    self.assertEqual(0, v0.numpy())
+    self.assertEqual(0, v1.numpy())
+
+    f_wrapped_with_name = wrap_function.wrap_function(f, [2], name='func')
+
+    self.assertEqual(2, f_wrapped_with_name().numpy())
+    self.assertEqual(0, v0.numpy())
+    self.assertEqual(0, v1.numpy())
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index d24a7ae80c86d407ae3bb60ca55fff98be9f27a1..789887e53790ccfd5ad1c239009a9047c07d17b4 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -4,7 +4,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "feature_column_py",
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers_base",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
@@ -94,19 +95,13 @@ filegroup(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "feature_column_test",
     srcs = ["feature_column_test.py"],
-    data = [":vocabulary_testdata"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":feature_column",
         ":feature_column_py",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -125,24 +120,22 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
     ],
-)
-
-py_test(
-    name = "feature_column_v2_test",
-    srcs = ["feature_column_v2_test.py"],
     data = [":vocabulary_testdata"],
-    shard_count = 5,
-    srcs_version = "PY2AND3",
     tags = [
         "no_cuda_on_cpu_tap",
         "no_pip",
         "no_windows",
     ],
-    deps = [
+)
+
+tf_py_test(
+    name = "feature_column_v2_test",
+    srcs = ["feature_column_v2_test.py"],
+    additional_deps = [
         ":feature_column_py",
         ":feature_column_v2",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -162,6 +155,12 @@ py_test(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
-        "//third_party/py/numpy",
+    ],
+    data = [":vocabulary_testdata"],
+    shard_count = 5,
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+        "no_windows",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index a858d92608db1a0d9d00b34f91860b7d4be01d68..42a07cd9275927f69d4795ffd51404998560672e 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -2361,7 +2361,7 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
     del trainable
     input_tensor = inputs.get(self)
     return array_ops.one_hot(
-        indices=math_ops.to_int64(input_tensor),
+        indices=math_ops.cast(input_tensor, dtypes.int64),
         depth=len(self.boundaries) + 1,
         on_value=1.,
         off_value=0.)
@@ -2391,9 +2391,10 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
         array_ops.reshape(input_tensor, (-1,)) +
         (len(self.boundaries) + 1) * i2)
 
-    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
-    dense_shape = math_ops.to_int64(array_ops.stack(
-        [batch_size, source_dimension]))
+    indices = math_ops.cast(
+        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
+    dense_shape = math_ops.cast(
+        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         indices=indices,
         values=bucket_indices,
@@ -2829,7 +2830,7 @@ class _VocabularyFileCategoricalColumn(
     if input_tensor.dtype.is_integer:
       # `index_table_from_file` requires 64-bit integer keys.
       key_dtype = dtypes.int64
-      input_tensor = math_ops.to_int64(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_file(
         vocabulary_file=self.vocabulary_file,
@@ -2881,7 +2882,7 @@ class _VocabularyListCategoricalColumn(
     if input_tensor.dtype.is_integer:
       # `index_table_from_tensor` requires 64-bit integer keys.
       key_dtype = dtypes.int64
-      input_tensor = math_ops.to_int64(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_tensor(
         vocabulary_list=tuple(self.vocabulary_list),
@@ -2924,9 +2925,10 @@ class _IdentityCategoricalColumn(
           'Invalid input, not integer. key: {} dtype: {}'.format(
               self.key, input_tensor.dtype))
 
-    values = math_ops.to_int64(input_tensor.values, name='values')
-    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
-    zero = math_ops.to_int64(0, name='zero')
+    values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
+    num_buckets = math_ops.cast(
+        self.num_buckets, dtypes.int64, name='num_buckets')
+    zero = math_ops.cast(0, dtypes.int64, name='zero')
     if self.default_value is None:
       # Fail if values are out-of-range.
       assert_less = check_ops.assert_less(
@@ -2944,9 +2946,8 @@ class _IdentityCategoricalColumn(
               values < zero, values >= num_buckets, name='out_of_range'),
           array_ops.fill(
               dims=array_ops.shape(values),
-              value=math_ops.to_int64(self.default_value),
-              name='default_values'),
-          values)
+              value=math_ops.cast(self.default_value, dtypes.int64),
+              name='default_values'), values)
 
     return sparse_tensor_lib.SparseTensor(
         indices=input_tensor.indices,
@@ -3256,7 +3257,8 @@ def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
     # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
     # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
     # these will get grouped, and the final seq_length is [1, 1]
-    seq_length = math_ops.to_int64(math_ops.ceil(seq_length / num_elements))
+    seq_length = math_ops.cast(
+        math_ops.ceil(seq_length / num_elements), dtypes.int64)
 
     # If the last n rows do not have ids, seq_length will have shape
     # [batch_size - n]. Pad the remaining values with zeros.
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index daa0a3b3a4bb5fd067681c5ca91eaccdc64d3144..0ded2bf8c9fc9a7dcf1b100da3258b9e8f30a4b3 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1832,7 +1832,7 @@ class LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):
       fc.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -1847,7 +1847,7 @@ class LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -2467,7 +2467,7 @@ class _LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2482,7 +2482,7 @@ class _LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -2974,7 +2974,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2989,7 +2989,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index d4e3fc5d2d40f751de11a443dbcacf63cd95e94f..7c28fe2467b8f1c8fbd03c1c79d19ac8681aa230 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -304,8 +304,84 @@ class _StateManagerImpl(StateManager):
     raise ValueError('Variable does not exist.')
 
 
+class _BaseFeaturesLayer(Layer):
+  """Base class for DenseFeatures and SequenceFeatures.
+
+  Defines common methods and helpers.
+
+  Args:
+    feature_columns: An iterable containing the FeatureColumns to use as
+      inputs to your model.
+    expected_column_type: Expected class for provided feature columns.
+    trainable:  Boolean, whether the layer's variables will be updated via
+      gradient descent during training.
+    name: Name to give to the DenseFeatures.
+    **kwargs: Keyword arguments to construct a layer.
+
+  Raises:
+    ValueError: if an item in `feature_columns` doesn't match
+      `expected_column_type`.
+  """
+  def __init__(self, feature_columns, expected_column_type, trainable, name,
+               **kwargs):
+    super(_BaseFeaturesLayer, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._state_manager = _StateManagerImpl(self, self.trainable)
+    for column in self._feature_columns:
+      if not isinstance(column, expected_column_type):
+        raise ValueError(
+            'Items of feature_columns must be a {}. '
+            'You can wrap a categorical column with an '
+            'embedding_column or indicator_column. Given: {}'.format(
+                expected_column_type, column))
+
+  def build(self, _):
+    for column in self._feature_columns:
+      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          column.create_state(self._state_manager)
+    super(_BaseFeaturesLayer, self).build(None)
+
+  def _output_shape(self, input_shape, num_elements):
+    """Computes expected output shape of the layer or a column's dense tensor.
+
+    Args:
+      input_shape: Tensor or array with batch shape.
+      num_elements: Size of the last dimension of the output.
+
+    Returns:
+      Tuple with output shape.
+    """
+    raise NotImplementedError('Calling an abstract method.')
+
+  def compute_output_shape(self, input_shape):
+    total_elements = 0
+    for column in self._feature_columns:
+      total_elements += column.variable_shape.num_elements()
+    return self._target_shape(input_shape, total_elements)
+
+  def _process_dense_tensor(self, column, tensor):
+    """Reshapes the dense tensor output of a column based on expected shape.
+
+    Args:
+      column: A DenseColumn or SequenceDenseColumn object.
+      tensor: A dense tensor obtained from the same column.
+
+    Returns:
+      Reshaped dense tensor."""
+    num_elements = column.variable_shape.num_elements()
+    target_shape = self._target_shape(array_ops.shape(tensor), num_elements)
+    return array_ops.reshape(tensor, shape=target_shape)
+
+  def _verify_and_concat_tensors(self, output_tensors):
+    """Verifies and concatenates the dense output of several columns."""
+    _verify_static_batch_size_equality(output_tensors, self._feature_columns)
+    return array_ops.concat(output_tensors, -1)
+
+
 @keras_export('keras.layers.DenseFeatures', v1=[])
-class DenseFeatures(Layer):
+class DenseFeatures(_BaseFeaturesLayer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -345,8 +421,8 @@ class DenseFeatures(Layer):
         `bucketized_column`, `indicator_column`. If you have categorical
         features, you can wrap them with an `embedding_column` or
         `indicator_column`.
-      trainable: If `True` also add the variable to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      trainable:  Boolean, whether the layer's variables will be updated via
+        gradient descent during training.
       name: Name to give to the DenseFeatures.
       **kwargs: Keyword arguments to construct a layer.
 
@@ -354,28 +430,18 @@ class DenseFeatures(Layer):
       ValueError: if an item in `feature_columns` is not a `DenseColumn`.
     """
     super(DenseFeatures, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-
-    self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
-    self._state_manager = _StateManagerImpl(self, self.trainable)
-    for column in self._feature_columns:
-      if not isinstance(column, DenseColumn):
-        raise ValueError(
-            'Items of feature_columns must be a DenseColumn. '
-            'You can wrap a categorical column with an '
-            'embedding_column or indicator_column. Given: {}'.format(column))
+        feature_columns=feature_columns,
+        trainable=trainable,
+        name=name,
+        expected_column_type=DenseColumn,
+        **kwargs)
 
   @property
   def _is_feature_layer(self):
     return True
 
-  def build(self, _):
-    for column in self._feature_columns:
-      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
-        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
-          column.create_state(self._state_manager)
-      super(DenseFeatures, self).build(None)
+  def _target_shape(self, input_shape, total_elements):
+    return (input_shape[0], total_elements)
 
   def call(self, features, cols_to_output_tensors=None):
     """Returns a dense tensor corresponding to the `feature_columns`.
@@ -401,27 +467,15 @@ class DenseFeatures(Layer):
                        features)
     transformation_cache = FeatureTransformationCache(features)
     output_tensors = []
-    ordered_columns = []
     for column in self._feature_columns:
       with ops.name_scope(column.name):
-        ordered_columns.append(column)
         tensor = column.get_dense_tensor(transformation_cache,
                                          self._state_manager)
-        num_elements = column.variable_shape.num_elements()
-        batch_size = array_ops.shape(tensor)[0]
-        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-        output_tensors.append(tensor)
+        processed_tensors = self._process_dense_tensor(column, tensor)
         if cols_to_output_tensors is not None:
-          cols_to_output_tensors[column] = tensor
-
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
-
-  def compute_output_shape(self, input_shape):
-    total_elements = 0
-    for column in self._feature_columns:
-      total_elements += column.variable_shape.num_elements()
-    return (input_shape[0], total_elements)
+          cols_to_output_tensors[column] = processed_tensors
+        output_tensors.append(processed_tensors)
+    return self._verify_and_concat_tensors(output_tensors)
 
 
 class _LinearModelLayer(Layer):
@@ -438,7 +492,6 @@ class _LinearModelLayer(Layer):
         name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
     for column in self._feature_columns:
       if not isinstance(column, (DenseColumn, CategoricalColumn)):
         raise ValueError(
@@ -694,7 +747,7 @@ def _transform_features_v2(features, feature_columns, state_manager):
   with ops.name_scope(
       None, default_name='transform_features', values=features.values()):
     transformation_cache = FeatureTransformationCache(features)
-    for column in sorted(feature_columns, key=lambda x: x.name):
+    for column in feature_columns:
       with ops.name_scope(None, default_name=column.name):
         outputs[column] = transformation_cache.get(column, state_manager)
   return outputs
@@ -2660,7 +2713,7 @@ def _normalize_feature_columns(feature_columns):
                                                name_to_column[column.name]))
     name_to_column[column.name] = column
 
-  return feature_columns
+  return sorted(feature_columns, key=lambda x: x.name)
 
 
 class NumericColumn(
@@ -2780,12 +2833,8 @@ class NumericColumn(
     """See 'FeatureColumn` base class."""
     _check_config_keys(config, cls._fields)
     kwargs = config.copy()
-    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
-    if config['normalizer_fn']:
-      kwargs['normalizer_fn'] = utils.deserialize_keras_object(
-          config['normalizer_fn'], custom_objects=custom_objects)
-    else:
-      kwargs['normalizer_fn'] = None
+    kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+        config['normalizer_fn'], custom_objects=custom_objects)
     kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
     return cls(**kwargs)
 
@@ -3072,10 +3121,10 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Get sparse IDs and weights.
@@ -3092,10 +3141,10 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -3109,7 +3158,7 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -3133,8 +3182,8 @@ class EmbeddingColumn(
         (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
       raise ValueError(
           'In embedding_column: {}. '
-          'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'categorical_column must be of type SequenceCategoricalColumn '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -3168,12 +3217,8 @@ class EmbeddingColumn(
     kwargs = config.copy()
     kwargs['categorical_column'] = deserialize_feature_column(
         config['categorical_column'], custom_objects, columns_by_name)
-    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
-    if config['initializer']:
-      kwargs['initializer'] = utils.deserialize_keras_object(
-          config['initializer'], custom_objects=custom_objects)
-    else:
-      kwargs['initializer'] = None
+    kwargs['initializer'] = utils.deserialize_keras_object(
+        config['initializer'], custom_objects=custom_objects)
     return cls(**kwargs)
 
 
@@ -3183,7 +3228,7 @@ def _raise_shared_embedding_column_error():
                    '`DenseFeatures` or `LinearModel` instead.')
 
 
-class SharedEmbeddingColumnCreator(tracking.Checkpointable):
+class SharedEmbeddingColumnCreator(tracking.AutoCheckpointable):
 
   def __init__(self,
                dimension,
@@ -3306,10 +3351,10 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'SequenceFeatureLayer instead of FeatureLayer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     return self._get_dense_tensor_internal(transformation_cache, state_manager)
@@ -3323,7 +3368,7 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use SequenceFeatureLayer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4341,10 +4386,10 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Feature has been already transformed. Return the intermediate
@@ -4362,10 +4407,10 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Feature has been already transformed. Return the intermediate
@@ -4378,7 +4423,7 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4408,7 +4453,7 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4681,7 +4726,7 @@ def deserialize_feature_column(config,
           IdentityCategoricalColumn, IndicatorColumn, NumericColumn,
           SequenceCategoricalColumn, SequenceDenseColumn, SharedEmbeddingColumn,
           VocabularyFileCategoricalColumn, VocabularyListCategoricalColumn,
-          WeightedCategoricalColumn
+          WeightedCategoricalColumn, init_ops.TruncatedNormal
       ]
   }
   if columns_by_name is None:
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index a2474253697ad526c33c0099bf955b96000cf0f7..2b150790c1d565ef963be34b0bd004101b7a02a7 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -40,6 +40,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import partitioned_variables
@@ -2015,7 +2016,7 @@ class LinearModelTest(test.TestCase):
       }
       model(features)
       for var in model.variables:
-        self.assertTrue(isinstance(var, variables_lib.RefVariable))
+        self.assertIsInstance(var, variables_lib.VariableV1)
       variable_names = [var.name for var in model.variables]
       self.assertItemsEqual([
           'linear_model/dense_feature_bucketized/weights:0',
@@ -2052,7 +2053,7 @@ class LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       model = fc.LinearModel([price1, price2])
       model(features)
 
@@ -2068,7 +2069,7 @@ class LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         model = fc.LinearModel([price1, price2, price3])
         model(features)
 
@@ -2818,7 +2819,7 @@ class OldLinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       fc_old.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2833,7 +2834,7 @@ class OldLinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -3261,7 +3262,7 @@ class DenseFeaturesTest(test.TestCase):
       fc.DenseFeatures(feature_columns=[])(features={})
 
   def test_should_be_dense_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
+    with self.assertRaisesRegexp(ValueError, 'must be a .*DenseColumn'):
       fc.DenseFeatures(feature_columns=[
           fc.categorical_column_with_hash_bucket('wire_cast', 4)
       ])(
@@ -3422,7 +3423,7 @@ class DenseFeaturesTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
+      with self.assertRaisesRegexp(Exception, 'must be a .*DenseColumn'):
         fc.DenseFeatures([animal])(features)
 
   def test_static_batch_size_mismatch(self):
@@ -3435,7 +3436,7 @@ class DenseFeaturesTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.DenseFeatures([price1, price2])(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -3450,7 +3451,7 @@ class DenseFeaturesTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.DenseFeatures([price1, price2, price3])(features)
 
   def test_runtime_batch_size_mismatch(self):
@@ -4010,7 +4011,7 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
       self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
       self.assertIsInstance(cols_to_vars[some_embedding_column][0],
-                            variables_lib.Variable)
+                            variables_lib.VariableV1)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
   @test_util.run_deprecated_v1
@@ -4141,7 +4142,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -4156,7 +4157,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -6839,7 +6840,7 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
     for v in global_vars:
-      self.assertTrue(isinstance(v, variables_lib.RefVariable))
+      self.assertIsInstance(v, variables_lib.Variable)
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
@@ -7147,7 +7148,60 @@ class EmbeddingColumnTest(test.TestCase):
                           self.evaluate(predictions))
 
   @test_util.run_deprecated_v1
-  def test_serialization(self):
+  def test_serialization_with_default_initializer(self):
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+
+    self.assertEqual([categorical_column], embedding_column.parents)
+
+    config = embedding_column._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'class_name': 'IdentityCategoricalColumn',
+            'config': {
+                'number_buckets': 3,
+                'key': 'aaa',
+                'default_value': None
+            }
+        },
+        'ckpt_to_load_from': None,
+        'combiner': 'mean',
+        'dimension': 2,
+        'initializer': {
+            'class_name': 'TruncatedNormal',
+            'config': {
+                'dtype': 'float32',
+                'stddev': 0.7071067811865475,
+                'seed': None,
+                'mean': 0.0
+            }
+        },
+        'max_norm': None,
+        'tensor_name_in_ckpt': None,
+        'trainable': True
+    }, config)
+
+    custom_objects = {'TruncatedNormal': init_ops.TruncatedNormal}
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config, custom_objects=custom_objects)
+    self.assertEqual(embedding_column._get_config(),
+                     new_embedding_column._get_config())
+    self.assertIsNot(categorical_column,
+                     new_embedding_column.categorical_column)
+
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config,
+        custom_objects=custom_objects,
+        columns_by_name={categorical_column.name: categorical_column})
+    self.assertEqual(embedding_column._get_config(),
+                     new_embedding_column._get_config())
+    self.assertIs(categorical_column, new_embedding_column.categorical_column)
+
+  @test_util.run_deprecated_v1
+  def test_serialization_with_custom_initializer(self):
 
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
@@ -7732,7 +7786,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
         tuple([v.name for v in global_vars]))
     for v in global_vars:
-      self.assertTrue(isinstance(v, variables_lib.RefVariable))
+      self.assertIsInstance(v, variables_lib.Variable)
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     if trainable:
       self.assertItemsEqual(
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index a72ded11314d4b491292aed73364be7d875baa86..da76a84e55e5f299bb324eeb1b3e6050fb46eb54 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -29,13 +29,23 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
+# Op types that should not run in program order, e.g. because they need to run
+# asynchronously to avoid deadlock.
+ASYNC_STATEFUL_OPS = [
+    "CollectiveReduce",
+    "CollectiveBcastSend",
+    "CollectiveBcastRecv",
+    "NcclAllReduce",
+]
+
 
 class AutomaticControlDependencies(object):
   """Context manager to automatically add control dependencies.
 
   Code under this context manager will act as if a sensible set of control
   dependencies were present. More specifically:
-    1. All stateful ops in the scope will execute
+    1. All stateful ops in the scope will execute (with the exception of ops in
+       ASYNC_STATEFUL_OPS)
     2. Stateful ops which modify the same resource will execute in program order
 
   Note: creating variables in an automatic control dependencies context is not
@@ -223,7 +233,8 @@ class AutomaticControlDependencies(object):
       control_inputs = set()
       # Ensure stateful ops run
       if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
-          or self._graph._registered_ops[op.type].is_stateful):  # pylint: disable=protected-access
+          or (self._graph._registered_ops[op.type].is_stateful   # pylint: disable=protected-access
+              and op.type not in ASYNC_STATEFUL_OPS)):
         ops_which_must_run.add(op)
       # Ignore switches (they're handled separately)
       if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
@@ -255,8 +266,8 @@ class AutomaticControlDependencies(object):
           if inp in merge_for_resource:
             merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
           last_op_using_resource_tensor[inp] = op
-      if (op.op_def.is_stateful and not found_resource
-          and op._control_flow_context is None):  # pylint: disable=protected-access
+      if (op.op_def.is_stateful and op.type not in ASYNC_STATEFUL_OPS
+          and not found_resource and op._control_flow_context is None):  # pylint: disable=protected-access
         if None in last_op_using_resource_tensor:
           op._add_control_input(last_op_using_resource_tensor[None])  # pylint: disable=protected-access
         last_op_using_resource_tensor[None] = op
diff --git a/tensorflow/python/framework/composite_tensor.py b/tensorflow/python/framework/composite_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c9a292e363e2e38be32a30ea95f1f122876bbb0
--- /dev/null
+++ b/tensorflow/python/framework/composite_tensor.py
@@ -0,0 +1,100 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tensor-like objects that are composed from tf.Tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python import pywrap_tensorflow
+
+
+@six.add_metaclass(abc.ABCMeta)
+class CompositeTensor(object):
+  """Abstract base class for Tensor-like objects that are composed from Tensors.
+
+  Each `CompositeTensor` can be decomposed into a structured collection of
+  component `tf.Tensor`s, and reconstructed from those components.
+
+  The `tensorflow.python.util.nest` module has support for treating composite
+  tensors as structure, which makes it easy to flatten and reconstruct
+  composite tensors (or larger structures that contain composite tensors).
+  E.g.:
+
+  ```python
+  ct = ...  # Create a composite tensor.
+  flat_list_of_tensors = nest.flatten(ct, expand_composites=True)
+  transformed_list_of_tensors = ...  # do something with the flat tensors.
+  result = nest.pack_sequence_as(ct, transformed_list_of_tensors)
+  ```
+  """
+
+  @abc.abstractmethod
+  def _to_components(self):
+    """Decomposes this composite tensor into its components.
+
+    Returns:
+      The components that comprise this composite tensor: a nested structure
+      (as defined by `tf.python.util.nest`) whose values are `tf.Tensor`s or
+      `CompositeTensor`s.
+    """
+    raise NotImplementedError("CompositeTensor._to_components")
+
+  @abc.abstractmethod
+  def _from_components(cls, components):  # pylint: disable=no-self-argument
+    """Creates a composite tensor of type `cls` from components.
+
+    Args:
+      components: The components that should be used to form the
+        composite tensor: a nested structure (as defined by
+        `tf.python.util.nest`) whose values are tf.Tensors or composite
+        tensors.
+
+    Returns:
+      A `CompositeTensor` of type `cls`.
+    """
+    raise NotImplementedError("CompositeTensor._from_components")
+
+  @abc.abstractmethod
+  def _shape_invariant_to_components(self, shape=None):
+    """Converts a shape invariant into invariants for individual components.
+
+    Args:
+      shape: A `tf.TensorShape` object.  The shape invariant for this
+        `CompositeTensor`, or `None` if a default shape invariant should be
+        used (based on the value of this `CompositeTensor`).
+
+    Returns:
+      A nested structure whose values are `tf.TensorShape` objects, specifying
+      the shape invariants for the tensors that comprise this `CompositeTensor`.
+    """
+    raise NotImplementedError("CompositeTensor._shape_invariant_to_components")
+
+  @abc.abstractproperty
+  def _is_graph_tensor(self):
+    """Returns True if this tensor's components belong to a TF graph."""
+    raise NotImplementedError("CompositeTensor._is_symbolic_tensor")
+
+
+pywrap_tensorflow.RegisterType("CompositeTensor", CompositeTensor)
+
+
+# @TODO(edloper): Can we replace convert_to_tensor_or_xyz with just
+# convert_to_tensor_or_composite?  Alternatively, should composite tensors
+# register a dispatch override for tf.convert_to_tensor?
diff --git a/tensorflow/python/framework/composite_tensor_test.py b/tensorflow/python/framework/composite_tensor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f249faa5d685b411742a65025000e00c2edadbc5
--- /dev/null
+++ b/tensorflow/python/framework/composite_tensor_test.py
@@ -0,0 +1,101 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.python.framework.composite_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import composite_tensor
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import nest
+
+
+class TestCompositeTensor(composite_tensor.CompositeTensor):
+
+  def __init__(self, *components):
+    self._components = components
+
+  def _to_components(self):
+    return self._components
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    raise NotImplementedError('CompositeTensor._shape_invariant_to_components')
+
+  def _is_graph_tensor(self):
+    return True
+
+
+class CompositeTensorTest(test_util.TensorFlowTestCase):
+
+  def assertNestEqual(self, a, b, expand_composites=False):
+    if isinstance(a, dict):
+      self.assertIsInstance(b, dict)
+      self.assertEqual(set(a), set(b))
+      for key in a:
+        self.assertNestEqual(a[key], b[key])
+    elif isinstance(a, (list, tuple)):
+      self.assertIsInstance(b, (list, tuple))
+      self.assertEqual(len(a), len(b))
+      for a_val, b_val in zip(a, b):
+        self.assertNestEqual(a_val, b_val)
+    elif expand_composites and isinstance(a, composite_tensor.CompositeTensor):
+      self.assertIsInstance(b, composite_tensor.CompositeTensor)
+      self.assertNestEqual(a._to_components(),
+                           b._to_components())
+
+  def testNestFlatten(self):
+    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
+    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
+    structure = [[st1], 'foo', {'y': [st2]}]
+    x = nest.flatten(structure, expand_composites=True)
+    self.assertEqual(len(x), 7)
+    self.assertIs(x[0], st1.indices)
+    self.assertIs(x[1], st1.values)
+    self.assertIs(x[2], st1.dense_shape)
+    self.assertEqual(x[3], 'foo')
+    self.assertIs(x[4], st2.indices)
+    self.assertIs(x[5], st2.values)
+    self.assertIs(x[6], st2.dense_shape)
+
+  def testNestPackSequenceAs(self):
+    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
+    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
+    structure1 = [[st1], 'foo', {'y': [st2]}]
+    flat = [st2.indices, st2.values, st2.dense_shape, 'bar',
+            st1.indices, st1.values, st1.dense_shape]
+    result = nest.pack_sequence_as(structure1, flat, expand_composites=True)
+    expected = [[st2], 'bar', {'y': [st1]}]
+    self.assertNestEqual(expected, result)
+
+  def testAssertSameStructure(self):
+    st1 = sparse_tensor.SparseTensor([[0]], [0], [100])
+    st2 = sparse_tensor.SparseTensor([[0, 3]], ['x'], [100, 100])
+    test = TestCompositeTensor(st1.indices, st1.values, st1.dense_shape)
+    nest.assert_same_structure(st1, st2, expand_composites=False)
+    nest.assert_same_structure(st1, st2, expand_composites=True)
+    nest.assert_same_structure(st1, test, expand_composites=False)
+    with self.assertRaises(TypeError):
+      nest.assert_same_structure(st1, test, expand_composites=True)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/framework/device_test.py b/tensorflow/python/framework/device_test.py
index 0859e956ffd5a2c905837c5f6e68658d11403ae5..cd4b4ea51e62dd1c022316b30cb9203f089a92d3 100644
--- a/tensorflow/python/framework/device_test.py
+++ b/tensorflow/python/framework/device_test.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import device
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -116,6 +119,20 @@ class DeviceTest(test_util.TensorFlowTestCase):
         "/job:muu/device:MyFunnyDevice:2"))
     self.assertEquals("/job:muu/task:1/device:MyFunnyDevice:2", d.to_string())
 
+    if not context.executing_eagerly():
+      with ops.device(device.merge_device("/device:GPU:0")):
+        var1 = variables.Variable(1.0)
+        self.assertEquals("/device:GPU:0", var1.device)
+        with ops.device(device.merge_device("/job:worker")):
+          var2 = variables.Variable(1.0)
+          self.assertEquals("/job:worker/device:GPU:0", var2.device)
+          with ops.device(device.merge_device("/device:CPU:0")):
+            var3 = variables.Variable(1.0)
+            self.assertEquals("/job:worker/device:CPU:0", var3.device)
+            with ops.device(device.merge_device("/job:ps")):
+              var4 = variables.Variable(1.0)
+              self.assertEquals("/job:ps/device:CPU:0", var4.device)
+
   def testCanonicalName(self):
     self.assertEqual("/job:foo/replica:0",
                      device.canonical_name("/job:foo/replica:0"))
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 9a4fe4e93b32aeedcb74cf0f7b2703f64d9db23a..9d643e041c680fb9fe27c87779ad24e00e578a36 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -535,29 +535,47 @@ _np_qint32 = np.dtype([("qint32", np.int32, 1)])
 np_resource = np.dtype([("resource", np.ubyte, 1)])
 
 # Standard mappings between types_pb2.DataType values and numpy.dtypes.
-_NP_TO_TF = frozenset([
-    (np.float16, float16),
-    (np.float32, float32),
-    (np.float64, float64),
-    (np.int32, int32),
-    (np.int64, int64),
-    (np.uint8, uint8),
-    (np.uint16, uint16),
-    (np.uint32, uint32),
-    (np.uint64, uint64),
-    (np.int16, int16),
-    (np.int8, int8),
-    (np.complex64, complex64),
-    (np.complex128, complex128),
-    (np.object_, string),
-    (np.bool_, bool),
-    (_np_qint8, qint8),
-    (_np_quint8, quint8),
-    (_np_qint16, qint16),
-    (_np_quint16, quint16),
-    (_np_qint32, qint32),
-    (_np_bfloat16, bfloat16),
-])
+_NP_TO_TF = {
+    np.float16: float16,
+    np.float32: float32,
+    np.float64: float64,
+    np.int32: int32,
+    np.int64: int64,
+    np.uint8: uint8,
+    np.uint16: uint16,
+    np.uint32: uint32,
+    np.uint64: uint64,
+    np.int16: int16,
+    np.int8: int8,
+    np.complex64: complex64,
+    np.complex128: complex128,
+    np.object_: string,
+    np.string_: string,
+    np.unicode_: string,
+    np.bool_: bool,
+    _np_qint8: qint8,
+    _np_quint8: quint8,
+    _np_qint16: qint16,
+    _np_quint16: quint16,
+    _np_qint32: qint32,
+    _np_bfloat16: bfloat16,
+}
+
+# Map (some) NumPy platform dtypes to TF ones using their fixed-width
+# synonyms. Note that platform dtypes are not always simples aliases,
+# i.e. reference equality is not guaranteed. See e.g. numpy/numpy#9799.
+for pdt in [
+    np.intc,
+    np.uintc,
+    np.int_,
+    np.uint,
+    np.longlong,
+    np.ulonglong,
+]:
+  if pdt not in _NP_TO_TF:
+    _NP_TO_TF[pdt] = next(
+        _NP_TO_TF[dt] for dt in _NP_TO_TF if dt == pdt().dtype)
+
 _TF_TO_NP = {
     types_pb2.DT_HALF:
         np.float16,
@@ -664,6 +682,20 @@ _PYTHON_TO_TF = {
     builtins.object: string
 }
 
+_ANY_TO_TF = {}
+_ANY_TO_TF.update(_INTERN_TABLE)
+_ANY_TO_TF.update(_STRING_TO_TF)
+_ANY_TO_TF.update(_PYTHON_TO_TF)
+_ANY_TO_TF.update(_NP_TO_TF)
+
+# Ensure no collisions.
+assert len(_ANY_TO_TF) == sum(len(d) for d in [
+    _INTERN_TABLE,
+    _STRING_TO_TF,
+    _PYTHON_TO_TF,
+    _NP_TO_TF
+])
+
 
 @tf_export("dtypes.as_dtype", "as_dtype")
 def as_dtype(type_value):
@@ -684,36 +716,16 @@ def as_dtype(type_value):
   if isinstance(type_value, DType):
     return type_value
 
-  try:
-    return _INTERN_TABLE[type_value]
-  except KeyError:
-    pass
-
-  try:
-    return _STRING_TO_TF[type_value]
-  except KeyError:
-    pass
+  if isinstance(type_value, np.dtype):
+    try:
+      return _NP_TO_TF[type_value.type]
+    except KeyError:
+      pass
 
   try:
-    return _PYTHON_TO_TF[type_value]
+    return _ANY_TO_TF[type_value]
   except KeyError:
     pass
 
-  if isinstance(type_value, np.dtype):
-    # The numpy dtype for strings is variable length. We can not compare
-    # dtype with a single constant (np.string does not exist) to decide
-    # dtype is a "string" type. We need to compare the dtype.type to be
-    # sure it's a string type.
-    if type_value.type == np.string_ or type_value.type == np.unicode_:
-      return string
-
-  if isinstance(type_value, (type, np.dtype)):
-    for key, val in _NP_TO_TF:
-      try:
-        if key == type_value:
-          return val
-      except TypeError as e:
-        raise TypeError("Cannot convert {} to a dtype. {}".format(
-            type_value, e))
-
-  raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
+  raise TypeError(
+      "Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 719fdc0953ae4d5bbe016b3dc2730f5601c3494e..7dd2a792d1254027401d03b9dacddbb815cf4858 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,9 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonLongConversion(self):
+    self.assertIs(dtypes.int64, dtypes.as_dtype(np.array(2**32).dtype))
+
   def testPythonTypesConversion(self):
     self.assertIs(dtypes.float32, dtypes.as_dtype(float))
     self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index 557f947291ca57da17071e91f7cba2aa0c7a8a70..af83b70a465cd061c2ed713639cc4a5d531f388d 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -29,10 +29,9 @@ import re
 
 import six
 
-from tensorflow.python.framework.ops import Tensor
 from tensorflow.python.util import tf_stack
 
-_NAME_REGEX = r"[A-Za-z0-9.][A-Za-z0-9_.\-/]*?"
+_NAME_REGEX = r"[A-Za-z0-9_.][A-Za-z0-9_.\-/]*?"
 _TAG_REGEX = r"{{{{({name}) ({name})}}}}".format(name=_NAME_REGEX)
 _INTERPOLATION_REGEX = r"^(.*?)({tag})".format(tag=_TAG_REGEX)
 _INTERPOLATION_PATTERN = re.compile(_INTERPOLATION_REGEX, re.DOTALL)
@@ -46,7 +45,7 @@ _BAD_FILE_SUBSTRINGS = [
 ]
 
 
-def _parse_message(message):
+def parse_message(message):
   """Parses the message.
 
   Splits the message into separators and tags. Tags are named tuples
@@ -178,9 +177,9 @@ def _compute_colocation_summary_from_op(op, prefix=""):
 
 
 def _find_index_of_defining_frame_for_op(op):
-  """Return index in op._traceback with first 'useful' frame.
+  """Return index in op.traceback with first 'useful' frame.
 
-  This method reads through the stack stored in op._traceback looking for the
+  This method reads through the stack stored in op.traceback looking for the
   innermost frame which (hopefully) belongs to the caller.  It accomplishes this
   by rejecting frames whose filename appears to come from TensorFlow (see
   error_interpolation._BAD_FILE_SUBSTRINGS for the list of rejected substrings).
@@ -190,15 +189,13 @@ def _find_index_of_defining_frame_for_op(op):
         location.
 
   Returns:
-    Integer index into op._traceback where the first non-TF file was found
+    Integer index into op.traceback where the first non-TF file was found
     (innermost to outermost), or 0 (for the outermost stack frame) if all files
     came from TensorFlow.
   """
-  # pylint: disable=protected-access
   # Index 0 of tf_traceback is the outermost frame.
-  tf_traceback = tf_stack.convert_stack(op._traceback)
+  tf_traceback = op.traceback
   size = len(tf_traceback)
-  # pylint: enable=protected-access
   filenames = [frame[tf_stack.TB_FILENAME] for frame in tf_traceback]
   # We process the filenames from the innermost frame to outermost.
   for idx, filename in enumerate(reversed(filenames)):
@@ -211,10 +208,7 @@ def _find_index_of_defining_frame_for_op(op):
 def _get_defining_frame_from_op(op):
   """Find and return stack frame where op was defined."""
   frame_index = _find_index_of_defining_frame_for_op(op)
-  # pylint: disable=protected-access
-  frame = op._traceback[frame_index]
-  # pylint: enable=protected-access
-  return frame
+  return op.traceback[frame_index]
 
 
 def compute_field_dict(op, strip_file_prefix=""):
@@ -270,7 +264,7 @@ def compute_field_dict(op, strip_file_prefix=""):
   return field_dict
 
 
-def _common_prefix(all_ops):
+def traceback_files_common_prefix(all_ops):
   """Determines the common prefix from the paths of the stacktrace of 'all_ops'.
 
   For example, if the paths are '/foo/bar/baz/' and '/foo/car', this would
@@ -287,10 +281,7 @@ def _common_prefix(all_ops):
     if ops is None:
       continue
     for op in ops:
-      # pylint: disable=protected-access
-      tf_traceback = tf_stack.convert_stack(op._traceback)
-      # pylint: enable=protected-access
-      for frame in tf_traceback:
+      for frame in op.traceback:
         filename = frame[tf_stack.TB_FILENAME]
         if "<embedded" not in filename:
           files.add(filename)
@@ -315,11 +306,13 @@ def _sources_for_node(name, graph):
     if name.startswith("^"):
       name = name[1:]
     try:
-      op = graph.as_graph_element(name)
-    except KeyError:
-      return
-    if isinstance(op, Tensor):
-      op = op.op
+      tensor = graph.get_tensor_by_name(name)
+      op = tensor.op
+    except (KeyError, ValueError):
+      try:
+        op = graph.get_operation_by_name(name)
+      except KeyError:
+        return
     name = op.name
     if name in seen_names:
       return
@@ -383,7 +376,7 @@ def interpolate(error_message, graph):
   Returns:
     The string with tags of the form {{type name}} interpolated.
   """
-  seps, tags = _parse_message(error_message)
+  seps, tags = parse_message(error_message)
   subs = []
   end_msg = collections.defaultdict(list)
   tagged_ops = []
@@ -398,7 +391,7 @@ def interpolate(error_message, graph):
     else:
       tagged_ops.append([op] + _sources_for_node(op.name, graph))
 
-  common_prefix = _common_prefix(tagged_ops)
+  common_prefix = traceback_files_common_prefix(tagged_ops)
   for tag, ops in zip(tags, tagged_ops):
     msg = "{{%s %s}}" % (tag.type, tag.name)
     if ops is not None:
@@ -411,6 +404,8 @@ def interpolate(error_message, graph):
         msg = "node %s%s placed on device %s " % (
             ops[0].name, field_dict["defined_at"], field_dict["devices"])
         end_msg["colocations"].append(field_dict["devs_and_colocs"])
+    if tag.type == "function_node":
+      msg = ""
     subs.append(msg)
 
   if "source_nodes" in end_msg:
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index faa4fa7c6fa47f4328c6c04569aacde48b51b6c0..922b9e2bd308a03b0b6b28aa741c4e6f54c1b347 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -24,12 +24,28 @@ import warnings
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import error_interpolation
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util import tf_stack
 from tensorflow.python.util.tf_export import tf_export
 
 
+def _compact_stack_trace(op):
+  """Returns a traceback for `op` with common file prefixes stripped."""
+  compact_traces = []
+  common_prefix = error_interpolation.traceback_files_common_prefix([[op]])
+  for frame in op.traceback:
+    frame = list(frame)
+    filename = frame[tf_stack.TB_FILENAME]
+    if filename.startswith(common_prefix):
+      filename = filename[len(common_prefix):]
+      frame[tf_stack.TB_FILENAME] = filename
+    compact_traces.append(tuple(frame))
+  return compact_traces
+
+
 @tf_export("errors.OpError", v1=["errors.OpError", "OpError"])
 @deprecation.deprecated_endpoints("OpError")
 class OpError(Exception):
@@ -94,9 +110,10 @@ class OpError(Exception):
 
   def __str__(self):
     if self._op is not None:
-      output = ["%s\n\nCaused by op %r, defined at:\n" % (self.message,
+      output = ["%s\n\nOriginal stack trace for %r:\n" % (self.message,
                                                           self._op.name,)]
-      curr_traceback_list = traceback.format_list(self._op.traceback)
+      curr_traceback_list = traceback.format_list(
+          _compact_stack_trace(self._op))
       output.extend(curr_traceback_list)
       # pylint: disable=protected-access
       original_op = self._op._original_op
@@ -106,7 +123,8 @@ class OpError(Exception):
             "\n...which was originally created as op %r, defined at:\n"
             % (original_op.name,))
         prev_traceback_list = curr_traceback_list
-        curr_traceback_list = traceback.format_list(original_op.traceback)
+        curr_traceback_list = traceback.format_list(
+            _compact_stack_trace(original_op))
 
         # Attempt to elide large common subsequences of the subsequent
         # stack traces.
@@ -136,8 +154,6 @@ class OpError(Exception):
         # pylint: disable=protected-access
         original_op = original_op._original_op
         # pylint: enable=protected-access
-      output.append("\n%s (see above for traceback): %s\n" %
-                    (type(self).__name__, self.message))
       return "".join(output)
     else:
       return self.message
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index 9528a24b46b3e7e76df7355241cafd1003542f11..e7718cfef1f3a0026ee637632e0f60af904b4c41 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -18,15 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
+import collections as py_collections
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies
 from tensorflow.python.ops import array_ops
@@ -58,6 +58,66 @@ WHITELIST_COLLECTIONS = [
 ]
 
 
+class UnknownArgument(object):
+  """Signifies an argument which is not currently handled."""
+  pass
+
+
+# TODO(vbardiovsky): Remove this when nest is updated with new
+# flatten_with_tuple_paths.
+def flatten_with_tuple_paths(structure):
+  return list(zip(nest.yield_flat_paths(structure), nest.flatten(structure)))
+
+
+def convert_structure_to_signature(structure, arg_names=None):
+  """Convert a potentially nested structure to a signature.
+
+  Args:
+    structure: Structure to convert, where top level collection is a list or a
+      tuple.
+    arg_names: Optional list of arguments that has equal number of elements as
+      `structure` and is used for naming corresponding TensorSpecs.
+
+  Returns:
+    Identical structure that has TensorSpec objects instead of Tensors and
+    UknownArgument instead of any unsupported types.
+  """
+
+  def encode_arg(arg, name=None):
+    """A representation for this argument, for converting into signatures."""
+    if isinstance(arg, ops.Tensor):
+      return tensor_spec.TensorSpec(arg.shape, arg.dtype, name)
+    if isinstance(arg, (
+        int,
+        float,
+        bool,
+        type(None),
+        dtypes.DType,
+        tensor_spec.TensorSpec,
+    )):
+      return arg
+    return UnknownArgument()
+
+  # We are using the flattened paths to name the TensorSpecs. We need an
+  # explicit name for them downstream.
+  flattened = flatten_with_tuple_paths(structure)
+  if arg_names:
+    if len(arg_names) != len(structure):
+      raise ValueError(
+          "Passed in arg_names don't match actual signature (%s)." % arg_names)
+    # Replace all top-level names with their actual arg_names. If a path before
+    # was "(2,'a',1)", it will become "(arg_names[2],'a',1)".
+    flattened = [
+        ((arg_names[path[0]],) + path[1:], arg) for path, arg in flattened
+    ]
+
+  mapped = [
+      encode_arg(arg, "/".join([str(p) for p in path]))
+      for path, arg in flattened
+  ]
+  return nest.pack_sequence_as(structure, mapped)
+
+
 class FuncGraph(ops.Graph):
   """Graph representing a function body.
 
@@ -69,6 +129,9 @@ class FuncGraph(ops.Graph):
       inputs coming first.
     outputs: Tensors that will be returned by this function. The tensors are in
       this FuncGraph.
+    structured_input_signature: A tuple of (args, kwargs), which are both
+      possibly-nested python objects that were received by this function. Note
+      that these structures might contain Python `None`s.
     structured_outputs: A possibly-nested python object which will be returned
       by this function. The Tensors in this structure are the same as those of
       self.outputs. Note that this structure might contain Python `None`s.
@@ -80,7 +143,7 @@ class FuncGraph(ops.Graph):
     seed: The graph-level random seed.
   """
 
-  def __init__(self, name, read_only_collections=True):
+  def __init__(self, name, collections=None):
     """Construct a new FuncGraph.
 
     The graph will inherit its graph key, collections, seed, and distribution
@@ -88,19 +151,24 @@ class FuncGraph(ops.Graph):
 
     Args:
       name: the name of the function.
-      read_only_collections: whether to not write function graph collections
-        back to default graph. Defaults to True.
+      collections: a dictionary of collections this FuncGraph should start
+        with. If not specified (None), the FuncGraph will read (but not write
+        to) the outer graph's collections that are not whitelisted, and both
+        read and write to the outer graph's collections that are whitelisted.
+        The current whitelisted collections are the global variables, the
+        local variables, and the trainable variables.
+        Defaults to None.
     """
     super(FuncGraph, self).__init__()
 
     self.name = name
     self.inputs = []
     self.outputs = []
+    self.structured_input_signature = None
     self.structured_outputs = None
-    self._read_only_collections = read_only_collections
     self._weak_variables = []
     self.outer_graph = ops.get_default_graph()
-    self.captures = collections.OrderedDict()
+    self.captures = py_collections.OrderedDict()
 
     self._building_function = True
     # Map from resource tensor name to last op (in program order) which uses
@@ -122,9 +190,7 @@ class FuncGraph(ops.Graph):
       # specialization (currently used in cond_v2), here and in the cache key.
       self._colocation_stack = graph._colocation_stack.copy()  # pylint: disable=protected-access
 
-    if not self._read_only_collections:
-      self._collections = graph._collections  # pylint: disable=protected-access
-    else:
+    if collections is None:
       for collection_name in graph.get_all_collection_keys():
         if collection_name not in WHITELIST_COLLECTIONS:
           self._collections[collection_name] = graph.get_collection(
@@ -132,6 +198,8 @@ class FuncGraph(ops.Graph):
       for collection_name in WHITELIST_COLLECTIONS:
         self._collections[collection_name] = graph.get_collection_ref(
             collection_name)
+    else:
+      self._collections = collections
 
   def as_default(self):
     outer_cm = super(FuncGraph, self).as_default()
@@ -338,7 +406,8 @@ def func_graph_from_py_func(name,
                             autograph=False,
                             add_control_dependencies=True,
                             arg_names=None,
-                            op_return_value=None):
+                            op_return_value=None,
+                            collections=None):
   """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
@@ -365,6 +434,13 @@ def func_graph_from_py_func(name,
     op_return_value: Optional. A Tensor. If set and `python_func` returns
       Operations, those return values will be replaced with this value. If not
       set, returning an Operation triggers an error.
+    collections: a dictionary of collections this FuncGraph should start
+      with. If not specified (None), the FuncGraph will read (but not write to)
+      the outer graph's collections that are not whitelisted, and both
+      read and write to the outer graph's collections that are whitelisted.
+      The current whitelisted collections are the global variables, the
+      local variables, and the trainable variables.
+      Defaults to None.
 
   Returns:
     A FuncGraph.
@@ -376,7 +452,7 @@ def func_graph_from_py_func(name,
   if op_return_value is not None:
     assert isinstance(op_return_value, ops.Tensor), op_return_value
   if func_graph is None:
-    func_graph = FuncGraph(name)
+    func_graph = FuncGraph(name, collections=collections)
   assert isinstance(func_graph, FuncGraph)
   if add_control_dependencies:
     control_manager = AutomaticControlDependencies
@@ -395,6 +471,14 @@ def func_graph_from_py_func(name,
     func_args = _get_defun_inputs_from_args(args, arg_names)
     func_kwargs = _get_defun_inputs_from_kwargs(kwargs)
 
+    # Convert all Tensors into TensorSpecs before saving the structured inputs.
+    # If storing pure concrete functions that are not called through polymorphic
+    # functions, we don't have access to FunctionSpec, so we need to call the
+    # TensorSpecs by their `arg_names` for later binding.
+    func_graph.structured_input_signature = (
+        convert_structure_to_signature(func_args, arg_names),
+        convert_structure_to_signature(func_kwargs))
+
     # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
     # Variables to help check whether mutation happens in calling the function
     # Copy the recursive list, tuple and map structure, but not base objects
@@ -414,7 +498,7 @@ def func_graph_from_py_func(name,
           x = array_ops.identity(op_return_value)
       elif not isinstance(x, tensor_array_ops.TensorArray):
         try:
-          x = ops.convert_to_tensor_or_indexed_slices(x)
+          x = ops.convert_to_tensor_or_composite(x)
         except (ValueError, TypeError):
           raise TypeError(
               "To be compatible with tf.contrib.eager.defun, Python functions "
@@ -548,36 +632,25 @@ def flatten(sequence):
   Flattens non-tensor objects into their constituent tensors.
 
   Args:
-    sequence: A nested structure of Tensors, IndexedSlices, SparseTensors and
+    sequence: A nested structure of Tensors, CompositeTensors, and
       TensorArrays.
 
   Returns:
     A list of tensors.
   """
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
-  flat_sequence = nest.flatten(sequence)
-  outputs = []
-  for item in flat_sequence:
-    if isinstance(item, ops.IndexedSlices):
-      if item.dense_shape is not None:
-        outputs.extend([item.values, item.indices, item.dense_shape])
-      else:
-        outputs.extend([item.values, item.indices])
-    elif isinstance(item, sparse_tensor.SparseTensor):
-      outputs.extend([item.indices, item.values, item.dense_shape])
-    elif isinstance(item, tensor_array_ops.TensorArray):
-      outputs.append(item.flow)
-    else:
-      outputs.append(item)
-  return outputs
+  flat_sequence = nest.flatten(sequence, expand_composites=True)
+  return [
+      item.flow if isinstance(item, tensor_array_ops.TensorArray) else item
+      for item in flat_sequence]
 
 
 def pack_sequence_as(structure, flat_sequence):
   """Like `nest.pack_sequence_as` but also packs other Tensor-like objects.
 
   Args:
-    structure: The structure to pack into. May contain Tensors, IndexedSlices,
-      TensorArrays or SparseTensors.
+    structure: The structure to pack into. May contain Tensors,
+      CompositeTensors, or TensorArrays.
     flat_sequence: An iterable containing tensors.
 
   Returns:
@@ -586,33 +659,16 @@ def pack_sequence_as(structure, flat_sequence):
   Raises:
     AssertionError if `structure` and `flat_sequence` are not compatible.
   """
-  flattened_structure = nest.flatten(structure)
-  flat_sequence_with_slices_and_tas = []
-  index = 0
-  for t in flattened_structure:
-    if isinstance(t, ops.IndexedSlices):
-      if t.dense_shape is not None:
-        flat_sequence_with_slices_and_tas.append(
-            ops.IndexedSlices(*flat_sequence[index:index + 3]))
-        index += 3
-      else:
-        flat_sequence_with_slices_and_tas.append(
-            ops.IndexedSlices(*flat_sequence[index:index + 2]))
-        index += 2
-    elif isinstance(t, sparse_tensor.SparseTensor):
-      flat_sequence_with_slices_and_tas.append(
-          sparse_tensor.SparseTensor(*flat_sequence[index:index + 3]))
-      index += 3
-    elif isinstance(t, tensor_array_ops.TensorArray):
-      flow = flat_sequence[index]
-      ta = tensor_array_ops.build_ta_with_new_flow(t, flow)
-      flat_sequence_with_slices_and_tas.append(ta)
-      index += 1
-    else:
-      flat_sequence_with_slices_and_tas.append(flat_sequence[index])
-      index += 1
-  assert len(flattened_structure) == len(flat_sequence_with_slices_and_tas)
-  return nest.pack_sequence_as(structure, flat_sequence_with_slices_and_tas)
+  flat_sequence = list(flat_sequence)
+  flattened_structure = nest.flatten(structure, expand_composites=True)
+  if len(flattened_structure) != len(flat_sequence):
+    raise ValueError("Mismatch in element count")
+  for i in range(len(flat_sequence)):
+    if isinstance(flattened_structure[i], tensor_array_ops.TensorArray):
+      flat_sequence[i] = tensor_array_ops.build_ta_with_new_flow(
+          old_ta=flattened_structure[i], flow=flat_sequence[i])
+  return nest.pack_sequence_as(structure, flat_sequence, expand_composites=True)
+
 
 
 def _create_substitute_placeholder(value, name=None, dtype=None):
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index afc11b17bfd1447e502906bb973eb5746dfe0274..291986a2c4b05a8a15de1e840f09ccc5f30f9b6d 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -210,6 +210,7 @@ class _DefinedFunction(object):
                shape_func=None,
                capture_by_value=False,
                whitelisted_stateful_ops=None,
+               capture_resource_var_by_value=True,
                **kwargs):
     """Creates _DefinedFunction.
 
@@ -232,6 +233,8 @@ class _DefinedFunction(object):
         will be copied into the function body.
       whitelisted_stateful_ops: A set of ops that if stateful we ignore and
         copy into the function body, when `capture_by_value` is True.
+      capture_resource_var_by_value: Boolean (defaults to True). If False,
+        captured resource variable returns the handle instead of value.
       **kwargs: The keyword arguments. **kwargs is passed to every call
         site of this function.
 
@@ -250,6 +253,7 @@ class _DefinedFunction(object):
     self._whitelisted_stateful_ops = whitelisted_stateful_ops
     if self._whitelisted_stateful_ops is None:
       self._whitelisted_stateful_ops = set()
+    self._capture_resource_var_by_value = capture_resource_var_by_value
     self._extra_kwargs = kwargs
     # Constructed only when C API is disabled, lazily
     self._definition = None
@@ -352,7 +356,8 @@ class _DefinedFunction(object):
         self._func_name,
         self._capture_by_value,
         self._caller_device,
-        whitelisted_stateful_ops=self._whitelisted_stateful_ops)
+        whitelisted_stateful_ops=self._whitelisted_stateful_ops,
+        capture_resource_var_by_value=self._capture_resource_var_by_value)
 
     self._extra_inputs = temp_graph.extra_inputs
     # pylint: disable=protected-access
@@ -636,11 +641,12 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, name, capture_by_value, whitelisted_stateful_ops, *args,
-               **kwargs):
+  def __init__(self, name, capture_by_value, whitelisted_stateful_ops,
+               capture_resource_var_by_value, *args, **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
     self._whitelisted_stateful_ops = whitelisted_stateful_ops
+    self._capture_resource_var_by_value = capture_resource_var_by_value
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
@@ -735,7 +741,8 @@ class _FuncGraph(ops.Graph):
           collections=collections,
           use_resource=use_resource)
       self.extra_vars.append(var)
-      if isinstance(var, resource_variable_ops.ResourceVariable):
+      if (isinstance(var, resource_variable_ops.ResourceVariable) and
+          self._capture_resource_var_by_value):
         # For resource-based variables read the variable outside the function
         # and pass in the value. This ensures that the function is pure and
         # differentiable. TODO(apassos) this may have performance problems if
@@ -830,7 +837,8 @@ def func_graph_from_py_func(func,
                             container=None,
                             collections_ref=None,
                             arg_shapes=None,
-                            whitelisted_stateful_ops=None):
+                            whitelisted_stateful_ops=None,
+                            capture_resource_var_by_value=True):
   """Returns a _FuncGraph generated from `func`.
 
   Args:
@@ -850,6 +858,8 @@ def func_graph_from_py_func(func,
     arg_shapes: A sequence of the function's argument shapes.
     whitelisted_stateful_ops: A set of ops that if stateful we ignore and
       re-create.
+    capture_resource_var_by_value: Boolean (defaults to True). If False,
+      captured resource variable returns the handle instead of value.
 
   Returns:
     A _FuncGraph.
@@ -859,7 +869,8 @@ def func_graph_from_py_func(func,
   """
   if not name:
     name = function_utils.get_func_name(func)
-  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops)
+  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops,
+                          capture_resource_var_by_value)
 
   with func_graph.as_default(), ops.device(device):
     # pylint: disable=protected-access
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 10ad7ad541f2f0eb15776deb0c3225421bf47a17..aa670f1e37941689624797d31b64ffff63408c0b 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import versions
@@ -76,6 +77,14 @@ def function_def_to_graph(fdef, input_shapes=None):
   return func_graph
 
 
+def _is_function(fname):
+  """Checks for a function definition with `fname` in the current context."""
+  if context.executing_eagerly():
+    return context.context().has_function(fname)
+  else:
+    return ops.get_default_graph()._is_function(fname)  # pylint: disable=protected-access
+
+
 def function_def_to_graph_def(fdef, input_shapes=None):
   """Convert a FunctionDef to a GraphDef.
 
@@ -147,12 +156,12 @@ def function_def_to_graph_def(fdef, input_shapes=None):
     for attr in op_def.attr:
       if attr.type == "func":
         fname = node_def.attr[attr.name].func.name
-        if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+        if not _is_function(fname):
           raise ValueError("%s function not found." % fname)
       elif attr.type == "list(func)":
         for fn in node_def.attr[attr.name].list.func:
           fname = fn.name
-          if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+          if not _is_function(fname):
             raise ValueError("%s function not found." % fname)
 
     # Iterate over output_args in op_def to build the map.
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 7543376bcf274dc6edf821e19838c4aa574826ff..3d5a5fe79758d43e54a7acaa689bd7d7fe902c56 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -284,6 +284,7 @@ class FunctionTest(test.TestCase):
         out, = sess.run(dlogits, {logits: x, labels: y})
       self.assertAllClose(out, np.exp(prob - y))
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testCustomGradientError(self):
     dtype = dtypes.float32
 
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c737bd48811a664a6d51af26d1137223ba74379c..e6f86f7f932db2955479d785b1b39ebf3e0c7210 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
@@ -266,7 +267,7 @@ def _ProcessNewOps(graph):
         coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
       except KeyError:
         # Do not error in TF2 if the colocation cannot be guaranteed
-        if tf2.enabled():
+        if tf2.enabled() or control_flow_util.EnableControlFlowV2(graph):
           continue
 
         raise ValueError('Specified colocation to an op that '
diff --git a/tensorflow/python/framework/is_xla_test_true.py b/tensorflow/python/framework/is_xla_test_true.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae1e68b36bdd38bf01eae7feb0d90db3cb9f197
--- /dev/null
+++ b/tensorflow/python/framework/is_xla_test_true.py
@@ -0,0 +1,29 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Including this as a dependency will result in Tensorflow tests using XLA.
+
+This function is defined by default in test_util.py to False. The test_util then
+attempts to import this module. If this file is made available through the BUILD
+rule, then this function is overridden and will instead cause Tensorflow graphs
+to be compiled with XLA.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+def is_xla_enabled():
+  """Returns true to state XLA should be enabled for Tensorflow tests."""
+  return True
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 908d28fb9585ab49b7817f351acab5b9391bad53..63b7108f66d7817414bfaa3641dd53ae63c17b55 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -41,6 +41,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -102,6 +103,9 @@ class _UserDeviceSpec(object):
 
 class NullContextmanager(object):
 
+  def __init__(self, *args, **kwargs):
+    pass
+
   def __enter__(self):
     pass
 
@@ -1029,7 +1033,7 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
       `preferred_dtype` is not possible, this argument has no effect.
 
   Returns:
-    An `Tensor` based on `value`.
+    A `Tensor` based on `value`.
 
   Raises:
     TypeError: If no conversion function is registered for `value` to `dtype`.
@@ -1082,7 +1086,7 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None):
     name: Optional name to use if a new `Tensor` is created.
 
   Returns:
-    An `Tensor` based on `value`.
+    A `Tensor` based on `value`.
 
   Raises:
     TypeError: If no conversion function is registered for `value` to `dtype`.
@@ -1225,7 +1229,7 @@ def internal_convert_n_to_tensor(values,
       value.
   """
   if not isinstance(values, collections.Sequence):
-    raise TypeError("values must be a list.")
+    raise TypeError("values must be a sequence.")
   ret = []
   if ctx is None: ctx = context.context()
   for i, value in enumerate(values):
@@ -1289,7 +1293,7 @@ def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
     name: (Optional.) A name to use if a new `Tensor` is created.
 
   Returns:
-    An `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
+    A `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
 
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
@@ -1302,7 +1306,7 @@ def internal_convert_to_tensor_or_indexed_slices(value,
                                                  dtype=None,
                                                  name=None,
                                                  as_ref=False):
-  """Converts the given object to an `Tensor` or an `IndexedSlices`.
+  """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
   If `value` is an `IndexedSlices` or `SparseTensor` it is returned
   unmodified. Otherwise, it is converted to a `Tensor` using
@@ -1317,7 +1321,7 @@ def internal_convert_to_tensor_or_indexed_slices(value,
     as_ref: True if the caller wants the results as ref tensors.
 
   Returns:
-    An `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
+    A `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
 
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
@@ -1348,7 +1352,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
   Args:
     values: A list of `None`, `IndexedSlices`, `SparseTensor`, or objects that
       can be consumed by `convert_to_tensor()`.
-    dtype: (Optional.) The required `DType` of the returned `Tensor`
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
       `IndexedSlices`.
     name: (Optional.) A name prefix to used when a new `Tensor` is
       created, in which case element `i` will be given the name `name
@@ -1356,7 +1360,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
     as_ref: True if the caller wants the results as ref tensors.
 
   Returns:
-    A list of `Tensor`, `IndexedSlices`, and/or `SparseTensor` objects.
+    A list of `Tensor`, `IndexedSlices`, `SparseTensor` and/or `None` objects.
 
   Raises:
     TypeError: If no conversion function is registered for an element in
@@ -1365,7 +1369,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
       value.
   """
   if not isinstance(values, collections.Sequence):
-    raise TypeError("values must be a list.")
+    raise TypeError("values must be a sequence.")
   ret = []
   for i, value in enumerate(values):
     if value is None:
@@ -1406,6 +1410,132 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None):
       values=values, dtype=dtype, name=name, as_ref=False)
 
 
+def convert_to_tensor_or_composite(value, dtype=None, name=None):
+  """Converts the given object to a `Tensor` or `CompositeTensor`.
+
+  If `value` is a `CompositeTensor` it is returned unmodified. Otherwise, it
+  is converted to a `Tensor` using `convert_to_tensor()`.
+
+  Args:
+    value: A `CompositeTensor` or an object that can be consumed
+      by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
+      `CompositeTensor`.
+    name: (Optional.) A name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `CompositeTensor`, based on `value`.
+
+  Raises:
+    ValueError: If `dtype` does not match the element type of `value`.
+  """
+  return internal_convert_to_tensor_or_composite(
+      value=value, dtype=dtype, name=name, as_ref=False)
+
+
+def internal_convert_to_tensor_or_composite(value,
+                                            dtype=None,
+                                            name=None,
+                                            as_ref=False):
+  """Converts the given object to a `Tensor` or `CompositeTensor`.
+
+  If `value` is a `CompositeTensor` it is returned unmodified.  Otherwise, it
+  is converted to a `Tensor` using `convert_to_tensor()`.
+
+  Args:
+    value: A `CompositeTensor`, or an object that can be consumed
+      by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
+      `CompositeTensor`.
+    name: (Optional.) A name to use if a new `Tensor` is created.
+    as_ref: True if the caller wants the results as ref tensors.
+
+  Returns:
+    A `Tensor` or `CompositeTensor`, based on `value`.
+
+  Raises:
+    ValueError: If `dtype` does not match the element type of `value`.
+  """
+  if isinstance(value, composite_tensor.CompositeTensor):
+    value_dtype = getattr(value, "dtype", None)
+    if dtype and not dtypes.as_dtype(dtype).is_compatible_with(value_dtype):
+      raise ValueError(
+          "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
+          (dtypes.as_dtype(dtype).name, value.dtype.name, str(value)))
+    return value
+  else:
+    return internal_convert_to_tensor(
+        value, dtype=dtype, name=name, as_ref=as_ref)
+
+
+def internal_convert_n_to_tensor_or_composite(values,
+                                              dtype=None,
+                                              name=None,
+                                              as_ref=False):
+  """Converts `values` to a list of `Tensor` or `CompositeTensor` objects.
+
+  Any `CompositeTensor` objects in `values` are returned unmodified.
+
+  Args:
+    values: A list of `None`, `CompositeTensor`, or objects that
+      can be consumed by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor`s or
+      `CompositeTensor`s.
+    name: (Optional.) A name prefix to used when a new `Tensor` is
+      created, in which case element `i` will be given the name `name
+      + '_' + i`.
+    as_ref: True if the caller wants the results as ref tensors.
+
+  Returns:
+    A list of `Tensor`, `CompositeTensor`, and/or `None` objects.
+
+  Raises:
+    TypeError: If no conversion function is registered for an element in
+      `values`.
+    RuntimeError: If a registered conversion function returns an invalid
+      value.
+  """
+  if not isinstance(values, collections.Sequence):
+    raise TypeError("values must be a sequence.")
+  ret = []
+  for i, value in enumerate(values):
+    if value is None:
+      ret.append(value)
+    else:
+      n = None if name is None else "%s_%d" % (name, i)
+      ret.append(
+          internal_convert_to_tensor_or_composite(
+              value, dtype=dtype, name=n, as_ref=as_ref))
+  return ret
+
+
+def convert_n_to_tensor_or_composite(values, dtype=None, name=None):
+  """Converts `values` to a list of `Output` or `CompositeTensor` objects.
+
+  Any `CompositeTensor` objects in `values` are returned unmodified.
+
+  Args:
+    values: A list of `None`, `CompositeTensor``, or objects that
+      can be consumed by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor`s or
+      `CompositeTensor`s.
+    name: (Optional.) A name prefix to used when a new `Tensor` is
+      created, in which case element `i` will be given the name `name
+      + '_' + i`.
+
+  Returns:
+    A list of `Tensor` and/or `CompositeTensor` objects.
+
+  Raises:
+    TypeError: If no conversion function is registered for an element in
+      `values`.
+    RuntimeError: If a registered conversion function returns an invalid
+      value.
+  """
+  return internal_convert_n_to_tensor_or_composite(
+      values=values, dtype=dtype, name=name, as_ref=False)
+
+
 # TODO(josh11b): Add ctx argument to conversion_func() signature.
 @tf_export("register_tensor_conversion_function")
 def register_tensor_conversion_function(base_type,
@@ -1485,7 +1615,7 @@ def register_tensor_conversion_function(base_type,
 
 
 @tf_export("IndexedSlices")
-class IndexedSlices(_TensorLike):
+class IndexedSlices(_TensorLike, composite_tensor.CompositeTensor):
   """A sparse representation of a set of tensor slices at given indices.
 
   This class is a simple wrapper for a pair of `Tensor` objects:
@@ -1568,6 +1698,29 @@ class IndexedSlices(_TensorLike):
   def __neg__(self):
     return IndexedSlices(-self.values, self.indices, self.dense_shape)
 
+  def _to_components(self):
+    if self._dense_shape is None:
+      return (self._values, self._indices)
+    else:
+      return (self._values, self._indices, self._dense_shape)
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      shape = self._values.shape
+    if self._dense_shape is None:
+      return [shape, shape[:1]]  # values, indices
+    else:
+      # values, indices, dense_shape
+      return [shape, shape[:1], tensor_shape.TensorShape([shape.ndims])]
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 IndexedSlicesValue = collections.namedtuple(
     "IndexedSlicesValue", ["values", "indices", "dense_shape"])
@@ -4523,7 +4676,11 @@ class Graph(object):
     control_ops = []
     current = self._current_control_dependencies()
     for c in control_inputs:
-      if isinstance(c, IndexedSlices):
+      # The hasattr(handle) is designed to match ResourceVariables. This is so
+      # control dependencies on a variable or on an unread variable don't
+      # trigger reads.
+      if (isinstance(c, IndexedSlices) or
+          (hasattr(c, "_handle") and hasattr(c, "op"))):
         c = c.op
       c = self.as_graph_element(c)
       if isinstance(c, Tensor):
@@ -4988,12 +5145,19 @@ def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
         op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
 
 
+# Internal interface to colocate_with. colocate_with has been deprecated from
+# public API. There are still a few internal uses of colocate_with. Add internal
+# only API for those uses to avoid deprecation warning.
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+
+
 @deprecation.deprecated(
     date=None,
     instructions="Colocations handled automatically by placer.")
 @tf_export(v1=["colocate_with"])
-def colocate_with(op, ignore_existing=False):
-  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+def _colocate_with(op, ignore_existing=False):
+  return colocate_with(op, ignore_existing)
 
 
 @tf_export("control_dependencies")
@@ -5470,6 +5634,9 @@ def disable_eager_execution():
   projects from TensorFlow 1.x to 2.x.
   """
   context.default_execution_mode = context.GRAPH_MODE
+  c = context.context_safe()
+  if c is not None:
+    c._eager_context.is_eager = False  # pylint: disable=protected-access
 
 
 def enable_eager_execution_internal(config=None,
@@ -6022,7 +6189,15 @@ class name_scope(object):  # pylint: disable=invalid-name
       name: The name argument that is passed to the op function.
       default_name: The default name to use if the `name` argument is `None`.
       values: The list of `Tensor` arguments that are passed to the op function.
+
+    Raises:
+      TypeError: if `default_name` is passed in but not a string.
     """
+    if not (default_name is None or isinstance(default_name, six.string_types)):
+      raise TypeError(
+          "`default_name` type (%s) is not a string type. You likely meant to "
+          "pass this into the `values` kwarg."
+          % type(default_name))
     self._name = default_name if name is None else name
     self._default_name = default_name
     self._values = values
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 2d7ee1a99e02cbb663df38ae17d8772fa6f11816..8347e9d1eb30c2c94bb2262cce0cff44279d45e1 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2052,6 +2052,9 @@ class OpScopeTest(test_util.TensorFlowTestCase):
     with ops.name_scope(None, default_scope_name, [a, b]) as scope:
       self.assertEqual("%s/" % default_scope_name, scope)
       self.assertEqual(g0, ops.get_default_graph())
+    with self.assertRaises(TypeError):
+      with ops.name_scope(scope_name, [a, b]):
+        pass
 
   def _testGraphElements(self, graph_elements):
     scope_name = "my_scope"
@@ -2345,7 +2348,7 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           math_ops.add(c, c)
         c2 = constant_op.constant(2.0)
       with self.assertRaisesRegexp(
-          TypeError, "contains objects other than 'EagerTensor'"):
+          TypeError, "Graph tensors"):
         math_ops.add(c2, c2)
 
   def testPreservesNameScopeInEagerExecution(self):
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 5e1a95a26be034bff0a1f5eb996ac6f16c61e282..c69fa41677bdc451d2de63a583bbea8b03fc0178 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Classes and functions used to construct graphs."""
+"""Sparse tensors."""
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,10 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.util.tf_export import tf_export
 
@@ -34,7 +36,7 @@ _override_helper = ops._override_helper
 
 
 @tf_export("sparse.SparseTensor", "SparseTensor")
-class SparseTensor(_TensorLike):
+class SparseTensor(_TensorLike, composite_tensor.CompositeTensor):
   """Represents a sparse tensor.
 
   TensorFlow represents a sparse tensor as three separate dense tensors:
@@ -113,16 +115,12 @@ class SparseTensor(_TensorLike):
       dense_shape: A 1-D int64 tensor of shape `[ndims]`.
 
     """
-    with ops.name_scope(None, "SparseTensor",
-                        [indices, values, dense_shape]):
+    with ops.name_scope(None, "SparseTensor", [indices, values, dense_shape]):
       indices = ops.convert_to_tensor(
           indices, name="indices", dtype=dtypes.int64)
-      # Always pass as_ref=True because we want to be able to update
-      # values later if it is a VariableOp.
       # TODO(touts): Consider adding mutable_values() when 'values'
       # is a VariableOp and updating users of SparseTensor.
-      values = ops.internal_convert_to_tensor(
-          values, name="values", as_ref=True)
+      values = ops.internal_convert_to_tensor(values, name="values")
       dense_shape = ops.convert_to_tensor(
           dense_shape, name="dense_shape", dtype=dtypes.int64)
     self._indices = indices
@@ -241,6 +239,30 @@ class SparseTensor(_TensorLike):
   def _override_operator(operator, func):
     _override_helper(SparseTensor, operator, func)
 
+  def _to_components(self):
+    return (self._indices, self._values, self._dense_shape)
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      shape = self.dense_shape.shape
+    if shape.ndims is None:
+      shape = tensor_shape.TensorShape([None])
+    if shape.ndims != 1:
+      raise ValueError("Shape invariant for SparseTensor must have the form "
+                       "TensorShape([r]), got %r" % shape)
+    rank = tensor_shape.dimension_value(shape[0])
+    return [tensor_shape.TensorShape([None, rank]),  # indices
+            tensor_shape.TensorShape([None]),  # values
+            tensor_shape.TensorShape([rank])]  # dense_shape
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 960a3dad7389553955c999e444a9f98c1857f588..a7537bb5f1adfe70018f50cb9a627bfffe176226 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -90,7 +90,8 @@ def disable_v2_tensorshape():
   TensorShape = TensorShapeV1
 
 
-@tf_export(v1=["dimension_value"])
+@tf_export("compat.dimension_value",
+           v1=["dimension_value", "compat.dimension_value"])
 def dimension_value(dimension):
   """Compatibility utility required to allow for both V1 and V2 behavior in TF.
 
@@ -122,7 +123,8 @@ def dimension_value(dimension):
   return dimension
 
 
-@tf_export(v1=["dimension_at_index"])
+@tf_export("compat.dimension_at_index",
+           v1=["dimension_at_index", "compat.dimension_at_index"])
 def dimension_at_index(shape, index):
   """Compatibility utility required to allow for both V1 and V2 behavior in TF.
 
@@ -269,10 +271,11 @@ class Dimension(object):
     Dimensions are combined as follows:
 
     ```python
-    tf.Dimension(n)   .merge_with(tf.Dimension(n))    == tf.Dimension(n)
-    tf.Dimension(n)   .merge_with(tf.Dimension(None)) == tf.Dimension(n)
-    tf.Dimension(None).merge_with(tf.Dimension(n))    == tf.Dimension(n)
-    tf.Dimension(None).merge_with(tf.Dimension(None)) == tf.Dimension(None)
+    tf.Dimension(n)   .merge_with(tf.Dimension(n))     == tf.Dimension(n)
+    tf.Dimension(n)   .merge_with(tf.Dimension(None))  == tf.Dimension(n)
+    tf.Dimension(None).merge_with(tf.Dimension(n))     == tf.Dimension(n)
+    # equivalent to tf.Dimension(None)
+    tf.Dimension(None).merge_with(tf.Dimension(None))
 
     # raises ValueError for n != m
     tf.Dimension(n)   .merge_with(tf.Dimension(m))
@@ -302,10 +305,10 @@ class Dimension(object):
     Dimensions are summed as follows:
 
     ```python
-    tf.Dimension(m)    + tf.Dimension(n)    == tf.Dimension(m + n)
-    tf.Dimension(m)    + tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) + tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) + tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    + tf.Dimension(n)     == tf.Dimension(m + n)
+    tf.Dimension(m)    + tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -337,10 +340,10 @@ class Dimension(object):
     Dimensions are subtracted as follows:
 
     ```python
-    tf.Dimension(m)    - tf.Dimension(n)    == tf.Dimension(m - n)
-    tf.Dimension(m)    - tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) - tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) - tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    - tf.Dimension(n)     == tf.Dimension(m - n)
+    tf.Dimension(m)    - tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -376,10 +379,10 @@ class Dimension(object):
     Dimensions are summed as follows:
 
     ```python
-    tf.Dimension(m)    * tf.Dimension(n)    == tf.Dimension(m * n)
-    tf.Dimension(m)    * tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) * tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) * tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    * tf.Dimension(n)     == tf.Dimension(m * n)
+    tf.Dimension(m)    * tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -415,10 +418,10 @@ class Dimension(object):
     Dimensions are divided as follows:
 
     ```python
-    tf.Dimension(m)    // tf.Dimension(n)    == tf.Dimension(m // n)
-    tf.Dimension(m)    // tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) // tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) // tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    // tf.Dimension(n)     == tf.Dimension(m // n)
+    tf.Dimension(m)    // tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -473,10 +476,10 @@ class Dimension(object):
     Dimension moduli are computed as follows:
 
     ```python
-    tf.Dimension(m)    % tf.Dimension(n)    == tf.Dimension(m % n)
-    tf.Dimension(m)    % tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) % tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) % tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    % tf.Dimension(n)     == tf.Dimension(m % n)
+    tf.Dimension(m)    % tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 51f71616a1bbd2d6dc729211182fedffb77036f6..ca8b067935c067f9ff8fe39b72f4ba32400b03bd 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -598,88 +598,53 @@ def MakeNdarray(tensor):
   dtype = tensor_dtype.as_numpy_dtype
 
   if tensor.tensor_content:
-    return (np.frombuffer(tensor.tensor_content, dtype=dtype).copy()
-            .reshape(shape))
-  elif tensor_dtype == dtypes.float16 or tensor_dtype == dtypes.bfloat16:
+    return (np.frombuffer(tensor.tensor_content,
+                          dtype=dtype).copy().reshape(shape))
+
+  if tensor_dtype == dtypes.string:
+    # np.pad throws on these arrays of type np.object.
+    values = list(tensor.string_val)
+    padding = num_elements - len(values)
+    if padding > 0:
+      last = values[-1] if values else ""
+      values.extend([last] * padding)
+    return np.array(values, dtype=dtype).reshape(shape)
+
+  if tensor_dtype == dtypes.float16 or tensor_dtype == dtypes.bfloat16:
     # the half_val field of the TensorProto stores the binary representation
     # of the fp16: we need to reinterpret this as a proper float16
-    if len(tensor.half_val) == 1:
-      tmp = np.array(tensor.half_val[0], dtype=np.uint16)
-      tmp.dtype = tensor_dtype.as_numpy_dtype
-      return np.repeat(tmp, num_elements).reshape(shape)
-    else:
-      tmp = np.fromiter(tensor.half_val, dtype=np.uint16)
-      tmp.dtype = tensor_dtype.as_numpy_dtype
-      return tmp.reshape(shape)
+    values = np.fromiter(tensor.half_val, dtype=np.uint16)
+    values.dtype = tensor_dtype.as_numpy_dtype
   elif tensor_dtype == dtypes.float32:
-    if len(tensor.float_val) == 1:
-      return np.repeat(
-          np.array(tensor.float_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.float_val, dtype=dtype)
   elif tensor_dtype == dtypes.float64:
-    if len(tensor.double_val) == 1:
-      return np.repeat(
-          np.array(tensor.double_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.double_val, dtype=dtype)
   elif tensor_dtype in [
       dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8,
       dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16
   ]:
-    if len(tensor.int_val) == 1:
-      return np.repeat(np.array(tensor.int_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.int_val, dtype=dtype)
   elif tensor_dtype == dtypes.int64:
-    if len(tensor.int64_val) == 1:
-      return np.repeat(
-          np.array(tensor.int64_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape)
-  elif tensor_dtype == dtypes.string:
-    if len(tensor.string_val) == 1:
-      return np.repeat(
-          np.array(tensor.string_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.array(
-          [x for x in tensor.string_val], dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.int64_val, dtype=dtype)
   elif tensor_dtype == dtypes.complex64:
     it = iter(tensor.scomplex_val)
-    if len(tensor.scomplex_val) == 2:
-      return np.repeat(
-          np.array(
-              complex(tensor.scomplex_val[0], tensor.scomplex_val[1]),
-              dtype=dtype), num_elements).reshape(shape)
-    else:
-      return np.array(
-          [complex(x[0], x[1]) for x in zip(it, it)],
-          dtype=dtype).reshape(shape)
+    values = np.array([complex(x[0], x[1]) for x in zip(it, it)], dtype=dtype)
   elif tensor_dtype == dtypes.complex128:
     it = iter(tensor.dcomplex_val)
-    if len(tensor.dcomplex_val) == 2:
-      return np.repeat(
-          np.array(
-              complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]),
-              dtype=dtype), num_elements).reshape(shape)
-    else:
-      return np.array(
-          [complex(x[0], x[1]) for x in zip(it, it)],
-          dtype=dtype).reshape(shape)
+    values = np.array([complex(x[0], x[1]) for x in zip(it, it)], dtype=dtype)
   elif tensor_dtype == dtypes.bool:
-    if len(tensor.bool_val) == 1:
-      return np.repeat(np.array(tensor.bool_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.bool_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.bool_val, dtype=dtype)
   else:
     raise TypeError("Unsupported tensor type: %s" % tensor.dtype)
 
+  if values.size == 0:
+    return np.zeros(shape, dtype)
+
+  if values.size != num_elements:
+    values = np.pad(values, (0, num_elements - values.size), "edge")
+
+  return values.reshape(shape)
+
 
 def ShapeEquals(tensor_proto, shape):
   """Returns True if "tensor_proto" has the given "shape".
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 00337546186d3a01313a49d11dd266e6dade3227..cdacdfaaada96e21d4f4d6a9fb2a9247e332969f 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -336,23 +336,16 @@ class TensorUtilTest(test.TestCase):
       self.assertAllClose(np.array([10, 20, 30], dtype=nptype), a)
 
   def testIntTypesWithImplicitRepeat(self):
-    for dtype, nptype in [(dtypes.int64, np.int64),
-                          (dtypes.int32, np.int32),
-                          (dtypes.uint8, np.uint8),
-                          (dtypes.uint16, np.uint16),
-                          (dtypes.int16, np.int16),
-                          (dtypes.int8, np.int8)]:
+    for dtype, nptype in [(dtypes.int64, np.int64), (dtypes.int32, np.int32),
+                          (dtypes.uint8, np.uint8), (dtypes.uint16, np.uint16),
+                          (dtypes.int16, np.int16), (dtypes.int8, np.int8)]:
       self.assertAllEqual(
-          np.array(
-              [[10, 10, 10, 10],
-               [10, 10, 10, 10],
-               [10, 10, 10, 10]],
-              dtype=nptype),
+          np.array([[10, 11, 12, 12], [12, 12, 12, 12], [12, 12, 12, 12]],
+                   dtype=nptype),
           tensor_util.MakeNdarray(
-              tensor_util.make_tensor_proto(
-                  [10],
-                  shape=[3, 4],
-                  dtype=dtype)))
+              tensor_util.make_tensor_proto([10, 11, 12],
+                                            shape=[3, 4],
+                                            dtype=dtype)))
 
   def testIntMixedWithDimension(self):
     # Github issue: 11974
@@ -500,9 +493,12 @@ class TensorUtilTest(test.TestCase):
     self.assertEquals([b"foo"], a)
 
   def testStringWithImplicitRepeat(self):
-    t = tensor_util.make_tensor_proto("f", shape=[3, 4])
+    t = tensor_util.make_tensor_proto(["f", "g"], shape=[3, 4])
     a = tensor_util.MakeNdarray(t)
-    self.assertAllEqual(np.array([[b"f"] * 4] * 3, dtype=np.object), a)
+    self.assertAllEqual(
+        np.array([[b"f", b"g", b"g", b"g"], [b"g", b"g", b"g", b"g"],
+                  [b"g", b"g", b"g", b"g"]],
+                 dtype=np.object), a)
 
   def testStringN(self):
     t = tensor_util.make_tensor_proto([b"foo", b"bar", b"baz"], shape=[1, 3])
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 99e184a8acd44012774917c4baaecd48bae6cbe3..1d0145f61c84969cf1b52eb070ec3f933d25741a 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -157,7 +157,7 @@ REGISTER_KERNEL_BUILDER(Name("Old").Device(DEVICE_CPU), OldOp);
 // Stubbed-out resource to test resource handle ops.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
 };
 
 REGISTER_RESOURCE_HANDLE_KERNEL(StubResource);
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index c663af2ee45722afcbd617b3e414a2213b8819cc..2bff3e457fef2911f5b6b2352e9851c5b5f6a750 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -82,6 +82,19 @@ from tensorflow.python.util.protobuf import compare
 from tensorflow.python.util.tf_export import tf_export
 
 
+# If the above import is made available through the BUILD rule, then this
+# function is overridden and will instead return True and cause Tensorflow
+# graphs to be compiled with XLA.
+def is_xla_enabled():
+  return False
+
+
+try:
+  from tensorflow.python.framework.is_xla_test_true import is_xla_enabled  # pylint: disable=g-import-not-at-top
+except:
+  pass
+
+
 @tf_export("test.gpu_device_name")
 def gpu_device_name():
   """Returns the name of a GPU device if available or the empty string."""
@@ -97,6 +110,7 @@ def assert_ops_in_graph(expected_ops, graph):
   Args:
     expected_ops: `dict<string, string>` of op name to op type.
     graph: Graph to check.
+
   Returns:
     `dict<string, node>` of node name to node.
 
@@ -149,7 +163,7 @@ def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False):
     actual: The `GraphDef` we have.
     expected: The `GraphDef` we expected.
     checkpoint_v2: boolean determining whether to ignore randomized attribute
-        values that appear in V2 checkpoints.
+      values that appear in V2 checkpoints.
 
   Raises:
     AssertionError: If the `GraphDef`s do not match.
@@ -360,7 +374,8 @@ def skip_if(condition):
 
   Args:
     condition: Either an expression that can be used in "if not condition"
-               statement, or a callable whose result should be a boolean.
+      statement, or a callable whose result should be a boolean.
+
   Returns:
     The wrapped function
   """
@@ -373,7 +388,7 @@ def skip_if(condition):
       else:
         skip = condition
       if not skip:
-        fn(*args, **kwargs)
+        return fn(*args, **kwargs)
 
     return wrapper
 
@@ -410,7 +425,7 @@ def enable_control_flow_v2(fn):
     enable_control_flow_v2_old = control_flow_util.ENABLE_CONTROL_FLOW_V2
     control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     try:
-      fn(*args, **kwargs)
+      return fn(*args, **kwargs)
     finally:
       control_flow_util.ENABLE_CONTROL_FLOW_V2 = enable_control_flow_v2_old
 
@@ -483,9 +498,11 @@ def disable_control_flow_v2(unused_msg):
   Returns:
     The wrapped function with _disable_control_flow_v2 attr set to True.
   """
+
   def wrapper(func):
     func._disable_control_flow_v2 = True
     return func
+
   return wrapper
 
 
@@ -568,6 +585,7 @@ def assert_no_new_tensors(f):
 
   Args:
     f: The test case to run.
+
   Returns:
     The decorated test case.
   """
@@ -594,9 +612,9 @@ def assert_no_new_tensors(f):
       ops.get_default_graph()._graph_key = outside_graph_key
       if outside_executed_eagerly:
         with context.eager_mode():
-          f(self, **kwargs)
+          result = f(self, **kwargs)
       else:
-        f(self, **kwargs)
+        result = f(self, **kwargs)
     # Make an effort to clear caches, which would otherwise look like leaked
     # Tensors.
     context.context()._clear_caches()  # pylint: disable=protected-access
@@ -610,6 +628,7 @@ def assert_no_new_tensors(f):
           len(tensors_after),
           str(tensors_after),
       )))
+    return result
 
   return decorator
 
@@ -726,6 +745,7 @@ def assert_no_garbage_created(f):
 
   Args:
     f: The function to decorate.
+
   Returns:
     The decorated function.
   """
@@ -734,14 +754,14 @@ def assert_no_garbage_created(f):
     """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage."""
     # Force-load `distribution_strategy_context` to prevent GC at
     # test time when using eager. Remove once b/117329403 is resolved.
-    tape.distribution_strategy_context.get_distribution_strategy()
+    tape.distribution_strategy_context.get_strategy()
 
     gc.disable()
     previous_debug_flags = gc.get_debug()
     gc.set_debug(gc.DEBUG_SAVEALL)
     gc.collect()
     previous_garbage = len(gc.garbage)
-    f(self, **kwargs)
+    result = f(self, **kwargs)
     gc.collect()
     new_garbage = len(gc.garbage)
     if new_garbage > previous_garbage:
@@ -786,6 +806,7 @@ def assert_no_garbage_created(f):
     # not hold on to every object in other tests.
     gc.set_debug(previous_debug_flags)
     gc.enable()
+    return result
 
   return decorator
 
@@ -797,8 +818,8 @@ def _combine_named_parameters(**kwargs):
   can be computed using `times()`.
 
   Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
+    **kwargs: keyword arguments of form `option=[possibilities, ...]` or
+      `option=the_only_possibility`.
 
   Returns:
     a list of dictionaries for each combination. Keys in the dictionaries are
@@ -836,8 +857,8 @@ def generate_combinations_with_testcase_name(**kwargs):
   parameterized tests.
 
   Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
+    **kwargs: keyword arguments of form `option=[possibilities, ...]` or
+      `option=the_only_possibility`.
 
   Returns:
     a list of dictionaries for each combination. Keys in the dictionaries are
@@ -865,10 +886,10 @@ def run_all_in_graph_and_eager_modes(cls):
   """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes
   for name, value in cls.__dict__.copy().items():
-    if (callable(value) and
-        name.startswith(unittest.TestLoader.testMethodPrefix) and
-        not (name.startswith("testSkipEager")
-             or name.startswith("test_skip_eager"))):
+    if callable(value) and name.startswith(
+        unittest.TestLoader.testMethodPrefix) and not (
+            name.startswith("testSkipEager") or
+            name.startswith("test_skip_eager") or name == "test_session"):
       setattr(cls, name, base_decorator(value))
   return cls
 
@@ -906,13 +927,17 @@ def run_in_graph_and_eager_modes(func=None,
   eager execution enabled as it does when constructing a TensorFlow graph and
   executing the `z` tensor in a session.
 
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
+
 
   Args:
     func: function to be annotated. If `func` is None, this method returns a
       decorator the can be applied to a function. If `func` is not None this
       returns the decorator applied to `func`.
-    config: An optional config_pb2.ConfigProto to use to configure the
-      session when executing graphs.
+    config: An optional config_pb2.ConfigProto to use to configure the session
+      when executing graphs.
     use_gpu: If True, attempt to run as many operations as possible on GPU.
     reset_test: If True, tearDown and SetUp the test case between the two
       executions of the test (once with and once without eager execution).
@@ -926,6 +951,7 @@ def run_in_graph_and_eager_modes(func=None,
       collected elsewhere in the unit test file will not work). Additionally,
       checks that nothing still has a reference to Tensors that the test
       allocated.
+
   Returns:
     Returns a decorator that will run the decorated test method twice:
     once by constructing and executing a graph in a session and once with
@@ -986,9 +1012,10 @@ def py_func_if_in_function(f):
     if not ops.get_default_graph()._building_function:
       return f(*args, **kwds)
 
-    tensor_args, tensor_indices = zip(
-        *[(x, i) for i, x in enumerate(args)
-          if isinstance(x, (ops.Tensor, variables.Variable))])
+    tensor_args, tensor_indices = zip(*[(x, i)
+                                        for i, x in enumerate(args)
+                                        if isinstance(x, (ops.Tensor,
+                                                          variables.Variable))])
 
     def inner_f(*inner_tensor_args):
       my_args = list(args)
@@ -1034,18 +1061,23 @@ def also_run_as_tf_function(f):
   return decorated
 
 
-def run_deprecated_v1(func=None):
+def deprecated_graph_mode_only(func=None):
   """Execute the decorated test in graph mode.
 
-  This function returns a decorator intended to be applied to tests that have
-  not been updated to a style that is compatible with both TensorFlow 1.x and
-  2.x. When this decorated is applied, the test body will be run in
-  an environment where API calls construct graphs instead of executing eagerly.
+  This function returns a decorator intended to be applied to tests that are not
+  compatible with eager mode. When this decorator is applied, the test body will
+  be run in an environment where API calls construct graphs instead of executing
+  eagerly.
+
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
 
   Args:
     func: function to be annotated. If `func` is None, this method returns a
       decorator the can be applied to a function. If `func` is not None this
       returns the decorator applied to `func`.
+
   Returns:
     Returns a decorator that will run the decorated test method in graph mode.
   """
@@ -1066,9 +1098,9 @@ def run_deprecated_v1(func=None):
     def decorated(self, *args, **kwargs):
       if tf2.enabled():
         with context.graph_mode():
-          f(self, *args, **kwargs)
+          return f(self, *args, **kwargs)
       else:
-        f(self, *args, **kwargs)
+        return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1078,12 +1110,19 @@ def run_deprecated_v1(func=None):
   return decorator
 
 
+run_deprecated_v1 = deprecated_graph_mode_only
+
+
 def run_v1_only(reason, func=None):
   """Execute the decorated test only if running in v1 mode.
 
   This function is intended to be applied to tests that exercise v1 only
   functionality. If the test is run in v2 mode it will simply be skipped.
 
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
+
   Args:
     reason: string giving a reason for limiting the test to v1 only.
     func: function to be annotated. If `func` is None, this method returns a
@@ -1111,7 +1150,7 @@ def run_v1_only(reason, func=None):
       if tf2.enabled():
         self.skipTest(reason)
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1127,6 +1166,10 @@ def run_v2_only(func=None):
   This function is intended to be applied to tests that exercise v2 only
   functionality. If the test is run in v1 mode it will simply be skipped.
 
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
+
   Args:
     func: function to be annotated. If `func` is None, this method returns a
       decorator the can be applied to a function. If `func` is not None this
@@ -1144,7 +1187,7 @@ def run_v2_only(func=None):
       if not tf2.enabled():
         self.skipTest("Test is only comptaible in v2")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1177,7 +1220,7 @@ def run_gpu_only(func=None):
       if not is_gpu_available():
         self.skipTest("Test requires GPU")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1210,7 +1253,7 @@ def run_cuda_only(func=None):
       if not is_gpu_available(cuda_only=True):
         self.skipTest("Test requires CUDA GPU")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1369,8 +1412,7 @@ class FakeEagerSession(object):
 
 
 class ErrorLoggingSession(session.Session):
-  """Wrapper around a Session that logs errors in run().
-  """
+  """Wrapper around a Session that logs errors in run()."""
 
   def run(self, *args, **kwargs):
     try:
@@ -1384,13 +1426,68 @@ class ErrorLoggingSession(session.Session):
       raise
 
 
+# The description is just for documentation purposes.
+def disable_xla(description):
+
+  def disable_xla_impl(func):
+    """Execute the test method only if xla is not enabled."""
+
+    def decorator(func):
+
+      def decorated(self, *args, **kwargs):
+        if is_xla_enabled():
+          return
+        else:
+          return func(self, *args, **kwargs)
+
+      return decorated
+
+    if func is not None:
+      return decorator(func)
+
+    return decorator
+
+  return disable_xla_impl
+
+
+# The description is just for documentation purposes.
+def disable_all_xla(description):
+
+  def disable_all_impl(cls):
+    """Execute all test methods in this class only if xla is not enabled."""
+    base_decorator = disable_xla
+    for name in dir(cls):
+      value = getattr(cls, name)
+      if callable(value) and name.startswith(
+          "test") and not name == "test_session":
+        setattr(cls, name, base_decorator(value))
+    return cls
+
+  return disable_all_impl
+
+
+class EagerSessionWarner(object):
+
+  def __getattr__(self, attr):
+    raise AttributeError(
+        "Trying to access properties or call methods on the result of "
+        "self.session(), self.cached_session(), etc while eager execution "
+        "is enabled. If you're porting this test case to TF 2.0, either "
+        "adapt the test to work with eager execution or insert a call to "
+        "tf.disable_eager_execution() in the main() function of this test "
+        "file.")
+
+
 @tf_export("test.TestCase")
 class TensorFlowTestCase(googletest.TestCase):
-  """Base class for tests that need to test TensorFlow.
-  """
+  """Base class for tests that need to test TensorFlow."""
 
   def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
     super(TensorFlowTestCase, self).__init__(methodName)
+    if is_xla_enabled():
+      os.putenv(
+          "TF_XLA_FLAGS", "--tf_xla_auto_jit=2 --tf_xla_min_cluster_size=1 "
+          "--tf_xla_enable_lazy_compilation=false")
     self._threads = []
     self._tempdir = None
     self._cached_session = None
@@ -1468,9 +1565,9 @@ class TensorFlowTestCase(googletest.TestCase):
     ```
 
     Args:
-      stream: The stream whose writes should be captured. This
-        stream must have a file descriptor, support writing via using that
-        file descriptor, and must have a `.flush()` method.
+      stream: The stream whose writes should be captured. This stream must have
+        a file descriptor, support writing via using that file descriptor, and
+        must have a `.flush()` method.
 
     Yields:
       A `CapturedWrites` object that contains all writes to the specified stream
@@ -1561,8 +1658,13 @@ class TensorFlowTestCase(googletest.TestCase):
     else:
       try:
         if sparse_tensor.is_sparse(tensor):
-          return sparse_tensor.SparseTensorValue(tensor.indices, tensor.values,
-                                                 tensor.dense_shape)
+          return sparse_tensor.SparseTensorValue(tensor.indices.numpy(),
+                                                 tensor.values.numpy(),
+                                                 tensor.dense_shape.numpy())
+        elif isinstance(tensor, ops.IndexedSlices):
+          return ops.IndexedSlicesValue(values=tensor.values.numpy(),
+                                        indices=tensor.indices.numpy(),
+                                        dense_shape=tensor.dense_shape.numpy())
         return tensor.numpy()
       except AttributeError as e:
         six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e)
@@ -1629,7 +1731,7 @@ class TensorFlowTestCase(googletest.TestCase):
       the graph building and execution code in a test case.
     """
     if context.executing_eagerly():
-      yield None
+      yield EagerSessionWarner()
     else:
       with self._create_session(graph, config, force_gpu) as sess:
         with self._constrain_devices_and_set_default(sess, use_gpu, force_gpu):
@@ -1819,7 +1921,6 @@ class TensorFlowTestCase(googletest.TestCase):
     self._threads.append(ret)
     return ret
 
-
   # pylint: enable=invalid-name
   @py_func_if_in_function
   def assertNear(self, f1, f2, err, msg=None):
@@ -1836,9 +1937,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     # f1 == f2 is needed here as we might have: f1, f2 = inf, inf
     self.assertTrue(
-        f1 == f2 or math.fabs(f1 - f2) <= err,
-        "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
-                               if msg is not None else ""))
+        f1 == f2 or math.fabs(f1 - f2) <= err, "%f != %f +/- %f%s" %
+        (f1, f2, err, " (%s)" % msg if msg is not None else ""))
 
   @py_func_if_in_function
   def assertArrayNear(self, farray1, farray2, err, msg=None):
@@ -2007,11 +2107,11 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       a: The expected numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor), or any arbitrarily nested of
-         structure of these.
+        numpy `ndarray` (including Tensor), or any arbitrarily nested of
+        structure of these.
       b: The actual numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor), or any arbitrarily nested of
-         structure of these.
+        numpy `ndarray` (including Tensor), or any arbitrarily nested of
+        structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
       msg: Optional message to report on failure.
@@ -2139,8 +2239,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all greater than a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2151,8 +2251,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all less than a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2163,8 +2263,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all greater than or equal to a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2175,8 +2275,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all less than or equal to a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2224,7 +2324,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       target: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+        numpy `ndarray` (including Tensor).
       lower_bound: lower bound of the range
       upper_bound: upper bound of the range
       open_lower_bound: (`bool`) whether the lower bound is open (i.e., > rather
@@ -2258,8 +2358,8 @@ class TensorFlowTestCase(googletest.TestCase):
                  str(upper_bound) + (")" if open_upper_bound else "]"))
 
     violations = (
-        np.less_equal(target, lower_bound)
-        if open_lower_bound else np.less(target, lower_bound))
+        np.less_equal(target, lower_bound) if open_lower_bound else np.less(
+            target, lower_bound))
     violations = np.logical_or(
         violations,
         np.greater_equal(target, upper_bound)
@@ -2278,7 +2378,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       target: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+        numpy `ndarray` (including Tensor).
       expected_set: (`list`, `tuple` or `set`) The closed set that the elements
         of the value of `target` are expected to fall into.
 
@@ -2300,7 +2400,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       target: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+        numpy `ndarray` (including Tensor).
       expected_dtype: Expected data type.
     """
     target = self._GetNdArray(target)
@@ -2321,9 +2421,9 @@ class TensorFlowTestCase(googletest.TestCase):
     Args:
       exception_type: The expected type of exception that should be raised.
       expected_err_re_or_predicate: If this is callable, it should be a function
-        of one argument that inspects the passed-in exception and
-        returns True (success) or False (please fail the test). Otherwise, the
-        error message is expected to match this regular expression partially.
+        of one argument that inspects the passed-in exception and returns True
+        (success) or False (please fail the test). Otherwise, the error message
+        is expected to match this regular expression partially.
 
     Returns:
       A context manager to surround code that is expected to raise an
@@ -2424,6 +2524,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
   def _create_session(self, graph, config, force_gpu):
     """See session() for details."""
+
     def prepare_config(config):
       """Returns a config for sessions.
 
@@ -2526,10 +2627,10 @@ def create_local_cluster(num_workers,
   Args:
     num_workers: Number of worker servers to start.
     num_ps: Number of PS servers to start.
-    protocol: Communication protocol.  Allowed values are documented in
-      the documentation of `tf.train.Server`.
-    worker_config: (optional) ConfigProto to initialize workers. Can be used
-      to instantiate multiple devices etc.
+    protocol: Communication protocol.  Allowed values are documented in the
+      documentation of `tf.train.Server`.
+    worker_config: (optional) ConfigProto to initialize workers. Can be used to
+      instantiate multiple devices etc.
     ps_config: (optional) ConfigProto to initialize PS servers.
 
   Returns:
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 87795ffcfb5d21c408d646e581e19fe23a37b945..af9276c508b1db1e57a0dc8690cd5d6dfd0574e5 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -132,7 +132,7 @@ struct GCluster {
 
 static GCluster TF_NewCluster(bool allow_soft_placement,
                    bool disable_detailed_stats, TF_Status* out_status) {
-    int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   int timeout_s = 60 * 10;
   tensorflow::grappler::Cluster* cluster_ =
@@ -176,13 +176,13 @@ tensorflow::Status _GetOpPerformanceDataAndRunTime(
   tensorflow::Status status = cost_measure->Initialize(item);
   if (!status.ok()) return status;
 
-  tensorflow::CostGraphDef cost_graph;
+  tensorflow::RunMetadata run_metadata;
   TF_RETURN_IF_ERROR(
-      cost_measure->PredictCosts(item.graph, &cost_graph, costs));
+      cost_measure->PredictCosts(item.graph, &run_metadata, costs));
 
   if (op_performance_data) {
     *op_performance_data = tensorflow::grappler::CostGraphToOpPerformanceData(
-        cost_graph, item.graph);
+        run_metadata.cost_graph(), item.graph);
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/python/grappler/cluster.py b/tensorflow/python/grappler/cluster.py
index 079d07115b31da86600821a098aec08ec60bf436..428b52402cffc16bd692cac5839494a617815236 100644
--- a/tensorflow/python/grappler/cluster.py
+++ b/tensorflow/python/grappler/cluster.py
@@ -71,26 +71,21 @@ class Cluster(object):
     return self._tf_cluster
 
   def ListDevices(self):
-    """Returns the list of available hardware devices."""
-    devices = []
-    if self._tf_cluster is not None:
-      ret_from_swig = tf_cluster.TF_ListDevices(self._tf_cluster)
-      devices = []
-      for raw_dev in ret_from_swig:
-        devices.append(device_properties_pb2.NamedDevice.FromString(raw_dev))
-    return devices
+    """Returns a list of available hardware devices."""
+    if self._tf_cluster is None:
+      return []
+    return [device_properties_pb2.NamedDevice.FromString(device)
+            for device in tf_cluster.TF_ListDevices(self._tf_cluster)]
 
   def ListAvailableOps(self):
-    """Returns a list of all the available operations (sorted alphatically)."""
+    """Returns a list of all available operations (sorted alphabetically)."""
     return tf_cluster.TF_ListAvailableOps()
 
   def GetSupportedDevices(self, item):
     return tf_cluster.TF_GetSupportedDevices(self._tf_cluster, item.tf_item)
 
   def EstimatePerformance(self, device):
-    """Estimate the performance of the specified device."""
-    serialized = device.SerializeToString()
-    return tf_cluster.TF_EstimatePerformance(serialized)
+    return tf_cluster.TF_EstimatePerformance(device.SerializeToString())
 
   def MeasureCosts(self, item):
     """Returns the cost of running the specified item.
@@ -107,10 +102,8 @@ class Cluster(object):
       return None
 
     op_perf_bytes_list, run_time, step_stats_bytes = ret_from_swig
-    op_perfs = []
-    for op_perf_bytes in op_perf_bytes_list:
-      op_perfs.append(
-          op_performance_data_pb2.OpPerformance.FromString(op_perf_bytes))
+    op_perfs = [op_performance_data_pb2.OpPerformance.FromString(op_perf_bytes)
+                for op_perf_bytes in op_perf_bytes_list]
     return (op_perfs, run_time,
             step_stats_pb2.StepStats.FromString(step_stats_bytes))
 
@@ -122,11 +115,9 @@ class Cluster(object):
     Returns: A hashtable indexed by device name.
     """
     with errors.raise_exception_on_not_ok_status() as status:
-      ret_from_swig = tf_cluster.TF_DeterminePeakMemoryUsage(
+      return tf_cluster.TF_DeterminePeakMemoryUsage(
           item.tf_item, self._tf_cluster, status)
 
-    return ret_from_swig
-
 
 @contextlib.contextmanager
 def Provision(allow_soft_placement=True,
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index b474e19894957d01c7c8978282c547df81a9b2b3..9aa5fbca383d126ebb927a7e47fc714503fcefed 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -42,9 +42,13 @@ Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report,
 void CostAnalyzer::PredictCosts(CostEstimator* cost_estimator,
                                 CostGraphDef* cost_graph, int64* total_time) {
   TF_CHECK_OK(cost_estimator->Initialize(*item_));
+  RunMetadata run_metadata;
   Costs costs;
   const Status status =
-      cost_estimator->PredictCosts(item_->graph, cost_graph, &costs);
+      cost_estimator->PredictCosts(item_->graph, &run_metadata, &costs);
+  if (cost_graph) {
+    cost_graph->Swap(run_metadata.mutable_cost_graph());
+  }
   *total_time = costs.execution_time.count();
   if (!status.ok()) {
     LOG(ERROR) << "Could not estimate the cost for item " << item_->id << ": "
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 47fb480aa1987c67736a6b6dbbf798f8ebc3e93a..e879254dae52cadda953146f0121bf9cf3b18e42 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -1,5 +1,7 @@
 # Description:
 #   Contains the Keras API (internal TensorFlow version).
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -7,9 +9,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-
 config_setting(
     name = "empty_condition",
     values = {"define": "UNUSED=unused"},
@@ -62,13 +61,11 @@ py_library(
         ":engine",
         ":layers",
         ":pil_for_keras",
-        "@keras_applications_archive//:keras_applications",
+        ":saving",
         "//tensorflow/python:training",
         "//tensorflow/python/keras/optimizer_v2",
-        # TODO(kathywu): move saving into engine after resolving circular
-        # dependencies between Keras and SavedModel
-        "//tensorflow/python/keras/saving",
         "//tensorflow/python/saved_model",
+        "@keras_applications_archive//:keras_applications",
     ],
 )
 
@@ -85,6 +82,7 @@ py_library(
     srcs = ["backend.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":backend_config",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -100,6 +98,7 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:init_ops_v2",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
@@ -119,12 +118,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "backend_config",
+    srcs = ["backend_config.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "engine",
     srcs = [
-        "activations.py",
-        "callbacks.py",
-        "constraints.py",
         "engine/__init__.py",
         "engine/base_layer.py",
         "engine/base_layer_utils.py",
@@ -140,29 +142,152 @@ py_library(
         "engine/training_eager.py",
         "engine/training_generator.py",
         "engine/training_utils.py",
-        "initializers.py",
-        "losses.py",
-        "metrics.py",
+        "metrics.py",  # Need base_layer
         "models.py",
-        "optimizers.py",
-        "regularizers.py",
-        "utils/data_utils.py",
-        "utils/io_utils.py",
-        "utils/losses_utils.py",
         "utils/metrics_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":activations",
         ":backend",
+        ":callbacks",
+        ":constraints",
+        ":engine_utils",
+        ":initializers",
+        ":losses",
+        ":optimizers",
+        ":regularizers",
+        ":saving",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute:distribute_coordinator",
         "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/training/checkpointable:data_structures",
         "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
 )
 
+py_library(
+    name = "saving",
+    srcs = [
+        "saving/__init__.py",
+        "saving/hdf5_format.py",
+        "saving/model_config.py",
+        "saving/saved_model.py",
+        "saving/saving_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        ":optimizers",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:mode_keys",
+        "//tensorflow/python:saver",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python/saved_model/model_utils",
+    ],
+)
+
+py_library(
+    name = "activations",
+    srcs = [
+        "activations.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "callbacks",
+    srcs = [
+        "callbacks.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "constraints",
+    srcs = [
+        "constraints.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "initializers",
+    srcs = [
+        "initializers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "losses",
+    srcs = [
+        "losses.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "optimizers",
+    srcs = [
+        "optimizers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        "//tensorflow/python/keras/optimizer_v2",
+    ],
+)
+
+py_library(
+    name = "regularizers",
+    srcs = [
+        "regularizers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "engine_utils",
+    srcs = [
+        "utils/conv_utils.py",
+        "utils/data_utils.py",
+        "utils/io_utils.py",
+        "utils/losses_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+    ],
+)
+
 py_library(
     name = "layers",
     srcs = [
@@ -173,6 +298,7 @@ py_library(
         "layers/core.py",
         "layers/cudnn_recurrent.py",
         "layers/embeddings.py",
+        "layers/kernelized.py",
         "layers/local.py",
         "layers/merge.py",
         "layers/noise.py",
@@ -181,8 +307,8 @@ py_library(
         "layers/recurrent.py",
         "layers/serialization.py",
         "layers/wrappers.py",
-        "utils/conv_utils.py",
         "utils/generic_utils.py",
+        "utils/kernelized_utils.py",
         "utils/layer_utils.py",
         "utils/tf_utils.py",
     ],
@@ -194,199 +320,212 @@ py_library(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:standard_ops",
-        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python:variables",
         "//tensorflow/python/distribute:distribute_lib",
+        "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "integration_test",
     size = "medium",
     srcs = ["integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:nn",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 12,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "constraints_test",
     size = "small",
     srcs = ["constraints_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "initializers_test",
     size = "small",
     srcs = ["initializers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:init_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "regularizers_test",
     size = "small",
     srcs = ["regularizers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "optimizers_test",
     size = "medium",
     srcs = ["optimizers_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 8,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "losses_test",
     size = "small",
     srcs = ["losses_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "metrics_functional_test",
-    size = "medium",
+    size = "small",
     srcs = ["metrics_functional_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "metrics_test",
     size = "medium",
     srcs = ["metrics_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
+tf_py_test(
+    name = "metrics_confusion_matrix_test",
+    size = "medium",
+    srcs = ["metrics_confusion_matrix_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "applications_test",
     size = "enormous",
     srcs = ["applications/applications_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
 )
 
-py_test(
+tf_py_test(
     name = "advanced_activations_test",
     size = "medium",
     srcs = ["layers/advanced_activations_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "convolutional_recurrent_test",
     size = "large",
     srcs = ["layers/convolutional_recurrent_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
 )
 
-py_test(
+cuda_py_test(
     name = "convolutional_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/convolutional_test.py"],
-    shard_count = 11,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 8,
+)
+
+cuda_py_test(
+    name = "convolutional_transpose_test",
+    size = "medium",
+    srcs = ["layers/convolutional_transpose_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
 cuda_py_test(
     name = "cudnn_recurrent_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/cudnn_recurrent_test.py"],
     additional_deps = [
         ":keras",
@@ -394,34 +533,33 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 2,
+    shard_count = 4,
     tags = ["no_windows_gpu"],
 )
 
-py_test(
+tf_py_test(
     name = "pooling_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/pooling_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 8,
 )
 
-py_test(
+tf_py_test(
     name = "core_test",
     size = "medium",
     srcs = ["layers/core_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
 )
 
 cuda_py_test(
@@ -435,119 +573,124 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "local_test",
     size = "medium",
     srcs = ["layers/local_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "merge_test",
     size = "small",
     srcs = ["layers/merge_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "noise_test",
     size = "small",
     srcs = ["layers/noise_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "normalization_test",
     size = "medium",
     srcs = ["layers/normalization_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "simplernn_test",
     size = "medium",
     srcs = ["layers/simplernn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "gru_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/gru_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # http://b/62136390
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],  # http://b/62136390
 )
 
-py_test(
+tf_py_test(
     name = "lstm_test",
     size = "medium",
     srcs = ["layers/lstm_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
         "noasan",  # times out b/63678675
         "notsan",  # http://b/62189182
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "recurrent_test",
     size = "medium",
     srcs = ["layers/recurrent_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "separable_convolutional_test",
+    size = "medium",
+    srcs = ["layers/separable_convolutional_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
@@ -561,7 +704,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 4,
+    shard_count = 8,
 )
 
 cuda_py_test(
@@ -574,58 +717,87 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 4,
+    shard_count = 6,
 )
 
-py_test(
+tf_py_test(
     name = "serialization_test",
     size = "small",
     srcs = ["layers/serialization_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "kernelized_test",
+    size = "small",
+    srcs = ["layers/kernelized_test.py"],
+    additional_deps = [
+        ":backend",
+        ":initializers",
+        ":keras",
+        ":layers",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/eager:context",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "wrappers_test",
     size = "medium",
     srcs = ["layers/wrappers_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
         "noasan",  # http://b/78599823
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "scikit_learn_test",
     size = "small",
     srcs = ["wrappers/scikit_learn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "data_utils_test",
-    size = "large",
+    size = "medium",
     srcs = ["utils/data_utils_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 6,
     tags = [
         "no_oss",
         "no_windows",
@@ -633,64 +805,66 @@ py_test(
         "notsan",
         "optonly",  # times out
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "generic_utils_test",
     size = "small",
     srcs = ["utils/generic_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_utils_test",
     size = "small",
     srcs = ["utils/tf_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "io_utils_test",
     size = "small",
     srcs = ["utils/io_utils_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     tags = [
         "no_windows",  # TODO: needs investigation on Windows
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "np_utils_test",
     size = "small",
     srcs = ["utils/np_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "kernelized_utils_test",
+    size = "small",
+    srcs = ["utils/kernelized_utils_test.py"],
+    additional_deps = [
+        ":layers",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
     ],
 )
 
@@ -721,287 +895,309 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "conv_utils_test",
     size = "small",
     srcs = ["utils/conv_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "image_test",
     size = "medium",
     srcs = ["preprocessing/image_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_test",
     size = "small",
     srcs = ["preprocessing/sequence_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "text_test",
     size = "small",
     srcs = ["preprocessing/text_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "callbacks_test",
     size = "medium",
     srcs = ["callbacks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "correctness_test",
     size = "medium",
     srcs = ["engine/correctness_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_test",
     size = "medium",
     srcs = ["engine/training_test.py"],
-    shard_count = 16,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 16,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_dataset_test",
     size = "medium",
     srcs = ["engine/training_dataset_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "training_generator_test",
-    size = "large",
+    size = "medium",
     srcs = ["engine/training_generator_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 6,
     tags = [
         "no_oss",
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "feature_columns_integration_test",
     size = "small",
     srcs = ["engine/feature_columns_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_eager_test",
     size = "medium",
     srcs = ["engine/training_eager_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_utils_test",
     size = "medium",
     srcs = ["engine/training_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "topology_test",
     size = "medium",
     srcs = ["engine/topology_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no-internal-py3",
-    ],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "no-internal-py3",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "base_layer_test",
     size = "small",
     srcs = ["engine/base_layer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
-    name = "saving_test",
+tf_py_test(
+    name = "hdf5_format_test",
     size = "medium",
-    srcs = ["engine/saving_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    srcs = ["saving/hdf5_format_test.py"],
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "sequential_test",
     size = "medium",
     srcs = ["engine/sequential_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "models_test",
     size = "medium",
     srcs = ["models_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67509773
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 8,
+    tags = ["notsan"],  # b/67509773
 )
 
-py_test(
+tf_py_test(
     name = "backend_test",
     size = "medium",
     srcs = ["backend_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "backend_config_test",
+    size = "medium",
+    srcs = ["backend_config_test.py"],
+    additional_deps = [
+        ":keras",
         "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "keras_parameterized_test",
     size = "small",
     srcs = ["keras_parameterized_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+    ],
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "saved_model_test",
+    size = "medium",
+    srcs = ["saving/saved_model_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "no_oss",  # TODO(b/119349471): Re-enable
+        "no_windows",
+    ],
+)
+
+tf_py_test(
+    name = "saving_utils_test",
+    size = "medium",
+    srcs = ["saving/saving_utils_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index e59744f64d0fee708b3de9fe492a666c3769590b..f024b9b59a21df6e6771e89ef00428bcaaf49524 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -42,8 +42,6 @@ from tensorflow.python.keras import wrappers
 from tensorflow.python.keras.layers import Input
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.models import Sequential
-from tensorflow.python.keras.saving.saved_model import export
-from tensorflow.python.keras.saving.saved_model import load_from_saved_model
 
 from tensorflow.python.util.tf_export import keras_export
 
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 42d94e77a0585250cd234d1813e1b366f95aba94..80921c9761a3d7da92f98966ab0e15615a6c6070 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -40,6 +40,7 @@ from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend_config
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -86,26 +87,14 @@ _GRAPH_LEARNING_PHASES = weakref.WeakKeyDictionary()
 
 # _DUMMY_EAGER_GRAPH is used as a key in _GRAPH_LEARNING_PHASES.
 # We keep a separate reference to it to make sure it does not get removed from
-# _GRAPH_LEARNING_PHASES. We use a dummy class instead of something like a
-# string because strings are not weakly-referencable.
-class _DummyEagerGraph(object):
-  pass
-_DUMMY_EAGER_GRAPH = _DummyEagerGraph()
+# _GRAPH_LEARNING_PHASES.
+_DUMMY_EAGER_GRAPH = threading.local()
 
 # This boolean flag can be set to True to leave variable initialization
 # up to the user.
 # Change its value via `manual_variable_initialization(value)`.
 _MANUAL_VAR_INIT = False
 
-# The type of float to use throughout a session.
-_FLOATX = 'float32'
-
-# Epsilon fuzz factor used throughout the codebase.
-_EPSILON = 1e-7
-
-# Default image data format, one of "channels_last", "channels_first".
-_IMAGE_DATA_FORMAT = 'channels_last'
-
 # This list holds the available devices.
 # It is populated when `_get_available_gpus()` is called for the first time.
 # We assume our devices don't change henceforth.
@@ -119,6 +108,14 @@ _GRAPH_VARIABLES = weakref.WeakKeyDictionary()
 # the graph.
 _GRAPH_TF_OPTIMIZERS = weakref.WeakKeyDictionary()
 
+# The below functions are kept accessible from backend for compatibility.
+epsilon = backend_config.epsilon
+floatx = backend_config.floatx
+image_data_format = backend_config.image_data_format
+set_epsilon = backend_config.set_epsilon
+set_floatx = backend_config.set_floatx
+set_image_data_format = backend_config.set_image_data_format
+
 
 @keras_export('keras.backend.backend')
 def backend():
@@ -132,87 +129,6 @@ def backend():
   return 'tensorflow'
 
 
-@keras_export('keras.backend.epsilon')
-def epsilon():
-  """Returns the value of the fuzz factor used in numeric expressions.
-
-  Returns:
-      A float.
-
-  Example:
-  ```python
-      >>> keras.backend.epsilon()
-      1e-07
-  ```
-  """
-  return _EPSILON
-
-
-@keras_export('keras.backend.set_epsilon')
-def set_epsilon(value):
-  """Sets the value of the fuzz factor used in numeric expressions.
-
-  Arguments:
-      value: float. New value of epsilon.
-
-  Example:
-  ```python
-      >>> from keras import backend as K
-      >>> K.epsilon()
-      1e-07
-      >>> K.set_epsilon(1e-05)
-      >>> K.epsilon()
-      1e-05
-  ```
-  """
-  global _EPSILON
-  _EPSILON = value
-
-
-@keras_export('keras.backend.floatx')
-def floatx():
-  """Returns the default float type, as a string.
-
-  E.g. 'float16', 'float32', 'float64'.
-
-  Returns:
-      String, the current default float type.
-
-  Example:
-  ```python
-      >>> keras.backend.floatx()
-      'float32'
-  ```
-  """
-  return _FLOATX
-
-
-@keras_export('keras.backend.set_floatx')
-def set_floatx(value):
-  """Sets the default float type.
-
-  Arguments:
-      value: String; 'float16', 'float32', or 'float64'.
-
-  Example:
-  ```python
-      >>> from keras import backend as K
-      >>> K.floatx()
-      'float32'
-      >>> K.set_floatx('float16')
-      >>> K.floatx()
-      'float16'
-  ```
-
-  Raises:
-      ValueError: In case of invalid value.
-  """
-  global _FLOATX
-  if value not in {'float16', 'float32', 'float64'}:
-    raise ValueError('Unknown floatx type: ' + str(value))
-  _FLOATX = str(value)
-
-
 @keras_export('keras.backend.cast_to_floatx')
 def cast_to_floatx(x):
   """Cast a Numpy array to the default Keras float type.
@@ -238,49 +154,7 @@ def cast_to_floatx(x):
       dtype('float32')
   ```
   """
-  return np.asarray(x, dtype=_FLOATX)
-
-
-@keras_export('keras.backend.image_data_format')
-def image_data_format():
-  """Returns the default image data format convention.
-
-  Returns:
-      A string, either `'channels_first'` or `'channels_last'`
-
-  Example:
-  ```python
-      >>> keras.backend.image_data_format()
-      'channels_first'
-  ```
-  """
-  return _IMAGE_DATA_FORMAT
-
-
-@keras_export('keras.backend.set_image_data_format')
-def set_image_data_format(data_format):
-  """Sets the value of the image data format convention.
-
-  Arguments:
-      data_format: string. `'channels_first'` or `'channels_last'`.
-
-  Example:
-  ```python
-      >>> from keras import backend as K
-      >>> K.image_data_format()
-      'channels_first'
-      >>> K.set_image_data_format('channels_last')
-      >>> K.image_data_format()
-      'channels_last'
-  ```
-
-  Raises:
-      ValueError: In case of invalid `data_format` value.
-  """
-  global _IMAGE_DATA_FORMAT
-  if data_format not in {'channels_last', 'channels_first'}:
-    raise ValueError('Unknown data_format: ' + str(data_format))
-  _IMAGE_DATA_FORMAT = str(data_format)
+  return np.asarray(x, dtype=floatx())
 
 
 # A global dictionary mapping graph objects to an index of counters used
@@ -377,12 +251,24 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  if context.executing_eagerly():
-    if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
-      # Fallback to inference mode as default.
-      return 0
-    return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
-  return symbolic_learning_phase()
+  if ops.get_default_graph() is _GRAPH:
+    # Don't enter an init_scope for the learning phase if eager execution
+    # is enabled but we're inside the Keras workspace graph.
+    return symbolic_learning_phase()
+  with ops.init_scope():
+    # We always check & set the learning phase inside the init_scope,
+    # otherwise the wrong default_graph will be used to look up the learning
+    # phase inside of functions & defuns.
+    #
+    # This is because functions & defuns (both in graph & in eager mode)
+    # will always execute non-eagerly using a function-specific default
+    # subgraph.
+    if context.executing_eagerly():
+      if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
+        # Fallback to inference mode as default.
+        return 0
+      return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+    return symbolic_learning_phase()
 
 
 def symbolic_learning_phase():
@@ -410,11 +296,25 @@ def set_learning_phase(value):
     raise ValueError('Expected learning phase to be 0 or 1.')
   with ops.init_scope():
     if context.executing_eagerly():
+      # In an eager context, the learning phase values applies to both the eager
+      # context and the internal Keras graph.
       _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
-    else:
-      _GRAPH_LEARNING_PHASES[get_graph()] = value
+    _GRAPH_LEARNING_PHASES[get_graph()] = value
+
 
+def set_eager_learning_phase(value):
+  """Internal utility that sets the learning phase in eager execution only.
 
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  assert value in {0, 1}
+  assert context.executing_eagerly()
+  _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
+
+
+@keras_export('keras.backend.learning_phase_scope')
 @tf_contextlib.contextmanager
 def learning_phase_scope(value):
   """Provides a scope within which the learning phase is equal to `value`.
@@ -425,24 +325,62 @@ def learning_phase_scope(value):
      value: Learning phase value, either 0 or 1 (integers).
 
   Yields:
-    The provided value.
+    None.
 
   Raises:
      ValueError: if `value` is neither `0` nor `1`.
   """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
     raise ValueError('Expected learning phase to be 0 or 1.')
-  previous_value = learning_phase()
+
+  with ops.init_scope():
+    if context.executing_eagerly():
+      previous_eager_value = _GRAPH_LEARNING_PHASES.get(
+          _DUMMY_EAGER_GRAPH, None)
+    previous_graph_value = _GRAPH_LEARNING_PHASES.get(get_graph(), None)
+
   try:
     set_learning_phase(value)
-    yield value
+    yield
   finally:
     # Restore learning phase to initial value.
     with ops.init_scope():
       if context.executing_eagerly():
-        _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
-      else:
-        _GRAPH_LEARNING_PHASES[get_graph()] = previous_value
+        if previous_eager_value is not None:
+          _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_eager_value
+        elif _DUMMY_EAGER_GRAPH in _GRAPH_LEARNING_PHASES:
+          del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+
+      graph = get_graph()
+      if previous_graph_value is not None:
+        _GRAPH_LEARNING_PHASES[graph] = previous_graph_value
+      elif graph in _GRAPH_LEARNING_PHASES:
+        del _GRAPH_LEARNING_PHASES[graph]
+
+@tf_contextlib.contextmanager
+def eager_learning_phase_scope(value):
+  """Internal scope that sets the learning phase in eager execution only.
+
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
+
+  Yields:
+    None.
+
+  Raises:
+     ValueError: if `value` is neither `0` nor `1`.
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  assert value in {0, 1}
+  assert context.executing_eagerly()
+  previous_value = learning_phase()
+  try:
+    _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
+    yield
+  finally:
+    # Restore learning phase to initial value.
+    _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
 
 
 def _get_session():
@@ -1681,6 +1619,7 @@ def prod(x, axis=None, keepdims=False):
   return math_ops.reduce_prod(x, axis, keepdims)
 
 
+@keras_export('keras.backend.cumsum')
 def cumsum(x, axis=0):
   """Cumulative sum of the values in a tensor, alongside the specified axis.
 
@@ -1694,6 +1633,7 @@ def cumsum(x, axis=0):
   return math_ops.cumsum(x, axis=axis)
 
 
+@keras_export('keras.backend.cumprod')
 def cumprod(x, axis=0):
   """Cumulative product of the values in a tensor, alongside the specified axis.
 
@@ -2794,6 +2734,11 @@ def get_value(x):
   """
   if context.executing_eagerly():
     return x.numpy()
+  elif not getattr(x, '_in_graph_mode', True):
+    # This is a variable which was created in an eager context, but is being
+    # evaluated from a Graph.
+    with context.eager_mode():
+      return x.numpy()
   elif ops.inside_function():
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   return x.eval(session=get_session())
@@ -3133,7 +3078,7 @@ class EagerExecutionFunction(object):
     # the relevant subgraph?
     graph.inputs = self.inputs + list(graph.captures.values())
     graph.outputs = self.outputs
-    graph_fn = eager_function.Function(graph)
+    graph_fn = eager_function.ConcreteFunction(graph)
     graph_fn._num_positional_args = len(self.inputs)
     graph_fn._arg_keywords = []
     self._graph_fn = graph_fn
@@ -3861,7 +3806,8 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1):
       axis = axis % len(output.shape)
       # scale preds so that the class probas of each sample sum to 1
       output = output / math_ops.reduce_sum(output, axis, True)
-      # manual computation of crossentropy
+
+      # Compute cross entropy from probabilities.
       epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
       output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
       return -math_ops.reduce_sum(target * math_ops.log(output), axis)
@@ -3943,10 +3889,13 @@ def binary_crossentropy(target, output, from_logits=False):
   """
   if not from_logits:
     if context.executing_eagerly() or output.op.type != 'Sigmoid':
-      # transform back to logits
       epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-      output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
-      output = math_ops.log(output / (1 - output))
+      output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
+
+      # Compute cross entropy from probabilities.
+      bce = target * math_ops.log(output + epsilon())
+      bce += (1 - target) * math_ops.log(1 - output + epsilon())
+      return -bce
     else:
       # When sigmoid activation function is used for output operation, we
       # use logits from the sigmoid function directly to compute loss in order
diff --git a/tensorflow/python/keras/backend_config.py b/tensorflow/python/keras/backend_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7c63ac2c72df1f64e8b6ee4eafbaf75e56c1314
--- /dev/null
+++ b/tensorflow/python/keras/backend_config.py
@@ -0,0 +1,126 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras backend config API."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.tf_export import keras_export
+
+# The type of float to use throughout a session.
+_FLOATX = 'float32'
+
+# Epsilon fuzz factor used throughout the codebase.
+_EPSILON = 1e-7
+
+# Default image data format, one of "channels_last", "channels_first".
+_IMAGE_DATA_FORMAT = 'channels_last'
+
+
+@keras_export('keras.backend.epsilon')
+def epsilon():
+  """Returns the value of the fuzz factor used in numeric expressions.
+
+  Returns:
+      A float.
+
+  Example:
+  ```python
+  keras.backend.epsilon() >>>1e-07
+  ```
+  """
+  return _EPSILON
+
+
+@keras_export('keras.backend.set_epsilon')
+def set_epsilon(value):
+  """Sets the value of the fuzz factor used in numeric expressions.
+
+  Arguments:
+      value: float. New value of epsilon.
+  Example: ```python from keras import backend as K K.epsilon() >>> 1e-07
+    K.set_epsilon(1e-05) K.epsilon() >>> 1e-05 ```
+  """
+  global _EPSILON
+  _EPSILON = value
+
+
+@keras_export('keras.backend.floatx')
+def floatx():
+  """Returns the default float type, as a string.
+
+  E.g. 'float16', 'float32', 'float64'.
+
+  Returns:
+      String, the current default float type.
+
+  Example:
+  ```python
+  keras.backend.floatx() >>> 'float32'
+  ```
+  """
+  return _FLOATX
+
+
+@keras_export('keras.backend.set_floatx')
+def set_floatx(value):
+  """Sets the default float type.
+
+  Arguments:
+      value: String; 'float16', 'float32', or 'float64'.
+  Example: ```python from keras import backend as K K.floatx() >>> 'float32'
+    K.set_floatx('float16') K.floatx() >>> 'float16' ```
+
+  Raises:
+      ValueError: In case of invalid value.
+  """
+  global _FLOATX
+  if value not in {'float16', 'float32', 'float64'}:
+    raise ValueError('Unknown floatx type: ' + str(value))
+  _FLOATX = str(value)
+
+
+@keras_export('keras.backend.image_data_format')
+def image_data_format():
+  """Returns the default image data format convention.
+
+  Returns:
+      A string, either `'channels_first'` or `'channels_last'`
+
+  Example:
+  ```python
+  keras.backend.image_data_format() >>> 'channels_first'
+  ```
+  """
+  return _IMAGE_DATA_FORMAT
+
+
+@keras_export('keras.backend.set_image_data_format')
+def set_image_data_format(data_format):
+  """Sets the value of the image data format convention.
+
+  Arguments:
+      data_format: string. `'channels_first'` or `'channels_last'`.
+  Example: ```python from keras import backend as K K.image_data_format() >>>
+    'channels_first' K.set_image_data_format('channels_last')
+    K.image_data_format() >>> 'channels_last' ```
+
+  Raises:
+      ValueError: In case of invalid `data_format` value.
+  """
+  global _IMAGE_DATA_FORMAT
+  if data_format not in {'channels_last', 'channels_first'}:
+    raise ValueError('Unknown data_format: ' + str(data_format))
+  _IMAGE_DATA_FORMAT = str(data_format)
diff --git a/tensorflow/python/keras/backend_config_test.py b/tensorflow/python/keras/backend_config_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e003d81e196868654313fe08e53fb261a2baa0
--- /dev/null
+++ b/tensorflow/python/keras/backend_config_test.py
@@ -0,0 +1,55 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for backend_config."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BackendConfigTest(test.TestCase):
+
+  def test_backend(self):
+    self.assertEqual(keras.backend.backend(), 'tensorflow')
+
+  def test_espilon(self):
+    epsilon = 1e-2
+    keras.backend_config.set_epsilon(epsilon)
+    self.assertEqual(keras.backend_config.epsilon(), epsilon)
+    keras.backend_config.set_epsilon(1e-7)
+    self.assertEqual(keras.backend_config.epsilon(), 1e-7)
+
+  def test_floatx(self):
+    floatx = 'float64'
+    keras.backend_config.set_floatx(floatx)
+    self.assertEqual(keras.backend_config.floatx(), floatx)
+    keras.backend_config.set_floatx('float32')
+    self.assertEqual(keras.backend_config.floatx(), 'float32')
+
+  def test_image_data_format(self):
+    image_data_format = 'channels_first'
+    keras.backend_config.set_image_data_format(image_data_format)
+    self.assertEqual(keras.backend_config.image_data_format(),
+                     image_data_format)
+    keras.backend_config.set_image_data_format('channels_last')
+    self.assertEqual(keras.backend_config.image_data_format(), 'channels_last')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 4b83f0bf664e3cdffee889f504dc2fc47a94a1ce..f43a3055cbc115348bea12c9acd3f44b4ce73ded 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -99,24 +99,6 @@ class BackendUtilsTest(test.TestCase):
   def test_backend(self):
     self.assertEqual(keras.backend.backend(), 'tensorflow')
 
-  def test_espilon(self):
-    epsilon = 1e-2
-    keras.backend.set_epsilon(epsilon)
-    self.assertEqual(keras.backend.epsilon(), epsilon)
-    keras.backend.set_epsilon(1e-7)
-
-  def test_floatx(self):
-    floatx = 'float64'
-    keras.backend.set_floatx(floatx)
-    self.assertEqual(keras.backend.floatx(), floatx)
-    keras.backend.set_floatx('float32')
-
-  def test_image_data_format(self):
-    image_data_format = 'channels_first'
-    keras.backend.set_image_data_format(image_data_format)
-    self.assertEqual(keras.backend.image_data_format(), image_data_format)
-    keras.backend.set_image_data_format('channels_last')
-
   def test_get_reset_uids(self):
     self.assertEqual(keras.backend.get_uid('foo'), 1)
     self.assertEqual(keras.backend.get_uid('foo'), 2)
@@ -126,34 +108,55 @@ class BackendUtilsTest(test.TestCase):
 
   def test_learning_phase(self):
     with self.cached_session() as sess:
-      keras.backend.set_learning_phase(1)
-      self.assertEqual(keras.backend.learning_phase(), 1)
       with self.assertRaises(ValueError):
         keras.backend.set_learning_phase(2)
 
       # Test running with a learning-phase-consuming layer
-      keras.backend.set_learning_phase(0)
-      x = keras.Input((3,))
-      y = keras.layers.BatchNormalization()(x)
-      if not context.executing_eagerly():
-        self.evaluate(variables.global_variables_initializer())
-        sess.run(y, feed_dict={x: np.random.random((2, 3))})
+      with keras.backend.learning_phase_scope(0):
+        x = keras.Input((3,))
+        y = keras.layers.BatchNormalization()(x)
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          sess.run(y, feed_dict={x: np.random.random((2, 3))})
 
   def test_learning_phase_scope(self):
-    with self.cached_session():
-      initial_learning_phase = keras.backend.learning_phase()
-      with keras.backend.learning_phase_scope(1) as lp:
-        self.assertEqual(lp, 1)
-        self.assertEqual(keras.backend.learning_phase(), 1)
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
-      with keras.backend.learning_phase_scope(0) as lp:
-        self.assertEqual(lp, 0)
-        self.assertEqual(keras.backend.learning_phase(), 0)
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
-      with self.assertRaises(ValueError):
-        with keras.backend.learning_phase_scope(None):
-          pass
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    initial_learning_phase = keras.backend.learning_phase()
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    with keras.backend.learning_phase_scope(0):
+      self.assertEqual(keras.backend.learning_phase(), 0)
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    with self.assertRaises(ValueError):
+      with keras.backend.learning_phase_scope(None):
+        pass
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+
+    new_learning_phase = 0
+    keras.backend.set_learning_phase(new_learning_phase)
+    self.assertEqual(keras.backend.learning_phase(), new_learning_phase)
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(), new_learning_phase)
+
+  def test_learning_phase_scope_in_graph(self):
+    initial_learning_phase_outside_graph = keras.backend.learning_phase()
+    with keras.backend.get_graph().as_default():
+      initial_learning_phase_in_graph = keras.backend.learning_phase()
+
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
+
+    with keras.backend.get_graph().as_default():
+      self.assertEqual(keras.backend.learning_phase(),
+                       initial_learning_phase_in_graph)
+
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
 
   def test_int_shape(self):
     x = keras.backend.ones(shape=(3, 4))
@@ -164,21 +167,20 @@ class BackendUtilsTest(test.TestCase):
       self.assertEqual(keras.backend.int_shape(x), (None, 4))
 
   def test_in_train_phase(self):
-    with self.cached_session():
-      y1 = keras.backend.variable(1)
-      y2 = keras.backend.variable(2)
-      if context.executing_eagerly():
-        with keras.backend.learning_phase_scope(0):
-          y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
-        with keras.backend.learning_phase_scope(1):
-          y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
-      else:
-        y = keras.backend.in_train_phase(y1, y2)
-        f = keras.backend.function([keras.backend.learning_phase()], [y])
-        y_val_test = f([0])[0]
-        y_val_train = f([1])[0]
-      self.assertAllClose(y_val_test, 2)
-      self.assertAllClose(y_val_train, 1)
+    y1 = keras.backend.variable(1)
+    y2 = keras.backend.variable(2)
+    if context.executing_eagerly():
+      with keras.backend.learning_phase_scope(0):
+        y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
+      with keras.backend.learning_phase_scope(1):
+        y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
+    else:
+      y = keras.backend.in_train_phase(y1, y2)
+      f = keras.backend.function([keras.backend.learning_phase()], [y])
+      y_val_test = f([0])[0]
+      y_val_train = f([1])[0]
+    self.assertAllClose(y_val_test, 2)
+    self.assertAllClose(y_val_train, 1)
 
   def test_is_keras_tensor(self):
     x = keras.backend.variable(1)
@@ -205,74 +207,63 @@ class BackendUtilsTest(test.TestCase):
 class BackendVariableTest(test.TestCase):
 
   def test_zeros(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.zeros((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones(self):
-    with self.cached_session():
-      x = keras.backend.ones((3, 4))
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.ones((3, 4)))
+    x = keras.backend.ones((3, 4))
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.ones((3, 4)))
 
   def test_eye(self):
-    with self.cached_session():
-      x = keras.backend.eye(4)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.eye(4))
+    x = keras.backend.eye(4)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.eye(4))
 
   def test_zeros_like(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      y = keras.backend.zeros_like(x)
-      val = keras.backend.eval(y)
-      self.assertAllClose(val, np.zeros((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    y = keras.backend.zeros_like(x)
+    val = keras.backend.eval(y)
+    self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones_like(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      y = keras.backend.ones_like(x)
-      val = keras.backend.eval(y)
-      self.assertAllClose(val, np.ones((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    y = keras.backend.ones_like(x)
+    val = keras.backend.eval(y)
+    self.assertAllClose(val, np.ones((3, 4)))
 
   def test_random_uniform_variable(self):
-    with self.cached_session():
-      x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val.mean(), 1.5, atol=1e-1)
-      self.assertAllClose(val.max(), 2., atol=1e-1)
-      self.assertAllClose(val.min(), 1., atol=1e-1)
+    x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val.mean(), 1.5, atol=1e-1)
+    self.assertAllClose(val.max(), 2., atol=1e-1)
+    self.assertAllClose(val.min(), 1., atol=1e-1)
 
   def test_random_normal_variable(self):
-    with self.cached_session():
-      x = keras.backend.random_normal_variable((30, 20), 1., 0.5,
-                                               seed=0)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val.mean(), 1., atol=1e-1)
-      self.assertAllClose(val.std(), 0.5, atol=1e-1)
+    x = keras.backend.random_normal_variable((30, 20), 1., 0.5, seed=0)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val.mean(), 1., atol=1e-1)
+    self.assertAllClose(val.std(), 0.5, atol=1e-1)
 
   def test_count_params(self):
-    with self.cached_session():
-      x = keras.backend.zeros((4, 5))
-      val = keras.backend.count_params(x)
-      self.assertAllClose(val, 20)
+    x = keras.backend.zeros((4, 5))
+    val = keras.backend.count_params(x)
+    self.assertAllClose(val, 20)
 
   def test_constant(self):
-    with self.cached_session():
-      ref_val = np.random.random((3, 4)).astype('float32')
-      x = keras.backend.constant(ref_val)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, ref_val)
+    ref_val = np.random.random((3, 4)).astype('float32')
+    x = keras.backend.constant(ref_val)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, ref_val)
 
   def test_sparse_variable(self):
-    with self.cached_session():
-      val = scipy.sparse.eye(10)
-      x = keras.backend.variable(val)
-      self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
+    val = scipy.sparse.eye(10)
+    x = keras.backend.variable(val)
+    self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
 
-      y = keras.backend.to_dense(x)
-      self.assertFalse(keras.backend.is_sparse(y))
+    y = keras.backend.to_dense(x)
+    self.assertFalse(keras.backend.is_sparse(y))
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -310,20 +301,19 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.argmax, np.argmax),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
-                                         keras_kwargs={'axis': 1},
-                                         np_kwargs={'axis': 1})
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
-                                         keras_kwargs={'axis': -1},
-                                         np_kwargs={'axis': -1})
-        if 'keepdims' in tf_inspect.getargspec(keras_op).args:
-          compare_single_input_op_to_numpy(keras_op, np_op,
-                                           input_shape=(4, 7, 5),
-                                           keras_kwargs={'axis': 1,
-                                                         'keepdims': True},
-                                           np_kwargs={'axis': 1,
-                                                      'keepdims': True})
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                       keras_kwargs={'axis': 1},
+                                       np_kwargs={'axis': 1})
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                       keras_kwargs={'axis': -1},
+                                       np_kwargs={'axis': -1})
+      if 'keepdims' in tf_inspect.getargspec(keras_op).args:
+        compare_single_input_op_to_numpy(keras_op, np_op,
+                                         input_shape=(4, 7, 5),
+                                         keras_kwargs={'axis': 1,
+                                                       'keepdims': True},
+                                         np_kwargs={'axis': 1,
+                                                    'keepdims': True})
 
   def test_elementwise_ops(self):
     ops_to_test = [
@@ -336,32 +326,28 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.exp, np.exp),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
 
     ops_to_test = [
         (keras.backend.sqrt, np.sqrt),
         (keras.backend.log, np.log),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op,
-                                         input_shape=(4, 7),
-                                         negative_values=False)
+      compare_single_input_op_to_numpy(keras_op, np_op,
+                                       input_shape=(4, 7),
+                                       negative_values=False)
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.clip, np.clip,
-          input_shape=(6, 4),
-          keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
-          np_kwargs={'a_min': 0.1, 'a_max': 1.4})
+    compare_single_input_op_to_numpy(
+        keras.backend.clip, np.clip,
+        input_shape=(6, 4),
+        keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
+        np_kwargs={'a_min': 0.1, 'a_max': 1.4})
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.pow, np.power,
-          input_shape=(6, 4),
-          keras_args=[3],
-          np_args=[3])
+    compare_single_input_op_to_numpy(
+        keras.backend.pow, np.power,
+        input_shape=(6, 4),
+        keras_args=[3],
+        np_args=[3])
 
   def test_two_tensor_ops(self):
     ops_to_test = [
@@ -375,84 +361,82 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.minimum, np.minimum),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                       input_shape_a=(4, 7),
-                                       input_shape_b=(4, 7))
+      compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                     input_shape_a=(4, 7),
+                                     input_shape_b=(4, 7))
 
   def test_relu(self):
     x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
-    with self.cached_session():
-      # standard relu
-      relu_op = keras.backend.relu(x)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # alpha (leaky relu used)
-      relu_op = keras.backend.relu(x, alpha=0.5)
-      if not context.executing_eagerly():
-        self.assertTrue('LeakyRelu' in relu_op.name)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+    # standard relu
+    relu_op = keras.backend.relu(x)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+    # alpha (leaky relu used)
+    relu_op = keras.backend.relu(x, alpha=0.5)
+    if not context.executing_eagerly():
+      self.assertTrue('LeakyRelu' in relu_op.name)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
 
-      # max_value < some elements
-      relu_op = keras.backend.relu(x, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
+    # max_value < some elements
+    relu_op = keras.backend.relu(x, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
 
-      # nn.relu6 used
-      relu_op = keras.backend.relu(x, max_value=6)
-      if not context.executing_eagerly():
-        self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
+    # nn.relu6 used
+    relu_op = keras.backend.relu(x, max_value=6)
+    if not context.executing_eagerly():
+      self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
 
-      # max value > 6
-      relu_op = keras.backend.relu(x, max_value=10)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+    # max value > 6
+    relu_op = keras.backend.relu(x, max_value=10)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # max value is float
-      relu_op = keras.backend.relu(x, max_value=4.3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
+    # max value is float
+    relu_op = keras.backend.relu(x, max_value=4.3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
 
-      # max value == 0
-      relu_op = keras.backend.relu(x, max_value=0)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
+    # max value == 0
+    relu_op = keras.backend.relu(x, max_value=0)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
 
-      # alpha and max_value
-      relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
+    # alpha and max_value
+    relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
 
-      # threshold
-      relu_op = keras.backend.relu(x, threshold=3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
+    # threshold
+    relu_op = keras.backend.relu(x, threshold=3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
 
-      # threshold is float
-      relu_op = keras.backend.relu(x, threshold=1.5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+    # threshold is float
+    relu_op = keras.backend.relu(x, threshold=1.5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # threshold is negative
-      relu_op = keras.backend.relu(x, threshold=-5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
+    # threshold is negative
+    relu_op = keras.backend.relu(x, threshold=-5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
 
-      # threshold and max_value
-      relu_op = keras.backend.relu(x, threshold=3, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
+    # threshold and max_value
+    relu_op = keras.backend.relu(x, threshold=3, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
 
-      # threshold and alpha
-      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
+    # threshold and alpha
+    relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
 
-      # threshold, alpha, and max_value
-      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
+    # threshold, alpha, and max_value
+    relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class BackendShapeOpsTest(test.TestCase):
 
   def test_reshape(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
-                                       input_shape=(4, 7),
-                                       keras_args=[(2, 14)],
-                                       np_args=[(2, 14)])
+    compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
+                                     input_shape=(4, 7),
+                                     keras_args=[(2, 14)],
+                                     np_args=[(2, 14)])
 
   def test_concatenate(self):
     a = keras.backend.variable(np.ones((1, 2, 3)))
@@ -461,12 +445,11 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 2, 5])
 
   def test_permute_dimensions(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
-                                       np.transpose,
-                                       input_shape=(4, 7),
-                                       keras_args=[(1, 0)],
-                                       np_args=[(1, 0)])
+    compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
+                                     np.transpose,
+                                     input_shape=(4, 7),
+                                     keras_args=[(1, 0)],
+                                     np_args=[(1, 0)])
 
   def test_resize_images(self):
     height_factor = 2
@@ -541,18 +524,16 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 2, 3])
 
   def test_flatten(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.flatten,
-                                       np.reshape,
-                                       input_shape=(4, 7, 6),
-                                       np_args=[(4 * 7 * 6,)])
+    compare_single_input_op_to_numpy(keras.backend.flatten,
+                                     np.reshape,
+                                     input_shape=(4, 7, 6),
+                                     np_args=[(4 * 7 * 6,)])
 
   def test_batch_flatten(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.batch_flatten,
-                                       np.reshape,
-                                       input_shape=(4, 7, 6),
-                                       np_args=[(4, 7 * 6)])
+    compare_single_input_op_to_numpy(keras.backend.batch_flatten,
+                                     np.reshape,
+                                     input_shape=(4, 7, 6),
+                                     np_args=[(4, 7 * 6)])
 
   def test_temporal_padding(self):
 
@@ -563,12 +544,11 @@ class BackendShapeOpsTest(test.TestCase):
       y[:, padding[0]:-padding[1], :] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.temporal_padding,
-                                       ref_op,
-                                       input_shape=(4, 7, 6),
-                                       keras_args=[(2, 3)],
-                                       np_args=[(2, 3)])
+    compare_single_input_op_to_numpy(keras.backend.temporal_padding,
+                                     ref_op,
+                                     input_shape=(4, 7, 6),
+                                     keras_args=[(2, 3)],
+                                     np_args=[(2, 3)])
 
   def test_spatial_2d_padding(self):
 
@@ -586,23 +566,22 @@ class BackendShapeOpsTest(test.TestCase):
         y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1]] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_2d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3),
-          keras_args=[((2, 3), (1, 2))],
-          keras_kwargs={'data_format': 'channels_last'},
-          np_args=[((2, 3), (1, 2))],
-          np_kwargs={'data_format': 'channels_last'})
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_2d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3),
-          keras_args=[((2, 3), (1, 2))],
-          keras_kwargs={'data_format': 'channels_first'},
-          np_args=[((2, 3), (1, 2))],
-          np_kwargs={'data_format': 'channels_first'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_2d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3),
+        keras_args=[((2, 3), (1, 2))],
+        keras_kwargs={'data_format': 'channels_last'},
+        np_args=[((2, 3), (1, 2))],
+        np_kwargs={'data_format': 'channels_last'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_2d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3),
+        keras_args=[((2, 3), (1, 2))],
+        keras_kwargs={'data_format': 'channels_first'},
+        np_args=[((2, 3), (1, 2))],
+        np_kwargs={'data_format': 'channels_first'})
 
   def test_spatial_3d_padding(self):
 
@@ -629,73 +608,70 @@ class BackendShapeOpsTest(test.TestCase):
           padding[2][0]:-padding[2][1]] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_3d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3, 2),
-          keras_args=[((2, 3), (1, 2), (2, 3))],
-          keras_kwargs={'data_format': 'channels_last'},
-          np_args=[((2, 3), (1, 2), (2, 3))],
-          np_kwargs={'data_format': 'channels_last'})
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_3d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3, 2),
-          keras_args=[((2, 3), (1, 2), (2, 3))],
-          keras_kwargs={'data_format': 'channels_first'},
-          np_args=[((2, 3), (1, 2), (2, 3))],
-          np_kwargs={'data_format': 'channels_first'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_3d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3, 2),
+        keras_args=[((2, 3), (1, 2), (2, 3))],
+        keras_kwargs={'data_format': 'channels_last'},
+        np_args=[((2, 3), (1, 2), (2, 3))],
+        np_kwargs={'data_format': 'channels_last'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_3d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3, 2),
+        keras_args=[((2, 3), (1, 2), (2, 3))],
+        keras_kwargs={'data_format': 'channels_first'},
+        np_args=[((2, 3), (1, 2), (2, 3))],
+        np_kwargs={'data_format': 'channels_first'})
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 
   def test_bias_add(self):
-    with self.cached_session():
-      keras_op = keras.backend.bias_add
-      np_op = np.add
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 2, 7),
-                                     input_shape_b=(7,))
-
-      with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
-        x = keras.backend.variable((3, 4))
-        b = keras.backend.variable((3, 4))
-        keras.backend.bias_add(x, b)
-      with self.assertRaises(ValueError):
-        x = keras.backend.variable((3, 4))
-        b = keras.backend.variable((4,))
-        keras.backend.bias_add(x, b, data_format='unknown')
+    keras_op = keras.backend.bias_add
+    np_op = np.add
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 2, 7),
+                                   input_shape_b=(7,))
+
+    with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
+      x = keras.backend.variable((3, 4))
+      b = keras.backend.variable((3, 4))
+      keras.backend.bias_add(x, b)
+    with self.assertRaises(ValueError):
+      x = keras.backend.variable((3, 4))
+      b = keras.backend.variable((4,))
+      keras.backend.bias_add(x, b, data_format='unknown')
 
   def test_bias_add_channels_first(self):
-    with self.cached_session():
 
-      def keras_op(x, b):
-        return keras.backend.bias_add(x, b, data_format='channels_first')
+    def keras_op(x, b):
+      return keras.backend.bias_add(x, b, data_format='channels_first')
 
-      def np_op(x, b):
-        if x.ndim == 3:
-          b = b.reshape((1, b.shape[0], 1))
-        if x.ndim == 4:
-          b = b.reshape((1, b.shape[0], 1, 1))
-        return x + b
+    def np_op(x, b):
+      if x.ndim == 3:
+        b = b.reshape((1, b.shape[0], 1))
+      if x.ndim == 4:
+        b = b.reshape((1, b.shape[0], 1, 1))
+      return x + b
 
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 7),
-                                     input_shape_b=(3,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 7),
-                                     input_shape_b=(3,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 7),
+                                   input_shape_b=(3,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 7),
+                                   input_shape_b=(3,))
 
   def test_pool2d(self):
     val = np.random.random((10, 3, 10, 10))
@@ -855,9 +831,9 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                              strides,
                                              output_shape,
                                              'channels_last')
-          with self.cached_session():
-            conv_cf = keras.backend.eval(conv_cf)
-            conv_cl = keras.backend.eval(conv_cl)
+
+          conv_cf = keras.backend.eval(conv_cf)
+          conv_cl = keras.backend.eval(conv_cl)
 
           self.assertAllCloseAccordingToType(
               conv_cf,
@@ -905,9 +881,8 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                   output_shape,
                                                   'channels_last')
 
-    with self.cached_session():
-      local_conv = keras.backend.eval(local_conv)
-      local_conv_dim = keras.backend.eval(local_conv_dim)
+    local_conv = keras.backend.eval(local_conv)
+    local_conv_dim = keras.backend.eval(local_conv_dim)
 
     self.assertAllCloseAccordingToType(local_conv, local_conv_dim)
 
@@ -1063,24 +1038,23 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.cached_session():
-      for i, kwargs in enumerate(kwargs_list):
-        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                             initial_states,
-                                                             **kwargs)
-        # check static shape inference
-        self.assertEqual(last_output.get_shape().as_list(),
+    for i, kwargs in enumerate(kwargs_list):
+      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                           initial_states,
+                                                           **kwargs)
+      # check static shape inference
+      self.assertEqual(last_output.get_shape().as_list(),
+                       [num_samples, output_dim])
+      self.assertEqual(outputs.get_shape().as_list(),
+                       [num_samples, timesteps, output_dim])
+      for state in new_states:
+        self.assertEqual(state.get_shape().as_list(),
                          [num_samples, output_dim])
-        self.assertEqual(outputs.get_shape().as_list(),
-                         [num_samples, timesteps, output_dim])
-        for state in new_states:
-          self.assertEqual(state.get_shape().as_list(),
-                           [num_samples, output_dim])
 
-        last_output_list[i].append(keras.backend.eval(last_output))
-        outputs_list[i].append(keras.backend.eval(outputs))
-        self.assertEqual(len(new_states), 1)
-        state_list[i].append(keras.backend.eval(new_states[0]))
+      last_output_list[i].append(keras.backend.eval(last_output))
+      outputs_list[i].append(keras.backend.eval(outputs))
+      self.assertLen(new_states, 1)
+      state_list[i].append(keras.backend.eval(new_states[0]))
 
       def assert_list_pairwise(z_list, atol=1e-05):
         for (z1, z2) in zip(z_list[1:], z_list[:-1]):
@@ -1162,29 +1136,28 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.cached_session():
-      for i, kwargs in enumerate(kwargs_list):
-        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                             initial_states,
-                                                             **kwargs)
-        # check static shape inference
-        self.assertEqual(last_output.get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(outputs.get_shape().as_list(),
-                         [num_samples, timesteps, output_dim])
-        # for state in new_states:
-        #   self.assertEqual(state.get_shape().as_list(),
-        #                     [num_samples, output_dim])
-        self.assertEqual(new_states[0].get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(new_states[1].get_shape().as_list(),
-                         [num_samples, 2 * output_dim])
-
-        last_output_list[i].append(keras.backend.eval(last_output))
-        outputs_list[i].append(keras.backend.eval(outputs))
-        self.assertEqual(len(new_states), 2)
-        state_list[i].append(keras.backend.eval(new_states[0]))
-        additional_state_list[i].append(keras.backend.eval(new_states[1]))
+    for i, kwargs in enumerate(kwargs_list):
+      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                           initial_states,
+                                                           **kwargs)
+      # check static shape inference
+      self.assertEqual(last_output.get_shape().as_list(),
+                       [num_samples, output_dim])
+      self.assertEqual(outputs.get_shape().as_list(),
+                       [num_samples, timesteps, output_dim])
+      # for state in new_states:
+      #   self.assertEqual(state.get_shape().as_list(),
+      #                     [num_samples, output_dim])
+      self.assertEqual(new_states[0].get_shape().as_list(),
+                       [num_samples, output_dim])
+      self.assertEqual(new_states[1].get_shape().as_list(),
+                       [num_samples, 2 * output_dim])
+
+      last_output_list[i].append(keras.backend.eval(last_output))
+      outputs_list[i].append(keras.backend.eval(outputs))
+      self.assertLen(new_states, 2)
+      state_list[i].append(keras.backend.eval(new_states[0]))
+      additional_state_list[i].append(keras.backend.eval(new_states[1]))
 
       def assert_list_pairwise(z_list, atol=1e-05):
         for (z1, z2) in zip(z_list[1:], z_list[:-1]):
@@ -1374,53 +1347,52 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 class TestCTC(test.TestCase):
 
   def test_ctc_decode(self):
-    with self.cached_session():
-      depth = 6
-      seq_len_0 = 5
-      input_prob_matrix_0 = np.asarray(
-          [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
-           [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
-           [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
-           [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
-           [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
-           # Random entry added in at time=5
-           [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
-          dtype=np.float32)
-
-      # len max_time_steps array of batch_size x depth matrices
-      inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
-                 for t in range(seq_len_0)] +  # Pad to max_time_steps = 8
-                2 * [np.zeros((1, depth), dtype=np.float32)])
-
-      inputs = keras.backend.variable(np.asarray(inputs).transpose((1, 0, 2)))
-
-      # batch_size length vector of sequence_lengths
-      input_length = keras.backend.variable(
-          np.array([seq_len_0], dtype=np.int32))
-      # batch_size length vector of negative log probabilities
-      log_prob_truth = np.array([
-          -3.5821197,  # output beam 0
-          -3.777835    # output beam 1
-      ], np.float32)[np.newaxis, :]
-
-      decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
-      beam_width = 2
-      top_paths = 2
-
-      decode_pred_tf, log_prob_pred_tf = keras.backend.ctc_decode(
-          inputs,
-          input_length,
-          greedy=False,
-          beam_width=beam_width,
-          top_paths=top_paths)
-
-      self.assertEqual(len(decode_pred_tf), top_paths)
-      log_prob_pred = keras.backend.eval(log_prob_pred_tf)
-      for i in range(top_paths):
-        self.assertTrue(
-            np.alltrue(
-                decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
-      self.assertAllClose(log_prob_truth, log_prob_pred)
+    depth = 6
+    seq_len_0 = 5
+    input_prob_matrix_0 = np.asarray(
+        [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
+         [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
+         [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
+         [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
+         [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
+         # Random entry added in at time=5
+         [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
+        dtype=np.float32)
+
+    # len max_time_steps array of batch_size x depth matrices
+    inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
+               for t in range(seq_len_0)] +  # Pad to max_time_steps = 8
+              2 * [np.zeros((1, depth), dtype=np.float32)])
+
+    inputs = keras.backend.variable(np.asarray(inputs).transpose((1, 0, 2)))
+
+    # batch_size length vector of sequence_lengths
+    input_length = keras.backend.variable(
+        np.array([seq_len_0], dtype=np.int32))
+    # batch_size length vector of negative log probabilities
+    log_prob_truth = np.array([
+        -3.5821197,  # output beam 0
+        -3.777835    # output beam 1
+    ], np.float32)[np.newaxis, :]
+
+    decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
+    beam_width = 2
+    top_paths = 2
+
+    decode_pred_tf, log_prob_pred_tf = keras.backend.ctc_decode(
+        inputs,
+        input_length,
+        greedy=False,
+        beam_width=beam_width,
+        top_paths=top_paths)
+
+    self.assertEqual(len(decode_pred_tf), top_paths)
+    log_prob_pred = keras.backend.eval(log_prob_pred_tf)
+    for i in range(top_paths):
+      self.assertTrue(
+          np.alltrue(
+              decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
+    self.assertAllClose(log_prob_truth, log_prob_pred)
 
   @test_util.run_v1_only('b/120545219')
   def test_ctc_batch_cost(self):
@@ -1481,29 +1453,26 @@ class TestCTC(test.TestCase):
 class TestRandomOps(test.TestCase):
 
   def test_random_binomial(self):
-    with self.cached_session():
-      np.random.seed(123)
-      x = keras.backend.random_binomial((1000, 1000), p=0.5)
-      self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
+    np.random.seed(123)
+    x = keras.backend.random_binomial((1000, 1000), p=0.5)
+    self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
 
   def test_truncated_normal(self):
-    with self.cached_session():
-      np.random.seed(123)
-      x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
-      y = keras.backend.eval(x)
-      self.assertAllClose(np.mean(y), 0., atol=0.1)
-      self.assertAllClose(np.std(y), 0.88, atol=0.1)
-      self.assertAllClose(np.max(y), 2., atol=0.1)
-      self.assertAllClose(np.min(y), -2., atol=0.1)
+    np.random.seed(123)
+    x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
+    y = keras.backend.eval(x)
+    self.assertAllClose(np.mean(y), 0., atol=0.1)
+    self.assertAllClose(np.std(y), 0.88, atol=0.1)
+    self.assertAllClose(np.max(y), 2., atol=0.1)
+    self.assertAllClose(np.min(y), -2., atol=0.1)
 
   def test_string_input(self):
-    with self.cached_session():
-      seq = keras.Sequential([
-          keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
-          keras.layers.Lambda(lambda x: x[0])
-      ])
-      preds = seq.predict([['tensorflow eager']])
-      self.assertEqual(preds.shape, (1,))
+    seq = keras.Sequential([
+        keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
+        keras.layers.Lambda(lambda x: x[0])
+    ])
+    preds = seq.predict([['tensorflow eager']])
+    self.assertEqual(preds.shape, (1,))
 
 
 class BackendGraphTests(test.TestCase):
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index a8fb87f5cccabf0b505566ed9cd8a293d25d67fc..3223c89812513a21d07154b2d20158674f46d13b 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -43,6 +43,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
 from tensorflow.python.training import saver
+from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util.tf_export import keras_export
 
 try:
@@ -51,11 +52,6 @@ except ImportError:
   requests = None
 
 
-_TRAIN = 'train'
-_TEST = 'test'
-_PREDICT = 'predict'
-
-
 # pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
@@ -66,7 +62,7 @@ def configure_callbacks(callbacks,
                         samples=None,
                         verbose=1,
                         count_mode='steps',
-                        mode=_TRAIN):
+                        mode=ModeKeys.TRAIN):
   """Configures callbacks for use in various training loops.
 
   Arguments:
@@ -79,8 +75,8 @@ def configure_callbacks(callbacks,
       samples: Number of training samples.
       verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
       count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
-      mode: String. One of 'train', 'test', or 'predict'. Which loop mode to
-        configure callbacks for.
+      mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT.
+        Which loop mode to configure callbacks for.
 
   Returns:
       Instance of CallbackList used to control all Callbacks.
@@ -93,7 +89,7 @@ def configure_callbacks(callbacks,
     callbacks = []
 
   # Add additional callbacks during training.
-  if mode == _TRAIN:
+  if mode == ModeKeys.TRAIN:
     model.history = History()
     stateful_metric_names = None
     if hasattr(model, 'metrics_names'):
@@ -113,7 +109,7 @@ def configure_callbacks(callbacks,
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
   # when we standardize first batch of data.
-  if mode != _PREDICT and hasattr(model, 'metrics_names'):
+  if mode != ModeKeys.PREDICT and hasattr(model, 'metrics_names'):
     callback_metrics = copy.copy(model.metrics_names)
     if do_validation:
       callback_metrics += ['val_' + n for n in model.metrics_names]
@@ -127,14 +123,6 @@ def configure_callbacks(callbacks,
       'metrics': callback_metrics,
   }
   callback_list.set_params(callback_params)
-
-  if (do_validation and not model._distribution_strategy and
-      not model.run_eagerly):
-    # Need to create the eval_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the eval_function
-    callback_model._make_eval_function()
-
   callback_list.model.stop_training = False
   return callback_list
 # pylint: enable=protected-access
@@ -148,7 +136,7 @@ def _is_generator_like(data):
 
 def make_logs(model, logs, outputs, mode, prefix=''):
   """Computes logs for sending to `on_batch_end` methods."""
-  if mode in {_TRAIN, _TEST}:
+  if mode in {ModeKeys.TRAIN, ModeKeys.TEST}:
     if hasattr(model, 'metrics_names'):
       for label, output in zip(model.metrics_names, outputs):
         logs[prefix + label] = output
@@ -220,57 +208,57 @@ class CallbackList(object):
 
   def _call_begin_hook(self, mode):
     """Helper function for on_{train|test|predict}_begin methods."""
-    if mode == _TRAIN:
+    if mode == ModeKeys.TRAIN:
       self.on_train_begin()
-    elif mode == _TEST:
+    elif mode == ModeKeys.TEST:
       self.on_test_begin()
     else:
       self.on_predict_begin()
 
   def _call_end_hook(self, mode):
     """Helper function for on_{train|test|predict}_end methods."""
-    if mode == _TRAIN:
+    if mode == ModeKeys.TRAIN:
       self.on_train_end()
-    elif mode == _TEST:
+    elif mode == ModeKeys.TEST:
       self.on_test_end()
     else:
       self.on_predict_end()
 
   def on_batch_begin(self, batch, logs=None):
-    self._call_batch_hook(_TRAIN, 'begin', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
 
   def on_batch_end(self, batch, logs=None):
-    self._call_batch_hook(_TRAIN, 'end', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
 
-  def on_epoch_begin(self, epoch, logs=None, mode='train'):
+  def on_epoch_begin(self, epoch, logs=None):
     """Calls the `on_epoch_begin` methods of its callbacks.
 
+    This function should only be called during TRAIN mode.
+
     Arguments:
         epoch: integer, index of epoch.
         logs: dict. Currently no data is passed to this argument for this method
           but that may change in the future.
-        mode: One of 'train'/'test'/'predict'
     """
-    if mode == _TRAIN:
-      logs = logs or {}
-      for callback in self.callbacks:
-        callback.on_epoch_begin(epoch, logs)
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_begin(epoch, logs)
     self._reset_batch_timing()
 
-  def on_epoch_end(self, epoch, logs=None, mode='train'):
+  def on_epoch_end(self, epoch, logs=None):
     """Calls the `on_epoch_end` methods of its callbacks.
 
+    This function should only be called during TRAIN mode.
+
     Arguments:
         epoch: integer, index of epoch.
         logs: dict, metric results for this training epoch, and for the
           validation epoch if validation is performed. Validation result keys
           are prefixed with `val_`.
-        mode: One of 'train'/'test'/'predict'
     """
-    if mode == _TRAIN:
-      logs = logs or {}
-      for callback in self.callbacks:
-        callback.on_epoch_end(epoch, logs)
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_end(epoch, logs)
 
   def on_train_batch_begin(self, batch, logs=None):
     """Calls the `on_train_batch_begin` methods of its callbacks.
@@ -280,7 +268,7 @@ class CallbackList(object):
         logs: dict. Has keys `batch` and `size` representing the current batch
           number and the size of the batch.
     """
-    self._call_batch_hook(_TRAIN, 'begin', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
 
   def on_train_batch_end(self, batch, logs=None):
     """Calls the `on_train_batch_end` methods of its callbacks.
@@ -289,7 +277,7 @@ class CallbackList(object):
         batch: integer, index of batch within the current epoch.
         logs: dict. Metric results for this batch.
     """
-    self._call_batch_hook(_TRAIN, 'end', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
 
   def on_test_batch_begin(self, batch, logs=None):
     """Calls the `on_test_batch_begin` methods of its callbacks.
@@ -299,7 +287,7 @@ class CallbackList(object):
         logs: dict. Has keys `batch` and `size` representing the current batch
           number and the size of the batch.
     """
-    self._call_batch_hook(_TEST, 'begin', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TEST, 'begin', batch, logs=logs)
 
   def on_test_batch_end(self, batch, logs=None):
     """Calls the `on_test_batch_end` methods of its callbacks.
@@ -308,7 +296,7 @@ class CallbackList(object):
         batch: integer, index of batch within the current epoch.
         logs: dict. Metric results for this batch.
     """
-    self._call_batch_hook(_TEST, 'end', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TEST, 'end', batch, logs=logs)
 
   def on_predict_batch_begin(self, batch, logs=None):
     """Calls the `on_predict_batch_begin` methods of its callbacks.
@@ -318,7 +306,7 @@ class CallbackList(object):
         logs: dict. Has keys `batch` and `size` representing the current batch
           number and the size of the batch.
     """
-    self._call_batch_hook(_PREDICT, 'begin', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.PREDICT, 'begin', batch, logs=logs)
 
   def on_predict_batch_end(self, batch, logs=None):
     """Calls the `on_predict_batch_end` methods of its callbacks.
@@ -327,7 +315,7 @@ class CallbackList(object):
         batch: integer, index of batch within the current epoch.
         logs: dict. Metric results for this batch.
     """
-    self._call_batch_hook(_PREDICT, 'end', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.PREDICT, 'end', batch, logs=logs)
 
   def on_train_begin(self, logs=None):
     """Calls the `on_train_begin` methods of its callbacks.
@@ -437,29 +425,29 @@ class Callback(object):
   def on_batch_end(self, batch, logs=None):
     """A backwards compatibility alias for `on_train_batch_end`."""
 
-  def on_epoch_begin(self, epoch, logs=None, mode='train'):
+  def on_epoch_begin(self, epoch, logs=None):
     """Called at the start of an epoch.
 
-    Subclasses should override for any actions to run.
+    Subclasses should override for any actions to run. This function should only
+    be called during TRAIN mode.
 
     Arguments:
         epoch: integer, index of epoch.
         logs: dict. Currently no data is passed to this argument for this method
           but that may change in the future.
-        mode: One of 'train'/'test'/'predict'
     """
 
-  def on_epoch_end(self, epoch, logs=None, mode='train'):
+  def on_epoch_end(self, epoch, logs=None):
     """Called at the end of an epoch.
 
-    Subclasses should override for any actions to run.
+    Subclasses should override for any actions to run. This function should only
+    be called during TRAIN mode.
 
     Arguments:
         epoch: integer, index of epoch.
         logs: dict, metric results for this training epoch, and for the
           validation epoch if validation is performed. Validation result keys
           are prefixed with `val_`.
-        mode: One of 'train'/'test'/'predict'
     """
 
   def on_train_batch_begin(self, batch, logs=None):
@@ -700,15 +688,14 @@ class ProgbarLogger(Callback):
     if self.verbose:
       if self.epochs > 1:
         print('Epoch %d/%d' % (epoch + 1, self.epochs))
-      self.progbar = Progbar(
-          target=self.target,
-          verbose=self.verbose,
-          stateful_metrics=self.stateful_metrics,
-          unit_name='step' if self.use_steps else 'sample')
+    self.progbar = Progbar(
+        target=self.target,
+        verbose=self.verbose,
+        stateful_metrics=self.stateful_metrics,
+        unit_name='step' if self.use_steps else 'sample')
 
   def on_batch_begin(self, batch, logs=None):
-    if self.seen < self.target:
-      self.log_values = []
+    self.log_values = []
 
   def on_batch_end(self, batch, logs=None):
     logs = logs or {}
@@ -727,7 +714,7 @@ class ProgbarLogger(Callback):
 
     # Skip progbar update for the last batch;
     # will be handled by on_epoch_end.
-    if self.verbose and self.seen < self.target:
+    if self.verbose and (self.target is None or self.seen < self.target):
       self.progbar.update(self.seen, self.log_values)
 
   def on_epoch_end(self, epoch, logs=None):
@@ -1178,12 +1165,15 @@ class TensorBoard(Callback):
     self._samples_seen = 0
     self._samples_seen_at_last_write = 0
 
-  def _init_writer(self):
+  def _init_writer(self, model):
     """Sets file writer."""
     if context.executing_eagerly():
       self.writer = summary_ops_v2.create_file_writer(self.log_dir)
+      if not model.run_eagerly and self.write_graph:
+        with self.writer.as_default():
+          summary_ops_v2.graph(K.get_graph())
     elif self.write_graph:
-      self.writer = tf_summary.FileWriter(self.log_dir, K.get_session().graph)
+      self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
     else:
       self.writer = tf_summary.FileWriter(self.log_dir)
 
@@ -1246,7 +1236,7 @@ class TensorBoard(Callback):
     """Sets Keras model and creates summary ops."""
 
     self.model = model
-    self._init_writer()
+    self._init_writer(model)
     # histogram summaries only enabled in graph mode
     if not context.executing_eagerly():
       self._make_histogram_ops(model)
@@ -1377,6 +1367,7 @@ class TensorBoard(Callback):
       self._epoch = epoch
       # pylint: disable=protected-access
       # add the histogram summary op if it should run this epoch
+      self.model._make_eval_function()
       if self.merged not in self.model._eval_function.fetches:
         self.model._eval_function.fetches.append(self.merged)
         self.model._eval_function.fetch_callbacks[
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index ef469c5e4f5deb3e4f0cff7cb3deea95d0266d9b..6aba08b95fb1c13a7ded23a117af3ab0d2d64874 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -27,11 +27,11 @@ import tempfile
 import threading
 import unittest
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
@@ -89,6 +89,23 @@ class Counter(keras.callbacks.Callback):
     return _call_and_count
 
 
+def _get_numpy():
+  return np.ones((10, 10)), np.ones((10, 1))
+
+
+def _get_sequence():
+
+  class MySequence(keras.utils.data_utils.Sequence):
+
+    def __getitem__(self, _):
+      return np.ones((2, 10)), np.ones((2, 1))
+
+    def __len__(self):
+      return 5
+
+  return MySequence(), None
+
+
 @keras_parameterized.run_with_all_model_types
 @keras_parameterized.run_all_keras_modes
 class CallbackCountsTest(keras_parameterized.TestCase):
@@ -114,8 +131,10 @@ class CallbackCountsTest(keras_parameterized.TestCase):
         run_eagerly=testing_utils.should_run_eagerly())
     return model
 
-  def test_callback_hooks_are_called_in_fit(self):
-    x, y = np.ones((10, 10)), np.ones((10, 1))
+  @parameterized.named_parameters(('with_numpy', _get_numpy()),
+                                  ('with_sequence', _get_sequence()))
+  def test_callback_hooks_are_called_in_fit(self, data):
+    x, y = data
     val_x, val_y = np.ones((4, 10)), np.ones((4, 1))
 
     model = self._get_model()
@@ -148,8 +167,10 @@ class CallbackCountsTest(keras_parameterized.TestCase):
             'on_train_end': 1
         })
 
-  def test_callback_hooks_are_called_in_evaluate(self):
-    x, y = np.ones((10, 10)), np.ones((10, 1))
+  @parameterized.named_parameters(('with_numpy', _get_numpy()),
+                                  ('with_sequence', _get_sequence()))
+  def test_callback_hooks_are_called_in_evaluate(self, data):
+    x, y = data
 
     model = self._get_model()
     counter = Counter()
@@ -162,8 +183,10 @@ class CallbackCountsTest(keras_parameterized.TestCase):
             'on_test_end': 1
         })
 
-  def test_callback_hooks_are_called_in_predict(self):
-    x = np.ones((10, 10))
+  @parameterized.named_parameters(('with_numpy', _get_numpy()),
+                                  ('with_sequence', _get_sequence()))
+  def test_callback_hooks_are_called_in_predict(self, data):
+    x = data[0]
 
     model = self._get_model()
     counter = Counter()
@@ -562,13 +585,15 @@ class KerasCallbacksTest(test.TestCase):
             optimizer=keras.optimizers.SGD(lr=0.1))
         return model
 
+      # TODO(psv): Make sure the callback works correctly when min_delta is
+      # set as 0. Test fails when the order of this callback and assertion is
+      # interchanged.
       model = make_model()
-      # This should reduce the LR after the first epoch (due to high epsilon).
       cbks = [
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              min_delta=10,
+              min_delta=0,
               patience=1,
               cooldown=5)
       ]
@@ -581,16 +606,15 @@ class KerasCallbacksTest(test.TestCase):
           epochs=5,
           verbose=0)
       self.assertAllClose(
-          float(keras.backend.get_value(model.optimizer.lr)),
-          0.01,
-          atol=1e-4)
+          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
 
       model = make_model()
+      # This should reduce the LR after the first epoch (due to high epsilon).
       cbks = [
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              min_delta=0,
+              min_delta=10,
               patience=1,
               cooldown=5)
       ]
@@ -603,7 +627,7 @@ class KerasCallbacksTest(test.TestCase):
           epochs=5,
           verbose=2)
       self.assertAllClose(
-          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+          float(keras.backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
 
   def test_ReduceLROnPlateau_patience(self):
 
@@ -1019,7 +1043,7 @@ class KerasCallbacksTest(test.TestCase):
       def close(self):
         pass
 
-    def _init_writer(obj):
+    def _init_writer(obj, _):
       obj.writer = FileWriterStub(obj.log_dir)
 
     np.random.seed(1337)
@@ -1397,47 +1421,6 @@ class KerasCallbacksTest(test.TestCase):
             callbacks=cbks,
             epochs=1)
 
-  @test_util.run_deprecated_v1
-  def test_fit_generator_with_callback(self):
-
-    class TestCallback(keras.callbacks.Callback):
-
-      def set_model(self, model):
-        # Check the model operations for the optimizer operations that
-        # the _make_train_function adds under a named scope for the
-        # optimizer. This ensurs the full model is populated before the
-        # set_model callback is called.
-        optimizer_name_scope = 'training/' + model.optimizer.__class__.__name__
-        graph_def = ops.get_default_graph().as_graph_def()
-        for node in graph_def.node:
-          if node.name.startswith(optimizer_name_scope):
-            return
-        raise RuntimeError('The optimizer operations are not present in the '
-                           'model graph when the Callback.set_model function '
-                           'is called')
-    np.random.seed(1337)
-
-    def generator():
-      x = np.random.randn(10, 100).astype(np.float32)
-      y = np.random.randn(10, 10).astype(np.float32)
-      while True:
-        yield x, y
-
-    with self.cached_session():
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=10, num_classes=10, input_dim=100)
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      model.fit_generator(
-          generator(),
-          steps_per_epoch=2,
-          epochs=1,
-          validation_data=generator(),
-          validation_steps=2,
-          callbacks=[TestCallback()],
-          verbose=0)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 8aa0bac8cb91cc4fdb80eab989c6234fe68abc21..5eb2c5a7522e81fd213f85208534174267987144 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -25,11 +25,12 @@ import itertools
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
@@ -56,7 +57,7 @@ from tensorflow.tools.docs import doc_controls
 
 
 @keras_export('keras.layers.Layer')
-class Layer(checkpointable.CheckpointableBase):
+class Layer(checkpointable.Checkpointable):
   """Base layer class.
 
   This is the class from which all layers inherit.
@@ -255,7 +256,7 @@ class Layer(checkpointable.CheckpointableBase):
                  synchronization=tf_variables.VariableSynchronization.AUTO,
                  aggregation=tf_variables.VariableAggregation.NONE,
                  **kwargs):
-    """Adds a new variable to the layer, or gets an existing one; returns it.
+    """Adds a new variable to the layer.
 
     Arguments:
       name: variable name.
@@ -306,6 +307,8 @@ class Layer(checkpointable.CheckpointableBase):
     if dtype is None:
       dtype = self.dtype or backend.floatx()
     dtype = dtypes.as_dtype(dtype)
+    if self._dtype is None:
+      self._dtype = dtype.base_dtype.name
     initializer = initializers.get(initializer)
     regularizer = regularizers.get(regularizer)
     constraint = constraints.get(constraint)
@@ -510,12 +513,10 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     input_list = nest.flatten(inputs)
-
-    if context.executing_eagerly():
-      # Accept NumPy inputs by converting to Tensors when executing eagerly.
-      if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
-        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
-        input_list = nest.flatten(inputs)
+    # Accept NumPy inputs by converting to Tensors.
+    if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
+      inputs = nest.map_structure(ops.convert_to_tensor, inputs)
+      input_list = nest.flatten(inputs)
 
     # We will attempt to build a TF graph if & only if all inputs are symbolic.
     # This is always the case in graph mode. It can also be the case in eager
@@ -1128,14 +1129,7 @@ class Layer(checkpointable.CheckpointableBase):
     all_input_shapes = set(
         [str(node.input_shapes) for node in self._inbound_nodes])
     if len(all_input_shapes) == 1:
-      input_shapes = self._inbound_nodes[0].input_shapes
-      if len(input_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in input_shapes
-        ]
+      return self._inbound_nodes[0].input_shapes
     else:
       raise AttributeError('The layer "' + str(self.name) +
                            ' has multiple inbound nodes, '
@@ -1186,14 +1180,7 @@ class Layer(checkpointable.CheckpointableBase):
     all_output_shapes = set(
         [str(node.output_shapes) for node in self._inbound_nodes])
     if len(all_output_shapes) == 1:
-      output_shapes = self._inbound_nodes[0].output_shapes
-      if len(output_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in output_shapes
-        ]
+      return self._inbound_nodes[0].output_shapes
     else:
       raise AttributeError('The layer "%s"'
                            ' has multiple inbound nodes, '
@@ -1404,16 +1391,14 @@ class Layer(checkpointable.CheckpointableBase):
     # If the layer returns tensors from its inputs, unmodified,
     # we copy them to avoid loss of tensor metadata.
     output_ls = nest.flatten(outputs)
+    inputs_ls = nest.flatten(inputs)
     output_ls_copy = []
     for x in output_ls:
-      if x in nest.flatten(inputs):
+      if x in inputs_ls:
         with ops.name_scope(self.name):
           x = array_ops.identity(x)
       output_ls_copy.append(x)
-    if len(output_ls_copy) == 1:
-      outputs = output_ls_copy[0]
-    else:
-      outputs = output_ls_copy
+    outputs = nest.pack_sequence_as(outputs, output_ls_copy)
 
     inputs, kwargs = self._inputs_from_call_args(
         call_args=(inputs,) + args, call_kwargs=kwargs)
@@ -1507,19 +1492,12 @@ class Layer(checkpointable.CheckpointableBase):
         arguments: dictionary of keyword arguments that were passed to the
             `call` method of the layer at the call that created the node.
     """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
-
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
+    inbound_layers = nest.map_structure(lambda t: t._keras_history[0],
+                                        input_tensors)
+    node_indices = nest.map_structure(lambda t: t._keras_history[1],
+                                      input_tensors)
+    tensor_indices = nest.map_structure(lambda t: t._keras_history[2],
+                                        input_tensors)
 
     # Create node, add it to inbound nodes.
     Node(
@@ -1532,13 +1510,15 @@ class Layer(checkpointable.CheckpointableBase):
         arguments=arguments)
 
     # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+    # The metadata attribute consists of
+    # 1) a layer instance
+    # 2) a node index for the layer
+    # 3) a tensor index for the node.
+    # The allows layer reuse (multiple nodes per layer) and multi-output
+    # or multi-input layers (e.g. a layer can return multiple tensors,
+    # and each can be sent to a different layer).
+    for i, tensor in enumerate(nest.flatten(output_tensors)):
+      tensor._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
 
   def _get_node_attribute_at_index(self, node_index, attr, attr_name):
     """Private utility to retrieves an attribute (e.g. inputs) from a node.
@@ -1571,7 +1551,7 @@ class Layer(checkpointable.CheckpointableBase):
                        str(node_index) + ', but the layer has only ' +
                        str(len(self._inbound_nodes)) + ' inbound nodes.')
     values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
+    if isinstance(values, list) and len(values) == 1:
       return values[0]
     else:
       return values
@@ -1698,12 +1678,13 @@ class Node(object):
                input_tensors,
                output_tensors,
                arguments=None):
-    # Layer instance (NOT a list).
-    if isinstance(outbound_layer, list):
-      raise ValueError(
-          '`outbound_layer` should be a layer instance, not a list.')
-    # this is the layer that takes a list of input tensors
-    # and turns them into a list of output tensors.
+    # Layer instance (NOT a sequence)
+    if isinstance(outbound_layer, (list, tuple, dict)):
+      raise ValueError('`outbound_layer` should be a layer instance, '
+                       'not a list, tuple, or, dict.')
+
+    # this is the layer that takes a nested structure of input tensors
+    # and turns them into a nested structure of output tensors.
     # the current node will be added to
     # the inbound_nodes of outbound_layer.
     self.outbound_layer = outbound_layer
@@ -1713,33 +1694,33 @@ class Node(object):
     # and for each layer, which node and which
     # tensor output of each node.
 
-    # List of layer instances.
+    # Nested structure of layer instances.
     self.inbound_layers = inbound_layers
-    # List of integers, 1:1 mapping with inbound_layers.
+    # Nested structure of integers, 1:1 mapping with inbound_layers.
     self.node_indices = node_indices
-    # List of integers, 1:1 mapping with inbound_layers.
+    # Nested of integers, 1:1 mapping with inbound_layers.
     self.tensor_indices = tensor_indices
 
     # Following 2 properties:
     # tensor inputs and outputs of outbound_layer.
 
-    # List of tensors. 1:1 mapping with inbound_layers.
+    # Nested structure of tensors. 1:1 mapping with inbound_layers.
     self.input_tensors = input_tensors
-    # List of tensors, created by outbound_layer.call().
+    # Nested structure of tensors, created by outbound_layer.call().
     self.output_tensors = output_tensors
 
     # Following 2 properties: input and output shapes.
 
-    # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [backend.int_shape(x) for x in input_tensors]
-    # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [backend.int_shape(x) for x in output_tensors]
+    # Nested structure of shape tuples, shapes of input_tensors.
+    self.input_shapes = nest.map_structure(backend.int_shape, input_tensors)
+    # Nested structure of shape tuples, shapes of output_tensors.
+    self.output_shapes = nest.map_structure(backend.int_shape, output_tensors)
 
     # Optional keyword arguments to layer's `call`.
     self.arguments = arguments
 
     # Add nodes to all layers involved.
-    for layer in inbound_layers:
+    for layer in nest.flatten(inbound_layers):
       if layer is not None:
         # For compatibility with external Keras, we use the deprecated
         # accessor here.
@@ -1748,13 +1729,19 @@ class Node(object):
     # accessor here.
     outbound_layer.inbound_nodes.append(self)
 
+  def iterate_inbound(self):
+    """Returns a list of tuples representing the inbound data.
+
+    Returns:
+      List of tuples like: (inbound_layer, node_index, tensor_index, tensor).
+    """
+    return zip(
+        nest.flatten(self.inbound_layers), nest.flatten(self.node_indices),
+        nest.flatten(self.tensor_indices), nest.flatten(self.input_tensors))
+
   def get_config(self):
-    inbound_names = []
-    for layer in self.inbound_layers:
-      if layer:
-        inbound_names.append(layer.name)
-      else:
-        inbound_names.append(None)
+    inbound_names = nest.map_structure(
+        lambda layer: layer.name if layer else None, self.inbound_layers)
     return {
         'outbound_layer': self.outbound_layer.name,
         'inbound_layers': inbound_names,
@@ -1763,6 +1750,83 @@ class Node(object):
     }
 
 
+class TensorFlowOpLayer(Layer):
+  """Wraps a TensorFlow Operation in a Layer.
+
+  This class is used internally by the Functional API. When a user
+  uses a raw TensorFlow Operation on symbolic tensors originating
+  from an `Input` Layer, the resultant operation will be wrapped
+  with this Layer object in order to make the operation compatible
+  with the Keras API.
+
+  This Layer will create a new, identical operation (except for inputs
+  and outputs) every time it is called. If `run_eagerly` is `True`,
+  the op creation and calculation will happen inside an Eager function.
+
+  Instances of this Layer are created when `autolambda` is called, which
+  is whenever a Layer's `__call__` encounters symbolic inputs that do
+  not have Keras metadata, or when a Network's `__init__` encounters
+  outputs that do not have Keras metadata.
+
+  Attributes:
+    node_def: String, the serialized NodeDef of the Op this layer will wrap.
+    constants: Dict of NumPy arrays, the values of any Tensors needed for this
+      Operation that do not originate from a Keras `Input` Layer. Since all
+      placeholders must come from Keras `Input` Layers, these Tensors must be
+      treated as constant in the Functional API.
+    name: String, the name of the Layer.
+    trainable: Bool, whether this Layer is trainable. Currently Variables are
+      not supported, and so this parameter has no effect.
+    dtype: The default dtype of this Layer. Inherited from `Layer` and has no
+      effect on this class, however is used in `get_config`.
+  """
+
+  def __init__(self,
+               node_def,
+               constants=None,
+               name=None,
+               trainable=True,
+               dtype=None):
+    super(TensorFlowOpLayer, self).__init__(
+        name=name, trainable=trainable, dtype=dtype)
+    self.node_def = node_def_pb2.NodeDef.FromString(node_def)
+    self.constants = constants or {}
+
+  def call(self, inputs):
+    if context.executing_eagerly():
+      return self._defun_call(inputs)
+    return self._make_op(inputs)
+
+  def _make_op(self, inputs):
+    inputs = nest.flatten(inputs)
+    graph = inputs[0].graph
+    with graph.as_default():
+      for index, constant in self.constants.items():
+        constant = ops.convert_to_tensor(constant)
+        inputs.insert(index, constant)
+
+      self.node_def.name = graph.unique_name(self.node_def.name)
+      c_op = ops._create_c_op(graph, self.node_def, inputs, control_inputs=[])
+      op = graph._create_op_from_tf_operation(c_op)
+
+      if len(op.outputs) == 1:
+        return op.outputs[0]
+      return op.outputs
+
+  @function.defun
+  def _defun_call(self, inputs):
+    """Wraps the op creation method in an Eager function for `run_eagerly`."""
+    return self._make_op(inputs)
+
+  def get_config(self):
+    config = super(TensorFlowOpLayer, self).get_config()
+    config.update({
+        'node_def': self.node_def.SerializeToString(),
+        'constants': self.constants
+    })
+    return config
+
+
 def default(method):
   """Decorates a method to detect overrides in subclasses."""
   method._is_default = True
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index ebee4a3043e57d149bb8d81812e7568aff8f8eb8..93ca10ee4ea13d90e92aa3937e2148fe85c454c8 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -26,6 +26,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
@@ -71,7 +73,7 @@ class InvalidLayer(base_layer.Layer):
     raise ValueError('You did something wrong!')
 
 
-class BaseLayerTest(test.TestCase, parameterized.TestCase):
+class BaseLayerTest(keras_parameterized.TestCase):
 
   @parameterized.parameters(DynamicLayer1, DynamicLayer2)
   def test_dynamic_layer_in_functional_model_in_graph_mode(self, layer_class):
@@ -210,6 +212,49 @@ class BaseLayerTest(test.TestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
       _ = InvalidLayer()(inputs)
 
+  @keras_parameterized.run_with_all_model_types
+  @test_util.run_in_graph_and_eager_modes
+  def test_build_with_numpy_data(self):
+    model_layers = [
+        keras.layers.Dense(3, activation='relu', kernel_initializer='ones'),
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
+    ]
+    model = testing_utils.get_model_from_layers(model_layers, input_shape=(4,))
+    model(np.zeros((2, 4), dtype='float32'))
+    self.assertTrue(model.built)
+
+  def test_learning_phase_freezing_for_layers(self):
+    # This test is only meant to run in graph functions mode (ambient eager).
+    # In forced eager, `model.predict` ignores the global learning phase
+    # and just uses training=False. TODO(fchollet): consider unifying the
+    # behaviors.
+
+    class LearningPhaseLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return keras.backend.in_train_phase(
+            lambda: array_ops.ones_like(inputs),
+            lambda: array_ops.zeros_like(inputs))
+
+    def get_learning_phase_value():
+      model = keras.models.Sequential([LearningPhaseLayer(input_shape=(1,))])
+      return np.sum(model.predict(np.ones((1, 1))))
+
+    self.assertEqual(get_learning_phase_value(), 0)
+
+    # Test scope.
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(get_learning_phase_value(), 1)
+
+    # The effects of the scope end after exiting it.
+    self.assertEqual(get_learning_phase_value(), 0)
+
+    # Test setting.
+    keras.backend.set_learning_phase(1)
+    self.assertEqual(get_learning_phase_value(), 1)
+    keras.backend.set_learning_phase(0)
+    self.assertEqual(get_learning_phase_value(), 0)
+
   def test_using_symbolic_tensors_with_tf_ops(self):
     # Single-input.
     x = keras.Input((3,))
@@ -239,18 +284,14 @@ class BaseLayerTest(test.TestCase, parameterized.TestCase):
       x1 = array_ops.ones((3, 3))
     x2 = array_ops.ones((3, 3))
     self.assertIsInstance(x2, ops.EagerTensor)
-    with self.assertRaisesRegexp(TypeError,
-                                 'provided list of inputs contains '
-                                 'objects other than \'EagerTensor\''):
+    with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
       math_ops.matmul(x1, x2)
 
   def test_mixing_numpy_arrays_and_graph_tensors(self):
     with ops.Graph().as_default():
       x1 = array_ops.ones((3, 3))
     x2 = np.ones((3, 3), dtype='float32')
-    with self.assertRaisesRegexp(TypeError,
-                                 'provided list of inputs contains '
-                                 'objects other than \'EagerTensor\''):
+    with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
       math_ops.matmul(x1, x2)
 
   @test_util.run_in_graph_and_eager_modes
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
index d2f947f17723fbb01280d7ef09f327dd64fc938e..5d39038ddd15e2c9efa6de4ca5c906f764fed8cd 100644
--- a/tensorflow/python/keras/engine/base_layer_utils.py
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
 
@@ -55,7 +56,6 @@ def make_variable(name,
                   shape=None,
                   dtype=dtypes.float32,
                   initializer=None,
-                  partition_info=None,
                   trainable=None,
                   caching_device=None,
                   validate_shape=True,
@@ -76,14 +76,12 @@ def make_variable(name,
   rid of this temporary solution.
 
   TODO(fchollet): remove this method when no longer needed.
-  TODO(fchollet): handle `partitioner` argument.
 
   Arguments:
     name: Variable name.
     shape: Variable shape.
     dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
     initializer: Initializer instance (callable).
-    partition_info: Not handled at this time.
     trainable: Whether the variable should be part of the layer's
       "trainable_variables" (e.g. variables, biases)
       or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
@@ -123,8 +121,9 @@ def make_variable(name,
       # Instantiate initializer if provided initializer is a type object.
       if isinstance(initializer, type(init_ops.Initializer)):
         initializer = initializer(dtype=dtype)
-      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-          shape, dtype=dtype, partition_info=partition_info)
+      elif isinstance(initializer, type(init_ops_v2.Initializer)):
+        initializer = initializer()
+      init_val = lambda: initializer(shape, dtype=dtype)
       variable_dtype = dtype.base_dtype
   if use_resource is None:
     use_resource = True
@@ -224,13 +223,116 @@ def collect_previous_mask(input_tensors):
   return masks
 
 
-def have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = nest.flatten(iterable_or_element)
-  return all(hasattr(x, '_keras_history') for x in iterable)
+def have_all_keras_metadata(tensors):
+  return all(hasattr(x, '_keras_history') for x in nest.flatten(tensors))
 
 
 def generate_placeholders_from_shape(shape):
   return array_ops.placeholder(shape=shape, dtype=backend.floatx())
+
+
+def create_keras_history(tensors):
+  """Wraps TensorFlow Operations for compatibility with the Functional API.
+
+  This method checks to see if a Tensor in `tensors` is missing Keras metadata
+  and has its origin in a Keras `Input` Layer. If so, this method will replace
+  the raw TensorFlow Operations that created this tensor with
+  `TensorFlowOpLayer`
+  instances that create identical operations.
+
+  Any Tensors not originating from a Keras `Input` Layer will be treated as
+  constants when constructing `TensorFlowOpLayer` instances.
+
+  Arguments:
+    tensors: A structure of Tensors, some of which come from raw TensorFlow
+      operations and need to have Keras metadata assigned to them.
+  """
+
+  try:
+    _create_keras_history_helper(tensors, set())
+  except AttributeError:
+    # This can happen with sublayers inside of layers in the Functional API.
+    # The error occurs when a Functional Model is running in V2 function
+    # mode and a non-Keras Tensor is passed as an input to a sublayer inside
+    # of another layer.
+    # TODO(omalleyt): Only run `create_keras_history` during Functional API
+    # creation phase.
+    pass
+
+
+def _create_keras_history_helper(tensors, processed_ops=None):
+  """Helper method for `create_keras_history`.
+
+  Arguments:
+    tensors: A structure of Tensors for which to create Keras metadata.
+    processed_ops: TensorFlow operations that have already been wrapped in
+      `TensorFlowOpLayer` instances.
+
+  Returns:
+    The updated set of TensorFlow Operations that have been wrapped
+    in `TensorFlowOpLayer` instances.
+  """
+  # Import of `base_layer` needed in order to create `TensorFlowOpLayer`.
+  # Cannot be imported at top because of circular dependencies.
+  # TODO(omalleyt): Resolve circular dependency.
+  from tensorflow.python.keras.engine import base_layer  # pylint: disable=g-import-not-at-top
+  tensor_list = nest.flatten(tensors)
+  for tensor in tensor_list:
+    if getattr(tensor, '_keras_history', None) is not None:
+      continue
+    op = tensor.op  # The Op that created this Tensor.
+    if op not in processed_ops:
+      # Recursively set `_keras_history`.
+      op_inputs = list(op.inputs)
+      constants = {}
+      layer_inputs = []
+      for i, op_input in enumerate(op_inputs):
+        if uses_keras_input_layers(op_input):
+          layer_inputs.append(op_input)
+        else:
+          # Treat any value not originating from a `keras.Input` as
+          # a constant (Variables currently have `Placeholder` op type
+          # when originating from an eager context
+          # so can't be supported.
+          constants[i] = backend.function([], [op_input])([])
+      processed_ops = _create_keras_history_helper(layer_inputs, processed_ops)
+      name = op.name
+      node_def = op.node_def.SerializeToString()
+      op_layer = base_layer.TensorFlowOpLayer(
+          node_def, constants=constants, name=name)
+      op_layer._add_inbound_node(  # pylint: disable=protected-access
+          layer_inputs, op.outputs)
+      processed_ops.update([op])
+  return processed_ops
+
+
+def uses_keras_input_layers(tensors):
+  """Checks if at least one Tensor in `tensors` originates from a Keras `Input`.
+
+  If so, the Functional API is being used.
+
+  Arguments:
+    tensors: An arbitrary nested structure of Tensors.
+
+  Returns:
+    Bool, whether at least one Tensor originates from a Keras `Input`.
+  """
+  checked_tensors = set()
+  input_tensors = nest.flatten(tensors)
+
+  while input_tensors:
+    if any(
+        getattr(tensor, '_keras_history', None) is not None
+        for tensor in input_tensors):
+      return True
+    checked_tensors.update(input_tensors)
+    new_input_tensors = set()
+    for tensor in input_tensors:
+      try:
+        new_input_tensors.update(tensor.op.inputs)
+      except AttributeError:
+        # In case `tensor` is a Variable created in an Eager
+        # context
+        pass
+    input_tensors = list(new_input_tensors - checked_tensors)
+  return False
diff --git a/tensorflow/python/keras/engine/correctness_test.py b/tensorflow/python/keras/engine/correctness_test.py
index c2f3b040de3269c6921d95d8a845869511ac0634..68634235d1b5731d4359ef0796eaa28eeb9ca002 100644
--- a/tensorflow/python/keras/engine/correctness_test.py
+++ b/tensorflow/python/keras/engine/correctness_test.py
@@ -66,7 +66,10 @@ class SimpleBiasTest(keras_parameterized.TestCase):
 
   def _get_simple_bias_model(self):
     model = testing_utils.get_model_from_layers([Bias()], input_shape=(1,))
-    model.compile(keras.optimizer_v2.gradient_descent.SGD(0.1), 'mae')
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
     return model
 
   def test_simple_bias_fit(self):
@@ -101,7 +104,10 @@ class MultipleInputTest(keras_parameterized.TestCase):
       model = MultiInputSubclassed()
     else:
       model = multi_input_functional()
-    model.compile(keras.optimizer_v2.gradient_descent.SGD(0.1), 'mae')
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
     return model
 
   @parameterized.named_parameters(('subclassed', True), ('functional', False))
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index 4598f63c0222dea7ccd33e05f9472ec3588f5911..224231278bd2e9af7bf3d970a4fc5541a2b3ae84 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities related to distributed training."""
+# pylint:disable=protected-access
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -24,16 +25,22 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
-from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 
 
 def set_weights(distribution_strategy, dist_model, weights):
@@ -338,7 +345,7 @@ def init_restore_or_wait_for_variables():
   session = K._get_session()  # pylint: disable=protected-access
   worker_context = dc_context.get_current_worker_context()
   if not worker_context or worker_context.experimental_should_init:
-    # TODO(yuefengz): if checkpoints exit, restore from checkpoint.
+    # TODO(yuefengz): if checkpoints exist, restore from checkpoint.
     K._initialize_variables(session)  # pylint: disable=protected-access
   else:
     _wait_for_variable_initialization(session)
@@ -385,11 +392,6 @@ def validate_inputs(x, y, distribution_strategy):
     ValueError: if input is not a Dataset or a numpy array(when we use
       MirroredStrategy).
   """
-  if isinstance(x, dict) or isinstance(y, dict):
-    raise ValueError('`DistributionStrategy` does not support inputs of type '
-                     'dict. You must pass a `tf.data.Dataset` object or a '
-                     'numpy array as input.')
-
   if (isinstance(x, iterator_ops.Iterator) or
       isinstance(y, iterator_ops.Iterator)):
     raise ValueError('`DistributionStrategy` does not support inputs of type '
@@ -513,97 +515,371 @@ def get_batch_dimension(iterator):
   return dims[0] if dims else None
 
 
-def get_cpu_device(distribution_strategy):
-  """Returns the CPU device of the TPU host or the default CPU device string.
+def list_to_tuple(maybe_list):
+  """Datasets treat lists specially, so switch them to tuples."""
+  if isinstance(maybe_list, list):
+    return tuple(maybe_list)
+  return maybe_list
 
-  Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
 
-  Returns:
-    A device string which is the TPU host's CPU device in case of
-    TPUDistributionStrategy or the default CPU device string in all other
-    cases.
+def get_iterator(dataset, distribution_strategy):
+  with distribution_strategy.scope():
+    iterator = distribution_strategy.make_dataset_iterator(dataset)
+  initialize_iterator(iterator, distribution_strategy)
+  return iterator
 
-  Raises:
-    NotImplementedError: We currently don't support copying numpy data to
-    multiple hosts in the case of Cloud TPU pods.
-  """
-  if is_tpu_strategy(distribution_strategy):
-    if distribution_strategy.extended.num_hosts > 1:
-      raise NotImplementedError('TPUDistributionStrategy does not '
-                                'support numpy inputs when running on Cloud'
-                                'TPU pods.')
-    return distribution_strategy.extended.get_host_cpu_device(0)
+
+def initialize_iterator(iterator, distribution_strategy):
+  with distribution_strategy.scope():
+    init_op = control_flow_ops.group(iterator.initialize())
+    if not context.executing_eagerly():
+      K.get_session().run(init_op)
+
+
+def _get_input_from_iterator(iterator, model):
+  """Get elements from the iterator and verify the input shape and type."""
+  next_element = iterator.get_next()
+
+  if len(nest.flatten(next_element)) == len(model.inputs):
+    x = next_element
+    y = None
+    sample_weights = None
+  elif len(nest.flatten(next_element)) == (len(model.inputs) +
+                                           len(model.outputs)):
+    x, y = next_element
+    sample_weights = None
   else:
-    # For all strategies except TPUDistributionStrategy
-    # TODO(anjalisridhar): We may need to modify this when we add support for
-    # multi-worker strategy.
-    return '/CPU:0'
+    x, y, sample_weights = next_element
 
+  # Validate that all the elements in x and y are of the same type and shape.
+  validate_distributed_dataset_inputs(
+      model._distribution_strategy, x, y, sample_weights)
+  return x, y, sample_weights
 
-def get_var_for_numpy(distribution_strategy, x):
-  if isinstance(x, list):
-    var_x = tuple([_get_var_for_numpy(distribution_strategy, single_input)
-                   for single_input in x])
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  strategy = model._distribution_strategy
+  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
+  inputs = flatten_perdevice_values(strategy, inputs)
+  targets = flatten_perdevice_values(strategy, targets)
+  if mode == ModeKeys.PREDICT:
+    sample_weights = []
+    targets = []
   else:
-    var_x = _get_var_for_numpy(distribution_strategy, x)
-  return var_x
+    sample_weights = [
+        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
+    ]
+  ins = inputs + targets + sample_weights
+  if mode == ModeKeys.TRAIN and not isinstance(K.symbolic_learning_phase(),
+                                               int):
+    ins += [True]
+  return ins
+
+
+def _custom_compile_for_predict(model):
+  """Custom compile for TPU predict mode."""
+  if not model.built:
+    # Model is not compilable because it does not know its number of inputs
+    # and outputs, nor their shapes and names. We will compile after the first
+    # time the model gets called on training data.
+    return
+  model._is_compiled = True
+  model.total_loss = None
+  model._fit_function = None
+  model._eval_function = None
+  model.train_function = None
+  model.test_function = None
+  model.predict_function = None
+
+
+def _build_network_on_replica(model, mode, inputs=None, targets=None):
+  """Build an updated model on replicas.
+
+  We create a new Keras model while sharing the variables from the old graph.
+  Building a new sub-graph is required since the original keras model creates
+  placeholders for the input and the output that are not accessible till we
+  call iterator.get_next() inside the step_fn for `fit`/`evaluate`/`predict`.
 
+  The sharing of weights and layers between the old and the new model gaurantee
+  that we're using Strategy variables and any updates on either model are
+  reflected correctly in callbacks and loop iterations.
 
-def _get_var_for_numpy(distribution_strategy, input_array):
-  """Creates a variable and assigns the value of the numpy array to it.
+  We need to make sure we share the optimizers between the old and the new model
+  as well so that optimizer state is not lost if the user is running fit
+  multiple times.
 
   Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
-    input_array: The input numpy array whose value will be assigned to the
-      variable we create.
+    model: Model to be replicated across Replicas
+    mode: Which of fit/eval/predict is building the distributed network
+    inputs: Input variables to be passed to the model
+    targets: Target tensor to be passed to model.compile
 
   Returns:
-    The variable to which we will copy the value of the input numpy array.
-
+    A new model with shared layers with the old model.
   """
-  with ops.device(get_cpu_device(distribution_strategy)):
-    # Create and initialize a variable on the CPU device. This is the CPU
-    # device of the host in the case of TPUDistributionStrategy.
-    input_var = variables.VariableV1(array_ops.zeros(input_array.shape,
-                                                     input_array.dtype),
-                                     trainable=False, use_resource=True)
-  K.get_session().run(input_var.initializer)
-
-  # Create a placeholder for the numpy array input slices. We copy the value
-  # of the input numpy array to the variable in slices of size 64 MB to avoid
-  # running into memory issues or RPC message limits.
-  start_placeholder = array_ops.placeholder(dtypes.int64, ())
-  end_placeholder = array_ops.placeholder(dtypes.int64, ())
-  slice_placeholder = array_ops.placeholder(input_var.dtype)
-  assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
-      slice_placeholder)
-
-  # If each batch element is > 64 MB, then we copy each batch element
-  # individually. Otherwise, the slices will be < 128 MB. There might be padding
-  # which might mean that the slices are 128 MB even if the size of the
-  # tensor allocated is less than 128 MB.
-  # This formula gives slices with size:
-  # ceil(64 MB / byte size per batch element) bytes.
-  # Using ceil() guarantees we get a number >= 1.
-
-  # Calculate the size of each batch element.
-  byte_size_per_batch_element = np.prod(input_array.shape[1:]) * \
-                                input_var.dtype.size
-
-  # Calculate number of elements we want to copy per slice.
-  batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
-
-  # Copy slices of the above size starting at 0, except the last slice will be
-  # smaller.
-  start = 0
-  limit = input_array.shape[0]
-  while start < limit:
-    end = min(start + batch_size_per_slice, limit)
-    K.get_session().run(assign_slice_op, feed_dict={
-        start_placeholder: start,
-        end_placeholder: end,
-        slice_placeholder: input_array[start:end]})
-    start = end
-
-  return input_var
+  # Need to do imports here since we run into a circular dependency error.
+  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras.engine import sequential  # pylint: disable=g-import-not-at-top
+
+  # We rely on the internal methods to avoid having share_weights weights in the
+  # public API.
+  if isinstance(model, sequential.Sequential):
+    updated_model = models._clone_sequential_model(model, input_tensors=inputs,
+                                                   share_weights=True)
+  else:
+    updated_model = models._clone_functional_model(model, input_tensors=inputs,
+                                                   share_weights=True)
+
+  # Recast all low precision outputs back to float32 since we only casted
+  # the inputs to bfloat16 and not targets. This is done so that we can preserve
+  # precision when calculating the loss value.
+  def _upcast_low_precision_outputs(output):
+    if output.dtype == dtypes.bfloat16:
+      return math_ops.cast(output, dtypes.float32)
+    else:
+      return output
+  updated_model.outputs = [_upcast_low_precision_outputs(o)
+                           for o in updated_model.outputs]
+
+  if isinstance(targets, tuple):
+    targets = nest.flatten(targets)
+
+  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
+    _custom_compile_for_predict(updated_model)
+  else:
+    updated_model.compile(
+        model.optimizer,
+        model.loss,
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
+        target_tensors=targets)
+  return updated_model
+
+
+def _build_distributed_network(model, strategy, mode, inputs=None,
+                               targets=None):
+  """Create a cloned model on each replica."""
+  with K.get_graph().as_default(), strategy.scope():
+    distributed_model = strategy.extended.call_for_each_replica(
+        _build_network_on_replica,
+        args=(model, mode, inputs, targets))
+    set_distributed_model(model, mode, distributed_model)
+
+
+def _clone_and_build_model(model, mode, inputs=None, targets=None):
+  """Clone and build the given keras_model."""
+  # We need to set the import here since we run into a circular dependency
+  # error.
+  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
+  cloned_model = models.clone_model(model, input_tensors=inputs)
+
+  # Compile and build model.
+  if isinstance(model.optimizer, optimizers.TFOptimizer):
+    optimizer = model.optimizer
+  else:
+    optimizer_config = model.optimizer.get_config()
+    optimizer = model.optimizer.__class__.from_config(optimizer_config)
+
+  # Recast all low precision outputs back to float32 since we only casted
+  # the inputs to bfloat16 and not targets. This is done so that we can preserve
+  # precision when calculating the loss value.
+  def _upcast_low_precision_outputs(output):
+    if output.dtype == dtypes.bfloat16:
+      return math_ops.cast(output, dtypes.float32)
+    else:
+      return output
+  cloned_model.outputs = [_upcast_low_precision_outputs(o)
+                          for o in cloned_model.outputs]
+
+  if isinstance(targets, tuple):
+    targets = nest.flatten(targets)
+  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
+    _custom_compile_for_predict(cloned_model)
+  else:
+    cloned_model.compile(
+        optimizer,
+        model.loss,
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
+        target_tensors=targets)
+  return cloned_model
+
+
+def clone_model_on_replicas(model, strategy, mode, inputs=None, targets=None):
+  """Create a cloned model on each replica."""
+  with K.get_graph().as_default(), strategy.scope():
+    distributed_model = strategy.extended.call_for_each_replica(
+        _clone_and_build_model, args=(model, mode, inputs, targets))
+    set_distributed_model(model, mode, distributed_model)
+  if mode == ModeKeys.TRAIN:
+    model._make_callback_model(distributed_model)
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of distributed model execution."""
+  if context.executing_eagerly():
+    return _make_eager_execution_function(model, mode)
+
+  strategy = model._distribution_strategy
+  if not get_distributed_model(model, mode):
+    if model._compile_distribution:
+      clone_model_on_replicas(model, strategy, mode)
+    else:
+      _build_distributed_network(model, strategy, mode)
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
+
+  with strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs, grouped_updates,
+     grouped_session_args) = strategy.extended.call_for_each_replica(
+         _per_device_function, args=(get_distributed_model(model, mode),))
+
+    # Initialize the variables in the replicated model. This is necessary for
+    # multi-worker training because on some workers, initialization is not
+    # needed. This method does initialization or waiting for initialization
+    # according to the context object of distribute coordinator.
+    init_restore_or_wait_for_variables()
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of update ops on
+    # all the devices over which the model is distributed.
+    (all_inputs, all_outputs, all_updates, all_session_args) = unwrap_values(
+        strategy,
+        grouped_inputs,
+        grouped_outputs,
+        grouped_updates,
+        grouped_session_args,
+        with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        updates=all_updates,
+        name='distributed_{}_function'.format(mode),
+        **all_session_args)
+
+
+def _make_eager_execution_function(model, mode):
+  """Makes function to run one step of distributed model eager execution."""
+  strategy = model._distribution_strategy
+  if not get_distributed_model(model, mode):
+    if model._compile_distribution:
+      clone_model_on_replicas(model, strategy, mode)
+    else:
+      _build_distributed_network(model, strategy, mode)
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs)
+
+  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
+  # the global one.
+  with K.get_graph().as_default(), strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs) = strategy.extended.call_for_each_replica(
+        _per_device_function, args=(get_distributed_model(model, mode),))
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of inptus/outputs
+    # on all the devices over which the model is distributed.
+    (all_inputs, all_outputs, _, _) = unwrap_values(
+        strategy,
+        grouped_inputs,
+        grouped_outputs,
+        with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        name='eager_distributed_{}_function'.format(mode))
+
+
+def _copy_weights_to_distributed_model(original_model, mode):
+  """Copies weights from original model to distributed models."""
+  strategy = original_model._distribution_strategy
+  distributed_model = get_distributed_model(original_model, mode)
+  if strategy:
+    # Copy the weights from the original model to each of the replicated
+    # models.
+    orig_model_weights = original_model.get_weights()
+    first_model = strategy.unwrap(distributed_model)[0]
+    set_weights(strategy, first_model, orig_model_weights)
+
+
+def _copy_weights_to_original_model(model, mode):
+  """Copies weights from first distributed model back to original model."""
+  if model._distribution_strategy and mode == ModeKeys.TRAIN:
+    distributed_model = get_distributed_model(model, mode)
+    updated_weights = model._distribution_strategy.unwrap(
+        distributed_model)[0].get_weights()
+    model.set_weights(updated_weights)
+
+
+def _per_device_aggregate_batch(batch_outs, model, mode):
+  """Aggregates the per-device batch-level outputs from a distributed step."""
+  if model._distribution_strategy is not None and mode == ModeKeys.PREDICT:
+    total_batch_outs = []
+    for i in range(len(model.outputs)):
+      num_replicas = model._distribution_strategy.num_replicas_in_sync
+      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
+      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
+    return total_batch_outs
+  return batch_outs
+
+
+def _reset_metrics(model):
+  if model._distribution_strategy:
+    for mode in [ModeKeys.TRAIN, ModeKeys.TEST, ModeKeys.PREDICT]:
+      distributed_model = get_distributed_model(model, mode)
+      if distributed_model:
+        first_model = model._distribution_strategy.unwrap(distributed_model)[0]
+        first_model.reset_metrics()
+
+
+def get_distributed_model(model, mode):
+  if mode is ModeKeys.TRAIN:
+    return model._distributed_model_train
+  elif mode is ModeKeys.TEST:
+    return model._distributed_model_test
+  elif mode is ModeKeys.PREDICT:
+    return model._distributed_model_predict
+
+
+def set_distributed_model(model, mode, distributed_model):
+  if mode is ModeKeys.TRAIN:
+    model._distributed_model_train = distributed_model
+  elif mode is ModeKeys.TEST:
+    model._distributed_model_test = distributed_model
+  elif mode is ModeKeys.PREDICT:
+    model._distributed_model_predict = distributed_model
+
+
+@tf_contextlib.contextmanager
+def distributed_scope(strategy, learning_phase):
+  with strategy.scope(), K.learning_phase_scope(learning_phase):
+    yield
diff --git a/tensorflow/python/keras/engine/feature_columns_integration_test.py b/tensorflow/python/keras/engine/feature_columns_integration_test.py
index b3f8cfe72585188d631c072b690729054d5db775..3bed40b08462f6907f7dbf41a90dd1503baf6a81 100644
--- a/tensorflow/python/keras/engine/feature_columns_integration_test.py
+++ b/tensorflow/python/keras/engine/feature_columns_integration_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class TestDNNModel(keras.models.Model):
@@ -57,7 +56,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -79,7 +78,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -103,7 +102,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
     dnn_model = TestDNNModel([col_a, col_b], 20)
 
     dnn_model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -124,7 +123,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
     dnn_model = TestDNNModel([col_a, col_b], 20)
 
     dnn_model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -155,7 +154,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
 
     model = keras.models.Model([feature_layer], [output])
 
-    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     loss_weights = [1., 0.5]
     model.compile(
@@ -184,7 +183,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
 
     model = keras.models.Model([fc1, fc2], [output])
 
-    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     loss_weights = [1., 0.5]
     model.compile(
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index bc2cf2fb6e10e6f80f7f56351e57ae2bc5cea726..c6dcedfce2f620b039fc8cfa7c3366d801e9c176 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -77,8 +77,9 @@ class InputLayer(base_layer.Layer):
         dtype = backend.floatx()
       else:
         dtype = backend.dtype(input_tensor)
-    elif input_tensor and input_tensor.dtype != dtype:
-      raise ValueError('`input_tensor.dtype` differs from `dtype`.')
+    elif input_tensor is not None and input_tensor.dtype != dtype:
+      raise ValueError('`input_tensor.dtype` differs from `dtype`: %s vs. %s' %
+                       (input_tensor.dtype, dtype))
     super(InputLayer, self).__init__(dtype=dtype, name=name)
     self.built = True
     self.sparse = sparse
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 1c30c6b3fbfdced0506206ae79b1ef597bfa332b..41f5f319bc625ef044964658e12daf720cd26a0a 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -37,8 +37,8 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.saving import hdf5_format
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -49,6 +49,7 @@ from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
@@ -68,8 +69,64 @@ except ImportError:
 class Network(base_layer.Layer):
   """A `Network` is a composition of layers.
 
-  It is the topological form of a "model". A `Model`
+  `Network` is the topological form of a "model". A `Model`
   is simply a `Network` with added training routines.
+
+  Two types of `Networks` exist: Graph Networks and Subclass Networks. Graph
+  networks are used in the Keras Functional and Sequential APIs. Subclassed
+  networks are used when a user subclasses the `Model` class. In general,
+  more Keras features are supported with Graph Networks than with Subclassed
+  Networks, specifically:
+
+  - Model cloning (`keras.models.clone`)
+  - Serialization (`model.get_config()/from_config`, `model.to_json()/to_yaml()`
+  - Whole-model saving (`model.save()`)
+
+  A Graph Network can be instantiated by passing two arguments to `__init__`.
+  The first argument is the `keras.Input` Tensors that represent the inputs
+  to the Network. The second argument specifies the output Tensors that
+  represent the outputs of this Network. Both arguments can be a nested
+  structure of Tensors.
+
+  Example:
+
+  ```
+  inputs = {'x1': keras.Input(shape=(10,)), 'x2': keras.Input(shape=(1,))}
+  t = keras.layers.Dense(1, activation='relu')(inputs['x1'])
+  outputs = keras.layers.Add()([t, inputs['x2'])
+  network = Network(inputs, outputs)
+  ```
+
+  A Graph Network constructed using the Functional API can also include raw
+  TensorFlow functions, with the exception of functions that create Variables
+  or assign ops.
+
+  Example:
+
+  ```
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(1)(inputs)
+  outputs = tf.nn.relu(x)
+  network = Network(inputs, outputs)
+  ```
+
+  Subclassed Networks can be instantiated via `name` and (optional) `dynamic`
+  keyword arguments. Subclassed Networks keep track of their Layers, and their
+  `call` method can be overridden. Subclassed Networks are typically created
+  indirectly, by subclassing the `Model` class.
+
+  Example:
+
+  ```
+  class MyModel(keras.Model):
+    def __init__(self):
+      super(MyModel, self).__init__(name='my_model', dynamic=False)
+
+      self.layer1 = keras.layers.Dense(10, activation='relu')
+
+    def call(self, inputs):
+      return self.layer1(inputs)
+  ```
   """
 
   def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
@@ -165,14 +222,18 @@ class Network(base_layer.Layer):
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
     # Normalize and set self.inputs, self.outputs.
-    if isinstance(inputs, (list, tuple)):
-      self.inputs = list(inputs)  # Tensor or list of tensors.
-    else:
-      self.inputs = [inputs]
-    if isinstance(outputs, (list, tuple)):
-      self.outputs = list(outputs)
-    else:
-      self.outputs = [outputs]
+    if isinstance(inputs, list) and len(nest.flatten(inputs)) == 1:
+      inputs = inputs[0]
+    if isinstance(outputs, list) and len(nest.flatten(outputs)) == 1:
+      outputs = outputs[0]
+    self._nested_outputs = outputs
+    self._nested_inputs = inputs
+    self.inputs = nest.flatten(inputs)
+    self.outputs = nest.flatten(outputs)
+
+    if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs):
+      base_layer_utils.create_keras_history(self._nested_outputs)
+
     self._validate_graph_inputs_and_outputs()
 
     self._base_init(name=name)
@@ -235,8 +296,8 @@ class Network(base_layer.Layer):
         inbound_layers=[],
         node_indices=[],
         tensor_indices=[],
-        input_tensors=self.inputs,
-        output_tensors=self.outputs)
+        input_tensors=self._nested_inputs,
+        output_tensors=self._nested_outputs)
 
     # Build self.input_names and self.output_names.
     self.input_names = []
@@ -334,9 +395,11 @@ class Network(base_layer.Layer):
     if not getattr(self, '_setattr_tracking', True):
       super(Network, self).__setattr__(name, value)
       return
-    if (isinstance(value, (base_layer.Layer,
-                           data_structures.CheckpointableDataStructure))
-        or checkpointable_layer_utils.has_weights(value)):
+
+    if all(
+        isinstance(v, (base_layer.Layer,
+                       data_structures.CheckpointableDataStructure)) or
+        checkpointable_layer_utils.has_weights(v) for v in nest.flatten(value)):
       try:
         self._is_graph_network
       except AttributeError:
@@ -415,13 +478,7 @@ class Network(base_layer.Layer):
     if not self._is_graph_network:
       return None
 
-    inputs = generic_utils.to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = generic_utils.to_list(mask)
-
-    _, output_masks = self._run_internal_graph(inputs, mask=masks)
+    _, output_masks = self._run_internal_graph(inputs, mask=mask)
     return output_masks
 
   @property
@@ -809,122 +866,83 @@ class Network(base_layer.Layer):
       raise NotImplementedError('When subclassing the `Model` class, you should'
                                 ' implement a `call` method.')
 
-    inputs = generic_utils.to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = generic_utils.to_list(mask)
-    outputs, _ = self._run_internal_graph(inputs,
-                                          training=training,
-                                          mask=masks)
+    outputs, _ = self._run_internal_graph(inputs, training=training, mask=mask)
     return outputs
 
   def _call_and_compute_mask(self, inputs, training=None, mask=None):
-    inputs = generic_utils.to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = generic_utils.to_list(mask)
-    return self._run_internal_graph(inputs,
-                                    training=training,
-                                    mask=masks)
+    return self._run_internal_graph(inputs, training=training, mask=mask)
 
   def compute_output_shape(self, input_shape):
     if not self._is_graph_network:
       return super(Network, self).compute_output_shape(input_shape)
 
-    if isinstance(input_shape, list):
-      input_shapes = []
-      for shape in input_shape:
-        if shape is not None:
-          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
-        else:
-          input_shapes.append(None)
-    else:
-      if input_shape is not None:
-        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
-      else:
-        input_shapes = [None]
+    # Convert any shapes in tuple format to TensorShapes.
+    input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
 
-    if len(input_shapes) != len(self._input_layers):
+    if len(nest.flatten(input_shape)) != len(nest.flatten(self._input_layers)):
       raise ValueError('Invalid input_shape argument ' + str(input_shape) +
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = generic_utils.object_list_uid(input_shapes)
+    cache_key = generic_utils.object_list_uid(input_shape)
     if cache_key in self._output_shape_cache:
-      # Cache hit.
-      output_shapes = self._output_shape_cache[cache_key]
-    else:
-      layers_to_output_shapes = {}
-      for i in range(len(input_shapes)):
-        layer = self._input_layers[i]
-        input_shape = input_shapes[i]
-        # It's an input layer: then `compute_output_shape` is identity,
-        # and there is only one node and one tensor output.
-        shape_key = layer.name + '_0_0'
-        layers_to_output_shapes[shape_key] = input_shape
-
-      depth_keys = list(self._nodes_by_depth.keys())
-      depth_keys.sort(reverse=True)
-      # Iterate over nodes, by depth level.
-      if len(depth_keys) > 1:
-        for depth in depth_keys:
-          nodes = self._nodes_by_depth[depth]
-          for node in nodes:
-            # This is always a single layer, never a list.
-            layer = node.outbound_layer
-            if layer in self._input_layers:
-              # We've already covered the input layers
-              # a few lines above.
-              continue
-            # Potentially redundant list,
-            # same size as node.input_tensors.
-            input_shapes = []
-            for j in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[j]
-              node_index = node.node_indices[j]
-              tensor_index = node.tensor_indices[j]
-              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
-                                                           tensor_index)
-              input_shape = layers_to_output_shapes[shape_key]
-              input_shapes.append(input_shape)
-
-            if len(input_shapes) == 1:
-              output_shape = layer.compute_output_shape(input_shapes[0])
-            else:
-              output_shape = layer.compute_output_shape(input_shapes)
-            if isinstance(output_shape, list):
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(shape).as_list())
-                  for shape in output_shape
-              ]
-            else:
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(output_shape).as_list())
-              ]
-
-            node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
-            for j in range(len(output_shapes)):
-              shape_key = layer.name + '_%s_%s' % (node_index, j)
-              layers_to_output_shapes[shape_key] = output_shapes[j]
-
-        # Read final output shapes from layers_to_output_shapes.
-        output_shapes = []
-        for i in range(len(self._output_layers)):
-          layer, node_index, tensor_index = self._output_coordinates[i]
-          shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
-          output_shapes.append(layers_to_output_shapes[shape_key])
-        # Store in cache.
-        self._output_shape_cache[cache_key] = output_shapes
-
-    if isinstance(output_shapes, list):
-      if len(output_shapes) == 1:
-        return tensor_shape.TensorShape(output_shapes[0])
-      else:
-        return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-    else:
-      return tensor_shape.TensorShape(output_shapes)
+      # Cache hit. Return shapes as TensorShapes.
+      return self._output_shape_cache[cache_key]
+
+    layers_to_output_shapes = {}
+    for layer, shape in zip(self._input_layers, nest.flatten(input_shape)):
+      # It's an input layer: then `compute_output_shape` is identity,
+      # and there is only one node and one tensor..
+      shape_key = layer.name + '_0_0'
+      layers_to_output_shapes[shape_key] = shape
+
+    depth_keys = list(self._nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+    # Iterate over nodes, by depth level.
+    if len(depth_keys) > 1:
+      for depth in depth_keys:
+        nodes = self._nodes_by_depth[depth]
+        for node in nodes:
+          # This is always a single layer, never a list.
+          layer = node.outbound_layer
+          if layer in self._input_layers:
+            # We've already covered the input layers
+            # a few lines above.
+            continue
+          # Potentially redundant list,
+          # same size as node.input_tensors.
+          layer_input_shapes = []
+          for inbound_layer, node_id, tensor_id, _ in node.iterate_inbound():
+            input_layer_key = inbound_layer.name + '_%s_%s' % (node_id,
+                                                               tensor_id)
+            layer_input_shapes.append(layers_to_output_shapes[input_layer_key])
+          layer_input_shapes = nest.pack_sequence_as(node.inbound_layers,
+                                                     layer_input_shapes)
+          # Layers expect shapes to be tuples for `compute_output_shape`.
+          layer_input_shapes = tf_utils.convert_shapes(
+              layer_input_shapes, to_tuples=True)
+          layer_output_shapes = layer.compute_output_shape(layer_input_shapes)
+          # Convert back to TensorShapes.
+          layer_output_shapes = tf_utils.convert_shapes(
+              layer_output_shapes, to_tuples=False)
+
+          node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
+          for j, shape in enumerate(nest.flatten(layer_output_shapes)):
+            shape_key = layer.name + '_%s_%s' % (node_index, j)
+            layers_to_output_shapes[shape_key] = shape
+
+      # Read final output shapes from layers_to_output_shapes.
+      output_shapes = []
+      for i in range(len(self._output_layers)):
+        layer, node_index, tensor_index = self._output_coordinates[i]
+        shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
+        output_shapes.append(layers_to_output_shapes[shape_key])
+      output_shapes = nest.pack_sequence_as(self._nested_outputs, output_shapes)
+      # Store in cache.
+      self._output_shape_cache[cache_key] = output_shapes
+
+    # Return shapes as TensorShapes.
+    return output_shapes
 
   def _run_internal_graph(self, inputs, training=None, mask=None):
     """Computes output tensors for new inputs.
@@ -934,9 +952,9 @@ class Network(base_layer.Layer):
         - Can be run on non-Keras tensors.
 
     Arguments:
-        inputs: List of tensors
+        inputs: Tensor or nested structure of Tensors.
         training: Boolean learning phase.
-        mask: List of masks (tensors or None).
+        mask: (Optional) Tensor or nested structure of Tensors.
 
     Returns:
         Two lists: output_tensors, output_masks
@@ -948,17 +966,20 @@ class Network(base_layer.Layer):
     # the future and 2) Keras is a major user of Network.  If you don't
     # use masking, it does not interfere with regular behavior at all and you
     # can ignore it.
+    inputs = nest.flatten(inputs)
     if mask is None:
       masks = [None for _ in range(len(inputs))]
     else:
-      masks = mask
+      masks = nest.flatten(mask)
+
+    # Dictionary mapping reference tensors to computed tensors.
+    tensor_dict = {}
+    # Dictionary mapping reference tensors to computed masks.
+    mask_dict = {}
 
-    # Dictionary mapping reference tensors to tuples
-    # (computed tensor, compute mask)
-    # we assume a 1:1 mapping from tensor to mask
-    tensor_map = {}
     for x, y, mask in zip(self.inputs, inputs, masks):
-      tensor_map[str(id(x))] = (y, mask)
+      tensor_dict[str(id(x))] = y
+      mask_dict[str(id(x))] = mask
 
     depth_keys = list(self._nodes_by_depth.keys())
     depth_keys.sort(reverse=True)
@@ -967,87 +988,50 @@ class Network(base_layer.Layer):
       for node in nodes:
         # This is always a single layer, never a list.
         layer = node.outbound_layer
-        reference_input_tensors = node.input_tensors
-        reference_output_tensors = node.output_tensors
+        # node_input_tensors = node.input_tensors
+        # node_output_tensors = node.output_tensors
 
-        # If all previous input tensors are available in tensor_map,
-        # then call node.inbound_layer on them.
-        computed_data = []  # List of tuples (input, mask).
-        for x in reference_input_tensors:
-          if str(id(x)) in tensor_map:
-            computed_data.append(tensor_map[str(id(x))])
-
-        if len(computed_data) == len(reference_input_tensors):
+        if all(
+            str(id(tensor)) in tensor_dict
+            for tensor in nest.flatten(node.input_tensors)):
           # Call layer (reapplying ops to new inputs).
           with ops.name_scope(layer.name):
-            if node.arguments:
-              kwargs = node.arguments
-            else:
-              kwargs = {}
+            computed_tensors = nest.map_structure(
+                lambda t: tensor_dict[str(id(t))], node.input_tensors)
+            computed_masks = nest.map_structure(lambda t: mask_dict[str(id(t))],
+                                                node.input_tensors)
+            kwargs = node.arguments or {}
             # Ensure `training` arg propagation if applicable.
             argspec = self._layer_call_argspecs[layer].args
             if 'training' in argspec:
               kwargs.setdefault('training', training)
-
-            if len(computed_data) == 1:
-              computed_tensor, computed_mask = computed_data[0]
-              # Ensure mask propagation if applicable.
-              if 'mask' in argspec:
-                kwargs.setdefault('mask', computed_mask)
-
-              # Compute outputs and masks.
-              if (isinstance(layer, Network) and
-                  layer._compute_output_and_mask_jointly):
-                output_tensors, output_masks = layer._call_and_compute_mask(
-                    computed_tensor, **kwargs)
-              else:
-                if context.executing_eagerly():
-                  output_tensors = layer(computed_tensor, **kwargs)
-                elif layer.dynamic:
-                  output_tensors = layer._symbolic_call(computed_tensor)  # pylint: disable=protected-call
-                else:
-                  output_tensors = layer.call(computed_tensor, **kwargs)
-                if hasattr(layer, 'compute_mask'):
-                  output_masks = layer.compute_mask(computed_tensor,
-                                                    computed_mask)
-                else:
-                  output_masks = [None for _ in output_tensors]
-              computed_tensors = [computed_tensor]
-
+            if 'mask' in argspec:
+              kwargs.setdefault('mask', computed_masks)
+
+            # Compute outputs and masks.
+            output_masks = None
+            if (isinstance(layer, Network) and
+                layer._compute_output_and_mask_jointly):
+              output_tensors, output_masks = layer._call_and_compute_mask(
+                  computed_tensors, **kwargs)
             else:
-              computed_tensors = [x[0] for x in computed_data]
-              computed_masks = [x[1] for x in computed_data]
-              # Ensure mask propagation if applicable.
-              if 'mask' in argspec:
-                kwargs.setdefault('mask', computed_masks)
-
-              # Compute outputs and masks.
-              if (isinstance(layer, Network) and
-                  layer._compute_output_and_mask_jointly):
-                output_tensors, output_masks = layer._call_and_compute_mask(
-                    computed_tensors, **kwargs)
+              if context.executing_eagerly():
+                output_tensors = layer(computed_tensors, **kwargs)
+              elif layer.dynamic:
+                output_tensors = layer._symbolic_call(computed_tensors)  # pylint: disable=protected-call
               else:
-                if context.executing_eagerly():
-                  output_tensors = layer(computed_tensors, **kwargs)
-                elif layer.dynamic:
-                  output_tensors = layer._symbolic_call(computed_tensors)  # pylint: disable=protected-call
-                else:
-                  output_tensors = layer.call(computed_tensors, **kwargs)
-                if hasattr(layer, 'compute_mask'):
-                  output_masks = layer.compute_mask(computed_tensors,
-                                                    computed_masks)
-                else:
-                  output_masks = [None for _ in output_tensors]
-
-            output_tensors = generic_utils.to_list(output_tensors)
+                output_tensors = layer.call(computed_tensors, **kwargs)
+              if hasattr(layer, 'compute_mask'):
+                output_masks = layer.compute_mask(computed_tensors,
+                                                  computed_masks)
             if output_masks is None:
-              output_masks = [None for _ in output_tensors]
-            else:
-              output_masks = generic_utils.to_list(output_masks)
+              output_masks = nest.pack_sequence_as(
+                  output_tensors, [None for _ in nest.flatten(output_tensors)])
 
             if not context.executing_eagerly():
               # Set mask metadata.
-              for x, m in zip(output_tensors, output_masks):
+              for x, m in zip(
+                  nest.flatten(output_tensors), nest.flatten(output_masks)):
                 try:
                   x._keras_mask = m
                 except AttributeError:
@@ -1057,33 +1041,32 @@ class Network(base_layer.Layer):
               layer._handle_activity_regularization(computed_tensors,
                                                     output_tensors)
 
-          # Update tensor_map.
-          for x, y, mask in zip(reference_output_tensors, output_tensors,
-                                output_masks):
-            tensor_map[str(id(x))] = (y, mask)
+          # Update tensor_dict.
+          for x, y, mask in zip(
+              nest.flatten(node.output_tensors), nest.flatten(output_tensors),
+              nest.flatten(output_masks)):
+            tensor_dict[str(id(x))] = y
+            mask_dict[str(id(x))] = mask
 
     output_tensors = []
     output_masks = []
     output_shapes = []
     for x in self.outputs:
-      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
-      tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(backend.int_shape(x))
+      assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)
+      tensor = tensor_dict[str(id(x))]
+      mask = mask_dict[str(id(x))]
+      output_shapes.append(x.shape)
       output_tensors.append(tensor)
       output_masks.append(mask)
 
-    if len(output_tensors) == 1:
-      output_tensors = output_tensors[0]
-      if output_shapes is not None:
-        output_shapes = output_shapes[0]
-      if output_masks is not None:
-        output_masks = output_masks[0]
-
     if output_shapes is not None:
-      input_shapes = [backend.int_shape(x) for x in inputs]
+      input_shapes = [x.shape for x in inputs]
       cache_key = generic_utils.object_list_uid(input_shapes)
-      self._output_shape_cache[cache_key] = output_shapes
+      self._output_shape_cache[cache_key] = nest.pack_sequence_as(
+          self._nested_outputs, output_shapes)
 
+    output_tensors = nest.pack_sequence_as(self._nested_outputs, output_tensors)
+    output_masks = nest.pack_sequence_as(self._nested_outputs, output_masks)
     return output_tensors, output_masks
 
   def get_config(self):
@@ -1132,14 +1115,15 @@ class Network(base_layer.Layer):
             kwargs = {}
           if node.inbound_layers:
             node_data = []
-            for i in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[i]
-              node_index = node.node_indices[i]
-              tensor_index = node.tensor_indices[i]
-              node_key = _make_node_key(inbound_layer.name, node_index)
+            for inbound_layer, node_id, tensor_id, _ in node.iterate_inbound():
+              node_key = _make_node_key(inbound_layer.name, node_id)
               new_node_index = node_conversion_map.get(node_key, 0)
               node_data.append(
-                  [inbound_layer.name, new_node_index, tensor_index, kwargs])
+                  tf_utils.ListWrapper(
+                      [inbound_layer.name, new_node_index, tensor_id, kwargs]))
+            node_data = nest.pack_sequence_as(node.input_tensors, node_data)
+            # Convert ListWrapper to list for backwards compatible configs.
+            node_data = tf_utils.convert_inner_node_data(node_data)
             filtered_inbound_nodes.append(node_data)
       layer_configs.append({
           'name': layer.name,
@@ -1157,8 +1141,12 @@ class Network(base_layer.Layer):
       if node_key not in self._network_nodes:
         continue
       new_node_index = node_conversion_map[node_key]
-      model_inputs.append([layer.name, new_node_index, tensor_index])
+      model_inputs.append(
+          tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]))
+    model_inputs = nest.pack_sequence_as(self._nested_inputs, model_inputs)
+    model_inputs = tf_utils.convert_inner_node_data(model_inputs)
     config['input_layers'] = model_inputs
+
     model_outputs = []
     for i in range(len(self._output_layers)):
       layer, node_index, tensor_index = self._output_coordinates[i]
@@ -1166,7 +1154,10 @@ class Network(base_layer.Layer):
       if node_key not in self._network_nodes:
         continue
       new_node_index = node_conversion_map[node_key]
-      model_outputs.append([layer.name, new_node_index, tensor_index])
+      model_outputs.append(
+          tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]))
+    model_outputs = nest.pack_sequence_as(self._nested_outputs, model_outputs)
+    model_outputs = tf_utils.convert_inner_node_data(model_outputs)
     config['output_layers'] = model_outputs
     return copy.deepcopy(config)
 
@@ -1208,13 +1199,14 @@ class Network(base_layer.Layer):
 
       Arguments:
           layer: layer instance.
-          node_data: node config dict.
+          node_data: Nested structure of `ListWrapper`.
 
       Raises:
-          ValueError: In case of improperly formatted `node_data` dict.
+          ValueError: In case of improperly formatted `node_data`.
       """
       input_tensors = []
-      for input_data in node_data:
+      for input_data in nest.flatten(node_data):
+        input_data = input_data.as_list()
         inbound_layer_name = input_data[0]
         inbound_node_index = input_data[1]
         inbound_tensor_index = input_data[2]
@@ -1224,20 +1216,22 @@ class Network(base_layer.Layer):
           kwargs = input_data[3]
         else:
           raise ValueError('Improperly formatted model config.')
-        if inbound_layer_name not in created_layers:
-          add_unprocessed_node(layer, node_data)
-          return
+
         inbound_layer = created_layers[inbound_layer_name]
         if len(inbound_layer._inbound_nodes) <= inbound_node_index:
           add_unprocessed_node(layer, node_data)
           return
         inbound_node = inbound_layer._inbound_nodes[inbound_node_index]
-        input_tensors.append(inbound_node.output_tensors[inbound_tensor_index])
+        input_tensors.append(
+            nest.flatten(inbound_node.output_tensors)[inbound_tensor_index])
+      input_tensors = nest.pack_sequence_as(node_data, input_tensors)
       # Call layer on its inputs, thus creating the node
       # and building the layer if needed.
-      if input_tensors:
-        if len(input_tensors) == 1:
-          layer(input_tensors[0], **kwargs)
+      if input_tensors is not None:
+        # Preserve compatibility with older configs.
+        flat_input_tensors = nest.flatten(input_tensors)
+        if len(flat_input_tensors) == 1:
+          layer(flat_input_tensors[0], **kwargs)
         else:
           layer(input_tensors, **kwargs)
 
@@ -1258,8 +1252,10 @@ class Network(base_layer.Layer):
       layer = deserialize_layer(layer_data, custom_objects=custom_objects)
       created_layers[layer_name] = layer
 
-      # Gather layer inputs.
+      # Gather layer inputs and convert to `ListWrapper` objects.
       inbound_nodes_data = layer_data['inbound_nodes']
+      inbound_nodes_data = tf_utils.convert_inner_node_data(
+          inbound_nodes_data, wrap=True)
       for node_data in inbound_nodes_data:
         # We don't process nodes (i.e. make layer calls)
         # on the fly because the inbound node may not yet exist,
@@ -1284,18 +1280,27 @@ class Network(base_layer.Layer):
     name = config.get('name')
     input_tensors = []
     output_tensors = []
-    for layer_data in config['input_layers']:
-      layer_name, node_index, tensor_index = layer_data
+
+    input_layers = tf_utils.convert_inner_node_data(
+        config['input_layers'], wrap=True)
+    for layer_data in nest.flatten(input_layers):
+      layer_name, node_index, tensor_index = layer_data.as_list()
       assert layer_name in created_layers
       layer = created_layers[layer_name]
       layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
-      input_tensors.append(layer_output_tensors[tensor_index])
-    for layer_data in config['output_layers']:
-      layer_name, node_index, tensor_index = layer_data
+      input_tensors.append(nest.flatten(layer_output_tensors)[tensor_index])
+
+    output_layers = tf_utils.convert_inner_node_data(
+        config['output_layers'], wrap=True)
+    for layer_data in nest.flatten(output_layers):
+      layer_name, node_index, tensor_index = layer_data.as_list()
       assert layer_name in created_layers
       layer = created_layers[layer_name]
       layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
-      output_tensors.append(layer_output_tensors[tensor_index])
+      output_tensors.append(nest.flatten(layer_output_tensors)[tensor_index])
+
+    input_tensors = nest.pack_sequence_as(input_layers, input_tensors)
+    output_tensors = nest.pack_sequence_as(output_layers, output_tensors)
     return cls(inputs=input_tensors, outputs=output_tensors, name=name)
 
   def save(self, filepath, overwrite=True, include_optimizer=True):
@@ -1421,7 +1426,7 @@ class Network(base_layer.Layer):
         return
     if save_format == 'h5':
       with h5py.File(filepath, 'w') as f:
-        saving.save_weights_to_hdf5_group(f, self.layers)
+        hdf5_format.save_weights_to_hdf5_group(f, self.layers)
     else:
       if context.executing_eagerly():
         session = None
@@ -1429,7 +1434,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
       optimizer = getattr(self, 'optimizer', None)
       if (optimizer
-          and not isinstance(optimizer, checkpointable.CheckpointableBase)):
+          and not isinstance(optimizer, checkpointable.Checkpointable)):
         logging.warning(
             ('This model was compiled with a Keras optimizer (%s) but is being '
              'saved in TensorFlow format with `save_weights`. The model\'s '
@@ -1439,9 +1444,10 @@ class Network(base_layer.Layer):
             % (optimizer,))
       self._checkpointable_saver.save(filepath, session=session)
       # Record this checkpoint so it's visible from tf.train.latest_checkpoint.
-      checkpoint_management.update_checkpoint_state(
+      checkpoint_management.update_checkpoint_state_internal(
           save_dir=os.path.dirname(filepath),
           model_checkpoint_path=filepath,
+          save_relative_paths=True,
           all_model_checkpoint_paths=[filepath])
 
   def load_weights(self, filepath, by_name=False):
@@ -1520,9 +1526,9 @@ class Network(base_layer.Layer):
       if 'layer_names' not in f.attrs and 'model_weights' in f:
         f = f['model_weights']
       if by_name:
-        saving.load_weights_from_hdf5_group_by_name(f, self.layers)
+        hdf5_format.load_weights_from_hdf5_group_by_name(f, self.layers)
       else:
-        saving.load_weights_from_hdf5_group(f, self.layers)
+        hdf5_format.load_weights_from_hdf5_group(f, self.layers)
 
   def _updated_config(self):
     """Util shared between different serialization methods.
@@ -1763,13 +1769,9 @@ def _map_graph_network(inputs, outputs):
     nodes_in_progress.add(node)
 
     # Propagate to all previous tensors connected to this node.
-    for i in range(len(node.inbound_layers)):
-      x = node.input_tensors[i]
-      layer = node.inbound_layers[i]
-      node_index = node.node_indices[i]
-      tensor_index = node.tensor_indices[i]
-      build_map(x, finished_nodes, nodes_in_progress, layer,
-                node_index, tensor_index)
+    for layer, node_index, tensor_index, tensor in node.iterate_inbound():
+      build_map(tensor, finished_nodes, nodes_in_progress, layer, node_index,
+                tensor_index)
 
     finished_nodes.add(node)
     nodes_in_progress.remove(node)
@@ -1801,9 +1803,7 @@ def _map_graph_network(inputs, outputs):
     # Update the depth of inbound nodes.
     # The "depth" of a node is the max of the depths
     # of all layers it is connected to.
-    for i in range(len(node.inbound_layers)):
-      inbound_layer = node.inbound_layers[i]
-      node_index = node.node_indices[i]
+    for inbound_layer, node_index, _, _ in node.iterate_inbound():
       inbound_node = inbound_layer._inbound_nodes[node_index]  # pylint: disable=protected-access
       previous_depth = nodes_depths.get(inbound_node, 0)
       nodes_depths[inbound_node] = max(depth + 1, previous_depth)
@@ -1851,7 +1851,7 @@ def _map_graph_network(inputs, outputs):
     for node in nodes_by_depth[depth]:
       layer = node.outbound_layer
       if layer:
-        for x in node.input_tensors:
+        for x in nest.flatten(node.input_tensors):
           if x not in computable_tensors:
             raise ValueError('Graph disconnected: '
                              'cannot obtain value for tensor ' + str(x) +
@@ -1859,7 +1859,7 @@ def _map_graph_network(inputs, outputs):
                              'The following previous layers '
                              'were accessed without issue: ' +
                              str(layers_with_complete_input))
-        for x in node.output_tensors:
+        for x in nest.flatten(node.output_tensors):
           computable_tensors.append(x)
         layers_with_complete_input.append(layer.name)
 
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 91eba0acabf86f605e111f8d1820471086eb12b5..b4da86d98483c85c22d2beb0d285720fac58407d 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -14,941 +14,11 @@
 # ==============================================================================
 # pylint: disable=protected-access
 """Model saving utilities.
+
+Everything has been moved to keras/saving/. This file will be deleted soon.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-import os
-
-import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.utils import conv_utils
-from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import serialization
-from tensorflow.python.util.tf_export import keras_export
-
-# pylint: disable=g-import-not-at-top
-try:
-  import h5py
-  HDF5_OBJECT_HEADER_LIMIT = 64512
-except ImportError:
-  h5py = None
-
-try:
-  import yaml
-except ImportError:
-  yaml = None
-# pylint: enable=g-import-not-at-top
-
-
-@keras_export('keras.models.save_model')
-def save_model(model, filepath, overwrite=True, include_optimizer=True):
-  """Saves a model to a HDF5 file.
-
-  The saved model contains:
-      - the model's configuration (topology)
-      - the model's weights
-      - the model's optimizer's state (if any)
-
-  Thus the saved model can be reinstantiated in
-  the exact same state, without any of the code
-  used for model definition or training.
-
-  Arguments:
-      model: Keras model instance to be saved.
-      filepath: One of the following:
-          - String, path where to save the model
-          - `h5py.File` object where to save the model
-      overwrite: Whether we should overwrite any existing
-          model at the target location, or instead
-          ask the user with a manual prompt.
-      include_optimizer: If True, save optimizer's state together.
-
-  Raises:
-      ImportError: if h5py is not available.
-  """
-
-  if h5py is None:
-    raise ImportError('`save_model` requires h5py.')
-
-  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  # TODO(psv) Add warning when we save models that contain non-serializable
-  # entities like metrics added using `add_metric` and losses added using
-  # `add_loss.`
-
-  if not isinstance(filepath, h5py.File):
-    # If file exists and should not be overwritten.
-    if not overwrite and os.path.isfile(filepath):
-      proceed = ask_to_proceed_with_overwrite(filepath)
-      if not proceed:
-        return
-
-    f = h5py.File(filepath, mode='w')
-    opened_new_file = True
-  else:
-    f = filepath
-    opened_new_file = False
-
-  try:
-    f.attrs['keras_version'] = str(keras_version).encode('utf8')
-    f.attrs['backend'] = K.backend().encode('utf8')
-    f.attrs['model_config'] = json.dumps(
-        {
-            'class_name': model.__class__.__name__,
-            'config': model.get_config()
-        },
-        default=serialization.get_json_type).encode('utf8')
-
-    model_weights_group = f.create_group('model_weights')
-    model_layers = model.layers
-    save_weights_to_hdf5_group(model_weights_group, model_layers)
-
-    if include_optimizer and model.optimizer:
-      if isinstance(model.optimizer, optimizers.TFOptimizer):
-        logging.warning(
-            'TensorFlow optimizers do not '
-            'make it possible to access '
-            'optimizer attributes or optimizer state '
-            'after instantiation. '
-            'As a result, we cannot save the optimizer '
-            'as part of the model save file.'
-            'You will have to compile your model again after loading it. '
-            'Prefer using a Keras optimizer instead '
-            '(see keras.io/optimizers).')
-      else:
-        f.attrs['training_config'] = json.dumps(
-            {
-                'optimizer_config': {
-                    'class_name': model.optimizer.__class__.__name__,
-                    'config': model.optimizer.get_config()
-                },
-                'loss': model.loss,
-                'metrics': model._compile_metrics,
-                'weighted_metrics': model._compile_weighted_metrics,
-                'sample_weight_mode': model.sample_weight_mode,
-                'loss_weights': model.loss_weights,
-            },
-            default=serialization.get_json_type).encode('utf8')
-
-        # Save optimizer weights.
-        symbolic_weights = getattr(model.optimizer, 'weights')
-        if symbolic_weights:
-          optimizer_weights_group = f.create_group('optimizer_weights')
-          weight_values = K.batch_get_value(symbolic_weights)
-          weight_names = []
-          for w, val in zip(symbolic_weights, weight_values):
-            name = str(w.name)
-            weight_names.append(name.encode('utf8'))
-          optimizer_weights_group.attrs['weight_names'] = weight_names
-          for name, val in zip(weight_names, weight_values):
-            param_dset = optimizer_weights_group.create_dataset(
-                name, val.shape, dtype=val.dtype)
-            if not val.shape:
-              # scalar
-              param_dset[()] = val
-            else:
-              param_dset[:] = val
-    f.flush()
-  finally:
-    if opened_new_file:
-      f.close()
-
-
-@keras_export('keras.models.load_model')
-def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
-  """Loads a model saved via `save_model`.
-
-  Arguments:
-      filepath: One of the following:
-          - String, path to the saved model
-          - `h5py.File` object from which to load the model
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-      compile: Boolean, whether to compile the model
-          after loading.
-
-  Returns:
-      A Keras model instance. If an optimizer was found
-      as part of the saved model, the model is already
-      compiled. Otherwise, the model is uncompiled and
-      a warning will be displayed. When `compile` is set
-      to False, the compilation is omitted without any
-      warning.
-
-  Raises:
-      ImportError: if h5py is not available.
-      ValueError: In case of an invalid savefile.
-  """
-  if h5py is None:
-    raise ImportError('`load_model` requires h5py.')
-
-  if not custom_objects:
-    custom_objects = {}
-
-  def convert_custom_objects(obj):
-    """Handles custom object lookup.
-
-    Arguments:
-        obj: object, dict, or list.
-
-    Returns:
-        The same structure, where occurrences
-            of a custom object name have been replaced
-            with the custom object.
-    """
-    if isinstance(obj, list):
-      deserialized = []
-      for value in obj:
-        deserialized.append(convert_custom_objects(value))
-      return deserialized
-    if isinstance(obj, dict):
-      deserialized = {}
-      for key, value in obj.items():
-        deserialized[key] = convert_custom_objects(value)
-      return deserialized
-    if obj in custom_objects:
-      return custom_objects[obj]
-    return obj
-
-  opened_new_file = not isinstance(filepath, h5py.File)
-  if opened_new_file:
-    f = h5py.File(filepath, mode='r')
-  else:
-    f = filepath
-
-  model = None
-  try:
-    # instantiate model
-    model_config = f.attrs.get('model_config')
-    if model_config is None:
-      raise ValueError('No model found in config file.')
-    model_config = json.loads(model_config.decode('utf-8'))
-    model = model_from_config(model_config, custom_objects=custom_objects)
-
-    # set weights
-    load_weights_from_hdf5_group(f['model_weights'], model.layers)
-
-    if compile:
-      # instantiate optimizer
-      training_config = f.attrs.get('training_config')
-      if training_config is None:
-        logging.warning('No training configuration found in save file: '
-                        'the model was *not* compiled. Compile it manually.')
-        return model
-      training_config = json.loads(training_config.decode('utf-8'))
-      optimizer_config = training_config['optimizer_config']
-      optimizer = optimizers.deserialize(
-          optimizer_config, custom_objects=custom_objects)
-
-      # Recover loss functions and metrics.
-      loss = convert_custom_objects(training_config['loss'])
-      metrics = convert_custom_objects(training_config['metrics'])
-      weighted_metrics = convert_custom_objects(
-          training_config.get('weighted_metrics', None))
-      sample_weight_mode = training_config['sample_weight_mode']
-      loss_weights = training_config['loss_weights']
-
-      # Compile model.
-      model.compile(
-          optimizer=optimizer,
-          loss=loss,
-          metrics=metrics,
-          weighted_metrics=weighted_metrics,
-          loss_weights=loss_weights,
-          sample_weight_mode=sample_weight_mode)
-
-      # Set optimizer weights.
-      if 'optimizer_weights' in f:
-        # Build train function (to get weight updates).
-        # Models that aren't graph networks must wait until they are called
-        # with data to _make_train_function() and so can't load optimizer
-        # weights.
-        if model._is_graph_network:  # pylint: disable=protected-access
-          model._make_train_function()
-          optimizer_weights_group = f['optimizer_weights']
-          optimizer_weight_names = [
-              n.decode('utf8')
-              for n in optimizer_weights_group.attrs['weight_names']
-          ]
-          optimizer_weight_values = [
-              optimizer_weights_group[n] for n in optimizer_weight_names
-          ]
-          try:
-            model.optimizer.set_weights(optimizer_weight_values)
-          except ValueError:
-            logging.warning('Error in loading the saved optimizer '
-                            'state. As a result, your model is '
-                            'starting with a freshly initialized '
-                            'optimizer.')
-        else:
-          logging.warning('Sequential models without an `input_shape` '
-                          'passed to the first layer cannot reload their '
-                          'optimizer state. As a result, your model is'
-                          'starting with a freshly initialized optimizer.')
-
-  finally:
-    if opened_new_file:
-      f.close()
-  return model
-
-
-@keras_export('keras.models.model_from_config')
-def model_from_config(config, custom_objects=None):
-  """Instantiates a Keras model from its config.
-
-  Arguments:
-      config: Configuration dictionary.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-
-  Raises:
-      TypeError: if `config` is not a dictionary.
-  """
-  if isinstance(config, list):
-    raise TypeError('`model_from_config` expects a dictionary, not a list. '
-                    'Maybe you meant to use '
-                    '`Sequential.from_config(config)`?')
-  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
-  return deserialize(config, custom_objects=custom_objects)
-
-
-@keras_export('keras.models.model_from_yaml')
-def model_from_yaml(yaml_string, custom_objects=None):
-  """Parses a yaml model configuration file and returns a model instance.
-
-  Arguments:
-      yaml_string: YAML string encoding a model configuration.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-
-  Raises:
-      ImportError: if yaml module is not found.
-  """
-  if yaml is None:
-    raise ImportError('Requires yaml module installed (`pip install pyyaml`).')
-  config = yaml.load(yaml_string)
-  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
-  return deserialize(config, custom_objects=custom_objects)
-
-
-@keras_export('keras.models.model_from_json')
-def model_from_json(json_string, custom_objects=None):
-  """Parses a JSON model configuration file and returns a model instance.
-
-  Arguments:
-      json_string: JSON string encoding a model configuration.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-  """
-  config = json.loads(json_string)
-  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
-  return deserialize(config, custom_objects=custom_objects)
-
-
-def preprocess_weights_for_loading(layer,
-                                   weights,
-                                   original_keras_version=None,
-                                   original_backend=None):
-  """Preprocess layer weights between different Keras formats.
-
-  Converts layers weights from Keras 1 format to Keras 2 and also weights of
-  CuDNN layers in Keras 2.
-
-  Arguments:
-      layer: Layer instance.
-      weights: List of weights values (Numpy arrays).
-      original_keras_version: Keras version for the weights, as a string.
-      original_backend: Keras backend the weights were trained with,
-          as a string.
-
-  Returns:
-      A list of weights values (Numpy arrays).
-  """
-  def convert_nested_bidirectional(weights):
-    """Converts layers nested in `Bidirectional` wrapper.
-
-    This function uses `preprocess_weights_for_loading()` for converting
-    layers.
-
-    Arguments:
-        weights: List of weights values (Numpy arrays).
-
-    Returns:
-        A list of weights values (Numpy arrays).
-    """
-    num_weights_per_layer = len(weights) // 2
-    forward_weights = preprocess_weights_for_loading(
-        layer.forward_layer, weights[:num_weights_per_layer],
-        original_keras_version, original_backend)
-    backward_weights = preprocess_weights_for_loading(
-        layer.backward_layer, weights[num_weights_per_layer:],
-        original_keras_version, original_backend)
-    return forward_weights + backward_weights
-
-  def convert_nested_time_distributed(weights):
-    """Converts layers nested in `TimeDistributed` wrapper.
-
-    This function uses `preprocess_weights_for_loading()` for converting nested
-    layers.
-
-    Arguments:
-        weights: List of weights values (Numpy arrays).
-
-    Returns:
-        A list of weights values (Numpy arrays).
-    """
-    return preprocess_weights_for_loading(
-        layer.layer, weights, original_keras_version, original_backend)
-
-  def convert_nested_model(weights):
-    """Converts layers nested in `Model` or `Sequential`.
-
-    This function uses `preprocess_weights_for_loading()` for converting nested
-    layers.
-
-    Arguments:
-        weights: List of weights values (Numpy arrays).
-
-    Returns:
-        A list of weights values (Numpy arrays).
-    """
-    new_weights = []
-    # trainable weights
-    for sublayer in layer.layers:
-      num_weights = len(sublayer.trainable_weights)
-      if num_weights > 0:
-        new_weights.extend(preprocess_weights_for_loading(
-            layer=sublayer,
-            weights=weights[:num_weights],
-            original_keras_version=original_keras_version,
-            original_backend=original_backend))
-        weights = weights[num_weights:]
-
-    # non-trainable weights
-    for sublayer in layer.layers:
-      num_weights = len([l for l in sublayer.weights
-                         if l not in sublayer.trainable_weights])
-      if num_weights > 0:
-        new_weights.extend(preprocess_weights_for_loading(
-            layer=sublayer,
-            weights=weights[:num_weights],
-            original_keras_version=original_keras_version,
-            original_backend=original_backend))
-        weights = weights[num_weights:]
-    return new_weights
-
-  # Convert layers nested in Bidirectional/Model/Sequential.
-  # Both transformation should be ran for both Keras 1->2 conversion
-  # and for conversion of CuDNN layers.
-  if layer.__class__.__name__ == 'Bidirectional':
-    weights = convert_nested_bidirectional(weights)
-  if layer.__class__.__name__ == 'TimeDistributed':
-    weights = convert_nested_time_distributed(weights)
-  elif layer.__class__.__name__ in ['Model', 'Sequential']:
-    weights = convert_nested_model(weights)
-
-  if original_keras_version == '1':
-    if layer.__class__.__name__ == 'TimeDistributed':
-      weights = preprocess_weights_for_loading(
-          layer.layer, weights, original_keras_version, original_backend)
-
-    if layer.__class__.__name__ == 'Conv1D':
-      shape = weights[0].shape
-      # Handle Keras 1.1 format
-      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
-        # Legacy shape:
-        # (filters, input_dim, filter_length, 1)
-        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
-                                                           1)
-        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
-      weights[0] = weights[0][:, 0, :, :]
-
-    if layer.__class__.__name__ == 'Conv2D':
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, kernel_rows, kernel_cols)
-        # new: (kernel_rows, kernel_cols, stack_size, filters)
-        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
-
-    if layer.__class__.__name__ == 'Conv2DTranspose':
-      if layer.data_format == 'channels_last':
-        # old: (kernel_rows, kernel_cols, stack_size, filters)
-        # new: (kernel_rows, kernel_cols, filters, stack_size)
-        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, kernel_rows, kernel_cols)
-        # new: (kernel_rows, kernel_cols, filters, stack_size)
-        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
-
-    if layer.__class__.__name__ == 'Conv3D':
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, ...)
-        # new: (..., stack_size, filters)
-        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
-
-    if layer.__class__.__name__ == 'GRU':
-      if len(weights) == 9:
-        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[4], weights[7]], axis=-1)
-        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ == 'LSTM':
-      if len(weights) == 12:
-        # old: i, c, f, o
-        # new: i, f, c, o
-        kernel = np.concatenate(
-            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
-        bias = np.concatenate(
-            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ == 'ConvLSTM2D':
-      if len(weights) == 12:
-        kernel = np.concatenate(
-            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
-        bias = np.concatenate(
-            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
-        if layer.data_format == 'channels_first':
-          # old: (filters, stack_size, kernel_rows, kernel_cols)
-          # new: (kernel_rows, kernel_cols, stack_size, filters)
-          kernel = np.transpose(kernel, (2, 3, 1, 0))
-          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
-        weights = [kernel, recurrent_kernel, bias]
-
-  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
-  if layer.__class__.__name__ in conv_layers:
-    if original_backend == 'theano':
-      weights[0] = conv_utils.convert_kernel(weights[0])
-      if layer.__class__.__name__ == 'ConvLSTM2D':
-        weights[1] = conv_utils.convert_kernel(weights[1])
-    if K.int_shape(layer.weights[0]) != weights[0].shape:
-      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
-      if layer.__class__.__name__ == 'ConvLSTM2D':
-        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
-
-  # convert CuDNN layers
-  return _convert_rnn_weights(layer, weights)
-
-
-def _convert_rnn_weights(layer, weights):
-  """Converts weights for RNN layers between native and CuDNN format.
-
-  Input kernels for each gate are transposed and converted between Fortran
-  and C layout, recurrent kernels are transposed. For LSTM biases are summed/
-  split in half, for GRU biases are reshaped.
-
-  Weights can be converted in both directions between `LSTM` and`CuDNNSLTM`
-  and between `CuDNNGRU` and `GRU(reset_after=True)`. Default `GRU` is not
-  compatible with `CuDNNGRU`.
-
-  For missing biases in `LSTM`/`GRU` (`use_bias=False`) no conversion is made.
-
-  Arguments:
-      layer: Target layer instance.
-      weights: List of source weights values (input kernels, recurrent
-          kernels, [biases]) (Numpy arrays).
-
-  Returns:
-      A list of converted weights values (Numpy arrays).
-
-  Raises:
-      ValueError: for incompatible GRU layer/weights or incompatible biases
-  """
-
-  def transform_kernels(kernels, func, n_gates):
-    """Transforms kernel for each gate separately using given function.
-
-    Arguments:
-        kernels: Stacked array of kernels for individual gates.
-        func: Function applied to kernel of each gate.
-        n_gates: Number of gates (4 for LSTM, 3 for GRU).
-
-    Returns:
-        Stacked array of transformed kernels.
-    """
-    return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)])
-
-  def transpose_input(from_cudnn):
-    """Makes a function that transforms input kernels from/to CuDNN format.
-
-    It keeps the shape, but changes between the layout (Fortran/C). Eg.:
-
-    ```
-    Keras                 CuDNN
-    [[0, 1, 2],  <--->  [[0, 2, 4],
-     [3, 4, 5]]          [1, 3, 5]]
-    ```
-
-    It can be passed to `transform_kernels()`.
-
-    Arguments:
-        from_cudnn: `True` if source weights are in CuDNN format, `False`
-            if they're in plain Keras format.
-
-    Returns:
-        Function that converts input kernel to the other format.
-    """
-    order = 'F' if from_cudnn else 'C'
-
-    def transform(kernel):
-      return kernel.T.reshape(kernel.shape, order=order)
-
-    return transform
-
-  target_class = layer.__class__.__name__
-
-  # convert the weights between CuDNNLSTM and LSTM
-  if target_class in ['LSTM', 'CuDNNLSTM'] and len(weights) == 3:
-    # determine if we're loading a CuDNNLSTM layer
-    # from the number of bias weights:
-    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
-    # if there's no bias weight in the file, skip this conversion
-    units = weights[1].shape[0]
-    bias_shape = weights[2].shape
-    n_gates = 4
-
-    if bias_shape == (2 * units * n_gates,):
-      source = 'CuDNNLSTM'
-    elif bias_shape == (units * n_gates,):
-      source = 'LSTM'
-    else:
-      raise ValueError('Invalid bias shape: ' + str(bias_shape))
-
-    def convert_lstm_weights(weights, from_cudnn=True):
-      """Converts the weights between CuDNNLSTM and LSTM.
-
-      Arguments:
-        weights: Original weights.
-        from_cudnn: Indicates whether original weights are from CuDNN layer.
-
-      Returns:
-        Updated weights compatible with LSTM.
-      """
-
-      # Transpose (and reshape) input and recurrent kernels
-      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
-                                  n_gates)
-      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
-      if from_cudnn:
-        # merge input and recurrent biases into a single set
-        biases = np.sum(np.split(weights[2], 2, axis=0), axis=0)
-      else:
-        # Split single set of biases evenly to two sets. The way of
-        # splitting doesn't matter as long as the two sets sum is kept.
-        biases = np.tile(0.5 * weights[2], 2)
-      return [kernels, recurrent_kernels, biases]
-
-    if source != target_class:
-      weights = convert_lstm_weights(weights, from_cudnn=source == 'CuDNNLSTM')
-
-  # convert the weights between CuDNNGRU and GRU(reset_after=True)
-  if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3:
-    # We can determine the source of the weights from the shape of the bias.
-    # If there is no bias we skip the conversion since
-    # CuDNNGRU always has biases.
-
-    units = weights[1].shape[0]
-    bias_shape = weights[2].shape
-    n_gates = 3
-
-    def convert_gru_weights(weights, from_cudnn=True):
-      """Converts the weights between CuDNNGRU and GRU.
-
-      Arguments:
-        weights: Original weights.
-        from_cudnn: Indicates whether original weights are from CuDNN layer.
-
-      Returns:
-        Updated weights compatible with GRU.
-      """
-
-      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
-                                  n_gates)
-      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
-      biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1)
-      return [kernels, recurrent_kernels, biases]
-
-    if bias_shape == (2 * units * n_gates,):
-      source = 'CuDNNGRU'
-    elif bias_shape == (2, units * n_gates):
-      source = 'GRU(reset_after=True)'
-    elif bias_shape == (units * n_gates,):
-      source = 'GRU(reset_after=False)'
-    else:
-      raise ValueError('Invalid bias shape: ' + str(bias_shape))
-
-    if target_class == 'CuDNNGRU':
-      target = 'CuDNNGRU'
-    elif layer.reset_after:
-      target = 'GRU(reset_after=True)'
-    else:
-      target = 'GRU(reset_after=False)'
-
-    # only convert between different types
-    if source != target:
-      types = (source, target)
-      if 'GRU(reset_after=False)' in types:
-        raise ValueError('%s is not compatible with %s' % types)
-      if source == 'CuDNNGRU':
-        weights = convert_gru_weights(weights, from_cudnn=True)
-      elif source == 'GRU(reset_after=True)':
-        weights = convert_gru_weights(weights, from_cudnn=False)
-
-  return weights
-
-
-def save_weights_to_hdf5_group(f, layers):
-  """Saves the weights of a list of layers to a HDF5 group.
-
-  Arguments:
-      f: HDF5 group.
-      layers: List of layer instances.
-  """
-  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  save_attributes_to_hdf5_group(
-      f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
-  f.attrs['backend'] = K.backend().encode('utf8')
-  f.attrs['keras_version'] = str(keras_version).encode('utf8')
-
-  for layer in layers:
-    g = f.create_group(layer.name)
-    symbolic_weights = layer.weights
-    weight_values = K.batch_get_value(symbolic_weights)
-    weight_names = []
-    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
-      if hasattr(w, 'name') and w.name:
-        name = str(w.name)
-      else:
-        name = 'param_' + str(i)
-      weight_names.append(name.encode('utf8'))
-    save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
-    for name, val in zip(weight_names, weight_values):
-      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
-      if not val.shape:
-        # scalar
-        param_dset[()] = val
-      else:
-        param_dset[:] = val
-
-
-def load_weights_from_hdf5_group(f, layers):
-  """Implements topological (order-based) weight loading.
-
-  Arguments:
-      f: A pointer to a HDF5 group.
-      layers: a list of target layers.
-
-  Raises:
-      ValueError: in case of mismatch between provided layers
-          and weights file.
-  """
-  if 'keras_version' in f.attrs:
-    original_keras_version = f.attrs['keras_version'].decode('utf8')
-  else:
-    original_keras_version = '1'
-  if 'backend' in f.attrs:
-    original_backend = f.attrs['backend'].decode('utf8')
-  else:
-    original_backend = None
-
-  filtered_layers = []
-  for layer in layers:
-    weights = layer.weights
-    if weights:
-      filtered_layers.append(layer)
-
-  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
-  filtered_layer_names = []
-  for name in layer_names:
-    g = f[name]
-    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    if weight_names:
-      filtered_layer_names.append(name)
-  layer_names = filtered_layer_names
-  if len(layer_names) != len(filtered_layers):
-    raise ValueError('You are trying to load a weight file '
-                     'containing ' + str(len(layer_names)) +
-                     ' layers into a model with ' + str(len(filtered_layers)) +
-                     ' layers.')
-
-  # We batch weight value assignments in a single backend call
-  # which provides a speedup in TensorFlow.
-  weight_value_tuples = []
-  for k, name in enumerate(layer_names):
-    g = f[name]
-    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
-    layer = filtered_layers[k]
-    symbolic_weights = layer.weights
-    weight_values = preprocess_weights_for_loading(
-        layer, weight_values, original_keras_version, original_backend)
-    if len(weight_values) != len(symbolic_weights):
-      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
-                       '" in the current model) was found to '
-                       'correspond to layer ' + name + ' in the save file. '
-                       'However the new layer ' + layer.name + ' expects ' +
-                       str(len(symbolic_weights)) +
-                       ' weights, but the saved weights have ' +
-                       str(len(weight_values)) + ' elements.')
-    weight_value_tuples += zip(symbolic_weights, weight_values)
-  K.batch_set_value(weight_value_tuples)
-
-
-def load_weights_from_hdf5_group_by_name(f, layers):
-  """Implements name-based weight loading.
-
-  (instead of topological weight loading).
-
-  Layers that have no matching name are skipped.
-
-  Arguments:
-      f: A pointer to a HDF5 group.
-      layers: a list of target layers.
-
-  Raises:
-      ValueError: in case of mismatch between provided layers
-          and weights file.
-  """
-  if 'keras_version' in f.attrs:
-    original_keras_version = f.attrs['keras_version'].decode('utf8')
-  else:
-    original_keras_version = '1'
-  if 'backend' in f.attrs:
-    original_backend = f.attrs['backend'].decode('utf8')
-  else:
-    original_backend = None
-
-  # New file format.
-  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
-
-  # Reverse index of layer name to list of layers with name.
-  index = {}
-  for layer in layers:
-    if layer.name:
-      index.setdefault(layer.name, []).append(layer)
-
-  # We batch weight value assignments in a single backend call
-  # which provides a speedup in TensorFlow.
-  weight_value_tuples = []
-  for k, name in enumerate(layer_names):
-    g = f[name]
-    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
-
-    for layer in index.get(name, []):
-      symbolic_weights = layer.weights
-      weight_values = preprocess_weights_for_loading(
-          layer, weight_values, original_keras_version, original_backend)
-      if len(weight_values) != len(symbolic_weights):
-        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
-                         '") expects ' + str(len(symbolic_weights)) +
-                         ' weight(s), but the saved weights' + ' have ' +
-                         str(len(weight_values)) + ' element(s).')
-      # Set values.
-      for i in range(len(weight_values)):
-        if K.int_shape(symbolic_weights[i]) != weight_values[i].shape:
-          raise ValueError('Layer #' + str(k) +' (named "' + layer.name +
-                           '"), weight ' + str(symbolic_weights[i]) +
-                           ' has shape {}'.format(K.int_shape(
-                               symbolic_weights[i])) +
-                           ', but the saved weight has shape ' +
-                           str(weight_values[i].shape) + '.')
-
-        else:
-          weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
-  K.batch_set_value(weight_value_tuples)
-
-
-def save_attributes_to_hdf5_group(group, name, data):
-  """Saves attributes (data) of the specified name into the HDF5 group.
-
-  This method deals with an inherent problem of HDF5 file which is not
-  able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
-
-  Arguments:
-      group: A pointer to a HDF5 group.
-      name: A name of the attributes to save.
-      data: Attributes data to store.
-
-  Raises:
-    RuntimeError: If any single attribute is too large to be saved.
-  """
-  # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
-  # because in that case even chunking the array would not make the saving
-  # possible.
-  bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
-
-  # Expecting this to never be true.
-  if bad_attributes:
-    raise RuntimeError('The following attributes cannot be saved to HDF5 '
-                       'file because they are larger than %d bytes: %s' %
-                       (HDF5_OBJECT_HEADER_LIMIT,
-                        ', '.join([x for x in bad_attributes])))
-
-  data_npy = np.asarray(data)
-
-  num_chunks = 1
-  chunked_data = np.array_split(data_npy, num_chunks)
-
-  # This will never loop forever thanks to the test above.
-  while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
-    num_chunks += 1
-    chunked_data = np.array_split(data_npy, num_chunks)
-
-  if num_chunks > 1:
-    for chunk_id, chunk_data in enumerate(chunked_data):
-      group.attrs['%s%d' % (name, chunk_id)] = chunk_data
-  else:
-    group.attrs[name] = data
-
-
-def load_attributes_from_hdf5_group(group, name):
-  """Loads attributes of the specified name from the HDF5 group.
-
-  This method deals with an inherent problem
-  of HDF5 file which is not able to store
-  data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
-
-  Arguments:
-      group: A pointer to a HDF5 group.
-      name: A name of the attributes to load.
-
-  Returns:
-      data: Attributes data.
-  """
-  if name in group.attrs:
-    data = [n.decode('utf8') for n in group.attrs[name]]
-  else:
-    data = []
-    chunk_id = 0
-    while '%s%d' % (name, chunk_id) in group.attrs:
-      data.extend(
-          [n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
-      chunk_id += 1
-  return data
+from tensorflow.python.keras.saving import *  # pylint: disable=wildcard-import
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 5773d6e44b44e1b76dedd294af1ee68da0fc90e0..37eb2840b3ba5d5574a326cdae14f1291ab39749 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -33,6 +33,7 @@ from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import keras_export
 
@@ -86,8 +87,8 @@ class Sequential(Model):
   model.add(Dense(32))
   model.weights  # returns list of length 4
 
-  When using the delayed-build pattern (no input shape specified), you can
-  choose to manually build your model by calling `build(batch_input_shape)`:
+  # When using the delayed-build pattern (no input shape specified), you can
+  # choose to manually build your model by calling `build(batch_input_shape)`:
   model = Sequential()
   model.add(Dense(32))
   model.add(Dense(32))
@@ -150,7 +151,7 @@ class Sequential(Model):
     if not self._layers:
       if isinstance(layer, InputLayer):
         # Corner case where the user passes an InputLayer layer via `add`.
-        assert len(layer._inbound_nodes[-1].output_tensors) == 1
+        assert len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) == 1
         set_inputs = True
       else:
         batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer)
@@ -168,12 +169,14 @@ class Sequential(Model):
 
       if set_inputs:
         # If an input layer (placeholder) is available.
-        if len(layer._inbound_nodes[-1].output_tensors) != 1:
+        if len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) != 1:
           raise ValueError('All layers in a Sequential model '
                            'should have a single output tensor. '
                            'For multi-output layers, '
                            'use the functional API.')
-        self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
+        self.outputs = [
+            nest.flatten(layer._inbound_nodes[-1].output_tensors)[0]
+        ]
         self.inputs = layer_utils.get_source_inputs(self.outputs[0])
 
     elif self.outputs:
@@ -242,8 +245,11 @@ class Sequential(Model):
     if not self.built and self._is_graph_network:
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
 
-    x = inputs
+    outputs = inputs  # handle the corner case where self.layers is empty
     for layer in self.layers:
+      # During each iteration, `inputs` are the inputs to `layer`, and `outputs`
+      # are the outputs of `layer` applied to `inputs`. At the end of each
+      # iteration `inputs` is set to `outputs` to prepare for the next layer.
       kwargs = {}
       argspec = self._layer_call_argspecs[layer].args
       if 'mask' in argspec:
@@ -252,26 +258,34 @@ class Sequential(Model):
         kwargs['training'] = training
 
       if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
-        x, mask = layer._call_and_compute_mask(x, **kwargs)
+        outputs, mask = layer._call_and_compute_mask(inputs, **kwargs)
       else:
         if not layer.built:
           # Build layer if applicable.
           with ops.name_scope(layer._name_scope()):
-            layer._maybe_build(x)
+            layer._maybe_build(inputs)
           layer.built = True
-        if context.executing_eagerly():
-          x = layer(x, **kwargs)
-        elif layer.dynamic:
-          x = layer._symbolic_call(x)
-        else:
-          x = layer.call(x, **kwargs)
         if layer.supports_masking:
-          mask = layer.compute_mask(x, mask)
+          mask = layer.compute_mask(inputs, mask)
         else:
           mask = None
+
+        if context.executing_eagerly():
+          # __call__ handles activity regularization.
+          outputs = layer(inputs, **kwargs)
+        elif layer.dynamic:
+          outputs = layer._symbolic_call(inputs)
+          layer._handle_activity_regularization(inputs, outputs)
+        else:
+          outputs = layer.call(inputs, **kwargs)
+          layer._handle_activity_regularization(inputs, outputs)
+
       if not context.executing_eagerly():
-        x._keras_mask = mask
-    return x, mask
+        outputs._keras_mask = mask
+
+      # `outputs` will be the inputs to the next layer.
+      inputs = outputs
+    return outputs, mask
 
   def compute_output_shape(self, input_shape):
     shape = input_shape
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 30a41e39b714534260e46cc7d9f446f42b29b929..a65c200ce66070bd4d713bfc1ebda8d32f7435c3 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -30,7 +30,6 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class TestSequential(keras_parameterized.TestCase):
@@ -57,16 +56,20 @@ class TestSequential(keras_parameterized.TestCase):
 
     model = testing_utils.get_small_sequential_mlp(
         num_hidden, num_classes, input_dim)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
     model.fit(x, y, epochs=1)
     model.pop()
     self.assertEqual(len(model.layers), 1)
     self.assertEqual(model.output_shape, (None, num_hidden))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     y = np.random.random((batch_size, num_hidden))
     model.fit(x, y, epochs=1)
 
@@ -92,7 +95,7 @@ class TestSequential(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
@@ -117,7 +120,7 @@ class TestSequential(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
@@ -148,7 +151,7 @@ class TestSequential(keras_parameterized.TestCase):
         else:
           model = testing_utils.get_small_sequential_mlp(10, 4, input_dim=3)
         model.compile(
-            optimizer=rmsprop.RMSPropOptimizer(1e-3),
+            optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
         return model
@@ -264,7 +267,7 @@ class TestSequential(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertFalse(model.built)
@@ -304,14 +307,30 @@ class TestSequential(keras_parameterized.TestCase):
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 8)
 
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_sequential_deferred_manual_build(self):
+    model = testing_utils.get_small_sequential_mlp(4, 5)
+    self.assertFalse(model.built)
+    model(array_ops.zeros([1, 2]))
+    self.assertTrue(model.built)
+    self.assertEqual(len(model.outputs), 0)
+    model.compile('rmsprop',
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    self.assertEqual(len(model.outputs), 0)
+    model.train_on_batch(np.zeros((1, 2)), np.zeros((1, 5)))
+    self.assertEqual(len(model.outputs), 1)
+
   @keras_parameterized.run_all_keras_modes
   def test_sequential_nesting(self):
     model = testing_utils.get_small_sequential_mlp(4, 3)
     inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
@@ -352,8 +371,10 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase):
     model.add(keras.layers.Dense(4, activation='relu'))
     model.add(keras.layers.Dense(5, activation='softmax'))
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
@@ -363,8 +384,10 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase):
   def test_build_before_fit(self):
     # Fix for b/112433577
     model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model.build((None, 6))
 
@@ -378,7 +401,7 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(
         num_hidden=10, num_classes=num_classes)
     model.compile(
-        rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        'rmsprop',
         metrics=['acc'],
         weighted_metrics=['mae'],
         loss='categorical_crossentropy',
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index cd1f4d16971a52d595ff4967a999ab75b04bcebe..951988d852fe361a6b50b558b64169150bba6f53 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -30,12 +30,10 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer as input_layer_lib
 from tensorflow.python.keras.engine import network as network_lib
-from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 try:
   import yaml  # pylint:disable=g-import-not-at-top
@@ -310,7 +308,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     self.assertEqual(network.non_trainable_weights,
                      dense.trainable_weights + dense.non_trainable_weights)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes
   def test_trainable_weights(self):
     a = keras.layers.Input(shape=(2,))
     b = keras.layers.Dense(1)(a)
@@ -360,17 +358,17 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     x = keras.layers.Dropout(0.5)(x, training=True)
     model = keras.models.Model(inp, x)
     # Would be `dropout/cond/Merge` by default
-    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
+    self.assertTrue(model.output.op.name.endswith('dropout/mul_1'))
 
     # Test that argument is kept when applying the model
     inp2 = keras.layers.Input(shape=(2,))
     out2 = model(inp2)
-    self.assertTrue(out2.op.name.endswith('dropout/mul'))
+    self.assertTrue(out2.op.name.endswith('dropout/mul_1'))
 
     # Test that argument is kept after loading a model
     config = model.get_config()
     model = keras.models.Model.from_config(config)
-    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
+    self.assertTrue(model.output.op.name.endswith('dropout/mul_1'))
 
   def test_node_construction(self):
     # test basics
@@ -402,12 +400,12 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
 
     self.assertEqual(len(dense._inbound_nodes), 2)
     self.assertEqual(len(dense._outbound_nodes), 0)
-    self.assertListEqual(dense._inbound_nodes[0].inbound_layers, [a_layer])
+    self.assertEqual(dense._inbound_nodes[0].inbound_layers, a_layer)
     self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense)
-    self.assertListEqual(dense._inbound_nodes[1].inbound_layers, [b_layer])
+    self.assertEqual(dense._inbound_nodes[1].inbound_layers, b_layer)
     self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense)
-    self.assertListEqual(dense._inbound_nodes[0].input_tensors, [a])
-    self.assertListEqual(dense._inbound_nodes[1].input_tensors, [b])
+    self.assertEqual(dense._inbound_nodes[0].input_tensors, a)
+    self.assertEqual(dense._inbound_nodes[1].input_tensors, b)
 
     # test layer properties
     test_layer = keras.layers.Dense(16, name='test_layer')
@@ -860,7 +858,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     x = np.ones((100, 2))
     y = np.ones((100, 2))
     model.compile(
-        optimizer=gradient_descent.SGD(),
+        optimizer='sgd',
         loss='mse',
         run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
@@ -909,7 +907,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     model.add(keras.layers.Dense(3))
     model.compile(
         loss='mse',
-        optimizer=gradient_descent.SGD(),
+        optimizer='sgd',
         metrics=['acc'],
         run_eagerly=testing_utils.should_run_eagerly())
 
@@ -920,6 +918,16 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
       yaml_str = model.to_yaml()
       keras.models.model_from_yaml(yaml_str)
 
+  def test_subclassed_error_if_init_not_called(self):
+
+    class MyNetwork(network_lib.Network):
+
+      def __init__(self):
+        self._foo = [keras.layers.Dense(10), keras.layers.Dense(10)]
+
+    with self.assertRaisesRegexp(RuntimeError, 'forgot to call'):
+      MyNetwork()
+
 
 class DeferredModeTest(test.TestCase):
 
@@ -1151,7 +1159,7 @@ class DefaultShapeInferenceBehaviorTest(keras_parameterized.TestCase):
     x = keras.layers.wrappers.TimeDistributed(s)(x)
     model = keras.Model(inputs=inputs, outputs=x)
     model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         loss='mse',
         run_eagerly=testing_utils.should_run_eagerly())
 
@@ -1203,5 +1211,80 @@ class GraphUtilsTest(test.TestCase):
           {x_3, x_5, x_5.op})
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class NestedNetworkTest(test.TestCase):
+
+  def test_nested_inputs_network(self):
+    inputs = {'x1': keras.Input(shape=(1,)), 'x2': keras.Input(shape=(1,))}
+    outputs = keras.layers.Add()([inputs['x1'], inputs['x2']])
+    network = keras.engine.network.Network(inputs, outputs)
+
+    network = keras.engine.network.Network.from_config(network.get_config())
+
+    result_tensor = network({
+        'x': array_ops.ones((1, 1), 'float32'),
+        'y': array_ops.ones((1, 1), 'float32')
+    })
+    result = self.evaluate(result_tensor)
+    self.assertAllEqual(result, [[2.]])
+
+    # TODO(b/122726584): Investigate why concrete batch is flaky in some builds.
+    output_shape = network.compute_output_shape({
+        'x1': (None, 1),
+        'x2': (None, 1)
+    })
+    self.assertListEqual(output_shape.as_list(), [None, 1])
+
+  def test_nested_outputs_network(self):
+    inputs = keras.Input(shape=(1,))
+    outputs = {
+        'x+x': keras.layers.Add()([inputs, inputs]),
+        'x*x': keras.layers.Multiply()([inputs, inputs])
+    }
+
+    network = keras.engine.network.Network(inputs, outputs)
+
+    network = keras.engine.network.Network.from_config(network.get_config())
+
+    result_tensor = network(array_ops.ones((1, 1), 'float32'))
+    result = self.evaluate(result_tensor)
+    self.assertAllEqual(result['x+x'], [[2.]])
+    self.assertAllEqual(result['x*x'], [[1.]])
+
+    output_shape = network.compute_output_shape((None, 1))
+    self.assertListEqual(output_shape['x+x'].as_list(), [None, 1])
+    self.assertListEqual(output_shape['x*x'].as_list(), [None, 1])
+
+  def test_nested_network_inside_network(self):
+    inner_inputs = {
+        'x1': keras.Input(shape=(1,)),
+        'x2': keras.Input(shape=(1,))
+    }
+    inner_outputs = {
+        'x1+x2':
+            keras.layers.Add()([inner_inputs['x1'], inner_inputs['x2']]),
+        'x1*x2':
+            keras.layers.Multiply()([inner_inputs['x1'], inner_inputs['x2']])
+    }
+    inner_network = keras.engine.network.Network(inner_inputs, inner_outputs)
+
+    inputs = [keras.Input(shape=(1,)), keras.Input(shape=(1,))]
+    middle = inner_network({'x1': inputs[0], 'x2': inputs[1]})
+    outputs = keras.layers.Add()([middle['x1+x2'], middle['x1*x2']])
+    network = keras.engine.network.Network(inputs, outputs)
+
+    network = keras.engine.network.Network.from_config(network.get_config())
+
+    # Computes: `(x1+x2) + (x1*x2)`
+    result_tensor = network(
+        [array_ops.ones((1, 1), 'float32'),
+         array_ops.ones((1, 1), 'float32')])
+    result = self.evaluate(result_tensor)
+    self.assertAllEqual(result, [[3.]])
+
+    output_shape = network.compute_output_shape([(None, 1), (None, 1)])
+    self.assertListEqual(output_shape.as_list(), [None, 1])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 4a398cdb16d0144c1d6af2f1693440b36bcd5e1c..12a1008153e7a33fe9a70a1e5974cc2d76c4af8d 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -19,14 +19,15 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import weakref
 import numpy as np
 
 from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -42,12 +43,12 @@ from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.saving import saving_utils
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.mode_keys import ModeKeys
@@ -122,10 +123,6 @@ class Model(Network):
 
   def __init__(self, *args, **kwargs):
     super(Model, self).__init__(*args, **kwargs)
-    # Create a cache for iterator get_next op.
-    self._iterator_get_next = weakref.WeakKeyDictionary()
-    # Create a cache for dataset - uninitialized iterators
-    self._dataset_iterator_cache = weakref.WeakKeyDictionary()
     # initializing _distribution_strategy here since it is possible to call
     # predict on a model without compiling it.
     self._distribution_strategy = None
@@ -133,6 +130,7 @@ class Model(Network):
     # passing distribution strategy to compile rather than creating the model
     # under distribution strategy scope.
     self._compile_distribution = False
+    self._distributed_session_is_configured = False
 
     self.run_eagerly = None
 
@@ -221,15 +219,16 @@ class Model(Network):
                       'create the model under the distribution strategy scope.')
       self._distribution_strategy = distribute
       self._compile_distribution = True
+      self._distributed_session_is_configured = False
     else:
-      if distribution_strategy_context.has_distribution_strategy():
+      if distribution_strategy_context.has_strategy():
         # When the user builds the model in the DS scope and cross replica
         # context we want distribution strategy to be set but when building the
         # replica copies of the models internally we should not be compiling
         # with distribution strategy and use the default compilation path.
         if distribution_strategy_context.in_cross_replica_context():
           self._distribution_strategy = (
-              distribution_strategy_context.get_distribution_strategy())
+              distribution_strategy_context.get_strategy())
 
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
@@ -262,7 +261,7 @@ class Model(Network):
     self.optimizer = optimizer
     # We've disabled automatic dependency tracking for this method, but do want
     # to add a checkpoint dependency on the optimizer if it's checkpointable.
-    if isinstance(self.optimizer, checkpointable.CheckpointableBase):
+    if isinstance(self.optimizer, checkpointable.Checkpointable):
       self._track_checkpointable(
           self.optimizer, name='optimizer', overwrite=True)
     self.loss = loss
@@ -277,13 +276,12 @@ class Model(Network):
     self.target_tensors = target_tensors
 
     # Set DistributionStrategy specific parameters.
-    self._distributed_model = None
-    if self._distribution_strategy is not None:
-      distributed_training_utils.configure_and_create_session(
-          self._distribution_strategy)
+    for mode in [ModeKeys.TRAIN, ModeKeys.TEST, ModeKeys.PREDICT]:
+      distributed_training_utils.set_distributed_model(self, mode, None)
+
     # Initialize model metric attributes.
     self._init_metric_attributes()
-    if not self.built:
+    if not self.built or not self.inputs or not self.outputs:
       # Model is not compilable because it does not know its number of inputs
       # and outputs, nor their shapes and names. We will compile after the first
       # time the model gets called on training data.
@@ -468,21 +466,18 @@ class Model(Network):
           mask = masks[i]
           loss_weight = loss_weights_list[i]
           with K.name_scope(self.output_names[i] + '_loss'):
-            if isinstance(loss_fn, losses.Loss):
-              if mask is not None:
-                mask = math_ops.cast(mask, y_pred.dtype)
-                # Update weights with mask.
-                if sample_weight is None:
-                  sample_weight = mask
-                else:
-                  # Update dimensions of weights to match with mask if possible.
-                  mask, _, sample_weight = squeeze_or_expand_dimensions(
-                      mask, None, sample_weight)
-                  sample_weight *= mask
-              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
-            else:
-              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
-              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+            if mask is not None:
+              mask = math_ops.cast(mask, y_pred.dtype)
+              # Update weights with mask.
+              if sample_weight is None:
+                sample_weight = mask
+              else:
+                # Update dimensions of weights to match with mask if possible.
+                mask, _, sample_weight = squeeze_or_expand_dimensions(
+                    mask, None, sample_weight)
+                sample_weight *= mask
+
+            output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
 
           if len(self.outputs) > 1:
             # Keep track of the un-aggregated loss result tensor.
@@ -490,10 +485,8 @@ class Model(Network):
                                           '_loss'] = output_loss
 
             # Keep track of stateful result tensor and function for the loss.
-            loss_name = loss_fn.name if isinstance(
-                loss_fn, losses.Loss) else loss_fn.__name__
             mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-                loss_fn, name=loss_name)
+                loss_fn, name=loss_fn.name)
             result_tensor = self._call_metric_fn(mean_wrapped_loss, y_true,
                                                  y_pred, sample_weight, mask)
             self._compile_stateful_metrics_tensors[self.output_names[i] +
@@ -548,6 +541,21 @@ class Model(Network):
       trainable_weights = self.trainable_weights
       self._collected_trainable_weights = trainable_weights
 
+      # Validate all variables were correctly created in distribution scope.
+      if self._distribution_strategy and not self._compile_distribution:
+        for v in self.variables:
+          strategy = self._distribution_strategy
+          if not strategy.extended.variable_created_in_scope(v):
+            raise ValueError(
+                'Variable (%s) was not created in the distribution strategy '
+                'scope of (%s). It is most likely due to not all layers or '
+                'the model or optimizer being created outside the distribution '
+                'strategy scope. Try to make sure your code looks similar '
+                'to the following.\n'
+                'with strategy.scope():\n'
+                '  model=_create_model()\n'
+                '  model.compile(...)'% (v, strategy))
+
   @property
   def metrics(self):
     """Returns the model's metrics added using `compile`, `add_metric` APIs."""
@@ -625,6 +633,7 @@ class Model(Network):
           initial_epoch=0,
           steps_per_epoch=None,
           validation_steps=None,
+          validation_freq=1,
           max_queue_size=10,
           workers=1,
           use_multiprocessing=False,
@@ -729,6 +738,13 @@ class Model(Network):
             is a dataset or dataset iterator. Total number of steps (batches of
             samples) to draw before stopping when performing validation
             at the end of every epoch.
+        validation_freq: Only relevant if validation data is provided. Integer
+            or `collections.Container` instance (e.g. list, tuple, etc.). If an
+            integer, specifies how many training epochs to run before a new
+            validation run is performed, e.g. `validation_freq=2` runs
+            validation every 2 epochs. If a Container, specifies the epochs on
+            which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+            validation at the end of the 1st, 2nd, and 10th epochs.
         max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
             input only. Maximum size for the generator queue.
             If unspecified, `max_queue_size` will default to 10.
@@ -756,10 +772,79 @@ class Model(Network):
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
-    # TODO(fchollet): this method may be creating reference cycles, which would
-    # lead to accumulating garbage in memory when called in a loop. Investigate.
+    # Legacy support
+    if 'nb_epoch' in kwargs:
+      logging.warning(
+          'The `nb_epoch` argument in `fit` '
+          'has been renamed `epochs`.')
+      epochs = kwargs.pop('nb_epoch')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
+    # When the model expects dictionary inputs (i.e. FeatureColumn-based
+    # models), set run_eagerly to True as there's no support for graph
+    # functions.
+    training_utils.set_run_eagerly_for_dict_structure(self, x)
+
+    # Case 1: distribution strategy.
+    if self._distribution_strategy:
+      if training_utils.should_run_multi_worker():
+        # Multi-Worker mode runs the Keras training loop on multiple
+        # servers via the Distribute Coordinator.
+        def _worker_fn(_):
+          """Run training inside the distributed coordinator."""
+          self._configure_distributed_session()
+          return training_distributed.fit_distributed(
+              self,
+              x=x,
+              y=y,
+              batch_size=batch_size,
+              epochs=epochs,
+              verbose=verbose,
+              callbacks=callbacks,
+              validation_split=validation_split,
+              validation_data=validation_data,
+              shuffle=shuffle,
+              class_weight=class_weight,
+              sample_weight=sample_weight,
+              initial_epoch=initial_epoch,
+              steps_per_epoch=steps_per_epoch,
+              validation_steps=validation_steps,
+              validation_freq=validation_freq)
+
+        # Independent worker only for now.
+        return dc.run_distribute_coordinator(
+            _worker_fn,
+            self._distribution_strategy,
+            mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
+      else:
+        self._configure_distributed_session()
+        return training_distributed.fit_distributed(
+            self,
+            x=x,
+            y=y,
+            batch_size=batch_size,
+            epochs=epochs,
+            verbose=verbose,
+            callbacks=callbacks,
+            validation_split=validation_split,
+            validation_data=validation_data,
+            shuffle=shuffle,
+            class_weight=class_weight,
+            sample_weight=sample_weight,
+            initial_epoch=initial_epoch,
+            steps_per_epoch=steps_per_epoch,
+            validation_steps=validation_steps,
+            validation_freq=validation_freq)
+
+    batch_size = self._validate_or_infer_batch_size(
+        batch_size, steps_per_epoch, x)
+
+    # Case 2: generator-like. Input is Python generator, or Sequence object,
+    # or a non-distributed Dataset or iterator in eager execution.
     if data_utils.is_generator_or_sequence(x):
-      training_utils.check_generator_arguments(y, sample_weight)
+      training_utils.check_generator_arguments(
+          y, sample_weight, validation_split=validation_split)
       return self.fit_generator(
           x,
           steps_per_epoch=steps_per_epoch,
@@ -768,39 +853,38 @@ class Model(Network):
           callbacks=callbacks,
           validation_data=validation_data,
           validation_steps=validation_steps,
+          validation_freq=validation_freq,
           class_weight=class_weight,
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing,
           shuffle=shuffle,
           initial_epoch=initial_epoch)
+    if training_utils.is_eager_dataset_or_iterator(x):
+      # Make sure that y, sample_weights, validation_split are not passed.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      if (isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
+          and shuffle):
+        training_utils.verify_dataset_shuffled(x)
 
-    # Legacy support
-    if 'nb_epoch' in kwargs:
-      logging.warning(
-          'The `nb_epoch` argument in `fit` '
-          'has been renamed `epochs`.')
-      epochs = kwargs.pop('nb_epoch')
-    if kwargs:
-      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      distributed_training_utils.validate_callbacks(callbacks, self.optimizer)
-
-      distributed_training_utils.validate_inputs(
-          x, y, self._distribution_strategy)
-
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps_per_epoch, batch_size = (
-            distributed_training_utils.get_input_params(
-                self._distribution_strategy, first_x_value, steps_per_epoch,
-                batch_size, is_training=True))
-
-    batch_size = self._validate_or_infer_batch_size(batch_size, steps_per_epoch,
-                                                    x)
+      return self.fit_generator(
+          x,
+          steps_per_epoch=steps_per_epoch,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          validation_freq=validation_freq,
+          class_weight=class_weight,
+          workers=0,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch)
 
+    # Case 3: Symbolic tensors or Numpy array-like.
+    # This includes Datasets and iterators in graph mode (since they
+    # generate symbolic tensors).
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
@@ -815,43 +899,15 @@ class Model(Network):
 
     # Prepare validation data.
     if validation_data:
-      if (isinstance(validation_data, iterator_ops.Iterator) or
-          isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.DatasetV2)):
-        val_x = validation_data
-        val_y = None
-        val_sample_weight = None
-      elif len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError(
-            'When passing a `validation_data` argument, '
-            'it must contain either 2 items (x_val, y_val), '
-            'or 3 items (x_val, y_val, val_sample_weights), '
-            'or alternatively it could be a dataset or a '
-            'dataset or a dataset iterator. '
-            'However we received `validation_data=%s`' % validation_data)
-
-      # Validate and standardize validation data.
-      if self._distribution_strategy:
-        distributed_training_utils.validate_inputs(
-            val_x, val_y, self._distribution_strategy)
-        first_valx_value = nest.flatten(val_x)[0]
-        if isinstance(first_valx_value, np.ndarray):
-          validation_steps, _ = distributed_training_utils.get_input_params(
-              self._distribution_strategy, first_valx_value, validation_steps,
-              batch_size)
-
+      val_x, val_y, val_sample_weights = self._unpack_validation_data(
+          validation_data)
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
           val_y,
-          sample_weight=val_sample_weight,
+          sample_weight=val_sample_weights,
           batch_size=batch_size,
-          steps=validation_steps)
-
+          steps=validation_steps,
+          steps_name='validation_steps')
     elif validation_split and 0. < validation_split < 1.:
       if training_utils.has_symbolic_tensors(x):
         raise ValueError('If your data is in the form of symbolic tensors, '
@@ -873,32 +929,21 @@ class Model(Network):
       val_y = None
       val_sample_weights = None
 
-    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
-                             not self._distribution_strategy)):
+    if self.run_eagerly:
       return training_generator.fit_generator(
           self, (x, y, sample_weights),
           steps_per_epoch=steps_per_epoch,
           batch_size=batch_size,
           epochs=epochs,
-          shuffle=shuffle,
           verbose=verbose,
           callbacks=callbacks,
           validation_data=validation_data,
           validation_steps=validation_steps,
+          validation_freq=validation_freq,
           workers=0,
-          initial_epoch=initial_epoch)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_fit_loop(
-          self,
-          x,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          val_iterator=val_x,
+          shuffle=shuffle,
           initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
+          steps_name='steps_per_epoch')
     else:
       return training_arrays.fit_loop(
           self,
@@ -915,7 +960,9 @@ class Model(Network):
           shuffle=shuffle,
           initial_epoch=initial_epoch,
           steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
+          validation_steps=validation_steps,
+          validation_freq=validation_freq,
+          steps_name='steps_per_epoch')
 
   def evaluate(self,
                x=None,
@@ -1001,26 +1048,47 @@ class Model(Network):
     Raises:
         ValueError: in case of invalid arguments.
     """
+    # Case 1: distribution strategy.
+    if self._distribution_strategy:
+      self._configure_distributed_session()
+      return training_distributed.evaluate_distributed(
+          self,
+          x=x,
+          y=y,
+          batch_size=batch_size,
+          verbose=verbose,
+          sample_weight=sample_weight,
+          steps=steps,
+          callbacks=callbacks)
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+
+    # Case 2: generator-like. Input is Python generator, or Sequence object,
+    # or a non-distributed Dataset or iterator in eager execution.
     if data_utils.is_generator_or_sequence(x):
       training_utils.check_generator_arguments(y, sample_weight)
       return self.evaluate_generator(
           x,
           steps=steps,
           verbose=verbose,
+          callbacks=callbacks,
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      distributed_training_utils.validate_inputs(
-          x, y, self._distribution_strategy)
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps, batch_size = distributed_training_utils.get_input_params(
-            self._distribution_strategy, first_x_value, steps, batch_size)
-
-    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+    if training_utils.is_eager_dataset_or_iterator(x):
+      # Make sure that y, sample_weights are not passed.
+      training_utils.validate_dataset_input(x, y, sample_weight)
+      return training_generator.evaluate_generator(
+          self, x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
 
+    # Case 3: Symbolic tensors or Numpy array-like.
+    # This includes Datasets and iterators in graph mode (since they
+    # generate symbolic tensors).
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
@@ -1030,8 +1098,7 @@ class Model(Network):
         steps_name='steps',
         steps=steps)
 
-    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
-                             not self._distribution_strategy)):
+    if self.run_eagerly:
       return training_generator.evaluate_generator(
           self, (x, y, sample_weights),
           steps=steps,
@@ -1039,10 +1106,6 @@ class Model(Network):
           verbose=verbose,
           workers=0,
           callbacks=callbacks)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_test_loop(
-          self, iterator=x, verbose=verbose, steps=steps)
     else:
       return training_arrays.test_loop(
           self,
@@ -1113,38 +1176,46 @@ class Model(Network):
             or in case a stateful model receives a number of samples
             that is not a multiple of the batch size.
     """
+    # Case 1: distribution strategy.
+    if self._distribution_strategy:
+      self._configure_distributed_session()
+      return training_distributed.predict_distributed(self,
+                                                      x=x,
+                                                      batch_size=batch_size,
+                                                      verbose=verbose,
+                                                      steps=steps,
+                                                      callbacks=callbacks)
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+
+    # Case 2: generator-like. Input is Python generator, or Sequence object,
+    # or a non-distributed Dataset or iterator in eager execution.
     if data_utils.is_generator_or_sequence(x):
       return self.predict_generator(
           x,
           steps=steps,
           verbose=verbose,
+          callbacks=callbacks,
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-    if self._distribution_strategy:
-      distributed_training_utils.validate_inputs(
-          x, None, self._distribution_strategy)
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps, batch_size = distributed_training_utils.get_input_params(
-            self._distribution_strategy, first_x_value, steps, batch_size)
+    if training_utils.is_eager_dataset_or_iterator(x):
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
 
-    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+    # Case 3: Symbolic tensors or Numpy array-like.
+    # This includes Datasets and iterators in graph mode (since they
+    # generate symbolic tensors).
+    x, _, _ = self._standardize_user_data(
+        x, check_steps=True, steps_name='steps', steps=steps)
 
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      x, _, _ = self._standardize_user_data(
-          x, check_steps=True, steps_name='steps', steps=steps,
-          batch_size=batch_size)
-    else:
-      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
-      # means we need to special case distribution strategy which needs the
-      # batch size.
-      x, _, _ = self._standardize_user_data(
-          x, check_steps=True, steps_name='steps', steps=steps)
-
-    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
-                             not self._distribution_strategy)):
+    if self.run_eagerly:
       return training_generator.predict_generator(
           self,
           x,
@@ -1153,10 +1224,6 @@ class Model(Network):
           verbose=verbose,
           workers=0,
           callbacks=callbacks)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_predict_loop(
-          self, x, verbose=verbose, steps=steps)
     else:
       return training_arrays.predict_loop(
           self,
@@ -1172,7 +1239,7 @@ class Model(Network):
       for m in self.metrics:
         m.reset_states()
       if self._distribution_strategy:
-        training_distributed._reset_metrics(self)  # pylint: disable=protected-access
+        distributed_training_utils._reset_metrics(self)  # pylint: disable=protected-access
 
   def train_on_batch(self,
                      x,
@@ -1226,7 +1293,8 @@ class Model(Network):
                                 'compiled with DistributionStrategy.')
     # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight, class_weight=class_weight)
+        x, y, sample_weight=sample_weight, class_weight=class_weight,
+        extract_tensors_from_dataset=True)
 
     if self.run_eagerly:
       outputs = training_eager.train_on_batch(
@@ -1295,7 +1363,7 @@ class Model(Network):
                                 'compiled with DistributionStrategy.')
     # Validate and standardize user data.
     x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight)
+        x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True)
 
     if self.run_eagerly:
       outputs = training_eager.test_on_batch(
@@ -1338,7 +1406,8 @@ class Model(Network):
       raise NotImplementedError('`predict_on_batch` is not supported for '
                                 'models compiled with DistributionStrategy.')
     # Validate and standardize user data.
-    inputs, _, _ = self._standardize_user_data(x)
+    inputs, _, _ = self._standardize_user_data(
+        x, extract_tensors_from_dataset=True)
     if self.run_eagerly:
       if (isinstance(inputs, iterator_ops.EagerIterator) or
           (isinstance(inputs, dataset_ops.DatasetV2))):
@@ -1368,6 +1437,7 @@ class Model(Network):
                     callbacks=None,
                     validation_data=None,
                     validation_steps=None,
+                    validation_freq=1,
                     class_weight=None,
                     max_queue_size=10,
                     workers=1,
@@ -1422,6 +1492,13 @@ class Model(Network):
             to yield from `generator` before stopping.
             Optional for `Sequence`: if unspecified, will use
             the `len(validation_data)` as a number of steps.
+        validation_freq: Only relevant if validation data is provided. Integer
+            or `collections.Container` instance (e.g. list, tuple, etc.). If an
+            integer, specifies how many training epochs to run before a new
+            validation run is performed, e.g. `validation_freq=2` runs
+            validation every 2 epochs. If a Container, specifies the epochs on
+            which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+            validation at the end of the 1st, 2nd, and 10th epochs.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
         max_queue_size: Integer. Maximum size for the generator queue.
@@ -1477,12 +1554,14 @@ class Model(Network):
         callbacks=callbacks,
         validation_data=validation_data,
         validation_steps=validation_steps,
+        validation_freq=validation_freq,
         class_weight=class_weight,
         max_queue_size=max_queue_size,
         workers=workers,
         use_multiprocessing=use_multiprocessing,
         shuffle=shuffle,
-        initial_epoch=initial_epoch)
+        initial_epoch=initial_epoch,
+        steps_name='steps_per_epoch')
 
   def evaluate_generator(self,
                          generator,
@@ -1633,7 +1712,9 @@ class Model(Network):
     InputLayer. If so, this method checks the provided `batch_size` and `x`
     arguments are consistent with this static batch size. Also, if
     `batch_size` is `None`, this method will attempt to infer the batch size
-    from the static batch size of the InputLayer.
+    from the static batch size of the InputLayer. Lastly, ValueError will be
+    raised if `x` is a tf.data.Dataset and `batch_size` is specified as we
+    expect users to provide batched datasets.
 
     Arguments:
       batch_size: The batch_size provided as an argument to
@@ -1645,6 +1726,10 @@ class Model(Network):
       The validated batch_size, auto-inferred from the first layer if not
       provided.
     """
+    if batch_size is not None and isinstance(x, dataset_ops.DatasetV2):
+      raise ValueError('The `batch_size` argument must not be specified when'
+                       ' using dataset as an input.')
+
     layers = super(Model, self).layers  # Avoids the override in Sequential.
     if layers:
       first_layer = layers[0]
@@ -1679,7 +1764,7 @@ class Model(Network):
 
   @property
   def _default_save_signature(self):
-    return training_utils.trace_model_call(self)
+    return saving_utils.trace_model_call(self)
 
   def _set_sample_weight_attributes(self, sample_weight_mode,
                                     skip_target_weighing_indices):
@@ -1964,7 +2049,7 @@ class Model(Network):
           ' without calling `model.compile` after ?', 1)
 
   def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
+    if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
     if getattr(self, fn_name) is None:
@@ -2013,7 +2098,7 @@ class Model(Network):
         '_fit_function', [self.total_loss] + metrics_tensors)
 
   def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
+    if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     if getattr(self, fn_name) is None:
       inputs = (self._feed_inputs +
@@ -2057,7 +2142,7 @@ class Model(Network):
       # Gets network outputs. Does not update weights.
       # Does update the network states.
       kwargs = getattr(self, '_function_kwargs', {})
-      with K.name_scope('predict'):
+      with K.name_scope(ModeKeys.PREDICT):
         self.predict_function = K.function(
             inputs,
             self.outputs,
@@ -2076,13 +2161,6 @@ class Model(Network):
       self._make_predict_function()
       return self.predict_function
 
-  def _get_iterator_get_next_tensors(self, iterator):
-    get_next_op = self._iterator_get_next.get(iterator, None)
-    if get_next_op is None:
-      get_next_op = iterator.get_next()
-      self._iterator_get_next[iterator] = get_next_op
-    return get_next_op
-
   def _distribution_standardize_user_data(self,
                                           x,
                                           y=None,
@@ -2119,7 +2197,7 @@ class Model(Network):
       shuffle: Boolean whether to shuffle the training data before each epoch.
 
     Returns:
-      Iterator for reading the dataset `x`.
+      Dataset instance.
 
     Raises:
       ValueError: In case of invalid user-provided data.
@@ -2135,65 +2213,77 @@ class Model(Network):
       raise NotImplementedError('`sample_weight` is currently not supported '
                                 'when using TPUStrategy.')
 
-    # Validates `steps` argument right at the beginning since we use it to
-    # construct the dataset object.
+    if (self.stateful and distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy) and self._distribution_strategy.
+        num_replicas_in_sync != 1):
+      raise ValueError('Single core must be used for computation on '
+                       'stateful models. Consider adding `device_assignment` '
+                       'parameter to TPUStrategy using\n'
+                       'topology = tf.contrib.distribute.'
+                       'initialize_tpu_system()\n'
+                       'device_assignment = tf.contrib.tpu.DeviceAssignment('
+                       'topology, core_assignment=tf.contrib.tpu.'
+                       'SINGLE_CORE_ASSIGNMENT)\n'
+                       'tpu_strategy = tf.contrib.distribute.TPUStrategy('
+                       'device_assignment=device_assignment)')
+
+    # Validates `steps` and `shuffle` arguments right at the beginning
+    # since we use it to construct the dataset object.
     # TODO(anjalisridhar): Remove this check once we refactor the
     # _standardize_user_data code path. This check is already present elsewhere
     # in the codebase.
-    if check_steps and isinstance(x, dataset_ops.DatasetV2) and steps is None:
-      raise ValueError('When using Datasets as input, '
-                       'you should specify the `{steps_name}` argument.'
-                       .format(steps_name=steps_name))
-
-    first_x_value = nest.flatten(x)[0]
-    if isinstance(first_x_value, np.ndarray):
-      # We need to use the drop_remainder argument to allow for a static
-      # input shape which is required for TPUs.
-      drop_remainder = self._distribution_strategy.require_static_shapes
-      if y is not None:
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        var_y = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, y)
-        if sample_weight is not None:
-          var_sample_weights = distributed_training_utils.get_var_for_numpy(
-              self._distribution_strategy, sample_weight)
-
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y,
-                                                      var_sample_weights))
+    if isinstance(x, dataset_ops.DatasetV2):
+      if shuffle:
+        training_utils.verify_dataset_shuffled(x)
+
+      if check_steps and steps is None:
+        raise ValueError('When using Datasets as input, '
+                         'you should specify the `{steps_name}` argument.'
+                         .format(steps_name=steps_name))
+
+    if ops.executing_eagerly_outside_functions():
+      session = None
+    else:
+      session = K.get_session()
+
+    strategy = self._distribution_strategy
+    with strategy.scope():
+      first_x_value = nest.flatten(x)[0]
+      if isinstance(first_x_value, np.ndarray):
+        x = distributed_training_utils.list_to_tuple(x)
+        if y is not None:
+          y = distributed_training_utils.list_to_tuple(y)
+          if sample_weight is not None:
+            sample_weight = distributed_training_utils.list_to_tuple(
+                sample_weight)
+            in_tuple = (x, y, sample_weight)
+          else:
+            in_tuple = (x, y)
         else:
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
+          in_tuple = x
 
-        x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
         if shuffle:
           # 1024 is a good buffer size since it is much larger than the average
           # batch size provided by the user and provides sufficient randomness.
           # One thing to keep in mind is the memory usage based on the size of
           # each sample.
-          x = x.shuffle(1024)
-        x = x.repeat()
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
-        y = None
-        sample_weight = None
+          shuffle_buffer = 1024
+        else:
+          shuffle_buffer = None
+        ds = strategy.extended.experimental_make_numpy_dataset(in_tuple,
+                                                               session=session)
+        if shuffle_buffer:
+          ds = ds.shuffle(shuffle_buffer)
+        ds = ds.repeat()
+        # We need to use the drop_remainder argument to get a known static
+        # input shape which is required for TPUs.
+        drop_remainder = strategy.extended.experimental_require_static_shapes
+        x = ds.batch(batch_size, drop_remainder=drop_remainder)
       else:
-        # This case is for the predict call where the dataset only contains
-        # inputs and no targets, i.e. it does not return a tuple
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
-
-    assert isinstance(x, dataset_ops.DatasetV2)
-
-    with self._distribution_strategy.scope():
-      iterator = self._distribution_strategy.make_dataset_iterator(x)
-      init_op = iterator.initialize()
-      if not context.executing_eagerly():
-        K.get_session().run(init_op)
-
-    training_utils.validate_iterator_input(x, y, sample_weight,
-                                           validation_split)
-    return iterator
+        assert isinstance(x, dataset_ops.DatasetV2)
+        training_utils.validate_dataset_input(x, y, sample_weight,
+                                              validation_split)
+    return x
 
   def _standardize_user_data(self,
                              x,
@@ -2205,7 +2295,8 @@ class Model(Network):
                              steps_name='steps',
                              steps=None,
                              validation_split=0,
-                             shuffle=False):
+                             shuffle=False,
+                             extract_tensors_from_dataset=False):
     """Runs validation checks on input and target data passed by the user.
 
     Also standardizes the data to lists of arrays, in order.
@@ -2234,7 +2325,8 @@ class Model(Network):
         weight the importance of each sample in `x`.
       class_weight: An optional class-weight array by the user to
         weight the importance of samples in `x` based on the class they belong
-        to, as conveyed by `y`.
+        to, as conveyed by `y`. If both `sample_weight` and `class_weight` are
+        provided, the weights are multiplied.
       batch_size: Integer batch size. If provided, it is used to run additional
         validation checks on stateful models.
       check_steps: boolean, True if we want to check for validity of `steps` and
@@ -2248,6 +2340,10 @@ class Model(Network):
       validation_split: Float between 0 and 1.
         Fraction of the training data to be used as validation data.
       shuffle: Boolean whether to shuffle the training data before each epoch.
+      extract_tensors_from_dataset: Boolean. When `x` is a dataset instance,
+        this indicates whether to extract actual tensors from the dataset or
+        instead output the dataset instance itself.
+        Set to True when calling from `train_on_batch`/etc.
 
     Returns:
       A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
@@ -2260,78 +2356,33 @@ class Model(Network):
       ValueError: In case of invalid user-provided data.
       RuntimeError: If the model was never compiled.
     """
-    if self._distribution_strategy:
-      iterator = self._distribution_standardize_user_data(
-          x,
-          y,
-          sample_weight=sample_weight,
-          class_weight=class_weight,
-          batch_size=batch_size,
-          check_steps=check_steps,
-          steps_name=steps_name,
-          steps=steps,
-          validation_split=validation_split,
-          shuffle=shuffle)
-      return iterator, None, None
-
-    if isinstance(x, dataset_ops.DatasetV2):
-      if context.executing_eagerly():
-        x = iter(x)
-      else:
-        if x in self._dataset_iterator_cache:
-          x = self._dataset_iterator_cache[x]
-        else:
-          iterator = dataset_ops.make_initializable_iterator(x)
-          self._dataset_iterator_cache[x] = iterator
-          x = iterator
-        K.get_session().run(x.initializer)
+    if isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+      # Graph mode dataset. We'll pass the dataset as-is (unless
+      # `extract_tensors_from_dataset` is True, in which case we extract
+      # the tensors from the dataset and we output them.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      if shuffle:
+        training_utils.verify_dataset_shuffled(x)
+
+      is_dataset = True
+      if extract_tensors_from_dataset:
+        # We do this for `train_on_batch`/etc.
+        x, y, sample_weight = training_utils.extract_tensors_from_dataset(x)
+    elif isinstance(x, iterator_ops.Iterator):
+      # Graph mode iterator. We extract the symbolic tensors.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      iterator = x
+      x, y, sample_weight = training_utils.unpack_iterator_input(iterator)
+      is_dataset = True
+    else:
+      is_dataset = False
 
     # Validates `steps` argument based on x's type.
     if check_steps:
       training_utils.check_steps_argument(x, steps, steps_name)
 
-    is_x_eager_iterator = isinstance(x, iterator_ops.EagerIterator)
-    is_x_iterator = isinstance(x, iterator_ops.Iterator)
-
-    # Validate user inputs when data is given as a dataset or dataset iterator.
-    if is_x_iterator or is_x_eager_iterator:
-      training_utils.validate_iterator_input(x, y, sample_weight,
-                                             validation_split)
-
-    # For eager iterators, when we have to process multiple batches of samples,
-    # we will standardize the data when we actually loop over iterator and get
-    # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator:
-      return x, y, sample_weight
-
-    # If input data is a dataset iterator in graph mode or if it is an eager
-    # iterator and only one batch of samples is required, we fetch the data
-    # tensors from the iterator and then standardize them.
-    if is_x_iterator:
-      try:
-        next_element = self._get_iterator_get_next_tensors(x)
-      except errors.OutOfRangeError:
-        raise RuntimeError('Your dataset iterator ran out of data; '
-                           'Make sure that your dataset can generate '
-                           'required number of samples.')
-
-      if isinstance(next_element, (list, tuple)):
-        if len(next_element) not in [2, 3]:
-          raise ValueError(
-              'Please provide model inputs as a list or tuple of 2  or 3'
-              'elements: (input, target) or (input, target, sample_weights)'
-              'Received %s' % next_element)
-        if len(next_element) == 2:
-          x, y = next_element
-        else:
-          x, y, sample_weight = next_element
-      else:
-        x = next_element
-
-    if sample_weight is not None and class_weight is not None:
-      logging.warning(
-          'Received both a `sample_weight` and `class_weight` argument. '
-          'The `class_weight` argument will be ignored.')
     # First, we build/compile the model on the fly if necessary.
     all_inputs = []
     is_build_called = False
@@ -2340,44 +2391,51 @@ class Model(Network):
     # rather than list inputs (e.g. FeatureColumn-based models).
     dict_inputs = False
     if not self.inputs:
-      # We need to use `x` to set the model inputs.
-      # We type-check that `x` and `y` are either single arrays
+      # We need to use `x_input` to set the model inputs.
+
+      # If input data is a dataset iterator in graph mode or if it is an eager
+      # iterator and only one batch of samples is required, we fetch the data
+      # tensors from the iterator and then standardize them.
+      if isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+        x_input, y_input, _ = training_utils.extract_tensors_from_dataset(x)
+      else:
+        x_input = x
+        y_input = y
+      # We type-check that `x_input` and `y_input` are either single arrays
       # or lists of arrays.
-      if isinstance(x, (list, tuple)):
+      if isinstance(x_input, (list, tuple)):
         if not all(isinstance(v, np.ndarray) or
-                   tensor_util.is_tensor(v) for v in x):
+                   tensor_util.is_tensor(v) for v in x_input):
           raise ValueError('Please provide as model inputs either a single '
                            'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs += list(x)
-      elif isinstance(x, dict):
+        all_inputs += list(x_input)
+      elif isinstance(x_input, dict):
         dict_inputs = True
-        keys = sorted(x.keys())
-        all_inputs = [x[k] for k in keys]
+        keys = sorted(x_input.keys())
+        all_inputs = [x_input[k] for k in keys]
       else:
-        if not isinstance(x, np.ndarray) and not tensor_util.is_tensor(x):
+        if (not isinstance(x_input, np.ndarray) and
+            not tensor_util.is_tensor(x_input)):
           raise ValueError('Please provide as model inputs either a single '
                            'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs.append(x)
+        all_inputs.append(x_input)
 
       # Build the model using the retrieved inputs (value or symbolic).
       # If values or generated from a dataset, then in symbolic-mode
       # placeholders will be created to match the value shapes.
       is_build_called = True
-      if is_x_iterator:
-        cast_inputs = nest.map_structure(lambda v: v.shape, x)
-      elif training_utils.has_tensors(x):
-        cast_inputs = training_utils.cast_if_floating_dtype(x)
+      if is_dataset:
+        cast_inputs = nest.map_structure(lambda v: v.shape, x_input)
+      elif training_utils.has_tensors(x_input):
+        cast_inputs = training_utils.cast_if_floating_dtype(x_input)
       else:
-        cast_inputs = x
+        cast_inputs = x_input
       self._set_inputs(cast_inputs)
     else:
+      y_input = y
       dict_inputs = isinstance(self.inputs, dict)
-    if dict_inputs and context.executing_eagerly():
-      # No support for graph functions when the model expects dictionary inputs
-      # (i.e. FeatureColumn-based models).
-      self.run_eagerly = True
 
-    if y is not None:
+    if y_input is not None:
       if not self.optimizer:
         raise RuntimeError('You must compile a model before '
                            'training/testing. '
@@ -2385,23 +2443,24 @@ class Model(Network):
       if not self._is_compiled:
         # On-the-fly compilation of the model.
         # We need to use `y` to set the model targets.
-        if training_utils.has_tensors(y):
-          y = training_utils.cast_if_floating_dtype(y)
-        if isinstance(y, (list, tuple)):
+        if training_utils.has_tensors(y_input):
+          y_input = training_utils.cast_if_floating_dtype(y_input)
+        if isinstance(y_input, (list, tuple)):
           if not all(isinstance(v, np.ndarray) or
-                     tensor_util.is_tensor(v) for v in y):
+                     tensor_util.is_tensor(v) for v in y_input):
             raise ValueError('Please provide as model targets either a single '
                              'array or a list of arrays. '
                              'You passed: y=' + str(y))
-          all_inputs += list(y)
-        elif isinstance(y, dict):
-          raise ValueError('Please do not pass a dictionary as model targets.')
+          all_inputs += list(y_input)
+        elif isinstance(y_input, dict):
+          raise ValueError('You cannot pass a dictionary as model targets.')
         else:
-          if not isinstance(y, np.ndarray) and not tensor_util.is_tensor(y):
+          if (not isinstance(y_input, np.ndarray) and
+              not tensor_util.is_tensor(y_input)):
             raise ValueError('Please provide as model targets either a single '
                              'array or a list of arrays. '
                              'You passed: y=' + str(y))
-          all_inputs.append(y)
+          all_inputs.append(y_input)
 
         # Typecheck that all inputs are *either* value *or* symbolic.
         # TODO(fchollet): this check could be removed in Eager mode?
@@ -2411,13 +2470,13 @@ class Model(Network):
                              'TensorFlow tensors. '
                              'You passed: x=' + str(x) + '; y=' + str(y))
 
-        if self.run_eagerly or is_x_iterator:
+        if is_dataset or context.executing_eagerly():
           target_tensors = None
         else:
           # Handle target tensors if any passed.
-          if not isinstance(y, (list, tuple)):
-            y = [y]
-          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
+          if not isinstance(y_input, (list, tuple)):
+            y_input = [y_input]
+          target_tensors = [v for v in y_input if _is_symbolic_tensor(v)]
         is_compile_called = True
         self.compile(
             optimizer=self.optimizer,
@@ -2435,7 +2494,7 @@ class Model(Network):
     # Note: in this case, `any` and `all` are equivalent since we disallow
     # mixed symbolic/value inputs.
     if (not self.run_eagerly and is_build_called and is_compile_called and
-        not is_x_iterator and any(_is_symbolic_tensor(v) for v in all_inputs)):
+        not is_dataset  and any(_is_symbolic_tensor(v) for v in all_inputs)):
       return [], [], []
 
     # What follows is input validation and standardization to list format,
@@ -2457,12 +2516,14 @@ class Model(Network):
       feed_input_shapes = self._feed_input_shapes
 
     # Standardize the inputs.
-    x = training_utils.standardize_input_data(
-        x,
-        feed_input_names,
-        feed_input_shapes,
-        check_batch_axis=False,  # Don't enforce the batch size.
-        exception_prefix='input')
+    if not isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+      # TODO(fchollet): run static checks with dataset output shape(s).
+      x = training_utils.standardize_input_data(
+          x,
+          feed_input_names,
+          feed_input_shapes,
+          check_batch_axis=False,  # Don't enforce the batch size.
+          exception_prefix='input')
 
     if y is not None:
       if not self._is_graph_network:
@@ -2477,18 +2538,21 @@ class Model(Network):
         feed_output_shapes = []
         for output_shape, loss_fn in zip(self._feed_output_shapes,
                                          self._feed_loss_fns):
-          if loss_fn is losses.sparse_categorical_crossentropy:
+          if ((isinstance(loss_fn, losses.LossFunctionWrapper) and
+               loss_fn.fn == losses.sparse_categorical_crossentropy)) or (
+                   isinstance(loss_fn, losses.SparseCategoricalCrossentropy)):
             if K.image_data_format() == 'channels_first':
               feed_output_shapes.append(
                   (output_shape[0], 1) + output_shape[2:])
             else:
               feed_output_shapes.append(output_shape[:-1] + (1,))
-          elif (not hasattr(loss_fn, '__name__') or
-                getattr(losses, loss_fn.__name__, None) is None):
-            # If `loss_fn` is not a function (e.g. callable class)
-            # or if it not in the `losses` module, then
-            # it is a user-defined loss and we make no assumptions
-            # about it.
+          elif (not isinstance(loss_fn, losses.Loss) or
+                (isinstance(loss_fn, losses.LossFunctionWrapper) and
+                 (getattr(losses, loss_fn.fn.__name__, None) is None))):
+            # If the given loss is not an instance of the `Loss` class (custom
+            # class) or if the loss function that is wrapped is not in the
+            # `losses` module, then it is a user-defined loss and we make no
+            # assumptions about it.
             feed_output_shapes.append(None)
           else:
             feed_output_shapes.append(output_shape)
@@ -2536,10 +2600,33 @@ class Model(Network):
                          str(x[0].shape[0]) + ' samples')
 
     # If dictionary inputs were provided, we return a dictionary as well.
-    if dict_inputs:
+    if dict_inputs and not isinstance(x, (dataset_ops.DatasetV1,
+                                          dataset_ops.DatasetV2)):
       x = dict(zip(feed_input_names, x))
     return x, y, sample_weights
 
+  def _unpack_validation_data(self, validation_data):
+    if (isinstance(validation_data, (iterator_ops.Iterator,
+                                     iterator_ops.EagerIterator,
+                                     dataset_ops.DatasetV2))):
+      val_x = validation_data
+      val_y = None
+      val_sample_weight = None
+    elif len(validation_data) == 2:
+      val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+      val_sample_weight = None
+    elif len(validation_data) == 3:
+      val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+    else:
+      raise ValueError(
+          'When passing a `validation_data` argument, '
+          'it must contain either 2 items (x_val, y_val), '
+          'or 3 items (x_val, y_val, val_sample_weights), '
+          'or alternatively it could be a dataset or a '
+          'dataset or a dataset iterator. '
+          'However we received `validation_data=%s`' % validation_data)
+    return val_x, val_y, val_sample_weight
+
   @checkpointable.no_automatic_dependency_tracking
   def _set_inputs(self, inputs, outputs=None, training=None):
     """Set model's input and output specs based on the input data received.
@@ -2628,6 +2715,26 @@ class Model(Network):
     self.output_names = training_utils.generic_output_names(outputs)
     self.built = True
 
+  def _configure_distributed_session(self):
+    """Configure a Session for use with Distribution Strategies.
+
+    Raises:
+      ValueError: If a non-distributed Session has already been created.
+    """
+    if not self._distributed_session_is_configured:
+      if (dc_context.get_current_worker_context() is not None and
+          getattr(K._SESSION, 'session', None) is not None):  # pylint: disable=protected-access
+        raise ValueError('Session was created before `fit`, `evaluate`, '
+                         'or `predict` was called. With Multi-Worker '
+                         'mode, this is not allowed. Please avoid '
+                         'creating a Session outside of these methods. '
+                         'The Session may have been created by a call '
+                         'to `keras.backend.get_session()` or '
+                         'functions that use Sessions, like `load_weights`.')
+      distributed_training_utils.configure_and_create_session(
+          self._distribution_strategy)
+      self._distributed_session_is_configured = True
+
 
 class DistributedCallbackModel(Model):
   """Model that is used for callbacks with DistributionStrategy."""
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index af67444b01527fd9f68e2aa798c81dfecc02ee2c..68b8f615210de6ace43e0ebf1eb9cb66fa1a8056 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -23,15 +23,18 @@ import functools
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras.engine import training_distributed
+from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.mode_keys import ModeKeys
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -39,89 +42,6 @@ except ImportError:
   issparse = None
 
 
-def _get_model_feed(model, mode):
-  if mode == 'predict':
-    feed = model._feed_inputs
-  else:
-    feed = (
-        model._feed_inputs + model._feed_targets + model._feed_sample_weights)
-  return feed
-
-
-def _validate_arguments(steps_per_epoch, validation_steps, kwargs):
-  for k in kwargs:
-    if k != 'steps':
-      raise ValueError('Invalid argument passed: {}'.format(k))
-
-  # Validate inputs when in training mode.
-  if validation_steps and steps_per_epoch is None:
-    raise ValueError('Can only use `validation_steps` '
-                     'when doing step-wise '
-                     'training, i.e. `steps_per_epoch` '
-                     'must be set.')
-
-
-def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
-  if (val_inputs and steps_per_epoch is None and verbose and inputs and
-      hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
-    print('Train on %d samples, validate on %d samples' %
-          (inputs[0].shape[0], val_inputs[0].shape[0]))
-
-
-def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
-  """Returns total number of samples (when training in batch mode) or steps."""
-  if steps_per_epoch:
-    return steps_per_epoch
-  return training_utils.check_num_samples(ins, batch_size, steps_per_epoch,
-                                          'steps_per_epoch')
-
-
-def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
-  """Prepare feed values to the model execution function.
-
-  Arguments:
-    model: Model to prepare feed values for.
-    inputs: List or dict of model inputs.
-    targets: Optional list of model targets.
-    sample_weights: Optional list of sample weight arrays.
-    mode: One of 'train'/'test'/'predict'.
-
-  Returns:
-    Feed values for the model in the given mode.
-  """
-  if model._distribution_strategy:
-    def get_distributed_inputs():
-      return training_distributed._prepare_feed_values(
-          model, inputs, targets, sample_weights, mode)
-
-    # In the eager case, we want to call the input method per step, so return
-    # a lambda from here that can be called. Note that this is applicable only
-    # in Distribution Strategy case as it follows the same code path for both
-    # eager and graph modes.
-    # TODO(priyag,omalleyt): Either we should move the training DS with
-    # EagerIterator to use training_generator code path, or figure out how to
-    # set a symbolic Iterator out of a Dataset when in eager mode.
-    if context.executing_eagerly():
-      return get_distributed_inputs
-    else:
-      return get_distributed_inputs()
-
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  targets = targets or []
-  sample_weights = sample_weights or []
-  ins = inputs + targets + sample_weights
-  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
-    ins += [True]
-  return ins
-
-
-def _make_execution_function(model, mode):
-  """Makes function to run one step of model execution."""
-  if model._distribution_strategy:
-    return training_distributed._make_execution_function(model, mode)
-  return model._make_execution_function(mode)
-
-
 def model_iteration(model,
                     inputs,
                     targets=None,
@@ -137,22 +57,25 @@ def model_iteration(model,
                     initial_epoch=0,
                     steps_per_epoch=None,
                     validation_steps=None,
-                    mode='train',
+                    validation_freq=1,
+                    mode=ModeKeys.TRAIN,
                     validation_in_fit=False,
+                    prepared_feed_values_from_dataset=False,
+                    steps_name='steps',
                     **kwargs):
-  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.
 
   Arguments:
       model: Keras Model instance.
-      inputs: Either a list of arrays or a dictionary.
-      targets: List of target arrays.
+      inputs: Either a list or dictionary of arrays, or a dataset instance.
+      targets: List/dictionary of input arrays.
       sample_weights: Optional list of sample weight arrays.
       batch_size: Integer batch size or None if unknown.
       epochs: Number of times to iterate over the data
       verbose: Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
-      val_inputs: List of input arrays.
-      val_targets: List of target arrays.
+      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
+      val_targets: List/dictionary of target arrays.
       val_sample_weights: Optional list of sample weight arrays.
       shuffle: Whether to shuffle the data at the beginning of each epoch
         concatenation of list the display names of the outputs of `f` and the
@@ -164,44 +87,98 @@ def model_iteration(model,
         the default value of `None`.
       validation_steps: Number of steps to run validation for (only if doing
         validation from data tensors). Ignored with the default value of `None`.
-      mode: One of 'train'/'test'/'predict'.
-      validation_in_fit: DEPRECATED: if true, then this method is invoked from
-        within training iteration (for validation). In this case, do not copy
-        weights when using a tf.distribute.Strategy. The input is deprecated as
-        it is not required if the user creates a distributed model under the
-        distribution strategy scope rather than passing it to compile.
+      validation_freq: Only relevant if validation data is provided. Integer or
+        `collections.Container` instance (e.g. list, tuple, etc.). If an
+        integer, specifies how many training epochs to run before a new
+        validation run is performed, e.g. `validation_freq=2` runs
+        validation every 2 epochs. If a Container, specifies the epochs on
+        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+        validation at the end of the 1st, 2nd, and 10th epochs.
+      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
+      validation_in_fit: if true, then this method is invoked from within
+        training iteration (for validation). In the case where `val_inputs` is a
+        dataset, this flag indicates that its iterator and feed values are
+        already created so should properly reuse resources.
+      prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
+        tensors returned from `_prepare_feed_values` call on the validation
+        dataset, so do not call it again on `inputs`. Should only be used for
+        inline validation (i.e., only if `validation_in_fit` is also True).
+      steps_name: The string name of the steps argument, either `steps`,
+        `validation_steps`, or `steps_per_epoch`. Only used for error message
+        formatting.
       **kwargs: Additional arguments for backwards compatibility.
 
   Returns:
-      - In 'train' mode: `History` object.
-      - In 'test' mode: Evaluation metrics.
-      - In 'predict' mode: Outputs of the Model called on inputs.
+      - In TRAIN mode: `History` object.
+      - In TEST mode: Evaluation metrics.
+      - In PREDICT mode: Outputs of the Model called on inputs.
 
   Raises:
       ValueError: in case of invalid arguments.
   """
   # Backwards compatibility.
   if 'steps' in kwargs:
-    steps_per_epoch = kwargs['steps']
-
-  _validate_arguments(steps_per_epoch, validation_steps, kwargs)
-  if mode == 'train':
+    steps_per_epoch = kwargs.pop('steps')
+  if kwargs:
+    raise TypeError('Unknown arguments: %s' % (kwargs,))
+
+  # In case we were passed a dataset, we extract symbolic tensors from it.
+  reset_dataset_after_each_epoch = False
+  input_iterator = None
+  is_dataset = isinstance(inputs,
+                          (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
+  # TODO(fchollet): consider moving `steps_per_epoch` inference to
+  # _standardize_user_data and set reset_dataset_after_each_epoch as an
+  # attribute on the dataset instance.
+  if is_dataset:
+    if steps_per_epoch is None:
+      reset_dataset_after_each_epoch = True
+      steps_per_epoch = training_utils.infer_steps_for_dataset(
+          inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name)
+    input_iterator = _get_iterator(inputs, model._distribution_strategy)
+
+  if mode == ModeKeys.TRAIN:
     _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
 
   # Enter DistributionStrategy scope.
   if model._distribution_strategy:
-    scope = model._distribution_strategy.scope()
+    scope = distributed_training_utils.distributed_scope(
+        strategy=model._distribution_strategy,
+        learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
     scope.__enter__()
 
   # Get step function and loop type.
   f = _make_execution_function(model, mode)
-  use_steps = steps_per_epoch is not None
+  use_steps = is_dataset or steps_per_epoch is not None
   do_validation = val_inputs is not None
 
+  # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
+  inputs, targets, sample_weights = training_utils. \
+      convert_eager_tensors_to_numpy((inputs, targets, sample_weights))
+
   # Prepare input data.
-  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
-  num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
-                                                   steps_per_epoch)
+  inputs = input_iterator or inputs
+  if validation_in_fit and prepared_feed_values_from_dataset:
+    # When invoking validation in training loop, avoid creating iterator and
+    # list of feed values for the same validation dataset multiple times (which
+    # essentially would call `iterator.get_next()` that slows down execution and
+    # leads to OOM errors eventually.
+    ins = inputs
+  else:
+    ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
+  if not is_dataset:
+    num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
+                                                     steps_per_epoch)
+  else:
+    num_samples_or_steps = steps_per_epoch
+
+  # Prepare validation data. Hold references to the iterator and the input list
+  # to properly reinitialize and reuse in multiple validation passes.
+  val_iterator = None
+  if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+    val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
+    val_inputs = _prepare_feed_values(
+        model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST)
 
   # Configure callbacks.
   count_mode = 'steps' if use_steps else 'samples'
@@ -229,16 +206,15 @@ def model_iteration(model,
         indices_for_conversion_to_dense.append(i)
 
   # Select aggregation method.
-  if mode == 'predict':
+  if mode == ModeKeys.PREDICT:
     aggregator = training_utils.OutputsAggregator(use_steps,
                                                   num_samples_or_steps)
   else:
     aggregator = training_utils.MetricsAggregator(use_steps,
                                                   num_samples_or_steps)
 
-  if model._compile_distribution and not validation_in_fit:
-    training_distributed._copy_weights_to_distributed_model(
-        model, model._distributed_model)
+  if model._compile_distribution:
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
@@ -251,12 +227,21 @@ def model_iteration(model,
     # Setup work for each epoch
     epoch_logs = {}
     model.reset_metrics()
-    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    if mode == ModeKeys.TRAIN:
+      callbacks.on_epoch_begin(epoch, epoch_logs)
     progbar.on_epoch_begin(epoch, epoch_logs)
 
     if use_steps:
       # Step-wise loop.
-      for step in range(steps_per_epoch):
+      if steps_per_epoch is None:
+        # Loop over dataset until `OutOfRangeError` is raised.
+        target_steps = np.inf
+      else:
+        # Loop over dataset for the specified number of steps.
+        target_steps = steps_per_epoch
+
+      step = 0
+      while step < target_steps:
         batch_logs = {'batch': step, 'size': 1}
         callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
         progbar.on_batch_begin(step, batch_logs)
@@ -267,18 +252,31 @@ def model_iteration(model,
           actual_inputs = ins() if callable(ins) else ins
           batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
-          logging.warning('Your dataset iterator ran out of data; '
-                          'interrupting training. Make sure that your dataset '
-                          'can generate at least `steps_per_epoch * epochs` '
-                          'batches (in this case, %d batches). You may need to'
-                          'use the repeat() function when building your '
-                          'dataset.' % steps_per_epoch * epochs)
+          if not is_dataset:
+            # We ran out of batches while the user passed an iterator (legacy).
+            logging.warning(
+                'Your dataset iterator ran out of data; '
+                'interrupting training. Make sure that your iterator '
+                'can generate at least `%s * epochs` '
+                'batches (in this case, %d batches). You may need to'
+                'use the repeat() function when building your '
+                'dataset.' % (steps_name, steps_per_epoch * epochs))
+            callbacks.model.stop_training = True
+          else:
+            # The dataset passed by the user ran out of batches.
+            # Now we know the cardinality of the dataset.
+            if step > 0:
+              steps_per_epoch = step
+              aggregator.num_samples_or_steps = steps_per_epoch
+              progbar.params['steps'] = steps_per_epoch
+              progbar.progbar.target = steps_per_epoch
           break
+
         if not isinstance(batch_outs, list):
           batch_outs = [batch_outs]
 
         if model._distribution_strategy:
-          batch_outs = training_distributed._per_device_aggregate_batch(
+          batch_outs = distributed_training_utils._per_device_aggregate_batch(
               batch_outs, model, mode)
 
         # Aggregate results.
@@ -290,6 +288,7 @@ def model_iteration(model,
         batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
         callbacks._call_batch_hook(mode, 'end', step, batch_logs)
         progbar.on_batch_end(step, batch_logs)
+        step += 1
 
         if callbacks.model.stop_training:
           break
@@ -351,8 +350,17 @@ def model_iteration(model,
     if len(results) == 1:
       results = results[0]
 
-    # Run the test loop every epoch during training.
-    if do_validation and not callbacks.model.stop_training:
+    # Run the test loop every `validation_freq` epochs during training.
+    if (do_validation and
+        training_utils.should_run_validation(validation_freq, epoch) and
+        not callbacks.model.stop_training):
+
+      if model._compile_distribution:
+        # Since we create a new clone from the original model we need to copy
+        # the weights back to the original model before we can run validation.
+        distributed_training_utils._copy_weights_to_original_model(
+            model, ModeKeys.TRAIN)
+
       val_results = model_iteration(
           model,
           val_inputs,
@@ -362,33 +370,138 @@ def model_iteration(model,
           steps_per_epoch=validation_steps,
           callbacks=callbacks,
           verbose=0,
-          mode='test',
-          validation_in_fit=True)
+          mode=ModeKeys.TEST,
+          validation_in_fit=True,
+          prepared_feed_values_from_dataset=(val_iterator is not None),
+          steps_name='validation_steps')
       if not isinstance(val_results, list):
         val_results = [val_results]
       epoch_logs = cbks.make_logs(
           model, epoch_logs, val_results, mode, prefix='val_')
+      if val_iterator and epoch < epochs - 1:
+        _reinitialize_iterator(val_iterator, model._distribution_strategy)
 
-    if mode == 'train':
+    if mode == ModeKeys.TRAIN:
       # Epochs only apply to `fit`.
-      callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
-      progbar.on_epoch_end(epoch, epoch_logs)
+      callbacks.on_epoch_end(epoch, epoch_logs)
+    progbar.on_epoch_end(epoch, epoch_logs)
+
+    # Reinitialize dataset iterator for the next epoch.
+    if reset_dataset_after_each_epoch and epoch < epochs - 1:
+      _reinitialize_iterator(input_iterator, model._distribution_strategy)
 
   callbacks._call_end_hook(mode)
 
   if model._distribution_strategy:
-    if model._compile_distribution and not validation_in_fit:
+    if model._compile_distribution:
       # TODO(priyag, psv): Copy back metrics to the original model as well?
-      training_distributed._copy_weights_to_original_model(
-          model, model._distributed_model, mode)
+      distributed_training_utils._copy_weights_to_original_model(model, mode)
     scope.__exit__(None, None, None)
 
-  if mode == 'train':
+  if mode == ModeKeys.TRAIN:
     return model.history
   return results
 
 
+def _get_model_feed(model, mode):
+  if mode == ModeKeys.PREDICT:
+    feed = model._feed_inputs
+  else:
+    feed = (
+        model._feed_inputs + model._feed_targets + model._feed_sample_weights)
+  return feed
+
+
+def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
+  if (val_inputs and steps_per_epoch is None and verbose and inputs and
+      hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
+    print('Train on %d samples, validate on %d samples' %
+          (inputs[0].shape[0], val_inputs[0].shape[0]))
+
+
+def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
+  """Returns total number of samples (when training in batch mode) or steps."""
+  if steps_per_epoch:
+    return steps_per_epoch
+  return training_utils.check_num_samples(ins, batch_size, steps_per_epoch,
+                                          'steps_per_epoch')
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  if model._distribution_strategy:
+    if isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+      inputs = distributed_training_utils.get_iterator(
+          inputs, model._distribution_strategy)
+
+    def get_distributed_inputs():
+      return distributed_training_utils._prepare_feed_values(
+          model, inputs, targets, sample_weights, mode)
+
+    # In the eager case, we want to call the input method per step, so return
+    # a lambda from here that can be called. Note that this is applicable only
+    # in Distribution Strategy case as it follows the same code path for both
+    # eager and graph modes.
+    # TODO(priyag,omalleyt): Either we should move the training DS with
+    # EagerIterator to use training_generator code path, or figure out how to
+    # set a symbolic Iterator out of a Dataset when in eager mode.
+    if context.executing_eagerly():
+      return get_distributed_inputs
+    else:
+      return get_distributed_inputs()
+
+  if isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
+                         iterator_ops.Iterator)):
+    inputs, targets, sample_weights = model._standardize_user_data(
+        inputs,
+        extract_tensors_from_dataset=True)
+
+  inputs = training_utils.ModelInputs(inputs).as_list()
+  targets = targets or []
+  sample_weights = sample_weights or []
+  ins = inputs + targets + sample_weights
+  if mode == ModeKeys.TRAIN and not isinstance(K.symbolic_learning_phase(),
+                                               int):
+    ins += [True]  # Add learning phase value.
+  return ins
+
+
+def _get_iterator(inputs, distribution_strategy=None):
+  if distribution_strategy:
+    return distributed_training_utils.get_iterator(
+        inputs, distribution_strategy)
+  return training_utils.get_iterator(inputs)
+
+
+def _reinitialize_iterator(iterator, distribution_strategy=None):
+  if distribution_strategy:
+    distributed_training_utils.initialize_iterator(
+        iterator, distribution_strategy)
+  else:
+    training_utils.initialize_iterator(iterator)
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of model execution."""
+  if model._distribution_strategy:
+    return distributed_training_utils._make_execution_function(model, mode)
+  return model._make_execution_function(mode)
+
+
 # For backwards compatibility for internal users of these loops.
-fit_loop = functools.partial(model_iteration, mode='train')
-test_loop = functools.partial(model_iteration, mode='test', shuffle=False)
-predict_loop = functools.partial(model_iteration, mode='predict', shuffle=False)
+fit_loop = functools.partial(model_iteration, mode=ModeKeys.TRAIN)
+test_loop = functools.partial(
+    model_iteration, mode=ModeKeys.TEST, shuffle=False)
+predict_loop = functools.partial(
+    model_iteration, mode=ModeKeys.PREDICT, shuffle=False)
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
index d6cc93d1ef77b14142851e6267158d61edcbc13b..5ed07fc13135000ccfe00a79e3b7c8eca2e90f28 100644
--- a/tensorflow/python/keras/engine/training_dataset_test.py
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -23,17 +23,15 @@ import logging
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
@@ -42,7 +40,7 @@ class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_iterators_single_io(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics,
@@ -90,43 +88,20 @@ class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
                 epochs=1, steps_per_epoch=2, verbose=0)
 
     with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
+        ValueError, 'the `steps_per_epoch` argument'):
       model.fit(iterator, epochs=1, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.evaluate(iterator, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.predict(iterator, verbose=0)
 
-  @keras_parameterized.run_with_all_model_types
-  @keras_parameterized.run_all_keras_modes
-  def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics,
-                  run_eagerly=testing_utils.should_run_eagerly())
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    # Finalize graph to make sure we are not appending another iterator
-    # get_next op in the graph.
-    ops.get_default_graph().finalize()
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_iterators_running_out_of_data(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae']
     model.compile(optimizer, loss, metrics=metrics,
@@ -157,7 +132,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
       self.skipTest('b/120673224')
 
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae']
     model.compile(optimizer, loss, metrics=metrics,
@@ -172,9 +147,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     # Call fit with validation data
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
-    # Finalize the graph to make sure new ops aren't added when calling on the
-    # same dataset
-    ops.get_default_graph().finalize()
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
 
@@ -182,7 +154,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_dataset(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics,
@@ -191,7 +163,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     inputs = np.zeros((10, 3), np.float32)
     targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
+    dataset = dataset.repeat()  # Infinite dataset.
     dataset = dataset.batch(10)
 
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
@@ -223,26 +195,41 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
           sample_weight=sample_weight)
 
     # Test invalid usage
+    with self.assertRaisesRegexp(ValueError, 'The `batch_size` argument'
+                                 ' must not be specified when using dataset'
+                                 ' as an input.'):
+      model.fit(dataset, batch_size=10, epochs=1, steps_per_epoch=2,
+                verbose=0)
+    with self.assertRaisesRegexp(ValueError, 'The `batch_size` argument'
+                                 ' must not be specified when using dataset'
+                                 ' as an input.'):
+      model.predict(dataset, batch_size=10, steps=2, verbose=0)
+    with self.assertRaisesRegexp(ValueError, 'The `batch_size` argument'
+                                 ' must not be specified when using dataset'
+                                 ' as an input.'):
+      model.evaluate(dataset, batch_size=10, steps=2, verbose=0)
+
     with self.assertRaisesRegexp(ValueError,
                                  'you should not specify a target'):
       model.fit(dataset, dataset,
                 epochs=1, steps_per_epoch=2, verbose=0)
 
+    # With an infinite dataset, `steps_per_epoch`/`steps` argument is required.
     with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
+        ValueError, 'the `steps_per_epoch` argument'):
       model.fit(dataset, epochs=1, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.evaluate(dataset, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.predict(dataset, verbose=0)
 
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_dataset_with_sample_weights(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics,
@@ -264,25 +251,52 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_dataset_with_sparse_labels(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    for loss in ['sparse_categorical_crossentropy',
-                 losses_impl.sparse_softmax_cross_entropy]:
-      model.compile(optimizer, loss,
-                    run_eagerly=testing_utils.should_run_eagerly())
-
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.random.randint(0, 4, size=10, dtype=np.int32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
+    optimizer = 'rmsprop'
+    model.compile(
+        optimizer,
+        loss='sparse_categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    inputs = np.zeros((10, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=10, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_fit_correctness(self):
+
+    class SumLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.w = self.add_weight('w', ())
+
+      def call(self, inputs):
+        return keras.backend.sum(inputs) + self.w * 0
+
+    model = keras.Sequential([SumLayer(input_shape=(2,))])
+    model.compile(
+        'rmsprop', loss='mae', run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((40, 2), dtype=np.float32)
+    inputs[10:20, :] = 2
+    inputs[20:30, :] = 1
+    inputs[30:, :] = 4
+    targets = np.zeros((40, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.batch(10)
+    history = model.fit(dataset,
+                        epochs=2, steps_per_epoch=2, verbose=1, shuffle=False)
+    self.assertListEqual(history.history['loss'],
+                         [inputs[:20].sum() / 2, inputs[20:].sum() / 2])
 
   @tf_test_util.run_deprecated_v1
   def test_dataset_input_shape_validation(self):
     with self.cached_session():
       model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+      model.compile(optimizer='rmsprop', loss='mse')
 
       # User forgets to batch the dataset
       inputs = np.zeros((10, 3))
@@ -307,6 +321,46 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
                                    r'expected (.*?) to have shape \(3,\)'):
         model.train_on_batch(dataset)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_known_cardinality_no_steps_arg(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = 'rmsprop'
+    model.compile(optimizer, 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.batch(10)
+
+    history = model.fit(dataset, epochs=2, verbose=1)
+    self.assertEqual(len(history.history['loss']), 2)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = 'rmsprop'
+    model.compile(optimizer, 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.filter(lambda x, y: True).batch(10)
+    self.assertEqual(keras.backend.get_value(cardinality.cardinality(dataset)),
+                     cardinality.UNKNOWN)
+
+    history = model.fit(dataset, epochs=2, verbose=1)
+    self.assertEqual(len(history.history['loss']), 2)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
 
 class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
 
@@ -324,7 +378,7 @@ class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
     model.compile(
         loss='binary_crossentropy',
         metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(123)
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 7842228e050cf68468a60ce15569c6a4dd984343..57b665a3daf21fd08c378bce260708cc7301b836 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -19,49 +19,204 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import enum  # pylint: disable=g-bad-import-order
 import numpy as np
 
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.engine import training_arrays
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 
 
-class _Mode(enum.Enum):
-  TRAIN = 'train'
-  TEST = 'test'
-  PREDICT = 'predict'
-# TODO(priyag, sourabhbajaj): Refactor this file to address code duplication.
-
-
-def experimental_fit_loop(model,
-                          iterator,
-                          epochs=100,
-                          verbose=1,
-                          callbacks=None,
-                          initial_epoch=0,
-                          steps_per_epoch=None,
-                          val_iterator=None,
-                          validation_steps=None):
+def fit_distributed(model,
+                    x=None,
+                    y=None,
+                    batch_size=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_split=0.,
+                    validation_data=None,
+                    shuffle=True,
+                    class_weight=None,
+                    sample_weight=None,
+                    initial_epoch=0,
+                    steps_per_epoch=None,
+                    validation_steps=None,
+                    validation_freq=1):
+  """Fit loop for Distribution Strategies."""
+  distributed_training_utils.validate_callbacks(callbacks, model.optimizer)
+  distributed_training_utils.validate_inputs(
+      x, y, model._distribution_strategy)
+
+  first_x_value = nest.flatten(x)[0]
+  if isinstance(first_x_value, np.ndarray):
+    steps_per_epoch, batch_size = (
+        distributed_training_utils.get_input_params(
+            model._distribution_strategy, first_x_value, steps_per_epoch,
+            batch_size, is_training=True))
+  batch_size = model._validate_or_infer_batch_size(
+      batch_size, steps_per_epoch, x)
+  dataset = model._distribution_standardize_user_data(
+      x, y,
+      sample_weight=sample_weight,
+      class_weight=class_weight,
+      batch_size=batch_size,
+      check_steps=True,
+      steps_name='steps_per_epoch',
+      steps=steps_per_epoch,
+      validation_split=validation_split,
+      shuffle=shuffle)
+
+  val_dataset = None
+  if validation_data:
+    val_x, val_y, val_sample_weights = model._unpack_validation_data(
+        validation_data)
+    distributed_training_utils.validate_inputs(
+        val_x, val_y, model._distribution_strategy)
+    first_valx_value = nest.flatten(val_x)[0]
+    if isinstance(first_valx_value, np.ndarray):
+      validation_steps, _ = distributed_training_utils.get_input_params(
+          model._distribution_strategy, first_valx_value, validation_steps,
+          batch_size)
+    val_dataset = model._distribution_standardize_user_data(
+        val_x, val_y,
+        sample_weight=val_sample_weights,
+        class_weight=None,
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='validation_steps',
+        steps=validation_steps,
+        validation_split=validation_split,
+        shuffle=shuffle)
+  elif validation_split:
+    raise ValueError('validation_split argument is not supported with '
+                     'distribution strategies.')
+
+  if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
+    return experimental_tpu_fit_loop(
+        model,
+        dataset,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        val_dataset=val_dataset,
+        initial_epoch=initial_epoch,
+        steps_per_epoch=steps_per_epoch,
+        validation_steps=validation_steps,
+        validation_freq=1)
+  else:
+    return training_arrays.fit_loop(
+        model,
+        dataset,
+        batch_size=batch_size,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        val_inputs=val_dataset,
+        shuffle=shuffle,
+        initial_epoch=initial_epoch,
+        steps_per_epoch=steps_per_epoch,
+        validation_steps=validation_steps,
+        validation_freq=validation_freq)
+
+
+def evaluate_distributed(model,
+                         x=None,
+                         y=None,
+                         batch_size=None,
+                         verbose=1,
+                         sample_weight=None,
+                         steps=None,
+                         callbacks=None):
+  """Evaluate loop for Distribution Strategies."""
+  distributed_training_utils.validate_inputs(x, y, model._distribution_strategy)
+  first_x_value = nest.flatten(x)[0]
+  if isinstance(first_x_value, np.ndarray):
+    steps, batch_size = distributed_training_utils.get_input_params(
+        model._distribution_strategy, first_x_value, steps, batch_size)
+  batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
+  dataset = model._distribution_standardize_user_data(
+      x, y,
+      sample_weight=sample_weight,
+      batch_size=batch_size,
+      check_steps=True,
+      steps_name='steps',
+      steps=steps)
+
+  if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
+    # TODO(fchollet): why aren't callbacks supported here?
+    return experimental_tpu_test_loop(
+        model, dataset, verbose=verbose, steps=steps)
+  else:
+    return training_arrays.test_loop(
+        model,
+        inputs=dataset,
+        batch_size=batch_size,
+        verbose=verbose,
+        steps=steps,
+        callbacks=callbacks)
+
+
+def predict_distributed(model,
+                        x=None,
+                        batch_size=None,
+                        verbose=0,
+                        steps=None,
+                        callbacks=None):
+  """Predict loop for Distribution Strategies."""
+  distributed_training_utils.validate_inputs(
+      x, None, model._distribution_strategy)
+  first_x_value = nest.flatten(x)[0]
+  if isinstance(first_x_value, np.ndarray):
+    steps, batch_size = distributed_training_utils.get_input_params(
+        model._distribution_strategy, first_x_value, steps, batch_size)
+  batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
+  dataset = model._distribution_standardize_user_data(
+      x,
+      batch_size=batch_size,
+      check_steps=True,
+      steps_name='steps',
+      steps=steps)
+  if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
+    # TODO(fchollet): why aren't callbacks supported here?
+    return experimental_tpu_predict_loop(
+        model, dataset, verbose=verbose, steps=steps)
+  else:
+    return training_arrays.predict_loop(
+        model,
+        dataset,
+        batch_size=batch_size,
+        verbose=verbose,
+        steps=steps,
+        callbacks=callbacks)
+
+
+def experimental_tpu_fit_loop(model,
+                              dataset,
+                              epochs=100,
+                              verbose=1,
+                              callbacks=None,
+                              initial_epoch=0,
+                              steps_per_epoch=None,
+                              val_dataset=None,
+                              validation_steps=None,
+                              validation_freq=1):
   """Fit loop for training with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator that returns inputs and targets
+      dataset: Dataset that returns inputs and targets
       epochs: Number of times to iterate over the data
       verbose: Integer, Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
@@ -70,10 +225,17 @@ def experimental_fit_loop(model,
       steps_per_epoch: Total number of steps (batches of samples)
           before declaring one epoch finished and starting the
           next epoch. Ignored with the default value of `None`.
-      val_iterator: Iterator for validation data.
+      val_dataset: Dataset for validation data.
       validation_steps: Number of steps to run validation for
           (only if doing validation from data tensors).
           Ignored with the default value of `None`.
+      validation_freq: Only relevant if validation data is provided. Integer or
+          `collections.Container` instance (e.g. list, tuple, etc.). If an
+          integer, specifies how many training epochs to run before a new
+          validation run is performed, e.g. `validation_freq=2` runs
+          validation every 2 epochs. If a Container, specifies the epochs on
+          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+          validation at the end of the 1st, 2nd, and 10th epochs.
 
   Returns:
       Returns `None`.
@@ -81,8 +243,12 @@ def experimental_fit_loop(model,
   Raises:
       ValueError: in case of invalid arguments.
   """
+  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
   current_strategy = model._distribution_strategy
-  scope = current_strategy.scope()
+  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
+
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=1)
   scope.__enter__()
 
   def _per_device_fit_function(model):
@@ -90,20 +256,18 @@ def experimental_fit_loop(model,
     return (model._fit_function.inputs, model._fit_function.outputs,
             model._fit_function.updates_op, model._fit_function.session_kwargs)
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(1)
   out_labels = model.metrics_names or []
 
   def step_fn(ctx, inputs):
     """Clones the model and calls make_fit_function."""
     inputs, targets = inputs
     if model._compile_distribution:
-      clone_model_on_replicas(model, current_strategy,
-                              make_callback_model=True, inputs=inputs,
-                              targets=targets, mode=_Mode.TRAIN)
+      distributed_training_utils.clone_model_on_replicas(
+          model, current_strategy, ModeKeys.TRAIN, inputs=inputs,
+          targets=targets)
     else:
-      _build_distributed_network(model, current_strategy, inputs,
-                                 targets, mode=_Mode.TRAIN)
+      distributed_training_utils._build_distributed_network(
+          model, current_strategy, ModeKeys.TRAIN, inputs, targets)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = current_strategy.extended.call_for_each_replica(
@@ -148,10 +312,9 @@ def experimental_fit_loop(model,
       dtype='int32',
       name='steps_per_run')
 
-  with current_strategy.scope():
-    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
-        step_fn, iterator, iterations=steps_per_run,
-        initial_loop_values=initial_loop_values)
+  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+      step_fn, iterator, iterations=steps_per_run,
+      initial_loop_values=initial_loop_values)
 
   train_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
@@ -159,8 +322,8 @@ def experimental_fit_loop(model,
   do_validation = bool(validation_steps)
 
   if model._compile_distribution:
-    with current_strategy.scope():
-      _copy_weights_to_distributed_model(model, model._distributed_model_train)
+    distributed_training_utils._copy_weights_to_distributed_model(
+        model, ModeKeys.TRAIN)
 
   callbacks = cbks.configure_callbacks(
       callbacks,
@@ -179,8 +342,7 @@ def experimental_fit_loop(model,
 
   callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
-    with current_strategy.scope():
-      _reset_metrics(model, model._distributed_model_train)
+    distributed_training_utils._reset_metrics(model)
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
     step_index = 0
@@ -207,19 +369,19 @@ def experimental_fit_loop(model,
       if callbacks.model.stop_training:
         break
 
-    if do_validation:
+    if (do_validation and
+        training_utils.should_run_validation(validation_freq, epoch)):
       logging.info('Running validation at fit epoch: %s', epoch)
 
       if model._compile_distribution:
         # Since we create a new clone from the original model we need to copy
         # the weights back to the original model before we can run validation.
-        with current_strategy.scope():
-          _copy_weights_to_original_model(
-              model, model._distributed_model_train, 'train')
+        distributed_training_utils._copy_weights_to_original_model(
+            model, ModeKeys.TRAIN)
 
-      val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
+      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
           model,
-          val_iterator,
+          val_dataset,
           steps=validation_steps,
           verbose=verbose)
       if not isinstance(val_outs, list):
@@ -235,22 +397,21 @@ def experimental_fit_loop(model,
 
   if model._compile_distribution:
     # Copy the weights back from the replicated model to the original model.
-    with current_strategy.scope():
-      _copy_weights_to_original_model(model, model._distributed_model_train,
-                                      'train')
+    distributed_training_utils._copy_weights_to_original_model(
+        model, ModeKeys.TRAIN)
   scope.__exit__(None, None, None)
   return model.history
 
 
-def experimental_test_loop(model,
-                           iterator,
-                           verbose=0,
-                           steps=None):
+def experimental_tpu_test_loop(model,
+                               dataset,
+                               verbose=0,
+                               steps=None):
   """Test loop for evaluating with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator for input data.
+      dataset: Dataset for input data.
       verbose: Integer, Verbosity mode 0 or 1.
       steps: Total number of steps (batches of samples)
           before declaring predictions finished.
@@ -263,7 +424,9 @@ def experimental_test_loop(model,
       the display labels for the outputs.
   """
   current_strategy = model._distribution_strategy
-  scope = current_strategy.scope()
+  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=0)
   scope.__enter__()
 
   def _per_device_eval_function(model):
@@ -272,19 +435,16 @@ def experimental_test_loop(model,
             model._eval_function.updates_op,
             model._eval_function.session_kwargs)
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(0)
-
   def step_fn(ctx, inputs):
     """Clones the model and calls make_eval_function."""
     inputs, targets = inputs
     if model._compile_distribution:
-      clone_model_on_replicas(model, current_strategy,
-                              make_callback_model=False, inputs=inputs,
-                              targets=targets, mode=_Mode.TEST)
+      distributed_training_utils. clone_model_on_replicas(
+          model, current_strategy, mode=ModeKeys.TEST, inputs=inputs,
+          targets=targets)
     else:
-      _build_distributed_network(model, current_strategy, inputs,
-                                 targets, mode=_Mode.TEST)
+      distributed_training_utils._build_distributed_network(
+          model, current_strategy, ModeKeys.TEST, inputs, targets)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = current_strategy.extended.call_for_each_replica(
@@ -319,12 +479,11 @@ def experimental_test_loop(model,
     tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
-  with current_strategy.scope():
-    # TODO(priyag): Use steps_per_run when we use new metrics as they will
-    # allow handling metric computation at each step using variables.
-    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
-        step_fn, iterator, iterations=1,
-        initial_loop_values=initial_loop_values)
+  # TODO(priyag): Use steps_per_run when we use new metrics as they will
+  # allow handling metric computation at each step using variables.
+  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+      step_fn, iterator, iterations=1,
+      initial_loop_values=initial_loop_values)
 
   test_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
@@ -333,10 +492,10 @@ def experimental_test_loop(model,
     progbar = Progbar(target=steps)
 
   if model._compile_distribution:
-    with current_strategy.scope():
-      _copy_weights_to_distributed_model(model, model._distributed_model_test)
-  with current_strategy.scope():
-    _reset_metrics(model, model._distributed_model_test)
+    distributed_training_utils._copy_weights_to_distributed_model(
+        model, ModeKeys.TEST)
+
+  distributed_training_utils._reset_metrics(model)
 
   assert steps is not None
   outs = [0.] * len(model.metrics_names)
@@ -362,12 +521,12 @@ def experimental_test_loop(model,
   return outs
 
 
-def experimental_predict_loop(model, iterator, verbose=0, steps=None):
+def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None):
   """Predict loop for predicting with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator for input data.
+      dataset: Dataset for input data.
       verbose: Integer, Verbosity mode 0 or 1.
       steps: Total number of steps (batches of samples)
           before declaring `_predict_loop` finished.
@@ -379,11 +538,11 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
       (if the model has multiple outputs).
   """
   current_strategy = model._distribution_strategy
-  scope = current_strategy.scope()
-  scope.__enter__()
+  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(0)
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=0)
+  scope.__enter__()
 
   def _per_device_predict_function(model):
     model._make_predict_function()
@@ -395,12 +554,11 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
   def step_fn(ctx, inputs):
     """Clones the model and calls make_predict_function."""
     if model._compile_distribution:
-      clone_model_on_replicas(model, current_strategy,
-                              make_callback_model=False, inputs=inputs,
-                              mode=_Mode.PREDICT)
+      distributed_training_utils. clone_model_on_replicas(
+          model, current_strategy, ModeKeys.PREDICT, inputs=inputs)
     else:
-      _build_distributed_network(model, current_strategy, inputs,
-                                 mode=_Mode.PREDICT)
+      distributed_training_utils._build_distributed_network(
+          model, current_strategy, ModeKeys.PREDICT, inputs)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = current_strategy.extended.call_for_each_replica(
@@ -432,11 +590,10 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
     shape.dims = [batch_dimension] + shape.dims[1:]
     initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)
 
-  with current_strategy.scope():
-    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
-    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
-        step_fn, iterator, iterations=1,
-        initial_loop_values=initial_loop_values)
+  # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
+  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+      step_fn, iterator, iterations=1,
+      initial_loop_values=initial_loop_values)
 
   predict_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
@@ -445,11 +602,10 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
     progbar = Progbar(target=steps)
 
   if model._compile_distribution:
-    with current_strategy.scope():
-      _copy_weights_to_distributed_model(
-          model, model._distributed_model_predict)
-  with current_strategy.scope():
-    _reset_metrics(model, model._distributed_model_predict)
+    distributed_training_utils._copy_weights_to_distributed_model(
+        model, ModeKeys.PREDICT)
+
+  distributed_training_utils._reset_metrics(model)
 
   assert steps is not None
   # Since we do not know how many samples we will see, we cannot pre-allocate
@@ -465,347 +621,10 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
       progbar.update(step + 1)
 
   scope.__exit__(None, None, None)
+
   if len(unconcatenated_outs) == 1:
     return np.concatenate(unconcatenated_outs[0], axis=0)
   return [
       np.concatenate(unconcatenated_outs[i], axis=0)
       for i in range(len(unconcatenated_outs))
   ]
-
-
-def _custom_compile_for_predict(model):
-  """Custom compile for TPU predict mode."""
-  model.total_loss = None
-  model._fit_function = None
-  model._eval_function = None
-  model.train_function = None
-  model.test_function = None
-  model.predict_function = None
-
-
-def _build_network_on_replica(model, inputs=None, targets=None, mode=None):
-  """Build an updated model on replicas.
-
-  We create a new Keras model while sharing the variables from the old graph.
-  Building a new sub-graph is required since the original keras model creates
-  placeholders for the input and the output that are not accessible till we
-  call iterator.get_next() inside the step_fn for `fit`/`evaluate`/`predict`.
-
-  The sharing of weights and layers between the old and the new model gaurantee
-  that we're using Strategy variables and any updates on either model are
-  reflected correctly in callbacks and loop iterations.
-
-  We need to make sure we share the optimizers between the old and the new model
-  as well so that optimizer state is not lost if the user is running fit
-  multiple times.
-
-  Args:
-    model: Model to be replicated across Replicas
-    inputs: Input variables to be passed to the model
-    targets: Target tensor to be passed to model.compile
-    mode: Which of fit/eval/predict is building the distributed network
-
-  Returns:
-    A new model with shared layers with the old model.
-  """
-  # Need to do imports here since we run into a circular dependency error.
-  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
-  from tensorflow.python.keras.engine import sequential  # pylint: disable=g-import-not-at-top
-
-  # We rely on the internal methods to avoid having share_weights weights in the
-  # public API.
-  if isinstance(model, sequential.Sequential):
-    updated_model = models._clone_sequential_model(model, input_tensors=inputs,
-                                                   share_weights=True)
-  else:
-    updated_model = models._clone_functional_model(model, input_tensors=inputs,
-                                                   share_weights=True)
-
-  # Recast all low precision outputs back to float32 since we only casted
-  # the inputs to bfloat16 and not targets. This is done so that we can preserve
-  # precision when calculating the loss value.
-  def _upcast_low_precision_outputs(output):
-    if output.dtype == dtypes.bfloat16:
-      return math_ops.cast(output, dtypes.float32)
-    else:
-      return output
-  updated_model.outputs = [_upcast_low_precision_outputs(o)
-                           for o in updated_model.outputs]
-
-  if isinstance(targets, tuple):
-    targets = nest.flatten(targets)
-
-  if mode == _Mode.PREDICT:
-    _custom_compile_for_predict(updated_model)
-  else:
-    updated_model.compile(
-        model.optimizer,
-        model.loss,
-        metrics=metrics_module.clone_metrics(model._compile_metrics),
-        loss_weights=model.loss_weights,
-        sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(
-            model._compile_weighted_metrics),
-        target_tensors=targets)
-  return updated_model
-
-
-def _build_distributed_network(model, strategy, inputs=None, targets=None,
-                               mode=None):
-  """Create a cloned model on each replica."""
-  with K.get_graph().as_default(), strategy.scope():
-    distributed_model = strategy.extended.call_for_each_replica(
-        _build_network_on_replica,
-        args=(model, inputs, targets, mode))
-    if mode is _Mode.TRAIN:
-      model._distributed_model_train = distributed_model
-    elif mode is _Mode.TEST:
-      model._distributed_model_test = distributed_model
-    elif mode is _Mode.PREDICT:
-      model._distributed_model_predict = distributed_model
-    else:
-      model._distributed_model = distributed_model
-
-
-def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
-  """Clone and build the given keras_model."""
-  # We need to set the import here since we run into a circular dependency
-  # error.
-  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
-  cloned_model = models.clone_model(model, input_tensors=inputs)
-
-  # Compile and build model.
-  if isinstance(model.optimizer, optimizers.TFOptimizer):
-    optimizer = model.optimizer
-  else:
-    optimizer_config = model.optimizer.get_config()
-    optimizer = model.optimizer.__class__.from_config(optimizer_config)
-
-  # Recast all low precision outputs back to float32 since we only casted
-  # the inputs to bfloat16 and not targets. This is done so that we can preserve
-  # precision when calculating the loss value.
-  def _upcast_low_precision_outputs(output):
-    if output.dtype == dtypes.bfloat16:
-      return math_ops.cast(output, dtypes.float32)
-    else:
-      return output
-  cloned_model.outputs = [_upcast_low_precision_outputs(o)
-                          for o in cloned_model.outputs]
-
-  if isinstance(targets, tuple):
-    targets = nest.flatten(targets)
-  if mode == _Mode.PREDICT:
-    _custom_compile_for_predict(cloned_model)
-  else:
-    cloned_model.compile(
-        optimizer,
-        model.loss,
-        metrics=metrics_module.clone_metrics(model._compile_metrics),
-        loss_weights=model.loss_weights,
-        sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(
-            model._compile_weighted_metrics),
-        target_tensors=targets)
-  return cloned_model
-
-
-def clone_model_on_replicas(model, strategy, make_callback_model=False,
-                            inputs=None, targets=None, mode=None):
-  """Create a cloned model on each replica."""
-  with K.get_graph().as_default(), strategy.scope():
-    distributed_model = strategy.extended.call_for_each_replica(
-        _clone_and_build_model, args=(model, inputs, targets, mode))
-    if mode is _Mode.TRAIN:
-      model._distributed_model_train = distributed_model
-    elif mode is _Mode.TEST:
-      model._distributed_model_test = distributed_model
-    elif mode is _Mode.PREDICT:
-      model._distributed_model_predict = distributed_model
-    else:
-      model._distributed_model = distributed_model
-  if make_callback_model:
-    model._make_callback_model(distributed_model)
-
-
-def _get_input_from_iterator(iterator, model):
-  """Get elements from the iterator and verify the input shape and type."""
-  next_element = iterator.get_next()
-
-  if len(nest.flatten(next_element)) == len(model.inputs):
-    x = next_element
-    y = None
-    sample_weights = None
-  elif len(nest.flatten(next_element)) == (len(model.inputs) +
-                                           len(model.outputs)):
-    x, y = next_element
-    sample_weights = None
-  else:
-    x, y, sample_weights = next_element
-
-  # Validate that all the elements in x and y are of the same type and shape.
-  # We can then pass the first element of x and y to `_standardize_weights`
-  # below and be confident of the output.
-  distributed_training_utils.validate_distributed_dataset_inputs(
-      model._distribution_strategy, x, y, sample_weights)
-  return x, y, sample_weights
-
-
-def _make_execution_function(model, mode):
-  """Makes function to run one step of distributed model execution."""
-  if context.executing_eagerly():
-    return _make_eager_execution_function(model, mode)
-
-  strategy = model._distribution_strategy
-  if not model._distributed_model:
-    if model._compile_distribution:
-      clone_model_on_replicas(
-          model, strategy, make_callback_model=(mode == 'train'))
-    else:
-      _build_distributed_network(model, strategy)
-
-  def _per_device_function(model):
-    f = model._make_execution_function(mode)
-    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
-
-  with strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = strategy.extended.call_for_each_replica(
-         _per_device_function, args=(model._distributed_model,))
-
-    if mode == 'train':
-      # Initialize the variables in the replicated model. This is necessary for
-      # multi-worker training because on some workers, initialization is not
-      # needed. This method does initialization or waiting for initialization
-      # according to the context object of distribute coordinator.
-      distributed_training_utils.init_restore_or_wait_for_variables()
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of update ops on
-    # all the devices over which the model is distributed.
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         strategy,
-         grouped_inputs,
-         grouped_outputs,
-         grouped_updates,
-         grouped_session_args,
-         with_loss_tensor=(mode != 'predict'))
-
-    return K.function(
-        all_inputs,
-        all_outputs,
-        updates=all_updates,
-        name='distributed_{}_function'.format(mode),
-        **all_session_args)
-
-
-def _make_eager_execution_function(model, mode):
-  """Makes function to run one step of distributed model eager execution."""
-  strategy = model._distribution_strategy
-  if not model._distributed_model:
-    if model._compile_distribution:
-      clone_model_on_replicas(
-          model, strategy, make_callback_model=(mode == 'train'))
-    else:
-      _build_distributed_network(model, strategy)
-
-  def _per_device_function(model):
-    f = model._make_execution_function(mode)
-    return (f.inputs, f.outputs)
-
-  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
-  # the global one.
-  with K.get_graph().as_default(), strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs) = strategy.call_for_each_replica(
-        _per_device_function, args=(model._distributed_model,))
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of inptus/outputs
-    # on all the devices over which the model is distributed.
-    (all_inputs, all_outputs, _, _) = distributed_training_utils.unwrap_values(
-        strategy,
-        grouped_inputs,
-        grouped_outputs,
-        with_loss_tensor=(mode != 'predict'))
-
-    return K.function(
-        all_inputs,
-        all_outputs,
-        name='eager_distributed_{}_function'.format(mode))
-
-
-def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
-  """Prepare feed values to the model execution function.
-
-  Arguments:
-    model: Model to prepare feed values for.
-    inputs: List or dict of model inputs.
-    targets: Optional list of model targets.
-    sample_weights: Optional list of sample weight arrays.
-    mode: One of 'train'/'test'/'predict'.
-
-  Returns:
-    Feed values for the model in the given mode.
-  """
-  strategy = model._distribution_strategy
-  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
-  inputs = distributed_training_utils.flatten_perdevice_values(strategy, inputs)
-  targets = distributed_training_utils.flatten_perdevice_values(
-      strategy, targets)
-  if mode == 'predict':
-    sample_weights = []
-    targets = []
-  else:
-    sample_weights = [
-        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
-    ]
-  ins = inputs + targets + sample_weights
-  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
-    ins += [True]
-  return ins
-
-
-def _copy_weights_to_distributed_model(original_model, grouped_model):
-  """Copies weights from original model to distributed models."""
-  strategy = original_model._distribution_strategy
-  if strategy:
-    # Copy the weights from the original model to each of the replicated
-    # models.
-    orig_model_weights = original_model.get_weights()
-    distributed_model = strategy.unwrap(grouped_model)[0]
-    distributed_training_utils.set_weights(strategy, distributed_model,
-                                           orig_model_weights)
-
-
-def _copy_weights_to_original_model(model, grouped_model, mode):
-  """Copies weights from first distributed model back to original model."""
-  if model._distribution_strategy and mode == 'train':
-    updated_weights = model._distribution_strategy.unwrap(
-        grouped_model)[0].get_weights()
-    model.set_weights(updated_weights)
-
-
-def _per_device_aggregate_batch(batch_outs, model, mode):
-  """Aggregates the per-device batch-level outputs from a distributed step."""
-  if model._distribution_strategy is not None and mode == 'predict':
-    total_batch_outs = []
-    for i in range(len(model.outputs)):
-      num_replicas = model._distribution_strategy.num_replicas_in_sync
-      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
-      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
-    return total_batch_outs
-  return batch_outs
-
-
-def _reset_metrics(model, distributed_model=None):
-  if model._distribution_strategy:
-    distributed_model = (
-        distributed_model or
-        model._distribution_strategy.unwrap(model._distributed_model)[0])
-    distributed_model.reset_metrics()
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 895db5bc633669641b0493b8bfb918094f312513..6242b54c0cc5331d32d7d911ed70f813793fa7cf 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -25,12 +25,11 @@ from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras import losses as losses_module
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
 
 
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
@@ -59,8 +58,8 @@ def _eager_metrics_fn(model,
   Returns:
       Returns the metric results for each output of the model.
   """
-  outputs = generic_utils.to_list(outputs)
-  targets = generic_utils.to_list(targets)
+  outputs = nest.flatten(outputs)
+  targets = nest.flatten(targets)
   # TODO(psv): Consider supporting skip target indices in eager mode?
   metric_results = model._handle_metrics(
       outputs,
@@ -104,15 +103,15 @@ def _model_loss(model,
 
   if model._compute_output_and_mask_jointly:
     outs, masks = model._call_and_compute_mask(inputs, **kwargs)
-    masks = generic_utils.to_list(masks)
+    masks = nest.flatten(masks)
   else:
     outs = model.call(inputs, **kwargs)
     masks = None
 
-  outs = generic_utils.to_list(outs)
+  outs = nest.flatten(outs)
   if masks is None:
     masks = [None for _ in outs]
-  targets = generic_utils.to_list(targets)
+  targets = nest.flatten(targets)
 
   loss_metrics = []
   aggregated_loss_metrics = []
@@ -124,22 +123,16 @@ def _model_loss(model,
         weights = None
       mask = masks[i]
       with backend.name_scope(model.output_names[i] + '_loss'):
-        if isinstance(loss_fn, losses_module.Loss):
-          if mask is not None:
-            mask = math_ops.cast(mask, outs[i].dtype)
-            # Update weights with mask.
-            if weights is None:
-              weights = mask
-            else:
-              # Update dimensions of weights to match with mask if possible.
-              mask, _, weights = squeeze_or_expand_dimensions(
-                  mask, None, weights)
-              weights *= mask
-          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
-        else:
-          weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
-          output_loss = weighted_masked_fn(
-              targets[i], outs[i], weights, mask=mask)
+        if mask is not None:
+          mask = math_ops.cast(mask, outs[i].dtype)
+          # Update weights with mask.
+          if weights is None:
+            weights = mask
+          else:
+            # Update dimensions of weights to match with mask if possible.
+            mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
+            weights *= mask
+        output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
 
       # If the number of outputs is 1 then we don't append the loss metric
       # associated with each model output. When there are multiple outputs
@@ -202,7 +195,7 @@ def _process_single_batch(model,
   Raises:
       ValueError: If the model has no loss to optimize.
   """
-  with backend.learning_phase_scope(1 if training else 0):
+  with backend.eager_learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
       outs, loss, loss_metrics, aggregated_loss_metrics, masks\
         = _model_loss(
@@ -267,7 +260,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       sample_weights=sample_weights,
       masks=masks,
       return_stateful_result=True)
-  loss = generic_utils.to_list(loss)
+  loss = nest.flatten(loss)
 
   return [
       tensor_util.constant_value(v)
@@ -314,7 +307,7 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       sample_weights=sample_weights,
       masks=masks,
       return_stateful_result=True)
-  loss = generic_utils.to_list(loss)
+  loss = nest.flatten(loss)
 
   return [
       tensor_util.constant_value(v)
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 27eaea23ba09d1405ca16f3beaa2f4c4f4a18661..84f1fa0efcba08c227cc6eb4e3e2ad4623c7adc9 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -28,13 +28,20 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class TrainingTest(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_multi_io(self):
+    if not context.executing_eagerly():
+      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+      # symbolic tensors has different requirements.
+      return
+
     input_a = keras.layers.Input(shape=(3,), name='input_a')
     input_b = keras.layers.Input(shape=(3,), name='input_b')
 
@@ -53,13 +60,13 @@ class TrainingTest(keras_parameterized.TestCase):
         loss,
         metrics=metrics,
         loss_weights=loss_weights,
-        run_eagerly=True,
+        run_eagerly=testing_utils.should_run_eagerly(),
         sample_weight_mode=None)
 
-    input_a = keras.backend.zeros(shape=(10, 3))
-    input_b = keras.backend.zeros(shape=(10, 3))
-    target_a = keras.backend.zeros(shape=(10, 4))
-    target_b = keras.backend.zeros(shape=(10, 4))
+    input_a = array_ops.zeros(shape=(10, 3))
+    input_b = array_ops.zeros(shape=(10, 3))
+    target_a = array_ops.zeros(shape=(10, 4))
+    target_b = array_ops.zeros(shape=(10, 4))
 
     model.fit(
         [input_a, input_b], [target_a, target_b],
@@ -107,16 +114,26 @@ class TrainingTest(keras_parameterized.TestCase):
     model.test_on_batch([input_a, input_b], [target_a, target_b])
 
   @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_single_io(self):
+    if not context.executing_eagerly():
+      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+      # symbolic tensors has different requirements.
+      return
+
     model = testing_utils.get_small_mlp(10, 4, 3)
 
     optimizer = rmsprop.RMSprop(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = keras.backend.zeros(shape=(10, 3))
-    targets = keras.backend.zeros(shape=(10, 4))
+    inputs = array_ops.zeros(shape=(10, 3))
+    targets = array_ops.zeros(shape=(10, 4))
 
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
     model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
@@ -134,24 +151,25 @@ class TrainingTest(keras_parameterized.TestCase):
                   loss='mse',
                   run_eagerly=True)
 
-    x = keras.backend.zeros(shape=(10, 3))
-    y = keras.backend.zeros(shape=(10, 4))
+    x = array_ops.zeros(shape=(10, 3))
+    y = array_ops.zeros(shape=(10, 4))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     validation_dataset = dataset_ops.Dataset.from_tensor_slices(
-        (x, y)).repeat(10).batch(5)
+        (x, y)).repeat().batch(5)  # Infinite dataset.
     validation_iterator = dataset_ops.make_one_shot_iterator(validation_dataset)
 
     with self.assertRaisesRegexp(
         ValueError, r'specify .* `steps_per_epoch`'):
       model.fit(iterator, epochs=1, verbose=0)
     if not context.executing_eagerly():
-      # In eager execution, `keras.backend.zeros` returns value tensors
+      # In eager execution, `array_ops.zeros` returns value tensors
       # which can be used for validation without a `validation_steps` argument.
       with self.assertRaisesRegexp(
           ValueError, r'provide either `batch_size` or `validation_steps`'):
         model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                   validation_data=(x, y))
+    # Step argument is required for infinite datasets.
     with self.assertRaisesRegexp(ValueError,
                                  'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index bc6a3e8dd0be81ff2af8150c4d62e9416ced4f4f..da460ee20592a0cb4296a79235f184bd005c1768 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -34,6 +34,7 @@ from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 
 
@@ -45,16 +46,18 @@ def model_iteration(model,
                     callbacks=None,
                     validation_data=None,
                     validation_steps=None,
+                    validation_freq=1,
                     class_weight=None,
                     max_queue_size=10,
                     workers=1,
                     use_multiprocessing=False,
                     shuffle=False,
                     initial_epoch=0,
-                    mode='train',
+                    mode=ModeKeys.TRAIN,
                     batch_size=None,
+                    steps_name='steps',
                     **kwargs):
-  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.
 
   Arguments:
       model: Keras Model instance.
@@ -72,6 +75,13 @@ def model_iteration(model,
         `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
       validation_steps: Total number of steps (batches of samples) before
         declaring validation finished.
+      validation_freq: Only relevant if validation data is provided. Integer or
+        `collections.Container` instance (e.g. list, tuple, etc.). If an
+        integer, specifies how many training epochs to run before a new
+        validation run is performed, e.g. `validation_freq=2` runs
+        validation every 2 epochs. If a Container, specifies the epochs on
+        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+        validation at the end of the 1st, 2nd, and 10th epochs.
       class_weight: Dictionary mapping class indices to a weight for the class.
       max_queue_size: Integer. Maximum size for the generator queue. If
         unspecified, `max_queue_size` will default to 10.
@@ -89,16 +99,19 @@ def model_iteration(model,
         `None`.
       initial_epoch: Epoch at which to start training (useful for resuming a
         previous training run).
-      mode: One of 'train'/'test'/'predict'.
+      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
       batch_size: Integer batch size or None if unknown. Will only be used if
         `data` is in NumPy/Tensor format.
+      steps_name: The string name of the steps argument, either `steps`,
+        `validation_steps`, or `steps_per_epoch`. Only used for error message
+        formatting.
       **kwargs: Additional arguments for backwards compatibility. `steps` is
         accepted as an alias for `steps_per_epoch`.
 
   Returns:
-      - In 'train' mode: `History` object.
-      - In 'test' mode: Evaluation metrics.
-      - In 'predict' mode: Outputs of the Model called on inputs.
+      - In TRAIN mode: `History` object.
+      - In TEST mode: Evaluation metrics.
+      - In PREDICT mode: Outputs of the Model called on inputs.
 
   Raises:
       ValueError: in case of invalid arguments.
@@ -106,6 +119,18 @@ def model_iteration(model,
   if 'steps' in kwargs:
     steps_per_epoch = kwargs['steps']
 
+  # Determine the number of steps per epoch and whether we should reset the
+  # dataset at the end of each epoch.
+  reset_dataset_after_each_epoch = False
+  original_dataset = None
+  is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1))
+  if is_dataset:
+    original_dataset = data
+    if steps_per_epoch is None:
+      reset_dataset_after_each_epoch = True
+      steps_per_epoch = training_utils.infer_steps_for_dataset(
+          data, steps_per_epoch, epochs=epochs, steps_name=steps_name)
+
   # Convert to a format that supports `next(generator)`.
   generator, steps_per_epoch = convert_to_generator_like(
       data,
@@ -115,9 +140,8 @@ def model_iteration(model,
       shuffle=shuffle)
 
   do_validation = validation_data is not None
-  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
   is_sequence = isinstance(generator, data_utils.Sequence)
-  _validate_arguments(is_sequence, use_multiprocessing, workers,
+  _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                       steps_per_epoch, validation_data, validation_steps, mode,
                       kwargs)
 
@@ -151,14 +175,15 @@ def model_iteration(model,
   progbar.params = callbacks.params
   progbar.params['verbose'] = verbose
 
-  if mode == 'predict':
+  if mode == ModeKeys.PREDICT:
     aggregator = training_utils.OutputsAggregator(True, steps_per_epoch)
   else:
     aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)
 
+  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
   if should_set_learning_phase:
     old_learning_phase = backend.learning_phase()
-    backend.set_learning_phase(1 if mode == 'train' else 0)
+    backend.set_eager_learning_phase(1 if mode == ModeKeys.TRAIN else 0)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
@@ -170,13 +195,40 @@ def model_iteration(model,
     # Setup work for each epoch.
     model.reset_metrics()
     epoch_logs = {}
-    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    if mode == ModeKeys.TRAIN:
+      callbacks.on_epoch_begin(epoch, epoch_logs)
     progbar.on_epoch_begin(epoch, epoch_logs)
 
-    for step in range(steps_per_epoch):
+    if steps_per_epoch is None:
+      # Loop over dataset until `OutOfRangeError` is raised.
+      target_steps = np.inf
+    else:
+      # Loop over dataset for the specified number of steps.
+      target_steps = steps_per_epoch
+
+    step = 0
+    while step < target_steps:
       batch_data = _get_next_batch(output_generator, mode)
       if batch_data is None:
-        callbacks.model.stop_training = True
+        if not is_dataset:
+          # We ran out of batches while the user passed an iterator (legacy).
+          logging.warning(
+              'Your dataset iterator ran out of data; '
+              'interrupting training. Make sure that your iterator '
+              'can generate at least `%s * epochs` '
+              'batches (in this case, %d batches). You may need to'
+              'use the repeat() function when building your '
+              'dataset.' % (steps_name, steps_per_epoch * epochs))
+          callbacks.model.stop_training = True
+        else:
+          # The dataset passed by the user ran out of batches.
+          # Now we know the cardinality of the dataset.
+          # assert steps_per_epoch is None
+          if step > 0:
+            steps_per_epoch = step
+            aggregator.num_samples_or_steps = steps_per_epoch
+            progbar.params['steps'] = steps_per_epoch
+            progbar.progbar.target = steps_per_epoch
         break
 
       # `batch_size` used for validation data if validation
@@ -201,6 +253,7 @@ def model_iteration(model,
       batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
       callbacks._call_batch_hook(mode, 'end', step, batch_logs)
       progbar.on_batch_end(step, batch_logs)
+      step += 1
 
       if callbacks.model.stop_training:
         break
@@ -212,7 +265,9 @@ def model_iteration(model,
       results = results[0]
 
     # Run the test loop every epoch during training.
-    if do_validation and not callbacks.model.stop_training:
+    if (do_validation and
+        training_utils.should_run_validation(validation_freq, epoch) and
+        not callbacks.model.stop_training):
       val_results = model_iteration(
           model,
           validation_data,
@@ -224,17 +279,22 @@ def model_iteration(model,
           max_queue_size=max_queue_size,
           callbacks=callbacks,
           verbose=0,
-          mode='test')
+          mode=ModeKeys.TEST,
+          steps_name='validation_steps')
 
       if not isinstance(val_results, list):
         val_results = [val_results]
       epoch_logs = cbks.make_logs(
           model, epoch_logs, val_results, mode, prefix='val_')
 
-    if mode == 'train':
+    if mode == ModeKeys.TRAIN:
       # Epochs only apply to `fit`.
-      callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
-      progbar.on_epoch_end(epoch, epoch_logs)
+      callbacks.on_epoch_end(epoch, epoch_logs)
+    progbar.on_epoch_end(epoch, epoch_logs)
+
+    # Recreate dataset iterator for the next epoch.
+    if reset_dataset_after_each_epoch and epoch < epochs - 1:
+      generator = dataset_ops.make_one_shot_iterator(original_dataset)
 
   callbacks._call_end_hook(mode)
 
@@ -242,31 +302,29 @@ def model_iteration(model,
     enqueuer.stop()
 
   if should_set_learning_phase:
-    backend.set_learning_phase(old_learning_phase)
+    backend.set_eager_learning_phase(old_learning_phase)
 
-  if mode == 'train':
+  if mode == ModeKeys.TRAIN:
     return model.history
   return results
 
 
 # Maintain compatibility with the existing names.
-fit_generator = functools.partial(model_iteration, mode='train')
+fit_generator = functools.partial(model_iteration, mode=ModeKeys.TRAIN)
 evaluate_generator = functools.partial(
-    model_iteration, mode='test', shuffle=False)
+    model_iteration, mode=ModeKeys.TEST, shuffle=False)
 predict_generator = functools.partial(
-    model_iteration, mode='predict', shuffle=False)
+    model_iteration, mode=ModeKeys.PREDICT, shuffle=False)
 
 
 def _get_next_batch(output_generator, mode):
   """Retrieves the next batch of input data."""
   try:
     generator_output = next(output_generator)
-  except (errors.OutOfRangeError, StopIteration):
-    # Returning `None` will trigger looping to stop.
-    logging.warning('Your dataset iterator ran out of data.')
+  except (StopIteration, errors.OutOfRangeError):
     return None
   if not isinstance(generator_output, tuple):
-    if mode == 'predict':
+    if mode == ModeKeys.PREDICT:
       # Always wrap in a tuple.
       return (generator_output,)
     else:
@@ -281,7 +339,7 @@ def _get_next_batch(output_generator, mode):
   return generator_output
 
 
-def _validate_arguments(is_sequence, use_multiprocessing, workers,
+def _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                         steps_per_epoch, validation_data, validation_steps,
                         mode, kwargs):
   """Raises errors if arguments are invalid.
@@ -289,6 +347,7 @@ def _validate_arguments(is_sequence, use_multiprocessing, workers,
   Arguments:
     is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence`
       instance.
+    is_dataset: Boolean, whether data is a dataset instance.
     use_multiprocessing: Boolean. If `True`, use process-based threading. If
       unspecified, `use_multiprocessing` will default to `False`. Note that
       because this implementation relies on multiprocessing, you should not pass
@@ -305,7 +364,7 @@ def _validate_arguments(is_sequence, use_multiprocessing, workers,
       `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
     validation_steps: Total number of steps (batches of samples) before
       declaring validation finished.
-    mode: One of 'train'/'test'/'predict'.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
     kwargs: Additional arguments for backwards compatibility.
 
   Raises:
@@ -320,15 +379,14 @@ def _validate_arguments(is_sequence, use_multiprocessing, workers,
                     ' Please consider using the `keras.utils.Sequence`'
                     ' class.'))
 
-  if steps_per_epoch is None:
-    arg_name = 'steps_per_epoch' if mode == 'train' else 'steps'
+  if steps_per_epoch is None and not is_dataset:
+    arg_name = 'steps_per_epoch' if mode == ModeKeys.TRAIN else 'steps'
     raise ValueError('Please specify the number of steps via the '
                      '`{}` argument.'.format(arg_name))
 
   val_gen = (
       data_utils.is_generator_or_sequence(validation_data) or
-      isinstance(validation_data, iterator_ops.EagerIterator) or
-      isinstance(validation_data, dataset_ops.DatasetV2))
+      isinstance(validation_data, iterator_ops.EagerIterator))
   if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
       not validation_steps):
     raise ValueError('Please specify the `validation_steps` argument.')
@@ -352,7 +410,9 @@ def convert_to_generator_like(data,
       and may be `None` or `[None]`.
     batch_size: Used when creating a generator out of tuples of NumPy arrays or
       EagerTensors.
-    steps_per_epoch: Steps of the generator to run each epoch.
+    steps_per_epoch: Steps of the generator to run each epoch. If `None` the
+      number of steps will be read from the data (for
+      `keras.utils.data_utils.Sequence` types).
     epochs: Total number of epochs to run.
     shuffle: Whether the data should be shuffled.
 
@@ -373,7 +433,8 @@ def convert_to_generator_like(data,
   if data_utils.is_generator_or_sequence(data) or isinstance(
       data, iterator_ops.EagerIterator):
     if isinstance(data, data_utils.Sequence):
-      steps_per_epoch = len(data)
+      if steps_per_epoch is None:
+        steps_per_epoch = len(data)
     return data, steps_per_epoch
   if isinstance(data, dataset_ops.DatasetV2):
     return dataset_ops.make_one_shot_iterator(data), steps_per_epoch
@@ -427,13 +488,9 @@ def _make_enqueued_generator(generator,
 
 def _make_execution_function(model, mode, class_weight=None):
   """Makes function to run one step of model execution."""
-  if mode == 'train':
-    if not context.executing_eagerly():
-      model._make_fit_function()
+  if mode == ModeKeys.TRAIN:
     f = functools.partial(model.train_on_batch, class_weight=class_weight)
-  elif mode == 'test':
-    if not context.executing_eagerly():
-      model._make_eval_function()
+  elif mode == ModeKeys.TEST:
     f = model.test_on_batch
   else:
     # Match signature of other modes to allow
@@ -444,7 +501,7 @@ def _make_execution_function(model, mode, class_weight=None):
     f = predict_on_batch
 
   # Maintain stateful metrics across batch-level calls.
-  if mode != 'predict':
+  if mode != ModeKeys.PREDICT:
     f = functools.partial(f, reset_metrics=False)
 
   return f
diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py
index 90c45dfcb7fdae23ffba5c0a8e72404f3b9350dd..6b754c18b3d45a66fd704a64e01b425d854d3329 100644
--- a/tensorflow/python/keras/engine/training_generator_test.py
+++ b/tensorflow/python/keras/engine/training_generator_test.py
@@ -66,8 +66,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_fit_generator_method(self):
     model = testing_utils.get_small_mlp(
@@ -107,8 +106,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_evaluate_generator_method(self):
     model = testing_utils.get_small_mlp(
@@ -173,8 +171,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
                             max_queue_size=10,
                             workers=0)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_generator_methods_with_sample_weights(self):
     model = testing_utils.get_small_mlp(
@@ -208,8 +205,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
                              max_queue_size=10,
                              use_multiprocessing=False)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_generator_methods_invalid_use_case(self):
 
@@ -249,8 +245,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
                                max_queue_size=10,
                                use_multiprocessing=False)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_generator_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
@@ -275,8 +270,7 @@ class TestGeneratorMethods(keras_parameterized.TestCase):
 
 class TestGeneratorMethodsWithSequences(keras_parameterized.TestCase):
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_training_with_sequences(self):
 
@@ -307,8 +301,7 @@ class TestGeneratorMethodsWithSequences(keras_parameterized.TestCase):
                         workers=0,
                         use_multiprocessing=False)
 
-  # TODO(b/120940700): Bug with subclassed model inputs.
-  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_sequence_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
diff --git a/tensorflow/python/keras/engine/training_gpu_test.py b/tensorflow/python/keras/engine/training_gpu_test.py
index 45dcfe43995b280072395b11a573e20d57bcadc7..ddc947339dd8f68a7c85eefb48860f9f65b1fad2 100644
--- a/tensorflow/python/keras/engine/training_gpu_test.py
+++ b/tensorflow/python/keras/engine/training_gpu_test.py
@@ -25,7 +25,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.layers.convolutional import Conv2D
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class TrainingGPUTest(test.TestCase):
@@ -65,7 +64,7 @@ class TrainingGPUTest(test.TestCase):
                            bias_initializer='ones')(input_tensor)
       simple_model = keras.models.Model(inputs=input_tensor,
                                         outputs=predictions)
-      simple_model.compile(optimizer=rmsprop.RMSPropOptimizer(1e-3), loss=loss)
+      simple_model.compile(optimizer='rmsprop', loss=loss)
       return simple_model
 
     if test.is_gpu_available(cuda_only=True):
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 345673a84d526e439f5ce5607aa772637b3ffd6d..6be4da70f6e60c4da4a0e96999e51cae1aabc005 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -22,6 +22,7 @@ import io
 import logging
 import sys
 
+from absl.testing import parameterized
 import numpy as np
 import six
 
@@ -36,6 +37,7 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.callbacks import Callback
+from tensorflow.python.keras.engine.training_utils import set_run_eagerly_for_dict_structure
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -43,6 +45,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 try:
@@ -348,16 +351,21 @@ class TrainingTest(keras_parameterized.TestCase):
     self.assertEqual(len(out), 2)
 
   @keras_parameterized.run_all_keras_modes
+  @keras_parameterized.run_with_all_model_types
   def test_activity_regularizer_fit(self):
     loss = {}
     for reg in [None, 'l2']:
-      inputs = keras.layers.Input(shape=(10,))
-      x = keras.layers.Dense(
-          10, activation='relu', activity_regularizer=reg,
-          kernel_initializer='ones', use_bias=False)(inputs)
-      outputs = keras.layers.Dense(1, activation='sigmoid',
-                                   kernel_initializer='ones', use_bias=False)(x)
-      model = keras.Model(inputs, outputs)
+      layers = [
+          keras.layers.Dense(
+              10, activation='relu', activity_regularizer=reg,
+              kernel_initializer='ones', use_bias=False),
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones',
+              use_bias=False),
+      ]
+
+      model = testing_utils.get_model_from_layers(
+          layers, input_shape=(10,))
 
       x = np.ones((10, 10), 'float32')
       y = np.ones((10, 1), 'float32')
@@ -370,15 +378,14 @@ class TrainingTest(keras_parameterized.TestCase):
     self.assertLess(loss[None], loss['l2'])
 
   @keras_parameterized.run_all_keras_modes
+  @keras_parameterized.run_with_all_model_types
   def test_activity_regularizer_loss_value(self):
-    inputs = keras.layers.Input(shape=(10,))
-    outputs = keras.layers.Dense(
-        1,
-        kernel_initializer=keras.initializers.zeros(),
-        bias_initializer=keras.initializers.ones(),
-        activity_regularizer='l2')(
-            inputs)
-    model = keras.Model(inputs, outputs)
+    layer = keras.layers.Dense(
+        1, kernel_initializer=keras.initializers.zeros(),
+        bias_initializer=keras.initializers.ones(), activity_regularizer='l2')
+
+    model = testing_utils.get_model_from_layers([layer], input_shape=(10,))
+
     x = np.ones((10, 10), 'float32')
     y = np.ones((10, 1), 'float32')
     optimizer = RMSPropOptimizer(learning_rate=0.001)
@@ -797,6 +804,34 @@ class TrainingTest(keras_parameterized.TestCase):
       test_model_loss = test_model.train_on_batch(train_x, train_y)
       self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(
+      ('default', 1, 4), ('integer_two', 2, 2), ('integer_four', 4, 1),
+      ('simple_list', [1, 3, 4], 3), ('duplicated_list', [4, 2, 2], 2))
+  def test_validation_freq(self, validation_freq, expected_runs):
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    model = testing_utils.get_small_mlp(2, 1, 10)
+    model.compile('sgd', 'mse')
+
+    class ValCounter(keras.callbacks.Callback):
+
+      def __init__(self):
+        self.val_runs = 0
+
+      def on_test_begin(self, logs=None):
+        self.val_runs += 1
+
+    val_counter = ValCounter()
+    model.fit(
+        x,
+        y,
+        epochs=4,
+        validation_data=(x, y),
+        validation_freq=validation_freq,
+        callbacks=[val_counter])
+    self.assertEqual(val_counter.val_runs, expected_runs)
+
 
 class TestExceptionsAndWarnings(keras_parameterized.TestCase):
 
@@ -993,37 +1028,6 @@ class LossWeightingTest(keras_parameterized.TestCase):
           x_test[test_ids, :], y_test[test_ids, :], verbose=0)
       self.assertLess(score[0], ref_score[0])
 
-  @keras_parameterized.run_all_keras_modes
-  def test_warning_for_concurrent_sample_and_class_weights(self):
-
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(10, input_shape=(3,)))
-    model.compile(
-        loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.01),
-        run_eagerly=testing_utils.should_run_eagerly())
-    x_train = np.random.random((10, 3))
-    y_train = np.random.random((10, 10))
-    sample_weight = np.ones((y_train.shape[0]))
-    class_weight = {0: 1., 1: 1.}
-
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      model.fit(
-          x_train,
-          y_train,
-          epochs=1,
-          verbose=0,
-          sample_weight=sample_weight,
-          class_weight=class_weight)
-      msg = 'The `class_weight` argument will be ignored.'
-
-      msg_found = False
-      for call_args in mock_log.call_args_list:
-        if msg in str(call_args):
-          msg_found = True
-
-      self.assertTrue(msg_found)
-
   @keras_parameterized.run_all_keras_modes
   def test_temporal_sample_weights(self):
     num_classes = 5
@@ -2506,6 +2510,79 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
     self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
 
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_a1_total_loss_available_with_dict_dataset(self):
+
+    class TestModel(keras.models.Model):
+
+      def call(self, inputs, training=None, mask=None):
+        return math_ops.to_float(inputs['id'])
+
+    model = TestModel()
+    model.compile(
+        optimizer=AdamOptimizer(), loss='mean_squared_error', metrics=['mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    dataset = dataset_ops.Dataset.from_tensor_slices(({
+        'id': [[6], [3], [1]]
+    }, [[0.7], [0.4], [0.2]]))
+    val_dataset = dataset_ops.Dataset.from_tensor_slices(({
+        'id': [[8], [5]]
+    }, [[0.9], [0.6]]))
+    history = model.fit(
+        dataset,
+        steps_per_epoch=2,
+        validation_data=val_dataset,
+        validation_steps=2)
+    self.assertAlmostEqual(history.history['val_loss'][0], 34.885, 2)
+    model.evaluate(dataset, steps=30)
+    model.predict([7])
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_total_loss_available_with_dict_array(self):
+
+    class TestModel(keras.models.Model):
+
+      def call(self, inputs, training=None, mask=None):
+        return math_ops.to_float(inputs['id'])
+
+    model = TestModel()
+    model.compile(
+        optimizer=AdamOptimizer(), loss='mean_squared_error', metrics=['mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    x = {'id': np.array([[3], [1]])}
+    y = np.array([[4], [2]])
+    val_dataset = (x, y)
+    history = model.fit(
+        x,
+        y,
+        batch_size=32,
+        steps_per_epoch=2,
+        validation_data=val_dataset,
+        validation_steps=2)
+    self.assertAlmostEqual(history.history['val_loss'][0], 1.0, 2)
+    model.evaluate(x, y)
+    model.predict([7])
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_set_run_eagerly_for_dict_structure(self):
+    test_model = keras.models.Model()
+    self.assertFalse(test_model.run_eagerly)
+    set_run_eagerly_for_dict_structure(
+        test_model,
+        {'a': 2})
+    self.assertTrue(test_model.run_eagerly)
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_set_run_eagerly_for_dict_dataset(self):
+    test_model = keras.models.Model()
+    self.assertFalse(test_model.run_eagerly)
+    set_run_eagerly_for_dict_structure(
+        test_model,
+        dataset_ops.Dataset.from_tensor_slices(({
+            'id': [[3], [1]]
+        }, [[0.5], [0.2]])))
+    self.assertTrue(test_model.run_eagerly)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 8325a91c5738b0d751890f85e741ef6931926650..bbc180be8909d64b5fbe169440f4b069f474872f 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -19,19 +19,24 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import collections
 from collections import OrderedDict
 import copy
+import json
+import os
 
 import numpy as np
 import six
 
+from tensorflow.python.data.experimental.ops import cardinality
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
-from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
@@ -43,6 +48,8 @@ from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensi
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 
 
@@ -105,6 +112,8 @@ class MetricsAggregator(Aggregator):
     self.results[1:] = batch_outs[1:]
 
   def finalize(self):
+    if not self.results:
+      raise ValueError('Empty training data.')
     self.results[0] /= self.num_samples_or_steps
 
 
@@ -468,14 +477,16 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
       ValueError: if a loss function or target array
           is incompatible with an output.
   """
-  key_losses = {
+  key_loss_fns = {
       losses.mean_squared_error, losses.binary_crossentropy,
       losses.categorical_crossentropy
   }
+  key_loss_classes = (losses.MeanSquaredError, losses.BinaryCrossentropy,
+                      losses.CategoricalCrossentropy)
   for y, loss, shape in zip(targets, loss_fns, output_shapes):
     if y is None or loss is None or tensor_util.is_tensor(y):
       continue
-    if loss is losses.categorical_crossentropy:
+    if losses.is_categorical_crossentropy(loss):
       if y.shape[-1] == 1:
         raise ValueError('You are passing a target array of shape ' + str(
             y.shape) + ' while using as loss `categorical_crossentropy`. '
@@ -492,14 +503,20 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
                          'Alternatively, you can use the loss function '
                          '`sparse_categorical_crossentropy` instead, '
                          'which does expect integer targets.')
-    if loss in key_losses:
+
+    is_loss_wrapper = isinstance(loss, losses.LossFunctionWrapper)
+    if (isinstance(loss, key_loss_classes) or (is_loss_wrapper and
+                                               (loss.fn in key_loss_fns))):
       for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
         if out_dim is not None and target_dim != out_dim:
+          loss_name = loss.name
+          if loss_name is None:
+            loss_type = loss.fn if is_loss_wrapper else type(loss)
+            loss_name = loss_type.__name__
           raise ValueError('A target array with shape ' + str(y.shape) +
                            ' was passed for an output of shape ' + str(shape) +
-                           ' while using as loss `' + loss.__name__ + '`. '
-                           'This loss expects '
-                           'targets to have the same shape '
+                           ' while using as loss `' + loss_name + '`. '
+                           'This loss expects targets to have the same shape '
                            'as the output.')
 
 
@@ -674,7 +691,8 @@ def standardize_weights(y,
   """Performs sample weight validation and standardization.
 
   Everything gets normalized to a single sample-wise (or timestep-wise)
-  weight array.
+  weight array. If both `sample_weight` and `class_weight` are provided,
+  the weights are multiplied.
 
   Arguments:
       y: Numpy array of model targets to be weighted.
@@ -735,22 +753,26 @@ def standardize_weights(y,
           'Found a sample_weight array with shape ' + str(sample_weight.shape) +
           ' for an input with shape ' + str(y.shape) + '. '
           'sample_weight cannot be broadcast.')
-    return sample_weight
-  elif isinstance(class_weight, dict):
+
+  # Class weights applied per-sample.
+  class_sample_weight = None
+  if isinstance(class_weight, dict):
     if len(y.shape) > 2:
       raise ValueError('`class_weight` not supported for '
                        '3+ dimensional targets.')
-    if y.shape[1] > 1:
-      y_classes = np.argmax(y, axis=1)
-    elif y.shape[1] == 1:
-      y_classes = np.reshape(y, y.shape[0])
+
+    if len(y.shape) == 2:
+      if y.shape[1] > 1:
+        y_classes = np.argmax(y, axis=1)
+      elif y.shape[1] == 1:
+        y_classes = np.reshape(y, y.shape[0])
     else:
       y_classes = y
 
-    weights = np.asarray(
+    class_sample_weight = np.asarray(
         [class_weight[cls] for cls in y_classes if cls in class_weight])
 
-    if len(weights) != len(y_classes):
+    if len(class_sample_weight) != len(y_classes):
       # subtract the sets to pick all missing classes
       existing_classes = set(y_classes)
       existing_class_weight = set(class_weight.keys())
@@ -758,9 +780,15 @@ def standardize_weights(y,
                        ' The classes %s exist in the data but not in '
                        '`class_weight`.' %
                        (existing_classes - existing_class_weight))
-    return weights
-  else:
-    return None
+
+  if class_sample_weight is not None and sample_weight is not None:
+    # Multiply weights if both are provided.
+    return class_sample_weight * sample_weight
+  if sample_weight is not None:
+    return sample_weight
+  if class_sample_weight is not None:
+    return class_sample_weight
+  return None
 
 
 def has_symbolic_tensors(ls):
@@ -855,17 +883,25 @@ def get_loss_function(loss):
   if loss is None or isinstance(loss, losses.Loss):
     return loss
 
-  # TODO(psv): After we have added all V2 losses, update this function.
-  if loss in ['mse', 'MSE', 'mean_squared_error']:
-    return losses.MeanSquaredError()
-  return losses.get(loss)
+  # Deserialize loss configuration, if needed.
+  if isinstance(loss, collections.Mapping):
+    loss = losses.get(loss)
+
+  # Custom callable class.
+  if callable(loss) and not hasattr(loss, '__name__'):
+    return loss
+
+  # Wrap loss function with signature `(y_true, y_pred, **kwargs)`
+  # in `LossFunctionWrapper` class.
+  loss_fn = losses.get(loss)
+  return losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
 
 
-def validate_iterator_input(x, y, sample_weight, validation_split=None):
+def validate_dataset_input(x, y, sample_weight, validation_split=None):
   """Validates user input arguments when a dataset iterator is passed.
 
   Arguments:
-    x: Input data. A `tf.data` dataset iterator.
+    x: Input data. A `tf.data` dataset or iterator.
     y: Target data. It could be either Numpy array(s) or TensorFlow tensor(s).
         Expected to be `None` when `x` is a dataset iterator.
     sample_weight: An optional sample-weight array passed by the user to
@@ -899,7 +935,9 @@ def validate_iterator_input(x, y, sample_weight, validation_split=None):
         'Received: x=%s, validation_split=%f' % (x, validation_split))
 
 
-def check_generator_arguments(y=None, sample_weight=None):
+def check_generator_arguments(y=None,
+                              sample_weight=None,
+                              validation_split=None):
   """Validates arguments passed when using a generator."""
   if y is not None:
     raise ValueError('`y` argument is not supported when data is'
@@ -909,6 +947,9 @@ def check_generator_arguments(y=None, sample_weight=None):
     raise ValueError('`sample_weight` argument is not supported when data is'
                      'a generator or Sequence instance. Instead pass sample'
                      ' weights as the third element of the generator.')
+  if validation_split:
+    raise ValueError('If your data is in the form of a Python generator, '
+                     'you cannot use `validation_split`.')
 
 
 def check_steps_argument(input_data, steps, steps_name):
@@ -934,15 +975,13 @@ def check_steps_argument(input_data, steps, steps_name):
       ValueError: if `steps` argument is required for given input data type
         but not provided.
   """
-
-  is_x_iterator = (
-      isinstance(input_data, iterator_ops.Iterator) or
-      isinstance(input_data, iterator_ops.EagerIterator))
-
+  # TODO(fchollet): allow datasets with steps=None if cardinality is known.
+  is_x_iterator = isinstance(input_data, (iterator_ops.Iterator,
+                                          iterator_ops.EagerIterator))
   if (input_data is None or is_x_iterator or has_symbolic_tensors(input_data) or
       (isinstance(input_data, list) and not input_data)):
     if steps is None:
-      input_type_str = 'iterators' if is_x_iterator else 'data tensors'
+      input_type_str = 'a Dataset iterator' if is_x_iterator else 'data tensors'
       raise ValueError('When using {input_type} as input to a model, you should'
                        ' specify the `{steps_name}` argument.'.format(
                            input_type=input_type_str, steps_name=steps_name))
@@ -1063,6 +1102,258 @@ def is_feature_layer(layer):
   return getattr(layer, '_is_feature_layer', False)
 
 
+def is_eager_dataset_or_iterator(data):
+  return context.executing_eagerly() and isinstance(
+      data, (dataset_ops.DatasetV1,
+             dataset_ops.DatasetV2,
+             iterator_ops.EagerIterator))
+
+
+# pylint: disable=protected-access
+def assert_not_batched(dataset):
+  """Asserts that `dataset` is not batched.
+
+  The algorithm used by this method is sound but not complete. In other words,
+  if the method fails to establish the assertion, it does not mean the dataset
+  is batched.
+
+  Example usage:
+  ```python
+  try:
+    assert_not_batched(dataset)
+    # safe to assume `dataset` it not batched here
+  expect ValueError:
+    # make no assumptions about `dataset`
+  ```
+
+  Args:
+    dataset: The dataset to analyze.
+
+  Raises:
+    ValueError: If the method cannot establish the assertion.
+  """
+  if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+    return assert_not_batched(dataset._dataset)
+  else:
+    whitelisted_types = [
+        dataset_ops._OptionsDataset,
+        dataset_ops.ConcatenateDataset,
+        dataset_ops.CacheDataset,
+        dataset_ops.FilterDataset,
+        dataset_ops.MapDataset,
+        dataset_ops.ParallelMapDataset,
+        dataset_ops.PrefetchDataset,
+        dataset_ops.RangeDataset,
+        dataset_ops.RepeatDataset,
+        dataset_ops.ShuffleDataset,
+        dataset_ops.SkipDataset,
+        dataset_ops.SparseTensorSliceDataset,
+        dataset_ops.TakeDataset,
+        dataset_ops.TensorDataset,
+        dataset_ops.TensorSliceDataset,
+        dataset_ops.ZipDataset,
+        readers.FixedLengthRecordDatasetV2,
+        readers.TextLineDatasetV2,
+        readers.TFRecordDatasetV2,
+    ]
+    for ty in whitelisted_types:
+      if isinstance(dataset, ty):
+        for input_dataset in dataset._inputs():
+          assert_not_batched(input_dataset)
+        return
+    raise ValueError('Could not assert that dataset is not batched.')
+
+
+# pylint: disable=protected-access
+def assert_not_shuffled(dataset):
+  """Asserts that `dataset` is not shuffled.
+
+  The algorithm used by this method is sound but not complete. In other words,
+  if the method fails to establish the assertion, it does not mean the dataset
+  is shuffled.
+
+  Example usage:
+  ```python
+  try:
+    assert_not_shuffled(dataset)
+    # safe to assume `dataset` it not shuffled here
+  expect ValueError:
+    # make no assumptions about `dataset`
+  ```
+
+  Args:
+    dataset: The dataset to analyze.
+
+  Raises:
+    ValueError: If the method cannot establish the assertion.
+  """
+  if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+    return assert_not_shuffled(dataset._dataset)
+  else:
+    whitelisted_types = [
+        dataset_ops._OptionsDataset,
+        dataset_ops.BatchDataset,
+        dataset_ops.ConcatenateDataset,
+        dataset_ops.CacheDataset,
+        dataset_ops.FilterDataset,
+        dataset_ops.MapDataset,
+        dataset_ops.PaddedBatchDataset,
+        dataset_ops.ParallelMapDataset,
+        dataset_ops.PrefetchDataset,
+        dataset_ops.RangeDataset,
+        dataset_ops.RepeatDataset,
+        dataset_ops.SkipDataset,
+        dataset_ops.SparseTensorSliceDataset,
+        dataset_ops.TakeDataset,
+        dataset_ops.TensorDataset,
+        dataset_ops.TensorSliceDataset,
+        dataset_ops.WindowDataset,
+        dataset_ops.ZipDataset,
+        readers.FixedLengthRecordDatasetV2,
+        readers.TextLineDatasetV2,
+        readers.TFRecordDatasetV2,
+    ]
+    for ty in whitelisted_types:
+      if isinstance(dataset, ty):
+        for input_dataset in dataset._inputs():
+          assert_not_shuffled(input_dataset)
+        return
+    raise ValueError('Could not assert that dataset is not shuffled.')
+
+
+def verify_dataset_shuffled(x):
+  """Verifies that the dataset is shuffled.
+
+  Args:
+    x: Dataset passed as an input to the model.
+
+  Raises:
+    ValueError: if the dataset is not already shuffled.
+  """
+  assert isinstance(x, dataset_ops.DatasetV2)
+  try:
+    assert_not_shuffled(x)
+  except ValueError:
+    # Dataset may or may not be shuffled.
+    return
+  else:
+    logging.warning('Expected a shuffled dataset but input dataset `x` is '
+                    'not shuffled. Please invoke `shuffle()` on input dataset.')
+
+
+def is_dataset_or_iterator(data):
+  return isinstance(data, (dataset_ops.DatasetV1,
+                           dataset_ops.DatasetV2,
+                           iterator_ops.EagerIterator,
+                           iterator_ops.Iterator))
+
+
+def get_iterator(dataset):
+  """Create and initialize an iterator from a dataset."""
+  iterator = dataset_ops.make_initializable_iterator(dataset)
+  initialize_iterator(iterator)
+  return iterator
+
+
+def initialize_iterator(iterator):
+  init_op = iterator.initializer
+  if not context.executing_eagerly():
+    K.get_session().run(init_op)
+
+
+def extract_tensors_from_dataset(dataset):
+  """Extract a tuple of tensors `inputs, targets, sample_weight` from a dataset.
+
+  Arguments:
+    dataset: Dataset instance.
+
+  Returns:
+    Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None.
+  """
+  iterator = get_iterator(dataset)
+  inputs, targets, sample_weight = unpack_iterator_input(iterator)
+  return inputs, targets, sample_weight
+
+
+def unpack_iterator_input(iterator):
+  """Convert a dataset iterator to a tuple of tensors `x, y, sample_weights`.
+
+  Arguments:
+    iterator: Instance of a dataset iterator.
+
+  Returns:
+    Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None.
+  """
+  try:
+    next_element = iterator.get_next()
+  except errors.OutOfRangeError:
+    raise RuntimeError('Your dataset iterator ran out of data; '
+                       'Make sure that your dataset can generate '
+                       'required number of samples.')
+
+  if isinstance(next_element, (list, tuple)):
+    if len(next_element) not in [2, 3]:
+      raise ValueError(
+          'Please provide model inputs as a list or tuple of 2 or 3 '
+          'elements: (input, target) or (input, target, sample_weights) '
+          'Received %s' % next_element)
+    if len(next_element) == 2:
+      x, y = next_element
+      weights = None
+    else:
+      x, y, weights = next_element
+  else:
+    x = next_element
+    y = None
+    weights = None
+  return x, y, weights
+
+
+def infer_steps_for_dataset(dataset, steps, epochs=1, steps_name='steps'):
+  """Infers steps_per_epoch needed to loop through a dataset.
+
+  Arguments:
+      dataset: Input data of type tf.data.Dataset.
+      steps: Number of steps to draw from the dataset (may be None if unknown).
+      epochs: Number of times to iterate over the dataset.
+      steps_name: The string name of the steps argument, either `steps`,
+        `validation_steps`, or `steps_per_epoch`. Only used for error message
+        formatting.
+
+  Returns:
+    Integer or `None`. Inferred number of steps to loop through the dataset.
+    `None` is returned if the size of the dataset is unknown and `steps` was
+    not specified.
+
+  Raises:
+    ValueError: In case of invalid argument values.
+  """
+  assert isinstance(dataset, dataset_ops.DatasetV2)
+  size = K.get_value(cardinality.cardinality(dataset))
+  if size == cardinality.INFINITE and steps is None:
+    raise ValueError('When passing an infinitely repeating dataset, you '
+                     'must specify the `%s` argument.' % (steps_name,))
+  if size != cardinality.UNKNOWN:
+    if steps is not None and steps * epochs > size:
+      if epochs > 1:
+        raise ValueError('The dataset you passed contains %s batches, but you '
+                         'passed `epochs=%s` and `%s=%s`, which is a total of '
+                         '%s steps. We cannot draw that many steps from this '
+                         'dataset. We suggest to set `%s=%s`.' %
+                         (size, epochs, steps_name, steps, steps * epochs,
+                          steps_name, size // epochs))
+      else:
+        raise ValueError('The dataset you passed contains %s batches, but you '
+                         'passed `%s=%s`. We cannot draw that many steps from '
+                         'this dataset. We suggest to set `%s=%s`.' %
+                         (size, steps_name, steps, steps_name, size))
+  if steps is None:
+    if size >= 0:
+      return size
+    return None
+  return steps
+
+
 class ModelInputs(object):
   """Encapsulates model inputs.
 
@@ -1192,55 +1483,79 @@ def generic_output_names(outputs_list):
   return ['output_%d' % (i + 1) for i in range(len(outputs_list))]
 
 
-def trace_model_call(model, input_signature=None):
-  """Trace the model call to create a tf.function for exporting a Keras model.
+def set_run_eagerly_for_dict_structure(model, x):
+  """Set model.run_eagerly to true if x is dict structure.
+
+  Set model.run_eagerly to true if x is dict or
+  Iterator/EagerIterator/Dataset of dict.
 
   Args:
     model: A Keras model.
-    input_signature: optional, a list of tf.TensorSpec objects specifying the
-      inputs to the model.
+    x: Input data.
+  """
+  if not context.executing_eagerly():
+    return
+  if isinstance(x, dict):
+    model.run_eagerly = True
+  if (isinstance(x, (iterator_ops.Iterator, iterator_ops.EagerIterator,
+                     dataset_ops.DatasetV2))):
+    for item in x.output_shapes:
+      if isinstance(item, dict):
+        model.run_eagerly = True
+        return
 
-  Returns:
-    A tf.function wrapping the model's call function with input signatures set.
 
-  Raises:
-    ValueError: if input signature cannot be inferred from the model.
+def convert_eager_tensors_to_numpy(structure):
+  """Convert every EagerTensor in `structure` to NumPy.
+
+  Arguments:
+    structure: An arbitrary structure of elements to be converted to NumPy
+      arrays.
+
+  Returns:
+    An identical structure with EagerTensors converted to NumPy arrays.
   """
-  if input_signature is None:
-    if isinstance(model.call, def_function.PolymorphicFunction):
-      input_signature = model.call.input_signature
 
-  if input_signature is None:
-    try:
-      inputs = model.inputs
-      input_names = model.input_names
-    except AttributeError:
-      raise ValueError(
-          'Model {} cannot be saved because the input shapes have not been '
-          'set. Usually, input shapes are automatically determined from calling'
-          ' .fit() or .predict(). To manually set the shapes, call '
-          'model._set_inputs(inputs).'.format(model))
-    input_specs = []
-    for input_tensor, input_name in zip(inputs, input_names):
-      input_specs.append(tensor_spec.TensorSpec(
-          shape=input_tensor.shape, dtype=input_tensor.dtype,
-          name=input_name))
-    # The input signature of the call function is a list with one element, since
-    # all tensor inputs must be passed in as the first argument.
-    input_signature = [input_specs] if len(input_specs) > 1 else input_specs
-
-  # TODO(mdan): Should the model's call be autographed by default?
-  @def_function.function(input_signature=input_signature, autograph=False)
-  def _wrapped_model(*args):
-    """A concrete tf.function that wraps the model's call function."""
-    # When given a single input, Keras models will call the model on the tensor
-    # rather than a list consisting of the single tensor.
-    inputs = args[0] if len(input_signature) == 1 else list(args)
-    outputs_list = nest.flatten(model(inputs=inputs))
-    try:
-      output_names = model.output_names
-    except AttributeError:
-      output_names = generic_output_names(outputs_list)
-    return {name: output for name, output in zip(output_names, outputs_list)}
+  def _convert(element):
+    if isinstance(element, ops.EagerTensor):
+      return element.numpy()
+    return element
+
+  return nest.map_structure(_convert, structure)
+
+
+def should_run_multi_worker():
+  """Whether a model should be run using DistributedCoordinator."""
+  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
+  cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))
+  return tf_config and 'master' not in cluster_spec.jobs
+
+
+def should_run_validation(validation_freq, epoch):
+  """Checks if validation should be run this epoch.
 
-  return _wrapped_model
+  Arguments:
+    validation_freq: Integer or list. If an integer, specifies how many training
+      epochs to run before a new validation run is performed. If a list,
+      specifies the epochs on which to run validation.
+    epoch: Integer, the number of the training epoch just completed.
+
+  Returns:
+    Bool, True if validation should be run.
+
+  Raises:
+    ValueError: if `validation_freq` is an Integer and less than 1, or if
+    it is neither an Integer nor a Sequence.
+  """
+  # `epoch` is 0-indexed internally but 1-indexed in the public API.
+  one_indexed_epoch = epoch + 1
+
+  if isinstance(validation_freq, int):
+    if validation_freq < 1:
+      raise ValueError('`validation_freq` can not be less than 1.')
+    return one_indexed_epoch % validation_freq == 0
+
+  if not isinstance(validation_freq, collections.Container):
+    raise ValueError('`validation_freq` must be an Integer or '
+                     '`collections.Container` (e.g. list, tuple, etc.)')
+  return one_indexed_epoch in validation_freq
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index d8acec32cb65ffb2bbf517007802504e7c184544..e472dc3f2cdd87f114e9b398560dd61f9cf33337 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -18,30 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
+from absl.testing import parameterized
 import numpy as np
 
 
-from tensorflow.python.client import session as session_lib
-from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.saved_model import loader
-from tensorflow.python.saved_model import save as save_lib
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.platform import tf_logging as logging
 
 
 class ModelInputsTest(test.TestCase):
@@ -102,168 +92,159 @@ class ModelInputsTest(test.TestCase):
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['b']))
 
 
-class TraceModelCallTest(keras_parameterized.TestCase):
-
-  def _assert_all_close(self, expected, actual):
-    if not context.executing_eagerly():
-      with self.cached_session() as sess:
-        K._initialize_variables(sess)
-        self.assertAllClose(expected, actual)
+class DatasetUtilsTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ('Batch', lambda: dataset_ops.Dataset.range(5).batch(2), ValueError),
+      ('Cache', lambda: dataset_ops.Dataset.range(5).cache()),
+      ('Concatenate', lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5))),
+      ('FlatMap', lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)), ValueError),
+      ('Filter', lambda: dataset_ops.Dataset.range(5).filter(lambda _: True)),
+      ('FixedLengthRecordDatasetV2',
+       lambda: readers.FixedLengthRecordDatasetV2([], 42)),
+      ('FromTensors', lambda: dataset_ops.Dataset.from_tensors(0)),
+      ('FromTensorSlices',
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0])),
+      ('Interleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       ValueError),
+      ('ParallelInterleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), ValueError),
+      ('Map', lambda: dataset_ops.Dataset.range(5).map(lambda x: x)),
+      ('Options',
+       lambda: dataset_ops.Dataset.range(5).with_options(dataset_ops.Options())
+      ),
+      ('PaddedBatch', lambda: dataset_ops.Dataset.range(5).padded_batch(2, []),
+       ValueError),
+      ('ParallelMap', lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1)),
+      ('Prefetch', lambda: dataset_ops.Dataset.range(5).prefetch(1)),
+      ('Range', lambda: dataset_ops.Dataset.range(0)),
+      ('Repeat', lambda: dataset_ops.Dataset.range(0).repeat(0)),
+      ('Shuffle', lambda: dataset_ops.Dataset.range(5).shuffle(1)),
+      ('Skip', lambda: dataset_ops.Dataset.range(5).skip(2)),
+      ('Take', lambda: dataset_ops.Dataset.range(5).take(2)),
+      ('TextLineDataset', lambda: readers.TextLineDatasetV2([])),
+      ('TFRecordDataset', lambda: readers.TFRecordDatasetV2([])),
+      ('Window', lambda: dataset_ops.Dataset.range(5).window(2), ValueError),
+      ('Zip', lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5))),
+      # pylint: enable=g-long-lambda
+  )
+  def test_assert_not_batched(self, dataset_fn, expected_error=None):
+    if expected_error is None:
+      training_utils.assert_not_batched(dataset_fn())
     else:
-      self.assertAllClose(expected, actual)
-
-  @keras_parameterized.run_with_all_model_types
-  @keras_parameterized.run_all_keras_modes
-  def test_trace_model_outputs(self):
-    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
-    model = testing_utils.get_small_mlp(10, 3, input_dim)
-    inputs = array_ops.ones((8, 5))
-
-    if input_dim is None:
-      with self.assertRaisesRegexp(ValueError,
-                                   'input shapes have not been set'):
-        training_utils.trace_model_call(model)
-      model._set_inputs(inputs)
-
-    fn = training_utils.trace_model_call(model)
-    signature_outputs = fn(inputs)
-    expected_outputs = {model.output_names[0]: model(inputs)}
-
-    self._assert_all_close(expected_outputs, signature_outputs)
-
-  @keras_parameterized.run_with_all_model_types
-  @keras_parameterized.run_all_keras_modes
-  def test_trace_model_outputs_after_fitting(self):
-    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
-    model = testing_utils.get_small_mlp(10, 3, input_dim)
-    model.compile(optimizer='sgd', loss='mse')
-    model.fit(x=np.random.random((8, 5)),
-              y=np.random.random((8, 3)), epochs=2)
-
-    inputs = array_ops.ones((8, 5))
-
-    fn = training_utils.trace_model_call(model)
-    signature_outputs = fn(inputs)
-    expected_outputs = {model.output_names[0]: model(inputs)}
-
-    self._assert_all_close(expected_outputs, signature_outputs)
-
-  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
-  @keras_parameterized.run_all_keras_modes
-  def test_trace_multi_io_model_outputs(self):
-    input_dim = 5
-    num_classes = 3
-    num_classes_b = 4
-    input_a = keras.layers.Input(shape=(input_dim,), name='input_a')
-    input_b = keras.layers.Input(shape=(input_dim,), name='input_b')
-
-    dense = keras.layers.Dense(num_classes, name='dense')
-    dense2 = keras.layers.Dense(num_classes_b, name='dense2')
-    dropout = keras.layers.Dropout(0.5, name='dropout')
-    branch_a = [input_a, dense]
-    branch_b = [input_b, dense, dense2, dropout]
-
-    model = testing_utils.get_multi_io_model(branch_a, branch_b)
-
-    input_a_np = np.random.random((10, input_dim)).astype(np.float32)
-    input_b_np = np.random.random((10, input_dim)).astype(np.float32)
-
-    if testing_utils.get_model_type() == 'subclass':
-      with self.assertRaisesRegexp(ValueError,
-                                   'input shapes have not been set'):
-        training_utils.trace_model_call(model)
-
-    model.compile(optimizer='sgd', loss='mse')
-    model.fit(x=[np.random.random((8, input_dim)).astype(np.float32),
-                 np.random.random((8, input_dim)).astype(np.float32)],
-              y=[np.random.random((8, num_classes)).astype(np.float32),
-                 np.random.random((8, num_classes_b)).astype(np.float32)],
-              epochs=2)
-
-    fn = training_utils.trace_model_call(model)
-    signature_outputs = fn([input_a_np, input_b_np])
-    outputs = model([input_a_np, input_b_np])
-    expected_outputs = {model.output_names[0]: outputs[0],
-                        model.output_names[1]: outputs[1]}
-
-    self._assert_all_close(expected_outputs, signature_outputs)
-
-  @keras_parameterized.run_all_keras_modes
-  def test_specify_input_signature(self):
-    model = testing_utils.get_small_sequential_mlp(10, 3, None)
-    inputs = array_ops.ones((8, 5))
-
-    with self.assertRaisesRegexp(ValueError, 'input shapes have not been set'):
-      training_utils.trace_model_call(model)
-
-    fn = training_utils.trace_model_call(
-        model, [tensor_spec.TensorSpec(shape=[None, 5], dtype=dtypes.float32)])
-    signature_outputs = fn(inputs)
-    expected_outputs = {model.output_names[0]: model(inputs)}
-    self._assert_all_close(expected_outputs, signature_outputs)
-
-  @keras_parameterized.run_all_keras_modes
-  def test_subclassed_model_with_input_signature(self):
-
-    class Model(keras.Model):
-
-      def __init__(self):
-        super(Model, self).__init__()
-        self.dense = keras.layers.Dense(3, name='dense')
-
-      @def_function.function(
-          input_signature=[[tensor_spec.TensorSpec([None, 5], dtypes.float32),
-                            tensor_spec.TensorSpec([None], dtypes.float32)]],)
-      def call(self, inputs, *args):
-        x, y = inputs
-        return self.dense(x) + y
-
-    model = Model()
-    fn = training_utils.trace_model_call(model)
-    x = array_ops.ones((8, 5), dtype=dtypes.float32)
-    y = array_ops.ones((3,), dtype=dtypes.float32)
-    expected_outputs = {'output_1': model([x, y])}
-    signature_outputs = fn([x, y])
-    self._assert_all_close(expected_outputs, signature_outputs)
-
-
-def _import_and_infer(save_dir, inputs):
-  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
-  graph = ops.Graph()
-  with graph.as_default(), session_lib.Session() as session:
-    model = loader.load(session, [tag_constants.SERVING], save_dir)
-    signature = model.signature_def[
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
-    assert set(inputs.keys()) == set(signature.inputs.keys())
-    feed_dict = {}
-    for arg_name in inputs.keys():
-      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
-          inputs[arg_name])
-    output_dict = {}
-    for output_name, output_tensor_info in signature.outputs.items():
-      output_dict[output_name] = graph.get_tensor_by_name(
-          output_tensor_info.name)
-    return session.run(output_dict, feed_dict=feed_dict)
-
-
-class ModelSaveTest(keras_parameterized.TestCase):
-
-  @keras_parameterized.run_with_all_model_types
-  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-  def test_model_save(self):
-    input_dim = 5
-    model = testing_utils.get_small_mlp(10, 3, input_dim)
-    inputs = array_ops.ones((8, 5))
-
-    if testing_utils.get_model_type() == 'subclass':
-      model._set_inputs(inputs)
-
-    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
-    save_lib.save(model, save_dir)
+      with self.assertRaises(expected_error):
+        training_utils.assert_not_batched(dataset_fn())
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ('Batch', lambda: dataset_ops.Dataset.range(5).batch(2)),
+      ('Cache', lambda: dataset_ops.Dataset.range(5).cache()),
+      ('Concatenate', lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5))),
+      ('FlatMap', lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)), ValueError),
+      ('Filter', lambda: dataset_ops.Dataset.range(5).filter(lambda _: True)),
+      ('FixedLengthRecordDatasetV2',
+       lambda: readers.FixedLengthRecordDatasetV2([], 42)),
+      ('FromTensors', lambda: dataset_ops.Dataset.from_tensors(0)),
+      ('FromTensorSlices',
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0])),
+      ('Interleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       ValueError),
+      ('Map', lambda: dataset_ops.Dataset.range(5).map(lambda x: x)),
+      ('Options',
+       lambda: dataset_ops.Dataset.range(5).with_options(dataset_ops.Options())
+      ),
+      ('PaddedBatch', lambda: dataset_ops.Dataset.range(5).padded_batch(2, [])),
+      ('ParallelInterleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), ValueError),
+      ('ParallelMap', lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1)),
+      ('Prefetch', lambda: dataset_ops.Dataset.range(5).prefetch(1)),
+      ('Range', lambda: dataset_ops.Dataset.range(0)),
+      ('Repeat', lambda: dataset_ops.Dataset.range(0).repeat(0)),
+      ('Shuffle', lambda: dataset_ops.Dataset.range(5).shuffle(1), ValueError),
+      ('Skip', lambda: dataset_ops.Dataset.range(5).skip(2)),
+      ('Take', lambda: dataset_ops.Dataset.range(5).take(2)),
+      ('TextLineDataset', lambda: readers.TextLineDatasetV2([])),
+      ('TFRecordDataset', lambda: readers.TFRecordDatasetV2([])),
+      ('Window', lambda: dataset_ops.Dataset.range(5).window(2)),
+      ('Zip', lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5))),
+      # pylint: enable=g-long-lambda
+  )
+  def test_assert_not_shuffled(self, dataset_fn, expected_error=None):
+    if expected_error is None:
+      training_utils.assert_not_shuffled(dataset_fn())
+    else:
+      with self.assertRaises(expected_error):
+        training_utils.assert_not_shuffled(dataset_fn())
+
+  def test_verify_dataset_shuffled(self):
+    dataset = dataset_ops.Dataset.range(5)
+    training_utils.assert_not_shuffled(dataset)
+
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      training_utils.verify_dataset_shuffled(dataset)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'input dataset `x` is not shuffled.')
+
+    shuffled_dataset = dataset.shuffle(10)
+    training_utils.verify_dataset_shuffled(shuffled_dataset)
+
+
+class StandardizeWeightsTest(keras_parameterized.TestCase):
+
+  def test_sample_weights(self):
+    y = np.array([0, 1, 0, 0, 2])
+    sample_weights = np.array([0.5, 1., 1., 0., 2.])
+    weights = training_utils.standardize_weights(y, sample_weights)
+    self.assertAllClose(weights, sample_weights)
+
+  def test_class_weights(self):
+    y = np.array([0, 1, 0, 0, 2])
+    class_weights = {0: 0.5, 1: 1., 2: 1.5}
+    weights = training_utils.standardize_weights(y, class_weight=class_weights)
+    self.assertAllClose(weights, np.array([0.5, 1., 0.5, 0.5, 1.5]))
+
+  def test_sample_weights_and_class_weights(self):
+    y = np.array([0, 1, 0, 0, 2])
+    sample_weights = np.array([0.5, 1., 1., 0., 2.])
+    class_weights = {0: 0.5, 1: 1., 2: 1.5}
+    weights = training_utils.standardize_weights(y, sample_weights,
+                                                 class_weights)
+    expected = sample_weights * np.array([0.5, 1., 0.5, 0.5, 1.5])
+    self.assertAllClose(weights, expected)
+
+  def test_dataset_with_class_weight(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse')
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    class_weight_np = np.array([0.25, 0.25, 0.25, 0.25])
+    class_weight = dict(enumerate(class_weight_np))
+
+    model.fit(
+        dataset,
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=1,
+        class_weight=class_weight)
 
-    self.assertAllClose(
-        {model.output_names[0]: model.predict_on_batch(inputs)},
-        _import_and_infer(save_dir, {model.input_names[0]: np.ones((8, 5))}))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/initializers.py b/tensorflow/python/keras/initializers.py
index 33415fd13930c2537872ffe2fdfba84d5aeb110e..ac55ff965e693905407a534f083c8fab3f679c21 100644
--- a/tensorflow/python/keras/initializers.py
+++ b/tensorflow/python/keras/initializers.py
@@ -42,12 +42,30 @@ from tensorflow.python.ops.init_ops import RandomUniform as TFRandomUniform
 from tensorflow.python.ops.init_ops import TruncatedNormal as TFTruncatedNormal
 from tensorflow.python.ops.init_ops import VarianceScaling  # pylint: disable=unused-import
 from tensorflow.python.ops.init_ops import Zeros
+# pylint: disable=unused-import, disable=line-too-long
+from tensorflow.python.ops.init_ops_v2 import Constant as ConstantV2
+from tensorflow.python.ops.init_ops_v2 import GlorotNormal as GlorotNormalV2
+from tensorflow.python.ops.init_ops_v2 import GlorotUniform as GlorotUniformV2
+from tensorflow.python.ops.init_ops_v2 import he_normal as he_normalV2
+from tensorflow.python.ops.init_ops_v2 import he_uniform as he_uniformV2
+from tensorflow.python.ops.init_ops_v2 import Identity as IdentityV2
+from tensorflow.python.ops.init_ops_v2 import Initializer as InitializerV2
+from tensorflow.python.ops.init_ops_v2 import lecun_normal as lecun_normalV2
+from tensorflow.python.ops.init_ops_v2 import lecun_uniform  as lecun_uniformV2
+from tensorflow.python.ops.init_ops_v2 import Ones as OnesV2
+from tensorflow.python.ops.init_ops_v2 import Orthogonal as OrthogonalV2
+from tensorflow.python.ops.init_ops_v2 import RandomNormal as RandomNormalV2
+from tensorflow.python.ops.init_ops_v2 import RandomUniform as RandomUniformV2
+from tensorflow.python.ops.init_ops_v2 import TruncatedNormal as TruncatedNormalV2
+from tensorflow.python.ops.init_ops_v2 import VarianceScaling as VarianceScalingV2
+from tensorflow.python.ops.init_ops_v2 import Zeros as ZerosV2
+# pylint: enable=unused-import, enable=line-too-long
 
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.initializers.TruncatedNormal',
-              'keras.initializers.truncated_normal')
+@keras_export(v1=['keras.initializers.TruncatedNormal',
+                  'keras.initializers.truncated_normal'])
 class TruncatedNormal(TFTruncatedNormal):
   """Initializer that generates a truncated normal distribution.
 
@@ -71,8 +89,9 @@ class TruncatedNormal(TFTruncatedNormal):
         mean=mean, stddev=stddev, seed=seed, dtype=dtype)
 
 
-@keras_export('keras.initializers.RandomUniform', 'keras.initializers.uniform',
-              'keras.initializers.random_uniform')
+@keras_export(v1=['keras.initializers.RandomUniform',
+                  'keras.initializers.uniform',
+                  'keras.initializers.random_uniform'])
 class RandomUniform(TFRandomUniform):
   """Initializer that generates tensors with a uniform distribution.
 
@@ -92,8 +111,9 @@ class RandomUniform(TFRandomUniform):
         minval=minval, maxval=maxval, seed=seed, dtype=dtype)
 
 
-@keras_export('keras.initializers.RandomNormal', 'keras.initializers.normal',
-              'keras.initializers.random_normal')
+@keras_export(v1=['keras.initializers.RandomNormal',
+                  'keras.initializers.normal',
+                  'keras.initializers.random_normal'])
 class RandomNormal(TFRandomNormal):
   """Initializer that generates tensors with a normal distribution.
 
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index 4f91bea1e331f0b52a4f34fc848b3d51509e1360..36f2d405326f4bb96027d8022545c585072dcc98 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python.ops import init_ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import init_ops
 from tensorflow.python.platform import test
 
 
@@ -67,6 +67,7 @@ class KerasInitializersTest(test.TestCase):
                    tensor_shape,
                    target_mean=0., target_max=2, target_min=-2)
 
+  @test_util.run_deprecated_v1
   def test_constant(self):
     tensor_shape = (5, 6, 4)
     with self.cached_session():
@@ -134,6 +135,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.orthogonal(seed=123), tensor_shape,
                    target_mean=0.)
 
+  @test_util.run_deprecated_v1
   def test_identity(self):
     with self.cached_session():
       tensor_shape = (3, 4, 5)
@@ -145,28 +147,33 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.identity(), tensor_shape,
                    target_mean=1. / tensor_shape[0], target_max=1.)
 
+  @test_util.run_deprecated_v1
   def test_zero(self):
     tensor_shape = (4, 5)
     with self.cached_session():
       self._runner(keras.initializers.zeros(), tensor_shape,
                    target_mean=0., target_max=0.)
 
+  @test_util.run_deprecated_v1
   def test_one(self):
     tensor_shape = (4, 5)
     with self.cached_session():
       self._runner(keras.initializers.ones(), tensor_shape,
                    target_mean=1., target_max=1.)
 
+  @test_util.run_deprecated_v1
   def test_default_random_uniform(self):
     ru = keras.initializers.get('uniform')
     self.assertEqual(ru.minval, -0.05)
     self.assertEqual(ru.maxval, 0.05)
 
+  @test_util.run_deprecated_v1
   def test_default_random_normal(self):
     rn = keras.initializers.get('normal')
     self.assertEqual(rn.mean, 0.0)
     self.assertEqual(rn.stddev, 0.05)
 
+  @test_util.run_deprecated_v1
   def test_default_truncated_normal(self):
     tn = keras.initializers.get('truncated_normal')
     self.assertEqual(tn.mean, 0.0)
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index fbe3508f07d85d91c845a9defd2f3660d0b25754..dc8d1deddb26172a724deaf51a0403302554d9f2 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -22,299 +22,179 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.layers import core as tf_core_layers
-from tensorflow.python.ops import nn
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.platform import test
 
 
-class KerasIntegrationTest(test.TestCase):
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class VectorClassificationIntegrationTest(keras_parameterized.TestCase):
+
+  def test_vector_classification(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = testing_utils.get_model_from_layers(
+        [keras.layers.Dense(16, activation='relu'),
+         keras.layers.Dropout(0.1),
+         keras.layers.Dense(y_train.shape[-1], activation='softmax')],
+        input_shape=x_train.shape[1:])
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=10, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
 
-  def test_version(self):
-    self.assertTrue(keras.__version__.endswith('-tf'))
-
-  def test_vector_classification_sequential(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential([
-          keras.layers.Dense(16,
-                             activation='relu',
-                             input_shape=x_train.shape[1:]),
-          keras.layers.Dropout(0.1),
-          keras.layers.Dense(y_train.shape[-1], activation='softmax')
-      ])
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_deprecated_v1
-  def test_vector_classification_functional(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(20,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.layers.Input(shape=x_train.shape[1:])
-      x = keras.layers.Dense(16, activation='relu')(inputs)
-      x = keras.layers.Dropout(0.1)(x)
-      outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
-
-      model = keras.models.Model(inputs, outputs)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_deprecated_v1
-  def test_temporal_classification_sequential(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(4, 10),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.LSTM(5, return_sequences=True,
-                                  input_shape=x_train.shape[1:]))
-      model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_deprecated_v1
-  def test_temporal_classification_sequential_tf_rnn(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(4, 10),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.RNN(rnn_cell.LSTMCell(5), return_sequences=True,
-                                 input_shape=x_train.shape[1:]))
-      model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
-                                                  activation='softmax',
-                                                  dtype=dtypes.float32)))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_image_classification_sequential(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(12, 12, 3),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.Conv2D(
-          4, 3,
-          padding='same',
-          activation='relu',
-          input_shape=x_train.shape[1:]))
-      model.add(keras.layers.Conv2D(
-          8, 3,
-          padding='same',
-          activation='relu'))
-      model.add(keras.layers.Conv2D(
-          16, 3,
-          padding='same',
-          activation='relu'))
-      model.add(keras.layers.Flatten())
-      model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_video_classification_functional(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(4, 8, 8, 3),
-          num_classes=3)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.layers.Input(shape=x_train.shape[1:])
-      x = keras.layers.TimeDistributed(
-          keras.layers.Conv2D(4, 3, activation='relu'))(inputs)
-      x = keras.layers.BatchNormalization()(x)
-      x = keras.layers.TimeDistributed(keras.layers.GlobalMaxPooling2D())(x)
-      x = keras.layers.Conv1D(8, 3, activation='relu')(x)
-      x = keras.layers.Flatten()(x)
-      outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
-
-      model = keras.models.Model(inputs, outputs)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_vector_classification_shared_sequential(self):
+  def test_vector_classification_shared_model(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      base_model = keras.models.Sequential([
-          keras.layers.Dense(16,
-                             activation='relu',
-                             kernel_regularizer=keras.regularizers.l2(1e-5),
-                             bias_regularizer=keras.regularizers.l2(1e-5),
-                             input_shape=x_train.shape[1:]),
-          keras.layers.BatchNormalization(),
-      ])
-      x = keras.layers.Input(x_train.shape[1:])
-      y = base_model(x)
-      y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
-      model = keras.models.Model(x, y)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    base_model = testing_utils.get_model_from_layers(
+        [keras.layers.Dense(16,
+                            activation='relu',
+                            kernel_regularizer=keras.regularizers.l2(1e-5),
+                            bias_regularizer=keras.regularizers.l2(1e-5)),
+         keras.layers.BatchNormalization()],
+        input_shape=x_train.shape[1:])
+    x = keras.layers.Input(x_train.shape[1:])
+    y = base_model(x)
+    y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
+    model = keras.models.Model(x, y)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    if not testing_utils.should_run_eagerly():
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_vector_classification_shared_model(self):
-    # Test that functional models that feature internal updates
-    # and internal losses can be shared.
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.layers.Input(x_train.shape[1:])
-      x = keras.layers.Dense(16,
-                             activation='relu',
-                             kernel_regularizer=keras.regularizers.l2(1e-5),
-                             bias_regularizer=keras.regularizers.l2(1e-5),
-                             input_shape=x_train.shape[1:])(inputs)
-      x = keras.layers.BatchNormalization()(x)
-      base_model = keras.models.Model(inputs, x)
-
-      x = keras.layers.Input(x_train.shape[1:])
-      y = base_model(x)
-      y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
-      model = keras.models.Model(x, y)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_embedding_with_clipnorm(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Embedding(input_dim=1, output_dim=1))
-      model.compile(optimizer=keras.optimizers.SGD(clipnorm=0.1), loss='mse')
-      model.fit(np.array([[0]]), np.array([[[0.5]]]), epochs=1)
-
-  def test_using_tf_layers_in_keras_sequential_model(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-
-      model = keras.models.Sequential()
-      model.add(tf_core_layers.Dense(32, activation=nn.relu, input_shape=(10,)))
-      model.add(tf_core_layers.Dense(2, activation=nn.softmax))
-      model.summary()
-
-      y_train = keras.utils.to_categorical(y_train)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_using_tf_layers_in_keras_functional_model(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.Input(shape=(10,))
-      x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
-      outputs = tf_core_layers.Dense(2, activation=nn.softmax)(x)
-      model = keras.Model(inputs, outputs)
-      model.summary()
-
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
+    history = model.fit(x_train, y_train, epochs=10, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+
+# See b/122473407
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TimeseriesClassificationIntegrationTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  def test_timeseries_classification(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(4, 10),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    layers = [
+        keras.layers.LSTM(5, return_sequences=True),
+        keras.layers.GRU(y_train.shape[-1], activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(
+        layers, input_shape=x_train.shape[1:])
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=15, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+  def test_timeseries_classification_sequential_tf_rnn(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(4, 10),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.RNN(rnn_cell.LSTMCell(5), return_sequences=True,
+                               input_shape=x_train.shape[1:]))
+    model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
+                                                activation='softmax',
+                                                dtype=dtypes.float32)))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=15, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class ImageClassificationIntegrationTest(keras_parameterized.TestCase):
+
+  def test_image_classification(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10, 10, 3),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    layers = [
+        keras.layers.Conv2D(4, 3, padding='same', activation='relu'),
+        keras.layers.Conv2D(8, 3, padding='same'),
+        keras.layers.BatchNormalization(),
+        keras.layers.Conv2D(8, 3, padding='same'),
+        keras.layers.Flatten(),
+        keras.layers.Dense(y_train.shape[-1], activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(
+        layers, input_shape=x_train.shape[1:])
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=10, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 285388f340fc9aa6890a7d141127d1192d565528..88fbaca3eacfc074bc567fa066e59d8f010c7ea2 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -110,6 +110,10 @@ from tensorflow.python.keras.layers.noise import GaussianDropout
 
 # Normalization layers.
 from tensorflow.python.keras.layers.normalization import BatchNormalization
+from tensorflow.python.keras.layers.normalization import LayerNormalization
+
+# Kernelized layers.
+from tensorflow.python.keras.layers.kernelized import RandomFourierFeatures
 
 # Pooling layers.
 from tensorflow.python.keras.layers.pooling import MaxPooling1D
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index be1039a2ac9510e9acbc7472b584f104a8625033..5095287430735b4d370b0545c3971da14a4c0b6d 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -121,11 +121,9 @@ class PReLU(Layer):
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
     param_shape = list(input_shape[1:])
-    self.param_broadcast = [False] * len(param_shape)
     if self.shared_axes is not None:
       for i in self.shared_axes:
         param_shape[i - 1] = 1
-        self.param_broadcast[i - 1] = True
     self.alpha = self.add_weight(
         shape=param_shape,
         name='alpha',
@@ -143,12 +141,7 @@ class PReLU(Layer):
 
   def call(self, inputs, mask=None):
     pos = K.relu(inputs)
-    if K.backend() == 'theano':
-      neg = (
-          K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - math_ops.abs(inputs)) * 0.5)
-    else:
-      neg = -self.alpha * K.relu(-inputs)
+    neg = -self.alpha * K.relu(-inputs)
     return pos + neg
 
   def get_config(self):
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index f32bb457c825d9769c6dccf625d9318c07843237..f04185417effae2b705a610edddd97a2ccf2ad74 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.keras import keras_parameterized
@@ -88,6 +90,13 @@ class AdvancedActivationsTest(keras_parameterized.TestCase):
             kwargs={'negative_slope': -2},
             input_shape=(2, 3, 4))
 
+  @keras_parameterized.run_with_all_model_types
+  def test_layer_as_activation(self):
+    layer = keras.layers.Dense(1, activation=keras.layers.ReLU())
+    model = testing_utils.get_model_from_layers([layer], input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 7251a67191f07c4198728b87db1192aa0e6cc7d9..30b919cc0a9038cf0eeb10a240105fbabd591efa 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -180,12 +180,14 @@ class Conv(Layer):
       op_padding = 'valid'
     else:
       op_padding = self.padding
+    if not isinstance(op_padding, (list, tuple)):
+      op_padding = op_padding.upper()
     self._convolution_op = nn_ops.Convolution(
         input_shape,
         filter_shape=self.kernel.get_shape(),
         dilation_rate=self.dilation_rate,
         strides=self.strides,
-        padding=op_padding.upper(),
+        padding=op_padding,
         data_format=conv_utils.convert_data_format(self.data_format,
                                                    self.rank + 2))
     self.built = True
@@ -199,21 +201,8 @@ class Conv(Layer):
           # nn.bias_add does not accept a 1D input tensor.
           bias = array_ops.reshape(self.bias, (1, self.filters, 1))
           outputs += bias
-        if self.rank == 2:
+        else:
           outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          if outputs_shape[0] is None:
-            outputs_shape[0] = -1
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
       else:
         outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
 
@@ -1127,24 +1116,10 @@ class Conv3DTranspose(Conv3D):
       outputs.set_shape(out_shape)
 
     if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
+      outputs = nn.bias_add(
+          outputs,
           self.bias,
           data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
 
     if self.activation is not None:
       return self.activation(outputs)
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index 81af06b4eca3a962d95b59e73dc3148d0312c733..9140ce426e6881b2abbc821e835c1e792c884343 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -18,8 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
@@ -30,39 +29,32 @@ from tensorflow.python.platform import test
 
 
 @keras_parameterized.run_all_keras_modes
-class Convolution1DTest(keras_parameterized.TestCase):
+class Conv1DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     length = 7
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv1D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, length, stack_size))
-
-  def test_conv1d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same', 'causal'])
-    self._run_test(kwargs, 'strides', [2])
-    self._run_test(kwargs, 'dilation_rate', [2])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
-    self._run_test(kwargs, 'dilation_rate', [3])
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv1D,
+          kwargs=kwargs,
+          input_shape=(num_samples, length, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('padding_same_dilation_3', {'padding': 'same', 'dilation_rate': 3}),
+      ('padding_causal', {'padding': 'causal'}),
+      ('strides', {'strides': 2}),
+      ('dilation_rate', {'dilation_rate': 2}),
+  )
+  def test_conv1d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = 3
+    self._run_test(kwargs)
 
   def test_conv1d_regularizers(self):
     kwargs = {
@@ -74,7 +66,7 @@ class Convolution1DTest(keras_parameterized.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -93,7 +85,7 @@ class Convolution1DTest(keras_parameterized.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
@@ -103,41 +95,33 @@ class Convolution1DTest(keras_parameterized.TestCase):
 @keras_parameterized.run_all_keras_modes
 class Conv2DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     num_row = 7
     num_col = 6
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv2D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_conv2d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2)])
-    if test.is_gpu_available(cuda_only=True):
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv2D,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('strides', {'strides': (2, 2)}),
+      ('dilation_rate', {'dilation_rate': (2, 2)}),
       # Only runs on GPU with CUDA, channels_first is not supported on CPU.
       # TODO(b/62340061): Support channels_first on CPU.
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-    self._run_test(kwargs, 'dilation_rate', [(2, 2)])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
+      ('data_format', {'data_format': 'channels_first'}),
+  )
+  def test_conv2d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
 
   def test_conv2d_regularizers(self):
     kwargs = {
@@ -149,7 +133,7 @@ class Conv2DTest(keras_parameterized.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -168,357 +152,43 @@ class Conv2DTest(keras_parameterized.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@keras_parameterized.run_all_keras_modes
-class Conv2DTransposeTest(keras_parameterized.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    num_row = 7
-    num_col = 6
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv2DTranspose,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_conv2dtranspose(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-
-    kwargs['strides'] = (2, 2)
-    self._run_test(kwargs, 'output_padding', [(1, 1)])
-
-  def test_conv2dtranspose_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv2DTranspose(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(len(layer.losses), 2)
-      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
-      self.assertEqual(len(layer.losses), 3)
-
-  def test_conv2dtranspose_constraints(self):
-    k_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv2DTranspose(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-  def test_conv2d_transpose_dilation(self):
-    testing_utils.layer_test(keras.layers.Conv2DTranspose,
-                             kwargs={'filters': 2,
-                                     'kernel_size': 3,
-                                     'padding': 'same',
-                                     'data_format': 'channels_last',
-                                     'dilation_rate': (2, 2)},
-                             input_shape=(2, 5, 6, 3))
-
-    input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32)
-    expected_output = np.float32([[192, 228, 192, 228],
-                                  [336, 372, 336, 372],
-                                  [192, 228, 192, 228],
-                                  [336, 372, 336, 372]]).reshape((1, 4, 4, 1))
-    testing_utils.layer_test(keras.layers.Conv2DTranspose,
-                             input_data=input_data,
-                             kwargs={'filters': 1,
-                                     'kernel_size': 3,
-                                     'padding': 'same',
-                                     'data_format': 'channels_last',
-                                     'dilation_rate': (2, 2),
-                                     'kernel_initializer': 'ones'},
-                             expected_output=expected_output)
-
-
-@keras_parameterized.run_all_keras_modes
-class Conv3DTransposeTest(keras_parameterized.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    num_row = 7
-    num_col = 6
-    depth = 5
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv3DTranspose,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, depth, num_row, num_col, stack_size))
-
-  def test_conv3dtranspose(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-
-    kwargs['strides'] = (2, 2, 2)
-    self._run_test(kwargs, 'output_padding', [(1, 1, 1)])
-
-  def test_conv3dtranspose_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv3DTranspose(**kwargs)
-      layer.build((None, 5, 5, 5, 2))
-      self.assertEqual(len(layer.losses), 2)
-      layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
-      self.assertEqual(len(layer.losses), 3)
-
-  def test_conv3dtranspose_constraints(self):
-    k_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv3DTranspose(**kwargs)
-      layer.build((None, 5, 5, 5, 2))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-
-@keras_parameterized.run_all_keras_modes
-class SeparableConv1DTest(keras_parameterized.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    length = 7
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.SeparableConv1D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, length, stack_size))
-
-  def test_separable_conv1d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same', 'causal'])
-    self._run_test(kwargs, 'strides', [2])
-    self._run_test(kwargs, 'dilation_rate', [2])
-    self._run_test(kwargs, 'depth_multiplier', [2])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
-
-  def test_separable_conv1d_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'depthwise_regularizer': 'l2',
-        'pointwise_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv1D(**kwargs)
-      layer.build((None, 5, 2))
-      self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((1, 5, 2))))
-      self.assertEqual(len(layer.losses), 4)
-
-  def test_separable_conv1d_constraints(self):
-    d_constraint = lambda x: x
-    p_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'pointwise_constraint': p_constraint,
-        'depthwise_constraint': d_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv1D(**kwargs)
-      layer.build((None, 5, 2))
-      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
-      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-
-@keras_parameterized.run_all_keras_modes
-class SeparableConv2DTest(keras_parameterized.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    num_row = 7
-    num_col = 6
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.SeparableConv2D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_separable_conv2d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [2])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-    self._run_test(kwargs, 'dilation_rate', [2])
-    self._run_test(kwargs, 'depth_multiplier', [2])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
-
-  def test_separable_conv2d_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'depthwise_regularizer': 'l2',
-        'pointwise_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv2D(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
-      self.assertEqual(len(layer.losses), 4)
-
-  def test_separable_conv2d_constraints(self):
-    d_constraint = lambda x: x
-    p_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'pointwise_constraint': p_constraint,
-        'depthwise_constraint': d_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv2D(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
-      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-
 @keras_parameterized.run_all_keras_modes
 class Conv3DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     num_row = 7
     num_col = 6
     depth = 5
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv3D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, depth, num_row, num_col, stack_size))
-
-  def test_conv3d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2, 2)])
-    self._run_test(kwargs, 'dilation_rate', [(2, 2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv3D,
+          kwargs=kwargs,
+          input_shape=(num_samples, depth, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2, 2)}),
+      ('dilation_rate', {'dilation_rate': (2, 2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+  )
+  def test_conv3d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
 
   def test_conv3d_regularizers(self):
     kwargs = {
@@ -530,7 +200,7 @@ class Conv3DTest(keras_parameterized.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv3D(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -550,12 +220,36 @@ class Conv3DTest(keras_parameterized.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv3D(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
+  def test_conv3d_dynamic_shape(self):
+    input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32)
+    with self.cached_session(use_gpu=True):
+      # Won't raise error here.
+      testing_utils.layer_test(
+          keras.layers.Conv3D,
+          kwargs={
+              'data_format': 'channels_last',
+              'filters': 3,
+              'kernel_size': 3
+          },
+          input_shape=(None, None, None, None, 3),
+          input_data=input_data)
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.Conv3D,
+            kwargs={
+                'data_format': 'channels_first',
+                'filters': 3,
+                'kernel_size': 3
+            },
+            input_shape=(None, 3, None, None, None),
+            input_data=input_data)
+
 
 @keras_parameterized.run_all_keras_modes
 class ZeroPaddingTest(keras_parameterized.TestCase):
@@ -567,7 +261,7 @@ class ZeroPaddingTest(keras_parameterized.TestCase):
     shape = (num_samples, num_steps, input_dim)
     inputs = np.ones(shape)
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # basic test
       testing_utils.layer_test(
           keras.layers.ZeroPadding1D,
@@ -698,7 +392,7 @@ class ZeroPaddingTest(keras_parameterized.TestCase):
     inputs = np.ones((num_samples, input_len_dim1, input_len_dim2,
                       input_len_dim3, stack_size))
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # basic test
       testing_utils.layer_test(
           keras.layers.ZeroPadding3D,
@@ -730,7 +424,7 @@ class ZeroPaddingTest(keras_parameterized.TestCase):
 class UpSamplingTest(keras_parameterized.TestCase):
 
   def test_upsampling_1d(self):
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4))
 
@@ -884,7 +578,7 @@ class CroppingTest(keras_parameterized.TestCase):
     input_len_dim1 = 2
     inputs = np.random.rand(num_samples, time_length, input_len_dim1)
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.Cropping1D,
           kwargs={'cropping': (2, 2)},
@@ -1020,30 +714,34 @@ class CroppingTest(keras_parameterized.TestCase):
 @keras_parameterized.run_all_keras_modes
 class DepthwiseConv2DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     num_row = 7
     num_col = 6
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.DepthwiseConv2D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_depthwise_conv2d(self):
-    kwargs = {'kernel_size': (3, 3)}
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-    self._run_test(kwargs, 'depth_multiplier', [1, 2])
-
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.DepthwiseConv2D,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('depth_multiplier_1', {'depth_multiplier': 1}),
+      ('depth_multiplier_2', {'depth_multiplier': 2}),
+  )
+  def test_depthwise_conv2d(self, kwargs):
+    kwargs['kernel_size'] = (3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_depthwise_conv2d_full(self):
     kwargs = {
         'kernel_size': 3,
         'padding': 'valid',
@@ -1055,8 +753,9 @@ class DepthwiseConv2DTest(keras_parameterized.TestCase):
         'depthwise_constraint': 'unit_norm',
         'use_bias': True,
         'strides': (2, 2),
+        'depth_multiplier': 1,
     }
-    self._run_test(kwargs, 'depth_multiplier', [1])
+    self._run_test(kwargs)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/convolutional_transpose_test.py b/tensorflow/python/keras/layers/convolutional_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd73d22d51b014c6dd00e946ad1cf7f0cd7332f8
--- /dev/null
+++ b/tensorflow/python/keras/layers/convolutional_transpose_test.py
@@ -0,0 +1,209 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolutional transpose layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class Conv2DTransposeTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv2DTranspose,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('strides_output_padding', {'strides': (2, 2), 'output_padding': (1, 1)}),
+  )
+  def test_conv2d_transpose(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_conv2d_transpose_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv2DTranspose(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+  def test_conv2d_transpose_constraints(self):
+    k_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': k_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv2DTranspose(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(layer.kernel.constraint, k_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+  def test_conv2d_transpose_dilation(self):
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             kwargs={'filters': 2,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2)},
+                             input_shape=(2, 5, 6, 3))
+
+    input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32)
+    expected_output = np.float32([[192, 228, 192, 228],
+                                  [336, 372, 336, 372],
+                                  [192, 228, 192, 228],
+                                  [336, 372, 336, 372]]).reshape((1, 4, 4, 1))
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             input_data=input_data,
+                             kwargs={'filters': 1,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2),
+                                     'kernel_initializer': 'ones'},
+                             expected_output=expected_output)
+
+
+@keras_parameterized.run_all_keras_modes
+class Conv3DTransposeTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+    depth = 5
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv3DTranspose,
+          kwargs=kwargs,
+          input_shape=(num_samples, depth, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('strides_output_padding', {'strides': (2, 2, 2),
+                                  'output_padding': (1, 1, 1)}),
+  )
+  def test_conv3d_transpose(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_conv3d_transpose_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv3DTranspose(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+  def test_conv3d_transpose_constraints(self):
+    k_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': k_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv3DTranspose(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(layer.kernel.constraint, k_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+  def test_conv3d_transpose_dynamic_shape(self):
+    input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32)
+    with self.cached_session(use_gpu=True):
+      # Won't raise error here.
+      testing_utils.layer_test(
+          keras.layers.Conv3DTranspose,
+          kwargs={
+              'data_format': 'channels_last',
+              'filters': 3,
+              'kernel_size': 3
+          },
+          input_shape=(None, None, None, None, 3),
+          input_data=input_data)
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.Conv3DTranspose,
+            kwargs={
+                'data_format': 'channels_first',
+                'filters': 3,
+                'kernel_size': 3
+            },
+            input_shape=(None, 3, None, None, None),
+            input_data=input_data)
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index dfbab80be3f806fd7463bb792993e00c90442c10..fdc0a76877330f06cf1d9d664b56106295276957 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -27,6 +27,7 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
@@ -932,6 +933,10 @@ class Dense(Layer):
     self.input_spec = InputSpec(min_ndim=2)
 
   def build(self, input_shape):
+    dtype = dtypes.as_dtype(self.dtype or K.floatx())
+    if not (dtype.is_floating or dtype.is_complex):
+      raise TypeError('Unable to build `Dense` layer with non-floating point '
+                      'dtype %s' % (dtype,))
     input_shape = tensor_shape.TensorShape(input_shape)
     if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
@@ -972,6 +977,7 @@ class Dense(Layer):
         output_shape = shape[:-1] + [self.units]
         outputs.set_shape(output_shape)
     else:
+      inputs = math_ops.cast(inputs, self.dtype)
       outputs = gen_math_ops.mat_mul(inputs, self.kernel)
     if self.use_bias:
       outputs = nn.bias_add(outputs, self.bias)
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index 9df40f806fa2cd78699218298b6d31199ed126d6..d61c8e3d08c072eb13149afa5bef1c3834a557b3 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import math_ops
@@ -283,6 +284,13 @@ class CoreLayersTest(keras_parameterized.TestCase):
     testing_utils.layer_test(
         keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
 
+  def test_dense_dtype(self):
+    inputs = ops.convert_to_tensor(
+        np.random.randint(low=0, high=7, size=(2, 2)))
+    layer = keras.layers.Dense(5, dtype='float32')
+    outputs = layer(inputs)
+    self.assertEqual(outputs.dtype, 'float32')
+
   def test_dense_regularization(self):
     layer = keras.layers.Dense(
         3,
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index 36f2d2fa3834baee5906b460dcd2cd9c36d6b8af..c7d8d82ee2b178ba2b9ab43c6f4a19d1cd4bddcb 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.keras.optimizer_v2.rmsprop import RMSprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 @keras_parameterized.run_all_keras_modes
@@ -408,8 +407,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase):
     model.add(
         keras.layers.Bidirectional(
             rnn(output_dim), merge_mode=mode, input_shape=(None, dim)))
-    model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(loss='mse', optimizer='rmsprop')
     model.fit(x, y, epochs=1, batch_size=1)
 
     # test config
@@ -425,8 +423,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase):
             merge_mode=mode,
             input_shape=(None, dim)))
     model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode))
-    model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(loss='mse', optimizer=R'rmsprop')
     model.fit(x, y, epochs=1, batch_size=1)
 
     # test with functional API
@@ -435,8 +432,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase):
         rnn(output_dim), merge_mode=mode)(
             inputs)
     model = keras.Model(inputs, outputs)
-    model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(loss='mse', optimizer=R'rmsprop')
     model.fit(x, y, epochs=1, batch_size=1)
 
     # Bidirectional and stateful
@@ -445,8 +441,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase):
         rnn(output_dim, stateful=True), merge_mode=mode)(
             inputs)
     model = keras.Model(inputs, outputs)
-    model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(loss='mse', optimizer='rmsprop')
     model.fit(x, y, epochs=1, batch_size=1)
 
   @test_util.run_gpu_only
@@ -467,7 +462,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase):
 
     def assert_not_compatible(src, dest, message):
       with self.assertRaises(ValueError) as ex:
-        keras.engine.saving.preprocess_weights_for_loading(
+        keras.saving.preprocess_weights_for_loading(
             dest,
             get_layer_weights(src))
       self.assertIn(message, str(ex.exception))
diff --git a/tensorflow/python/keras/layers/gru_test.py b/tensorflow/python/keras/layers/gru_test.py
index d05e7eeb633e4e9b4c255e13ef7b21ad71ab4348..91183a4d732fb87e9e5868c9996c74a5ed5b0932 100644
--- a/tensorflow/python/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/layers/gru_test.py
@@ -18,14 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 @keras_parameterized.run_all_keras_modes
@@ -50,8 +51,8 @@ class GRULayerTest(keras_parameterized.TestCase):
     layer = keras.layers.GRU(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.01), 'mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
@@ -68,17 +69,17 @@ class GRULayerTest(keras_parameterized.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  def test_implementation_mode_GRU(self):
+  @parameterized.parameters([0, 1, 2])
+  def test_implementation_mode_GRU(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    for mode in [0, 1, 2]:
-      testing_utils.layer_test(
-          keras.layers.GRU,
-          kwargs={'units': units,
-                  'implementation': mode},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.GRU,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_reset_after_GRU(self):
     num_samples = 2
@@ -98,8 +99,8 @@ class GRULayerTest(keras_parameterized.TestCase):
                                  reset_after=True)
     output = gru_layer(inputs)
     gru_model = keras.models.Model(inputs, output)
-    gru_model.compile(RMSPropOptimizer(0.01), 'mse',
-                      run_eagerly=testing_utils.should_run_eagerly())
+    gru_model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     gru_model.fit(x_train, y_train)
     gru_model.predict(x_train)
 
@@ -111,11 +112,75 @@ class GRULayerTest(keras_parameterized.TestCase):
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     model.add(layer_class(units=5, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
+  def test_statefulness_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.GRU
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer='sgd', loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    np.testing.assert_allclose(out7, out6, atol=1e-5)
+
 
 @tf_test_util.run_all_in_graph_and_eager_modes
 class GRULayerGenericTest(test.TestCase):
@@ -146,75 +211,6 @@ class GRULayerGenericTest(test.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
-
-class GRULayerGraphOnlyTest(test.TestCase):
-
-  @tf_test_util.run_v1_only('b/120545219')
-  def test_statefulness_GRU(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    layer_class = keras.layers.GRU
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer='sgd', loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      np.testing.assert_allclose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      np.testing.assert_allclose(out7, out6, atol=1e-5)
-
-  # b/120919032
-  @tf_test_util.run_deprecated_v1
   def test_regularizers_GRU(self):
     embedding_dim = 4
     layer_class = keras.layers.GRU
@@ -232,7 +228,11 @@ class GRULayerGraphOnlyTest(test.TestCase):
 
     x = keras.backend.variable(np.ones((2, 3, 2)))
     layer(x)
-    self.assertEqual(len(layer.get_losses_for(x)), 1)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/kernelized.py b/tensorflow/python/keras/layers/kernelized.py
new file mode 100644
index 0000000000000000000000000000000000000000..9753fc66de9ad98b831b225974db180e6f5737d1
--- /dev/null
+++ b/tensorflow/python/keras/layers/kernelized.py
@@ -0,0 +1,258 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras layers that implement explicit (approximate) kernel feature maps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import input_spec
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+
+_SUPPORTED_RBF_KERNEL_TYPES = ['gaussian', 'laplacian']
+
+
+class RandomFourierFeatures(base_layer.Layer):
+  r"""Layer that maps its inputs using random Fourier features.
+
+  This layer implements a feature map \\(\phi: \mathbb{R}^d \rightarrow
+  \mathbb{R}^D\\) which approximates shift-invariant kernels. A kernel function
+  K(x, y) defined over \\(\mathbb{R}^d x \mathbb{R}^d\\) is shift-invariant if
+  K(x, y) = k(x-y) for some function defined over \\(\mathbb{R}^d\\). Many
+  popular Radial Basis Functions (in short RBF), including gaussian and
+  laplacian kernels are shift-invariant.
+
+  The layer approximates a (shift invariant) kernel K in the following sense:
+    up to a scaling factor, for all inputs \\(x, y \in \mathbb{R}^d\\)
+        \\(\phi(x)^T \cdot \phi(y) \approx K(x, y)\\)
+
+  The implementation of this layer is based on the following paper:
+  "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
+  (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
+
+  The distribution from which the parameters of the random features map (layer)
+  are sampled, determines which shift-invariant kernel the layer approximates
+  (see paper for more details). The users can use the distribution of their
+  choice. Due to their popularity, the layer supports the out-of-the-box
+  approximation of the following RBF kernels:
+  - Gaussian: \\(K(x, y) = e^{-\frac{\|x-y\|_2^2}{2 \cdot scale^2}}\\)
+  - Laplacian: \\(K(x, y) = e^{-\frac{\|x-y\|_1}{scale}}\\)
+
+  NOTE: Unlike the map described in the paper and the scikit-learn
+  implementation, the output of this layer does not apply the sqrt(2/D)
+  normalization factor.
+
+  Usage for ML: Typically, this layer is used to "kernelize" linear models by
+  applying a non-linear transformation (this layer) to the input features and
+  then training a linear model on top of the transformed features. Depending on
+  the loss function of the linear model, the composition of this layer and the
+  linear model results to models that are equivalent (up to approximation) to
+  kernel SVMs (for hinge loss), kernel logistic regression (for logistic loss),
+  kernel linear regression (for squared loss) etc.
+
+  Example of building a kernel multinomial logistic regression model with
+  Gaussian kernel in keras:
+  ```python
+  random_features_layer = RandomFourierFeatures(
+      output_dim=500,
+      kernel_initializer='gaussian',
+      scale=5.0,
+      ...)
+
+  model = tf.keras.models.Sequential()
+  model.add(random_features_layer)
+  model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax')
+
+  model.compile(elif isinstance(identifier, six.string_types):
+    loss=tf.keras.losses.categorical_crossentropy, optimizer=..., metrics=...)
+  ```
+
+  To use another kernel, replace the layer creation command with:
+  ```python
+  random_features_layer = RandomFourierFeatures(
+      output_dim=500,
+      kernel_initializer=<my_initializer>,
+      scale=...,
+      ...)
+  ```
+
+  Arguments:
+    output_dim: Positive integer, the dimension of the layer's output, i.e., the
+      number of random features used to approximate the kernel.
+    kernel_initializer: Determines the distribution of the parameters of the
+      random features map (and therefore the kernel approximated by the layer).
+      It can be either a string or an instance of TensorFlow's Initializer
+      class. Currently only 'gaussian' and 'laplacian' are supported as string
+      initializers (case insensitive). Note that these parameters are not
+      trainable.
+    scale: For gaussian and laplacian kernels, this corresponds to a scaling
+      factor of the corresponding kernel approximated by the layer (see concrete
+      definitions above). When provided, it should be a positive float. If None,
+      the implementation chooses a default value (1.0 typically). Both the
+      approximation error of the kernel and the classification quality are
+      sensitive to this parameter. If trainable is set to True, this paramater
+      is learned end-to-end during training and the provided value serves as an
+      initialization value.
+      NOTE: When this layer is used to map the initial features and then the
+        transformed features are fed to a linear model, by making `scale`
+        trainable, the resulting optimization problem is no longer convex (even
+        if the loss function used by the linear model is convex).
+    trainable: Whether the scaling parameter of th layer is trainable. Defaults
+      to False.
+    name: name for the RandomFourierFeatures layer.
+
+  Raises:
+    ValueError: if output_dim or stddev are not positive or if the provided
+      kernel_initializer is not supported.
+  """
+
+  def __init__(self,
+               output_dim,
+               kernel_initializer='gaussian',
+               scale=None,
+               trainable=False,
+               name=None,
+               **kwargs):
+    if output_dim <= 0:
+      raise ValueError(
+          '`output_dim` should be a positive integer. Given: {}.'.format(
+              output_dim))
+    if isinstance(kernel_initializer, six.string_types):
+      if kernel_initializer.lower() not in _SUPPORTED_RBF_KERNEL_TYPES:
+        raise ValueError(
+            'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'
+            .format(kernel_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
+    if scale is not None and scale <= 0.0:
+      raise ValueError('When provided, `scale` should be a positive float. '
+                       'Given: {}.'.format(scale))
+    super(RandomFourierFeatures, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self.output_dim = output_dim
+    self.kernel_initializer = kernel_initializer
+    self.scale = scale
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    # TODO(sibyl-vie3Poto): Allow higher dimension inputs. Currently the input is expected
+    # to have shape [batch_size, dimension].
+    if input_shape.rank != 2:
+      raise ValueError(
+          'The rank of the input tensor should be 2. Got {} instead.'.format(
+              input_shape.ndims))
+    if input_shape.dims[1].value is None:
+      raise ValueError(
+          'The last dimension of the inputs to `RandomFourierFeatures` '
+          'should be defined. Found `None`.')
+    self.input_spec = input_spec.InputSpec(
+        ndim=2, axes={1: input_shape.dims[1].value})
+    input_dim = input_shape.dims[1].value
+
+    kernel_initializer = _get_random_features_initializer(
+        self.kernel_initializer, shape=(input_dim, self.output_dim))
+
+    unscaled_kernel = self.add_weight(
+        name='unscaled_random_features',
+        shape=(input_dim, self.output_dim),
+        dtype=dtypes.float32,
+        initializer=kernel_initializer,
+        trainable=False)
+
+    self.bias = self.add_weight(
+        name='random_features_bias',
+        shape=(self.output_dim,),
+        dtype=dtypes.float32,
+        initializer=init_ops.random_uniform_initializer(
+            minval=0.0, maxval=2 * np.pi, dtype=dtypes.float32),
+        trainable=False)
+
+    if self.scale is None:
+      self.scale = _get_default_scale(self.kernel_initializer, input_dim)
+    scale = self.add_weight(
+        name='random_features_scale',
+        shape=(1,),
+        dtype=dtypes.float32,
+        initializer=init_ops.constant_initializer(self.scale),
+        trainable=True,
+        constraint='NonNeg')
+    self.kernel = (1.0 / scale) * unscaled_kernel
+    super(RandomFourierFeatures, self).build(input_shape)
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    inputs = gen_math_ops.cast(inputs, dtypes.float32)
+    outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    outputs = nn.bias_add(outputs, self.bias)
+    return gen_math_ops.cos(outputs)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank(2)
+    if input_shape.dims[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input shape must be defined. Given: %s' %
+          input_shape)
+    return input_shape[:-1].concatenate(self.output_dim)
+
+  def get_config(self):
+    kernel_initializer = self.kernel_initializer
+    if isinstance(self.kernel_initializer, init_ops.Initializer):
+      kernel_initializer = initializers.serialize(self.kernel_initializer)
+    config = {
+        'output_dim': self.output_dim,
+        'kernel_initializer': kernel_initializer,
+        'scale': self.scale,
+    }
+    base_config = super(RandomFourierFeatures, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def _get_random_features_initializer(initializer, shape):
+  """Returns Initializer object for random features."""
+
+  def _get_cauchy_samples(loc, scale, shape):
+    probs = np.random.uniform(low=0., high=1., size=shape)
+    return loc + scale * np.tan(np.pi * (probs - 0.5))
+
+  random_features_initializer = initializer
+  if isinstance(initializer, six.string_types):
+    if initializer.lower() == 'gaussian':
+      random_features_initializer = init_ops.random_normal_initializer(
+          stddev=1.0)
+    elif initializer.lower() == 'laplacian':
+      random_features_initializer = init_ops.constant_initializer(
+          _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape))
+
+    else:
+      raise ValueError(
+          'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'.format(
+              random_features_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
+  return random_features_initializer
+
+
+def _get_default_scale(initializer, input_dim):
+  if (isinstance(initializer, six.string_types) and
+      initializer.lower() == 'gaussian'):
+    return np.sqrt(input_dim / 2.0)
+  return 1.0
diff --git a/tensorflow/python/keras/layers/kernelized_test.py b/tensorflow/python/keras/layers/kernelized_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..300b1b815595474426b7faf400dd9c9b61cd5acd
--- /dev/null
+++ b/tensorflow/python/keras/layers/kernelized_test.py
@@ -0,0 +1,391 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for kernelized.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import math
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as keras_backend
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.layers import kernelized as kernel_layers
+from tensorflow.python.keras.utils import kernelized_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+def _exact_gaussian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+
+
+def _exact_laplacian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+
+
+class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_all_close(self, expected, actual, atol=0.001):
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        keras_backend._initialize_variables(sess)
+        self.assertAllClose(expected, actual, atol=atol)
+    else:
+      self.assertAllClose(expected, actual, atol=atol)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_output_dim(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`output_dim` should be a positive integer. Given: -3.'):
+      _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_unsupported_kernel_type(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Unsupported kernel type: \'unsupported_kernel\'.'):
+      _ = kernel_layers.RandomFourierFeatures(
+          3, 'unsupported_kernel', stddev=2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_scale(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'When provided, `scale` should be a positive float. Given: 0.0.'):
+      _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_input_shape(self):
+    inputs = random_ops.random_uniform((3, 2, 4), seed=1)
+    rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'The rank of the input tensor should be 2. Got 3 instead.'):
+      _ = rff_layer.apply(inputs)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 10.0, False),
+      ('random', init_ops.random_uniform_initializer, 1.0, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_random_features_properties(self, initializer, scale, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=10,
+        kernel_initializer=initializer,
+        scale=scale,
+        trainable=trainable)
+    self.assertEqual(rff_layer.output_dim, 10)
+    self.assertEqual(rff_layer.kernel_initializer, initializer)
+    self.assertEqual(rff_layer.scale, scale)
+    self.assertEqual(rff_layer.trainable, trainable)
+
+  @parameterized.named_parameters(('gaussian', 'gaussian', False),
+                                  ('laplacian', 'laplacian', True),
+                                  ('other', init_ops.ones_initializer, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_call(self, initializer, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=10,
+        kernel_initializer=initializer,
+        scale=1.0,
+        trainable=trainable,
+        name='random_fourier_features')
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    outputs = rff_layer(inputs)
+    self.assertListEqual([3, 10], outputs.get_shape().as_list())
+    num_trainable_vars = 1 if trainable else 0
+    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
+    if not context.executing_eagerly():
+      self.assertLen(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          num_trainable_vars)
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def test_no_eager_Leak(self):
+    # Tests that repeatedly constructing and building a Layer does not leak
+    # Python objects.
+    inputs = random_ops.random_uniform((5, 4), seed=1)
+    kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs)
+    kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_output_shape(self):
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=7, name='random_fourier_features', trainable=True)
+    outputs = rff_layer(inputs)
+    self.assertEqual([3, 7], outputs.get_shape().as_list())
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian'), ('laplacian', 'laplacian'),
+      ('other', init_ops.random_uniform_initializer))
+  @test_util.run_deprecated_v1
+  def test_call_on_placeholder(self, initializer):
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5,
+        kernel_initializer=initializer,
+        name='random_fourier_features')
+    with self.assertRaisesRegexp(
+        ValueError, r'The last dimension of the inputs to '
+        '`RandomFourierFeatures` should be defined. Found `None`.'):
+      rff_layer(inputs)
+
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5,
+        kernel_initializer=initializer,
+        name='random_fourier_features')
+    with self.assertRaisesRegexp(
+        ValueError, r'The last dimension of the inputs to '
+        '`RandomFourierFeatures` should be defined. Found `None`.'):
+      rff_layer(inputs)
+
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 3])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5, name='random_fourier_features')
+    rff_layer(inputs)
+
+  @parameterized.named_parameters(('gaussian', 10, 'gaussian', 2.0),
+                                  ('laplacian', 5, 'laplacian', None),
+                                  ('other', 10, init_ops.ones_initializer, 1.0))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_compute_output_shape(self, output_dim, initializer, scale):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim, initializer, scale=scale, name='rff')
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape(None))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([]))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3]))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3]))
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The innermost dimension of input shape must be defined.'):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None]))
+
+    self.assertEqual([None, output_dim],
+                     rff_layer.compute_output_shape((None, 3)).as_list())
+    self.assertEqual([None, output_dim],
+                     rff_layer.compute_output_shape(
+                         tensor_shape.TensorShape([None, 2])).as_list())
+    self.assertEqual([4, output_dim],
+                     rff_layer.compute_output_shape((4, 1)).as_list())
+
+  @parameterized.named_parameters(
+      ('gaussian', 10, 'gaussian', 3.0, False),
+      ('laplacian', 5, 'laplacian', 5.5, True),
+      ('other', 7, init_ops.random_uniform_initializer(), None, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_get_config(self, output_dim, initializer, scale, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim,
+        initializer,
+        scale=scale,
+        trainable=trainable,
+        name='random_fourier_features',
+    )
+    expected_initializer = initializer
+    if isinstance(initializer, init_ops.Initializer):
+      expected_initializer = initializers.serialize(initializer)
+
+    expected_config = {
+        'output_dim': output_dim,
+        'kernel_initializer': expected_initializer,
+        'scale': scale,
+        'name': 'random_fourier_features',
+        'trainable': trainable,
+        'dtype': None,
+    }
+    self.assertLen(expected_config, len(rff_layer.get_config()))
+    self.assertSameElements(
+        list(expected_config.items()), list(rff_layer.get_config().items()))
+
+  @parameterized.named_parameters(
+      ('gaussian', 5, 'gaussian', None, True),
+      ('laplacian', 5, 'laplacian', 5.5, False),
+      ('other', 7, init_ops.ones_initializer(), 2.0, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_from_config(self, output_dim, initializer, scale, trainable):
+    model_config = {
+        'output_dim': output_dim,
+        'kernel_initializer': initializer,
+        'scale': scale,
+        'trainable': trainable,
+        'name': 'random_fourier_features',
+    }
+    rff_layer = kernel_layers.RandomFourierFeatures.from_config(model_config)
+    self.assertEqual(rff_layer.output_dim, output_dim)
+    self.assertEqual(rff_layer.kernel_initializer, initializer)
+    self.assertEqual(rff_layer.scale, scale)
+    self.assertEqual(rff_layer.trainable, trainable)
+
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    outputs = rff_layer(inputs)
+    self.assertListEqual([3, output_dim], outputs.get_shape().as_list())
+    num_trainable_vars = 1 if trainable else 0
+    self.assertLen(rff_layer.trainable_variables, num_trainable_vars)
+    if trainable:
+      self.assertEqual('random_fourier_features/random_features_scale:0',
+                       rff_layer.trainable_variables[0].name)
+    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
+    if not context.executing_eagerly():
+      self.assertLen(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          num_trainable_vars)
+
+  @parameterized.named_parameters(
+      ('gaussian', 10, 'gaussian', 3.0, True),
+      ('laplacian', 5, 'laplacian', 5.5, False),
+      ('other', 10, init_ops.random_uniform_initializer(), None, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_same_random_features_params_reused(self, output_dim, initializer,
+                                              scale, trainable):
+    """Applying the layer on the same input twice gives the same output."""
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        trainable=trainable,
+        name='random_fourier_features')
+    inputs = constant_op.constant(
+        np.random.uniform(low=-1.0, high=1.0, size=(2, 4)))
+    output1 = rff_layer.apply(inputs)
+    output2 = rff_layer.apply(inputs)
+    self._assert_all_close(output1, output2)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0),
+      ('other', init_ops.random_uniform_initializer(), 5.0))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_different_params_similar_approximation(self, initializer, scale):
+    # Layers initialized using different randomness (seed).
+    rff_layer1 = kernel_layers.RandomFourierFeatures(
+        output_dim=3000,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='rff1')
+    rff_layer2 = kernel_layers.RandomFourierFeatures(
+        output_dim=2000,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='rff2')
+    # Two distinct inputs.
+    x = constant_op.constant([[1.0, -1.0, 0.5]])
+    y = constant_op.constant([[-1.0, 1.0, 1.0]])
+
+    # Apply both layers to both inputs.
+    output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(x)
+    output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(y)
+    output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(x)
+    output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(y)
+
+    # Compute the inner products of the outputs (on inputs x and y) for both
+    # layers. For any fixed random features layer rff_layer, and inputs x, y,
+    # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
+    approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
+    approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
+    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.05)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
+      ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0)))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn):
+    """Approximation is bad when output dimension is small."""
+    # Two distinct inputs.
+    x = constant_op.constant([[1.0, -1.0, 0.5]])
+    y = constant_op.constant([[-1.0, 1.0, 1.0]])
+
+    small_output_dim = 10
+    random_seed.set_random_seed(1234)
+    # Initialize layer.
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=small_output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='random_fourier_features')
+
+    # Apply layer to both inputs.
+    output_x = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(x)
+    output_y = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(y)
+
+    # The inner products of the outputs (on inputs x and y) approximates the
+    # real value of the RBF kernel but poorly since the output dimension of the
+    # layer is small.
+    exact_kernel_value = exact_kernel_fn(x, y)
+    approx_kernel_value = kernelized_utils.inner_product(output_x, output_y)
+    abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value)
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        keras_backend._initialize_variables(sess)
+        abs_error_eval = sess.run([abs_error])
+        self.assertGreater(abs_error_eval[0][0], 0.05)
+        self.assertLess(abs_error_eval[0][0], 0.5)
+    else:
+      self.assertGreater(abs_error, 0.05)
+      self.assertLess(abs_error, 0.5)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 10.0, _exact_gaussian(stddev=10.0)),
+      ('laplacian', 'laplacian', 50.0, _exact_laplacian(stddev=50.0)))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_good_kernel_approximation_multiple_inputs(self, initializer, scale,
+                                                     exact_kernel_fn):
+    # Parameters.
+    input_dim = 5
+    output_dim = 5000
+    x_rows = 20
+    y_rows = 30
+
+    random_seed.set_random_seed(1234)
+    x = random_ops.random_uniform(shape=(x_rows, input_dim), maxval=1.0)
+    y = random_ops.random_uniform(shape=(y_rows, input_dim), maxval=1.0)
+
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='random_fourier_features')
+
+    # The shapes of output_x and output_y are (x_rows, output_dim) and
+    # (y_rows, output_dim) respectively.
+    output_x = math.sqrt(2.0 / output_dim) * rff_layer.apply(x)
+    output_y = math.sqrt(2.0 / output_dim) * rff_layer.apply(y)
+
+    approx_kernel_matrix = kernelized_utils.inner_product(output_x, output_y)
+    exact_kernel_matrix = exact_kernel_fn(x, y)
+    self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index b132d2ee8ea4c89e043ece1f029f7d65c0f79c23..38e165653e80c4ed82f55ac0482ae8ed5a5d5b4f 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -22,13 +22,12 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.eager import context
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 @keras_parameterized.run_all_keras_modes
@@ -68,8 +67,8 @@ class LSTMLayerTest(keras_parameterized.TestCase):
     layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.001), 'mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
@@ -126,9 +125,10 @@ class LSTMLayerTest(keras_parameterized.TestCase):
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     model.add(layer_class(units=5, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_masking_with_stacking_LSTM(self):
@@ -139,9 +139,10 @@ class LSTMLayerTest(keras_parameterized.TestCase):
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)]
     model.add(keras.layers.RNN(lstm_cells, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_LSTM(self):
@@ -246,9 +247,10 @@ class LSTMLayerTest(keras_parameterized.TestCase):
     output = keras.layers.LSTM(units)(inputs, initial_state=initial_state)
 
     model = keras.models.Model([inputs] + initial_state, output)
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -318,74 +320,6 @@ class LSTMLayerTest(keras_parameterized.TestCase):
     targets = np.random.random((num_samples, units))
     model.train_on_batch([main_inputs] + initial_state, targets)
 
-
-class LSTMLayerGraphOnlyTest(test.TestCase):
-
-  def test_statefulness_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    layer_class = keras.layers.LSTM
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertAllClose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      self.assertAllClose(out7, out6, atol=1e-5)
-
-  # b/120919032
-  @tf_test_util.run_deprecated_v1
   def test_regularizers_LSTM(self):
     embedding_dim = 4
     layer_class = keras.layers.LSTM
@@ -402,7 +336,73 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
     self.assertEqual(len(layer.losses), 3)
     x = keras.backend.variable(np.ones((2, 3, 2)))
     layer(x)
-    self.assertEqual(len(layer.get_losses_for(x)), 1)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.LSTM
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertAllClose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    self.assertAllClose(out7, out6, atol=1e-5)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index c3c5b2db7d73ba67bc574f7672adaed2c73dc920..5008bd77147420822c96bab872b878d7f3fa6781 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
 from tensorflow.python import tf2
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
@@ -41,6 +39,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import keras_export
+from tensorflow.python.util.tf_export import tf_export
 
 
 @keras_export('keras.layers.BatchNormalization', v1=[])
@@ -91,8 +90,7 @@ class BatchNormalizationV2(Layer):
       if the fused implementation cannot be used. If `None`, use the faster
       implementation if possible. If False, do not used the fused
       implementation.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    trainable: Boolean, if `True` the variables will be marked as trainable.
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
       which means batch normalization is performed across the whole batch. When
       `virtual_batch_size` is not `None`, instead perform "Ghost Batch
@@ -393,16 +391,16 @@ class BatchNormalizationV2(Layer):
               aggregation=tf_variables.VariableAggregation.MEAN)
           return var
 
-        with distribution_strategy_context.get_distribution_strategy(
-        ).colocate_vars_with(self.moving_mean):
+        with distribution_strategy_context.get_strategy(
+        ).extended.colocate_vars_with(self.moving_mean):
           self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
           self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
         # We initialize renorm_stddev to 0, and maintain the (0-initialized)
         # renorm_stddev_weight. This allows us to (1) mix the average
         # stddev with the minibatch stddev early in training, and (2) compute
         # the unbiased average stddev by dividing renorm_stddev by the weight.
-        with distribution_strategy_context.get_distribution_strategy(
-        ).colocate_vars_with(self.moving_variance):
+        with distribution_strategy_context.get_strategy(
+        ).extended.colocate_vars_with(self.moving_variance):
           self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
           self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
                                                        ())
@@ -414,23 +412,7 @@ class BatchNormalizationV2(Layer):
   def _assign_moving_average(self, variable, value, momentum):
     with ops.name_scope(None, 'AssignMovingAvg',
                         [variable, value, momentum]) as scope:
-      # TODO(b/120571621): We want to avoid colocating the variables here
-      # since TPUStrategy does not implement replica local variables.
-      # Remove this hack once we support TPULocalVariables.
-      is_tpu_strategy = False
-      if distribution_strategy_context.has_distribution_strategy():
-        distribute = distribution_strategy_context.get_distribution_strategy()
-        if distribute.__class__.__name__ == 'TPUStrategy':
-          is_tpu_strategy = True
-
-      # TODO(apassos,srbs,skyewm): the colocation constraints here are disabled
-      # because of a bug which leads cond_v2 to skip rewriting them creating
-      # conflicts.
-      if tf2.enabled() or is_tpu_strategy:
-        cm = contextlib.contextmanager(lambda: (yield))()
-      else:
-        cm = ops.colocate_with(variable)
-      with cm:
+      with ops.colocate_with(variable):
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
@@ -481,10 +463,19 @@ class BatchNormalizationV2(Layer):
     else:
       momentum = ops.convert_to_tensor(self.momentum)
     if training_value or training_value is None:
-      mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                momentum)
-      variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, momentum)
+      if distribution_strategy_context.in_cross_replica_context():
+        strategy = distribution_strategy_context.get_strategy()
+        mean_update = strategy.extended.update(
+            self.moving_mean, self._assign_moving_average,
+            (mean, self.momentum))
+        variance_update = strategy.extended.update(
+            self.moving_variance, self._assign_moving_average,
+            (variance, self.momentum))
+      else:
+        mean_update = self._assign_moving_average(self.moving_mean, mean,
+                                                  momentum)
+        variance_update = self._assign_moving_average(self.moving_variance,
+                                                      variance, momentum)
       self.add_update(mean_update, inputs=True)
       self.add_update(variance_update, inputs=True)
 
@@ -665,7 +656,8 @@ class BatchNormalizationV2(Layer):
         scale, offset = _compose_transforms(r, d, scale, offset)
 
       if distribution_strategy_context.in_cross_replica_context():
-        strategy = distribution_strategy_context.get_distribution_strategy()
+        strategy = distribution_strategy_context.get_strategy()
+
         def _do_update(var, value):
           """Compute the updates for mean and variance."""
           if in_eager_mode and not self.trainable:
@@ -786,7 +778,243 @@ class BatchNormalizationV1(BatchNormalizationV2):
   _USE_V2_BEHAVIOR = False
 
 
-if tf2.enabled():
+BatchNormalization = None  # pylint: disable=invalid-name
+
+
+@tf_export(v1=['enable_v2_batch_normalization'])
+def enable_v2_batch_normalization():
+  global BatchNormalization  # pylint: disable=invalid-name
   BatchNormalization = BatchNormalizationV2
-else:
+
+
+@tf_export(v1=['disable_v2_batch_normalization'])
+def disable_v2_batch_normalization():
+  global BatchNormalization  # pylint: disable=invalid-name
   BatchNormalization = BatchNormalizationV1
+
+
+if tf2.enabled():
+  enable_v2_batch_normalization()
+else:
+  disable_v2_batch_normalization()
+
+
+@keras_export('keras.layers.experimental.LayerNormalization')
+class LayerNormalization(Layer):
+  """Layer normalization layer (Ba et al., 2016).
+
+  Normalize the activations of the previous layer for each given example in a
+  batch independently, rather than across a batch like Batch Normalization.
+  i.e. applies a transformation that maintains the mean activation within each
+  example close to 0 and the activation standard deviation close to 1.
+
+  Given a tensor `inputs` of rank `R`, moments are calculated and normalization
+  is performed over all axes in norm_axis.  Scaling and centering,
+  if requested, is performed over all axes in params_axis.
+
+  By default, normalization is performed over all but the first axis
+  (the `HWC` if `inputs` is `NHWC`), while the `beta` and `gamma` trainable
+  parameters are calculated for the rightmost axis (the `C` if `inputs` is
+  `NHWC`).  Scaling and recentering is performed via broadcast of the
+  `beta` and `gamma` parameters with the normalized tensor.
+
+  The shapes of `beta` and `gamma` are
+  `[inputs.shape[i] for i in (param axes)]`,
+  and this part of the inputs' shape must be fully defined.
+
+  Arguments:
+    norm_axis: Integer or List. normalization will be
+      performed along these dimensions. If unspecified (None), it will default
+      to the dimensions `begin_norm_axis : rank(inputs)`
+    params_axis: Integer or List. The (beta, gamma) dimensions: scale
+      and centering parameters will have take their shapes from these axes and
+      will be broadcast with the normalized inputs accordingly. If unspecified
+      (None), it will default to the last dimension
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+        If False, `gamma` is not used.
+        When the next layer is linear (also e.g. `nn.relu`),
+        this can be disabled since the scaling
+        will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    trainable: Boolean, if `True` the variables will be marked as trainable.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+
+  References:
+      - [Layer Normalization](https://arxiv.org/abs/1607.06450)
+  """
+
+  def __init__(self,
+               norm_axis=None,
+               params_axis=-1,
+               epsilon=1e-12,
+               center=True,
+               scale=True,
+               beta_initializer='zeros',
+               gamma_initializer='ones',
+               beta_regularizer=None,
+               gamma_regularizer=None,
+               beta_constraint=None,
+               gamma_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(LayerNormalization, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(norm_axis, list):
+      self.norm_axis = norm_axis[:]
+    elif isinstance(norm_axis, int):
+      self.norm_axis = norm_axis
+    elif norm_axis is None:
+      self.norm_axis = None
+    else:
+      raise TypeError('norm_axis must be int or list or None, type given: %s'
+                      % type(norm_axis))
+
+    if isinstance(params_axis, list):
+      self.params_axis = params_axis[:]
+    elif isinstance(params_axis, int):
+      self.params_axis = params_axis
+    else:
+      raise TypeError('params_axis must be int or list, type given: %s'
+                      % type(params_axis))
+
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    ndims = len(input_shape)
+    if ndims is None:
+      raise ValueError('Input shape %s has undefined rank.' % input_shape)
+
+    # Handle an unspecified norm_axis
+    if self.norm_axis is None:
+      self.norm_axis = list(range(1, ndims))
+
+    # Convert axes to lists and resolve negatives
+    if isinstance(self.norm_axis, int):
+      self.norm_axis = [self.norm_axis]
+    for idx, x in enumerate(self.norm_axis):
+      if x < 0:
+        self.norm_axis[idx] = ndims + x
+
+    if isinstance(self.params_axis, int):
+      self.params_axis = [self.params_axis]
+    for idx, x in enumerate(self.params_axis):
+      if x < 0:
+        self.params_axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.norm_axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.norm_axis) != len(set(self.norm_axis)):
+      raise ValueError('Duplicate axis: %s' % self.norm_axis)
+
+    for x in self.params_axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.params_axis) != len(set(self.params_axis)):
+      raise ValueError('Duplicate axis: %s' % self.params_axis)
+
+    param_shape = [input_shape[dim] for dim in self.params_axis]
+
+    if self.scale:
+      self.gamma = self.add_weight(
+          name='gamma',
+          shape=param_shape,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
+    else:
+      self.gamma = None
+
+    if self.center:
+      self.beta = self.add_weight(
+          name='beta',
+          shape=param_shape,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
+    else:
+      self.beta = None
+
+  def call(self, inputs):
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+
+    # Calculate the moments on the last axis (layer activations).
+    mean, variance = nn.moments(inputs, self.norm_axis, keep_dims=True)
+
+    # Broadcasting only necessary for norm where the params axes aren't just
+    # the last dimension
+    broadcast_shape = [1] * ndims
+    for dim in self.params_axis:
+      broadcast_shape[dim] = input_shape.dims[dim].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          self.params_axis != [ndims - 1]):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
+
+    # Compute layer normalization using the batch_normalization function.
+    outputs = nn.batch_normalization(
+        inputs,
+        mean,
+        variance,
+        offset=offset,
+        scale=scale,
+        variance_epsilon=self.epsilon)
+
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
+
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+  def get_config(self):
+    config = {
+        'norm_axis': self.norm_axis,
+        'params_axis': self.params_axis,
+        'epsilon': self.epsilon,
+        'center': self.center,
+        'scale': self.scale,
+        'beta_initializer': initializers.serialize(self.beta_initializer),
+        'gamma_initializer': initializers.serialize(self.gamma_initializer),
+        'beta_regularizer': regularizers.serialize(self.beta_regularizer),
+        'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
+        'beta_constraint': constraints.serialize(self.beta_constraint),
+        'gamma_constraint': constraints.serialize(self.gamma_constraint)
+    }
+    base_config = super(LayerNormalization, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index f81ddcecb42662c8cfa481808919c4382771467b..3815d1e673db7c97eb540d7ac4899fa2d86e26f5 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
@@ -355,12 +356,241 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
       # Simulates training-mode with trainable layer.
       # Should use mini-batch statistics.
-      keras.backend.set_learning_phase(1)
-      model = get_model(bn_mean, bn_std)
-      model.compile(loss='mse', optimizer='rmsprop')
-      out = model.predict(val_a)
-      self.assertAllClose(
-          (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+      with keras.backend.learning_phase_scope(1):
+        model = get_model(bn_mean, bn_std)
+        model.compile(loss='mse', optimizer='rmsprop')
+        out = model.predict(val_a)
+        self.assertAllClose(
+            (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+
+
+def _run_layernorm_correctness_test(layer, dtype='float32'):
+  model = keras.models.Sequential()
+  norm = layer(input_shape=(2, 2, 2))
+  model.add(norm)
+  model.compile(loss='mse',
+                optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                run_eagerly=testing_utils.should_run_eagerly())
+
+  # centered on 5.0, variance 10.0
+  x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+       .astype(dtype))
+  model.fit(x, x, epochs=4, verbose=0)
+  out = model.predict(x)
+  out -= keras.backend.eval(norm.beta)
+  out /= keras.backend.eval(norm.gamma)
+
+  np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+  np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class LayerNormalizationTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  def test_basic_layernorm(self):
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={
+            'gamma_regularizer': keras.regularizers.l2(0.01),
+            'beta_regularizer': keras.regularizers.l2(0.01)
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={
+            'gamma_initializer': 'ones',
+            'beta_initializer': 'ones',
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={'scale': False,
+                'center': False},
+        input_shape=(3, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_layernorm_weights(self):
+    layer = keras.layers.LayerNormalization(scale=False, center=False)
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.weights), 0)
+
+    layer = keras.layers.LayerNormalization()
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.weights), 2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_layernorm_regularization(self):
+    layer = keras.layers.LayerNormalization(
+        gamma_regularizer='l1', beta_regularizer='l1')
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.losses), 2)
+    max_norm = keras.constraints.max_norm
+    layer = keras.layers.LayerNormalization(
+        gamma_constraint=max_norm, beta_constraint=max_norm)
+    layer.build((None, 3, 4))
+    self.assertEqual(layer.gamma.constraint, max_norm)
+    self.assertEqual(layer.beta.constraint, max_norm)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_convnet(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        model = keras.models.Sequential()
+        norm = keras.layers.LayerNormalization(
+            input_shape=(3, 4, 4), params_axis=1)
+        model.add(norm)
+        model.compile(loss='mse',
+                      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                      run_eagerly=testing_utils.should_run_eagerly())
+
+        # centered on 5.0, variance 10.0
+        x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
+        model.fit(x, x, epochs=4, verbose=0)
+        out = model.predict(x)
+        out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1))
+        out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1))
+
+        np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
+        np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_convnet_channel_last(self):
+    model = keras.models.Sequential()
+    norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3))
+    model.add(norm)
+    model.compile(loss='mse',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    # centered on 5.0, variance 10.0
+    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+    out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+    out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+    np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+    np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_correctness(self):
+    _run_layernorm_correctness_test(
+        normalization.LayerNormalization, dtype='float32')
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_mixed_precision(self):
+    _run_layernorm_correctness_test(
+        normalization.LayerNormalization, dtype='float16')
+
+  def doOutputTest(self,
+                   input_shape,
+                   tol=1e-5,
+                   norm_axis=None,
+                   params_axis=-1,
+                   dtype=None):
+    ndim = len(input_shape)
+    if norm_axis is None:
+      moments_axis = range(1, ndim)
+    elif isinstance(norm_axis, int):
+      if norm_axis < 0:
+        moments_axis = [norm_axis + ndim]
+      else:
+        moments_axis = [norm_axis]
+    else:
+      moments_axis = []
+      for dim in norm_axis:
+        if dim < 0:
+          dim = dim + ndim
+        moments_axis.append(dim)
+
+    moments_axis = tuple(moments_axis)
+    expected_shape = []
+    for i in range(ndim):
+      if i not in moments_axis:
+        expected_shape.append(input_shape[i])
+
+    expected_mean = np.zeros(expected_shape)
+    expected_var = np.ones(expected_shape)
+    for mu in [0.0, 1e2]:
+      for sigma in [1.0, 0.1]:
+        inputs = np.random.randn(*input_shape) * sigma + mu
+        inputs_t = constant_op.constant(inputs, shape=input_shape)
+        layer = normalization.LayerNormalization(
+            norm_axis=norm_axis, params_axis=params_axis, dtype=dtype)
+        outputs = layer(inputs_t)
+        beta = layer.beta
+        gamma = layer.gamma
+        for weight in layer.weights:
+          self.evaluate(weight.initializer)
+        outputs = self.evaluate(outputs)
+        beta = self.evaluate(beta)
+        gamma = self.evaluate(gamma)
+
+        # The mean and variance of the output should be close to 0 and 1
+        # respectively.
+
+        # Make sure that there are no NaNs
+        self.assertFalse(np.isnan(outputs).any())
+        mean = np.mean(outputs, axis=moments_axis)
+        var = np.var(outputs, axis=moments_axis)
+        # Layer-norm implemented in numpy
+        eps = 1e-12
+        expected_out = (
+            (gamma * (inputs - np.mean(
+                inputs, axis=moments_axis, keepdims=True)) /
+             np.sqrt(eps + np.var(
+                 inputs, axis=moments_axis, keepdims=True))) + beta)
+        self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
+        self.assertAllClose(expected_var, var, atol=tol)
+        # The full computation gets a bigger tolerance
+        self.assertAllClose(expected_out, outputs, atol=5 * tol)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput2DInput(self):
+    self.doOutputTest((10, 300))
+    self.doOutputTest((10, 300), norm_axis=[0])
+    self.doOutputTest((10, 300), params_axis=[0, 1])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput2DInputDegenerateNormAxis(self):
+    with self.assertRaisesRegexp(ValueError, r'Invalid axis: 2'):
+      self.doOutputTest((10, 300), norm_axis=2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput4DInput(self):
+    self.doOutputTest((100, 10, 10, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput4DInputNormOnInnermostAxis(self):
+    # Equivalent tests
+    shape = (100, 10, 10, 3)
+    self.doOutputTest(
+        shape, norm_axis=list(range(3, len(shape))), tol=1e-4, dtype='float64')
+    self.doOutputTest(shape, norm_axis=-1, tol=1e-4, dtype='float64')
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInput(self):
+    self.doOutputTest((10, 10, 10, 30))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInputNormOnInnermostAxis(self):
+    self.doOutputTest((10, 10, 10, 30), norm_axis=3)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInputNormOnMixedAxes(self):
+    self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3])
+    self.doOutputTest((10, 10, 10, 30), params_axis=[-2, -1])
+    self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3],
+                      params_axis=[-3, -2, -1])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputBigInput(self):
+    self.doOutputTest((1, 100, 100, 1))
+    self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2])
+    self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2],
+                      params_axis=[-2, -1])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/pooling_test.py b/tensorflow/python/keras/layers/pooling_test.py
index 936e73ecf9dab86cb12a9e45499bf0e7599a0dc4..67df4d7a256c03b2c476c9b5d6ca1622870a6553 100644
--- a/tensorflow/python/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/layers/pooling_test.py
@@ -25,7 +25,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class GlobalPoolingTest(test.TestCase):
@@ -48,7 +47,7 @@ class GlobalPoolingTest(test.TestCase):
     model = keras.Sequential()
     model.add(keras.layers.Masking(mask_value=0., input_shape=(3, 4)))
     model.add(keras.layers.GlobalAveragePooling1D())
-    model.compile(loss='mae', optimizer=rmsprop.RMSPropOptimizer(0.001))
+    model.compile(loss='mae', optimizer='rmsprop')
 
     model_input = np.random.random((2, 3, 4))
     model_input[0, 1:, :] = 0
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 95257a085ed68d1db73447c42e1fe5a40bc95507..9404543ed9c2a4a48eaf40eb4190ac21be0e0d9d 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -468,7 +468,7 @@ class RNN(Layer):
     self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
+    if isinstance(cell, checkpointable.Checkpointable):
       self._track_checkpointable(self.cell, name='cell')
     self.return_sequences = return_sequences
     self.return_state = return_state
@@ -550,8 +550,12 @@ class RNN(Layer):
       return output_shape
 
   def compute_mask(self, inputs, mask):
-    if isinstance(mask, list):
-      mask = mask[0]
+    # Time step masks must be the same for each input.
+    # This is because the mask for an RNN is of size [batch, time_steps, 1],
+    # and specifies which time steps should be skipped, and a time step
+    # must be skipped for all inputs.
+    # TODO(scottzhu): Should we accept multiple different masks?
+    mask = nest.flatten(mask)[0]
     output_mask = mask if self.return_sequences else None
     if self.return_state:
       state_mask = [None for _ in self.states]
@@ -766,8 +770,10 @@ class RNN(Layer):
     inputs, initial_state, constants = self._process_inputs(
         inputs, initial_state, constants)
 
-    if isinstance(mask, list):
-      mask = mask[0]
+    if mask is not None:
+      # Time step masks must be the same for each input.
+      # TODO(scottzhu): Should we accept multiple different masks?
+      mask = nest.flatten(mask)[0]
 
     if nest.is_sequence(inputs):
       # In the case of nested input, use the first element for shape check.
@@ -3346,6 +3352,7 @@ def _standardize_args(
     # For either case, we will use num_inputs to split the input list, and
     # restructure the real input into tuple.
     assert initial_state is None and constants is None
+    inputs = nest.flatten(inputs)
     if num_constants is not None:
       constants = inputs[-num_constants:]
       inputs = inputs[:-num_constants]
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index e9bf788740fcb355a3cb2143ca4f0db1dcc8b802..ddea2f4eae49e0a1948ca2de151eaa5f74f6a378 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -40,7 +40,6 @@ from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 from tensorflow.python.training.checkpointable import util as checkpointable_util
 from tensorflow.python.util import nest
 
@@ -73,9 +72,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacking.
@@ -85,9 +85,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cells)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_non_layer_multiple_states(self):
@@ -114,9 +115,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacking.
@@ -128,9 +130,10 @@ class RNNTest(keras_parameterized.TestCase):
     self.assertEqual(layer.cell.output_size, 32)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_layer(self):
@@ -169,9 +172,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test basic case serialization.
@@ -194,9 +198,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cells)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacked RNN serialization.
@@ -234,9 +239,10 @@ class RNNTest(keras_parameterized.TestCase):
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -253,9 +259,10 @@ class RNNTest(keras_parameterized.TestCase):
 
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, cell_units[-1])))
@@ -269,9 +276,10 @@ class RNNTest(keras_parameterized.TestCase):
         units, time_major=True, return_sequences=True)(mask)
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(rnn)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -282,9 +290,10 @@ class RNNTest(keras_parameterized.TestCase):
     y = rnn_1(x)
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -357,9 +366,10 @@ class RNNTest(keras_parameterized.TestCase):
     y = layer(x, constants=c)
 
     model = keras.models.Model([x, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -396,9 +406,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.recurrent.RNN(cells)
     y = layer(x, constants=c)
     model = keras.models.Model([x, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -411,9 +422,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.recurrent.RNN(cells)
     y = layer(x, constants=c)
     model = keras.models.Model([x, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -484,9 +496,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x, initial_state=s, constants=c)
     model = keras.models.Model([x, s, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -615,9 +628,10 @@ class RNNTest(keras_parameterized.TestCase):
       layer = keras.layers.RNN(cell)
       y = layer(x)
       model = keras.models.Model(x, y)
-      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse',
-                    run_eagerly=testing_utils.should_run_eagerly())
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       # Test basic case serialization.
       x_np = np.random.random((6, 5, 5))
@@ -638,9 +652,10 @@ class RNNTest(keras_parameterized.TestCase):
       layer = keras.layers.RNN(cells)
       y = layer(x)
       model = keras.models.Model(x, y)
-      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse',
-                    run_eagerly=testing_utils.should_run_eagerly())
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       # Test stacked RNN serialization.
       x_np = np.random.random((6, 5, 5))
@@ -706,9 +721,10 @@ class RNNTest(keras_parameterized.TestCase):
     y = np.random.random((2, 2))
     model = keras.models.Sequential()
     model.add(rnn(2))
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(x, y, epochs=1, batch_size=1)
 
     # check whether the model variables are present in the
@@ -740,9 +756,10 @@ class RNNTest(keras_parameterized.TestCase):
                        [None, unit_a, unit_b])
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a, unit_b)))
@@ -757,9 +774,10 @@ class RNNTest(keras_parameterized.TestCase):
     layer = keras.layers.RNN(cells)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a * 4, unit_b * 4)))
@@ -781,9 +799,10 @@ class RNNTest(keras_parameterized.TestCase):
     y = layer(x, initial_state=s)
 
     model = keras.models.Model([x, s], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a, unit_b))
@@ -819,9 +838,10 @@ class RNNTest(keras_parameterized.TestCase):
                        [None, state_size])
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_size)),
         np.zeros((batch, input_size)))
@@ -875,9 +895,10 @@ class RNNTest(keras_parameterized.TestCase):
     self.assertEqual(outputs[1].shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model((input_1, input_2), outputs)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))],
         [np.zeros((batch, o1)), np.zeros((batch, o2, o3))])
@@ -897,9 +918,10 @@ class RNNTest(keras_parameterized.TestCase):
     self.assertEqual(outputs[1].shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model([input_1, input_2], outputs)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -926,9 +948,10 @@ class RNNTest(keras_parameterized.TestCase):
     self.assertEqual(s2.shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model([input_1, input_2], [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -951,9 +974,10 @@ class RNNTest(keras_parameterized.TestCase):
     self.assertEqual(s2.shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model([input_1, input_2], [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -985,9 +1009,10 @@ class RNNTest(keras_parameterized.TestCase):
 
     model = keras.models.Model([input_1, input_2, init_s1, init_s2],
                                [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3)),
@@ -1017,9 +1042,10 @@ class RNNTest(keras_parameterized.TestCase):
 
     model = keras.models.Model([input_1, input_2, init_s1, init_s2],
                                [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3)),
@@ -1093,9 +1119,10 @@ class RNNTest(keras_parameterized.TestCase):
     y, s = keras.layers.RNN(
         Cell(), return_state=True)(x_masked, initial_state=s_0)
     model = keras.models.Model([x, s_0], [y, s])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse',
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     # last time step masked
     x_np = np.array([[[1.], [2.], [0.]]])
@@ -1118,9 +1145,10 @@ class RNNTest(keras_parameterized.TestCase):
       masked_input = mask(x)
       y = layer(masked_input)
       model = keras.models.Model(x, y)
-      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse',
-                    run_eagerly=testing_utils.should_run_eagerly())
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       np_x = np.ones((6, 5, 5))
       result_1 = model.predict(np_x)
diff --git a/tensorflow/python/keras/layers/separable_convolutional_test.py b/tensorflow/python/keras/layers/separable_convolutional_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8234bfe704d84e0de6e7f60e33df31de5a800cc5
--- /dev/null
+++ b/tensorflow/python/keras/layers/separable_convolutional_test.py
@@ -0,0 +1,167 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for separable convolutional layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class SeparableConv1DTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    length = 7
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.SeparableConv1D,
+          kwargs=kwargs,
+          input_shape=(num_samples, length, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('padding_causal', {'padding': 'causal'}),
+      ('strides', {'strides': 2}),
+      ('dilation_rate', {'dilation_rate': 2}),
+      ('depth_multiplier', {'depth_multiplier': 2}),
+  )
+  def test_separable_conv1d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = 3
+    self._run_test(kwargs)
+
+  def test_separable_conv1d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv1d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+
+@keras_parameterized.run_all_keras_modes
+class SeparableConv2DTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.SeparableConv2D,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('strides', {'strides': 2}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('dilation_rate', {'dilation_rate': 2}),
+      ('depth_multiplier', {'depth_multiplier': 2}),
+  )
+  def test_separable_conv2d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = 3
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_separable_conv2d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv2d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
index 7c45e08b5c48084cc57569a4d1102a0a7c5b29e1..932cc58e036bbb25b7a14c42b3d01b7484ea17ab 100644
--- a/tensorflow/python/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -20,6 +20,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.engine.base_layer import TensorFlowOpLayer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.layers.advanced_activations import *
@@ -36,12 +37,15 @@ from tensorflow.python.keras.layers.pooling import *
 from tensorflow.python.keras.layers.recurrent import *
 from tensorflow.python.keras.layers.wrappers import *
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export('keras.layers.serialize')
 def serialize(layer):
   return {'class_name': layer.__class__.__name__, 'config': layer.get_config()}
 
 
+@keras_export('keras.layers.deserialize')
 def deserialize(config, custom_objects=None):
   """Instantiates a layer from a config dictionary.
 
@@ -51,10 +55,11 @@ def deserialize(config, custom_objects=None):
           of custom (non-Keras) objects to class/functions
 
   Returns:
-      Layer instance (may be Model, Sequential, Layer...)
+      Layer instance (may be Model, Sequential, Network, Layer...)
   """
   from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
   globs = globals()  # All layers.
+  globs['Network'] = models.Network
   globs['Model'] = models.Model
   globs['Sequential'] = models.Sequential
   return deserialize_keras_object(
diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
index b5063850f0cd56348ed477c598faef031c71ef8a..0ee074d19c5eb35bbdaea68de0f69676f3282ee5 100644
--- a/tensorflow/python/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -21,12 +21,11 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.eager import context
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 @keras_parameterized.run_all_keras_modes
@@ -51,7 +50,7 @@ class SimpleRNNLayerTest(keras_parameterized.TestCase):
     layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.01), 'mse')
+    model.compile('rmsprop', 'mse')
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
@@ -107,8 +106,7 @@ class SimpleRNNLayerTest(keras_parameterized.TestCase):
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     model.add(layer_class(units=5, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_SimpleRNN(self):
@@ -118,6 +116,28 @@ class SimpleRNNLayerTest(keras_parameterized.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
+  def test_regularizers_SimpleRNN(self):
+    embedding_dim = 4
+    layer_class = keras.layers.SimpleRNN
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -136,7 +156,7 @@ class SimpleRNNLayerTest(keras_parameterized.TestCase):
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
     model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                  loss='mse')
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
@@ -180,29 +200,5 @@ class SimpleRNNLayerTest(keras_parameterized.TestCase):
 
     np.testing.assert_allclose(out7, out6, atol=1e-5)
 
-
-class SimpleRNNLayerGraphOnlyTest(test.TestCase):
-
-  # b/120919032
-  @tf_test_util.run_deprecated_v1
-  def test_regularizers_SimpleRNN(self):
-    embedding_dim = 4
-    layer_class = keras.layers.SimpleRNN
-    layer = layer_class(
-        5,
-        return_sequences=False,
-        weights=None,
-        input_shape=(None, embedding_dim),
-        kernel_regularizer=keras.regularizers.l1(0.01),
-        recurrent_regularizer=keras.regularizers.l1(0.01),
-        bias_regularizer='l2',
-        activity_regularizer='l1')
-    layer.build((None, None, 2))
-    self.assertEqual(len(layer.losses), 3)
-
-    x = keras.backend.variable(np.ones((2, 3, 2)))
-    layer(x)
-    self.assertEqual(len(layer.get_losses_for(x)), 1)
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/unified_gru_test.py
index 6e77acc20a8697f3e21930fa935a4dc25ea12d65..8259643d7cc2493855cc5cce45ceb9b1665e9572 100644
--- a/tensorflow/python/keras/layers/unified_gru_test.py
+++ b/tensorflow/python/keras/layers/unified_gru_test.py
@@ -137,37 +137,6 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
-
-class GRULayerGradientTapeTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_in_tape(self):
-    if not context.executing_eagerly():
-      self.skipTest('bloo')
-    time_steps = 10
-    embedding_size = 11
-    gru_unit_size = 12
-
-    gru = keras.layers.UnifiedGRU(gru_unit_size,
-                                  return_sequences=True,
-                                  return_state=True,
-                                  recurrent_activation='sigmoid',
-                                  recurrent_initializer='glorot_uniform')
-
-    x = random_ops.random_uniform([1, time_steps, embedding_size])
-    y = random_ops.random_uniform([1, gru_unit_size])
-
-    with backprop.GradientTape() as tape:
-      hidden_state = array_ops.zeros([1, gru_unit_size], dtype=dtypes.float32)
-      _, state = gru(x, initial_state=hidden_state)
-
-      loss = math_ops.reduce_mean(math_ops.square(state - y))
-
-    tape.gradient(loss, gru.variables)
-
-
-class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
-
   def test_unified_gru_feature_parity_with_canonical_gru(self):
     with context.eager_mode():
       # Run this test under eager only due to b/120160788 for model.set_weights.
@@ -207,8 +176,8 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
       cudnn_model.fit(x_train, y_train)
       y_4 = cudnn_model.predict(x_train)
 
-      self.assertAllClose(y_1, y_3)
-      self.assertAllClose(y_2, y_4)
+      self.assertAllClose(y_1, y_3, rtol=2e-5, atol=2e-5)
+      self.assertAllClose(y_2, y_4, rtol=2e-5, atol=2e-5)
 
   @parameterized.named_parameters(
       # test_name, use_bias, bias_initializer, activation
@@ -216,7 +185,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
       ('no_bias', False, 'zeros'),
       ('random_bias', True, 'random_uniform'),
   )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_unified_gru_model_save_load(self, use_bias, bias_initializer):
     temp_dir = self.get_temp_dir()
     self.addCleanup(shutil.rmtree, temp_dir)
@@ -250,7 +218,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose(y, y_ref)
     self.assertAllClose(layer.get_weights(), new_layer.get_weights())
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_unified_gru_output_on_multiple_kernel(self):
     input_shape = 10
     rnn_state_size = 8
@@ -297,7 +264,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
       ('go_backwards', False, True),
       ('both', True, True),
   )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_time_major_and_go_backward(self, time_major, go_backwards):
     input_shape = 10
     rnn_state_size = 8
@@ -335,7 +301,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllClose(y, y_ref)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_with_masking_layer_GRU(self):
     layer_class = keras.layers.UnifiedGRU
     inputs = np.random.random((2, 3, 4))
@@ -348,7 +313,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
                   optimizer=gradient_descent.GradientDescentOptimizer(0.001))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_masking_with_stacking_GRU(self):
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
@@ -362,7 +326,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_return_sequences_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -374,7 +337,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
                 'return_sequences': True},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_dropout_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -387,7 +349,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_constraints_GRU(self):
     embedding_dim = 4
     layer_class = keras.layers.UnifiedGRU
@@ -408,7 +369,6 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
   @parameterized.parameters([0, 1, 2])
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_implementation_mode_GRU(self, implementation_mode):
     num_samples = 2
     timesteps = 3
@@ -420,8 +380,28 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
                 'implementation': implementation_mode},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @test_util.run_v1_only("b/120941292")
-  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_regularizers_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedGRU
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
   def test_statefulness_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -439,7 +419,8 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     layer = layer_class(
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
-    model.compile(optimizer='sgd', loss='mse')
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
@@ -484,6 +465,34 @@ class GRULayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     np.testing.assert_allclose(out7, out6, atol=1e-5)
 
 
+class GRULayerGradientTapeTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_in_tape(self):
+    if not context.executing_eagerly():
+      self.skipTest('bloo')
+    time_steps = 10
+    embedding_size = 11
+    gru_unit_size = 12
+
+    gru = keras.layers.UnifiedGRU(gru_unit_size,
+                                  return_sequences=True,
+                                  return_state=True,
+                                  recurrent_activation='sigmoid',
+                                  recurrent_initializer='glorot_uniform')
+
+    x = random_ops.random_uniform([1, time_steps, embedding_size])
+    y = random_ops.random_uniform([1, gru_unit_size])
+
+    with backprop.GradientTape() as tape:
+      hidden_state = array_ops.zeros([1, gru_unit_size], dtype=dtypes.float32)
+      _, state = gru(x, initial_state=hidden_state)
+
+      loss = math_ops.reduce_mean(math_ops.square(state - y))
+
+    tape.gradient(loss, gru.variables)
+
+
 class GRULayerGraphOnlyTest(test.TestCase):
 
   # Need session for test
@@ -591,28 +600,6 @@ class GRULayerGraphOnlyTest(test.TestCase):
         self.assertNotEqual(existing_loss, loss_value)
         existing_loss = loss_value
 
-  # b/120919032
-  @test_util.run_deprecated_v1
-  def test_regularizers_GRU(self):
-    embedding_dim = 4
-    layer_class = keras.layers.UnifiedGRU
-    with self.cached_session(config=_config):
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
index 15086a53a8d029f2fd584311609b05168a4e0c13..375894b166215ed7068767eed095fec2f60963ca 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -296,152 +296,6 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     targets = np.random.random((num_samples, units))
     model.train_on_batch([main_inputs] + initial_state, targets)
 
-
-class LSTMLayerGraphOnlyTest(test.TestCase):
-
-  # Need session for test
-  @test_util.run_deprecated_v1
-  def test_unifiedLSTM(self):
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 1
-
-    with self.cached_session(config=_config, use_gpu=True) as sess:
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=batch,
-          test_samples=0,
-          input_shape=(timestep, input_shape),
-          num_classes=output_shape)
-      y_train = keras.utils.to_categorical(y_train, output_shape)
-
-      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
-
-      inputs = array_ops.placeholder(
-          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
-      predict = array_ops.placeholder(
-          dtypes.float32, shape=(None, output_shape), name='predict')
-
-      outputs, runtime = layer(inputs)
-      loss = losses.softmax_cross_entropy(predict, outputs)
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      train_op = optimizer.minimize(loss)
-
-      sess.run([variables.global_variables_initializer()])
-      existing_loss = 0
-      for _ in range(epoch):
-        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
-            inputs: x_train,
-            predict: y_train
-        })
-        if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
-        else:
-          self.assertEqual(runtime_value, b'cpu')
-        # Make sure the loss is updated for every epoch
-        # (layer weights properly updated).
-        self.assertNotEqual(existing_loss, loss_value)
-        existing_loss = loss_value
-
-  # Need session for test
-  @test_util.run_deprecated_v1
-  def test_unifiedLSTM_with_cond(self):
-    # This test is to demonstrate the graph rewrite of grappler plugin under
-    # the condition that the function returns different number of internal
-    # states.
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 1
-
-    with self.cached_session(config=_config, use_gpu=True) as sess:
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=batch,
-          test_samples=0,
-          input_shape=(timestep, input_shape),
-          num_classes=output_shape)
-      y_train = keras.utils.to_categorical(y_train, output_shape)
-
-      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
-
-      inputs = array_ops.placeholder(
-          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
-      predict = array_ops.placeholder(
-          dtypes.float32, shape=(None, output_shape), name='predict')
-
-      zeros = array_ops.zeros([batch, output_shape])
-      dummy_runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
-      a = constant_op.constant(0)
-      b = constant_op.constant(1)
-      # Will always run the lstm layer.
-      outputs, runtime = control_flow_ops.cond(
-          gen_math_ops.less(a, b),
-          lambda: layer(inputs),
-          lambda: (zeros, dummy_runtime))
-      loss = losses.softmax_cross_entropy(predict, outputs)
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      train_op = optimizer.minimize(loss)
-
-      sess.run([variables.global_variables_initializer()])
-      existing_loss = 0
-
-      for _ in range(epoch):
-        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
-            inputs: x_train,
-            predict: y_train
-        })
-        if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
-        else:
-          self.assertEqual(runtime_value, b'cpu')
-        # Make sure the loss is updated for every epoch
-        # (layer weights properly updated).
-        self.assertNotEqual(existing_loss, loss_value)
-        existing_loss = loss_value
-
-  # b/120919032
-  @test_util.run_deprecated_v1
-  def test_regularizers_LSTM(self):
-    embedding_dim = 4
-    layer_class = keras.layers.UnifiedLSTM
-    layer = layer_class(
-        5,
-        return_sequences=False,
-        weights=None,
-        input_shape=(None, embedding_dim),
-        kernel_regularizer=keras.regularizers.l1(0.01),
-        recurrent_regularizer=keras.regularizers.l1(0.01),
-        bias_regularizer='l2',
-        activity_regularizer='l1')
-    layer.build((None, None, 2))
-    self.assertEqual(len(layer.losses), 3)
-    x = keras.backend.variable(np.ones((2, 3, 2)))
-    layer(x)
-    self.assertEqual(len(layer.get_losses_for(x)), 1)
-
-
-class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_dropout_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
-        kwargs={
-            'units': units,
-            'dropout': 0.1,
-            'recurrent_dropout': 0.1
-        },
-        input_shape=(num_samples, timesteps, embedding_dim))
-
   def test_unified_lstm_feature_parity_with_canonical_lstm(self):
     with context.eager_mode():
       # Run this test under eager only due to b/120160788 for model.set_weights.
@@ -478,11 +332,10 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
       cudnn_model.fit(x_train, y_train)
       y_4 = cudnn_model.predict(x_train)
 
-      self.assertAllClose(y_1, y_3)
-      self.assertAllClose(y_2, y_4)
+      self.assertAllClose(y_1, y_3, rtol=1e-5, atol=1e-5)
+      self.assertAllClose(y_2, y_4, rtol=1e-5, atol=1e-5)
 
   @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2))
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_implementation_mode_LSTM(self, implementation_mode):
     num_samples = 2
     timesteps = 3
@@ -525,7 +378,6 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_masking_with_stacking_LSTM(self):
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
@@ -546,7 +398,6 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
       ('go_backwards', False, True),
       ('both', True, True),
   )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_time_major_and_go_backward(self, time_major, go_backwards):
     input_shape = 10
     rnn_state_size = 8
@@ -615,7 +466,6 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
       ('no_bias', False, 'zeros'),
       ('random_bias', True, 'random_uniform'),
   )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_unified_lstm_model_save_load(self, use_bias, bias_initializer):
     temp_dir = self.get_temp_dir()
     self.addCleanup(shutil.rmtree, temp_dir)
@@ -649,7 +499,6 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose(y, y_ref)
     self.assertAllClose(layer.get_weights(), new_layer.get_weights())
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_unified_lstm_output_on_multiple_kernel(self):
     input_shape = 10
     rnn_state_size = 8
@@ -688,7 +537,6 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose(y_1, y_2)
     self.assertAllClose(y_2, y_3)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_return_sequences_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -702,9 +550,27 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
         },
         input_shape=(num_samples, timesteps, embedding_dim))
 
+  def test_regularizers_LSTM(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedLSTM
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-  @test_util.run_v1_only("b/120941292")
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_statefulness_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -722,8 +588,8 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     layer = layer_class(
         units, return_sequences=False, stateful=True, weights=None)
     model.add(layer)
-    model.compile(
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01), loss='mse')
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
     out1 = model.predict(np.ones((num_samples, timesteps)))
     self.assertEqual(out1.shape, (num_samples, units))
 
@@ -768,6 +634,132 @@ class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose(out7, out6, atol=1e-5)
 
 
+class LSTMLayerGraphOnlyTest(test.TestCase):
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedLSTM(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedLSTM_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the lstm layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+
+class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dropout_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'dropout': 0.1,
+            'recurrent_dropout': 0.1
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+
 class UnifiedLSTMPerformanceTest(test.Benchmark):
 
   def _measure_performance(self, test_config, model, x_train, y_train):
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 46d5487b2c00fa3177a595774dc7ce8d40655f2e..f3aa5c4d684592d1b276cac47d31f8d1ba06d600 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import util as checkpointable_util
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class _RNNCellWithConstants(keras.layers.Layer):
@@ -77,7 +76,7 @@ class TimeDistributedTest(test.TestCase):
     model.add(
         keras.layers.TimeDistributed(
             keras.layers.Dense(2), input_shape=(3, 4)))
-    model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+    model.compile(optimizer='rmsprop', loss='mse')
     model.fit(
         np.random.random((10, 3, 4)),
         np.random.random((10, 3, 2)),
@@ -98,7 +97,7 @@ class TimeDistributedTest(test.TestCase):
     model.add(
         keras.layers.TimeDistributed(
             keras.layers.Dense(2), input_shape=(3, 4), batch_size=10))
-    model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+    model.compile(optimizer='rmsprop', loss='mse')
     model.fit(
         np.random.random((10, 3, 4)),
         np.random.random((10, 3, 2)),
@@ -159,8 +158,8 @@ class TimeDistributedTest(test.TestCase):
       # test layers that need learning_phase to be set
       np.random.seed(1234)
       x = keras.layers.Input(shape=(3, 2))
-      y = keras.layers.TimeDistributed(
-          keras.layers.Dropout(.999))(x, training=True)
+      y = keras.layers.TimeDistributed(keras.layers.Dropout(.999))(
+          x, training=True)
       model = keras.models.Model(x, y)
       y = model.predict(np.random.random((10, 3, 2)))
       self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
@@ -277,7 +276,7 @@ class BidirectionalTest(test.TestCase):
         model.add(
             keras.layers.Bidirectional(
                 rnn(output_dim), merge_mode=mode, input_shape=(timesteps, dim)))
-        model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+        model.compile(optimizer='rmsprop', loss='mse')
         model.fit(x, y, epochs=1, batch_size=1)
 
         # check whether the model variables are present in the
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index bca0cb0e4d5fd446272bff4093c1fbbc34dd2db1..9fbd28fcf3b4fcb43fc1ed5305743d933448505c 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -24,6 +24,7 @@ import abc
 import six
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
@@ -33,8 +34,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import keras_export
+from tensorflow.tools.docs import doc_controls
 
 
+@keras_export('keras.losses.Loss')
 class Loss(object):
   """Loss base class.
 
@@ -85,7 +88,10 @@ class Loss(object):
     Raises:
       ValueError: If the shape of `sample_weight` is invalid.
     """
-    with ops.name_scope(self.name, format(self.__class__.__name__),
+    # If we are wrapping a lambda function strip '<>' from the name as it is not
+    # accepted in scope name.
+    scope_name = 'lambda' if self.name == '<lambda>' else self.name
+    with ops.name_scope(scope_name, format(self.__class__.__name__),
                         (y_pred, y_true, sample_weight)):
       losses = self.call(y_true, y_pred)
       return compute_weighted_loss(
@@ -107,6 +113,7 @@ class Loss(object):
     return {'reduction': self.reduction, 'name': self.name}
 
   @abc.abstractmethod
+  @doc_controls.for_subclass_implementers
   def call(self, y_true, y_pred):
     """Invokes the `Loss` instance.
 
@@ -117,6 +124,48 @@ class Loss(object):
     NotImplementedError('Must be implemented in subclasses.')
 
 
+class LossFunctionWrapper(Loss):
+  """Wraps a loss function in the `Loss` class.
+
+  Args:
+    fn: The loss function to wrap, with signature `fn(y_true, y_pred,
+      **kwargs)`.
+    reduction: (Optional) Type of `tf.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: (Optional) name for the loss.
+    **kwargs: The keyword arguments that are passed on to `fn`.
+  """
+
+  def __init__(self,
+               fn,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None,
+               **kwargs):
+    super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name)
+    self.fn = fn
+    self._fn_kwargs = kwargs
+
+  def call(self, y_true, y_pred):
+    """Invokes the `LossFunctionWrapper` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Loss values per sample.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return self.fn(y_true, y_pred, **self._fn_kwargs)
+
+  def get_config(self):
+    config = {'fn': self.fn}
+    config.update(self._fn_kwargs)
+    base_config = super(LossFunctionWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @keras_export('keras.losses.MeanSquaredError')
 class MeanSquaredError(Loss):
   """Computes the mean of squares of errors between labels and predictions.
@@ -271,7 +320,14 @@ class MeanSquaredLogarithmicError(Loss):
 
 @keras_export('keras.losses.BinaryCrossentropy')
 class BinaryCrossentropy(Loss):
-  """Computes the binary cross entropy loss between the labels and predictions.
+  """Computes the crossentropy loss between the labels and predictions.
+
+  Use this crossentropy loss function when there are only two label classes
+  (assumed to be 0 and 1). There should be a single floating point value per
+  feature.
+
+  In the snippet below, there is a single floating pointing value per example,
+  and the shape of both `y_pred` and `y_true` are `[batch_size]`.
 
   Usage:
 
@@ -286,12 +342,12 @@ class BinaryCrossentropy(Loss):
   ```python
   model = keras.models.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
-  ````
+  ```
 
   Args:
-    from_logits: Whether `output` is expected to be a logits tensor. By default,
-      we consider that `output` encodes a probability distribution.
-    label_smoothing: If greater than `0` then smooth the labels.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
     reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
       `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
@@ -304,7 +360,8 @@ class BinaryCrossentropy(Loss):
                name=None):
     super(BinaryCrossentropy, self).__init__(reduction=reduction, name=name)
     self.from_logits = from_logits
-    self.label_smoothing = label_smoothing
+    self.label_smoothing = ops.convert_to_tensor(
+        label_smoothing, dtype=K.floatx())
 
   def call(self, y_true, y_pred):
     """Invokes the `BinaryCrossentropy` instance.
@@ -318,16 +375,25 @@ class BinaryCrossentropy(Loss):
     """
     y_pred = ops.convert_to_tensor(y_pred)
     y_true = math_ops.cast(y_true, y_pred.dtype)
-
-    if self.label_smoothing > 0:
-      y_true = y_true * (1 - self.label_smoothing) + 0.5 * self.label_smoothing
-
-    return binary_crossentropy(y_true, y_pred, from_logits=self.from_logits)
+    return binary_crossentropy(
+        y_true,
+        y_pred,
+        from_logits=self.from_logits,
+        label_smoothing=self.label_smoothing)
 
 
 @keras_export('keras.losses.CategoricalCrossentropy')
 class CategoricalCrossentropy(Loss):
-  """Computes categorical cross entropy loss between the `y_true` and `y_pred`.
+  """Computes the crossentropy loss between the labels and predictions.
+
+  Use this crossentropy loss function when there are two or more label classes.
+  We expect labels to be provided in a `one_hot` representation. If you want to
+  provide labels as integers, please use `SparseCategoricalCrossentropy` loss.
+  There should be `# classes` floating point values per feature.
+
+  In the snippet below, there is `# classes` floating pointing values per
+  example. The shape of both `y_pred` and `y_true` are
+  `[batch_size, num_classes]`.
 
   Usage:
 
@@ -344,13 +410,15 @@ class CategoricalCrossentropy(Loss):
   ```python
   model = keras.models.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
-  ````
+  ```
 
   Args:
-    from_logits: Whether `output` is expected to be a logits tensor. By default,
-      we consider that `output` encodes a probability distribution.
-    label_smoothing: If greater than `0` then smooth the labels. This option is
-      currently not supported when `y_pred` is a sparse input (not one-hot).
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+      meaning the confidence on label values are relaxed. e.g.
+      `label_smoothing=0.2` means that we will use a value of `0.1` for label
+      `0` and `0.9` for label `1`"
     reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
       `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
@@ -364,7 +432,8 @@ class CategoricalCrossentropy(Loss):
     super(CategoricalCrossentropy, self).__init__(
         reduction=reduction, name=name)
     self.from_logits = from_logits
-    self.label_smoothing = label_smoothing
+    self.label_smoothing = ops.convert_to_tensor(
+        label_smoothing, dtype=K.floatx())
 
   def call(self, y_true, y_pred):
     """Invokes the `CategoricalCrossentropy` instance.
@@ -377,22 +446,341 @@ class CategoricalCrossentropy(Loss):
       Categorical cross entropy losses.
     """
     y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return categorical_crossentropy(
+        y_true,
+        y_pred,
+        from_logits=self.from_logits,
+        label_smoothing=self.label_smoothing)
+
+
+@keras_export('keras.losses.SparseCategoricalCrossentropy')
+class SparseCategoricalCrossentropy(Loss):
+  """Computes the crossentropy loss between the labels and predictions.
+
+  Use this crossentropy loss function when there are two or more label classes.
+  We expect labels to be provided as integers. If you want to provide labels
+  using `one-hot` representation, please use `CategoricalCrossentropy` loss.
+  There should be `# classes` floating point values per feature for `y_pred`
+  and a single floating point value per feature for `y_true`.
+
+  In the snippet below, there is a single floating point value per example for
+  `y_true` and `# classes` floating pointing values per example for `y_pred`.
+  The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
+  `[batch_size, num_classes]`.
+
+  Usage:
+
+  ```python
+  cce = tf.keras.losses.SparseCategoricalCrossentropy()
+  loss = cce(
+    [0, 1, 2],
+    [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])
+  print('Loss: ', loss.numpy())  # Loss: 0.3239
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.SparseCategoricalCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(SparseCategoricalCrossentropy, self).__init__(
+        reduction=reduction, name=name)
+    self.from_logits = from_logits
+
+  def call(self, y_true, y_pred):
+    """Invokes the `SparseCategoricalCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Sparse categorical cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
     y_true = ops.convert_to_tensor(y_true)
-    is_sparse = y_pred.shape != y_true.shape
+    return sparse_categorical_crossentropy(
+        y_true, y_pred, from_logits=self.from_logits)
+
+
+@keras_export('keras.losses.Hinge')
+class Hinge(Loss):
+  """Computes the hinge loss between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  h = tf.losses.Hinge()
+  loss = h([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.Hinge())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Calculates the hinge loss.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Hinge loss.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return hinge(y_true, y_pred)
 
-    if is_sparse:
-      return sparse_categorical_crossentropy(
-          y_true, y_pred, from_logits=self.from_logits)
-    else:
-      y_true = math_ops.cast(y_true, y_pred.dtype)
-      if self.label_smoothing > 0:
-        num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
-        smooth_positives = 1.0 - self.label_smoothing
-        smooth_negatives = self.label_smoothing / num_classes
-        y_true = y_true * smooth_positives + smooth_negatives
 
-      return categorical_crossentropy(
-          y_true, y_pred, from_logits=self.from_logits)
+@keras_export('keras.losses.SquaredHinge')
+class SquaredHinge(Loss):
+  """Computes the squared hinge loss between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  sh = tf.losses.SquaredHinge()
+  loss = sh([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.SquaredHinge())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Calculates the squared hinge loss.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Squared hinge loss.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return squared_hinge(y_true, y_pred)
+
+
+@keras_export('keras.losses.CategoricalHinge')
+class CategoricalHinge(Loss):
+  """Computes the categorical hinge loss between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  ch = tf.losses.CategoricalHinge()
+  loss = ch([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.CategoricalHinge())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Calculates the categorical hinge loss.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Categorical hinge loss.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return categorical_hinge(y_true, y_pred)
+
+
+@keras_export('keras.losses.LogLoss')
+class LogLoss(Loss):
+  """Computes the log loss between `y_true` and `y_pred`.
+
+  `logloss = - y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)`
+
+  Usage:
+
+  ```python
+  l = tf.losses.LogLoss()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 10.745
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.LogLoss())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return logloss(y_true, y_pred)
+
+
+@keras_export('keras.losses.Poisson')
+class Poisson(Loss):
+  """Computes the Poisson loss between `y_true` and `y_pred`.
+
+  `loss = y_pred - y_true * log(y_pred)`
+
+  Usage:
+
+  ```python
+  p = tf.losses.Poisson()
+  loss = p([1, 9, 2], [4, 8, 12])
+  print('Loss: ', loss.numpy())  # Loss: -4.63
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.Poisson())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return poisson(y_true, y_pred)
+
+
+@keras_export('keras.losses.LogCosh')
+class LogCosh(Loss):
+  """Computes the logarithm of the hyperbolic cosine of the prediction error.
+
+  `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true)
+
+  Usage:
+
+  ```python
+  l = tf.losses.LogCosh()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.289
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.LogCosh())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return logcosh(y_true, y_pred)
+
+
+@keras_export('keras.losses.KLDivergence')
+class KLDivergence(Loss):
+  """Computes Kullback Leibler divergence loss between `y_true` and `y_pred`.
+
+  `loss = y_true * log(y_true / y_pred)`
+
+  Usage:
+
+  ```python
+  k = tf.losses.KLDivergence()
+  loss = k([.4, .9, .2], [.5, .8, .12])
+  print('Loss: ', loss.numpy())  # Loss: -0.043
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.KLDivergence())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return kullback_leibler_divergence(y_true, y_pred)
+
+
+@keras_export('keras.losses.Huber')
+class Huber(Loss):
+  """Computes the Huber loss between `y_true` and `y_pred`.
+
+  For each value x in `error=y_true-y_pred`, the following is calculated:
+
+  ```
+  0.5 * x^2                  if |x| <= d
+  0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
+  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+  Usage:
+
+  ```python
+  l = tf.losses.Huber()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.333
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.Huber())
+  ```
+
+  Args:
+    delta: A float, the point where the Huber loss function changes from a
+      quadratic to linear.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               delta=1.0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(Huber, self).__init__(reduction=reduction, name=name)
+    self.delta = delta
+
+  def call(self, y_true, y_pred):
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return huber_loss(y_true, y_pred, delta=self.delta)
 
 
 @keras_export('keras.metrics.mean_squared_error',
@@ -402,7 +790,7 @@ class CategoricalCrossentropy(Loss):
               'keras.losses.mse',
               'keras.losses.MSE')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.squared_difference(y_pred, y_true), axis=-1)
 
 
 @keras_export('keras.metrics.mean_absolute_error',
@@ -436,7 +824,7 @@ def mean_absolute_percentage_error(y_true, y_pred):
 def mean_squared_logarithmic_error(y_true, y_pred):
   first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
   second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(math_ops.square(first_log - second_log), axis=-1)
+  return K.mean(math_ops.squared_difference(first_log, second_log), axis=-1)
 
 
 @keras_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
@@ -457,6 +845,46 @@ def categorical_hinge(y_true, y_pred):
   return math_ops.maximum(0., neg - pos + 1.)
 
 
+def logloss(y_true, y_pred):
+  losses = math_ops.multiply(y_true, math_ops.log(y_pred + K.epsilon()))
+  losses += math_ops.multiply((1 - y_true),
+                              math_ops.log(1 - y_pred + K.epsilon()))
+  return K.mean(-losses, axis=-1)
+
+
+def huber_loss(y_true, y_pred, delta=1.0):
+  """Computes Huber loss value.
+
+  For each value x in `error=y_true-y_pred`, the following is calculated:
+
+  ```
+  0.5 * x^2                  if |x| <= d
+  0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
+  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+  Args:
+    y_true: tensor of true targets.
+    y_pred: tensor of predicted targets.
+    delta: A float, the point where the Huber loss function changes from a
+      quadratic to linear.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = math_ops.cast(y_pred, dtype=K.floatx())
+  y_true = math_ops.cast(y_true, dtype=K.floatx())
+  error = math_ops.subtract(y_pred, y_true)
+  abs_error = math_ops.abs(error)
+  quadratic = math_ops.minimum(abs_error, delta)
+  linear = math_ops.subtract(abs_error, quadratic)
+  return math_ops.add(
+      math_ops.multiply(
+          ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
+          math_ops.multiply(quadratic, quadratic)),
+      math_ops.multiply(delta, linear))
+
+
 @keras_export('keras.losses.logcosh')
 def logcosh(y_true, y_pred):
   """Logarithm of the hyperbolic cosine of the prediction error.
@@ -482,7 +910,29 @@ def logcosh(y_true, y_pred):
 
 @keras_export('keras.metrics.categorical_crossentropy',
               'keras.losses.categorical_crossentropy')
-def categorical_crossentropy(y_true, y_pred, from_logits=False):
+def categorical_crossentropy(y_true,
+                             y_pred,
+                             from_logits=False,
+                             label_smoothing=0):
+  """Computes the categorical crossentropy loss.
+
+  Args:
+    y_true: tensor of true targets.
+    y_pred: tensor of predicted targets.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
+
+  Returns:
+    Categorical crossentropy loss value.
+  """
+
+  def _smooth_labels():
+    num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
+    return y_true * (1.0 - label_smoothing) + (label_smoothing / num_classes)
+
+  y_true = smart_cond.smart_cond(label_smoothing,
+                                 _smooth_labels, lambda: y_true)
   return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
 
 
@@ -495,7 +945,13 @@ def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False):
 
 @keras_export('keras.metrics.binary_crossentropy',
               'keras.losses.binary_crossentropy')
-def binary_crossentropy(y_true, y_pred, from_logits=False):
+def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):
+
+  def _smooth_labels():
+    return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing
+
+  y_true = smart_cond.smart_cond(label_smoothing,
+                                 _smooth_labels, lambda: y_true)
   return K.mean(
       K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
 
@@ -521,14 +977,15 @@ def poisson(y_true, y_pred):
               'keras.metrics.cosine',
               'keras.losses.cosine_proximity',
               'keras.losses.cosine')
-def cosine_proximity(y_true, y_pred):
-  y_true = nn.l2_normalize(y_true, axis=-1)
-  y_pred = nn.l2_normalize(y_pred, axis=-1)
-  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
+def cosine_proximity(y_true, y_pred, axis=-1):
+  y_true = nn.l2_normalize(y_true, axis=axis)
+  y_pred = nn.l2_normalize(y_pred, axis=axis)
+  return -math_ops.reduce_sum(y_true * y_pred, axis=axis)
 
 
+@keras_export('keras.losses.CosineProximity')
 class CosineProximity(Loss):
-  """Computes the cosine distance between `y_true` and `y_pred`.
+  """Computes the cosine proximity between `y_true` and `y_pred`.
 
   Usage:
 
@@ -544,8 +1001,22 @@ class CosineProximity(Loss):
   model = keras.models.Model(inputs, outputs)
   model.compile('sgd', loss=tf.losses.CosineProximity())
   ```
+
+  Args:
+    axis: (Optional) Defaults to -1. The dimension along which the cosine
+      proximity is computed.
+    reduction: (Optional) Type of `tf.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
   """
 
+  def __init__(self,
+               axis=-1,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(CosineProximity, self).__init__(reduction=reduction, name=name)
+    self.axis = axis
+
   def call(self, y_true, y_pred):
     """Calculates the cosine proximity loss.
 
@@ -558,7 +1029,7 @@ class CosineProximity(Loss):
     """
     y_pred = ops.convert_to_tensor(y_pred)
     y_true = math_ops.cast(y_true, y_pred.dtype)
-    return cosine_proximity(y_true, y_pred)
+    return cosine_proximity(y_true, y_pred, axis=self.axis)
 
 
 # Aliases.
@@ -571,6 +1042,16 @@ kld = KLD = kullback_leibler_divergence
 cosine = cosine_proximity
 
 
+def is_categorical_crossentropy(loss):
+  result = ((isinstance(loss, CategoricalCrossentropy) or
+             (isinstance(loss, LossFunctionWrapper) and
+              loss.fn == categorical_crossentropy) or
+             (hasattr(loss, '__name__') and
+              loss.__name__ == 'categorical_crossentropy') or
+             (loss == 'categorical_crossentropy')))
+  return result
+
+
 @keras_export('keras.losses.serialize')
 def serialize(loss):
   return serialize_keras_object(loss)
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index bc040fb685759ef20b698642dd9becb303562e73..04dd71241eca265dbc111581bc3646838e29964c 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -27,7 +27,6 @@ from tensorflow.python import keras
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 
@@ -57,7 +56,7 @@ class _MSEMAELoss(object):
   def __init__(self, mse_fraction):
     self.mse_fraction = mse_fraction
 
-  def __call__(self, y_true, y_pred):
+  def __call__(self, y_true, y_pred, sample_weight=None):
     return (self.mse_fraction * keras.losses.mse(y_true, y_pred) +
             (1 - self.mse_fraction) * keras.losses.mae(y_true, y_pred))
 
@@ -181,6 +180,25 @@ class KerasLossesTest(test.TestCase):
         loaded_model = keras.models.load_model(model_filename)
         loaded_model.predict(np.random.rand(128, 2))
 
+  def test_loss_wrapper(self):
+    loss_fn = keras.losses.get('mse')
+    mse_obj = keras.losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
+
+    self.assertEqual(mse_obj.name, 'mean_squared_error')
+    self.assertEqual(mse_obj.reduction,
+                     losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE)
+
+    y_true = constant_op.constant([[1., 9.], [2., 5.]])
+    y_pred = constant_op.constant([[4., 8.], [12., 3.]])
+    sample_weight = constant_op.constant([1.2, 0.5])
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2]
+    # mse = [5, 52]
+    # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26]
+    # reduced_weighted_mse = (6 + 26) / 2 =
+    self.assertAllClose(self.evaluate(loss), 16, 1e-2)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class MeanSquaredErrorTest(test.TestCase):
@@ -485,59 +503,88 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 @test_util.run_all_in_graph_and_eager_modes
 class CosineProximityTest(test.TestCase):
 
+  def l2_norm(self, x, axis):
+    epsilon = 1e-12
+    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
+    x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon))
+    return np.multiply(x, x_inv_norm)
+
+  def setup(self, axis=1):
+    self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32)
+    self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32)
+
+    y_true = self.l2_norm(self.np_y_true, axis)
+    y_pred = self.l2_norm(self.np_y_pred, axis)
+    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+
+    self.y_true = constant_op.constant(self.np_y_true)
+    self.y_pred = constant_op.constant(self.np_y_pred)
+
   def test_config(self):
     cosine_obj = keras.losses.CosineProximity(
-        reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+        axis=2, reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
     self.assertEqual(cosine_obj.name, 'cosine_loss')
     self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cosine_obj.axis, 2)
 
   def test_unweighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), -0.18722, 3)
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_scalar_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), -0.43060, 3)
+    sample_weight = 2.3
+    loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = np.mean(self.expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_sample_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
-    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 0.15599, 3)
+    sample_weight = np.asarray([1.2, 3.4])
+    loss = cosine_obj(
+        self.y_true,
+        self.y_pred,
+        sample_weight=constant_op.constant(sample_weight))
+    expected_loss = np.mean(self.expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_timestep_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3, 1),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
-    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), -2.0000, 3)
+    np_y_true = self.np_y_true.reshape((2, 3, 1))
+    np_y_pred = self.np_y_pred.reshape((2, 3, 1))
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3))
+
+    y_true = self.l2_norm(np_y_true, 2)
+    y_pred = self.l2_norm(np_y_pred, 2)
+    expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(2,))
+
+    y_true = constant_op.constant(np_y_true)
+    y_pred = constant_op.constant(np_y_pred)
+    loss = cosine_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+
+    expected_loss = np.mean(expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_zero_weighted(self):
+    self.setup()
     cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred, sample_weight=0)
+    loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
+  def test_axis(self):
+    self.setup(axis=1)
+    cosine_obj = keras.losses.CosineProximity(axis=1)
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class BinaryCrossentropyTest(test.TestCase):
@@ -564,74 +611,132 @@ class BinaryCrossentropyTest(test.TestCase):
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
   def test_unweighted(self):
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
     bce_obj = keras.losses.BinaryCrossentropy()
-    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
-    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
     loss = bce_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), 8.0004, 3)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #      = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #         -log(Y_MAX + EPSILON), -log(1)]
+    #      = [0, 15.33, 0, 0]
+    # Reduced loss = 15.33 / 4
+
+    self.assertAlmostEqual(self.evaluate(loss), 3.833, 3)
 
     # Test with logits.
-    logits = constant_op.constant([10., 10., 10., -10., 10, -10],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
     loss = bce_obj(y_true, logits)
-    self.assertAlmostEqual(self.evaluate(loss), 5., 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    #      = [((100 - 100 * 1 + log(1 + exp(-100))) +
+    #          (0 + 100 * 0 + log(1 + exp(-100))) +
+    #          (100 - 100 * 1 + log(1 + exp(-100))),
+    #         ((100 - 100 * 0 + log(1 + exp(-100))) +
+    #          (100 - 100 * 1 + log(1 + exp(-100))) +
+    #          (0 + 100 * 1 + log(1 + exp(-100))))]
+    #      = [(0 + 0 + 0) / 3, 200 / 3]
+    # Reduced loss = (0 + 66.666) / 2
+
+    self.assertAlmostEqual(self.evaluate(loss), 33.333, 3)
 
   def test_scalar_weighted(self):
     bce_obj = keras.losses.BinaryCrossentropy()
-    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
-    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
     loss = bce_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), 18.4010, 3)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #      = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #         -log(Y_MAX + EPSILON), -log(1)]
+    #      = [0, 15.33, 0, 0]
+    # Weighted loss = [0, 15.33 * 2.3, 0, 0]
+    # Reduced loss = 15.33 * 2.3 / 4
+
+    self.assertAlmostEqual(self.evaluate(loss), 8.817, 3)
 
     # Test with logits.
-    y_true = array_ops.ones((32, 1))
-    logits = array_ops.ones((32, 1), dtype=dtypes.float32)
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
     loss = bce_obj(y_true, logits, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), 0.7205, 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    # Loss = [(0 + 0 + 0) / 3, 200 / 3]
+    # Weighted loss = [0 * 2.3, 66.666 * 2.3]
+    # Reduced loss = (0 + 66.666 * 2.3) / 2
+
+    self.assertAlmostEqual(self.evaluate(loss), 76.667, 3)
 
   def test_sample_weighted(self):
     bce_obj = keras.losses.BinaryCrossentropy()
-    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
-    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float64)
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = bce_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 21.4907, 3)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #      = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #         -log(Y_MAX + EPSILON), -log(1)]
+    #      = [0, 15.33, 0, 0]
+    # Reduced loss = 15.33 * 1.2 / 4
+
+    self.assertAlmostEqual(self.evaluate(loss), 4.6, 3)
 
     # Test with logits.
-    y_true = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
-    logits = constant_op.constant(
-        [[100.0, -100.0, -100.0], [-100.0, 100.0, -100.0],
-         [-100.0, -100.0, 100.0]],
-        dtype=dtypes.float64)
-    weights = constant_op.constant([3, 2, 8])
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
+    weights = constant_op.constant([4, 3])
     bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
     loss = bce_obj(y_true, logits, sample_weight=weights)
-    self.assertAlmostEqual(self.evaluate(loss), 288.8888, 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    # Loss = [(0 + 0 + 0)/3, 200 / 3]
+    # Weighted loss = [0 * 4, 66.666 * 3]
+    # Reduced loss = (0 + 66.666 * 3) / 2
+
+    self.assertAlmostEqual(self.evaluate(loss), 100, 3)
 
   def test_no_reduction(self):
-    y_true = constant_op.constant(((1, 0, 1), (1, 1, 0), (0, 1, 1)))
-    logits = constant_op.constant(((100.0, -100.0, 100.0),
-                                   (100.0, -100.0, 100.0),
-                                   (100.0, 100.0, -100.0)))
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(
         from_logits=True, reduction=losses_impl.ReductionV2.NONE)
     loss = bce_obj(y_true, logits)
-    self.assertAllClose((0., 66.6666, 66.6666), self.evaluate(loss), 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    # Loss = [(0 + 0 + 0)/3, (200)/3]
+
+    self.assertAllClose((0., 66.6666), self.evaluate(loss), 3)
 
   def test_label_smoothing(self):
     logits = constant_op.constant([[100.0, -100.0, -100.0]])
     y_true = constant_op.constant([[1, 0, 1]])
     label_smoothing = 0.1
     # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
     # Label smoothing: z' = z * (1 - L) + 0.5L
     #                  1  = 1 - 0.5L
     #                  0  = 0.5L
@@ -744,22 +849,26 @@ class CategoricalCrossentropyTest(test.TestCase):
     expected_value = 400.0 * label_smoothing / 3.0
     self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
 
-  def test_all_correct_unweighted_sparse(self):
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseCategoricalCrossentropyTest(test.TestCase):
+
+  def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[0], [1], [2]], dtype=dtypes.int64)
     y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
                                   dtype=dtypes.float32)
-    cce_obj = keras.losses.CategoricalCrossentropy()
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     loss = cce_obj(y_true, y_pred)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
     # Test with logits.
     logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
-  def test_unweighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_unweighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([0, 1, 2])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -768,12 +877,12 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits)
     self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
 
-  def test_scalar_weighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_scalar_weighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([[0], [1], [2]])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -782,12 +891,12 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits, sample_weight=2.3)
     self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
 
-  def test_sample_weighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_sample_weighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([[0], [1], [2]])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -797,18 +906,645 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits, sample_weight=sample_weight)
     self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
 
-  def test_no_reduction_sparse(self):
+  def test_no_reduction(self):
     y_true = constant_op.constant([[0], [1], [2]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(
         from_logits=True, reduction=losses_impl.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class HingeTest(test.TestCase):
+
+  def test_config(self):
+    hinge_obj = keras.losses.Hinge(
+        reduction=losses_impl.ReductionV2.SUM, name='hinge_loss')
+    self.assertEqual(hinge_obj.name, 'hinge_loss')
+    self.assertEqual(hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = hinge_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 7.3333, 3)
+
+  def test_scalar_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 16.8666, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 24.9333, 3)
+
+  def test_timestep_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 2.0, 3)
+
+  def test_zero_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = hinge_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SquaredHingeTest(test.TestCase):
+
+  def test_config(self):
+    sq_hinge_obj = keras.losses.SquaredHinge(
+        reduction=losses_impl.ReductionV2.SUM, name='sq_hinge_loss')
+    self.assertEqual(sq_hinge_obj.name, 'sq_hinge_loss')
+    self.assertEqual(sq_hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([1, 9, 2, -5], shape=(2, 2))
+    y_pred = constant_op.constant([4, 8, 12, 8],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+
+    # Sq hinge = mean(square(max(1. - y_true * y_pred, 0.)), axis=-1)
+    # (1. - y_true * y_pred) = [[1-4, 1-72], [1-24, 1+40]] = [0, 48]
+    # sq(max(above val, 0)) = sq([[0, 0], [0, 41]) = [[0, 0], [0, 1681]]
+    # Mean = [0, 840.5]. Reduced loss = (0 + 840.5)/2 = 420.25
+    loss = sq_hinge_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 420.25, 3)
+
+  def test_scalar_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 647.833, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = sq_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 957.667, 3)
+
+  def test_timestep_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 6.0, 3)
+
+  def test_zero_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalHingeTest(test.TestCase):
+
+  def test_config(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge(
+        reduction=losses_impl.ReductionV2.SUM, name='cat_hinge_loss')
+    self.assertEqual(cat_hinge_obj.name, 'cat_hinge_loss')
+    self.assertEqual(cat_hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5], shape=(2, 2))
+    y_pred = constant_op.constant([4, 8, 12, 8],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    loss = cat_hinge_obj(y_true, y_pred)
+
+    # pos = reduce_sum(y_true * y_pred) = [1*4+8*9, 12*2+8*-5] = [76, -16]
+    # neg = reduce_max((1. - y_true) * y_pred) = [[0, -64], [-12, 48]] = [0, 48]
+    # cat_hinge = max(0., neg - pos + 1.) = [0, 65]
+    # reduced_loss = (0 + 65)/2 = 32.5
+    self.assertAlmostEqual(self.evaluate(loss), 32.5, 3)
+
+  def test_scalar_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 83.95, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = cat_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 124.1, 3)
+
+  def test_timestep_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 4.0, 3)
+
+  def test_zero_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogLossTest(test.TestCase):
+
+  def setup(self):
+    # TODO(psv): Change to setUp() after b/122319309 is fixed.
+    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
+    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
+    epsilon = 1e-7  # to avoid log 0
+
+    self.batch_size = 6
+    self.expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
+    self.expected_losses += np.multiply(1 - y_true,
+                                        np.log(1 - y_pred + epsilon))
+    self.expected_losses = -self.expected_losses
+
+    self.y_pred = constant_op.constant(y_pred)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    log_loss_obj = keras.losses.LogLoss(
+        reduction=losses_impl.ReductionV2.SUM, name='log')
+    self.assertEqual(log_loss_obj.name, 'log')
+    self.assertEqual(log_loss_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    loss = log_loss_obj(self.y_true, self.y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    loss = log_loss_obj(self.y_true, self.y_pred)
+    actual_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = 2.3
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
+
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_timestep_weighted(self):
+    log_loss_obj = keras.losses.LogLoss()
+
+    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3, 1))
+    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3, 1))
+    epsilon = 1e-7  # to avoid log 0
+    batch_size = 6
+
+    expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
+    expected_losses += np.multiply(1 - y_true, np.log(1 - y_pred + epsilon))
+
+    y_pred = constant_op.constant(y_pred)
+    y_true = constant_op.constant(y_true)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+    loss = log_loss_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    actual_loss = np.multiply(-expected_losses, sample_weight)
+    actual_loss = np.sum(actual_loss) / batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = 0
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogCoshTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    error = y_pred - y_true
+    self.expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2)
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    logcosh_obj = keras.losses.LogCosh(
+        reduction=losses_impl.ReductionV2.SUM, name='logcosh_loss')
+    self.assertEqual(logcosh_obj.name, 'logcosh_loss')
+    self.assertEqual(logcosh_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+
+    loss = logcosh_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+    sample_weight = 2.3
+
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+    y_true = np.asarray([1, 9, 2, -5, -2, 6]).reshape(2, 3, 1)
+    y_pred = np.asarray([4, 8, 12, 8, 1, 3]).reshape(2, 3, 1)
+    error = y_pred - y_true
+    expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+    loss = logcosh_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+    sample_weight = 0
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PoissonTest(test.TestCase):
+
+  def setup(self):
+    self.np_y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3))
+    self.np_y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_losses = self.np_y_pred - np.multiply(self.np_y_true,
+                                                        np.log(self.np_y_pred))
+
+    self.y_pred = constant_op.constant(self.np_y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    poisson_obj = keras.losses.Poisson(
+        reduction=losses_impl.ReductionV2.SUM, name='poisson')
+    self.assertEqual(poisson_obj.name, 'poisson')
+    self.assertEqual(poisson_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+
+    loss = poisson_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    sample_weight = 2.3
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    y_true = self.np_y_true.reshape(2, 3, 1)
+    y_pred = self.np_y_pred.reshape(2, 3, 1)
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3, 1)
+    expected_losses = y_pred - np.multiply(y_true, np.log(y_pred))
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+
+    loss = poisson_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KLDivergenceTest(test.TestCase):
+
+  def setup(self):
+    self.np_y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
+    self.np_y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3))
+
+    self.batch_size = 2
+    self.expected_losses = np.multiply(self.np_y_true,
+                                       np.log(self.np_y_true / self.np_y_pred))
+
+    self.y_pred = constant_op.constant(self.np_y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    k_obj = keras.losses.KLDivergence(
+        reduction=losses_impl.ReductionV2.SUM, name='kld')
+    self.assertEqual(k_obj.name, 'kld')
+    self.assertEqual(k_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+
+    loss = k_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    sample_weight = 2.3
+
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape(2, 3))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    y_true = self.np_y_true.reshape(2, 3, 1)
+    y_pred = self.np_y_pred.reshape(2, 3, 1)
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3)
+    expected_losses = np.sum(
+        np.multiply(y_true, np.log(y_true / y_pred)), axis=-1)
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+    loss = k_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+
+    num_timesteps = 3
+    expected_loss = np.sum(expected_losses * sample_weight) / (
+        self.batch_size * num_timesteps)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class HuberLossTest(test.TestCase):
+
+  def huber_loss(self, y_true, y_pred, delta=1.0):
+    error = y_pred - y_true
+    abs_error = np.abs(error)
+
+    quadratic = np.minimum(abs_error, delta)
+    linear = np.subtract(abs_error, quadratic)
+    return np.add(
+        np.multiply(0.5, np.multiply(quadratic, quadratic)),
+        np.multiply(delta, linear))
+
+  def setup(self, delta=1.0):
+    self.np_y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
+    self.np_y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_losses = self.huber_loss(self.np_y_true, self.np_y_pred,
+                                           delta)
+
+    self.y_pred = constant_op.constant(self.np_y_pred)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    h_obj = keras.losses.Huber(
+        reduction=losses_impl.ReductionV2.SUM, name='huber')
+    self.assertEqual(h_obj.name, 'huber')
+    self.assertEqual(h_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    loss = h_obj(self.y_true, self.y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    loss = h_obj(self.y_true, self.y_pred)
+    actual_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    sample_weight = 2.3
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
+
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    y_pred = self.np_y_pred.reshape((2, 3, 1))
+    y_true = self.np_y_true.reshape((2, 3, 1))
+    expected_losses = self.huber_loss(y_true, y_pred)
+
+    y_pred = constant_op.constant(y_pred)
+    y_true = constant_op.constant(y_true)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+    loss = h_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    actual_loss = np.multiply(expected_losses, sample_weight)
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    sample_weight = 0
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+  def test_non_default_delta(self):
+    self.setup(delta=0.8)
+    h_obj = keras.losses.Huber(delta=0.8)
+    sample_weight = 2.3
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 8ccb514ba9263703812afaa5239b79fc4729b196..6aed0e7ac64e54732c015e4f76a0c02409bbfce9 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -20,23 +20,21 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import functools
 import sys
 import types
-import weakref
-from enum import Enum
 import numpy as np
 import six
 
-from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
+from tensorflow.python.keras.losses import categorical_hinge
 from tensorflow.python.keras.losses import cosine_proximity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
@@ -48,285 +46,23 @@ from tensorflow.python.keras.losses import mean_squared_logarithmic_error
 from tensorflow.python.keras.losses import poisson
 from tensorflow.python.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras.losses import squared_hinge
+from tensorflow.python.keras.utils import metrics_utils
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import to_list
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.tools.docs import doc_controls
 
 
-def clone_metric(metric):
-  """Returns a clone of the metric if stateful, otherwise returns it as is."""
-  if isinstance(metric, Metric):
-    return metric.__class__.from_config(metric.get_config())
-  return metric
-
-
-def clone_metrics(metrics):
-  """Clones the given metric list/dict."""
-  if metrics is None:
-    return None
-  if isinstance(metrics, dict):
-    return {key: clone_metric(value) for key, value in metrics.items()}
-  return [clone_metric(metric) for metric in metrics]
-
-
-def update_state_wrapper(update_state_fn):
-  """Decorator to wrap metric `update_state()` with `add_update()`.
-
-  Args:
-    update_state_fn: function that accumulates metric statistics.
-
-  Returns:
-    Decorated function that wraps `update_state_fn()` with `add_update()`.
-  """
-
-  def decorated(metric_obj, *args, **kwargs):
-    """Decorated function with `add_update()`."""
-
-    update_op = update_state_fn(*args, **kwargs)
-    if update_op is not None:  # update_op will be None in eager execution.
-      metric_obj.add_update(update_op, inputs=True)
-    return update_op
-
-  return tf_decorator.make_decorator(update_state_fn, decorated)
-
-
-def result_wrapper(result_fn):
-  """Decorator to wrap metric `result()` function in `merge_call()`.
-
-  Result computation is an idempotent operation that simply calculates the
-  metric value using the state variables.
-
-  If metric state variables are distributed across replicas/devices and
-  `result()` is requested from the context of one device - This function wraps
-  `result()` in a distribution strategy `merge_call()`. With this,
-  the metric state variables will be aggregated across devices.
-
-  Args:
-    result_fn: function that computes the metric result.
-
-  Returns:
-    Decorated function that wraps `result_fn()` in distribution strategy
-    `merge_call()`.
-  """
-
-  def decorated(_, *args):
-    """Decorated function with merge_call."""
-    replica_context = distribution_strategy_context.get_replica_context()
-    if replica_context is None:  # if in cross replica context already
-      result_t = result_fn(*args)
-    else:
-      # TODO(psv): Test distribution of metrics using different distribution
-      # strategies.
-
-      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
-      # with distribution object as the first parameter. We create a wrapper
-      # here so that the result function need not have that parameter.
-      def merge_fn_wrapper(distribution, merge_fn, *args):
-        # We will get `PerDevice` merge function. Taking the first one as all
-        # are identical copies of the function that we had passed below.
-        return distribution.unwrap(merge_fn)[0](*args)
-
-      # Wrapping result in merge_call. merge_call is used when we want to leave
-      # replica mode and compute a value in cross replica mode.
-      result_t = replica_context.merge_call(
-          merge_fn_wrapper, args=(result_fn,) + args)
-    return result_t
-
-  return tf_decorator.make_decorator(result_fn, decorated)
-
-
-def weakmethod(method):
-  """Creates a weak reference to the bound method."""
-
-  cls = method.im_class
-  func = method.im_func
-  instance_ref = weakref.ref(method.im_self)
-
-  @functools.wraps(method)
-  def inner(*args, **kwargs):
-    return func.__get__(instance_ref(), cls)(*args, **kwargs)
-
-  del method
-  return inner
-
-
-class _ConfusionMatrix(Enum):
-  TRUE_POSITIVES = 'tp'
-  FALSE_POSITIVES = 'fp'
-  TRUE_NEGATIVES = 'tn'
-  FALSE_NEGATIVES = 'fn'
-
-
-def _assert_thresholds_range(thresholds):
-  invalid_thresholds = [t for t in thresholds if t is None or t < 0 or t > 1]
-  if invalid_thresholds:
-    raise ValueError('Threshold values must be in [0, 1]. Invalid values: {}'
-                     .format(invalid_thresholds))
-
-
-def _parse_init_thresholds(thresholds, default_threshold=0.5):
-  thresholds = to_list(default_threshold if thresholds is None else thresholds)
-  _assert_thresholds_range(thresholds)
-  return thresholds
-
-
-def _update_confusion_matrix_variables(variables_to_update,
-                                       y_true,
-                                       y_pred,
-                                       thresholds,
-                                       sample_weight=None):
-  """Returns op to update the given confusion matrix variables.
-
-  For every pair of values in y_true and y_pred:
-
-  true_positive: y_true == True and y_pred > thresholds
-  false_negatives: y_true == True and y_pred <= thresholds
-  true_negatives: y_true == False and y_pred <= thresholds
-  false_positive: y_true == False and y_pred > thresholds
-
-  The results will be weighted and added together. When multiple thresholds are
-  provided, we will repeat the same for every threshold.
-
-  For estimation of these metrics over a stream of data, the function creates an
-  `update_op` operation that updates the given variables.
-
-  If `sample_weight` is `None`, weights default to 1.
-  Use weights of 0 to mask values.
-
-  Args:
-    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
-      and corresponding variables to update as values.
-    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
-    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
-      the range `[0, 1]`.
-    thresholds: A float value or a python list or tuple of float thresholds in
-      `[0, 1]`.
-    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
-      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
-      be either `1`, or the same as the corresponding `y_true` dimension).
-
-  Returns:
-    Update op.
-
-  Raises:
-    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
-      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
-      `variables_to_update` contains invalid keys.
-  """
-  if variables_to_update is None:
-    return
-  y_true = ops.convert_to_tensor(y_true)
-  y_pred = ops.convert_to_tensor(y_pred)
-  y_pred.shape.assert_is_compatible_with(y_true.shape)
-
-  if not any(
-      key for key in variables_to_update if key in list(_ConfusionMatrix)):
-    raise ValueError(
-        'Please provide at least one valid confusion matrix '
-        'variable to update. Valid variable key options are: "{}". '
-        'Received: "{}"'.format(
-            list(_ConfusionMatrix), variables_to_update.keys()))
-
-  invalid_keys = [
-      key for key in variables_to_update if key not in list(_ConfusionMatrix)
-  ]
-  if invalid_keys:
-    raise ValueError(
-        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
-            invalid_keys, list(_ConfusionMatrix)))
-
-  with ops.control_dependencies([
-      check_ops.assert_greater_equal(
-          y_pred,
-          math_ops.cast(0.0, dtype=y_pred.dtype),
-          message='predictions must be >= 0'),
-      check_ops.assert_less_equal(
-          y_pred,
-          math_ops.cast(1.0, dtype=y_pred.dtype),
-          message='predictions must be <= 1')
-  ]):
-    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
-        math_ops.cast(y_pred, dtype=dtypes.float32),
-        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
-
-  thresholds = to_list(thresholds)
-  num_thresholds = len(thresholds)
-  num_predictions = array_ops.size(y_pred)
-
-  # Reshape predictions and labels.
-  predictions_2d = array_ops.reshape(y_pred, [1, -1])
-  labels_2d = array_ops.reshape(
-      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
-
-  # Tile the thresholds for every prediction.
-  thresh_tiled = array_ops.tile(
-      array_ops.expand_dims(array_ops.constant(thresholds), 1),
-      array_ops.stack([1, num_predictions]))
-
-  # Tile the predictions for every threshold.
-  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
-
-  # Compare predictions and threshold.
-  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
-
-  # Tile labels by number of thresholds
-  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
-
-  if sample_weight is not None:
-    weights = weights_broadcast_ops.broadcast_weights(
-        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
-    weights_tiled = array_ops.tile(
-        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
-  else:
-    weights_tiled = None
-
-  update_ops = []
-
-  def weighted_assign_add(label, pred, weights, var):
-    label_and_pred = math_ops.cast(
-        math_ops.logical_and(label, pred), dtype=dtypes.float32)
-    if weights is not None:
-      label_and_pred *= weights
-    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
-
-  loop_vars = {
-      _ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
-  }
-  update_tn = _ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
-  update_fp = _ConfusionMatrix.FALSE_POSITIVES in variables_to_update
-  update_fn = _ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
-
-  if update_fn or update_tn:
-    pred_is_neg = math_ops.logical_not(pred_is_pos)
-    loop_vars[_ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
-
-  if update_fp or update_tn:
-    label_is_neg = math_ops.logical_not(label_is_pos)
-    loop_vars[_ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
-    if update_tn:
-      loop_vars[_ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
-
-  for matrix_cond, (label, pred) in loop_vars.items():
-    if matrix_cond in variables_to_update:
-      update_ops.append(
-          weighted_assign_add(label, pred, weights_tiled,
-                              variables_to_update[matrix_cond]))
-  return control_flow_ops.group(update_ops)
-
-
+@keras_export('keras.metrics.Metric')
 @six.add_metaclass(abc.ABCMeta)
 class Metric(Layer):
   """Encapsulates metric logic and state.
@@ -391,15 +127,15 @@ class Metric(Layer):
       if sample_weight is not None:
         sample_weight = math_ops.cast(sample_weight, self._dtype)
         values = math_ops.multiply(values, sample_weight)
-      state_ops.assign_add(self.true_positives, math_ops.reduce_sum(values))
+      self.true_positives.assign_add(math_ops.reduce_sum(values))
 
     def result(self):
       return array_ops.identity(self.true_positives)
   ```
   """
 
-  def __init__(self, name=None, dtype=None):
-    super(Metric, self).__init__(name=name, dtype=dtype)
+  def __init__(self, name=None, dtype=None, **kwargs):
+    super(Metric, self).__init__(name=name, dtype=dtype, **kwargs)
     self.stateful = True  # All metric layers are stateful.
     self.built = True
     self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
@@ -412,15 +148,18 @@ class Metric(Layer):
       # weak reference. This is to remove reference cycle that is created here.
       # This is not an issue in python versions > 3.
       if context.executing_eagerly():
-        obj.update_state = weakmethod(obj.update_state)
-      obj.update_state = weakmethod(
-          types.MethodType(update_state_wrapper(obj.update_state), obj))
-      result = weakmethod(obj.result)
-      obj.result = weakmethod(types.MethodType(result_wrapper(result), obj))
+        obj.update_state = metrics_utils.weakmethod(obj.update_state)
+      obj.update_state = metrics_utils.weakmethod(
+          types.MethodType(
+              metrics_utils.update_state_wrapper(obj.update_state), obj))
+      result = metrics_utils.weakmethod(obj.result)
+      obj.result = metrics_utils.weakmethod(
+          types.MethodType(metrics_utils.result_wrapper(result), obj))
     else:
       obj.update_state = types.MethodType(
-          update_state_wrapper(obj.update_state), obj)
-      obj.result = types.MethodType(result_wrapper(obj.result), obj)
+          metrics_utils.update_state_wrapper(obj.update_state), obj)
+      obj.result = types.MethodType(
+          metrics_utils.result_wrapper(obj.result), obj)
 
     return obj
 
@@ -450,6 +189,14 @@ class Metric(Layer):
         result_t._metric_obj = self  # pylint: disable=protected-access
       return result_t
 
+  @property
+  def dtype(self):
+    return self._dtype
+
+  def get_config(self):
+    """Returns the serializable config of the metric."""
+    return {'name': self.name, 'dtype': self.dtype}
+
   def reset_states(self):
     """Resets all of the metric state variables.
 
@@ -472,7 +219,6 @@ class Metric(Layer):
          All update ops added to the graph by this function will be executed.
       As a result, code should generally work the same way with graph or
       eager execution.
-    and adds the update op to the metric layer.
 
     Args:
       *args:
@@ -489,12 +235,6 @@ class Metric(Layer):
     """
     NotImplementedError('Must be implemented in subclasses.')
 
-  @classmethod
-  def from_config(cls, config):
-    if 'trainable' in config:
-      config.pop('trainable')
-    return cls(**config)
-
   ### For use by subclasses ###
   @doc_controls.for_subclass_implementers
   def add_weight(self,
@@ -502,12 +242,13 @@ class Metric(Layer):
                  shape=(),
                  aggregation=tf_variables.VariableAggregation.SUM,
                  synchronization=tf_variables.VariableSynchronization.ON_READ,
-                 initializer=None):
+                 initializer=None,
+                 dtype=None):
     """Adds state variable. Only for use by subclasses."""
     return super(Metric, self).add_weight(
         name=name,
         shape=shape,
-        dtype=self._dtype,
+        dtype=self._dtype if dtype is None else dtype,
         trainable=False,
         initializer=initializer,
         collections=[],
@@ -517,8 +258,141 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
+class Reduce(Metric):
+  """Encapsulates metrics that perform a reduce operation on the values."""
+
+  def __init__(self, reduction, name, dtype=None):
+    """Creates a `Reduce` instance.
+
+    Args:
+      reduction: a `tf.keras.metrics.Reduction` enum value.
+      name: string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Reduce, self).__init__(name=name, dtype=dtype)
+    self.reduction = reduction
+    self.total = self.add_weight(
+        'total', initializer=init_ops.zeros_initializer)
+    if reduction in [metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,
+                     metrics_utils.Reduction.WEIGHTED_MEAN]:
+      self.count = self.add_weight(
+          'count', initializer=init_ops.zeros_initializer)
+
+  def update_state(self, values, sample_weight=None):
+    """Accumulates statistics for computing the reduction metric.
+
+    For example, if `values` is [1, 3, 5, 7] and reduction=SUM_OVER_BATCH_SIZE,
+    then the value of `result()` is 4. If the `sample_weight` is specified as
+    [1, 1, 0, 0] then value of `result()` would be 2.
+
+    Args:
+      values: Per-example value.
+      sample_weight: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      Update op.
+    """
+    values = math_ops.cast(values, self._dtype)
+    if sample_weight is not None:
+      sample_weight = math_ops.cast(sample_weight, self._dtype)
+      # Update dimensions of weights to match with values if possible.
+      values, _, sample_weight = squeeze_or_expand_dimensions(
+          values, None, sample_weight)
+      try:
+        # Broadcast weights if possible.
+        sample_weight = weights_broadcast_ops.broadcast_weights(
+            sample_weight, values)
+      except ValueError:
+        # Reduce values to same ndim as weight array
+        ndim = K.ndim(values)
+        weight_ndim = K.ndim(sample_weight)
+        if self.reduction == metrics_utils.Reduction.SUM:
+          values = math_ops.reduce_sum(
+              values, axis=list(range(weight_ndim, ndim)))
+        else:
+          values = math_ops.reduce_mean(
+              values, axis=list(range(weight_ndim, ndim)))
+      values = math_ops.multiply(values, sample_weight)
+
+    value_sum = math_ops.reduce_sum(values)
+    with ops.control_dependencies([value_sum]):
+      update_total_op = self.total.assign_add(value_sum)
+
+    # Exit early if the reduction doesn't have a denominator.
+    if self.reduction == metrics_utils.Reduction.SUM:
+      return update_total_op
+
+    # Update `count` for reductions that require a denominator.
+    if self.reduction == metrics_utils.Reduction.SUM_OVER_BATCH_SIZE:
+      num_values = math_ops.cast(array_ops.size(values), self._dtype)
+    elif self.reduction == metrics_utils.Reduction.WEIGHTED_MEAN:
+      if sample_weight is None:
+        num_values = math_ops.cast(array_ops.size(values), self._dtype)
+      else:
+        num_values = math_ops.reduce_sum(sample_weight)
+    else:
+      raise NotImplementedError(
+          'reduction [%s] not implemented' % self.reduction)
+
+    with ops.control_dependencies([update_total_op]):
+      return self.count.assign_add(num_values)
+
+  def result(self):
+    if self.reduction == metrics_utils.Reduction.SUM:
+      return array_ops.identity(self.total)
+    elif self.reduction in [
+        metrics_utils.Reduction.WEIGHTED_MEAN,
+        metrics_utils.Reduction.SUM_OVER_BATCH_SIZE
+    ]:
+      return math_ops.div_no_nan(self.total, self.count)
+    else:
+      raise NotImplementedError(
+          'reduction [%s] not implemented' % self.reduction)
+
+
+@keras_export('keras.metrics.Sum')
+class Sum(Reduce):
+  """Computes the (weighted) sum of the given values.
+
+  For example, if values is [1, 3, 5, 7] then the sum is 16.
+  If the weights were specified as [1, 1, 0, 0] then the sum would be 4.
+
+  This metric creates one variable, `total`, that is used to compute the sum of
+  `values`. This is ultimately returned as `sum`.
+
+  If `sample_weight` is `None`, weights default to 1.  Use `sample_weight` of 0
+  to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Sum()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 16.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.add_metric(tf.keras.metrics.Sum(name='sum_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
+  """
+
+  def __init__(self, name='sum', dtype=None):
+    """Creates a `Sum` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Sum, self).__init__(reduction=metrics_utils.Reduction.SUM,
+                              name=name, dtype=dtype)
+
+
 @keras_export('keras.metrics.Mean')
-class Mean(Metric):
+class Mean(Reduce):
   """Computes the (weighted) mean of the given values.
 
   For example, if values is [1, 3, 5, 7] then the mean is 4.
@@ -555,58 +429,89 @@ class Mean(Metric):
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
-    super(Mean, self).__init__(name=name, dtype=dtype)
-    # Create new state variables
-    self.total = self.add_weight(
-        'total', initializer=init_ops.zeros_initializer)
-    self.count = self.add_weight(
-        'count', initializer=init_ops.zeros_initializer)
+    super(Mean, self).__init__(
+        reduction=metrics_utils.Reduction.WEIGHTED_MEAN, name=name, dtype=dtype)
 
-  def update_state(self, values, sample_weight=None):
-    """Accumulates statistics for computing the mean.
 
-    For example, if `values` is [1, 3, 5, 7] then the mean is 4. If
-    the `sample_weight` is specified as [1, 1, 0, 0] then the mean would be 2.
+@keras_export('keras.metrics.MeanRelativeError')
+class MeanRelativeError(Mean):
+  """Computes the mean relative error by normalizing with the given values.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the mean relative absolute error. This average is weighted by
+  `sample_weight`, and it is ultimately returned as `mean_relative_error`:
+  an idempotent operation that simply divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3])
+  m.update_state([1, 3, 2, 3], [2, 4, 6, 8])
+
+  # metric = mean(|y_pred - y_true| / normalizer)
+  #        = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3])
+  #        = 5/4 = 1.25
+  print('Final result: ', m.result().numpy())  # Final result: 1.25
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.MeanRelativeError(normalizer=[1, 3])])
+  ```
+  """
+
+  def __init__(self, normalizer, name=None, dtype=None):
+    """Creates a `MeanRelativeError` instance.
 
     Args:
-      values: Per-example value.
-      sample_weight: Optional weighting of each example. Defaults to 1.
+      normalizer: The normalizer values with same shape as predictions.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanRelativeError, self).__init__(name=name, dtype=dtype)
+    normalizer = math_ops.cast(normalizer, self._dtype)
+    self.normalizer = normalizer
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates metric statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
 
     Returns:
       Update op.
     """
-    values = math_ops.cast(values, self._dtype)
-    if sample_weight is None:
-      num_values = math_ops.cast(array_ops.size(values), self._dtype)
-    else:
-      sample_weight = math_ops.cast(sample_weight, self._dtype)
-
-      # Update dimensions of weights to match with values if possible.
-      values, _, sample_weight = squeeze_or_expand_dimensions(
-          values, None, sample_weight)
-      try:
-        # Broadcast weights if possible.
-        sample_weight = weights_broadcast_ops.broadcast_weights(
-            sample_weight, values)
-      except ValueError:
-        # Reduce values to same ndim as weight array
-        ndim = K.ndim(values)
-        weight_ndim = K.ndim(sample_weight)
-        values = math_ops.reduce_mean(
-            values, axis=list(range(weight_ndim, ndim)))
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
 
-      num_values = math_ops.reduce_sum(sample_weight)
-      values = math_ops.multiply(values, sample_weight)
-    values = math_ops.reduce_sum(values)
+    y_pred, self.normalizer = confusion_matrix.remove_squeezable_dimensions(
+        y_pred, self.normalizer)
+    y_pred.shape.assert_is_compatible_with(y_pred.shape)
+    relative_errors = math_ops.div_no_nan(
+        math_ops.abs(y_true - y_pred), self.normalizer)
 
-    # Update state variables. Count should be updated only when total is
-    # updated.
-    update_total_op = state_ops.assign_add(self.total, values)
-    with ops.control_dependencies([update_total_op]):
-      return state_ops.assign_add(self.count, num_values)
+    return super(MeanRelativeError, self).update_state(
+        relative_errors, sample_weight=sample_weight)
 
-  def result(self):
-    return math_ops.div_no_nan(self.total, self.count)
+  def get_config(self):
+    n = self.normalizer
+    config = {'normalizer': K.eval(n) if _is_tensor_or_variable(n) else n}
+    base_config = super(MeanRelativeError, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 class MeanMetricWrapper(Mean):
@@ -651,8 +556,9 @@ class MeanMetricWrapper(Mean):
         matches, sample_weight=sample_weight)
 
   def get_config(self):
-    config = {'fn': self._fn}
-    config.update(self._fn_kwargs)
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if _is_tensor_or_variable(v) else v
     base_config = super(MeanMetricWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
@@ -692,12 +598,6 @@ class Accuracy(MeanMetricWrapper):
   def __init__(self, name='accuracy', dtype=None):
     super(Accuracy, self).__init__(accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(Accuracy, cls).from_config(config)
-
 
 @keras_export('keras.metrics.BinaryAccuracy')
 class BinaryAccuracy(MeanMetricWrapper):
@@ -743,12 +643,6 @@ class BinaryAccuracy(MeanMetricWrapper):
     super(BinaryAccuracy, self).__init__(
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(BinaryAccuracy, cls).from_config(config)
-
 
 @keras_export('keras.metrics.CategoricalAccuracy')
 class CategoricalAccuracy(MeanMetricWrapper):
@@ -799,12 +693,6 @@ class CategoricalAccuracy(MeanMetricWrapper):
     super(CategoricalAccuracy, self).__init__(
         categorical_accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CategoricalAccuracy, cls).from_config(config)
-
 
 @keras_export('keras.metrics.SparseCategoricalAccuracy')
 class SparseCategoricalAccuracy(MeanMetricWrapper):
@@ -846,58 +734,124 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
     super(SparseCategoricalAccuracy, self).__init__(
         sparse_categorical_accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(SparseCategoricalAccuracy, cls).from_config(config)
 
+@keras_export('keras.metrics.TopKCategoricalAccuracy')
+class TopKCategoricalAccuracy(MeanMetricWrapper):
+  """Computes how often targets are in the top `K` predictions.
 
-class _ConfusionMatrixConditionCount(Metric):
-  """Calculates the number of the given confusion matrix condition."""
+  Usage:
 
-  def __init__(self,
-               confusion_matrix_cond,
-               thresholds=None,
-               name=None,
-               dtype=None):
-    """Creates a `_ConfusionMatrixConditionCount` instance.
+  ```python
+  m = tf.keras.metrics.TopKCategoricalAccuracy()
+  m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])
+  ```
+  """
+
+  def __init__(self, k=5, name='top_k_categorical_accuracy', dtype=None):
+    """Creates a `TopKCategoricalAccuracy` instance.
 
     Args:
-      confusion_matrix_cond: One of `_ConfusionMatrix` conditions.
-      thresholds: (Optional) Defaults to 0.5. A float value or a python
-        list/tuple of float threshold values in [0, 1]. A threshold is compared
-        with prediction values to determine the truth value of predictions
-        (i.e., above the threshold is `true`, below is `false`). One metric
-        value is generated for each threshold value.
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to 5.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
-    super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
-    self._confusion_matrix_cond = confusion_matrix_cond
-    self.thresholds = _parse_init_thresholds(
-        thresholds, default_threshold=0.5)
-    self.accumulator = self.add_weight(
-        'accumulator',
-        shape=(len(self.thresholds),),
-        initializer=init_ops.zeros_initializer)
+    super(TopKCategoricalAccuracy, self).__init__(
+        top_k_categorical_accuracy, name, dtype=dtype, k=k)
 
-  def update_state(self, y_true, y_pred, sample_weight=None):
-    """Accumulates the given confusion matrix condition statistics.
 
-    Args:
-      y_true: The ground truth values.
-      y_pred: The predicted values.
-      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
-        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
-        be broadcastable to `y_true`.
+@keras_export('keras.metrics.SparseTopKCategoricalAccuracy')
+class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
+  """Computes how often integer targets are in the top `K` predictions.
 
-    Returns:
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseTopKCategoricalAccuracy()
+  m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()])
+  ```
+  """
+
+  def __init__(self, k=5, name='sparse_top_k_categorical_accuracy', dtype=None):
+    """Creates a `SparseTopKCategoricalAccuracy` instance.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to 5.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(SparseTopKCategoricalAccuracy, self).__init__(
+        sparse_top_k_categorical_accuracy, name, dtype=dtype, k=k)
+
+
+class _ConfusionMatrixConditionCount(Metric):
+  """Calculates the number of the given confusion matrix condition."""
+
+  def __init__(self,
+               confusion_matrix_cond,
+               thresholds=None,
+               name=None,
+               dtype=None):
+    """Creates a `_ConfusionMatrixConditionCount` instance.
+
+    Args:
+      confusion_matrix_cond: One of `metrics_utils.ConfusionMatrix` conditions.
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
+    self._confusion_matrix_cond = confusion_matrix_cond
+    self.init_thresholds = thresholds
+    self.thresholds = metrics_utils.parse_init_thresholds(
+        thresholds, default_threshold=0.5)
+    self.accumulator = self.add_weight(
+        'accumulator',
+        shape=(len(self.thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates the given confusion matrix condition statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        self._confusion_matrix_cond: self.accumulator
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {self._confusion_matrix_cond: self.accumulator},
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        sample_weight=sample_weight)
 
   def result(self):
     if len(self.thresholds) == 1:
@@ -911,6 +865,11 @@ class _ConfusionMatrixConditionCount(Metric):
     for v in self.variables:
       K.set_value(v, np.zeros((num_thresholds,)))
 
+  def get_config(self):
+    config = {'thresholds': self.init_thresholds}
+    base_config = super(_ConfusionMatrixConditionCount, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @keras_export('keras.metrics.FalsePositives')
 class FalsePositives(_ConfusionMatrixConditionCount):
@@ -956,7 +915,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(FalsePositives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.FALSE_POSITIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_POSITIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
@@ -1006,7 +965,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(FalseNegatives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.FALSE_NEGATIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_NEGATIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
@@ -1056,7 +1015,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(TrueNegatives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.TRUE_NEGATIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_NEGATIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
@@ -1106,7 +1065,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(TruePositives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.TRUE_POSITIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_POSITIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
@@ -1128,6 +1087,15 @@ class Precision(Metric):
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
 
+  If `top_k` is set, we'll calculate precision as how often on average a class
+  among the top-k classes with the highest predicted values of a batch entry is
+  correct and can be found in the label for that entry.
+
+  If `class_id` is specified, we calculate precision by considering only the
+  entries in the batch for which `class_id` is above the threshold and/or in the
+  top-k highest predictions, and computing the fraction of them for which
+  `class_id` is indeed a correct label.
+
   Usage:
 
   ```python
@@ -1144,26 +1112,42 @@ class Precision(Metric):
   ```
   """
 
-  def __init__(self, thresholds=None, name=None, dtype=None):
+  def __init__(self,
+               thresholds=None,
+               top_k=None,
+               class_id=None,
+               name=None,
+               dtype=None):
     """Creates a `Precision` instance.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value or a python
-        list/tuple of float threshold values in [0, 1]. A threshold is compared
-        with prediction values to determine the truth value of predictions
-        (i.e., above the threshold is `true`, below is `false`). One metric
-        value is generated for each threshold value.
+      thresholds: (Optional) A float value or a python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value. If neither thresholds nor top_k are set, the
+        default is to calculate precision with `thresholds=0.5`.
+      top_k: (Optional) Unset by default. An int value specifying the top-k
+        predictions to consider when calculating precision.
+      class_id: (Optional) Integer class ID for which we want binary metrics.
+        This must be in the half-open interval `[0, num_classes)`, where
+        `num_classes` is the last dimension of predictions.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
     super(Precision, self).__init__(name=name, dtype=dtype)
-    self.thresholds = _parse_init_thresholds(
-        thresholds, default_threshold=0.5)
-    self.tp = self.add_weight(
+    self.init_thresholds = thresholds
+    self.top_k = top_k
+    self.class_id = class_id
+
+    default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF
+    self.thresholds = metrics_utils.parse_init_thresholds(
+        thresholds, default_threshold=default_threshold)
+    self.true_positives = self.add_weight(
         'true_positives',
         shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
-    self.fp = self.add_weight(
+    self.false_positives = self.add_weight(
         'false_positives',
         shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
@@ -1172,8 +1156,9 @@ class Precision(Metric):
     """Accumulates true positive and false positive statistics.
 
     Args:
-      y_true: The ground truth values.
-      y_pred: The predicted values.
+      y_true: The ground truth values, with the same dimensions as `y_pred`.
+        Will be cast to `bool`.
+      y_pred: The predicted values. Each element must be in the range `[0, 1]`.
       sample_weight: Optional weighting of each example. Defaults to 1. Can be a
         `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
         be broadcastable to `y_true`.
@@ -1181,13 +1166,21 @@ class Precision(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
-        _ConfusionMatrix.FALSE_POSITIVES: self.fp
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {
+            metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+            metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives
+        },
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        top_k=self.top_k,
+        class_id=self.class_id,
+        sample_weight=sample_weight)
 
   def result(self):
-    result = math_ops.div_no_nan(self.tp, self.tp + self.fp)
+    result = math_ops.div_no_nan(self.true_positives,
+                                 self.true_positives + self.false_positives)
     return result[0] if len(self.thresholds) == 1 else result
 
   def reset_states(self):
@@ -1195,6 +1188,15 @@ class Precision(Metric):
     for v in self.variables:
       K.set_value(v, np.zeros((num_thresholds,)))
 
+  def get_config(self):
+    config = {
+        'thresholds': self.init_thresholds,
+        'top_k': self.top_k,
+        'class_id': self.class_id
+    }
+    base_config = super(Precision, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @keras_export('keras.metrics.Recall')
 class Recall(Metric):
@@ -1212,6 +1214,14 @@ class Recall(Metric):
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
 
+  If `top_k` is set, recall will be computed as how often on average a class
+  among the labels of a batch entry is in the top-k predictions.
+
+  If `class_id` is specified, we calculate recall by considering only the
+  entries in the batch for which `class_id` is in the label, and computing the
+  fraction of them for which `class_id` is above the threshold and/or in the
+  top-k predictions.
+
   Usage:
 
   ```python
@@ -1228,26 +1238,42 @@ class Recall(Metric):
   ```
   """
 
-  def __init__(self, thresholds=None, name=None, dtype=None):
+  def __init__(self,
+               thresholds=None,
+               top_k=None,
+               class_id=None,
+               name=None,
+               dtype=None):
     """Creates a `Recall` instance.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value or a python
-        list/tuple of float threshold values in [0, 1]. A threshold is compared
-        with prediction values to determine the truth value of predictions
-        (i.e., above the threshold is `true`, below is `false`). One metric
-        value is generated for each threshold value.
+      thresholds: (Optional) A float value or a python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value. If neither thresholds nor top_k are set, the
+        default is to calculate recall with `thresholds=0.5`.
+      top_k: (Optional) Unset by default. An int value specifying the top-k
+        predictions to consider when calculating recall.
+      class_id: (Optional) Integer class ID for which we want binary metrics.
+        This must be in the half-open interval `[0, num_classes)`, where
+        `num_classes` is the last dimension of predictions.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
     super(Recall, self).__init__(name=name, dtype=dtype)
-    self.thresholds = _parse_init_thresholds(
-        thresholds, default_threshold=0.5)
-    self.tp = self.add_weight(
+    self.init_thresholds = thresholds
+    self.top_k = top_k
+    self.class_id = class_id
+
+    default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF
+    self.thresholds = metrics_utils.parse_init_thresholds(
+        thresholds, default_threshold=default_threshold)
+    self.true_positives = self.add_weight(
         'true_positives',
         shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
-    self.fn = self.add_weight(
+    self.false_negatives = self.add_weight(
         'false_negatives',
         shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
@@ -1256,8 +1282,9 @@ class Recall(Metric):
     """Accumulates true positive and false negative statistics.
 
     Args:
-      y_true: The ground truth values.
-      y_pred: The predicted values.
+      y_true: The ground truth values, with the same dimensions as `y_pred`.
+        Will be cast to `bool`.
+      y_pred: The predicted values. Each element must be in the range `[0, 1]`.
       sample_weight: Optional weighting of each example. Defaults to 1. Can be a
         `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
         be broadcastable to `y_true`.
@@ -1265,13 +1292,21 @@ class Recall(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
-        _ConfusionMatrix.FALSE_NEGATIVES: self.fn
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {
+            metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+            metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives
+        },
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        top_k=self.top_k,
+        class_id=self.class_id,
+        sample_weight=sample_weight)
 
   def result(self):
-    result = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+    result = math_ops.div_no_nan(self.true_positives,
+                                 self.true_positives + self.false_negatives)
     return result[0] if len(self.thresholds) == 1 else result
 
   def reset_states(self):
@@ -1279,6 +1314,15 @@ class Recall(Metric):
     for v in self.variables:
       K.set_value(v, np.zeros((num_thresholds,)))
 
+  def get_config(self):
+    config = {
+        'thresholds': self.init_thresholds,
+        'top_k': self.top_k,
+        'class_id': self.class_id
+    }
+    base_config = super(Recall, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @six.add_metaclass(abc.ABCMeta)
 class SensitivitySpecificityBase(Metric):
@@ -1293,19 +1337,19 @@ class SensitivitySpecificityBase(Metric):
     if num_thresholds <= 0:
       raise ValueError('`num_thresholds` must be > 0.')
     self.value = value
-    self.tp = self.add_weight(
+    self.true_positives = self.add_weight(
         'true_positives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
-    self.tn = self.add_weight(
+    self.true_negatives = self.add_weight(
         'true_negatives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
-    self.fp = self.add_weight(
+    self.false_positives = self.add_weight(
         'false_positives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
-    self.fn = self.add_weight(
+    self.false_negatives = self.add_weight(
         'false_negatives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
@@ -1331,12 +1375,17 @@ class SensitivitySpecificityBase(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
-        _ConfusionMatrix.TRUE_NEGATIVES: self.tn,
-        _ConfusionMatrix.FALSE_POSITIVES: self.fp,
-        _ConfusionMatrix.FALSE_NEGATIVES: self.fn,
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {
+            metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+            metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives,
+            metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives,
+            metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives,
+        },
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        sample_weight=sample_weight)
 
   def reset_states(self):
     num_thresholds = len(self.thresholds)
@@ -1395,12 +1444,15 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
     """
     if specificity < 0 or specificity > 1:
       raise ValueError('`specificity` must be in the range [0, 1].')
+    self.specificity = specificity
+    self.num_thresholds = num_thresholds
     super(SensitivityAtSpecificity, self).__init__(
         specificity, num_thresholds=num_thresholds, name=name, dtype=dtype)
 
   def result(self):
     # Calculate specificities at all the thresholds.
-    specificities = math_ops.div_no_nan(self.tn, self.tn + self.fp)
+    specificities = math_ops.div_no_nan(
+        self.true_negatives, self.true_negatives + self.false_positives)
 
     # Find the index of the threshold where the specificity is closest to the
     # given specificity.
@@ -1409,8 +1461,17 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
     min_index = math_ops.cast(min_index, dtypes.int32)
 
     # Compute sensitivity at that index.
-    return math_ops.div_no_nan(self.tp[min_index],
-                               self.tp[min_index] + self.fn[min_index])
+    return math_ops.div_no_nan(
+        self.true_positives[min_index],
+        self.true_positives[min_index] + self.false_negatives[min_index])
+
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'specificity': self.specificity
+    }
+    base_config = super(SensitivityAtSpecificity, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 @keras_export('keras.metrics.SpecificityAtSensitivity')
@@ -1464,12 +1525,15 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     """
     if sensitivity < 0 or sensitivity > 1:
       raise ValueError('`sensitivity` must be in the range [0, 1].')
+    self.sensitivity = sensitivity
+    self.num_thresholds = num_thresholds
     super(SpecificityAtSensitivity, self).__init__(
         sensitivity, num_thresholds=num_thresholds, name=name, dtype=dtype)
 
   def result(self):
     # Calculate sensitivities at all the thresholds.
-    sensitivities = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+    sensitivities = math_ops.div_no_nan(
+        self.true_positives, self.true_positives + self.false_negatives)
 
     # Find the index of the threshold where the sensitivity is closest to the
     # given specificity.
@@ -1478,10 +1542,274 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     min_index = math_ops.cast(min_index, dtypes.int32)
 
     # Compute specificity at that index.
-    return math_ops.div_no_nan(self.tn[min_index],
-                               self.tn[min_index] + self.fp[min_index])
+    return math_ops.div_no_nan(
+        self.true_negatives[min_index],
+        self.true_negatives[min_index] + self.false_positives[min_index])
+
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'sensitivity': self.sensitivity
+    }
+    base_config = super(SpecificityAtSensitivity, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@keras_export('keras.metrics.AUC')
+class AUC(Metric):
+  """Computes the approximate AUC (Area under the curve) via a Riemann sum.
+
+  This metric creates four local variables, `true_positives`, `true_negatives`,
+  `false_positives` and `false_negatives` that are used to compute the AUC.
+  To discretize the AUC curve, a linearly spaced set of thresholds is used to
+  compute pairs of recall and precision values. The area under the ROC-curve is
+  therefore computed using the height of the recall values by the false positive
+  rate, while the area under the PR-curve is the computed using the height of
+  the precision values by the recall.
+
+  This value is ultimately returned as `auc`, an idempotent operation that
+  computes the area under a discretized curve of precision versus recall values
+  (computed using the aforementioned variables). The `num_thresholds` variable
+  controls the degree of discretization with larger numbers of thresholds more
+  closely approximating the true AUC. The quality of the approximation may vary
+  dramatically depending on `num_thresholds`.
+
+  For best results, `predictions` should be distributed approximately uniformly
+  in the range [0, 1] and not peaked around 0 or 1. The quality of the AUC
+  approximation may be poor if this is not the case. Setting `summation_method`
+  to 'minoring' or 'majoring' can help quantify the error in the approximation
+  by providing lower or upper bound estimate of the AUC.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.AUC(num_thresholds=3)
+  m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
+
+  # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7]
+  # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+  # recall = [1, 0.5, 0], fp_rate = [1, 0, 0]
+  # auc = ((((1+0.5)/2)*(1-0))+ (((0.5+0)/2)*(0-0))) = 0.75
+
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.AUC()])
+  ```
+  """
 
+  def __init__(self,
+               num_thresholds=200,
+               curve=metrics_utils.AUCCurve.ROC,
+               summation_method=metrics_utils.AUCSummationMethod.INTERPOLATION,
+               name=None,
+               dtype=None):
+    """Creates an `AUC` instance.
+
+    Args:
+      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+        use when discretizing the roc curve. Values must be > 1.
+      curve: (Optional) Specifies the name of the curve to be computed, 'ROC'
+        [default] or 'PR' for the Precision-Recall-curve.
+      summation_method: (Optional) Specifies the Riemann summation method used
+        (https://en.wikipedia.org/wiki/Riemann_sum): 'interpolation' [default],
+          applies mid-point summation scheme for `ROC`. For PR-AUC, interpolates
+          (true/false) positives but not the ratio that is precision (see Davis
+          & Goadrich 2006 for details); 'minoring' that applies left summation
+          for increasing intervals and right summation for decreasing intervals;
+          'majoring' that does the opposite.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    # Validate configurations.
+    if num_thresholds <= 1:
+      raise ValueError('`num_thresholds` must be > 1.')
+    if curve not in list(metrics_utils.AUCCurve):
+      raise ValueError('Invalid curve: "{}". Valid options are: "{}"'.format(
+          curve, list(metrics_utils.AUCCurve)))
+    if summation_method not in list(metrics_utils.AUCSummationMethod):
+      raise ValueError(
+          'Invalid summation method: "{}". Valid options are: "{}"'.format(
+              summation_method, list(metrics_utils.AUCSummationMethod)))
+
+    # Update properties.
+    self.num_thresholds = num_thresholds
+    self.curve = curve
+    self.summation_method = summation_method
+    super(AUC, self).__init__(name=name, dtype=dtype)
+
+    # Create metric variables
+    self.true_positives = self.add_weight(
+        'true_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.true_negatives = self.add_weight(
+        'true_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.false_positives = self.add_weight(
+        'false_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.false_negatives = self.add_weight(
+        'false_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+
+    # Compute `num_thresholds` thresholds in [0, 1]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
+    self.thresholds = [0.0 - K.epsilon()] + thresholds + [1.0 + K.epsilon()]
+    # epsilon - to account for floating point imprecisions.
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates confusion matrix statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return metrics_utils.update_confusion_matrix_variables({
+        metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+        metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives,
+        metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives,
+        metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives,
+    }, y_true, y_pred, self.thresholds, sample_weight=sample_weight)
+
+  def interpolate_pr_auc(self):
+    """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.
+
+    https://www.biostat.wisc.edu/~page/rocpr.pdf
+
+    Note here we derive & use a closed formula not present in the paper
+    as follows:
+
+      Precision = TP / (TP + FP) = TP / P
+
+    Modeling all of TP (true positive), FP (false positive) and their sum
+    P = TP + FP (predicted positive) as varying linearly within each interval
+    [A, B] between successive thresholds, we get
+
+      Precision slope = dTP / dP
+                      = (TP_B - TP_A) / (P_B - P_A)
+                      = (TP - TP_A) / (P - P_A)
+      Precision = (TP_A + slope * (P - P_A)) / P
+
+    The area within the interval is (slope / total_pos_weight) times
+
+      int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
+      int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}
+
+    where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in
+
+      int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)
+
+    Bringing back the factor (slope / total_pos_weight) we'd put aside, we get
+
+      slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight
+
+    where dTP == TP_B - TP_A.
+
+    Note that when P_A == 0 the above calculation simplifies into
+
+      int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)
+
+    which is really equivalent to imputing constant precision throughout the
+    first bucket having >0 true positives.
+
+    Returns:
+      pr_auc: an approximation of the area under the P-R curve.
+    """
+    dtp = self.true_positives[:self.num_thresholds -
+                              1] - self.true_positives[1:]
+    p = self.true_positives + self.false_positives
+    dp = p[:self.num_thresholds - 1] - p[1:]
+
+    prec_slope = math_ops.div_no_nan(
+        dtp, math_ops.maximum(dp, 0), name='prec_slope')
+    intercept = self.true_positives[1:] - math_ops.multiply(prec_slope, p[1:])
+
+    safe_p_ratio = array_ops.where(
+        math_ops.logical_and(p[:self.num_thresholds - 1] > 0, p[1:] > 0),
+        math_ops.div_no_nan(
+            p[:self.num_thresholds - 1],
+            math_ops.maximum(p[1:], 0),
+            name='recall_relative_ratio'),
+        array_ops.ones_like(p[1:]))
+
+    return math_ops.reduce_sum(
+        math_ops.div_no_nan(
+            prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
+            math_ops.maximum(self.true_positives[1:] + self.false_negatives[1:],
+                             0),
+            name='pr_auc_increment'),
+        name='interpolate_pr_auc')
+
+  def result(self):
+    if (self.curve == metrics_utils.AUCCurve.PR and
+        self.summation_method == metrics_utils.AUCSummationMethod.INTERPOLATION
+       ):
+      # This use case is different and is handled separately.
+      return self.interpolate_pr_auc()
+
+    # Set `x` and `y` values for the curves based on `curve` config.
+    recall = math_ops.div_no_nan(self.true_positives,
+                                 self.true_positives + self.false_negatives)
+    if self.curve == metrics_utils.AUCCurve.ROC:
+      fp_rate = math_ops.div_no_nan(self.false_positives,
+                                    self.false_positives + self.true_negatives)
+      x = fp_rate
+      y = recall
+    else:  # curve == 'PR'.
+      precision = math_ops.div_no_nan(
+          self.true_positives, self.true_positives + self.false_positives)
+      x = recall
+      y = precision
+
+    # Find the rectangle heights based on `summation_method`.
+    if self.summation_method == metrics_utils.AUCSummationMethod.INTERPOLATION:
+      # Note: the case ('PR', 'interpolation') has been handled above.
+      heights = (y[:self.num_thresholds - 1] + y[1:]) / 2.
+    elif self.summation_method == metrics_utils.AUCSummationMethod.MINORING:
+      heights = math_ops.minimum(y[:self.num_thresholds - 1], y[1:])
+    else:  # self.summation_method = metrics_utils.AUCSummationMethod.MAJORING:
+      heights = math_ops.maximum(y[:self.num_thresholds - 1], y[1:])
+
+    # Sum up the areas of all the rectangles.
+    return math_ops.reduce_sum(
+        math_ops.multiply(x[:self.num_thresholds - 1] - x[1:], heights),
+        name=self.name)
+
+  def reset_states(self):
+    num_thresholds = len(self.thresholds)
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'curve': self.curve,
+        'summation_method': self.summation_method,
+    }
+    base_config = super(AUC, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
+
+@keras_export('keras.metrics.CosineProximity')
 class CosineProximity(MeanMetricWrapper):
   """Computes the cosine distance between the labels and predictions.
 
@@ -1509,76 +1837,783 @@ class CosineProximity(MeanMetricWrapper):
   ```
   """
 
-  def __init__(self, name='cosine_proximity', dtype=None):
-    super(CosineProximity, self).__init__(cosine, name, dtype=dtype)
+  def __init__(self, name='cosine_proximity', dtype=None, axis=-1):
+    """Creates a `CosineProximity` instance.
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CosineProximity, cls).from_config(config)
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      axis: (Optional) Defaults to -1. The dimension along which the cosine
+        proximity is computed.
+    """
+    super(CosineProximity, self).__init__(cosine, name, dtype=dtype, axis=axis)
 
 
-def accuracy(y_true, y_pred):
-  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
-  if y_true.dtype != y_pred.dtype:
-    y_pred = math_ops.cast(y_pred, y_true.dtype)
-  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+@keras_export('keras.metrics.MeanAbsoluteError')
+class MeanAbsoluteError(MeanMetricWrapper):
+  """Computes the mean absolute error between the labels and predictions.
 
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean absolute error is 3/4 (0.75).
 
-@keras_export('keras.metrics.binary_accuracy')
-def binary_accuracy(y_true, y_pred, threshold=0.5):
-  threshold = math_ops.cast(threshold, y_pred.dtype)
-  y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
-  return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
+  Usage:
+  ```python
+  m = tf.metrics.MeanAbsoluteError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
 
+  Usage with tf.keras API:
 
-@keras_export('keras.metrics.categorical_accuracy')
-def categorical_accuracy(y_true, y_pred):
-  return math_ops.cast(
-      math_ops.equal(
-          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
-      K.floatx())
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsoluteError()])
+  ```
+  """
 
+  def __init__(self, name='mean_absolute_error', dtype=None):
+    super(MeanAbsoluteError, self).__init__(
+        mean_absolute_error, name, dtype=dtype)
 
-@keras_export('keras.metrics.sparse_categorical_accuracy')
-def sparse_categorical_accuracy(y_true, y_pred):
-  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
-  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
-    y_true = array_ops.squeeze(y_true, [-1])
-  y_pred = math_ops.argmax(y_pred, axis=-1)
 
-  # If the predicted output and actual output types don't match, force cast them
-  # to match.
-  if K.dtype(y_pred) != K.dtype(y_true):
-    y_pred = math_ops.cast(y_pred, K.dtype(y_true))
+@keras_export('keras.metrics.MeanAbsolutePercentageError')
+class MeanAbsolutePercentageError(MeanMetricWrapper):
+  """Computes the mean absolute percentage error between `y_true` and `y_pred`.
 
-  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean absolute percentage error is 5e+08.
 
+  Usage:
 
-@keras_export('keras.metrics.top_k_categorical_accuracy')
-def top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(
-      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
+  ```python
+  m = tf.keras.metrics.MeanAbsolutePercentageError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 5e+08
+  ```
 
+  Usage with tf.keras API:
 
-@keras_export('keras.metrics.sparse_top_k_categorical_accuracy')
-def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
-  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
-  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
-    y_true = array_ops.squeeze(y_true, [-1])
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
+  ```
+  """
 
-  return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), axis=-1)
+  def __init__(self, name='mean_absolute_percentage_error', dtype=None):
+    super(MeanAbsolutePercentageError, self).__init__(
+        mean_absolute_percentage_error, name, dtype=dtype)
 
-# Aliases
 
-mse = MSE = mean_squared_error
-mae = MAE = mean_absolute_error
-mape = MAPE = mean_absolute_percentage_error
-msle = MSLE = mean_squared_logarithmic_error
+@keras_export('keras.metrics.MeanSquaredError')
+class MeanSquaredError(MeanMetricWrapper):
+  """Computes the mean squared error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean squared error is 3/4 (0.75).
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanSquaredError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredError()])
+  ```
+  """
+
+  def __init__(self, name='mean_squared_error', dtype=None):
+    super(MeanSquaredError, self).__init__(
+        mean_squared_error, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.MeanSquaredLogarithmicError')
+class MeanSquaredLogarithmicError(MeanMetricWrapper):
+  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean squared logarithmic error is 0.36034.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanSquaredLogarithmicError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.36034
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()])
+  ```
+  """
+
+  def __init__(self, name='mean_squared_logarithmic_error', dtype=None):
+    super(MeanSquaredLogarithmicError, self).__init__(
+        mean_squared_logarithmic_error, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.Hinge')
+class Hinge(MeanMetricWrapper):
+  """Computes the hinge metric between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.]
+  the hinge metric value is 0.66.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Hinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.Hinge()])
+  ```
+  """
+
+  def __init__(self, name='hinge', dtype=None):
+    super(Hinge, self).__init__(hinge, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.SquaredHinge')
+class SquaredHinge(MeanMetricWrapper):
+  """Computes the squared hinge metric between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.]
+  the squared hinge metric value is 0.66.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SquaredHinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.SquaredHinge()])
+  ```
+  """
+
+  def __init__(self, name='squared_hinge', dtype=None):
+    super(SquaredHinge, self).__init__(squared_hinge, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.CategoricalHinge')
+class CategoricalHinge(MeanMetricWrapper):
+  """Computes the categorical hinge metric between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.]
+  the categorical hinge metric value is 1.0.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalHinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.CategoricalHinge()])
+  ```
+  """
+
+  def __init__(self, name='categorical_hinge', dtype=None):
+    super(CategoricalHinge, self).__init__(categorical_hinge, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.RootMeanSquaredError')
+class RootMeanSquaredError(Mean):
+  """Computes root mean squared error metric between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.RootMeanSquaredError()
+  m.update_state([2., 4., 6.], [1., 3., 2.])
+  print('Final result: ', m.result().numpy())  # Final result: 2.449
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.RootMeanSquaredError()])
+  ```
+  """
+
+  def __init__(self, name='root_mean_squared_error', dtype=None):
+    super(RootMeanSquaredError, self).__init__(name, dtype=dtype)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates root mean squared error statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+    error_sq = math_ops.squared_difference(y_pred, y_true)
+    return super(RootMeanSquaredError, self).update_state(
+        error_sq, sample_weight=sample_weight)
+
+  def result(self):
+    return math_ops.sqrt(math_ops.div_no_nan(self.total, self.count))
+
+
+@keras_export('keras.metrics.LogCoshError')
+class LogCoshError(MeanMetricWrapper):
+  """Computes the logarithm of the hyperbolic cosine of the prediction error.
+
+  `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true)
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.LogCoshError()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.289
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.LogCoshError()])
+  ```
+  """
+
+  def __init__(self, name='logcosh', dtype=None):
+    super(LogCoshError, self).__init__(logcosh, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.Poisson')
+class Poisson(MeanMetricWrapper):
+  """Computes the Poisson metric between `y_true` and `y_pred`.
+
+  `metric = y_pred - y_true * log(y_pred)`
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Poisson()
+  m.update_state([1, 9, 2], [4, 8, 12])
+  print('Final result: ', m.result().numpy())  # Final result: -4.63
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.Poisson()])
+  ```
+  """
+
+  def __init__(self, name='poisson', dtype=None):
+    super(Poisson, self).__init__(poisson, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.KLDivergence')
+class KLDivergence(MeanMetricWrapper):
+  """Computes Kullback Leibler divergence metric between `y_true` and `y_pred`.
+
+  `metric = y_true * log(y_true / y_pred)`
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.KLDivergence()
+  m.update_state([.4, .9, .2], [.5, .8, .12])
+  print('Final result: ', m.result().numpy())  # Final result: -0.043
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.KLDivergence()])
+  ```
+  """
+
+  def __init__(self, name='kullback_leibler_divergence', dtype=None):
+    super(KLDivergence, self).__init__(
+        kullback_leibler_divergence, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.MeanIoU')
+class MeanIoU(Metric):
+  """Computes the mean Intersection-Over-Union metric.
+
+  Mean Intersection-Over-Union is a common evaluation metric for semantic image
+  segmentation, which first computes the IOU for each semantic class and then
+  computes the average over classes. IOU is defined as follows:
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+  The predictions are accumulated in a confusion matrix, weighted by
+  `sample_weight` and the metric is then calculated from it.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanIoU(num_classes=2)
+  m.update_state([0, 0, 1, 1], [0, 1, 0, 1])
+
+    # cm = [[1, 1],
+            [1, 1]]
+    # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    # result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 = 0.33
+  print('Final result: ', m.result().numpy())  # Final result: 0.33
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.MeanIoU(num_classes=2)])
+  ```
+  """
+
+  def __init__(self, num_classes, name=None, dtype=None):
+    """Creates a `MeanIoU` instance.
+
+    Args:
+      num_classes: The possible number of labels the prediction task can have.
+        This value must be provided, since a confusion matrix of dimension =
+        [num_classes, num_classes] will be allocated.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanIoU, self).__init__(name=name, dtype=dtype)
+    self.num_classes = num_classes
+
+    # Variable to accumulate the predictions in the confusion matrix. Setting
+    # the type to be `float64` as required by confusion_matrix_ops.
+    self.total_cm = self.add_weight(
+        'total_confusion_matrix',
+        shape=(num_classes, num_classes),
+        initializer=init_ops.zeros_initializer,
+        dtype=dtypes.float64)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates the confusion matrix statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    # Flatten the input if its rank > 1.
+    if y_pred.shape.ndims > 1:
+      y_pred = array_ops.reshape(y_pred, [-1])
+
+    if y_true.shape.ndims > 1:
+      y_true = array_ops.reshape(y_true, [-1])
+
+    if sample_weight is not None and sample_weight.shape.ndims > 1:
+      sample_weight = array_ops.reshape(sample_weight, [-1])
+
+    # Accumulate the prediction to current confusion matrix.
+    current_cm = confusion_matrix.confusion_matrix(
+        y_true,
+        y_pred,
+        self.num_classes,
+        weights=sample_weight,
+        dtype=dtypes.float64)
+    return self.total_cm.assign_add(current_cm)
+
+  def result(self):
+    """Compute the mean intersection-over-union via the confusion matrix."""
+    sum_over_row = math_ops.cast(
+        math_ops.reduce_sum(self.total_cm, axis=0), dtype=self._dtype)
+    sum_over_col = math_ops.cast(
+        math_ops.reduce_sum(self.total_cm, axis=1), dtype=self._dtype)
+    true_positives = math_ops.cast(
+        array_ops.diag_part(self.total_cm), dtype=self._dtype)
+
+    # sum_over_row + sum_over_col =
+    #     2 * true_positives + false_positives + false_negatives.
+    denominator = sum_over_row + sum_over_col - true_positives
+
+    # The mean is only computed over classes that appear in the
+    # label or prediction tensor. If the denominator is 0, we need to
+    # ignore the class.
+    num_valid_entries = math_ops.reduce_sum(
+        math_ops.cast(math_ops.not_equal(denominator, 0), dtype=self._dtype))
+
+    iou = math_ops.div_no_nan(true_positives, denominator)
+
+    return math_ops.div_no_nan(
+        math_ops.reduce_sum(iou, name='mean_iou'), num_valid_entries)
+
+  def reset_states(self):
+    K.set_value(self.total_cm, np.zeros((self.num_classes, self.num_classes)))
+
+  def get_config(self):
+    config = {'num_classes': self.num_classes}
+    base_config = super(MeanIoU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@keras_export('keras.metrics.MeanTensor')
+class MeanTensor(Metric):
+  """Computes the element-wise (weighted) mean of the given tensors.
+
+  `MeanTensor` returns a tensor with the same shape of the input tensors. The
+  mean value is updated by keeping local variables `total` and `count`. The
+  `total` tracks the sum of the weighted values, and `count` stores the sum of
+  the weighted counts.
+
+  Usage:
+
+  ```python
+  m = tf.metrics.MeanTensor()
+  m.update_state([0, 1, 2, 3])
+  m.update_state([4, 5, 6, 7])
+  print('Result: ', m.result().numpy())  # Result: [2, 3, 4, 5]
+  m.update_state([12, 10, 8, 6], sample_weights= [0, 0.2, 0.5, 1])
+  print('Result: ', m.result().numpy())  # Result: [2, 3.636, 4.8, 5.333]
+  ```
+  """
+
+  def __init__(self, name='mean_tensor', dtype=None):
+    """Creates a `MeanTensor` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanTensor, self).__init__(name=name, dtype=dtype)
+    self._shape = None
+    self._total = None
+    self._count = None
+    self._built = False
+
+  def _build(self, shape):
+    self._shape = tensor_shape.TensorShape(shape)
+    # Create new state variables
+    self._total = self.add_weight(
+        'total', shape=shape, initializer=init_ops.zeros_initializer)
+    self._count = self.add_weight(
+        'count', shape=shape, initializer=init_ops.zeros_initializer)
+    with ops.init_scope():
+      if not context.executing_eagerly():
+        K._initialize_variables(K._get_session())  # pylint: disable=protected-access
+    self._built = True
+
+  @property
+  def total(self):
+    return self._total if self._built else None
+
+  @property
+  def count(self):
+    return self._count if self._built else None
+
+  def update_state(self, values, sample_weight=None):
+    """Accumulates statistics for computing the element-wise mean.
+
+    Args:
+      values: Per-example value.
+      sample_weight: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      Update op.
+    """
+    values = math_ops.cast(values, self._dtype)
+    if not self._built:
+      self._build(values.shape)
+    elif values.shape != self._shape:
+      raise ValueError('MeanTensor input values must always have the same '
+                       'shape. Expected shape (set during the first call): {}. '
+                       'Got: {}'.format(self._shape, values.get_shape()))
+
+    num_values = array_ops.ones_like(values)
+    if sample_weight is not None:
+      sample_weight = math_ops.cast(sample_weight, self._dtype)
+
+      # Update dimensions of weights to match with values if possible.
+      values, _, sample_weight = squeeze_or_expand_dimensions(
+          values, None, sample_weight)
+      try:
+        # Broadcast weights if possible.
+        sample_weight = weights_broadcast_ops.broadcast_weights(
+            sample_weight, values)
+      except ValueError:
+        # Reduce values to same ndim as weight array
+        ndim = K.ndim(values)
+        weight_ndim = K.ndim(sample_weight)
+        values = math_ops.reduce_mean(
+            values, axis=list(range(weight_ndim, ndim)))
+
+      num_values = math_ops.multiply(num_values, sample_weight)
+      values = math_ops.multiply(values, sample_weight)
+
+    update_total_op = self._total.assign_add(values)
+    with ops.control_dependencies([update_total_op]):
+      return self._count.assign_add(num_values)
+
+  def result(self):
+    if not self._built:
+      raise ValueError(
+          'MeanTensor does not have any result yet. Please call the MeanTensor '
+          'instance or use `.update_state(value)` before retrieving the result.'
+          )
+    return math_ops.div_no_nan(self.total, self.count)
+
+  def reset_states(self):
+    if self._built:
+      for v in self.variables:
+        K.set_value(v, np.zeros(self._shape.as_list()))
+
+
+@keras_export('keras.metrics.BinaryCrossentropy')
+class BinaryCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  This is the crossentropy metric class to be used when there are only two
+  label classes (0 and 1).
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.BinaryCrossentropy()
+  m.update_state([1., 0., 1., 0.], [1., 1., 1., 0.])
+
+  # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+  # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+  # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+  # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+  #        = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+  #           -log(Y_MAX + EPSILON), -log(1)]
+  #        = [(0 + 15.33) / 2, (0 + 0) / 2]
+  # Reduced metric = 7.665 / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 3.833
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.BinaryCrossentropy()])
+  ```
+  """
+
+  def __init__(self,
+               name='binary_crossentropy',
+               dtype=None,
+               from_logits=False,
+               label_smoothing=0):
+    """Creates a `BinaryCrossentropy` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      from_logits: (Optional )Whether output is expected to be a logits tensor.
+        By default, we consider that output encodes a probability distribution.
+      label_smoothing: (Optional) Float in [0, 1]. When > 0, label values are
+        smoothed, meaning the confidence on label values are relaxed.
+        e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for
+        label `0` and `0.9` for label `1`"
+    """
+    label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
+
+    super(BinaryCrossentropy, self).__init__(
+        binary_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
+
+
+@keras_export('keras.metrics.CategoricalCrossentropy')
+class CategoricalCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  This is the crossentropy metric class to be used when there are multiple
+  label classes (2 or more). Here we assume that labels are given as a `one_hot`
+  representation. eg., When labels values are [2, 0, 1],
+   `y_true` = [[0, 0, 1], [1, 0, 0], [0, 1, 0]].
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalCrossentropy()
+  m.update_state([[0, 1, 0], [0, 0, 1]],
+                 [[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+
+  # EPSILON = 1e-7, y = y_true, y` = y_pred
+  # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+  # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+  # xent = -sum(y * log(y'), axis = -1)
+  #      = -((log 0.95), (log 0.1))
+  #      = [0.051, 2.302]
+  # Reduced xent = (0.051 + 2.302) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.176
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.CategoricalCrossentropy()])
+  ```
+
+  Args:
+    name: (Optional) string name of the metric instance.
+    dtype: (Optional) data type of the metric result.
+    from_logits: (Optional ) Whether `y_pred` is expected to be a logits tensor.
+      By default, we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+      meaning the confidence on label values are relaxed. e.g.
+      `label_smoothing=0.2` means that we will use a value of `0.1` for label
+      `0` and `0.9` for label `1`"
+  """
+
+  def __init__(self,
+               name='categorical_crossentropy',
+               dtype=None,
+               from_logits=False,
+               label_smoothing=0):
+    label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
+
+    super(CategoricalCrossentropy, self).__init__(
+        categorical_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
+
+
+def accuracy(y_true, y_pred):
+  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+  if y_true.dtype != y_pred.dtype:
+    y_pred = math_ops.cast(y_pred, y_true.dtype)
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
+
+@keras_export('keras.metrics.binary_accuracy')
+def binary_accuracy(y_true, y_pred, threshold=0.5):
+  threshold = math_ops.cast(threshold, y_pred.dtype)
+  y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
+  return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
+
+
+@keras_export('keras.metrics.categorical_accuracy')
+def categorical_accuracy(y_true, y_pred):
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
+      K.floatx())
+
+
+@keras_export('keras.metrics.sparse_categorical_accuracy')
+def sparse_categorical_accuracy(y_true, y_pred):
+  y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
+  y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
+  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
+  if (y_true_rank is not None) and (y_pred_rank is not None) and (len(
+      K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+    y_true = array_ops.squeeze(y_true, [-1])
+  y_pred = math_ops.argmax(y_pred, axis=-1)
+
+  # If the predicted output and actual output types don't match, force cast them
+  # to match.
+  if K.dtype(y_pred) != K.dtype(y_true):
+    y_pred = math_ops.cast(y_pred, K.dtype(y_true))
+
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
+
+@keras_export('keras.metrics.top_k_categorical_accuracy')
+def top_k_categorical_accuracy(y_true, y_pred, k=5):
+  return K.mean(
+      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
+
+
+@keras_export('keras.metrics.sparse_top_k_categorical_accuracy')
+def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
+  y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
+  y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
+  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
+  if (y_true_rank is not None) and (y_pred_rank is not None) and (len(
+      K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+    y_true = array_ops.squeeze(y_true, [-1])
+
+  return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), axis=-1)
+
+# Aliases
+
+mse = MSE = mean_squared_error
+mae = MAE = mean_absolute_error
+mape = MAPE = mean_absolute_percentage_error
+msle = MSLE = mean_squared_logarithmic_error
 cosine = cosine_proximity
 
 
+def clone_metric(metric):
+  """Returns a clone of the metric if stateful, otherwise returns it as is."""
+  if isinstance(metric, Metric):
+    return metric.__class__.from_config(metric.get_config())
+  return metric
+
+
+def clone_metrics(metrics):
+  """Clones the given metric list/dict."""
+  if metrics is None:
+    return None
+  if isinstance(metrics, dict):
+    return {key: clone_metric(value) for key, value in metrics.items()}
+  return [clone_metric(metric) for metric in metrics]
+
+
 @keras_export('keras.metrics.serialize')
 def serialize(metric):
   return serialize_keras_object(metric)
@@ -1604,3 +2639,7 @@ def get(identifier):
   else:
     raise ValueError('Could not interpret '
                      'metric function identifier: %s' % identifier)
+
+
+def _is_tensor_or_variable(x):
+  return tensor_util.is_tensor(x) or isinstance(x, tf_variables.Variable)
diff --git a/tensorflow/python/keras/metrics_confusion_matrix_test.py b/tensorflow/python/keras/metrics_confusion_matrix_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a0fcfed07730e6d72a16638a8dc0f20860a97b6
--- /dev/null
+++ b/tensorflow/python/keras/metrics_confusion_matrix_test.py
@@ -0,0 +1,1120 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras metrics functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras.utils import metrics_utils
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalsePositivesTest(test.TestCase):
+
+  def test_config(self):
+    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
+    self.assertEqual(fp_obj.name, 'my_fp')
+    self.assertEqual(len(fp_obj.variables), 1)
+    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    fp_obj2 = metrics.FalsePositives.from_config(fp_obj.get_config())
+    self.assertEqual(fp_obj2.name, 'my_fp')
+    self.assertEqual(len(fp_obj2.variables), 1)
+    self.assertEqual(fp_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(14., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose([7., 4., 2.], result)
+
+  def test_weighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
+                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
+
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([125., 42., 12.], self.evaluate(result))
+
+  def test_threshold_limit(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
+      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[None\]'):
+      metrics.FalsePositives(thresholds=[None])
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalseNegativesTest(test.TestCase):
+
+  def test_config(self):
+    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
+    self.assertEqual(fn_obj.name, 'my_fn')
+    self.assertEqual(len(fn_obj.variables), 1)
+    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    fn_obj2 = metrics.FalseNegatives.from_config(fn_obj.get_config())
+    self.assertEqual(fn_obj2.name, 'my_fn')
+    self.assertEqual(len(fn_obj2.variables), 1)
+    self.assertEqual(fn_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(5., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose([1., 4., 6.], result)
+
+  def test_weighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TrueNegativesTest(test.TestCase):
+
+  def test_config(self):
+    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
+    self.assertEqual(tn_obj.name, 'my_tn')
+    self.assertEqual(len(tn_obj.variables), 1)
+    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    tn_obj2 = metrics.TrueNegatives.from_config(tn_obj.get_config())
+    self.assertEqual(tn_obj2.name, 'my_tn')
+    self.assertEqual(len(tn_obj2.variables), 1)
+    self.assertEqual(tn_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(4., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose([2., 5., 7.], result)
+
+  def test_weighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TruePositivesTest(test.TestCase):
+
+  def test_config(self):
+    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
+    self.assertEqual(tp_obj.name, 'my_tp')
+    self.assertEqual(len(tp_obj.variables), 1)
+    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    tp_obj2 = metrics.TruePositives.from_config(tp_obj.get_config())
+    self.assertEqual(tp_obj2.name, 'my_tp')
+    self.assertEqual(len(tp_obj2.variables), 1)
+    self.assertEqual(tp_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(12., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose([6., 3., 1.], result)
+
+  def test_weighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    result = tp_obj(y_true, y_pred, sample_weight=37.)
+    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PrecisionTest(test.TestCase):
+
+  def test_config(self):
+    p_obj = metrics.Precision(
+        name='my_precision', thresholds=[0.4, 0.9], top_k=15, class_id=12)
+    self.assertEqual(p_obj.name, 'my_precision')
+    self.assertEqual(len(p_obj.variables), 2)
+    self.assertEqual([v.name for v in p_obj.variables],
+                     ['true_positives:0', 'false_positives:0'])
+    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
+    self.assertEqual(p_obj.top_k, 15)
+    self.assertEqual(p_obj.class_id, 12)
+
+    # Check save and restore config
+    p_obj2 = metrics.Precision.from_config(p_obj.get_config())
+    self.assertEqual(p_obj2.name, 'my_precision')
+    self.assertEqual(len(p_obj2.variables), 2)
+    self.assertEqual(p_obj2.thresholds, [0.4, 0.9])
+    self.assertEqual(p_obj2.top_k, 15)
+    self.assertEqual(p_obj2.class_id, 12)
+
+  def test_value_is_idempotent(self):
+    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = p_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_precision = self.evaluate(p_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
+                           1e-3)
+
+  def test_unweighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    p_obj = metrics.Precision(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 4.0
+    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
+                         1e-3)
+
+  def test_unweighted_top_k(self):
+    p_obj = metrics.Precision(top_k=3)
+    y_pred = constant_op.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1. / 3, self.evaluate(result))
+
+  def test_weighted_top_k(self):
+    p_obj = metrics.Precision(top_k=3)
+    y_pred1 = constant_op.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5))
+    y_true1 = constant_op.constant([0, 1, 1, 0, 1], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    self.evaluate(
+        p_obj(
+            y_true1,
+            y_pred1,
+            sample_weight=constant_op.constant([[1, 4, 2, 3, 5]])))
+
+    y_pred2 = constant_op.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5))
+    y_true2 = constant_op.constant([1, 0, 1, 1, 1], shape=(1, 5))
+    result = p_obj(y_true2, y_pred2, sample_weight=constant_op.constant(3))
+
+    tp = (2 + 5) + (3 + 3)
+    predicted_positives = (1 + 2 + 5) + (3 + 3 + 3)
+    expected_precision = tp / predicted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_unweighted_class_id(self):
+    p_obj = metrics.Precision(class_id=2)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 0, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.false_positives))
+
+  def test_unweighted_top_k_and_class_id(self):
+    p_obj = metrics.Precision(class_id=2, top_k=2)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+    y_pred = constant_op.constant([1, 1, 0.9, 1, 1], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+  def test_unweighted_top_k_and_threshold(self):
+    p_obj = metrics.Precision(thresholds=.7, top_k=2)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 1], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RecallTest(test.TestCase):
+
+  def test_config(self):
+    r_obj = metrics.Recall(
+        name='my_recall', thresholds=[0.4, 0.9], top_k=15, class_id=12)
+    self.assertEqual(r_obj.name, 'my_recall')
+    self.assertEqual(len(r_obj.variables), 2)
+    self.assertEqual([v.name for v in r_obj.variables],
+                     ['true_positives:0', 'false_negatives:0'])
+    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
+    self.assertEqual(r_obj.top_k, 15)
+    self.assertEqual(r_obj.class_id, 12)
+
+    # Check save and restore config
+    r_obj2 = metrics.Recall.from_config(r_obj.get_config())
+    self.assertEqual(r_obj2.name, 'my_recall')
+    self.assertEqual(len(r_obj2.variables), 2)
+    self.assertEqual(r_obj2.thresholds, [0.4, 0.9])
+    self.assertEqual(r_obj2.top_k, 15)
+    self.assertEqual(r_obj2.class_id, 12)
+
+  def test_value_is_idempotent(self):
+    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = r_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_recall = self.evaluate(r_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+
+  def test_unweighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    r_obj = metrics.Recall(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 1.0
+    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
+    expected_recall = weighted_tp / weighted_t
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
+                         1e-3)
+
+  def test_unweighted_top_k(self):
+    r_obj = metrics.Recall(top_k=3)
+    y_pred = constant_op.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_weighted_top_k(self):
+    r_obj = metrics.Recall(top_k=3)
+    y_pred1 = constant_op.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5))
+    y_true1 = constant_op.constant([0, 1, 1, 0, 1], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    self.evaluate(
+        r_obj(
+            y_true1,
+            y_pred1,
+            sample_weight=constant_op.constant([[1, 4, 2, 3, 5]])))
+
+    y_pred2 = constant_op.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5))
+    y_true2 = constant_op.constant([1, 0, 1, 1, 1], shape=(1, 5))
+    result = r_obj(y_true2, y_pred2, sample_weight=constant_op.constant(3))
+
+    tp = (2 + 5) + (3 + 3)
+    positives = (4 + 2 + 5) + (3 + 3 + 3 + 3)
+    expected_recall = tp / positives
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_unweighted_class_id(self):
+    r_obj = metrics.Recall(class_id=2)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 0, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+  def test_unweighted_top_k_and_class_id(self):
+    r_obj = metrics.Recall(class_id=2, top_k=2)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives))
+
+    y_pred = constant_op.constant([1, 1, 0.9, 1, 1], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+  def test_unweighted_top_k_and_threshold(self):
+    r_obj = metrics.Recall(thresholds=.7, top_k=2)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([1, 1, 1, 0, 1], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.25, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(3, self.evaluate(r_obj.false_negatives))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SensitivityAtSpecificity(
+        0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
+    self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.specificity, 0.4)
+    self.assertEqual(s_obj.num_thresholds, 100)
+
+    # Check save and restore config
+    s_obj2 = metrics.SensitivityAtSpecificity.from_config(s_obj.get_config())
+    self.assertEqual(s_obj2.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj2.variables, 4)
+    self.assertEqual(s_obj2.specificity, 0.4)
+    self.assertEqual(s_obj2.num_thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_sensitivity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.8, self.evaluate(result))
+
+  def test_unweighted_low_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.675, self.evaluate(result))
+
+  def test_invalid_specificity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`specificity` must be in the range \[0, 1\].'):
+      metrics.SensitivityAtSpecificity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SpecificityAtSensitivity(
+        0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
+    self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.sensitivity, 0.4)
+    self.assertEqual(s_obj.num_thresholds, 100)
+
+    # Check save and restore config
+    s_obj2 = metrics.SpecificityAtSensitivity.from_config(s_obj.get_config())
+    self.assertEqual(s_obj2.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj2.variables, 4)
+    self.assertEqual(s_obj2.sensitivity, 0.4)
+    self.assertEqual(s_obj2.num_thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_specificity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_unweighted_low_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_invalid_sensitivity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`sensitivity` must be in the range \[0, 1\].'):
+      metrics.SpecificityAtSensitivity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AUCTest(test.TestCase):
+
+  def setup(self):
+    self.num_thresholds = 3
+    self.y_pred = constant_op.constant([0, 0.5, 0.3, 0.9], dtype=dtypes.float32)
+    self.y_true = constant_op.constant([0, 0, 1, 1])
+    self.sample_weight = [1, 2, 3, 4]
+
+    # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7]
+    # y_pred when threshold = 0 - 1e-7  : [1, 1, 1, 1]
+    # y_pred when threshold = 0.5       : [0, 0, 0, 1]
+    # y_pred when threshold = 1 + 1e-7  : [0, 0, 0, 0]
+
+    # without sample_weight:
+    # tp = np.sum([[0, 0, 1, 1], [0, 0, 0, 1], [0, 0, 0, 0]], axis=1)
+    # fp = np.sum([[1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1)
+    # fn = np.sum([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 1]], axis=1)
+    # tn = np.sum([[0, 0, 0, 0], [1, 1, 0, 0], [1, 1, 0, 0]], axis=1)
+
+    # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+
+    # with sample_weight:
+    # tp = np.sum([[0, 0, 3, 4], [0, 0, 0, 4], [0, 0, 0, 0]], axis=1)
+    # fp = np.sum([[1, 2, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1)
+    # fn = np.sum([[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 3, 4]], axis=1)
+    # tn = np.sum([[0, 0, 0, 0], [1, 2, 0, 0], [1, 2, 0, 0]], axis=1)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+
+  def test_config(self):
+    auc_obj = metrics.AUC(
+        num_thresholds=100,
+        curve=metrics_utils.AUCCurve.PR,
+        summation_method=metrics_utils.AUCSummationMethod.MAJORING,
+        name='auc_1')
+    self.assertEqual(auc_obj.name, 'auc_1')
+    self.assertEqual(len(auc_obj.variables), 4)
+    self.assertEqual(auc_obj.num_thresholds, 100)
+    self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR)
+    self.assertEqual(auc_obj.summation_method,
+                     metrics_utils.AUCSummationMethod.MAJORING)
+
+    # Check save and restore config
+    auc_obj2 = metrics.AUC.from_config(auc_obj.get_config())
+    self.assertEqual(auc_obj2.name, 'auc_1')
+    self.assertEqual(len(auc_obj2.variables), 4)
+    self.assertEqual(auc_obj2.num_thresholds, 100)
+    self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR)
+    self.assertEqual(auc_obj2.summation_method,
+                     metrics_utils.AUCSummationMethod.MAJORING)
+
+  def test_value_is_idempotent(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=3)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+
+    # Run several updates.
+    update_op = auc_obj.update_state(self.y_true, self.y_pred)
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_auc = self.evaluate(auc_obj.result())
+    for _ in range(10):
+      self.assertAllClose(initial_auc, self.evaluate(auc_obj.result()), 1e-3)
+
+  def test_unweighted_all_correct(self):
+    self.setup()
+    auc_obj = metrics.AUC()
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_true)
+    self.assertEqual(self.evaluate(result), 1)
+
+  def test_unweighted(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=self.num_thresholds)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred)
+
+    # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+    # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0]
+    # fp_rate = [2/2, 0, 0] = [1, 0, 0]
+    # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (0.75 * 1 + 0.25 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_roc_interpolation(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=self.num_thresholds)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+    # heights = [(1 + 0.571)/2, (0.571 + 0)/2] = [0.7855, 0.2855]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (0.7855 * 1 + 0.2855 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_roc_majoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        summation_method=metrics_utils.AUCSummationMethod.MAJORING)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+    # heights = [max(1, 0.571), max(0.571, 0)] = [1, 0.571]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (1 * 1 + 0.571 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_roc_minoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        summation_method=metrics_utils.AUCSummationMethod.MINORING)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+    # heights = [min(1, 0.571), min(0.571, 0)] = [0.571, 0]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (0.571 * 1 + 0 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_pr_majoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        curve=metrics_utils.AUCCurve.PR,
+        summation_method=metrics_utils.AUCSummationMethod.MAJORING)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # heights = [max(0.7, 1), max(1, 0)] = [1, 1]
+    # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571]
+    expected_result = (1 * 0.429 + 1 * 0.571)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_pr_minoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        curve=metrics_utils.AUCCurve.PR,
+        summation_method=metrics_utils.AUCSummationMethod.MINORING)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # heights = [min(0.7, 1), min(1, 0)] = [0.7, 0]
+    # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571]
+    expected_result = (0.7 * 0.429 + 0 * 0.571)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_pr_interpolation(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        curve=metrics_utils.AUCCurve.PR,
+        summation_method=metrics_utils.AUCSummationMethod.INTERPOLATION)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # auc = (slope / Total Pos) * [dTP - intercept * log(Pb/Pa)]
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # P = tp + fp = [10, 4, 0]
+    # dTP = [7-4, 4-0] = [3, 4]
+    # dP = [10-4, 4-0] = [6, 4]
+    # slope = dTP/dP = [0.5, 1]
+    # intercept = (TPa+(slope*Pa) = [(4 - 0.5*4), (0 - 1*0)] = [2, 0]
+    # (Pb/Pa) = (Pb/Pa) if Pb > 0 AND Pa > 0 else 1 = [10/4, 4/0] = [2.5, 1]
+    # auc * TotalPos = [(0.5 * (3 + 2 * log(2.5))), (1 * (4 + 0))]
+    #                = [2.416, 4]
+    # auc = [2.416, 4]/(tp[1:]+fn[1:])
+    expected_result = (2.416/7 + 4/7)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 1.'):
+      metrics.AUC(num_thresholds=-1)
+
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 1.'):
+      metrics.AUC(num_thresholds=1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 9720d910eb337580c2e630b5dfb8888f8843c271..35732ad307164695694fd2b82ddec55ff79fe7a8 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -18,11 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import math
 import os
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -30,14 +31,129 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import metrics
+from tensorflow.python.keras import Model
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KerasSumTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sum(self):
+    m = metrics.Sum(name='my_sum')
+
+    # check config
+    self.assertEqual(m.name, 'my_sum')
+    self.assertTrue(m.stateful)
+    self.assertEqual(m.dtype, dtypes.float32)
+    self.assertEqual(len(m.variables), 1)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # check initial state
+    self.assertEqual(self.evaluate(m.total), 0)
+
+    # check __call__()
+    self.assertEqual(self.evaluate(m(100)), 100)
+    self.assertEqual(self.evaluate(m.total), 100)
+
+    # check update_state() and result() + state accumulation + tensor input
+    update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
+    self.evaluate(update_op)
+    self.assertAlmostEqual(self.evaluate(m.result()), 106)
+    self.assertEqual(self.evaluate(m.total), 106)  # 100 + 1 + 5
+
+    # check reset_states()
+    m.reset_states()
+    self.assertEqual(self.evaluate(m.total), 0)
+
+  def test_sum_with_sample_weight(self):
+    m = metrics.Sum(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # check scalar weight
+    result_t = m(100, sample_weight=0.5)
+    self.assertEqual(self.evaluate(result_t), 50)
+    self.assertEqual(self.evaluate(m.total), 50)
+
+    # check weights not scalar and weights rank matches values rank
+    result_t = m([1, 5], sample_weight=[1, 0.2])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 52., 4)  # 50 + 1 + 5 * 0.2
+    self.assertAlmostEqual(self.evaluate(m.total), 52., 4)
+
+    # check weights broadcast
+    result_t = m([1, 2], sample_weight=0.5)
+    self.assertAlmostEqual(self.evaluate(result_t), 53.5, 1)  # 52 + 0.5 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 53.5, 1)
+
+    # check weights squeeze
+    result_t = m([1, 5], sample_weight=[[1], [0.2]])
+    self.assertAlmostEqual(self.evaluate(result_t), 55.5, 1)  # 53.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 55.5, 1)
+
+    # check weights expand
+    result_t = m([[1], [5]], sample_weight=[1, 0.2])
+    self.assertAlmostEqual(self.evaluate(result_t), 57.5, 2)  # 55.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 57.5, 1)
+
+    # check values reduced to the dimensions of weight
+    result_t = m([[[1., 2.], [3., 2.], [0.5, 4.]]], sample_weight=[0.5])
+    result = np.round(self.evaluate(result_t), decimals=2)
+    # result = (prev: 57.5) + 0.5 + 1 + 1.5 + 1 + 0.25 + 2
+    self.assertAlmostEqual(result, 63.75, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 63.75, 2)
+
+  def test_sum_graph_with_placeholder(self):
+    with context.graph_mode(), self.cached_session() as sess:
+      m = metrics.Sum()
+      v = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+      self.evaluate(variables.variables_initializer(m.variables))
+
+      # check __call__()
+      result_t = m(v, sample_weight=w)
+      result = sess.run(result_t, feed_dict=({v: 100, w: 0.5}))
+      self.assertEqual(result, 50)
+      self.assertEqual(self.evaluate(m.total), 50)
+
+      # check update_state() and result()
+      result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]}))
+      self.assertAlmostEqual(result, 52., 2)  # 50 + 1 + 5 * 0.2
+      self.assertAlmostEqual(self.evaluate(m.total), 52., 2)
+
+  def test_save_restore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    m = metrics.Sum()
+    checkpoint = checkpointable_utils.Checkpoint(sum=m)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # update state
+    self.evaluate(m(100.))
+    self.evaluate(m(200.))
+
+    # save checkpoint and then add an update
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(m(1000.))
+
+    # restore to the same checkpoint sum object (= 300)
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.evaluate(m(300.))
+    self.assertEqual(600., self.evaluate(m.result()))
+
+    # restore to a different checkpoint sum object
+    restore_sum = metrics.Sum()
+    restore_checkpoint = checkpointable_utils.Checkpoint(sum=restore_sum)
+    status = restore_checkpoint.restore(save_path)
+    restore_update = restore_sum(300.)
+    status.assert_consumed().run_restore_ops()
+    self.evaluate(restore_update)
+    self.assertEqual(600., self.evaluate(restore_sum.result()))
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -76,6 +192,13 @@ class KerasMeanTest(test.TestCase):
     self.assertEqual(self.evaluate(m.total), 0)
     self.assertEqual(self.evaluate(m.count), 0)
 
+    # Check save and restore config
+    m2 = metrics.Mean.from_config(m.get_config())
+    self.assertEqual(m2.name, 'my_mean')
+    self.assertTrue(m2.stateful)
+    self.assertEqual(m2.dtype, dtypes.float32)
+    self.assertEqual(len(m2.variables), 2)
+
   def test_mean_with_sample_weight(self):
     m = metrics.Mean(dtype=dtypes.float64)
     self.assertEqual(m.dtype, dtypes.float64)
@@ -189,6 +312,13 @@ class KerasAccuracyTest(test.TestCase):
     result = self.evaluate(acc_obj.result())
     self.assertEqual(result, 1)  # 2/2
 
+    # Check save and restore config
+    a2 = metrics.Accuracy.from_config(acc_obj.get_config())
+    self.assertEqual(a2.name, 'my acc')
+    self.assertTrue(a2.stateful)
+    self.assertEqual(len(a2.variables), 2)
+    self.assertEqual(a2.dtype, dtypes.float32)
+
     # check with sample_weight
     result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
     result = self.evaluate(result_t)
@@ -279,700 +409,1158 @@ class KerasAccuracyTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
-  def test_assert_thresholds_range(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'Threshold values must be in \[0, 1\]. Invalid values: \[None\]'):
-      metrics._assert_thresholds_range([None, 0.5])
+  def test_sparse_categorical_accuracy_mismatched_dims(self):
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([2, 1], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([2, 1], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                       [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
+  def test_sparse_categorical_accuracy_mismatched_dims_dynamic(self):
+    with context.graph_mode(), self.cached_session() as sess:
+      acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+      self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+      t = array_ops.placeholder(dtypes.float32)
+      p = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+
+      result_t = acc_obj(t, p, w)
+      result = sess.run(
+          result_t,
+          feed_dict=({
+              t: [2, 1],
+              p: [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+              w: [[0.5], [0.2]]
+          }))
+      self.assertAlmostEqual(result, 0.71, 2)  # 2.5/2.7
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FalsePositivesTest(test.TestCase):
+class CosineProximityTest(test.TestCase):
+
+  def l2_norm(self, x, axis):
+    epsilon = 1e-12
+    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
+    x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon))
+    return np.multiply(x, x_inv_norm)
+
+  def setup(self, axis=1):
+    self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32)
+    self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32)
+
+    y_true = self.l2_norm(self.np_y_true, axis)
+    y_pred = self.l2_norm(self.np_y_pred, axis)
+    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+
+    self.y_true = constant_op.constant(self.np_y_true)
+    self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
-    self.assertEqual(fp_obj.name, 'my_fp')
-    self.assertEqual(len(fp_obj.variables), 1)
-    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
+    cosine_obj = metrics.CosineProximity(
+        axis=2, name='my_cos', dtype=dtypes.int32)
+    self.assertEqual(cosine_obj.name, 'my_cos')
+    self.assertEqual(cosine_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    cosine_obj2 = metrics.CosineProximity.from_config(cosine_obj.get_config())
+    self.assertEqual(cosine_obj2.name, 'my_cos')
+    self.assertEqual(cosine_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    fp_obj = metrics.FalsePositives()
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    self.setup()
+    cosine_obj = metrics.CosineProximity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_weighted(self):
+    self.setup()
+    cosine_obj = metrics.CosineProximity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    sample_weight = np.asarray([1.2, 3.4])
+    loss = cosine_obj(
+        self.y_true,
+        self.y_pred,
+        sample_weight=constant_op.constant(sample_weight))
+    expected_loss = np.sum(
+        self.expected_loss * sample_weight) / np.sum(sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_axis(self):
+    self.setup(axis=1)
+    cosine_obj = metrics.CosineProximity(axis=1)
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsoluteErrorTest(test.TestCase):
 
+  def test_config(self):
+    mae_obj = metrics.MeanAbsoluteError(name='my_mae', dtype=dtypes.int32)
+    self.assertEqual(mae_obj.name, 'my_mae')
+    self.assertEqual(mae_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    mae_obj2 = metrics.MeanAbsoluteError.from_config(mae_obj.get_config())
+    self.assertEqual(mae_obj2.name, 'my_mae')
+    self.assertEqual(mae_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    mae_obj = metrics.MeanAbsoluteError()
+    self.evaluate(variables.variables_initializer(mae_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = fp_obj.update_state(y_true, y_pred)
+    update_op = mae_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = fp_obj.result()
-    self.assertAllClose(7., result)
+    result = mae_obj.result()
+    self.assertAllClose(0.5, result, atol=1e-5)
 
   def test_weighted(self):
-    fp_obj = metrics.FalsePositives()
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    mae_obj = metrics.MeanAbsoluteError()
+    self.evaluate(variables.variables_initializer(mae_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(14., self.evaluate(result))
+    result = mae_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5)
 
-  def test_unweighted_with_thresholds(self):
-    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsolutePercentageErrorTest(test.TestCase):
 
-    update_op = fp_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = fp_obj.result()
-    self.assertAllClose([7., 4., 2.], result)
+  def test_config(self):
+    mape_obj = metrics.MeanAbsolutePercentageError(
+        name='my_mape', dtype=dtypes.int32)
+    self.assertEqual(mape_obj.name, 'my_mape')
+    self.assertEqual(mape_obj._dtype, dtypes.int32)
 
-  def test_weighted_with_thresholds(self):
-    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    # Check save and restore config
+    mape_obj2 = metrics.MeanAbsolutePercentageError.from_config(
+        mape_obj.get_config())
+    self.assertEqual(mape_obj2.name, 'my_mape')
+    self.assertEqual(mape_obj2._dtype, dtypes.int32)
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
-                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
+  def test_unweighted(self):
+    mape_obj = metrics.MeanAbsolutePercentageError()
+    self.evaluate(variables.variables_initializer(mape_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose([125., 42., 12.], self.evaluate(result))
+    update_op = mape_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = mape_obj.result()
+    self.assertAllClose(35e7, result, atol=1e-5)
 
-  def test_threshold_limit(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
-      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
+  def test_weighted(self):
+    mape_obj = metrics.MeanAbsolutePercentageError()
+    self.evaluate(variables.variables_initializer(mape_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = mape_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(40e7, self.evaluate(result), atol=1e-5)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FalseNegativesTest(test.TestCase):
+class MeanSquaredErrorTest(test.TestCase):
 
   def test_config(self):
-    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
-    self.assertEqual(fn_obj.name, 'my_fn')
-    self.assertEqual(len(fn_obj.variables), 1)
-    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+    mse_obj = metrics.MeanSquaredError(name='my_mse', dtype=dtypes.int32)
+    self.assertEqual(mse_obj.name, 'my_mse')
+    self.assertEqual(mse_obj._dtype, dtypes.int32)
 
-  def test_unweighted(self):
-    fn_obj = metrics.FalseNegatives()
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    # Check save and restore config
+    mse_obj2 = metrics.MeanSquaredError.from_config(mse_obj.get_config())
+    self.assertEqual(mse_obj2.name, 'my_mse')
+    self.assertEqual(mse_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    mse_obj = metrics.MeanSquaredError()
+    self.evaluate(variables.variables_initializer(mse_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = fn_obj.update_state(y_true, y_pred)
+    update_op = mse_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = fn_obj.result()
-    self.assertAllClose(3., result)
+    result = mse_obj.result()
+    self.assertAllClose(0.5, result, atol=1e-5)
 
   def test_weighted(self):
-    fn_obj = metrics.FalseNegatives()
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    mse_obj = metrics.MeanSquaredError()
+    self.evaluate(variables.variables_initializer(mse_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(5., self.evaluate(result))
+    result = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5)
 
-  def test_unweighted_with_thresholds(self):
-    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
-    update_op = fn_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = fn_obj.result()
-    self.assertAllClose([1., 4., 6.], result)
+  def test_config(self):
+    msle_obj = metrics.MeanSquaredLogarithmicError(
+        name='my_msle', dtype=dtypes.int32)
+    self.assertEqual(msle_obj.name, 'my_msle')
+    self.assertEqual(msle_obj._dtype, dtypes.int32)
 
-  def test_weighted_with_thresholds(self):
-    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    # Check save and restore config
+    msle_obj2 = metrics.MeanSquaredLogarithmicError.from_config(
+        msle_obj.get_config())
+    self.assertEqual(msle_obj2.name, 'my_msle')
+    self.assertEqual(msle_obj2._dtype, dtypes.int32)
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+  def test_unweighted(self):
+    msle_obj = metrics.MeanSquaredLogarithmicError()
+    self.evaluate(variables.variables_initializer(msle_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+    update_op = msle_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = msle_obj.result()
+    self.assertAllClose(0.24022, result, atol=1e-5)
+
+  def test_weighted(self):
+    msle_obj = metrics.MeanSquaredLogarithmicError()
+    self.evaluate(variables.variables_initializer(msle_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = msle_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.26082, self.evaluate(result), atol=1e-5)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class TrueNegativesTest(test.TestCase):
+class HingeTest(test.TestCase):
 
   def test_config(self):
-    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
-    self.assertEqual(tn_obj.name, 'my_tn')
-    self.assertEqual(len(tn_obj.variables), 1)
-    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+    hinge_obj = metrics.Hinge(name='hinge', dtype=dtypes.int32)
+    self.assertEqual(hinge_obj.name, 'hinge')
+    self.assertEqual(hinge_obj._dtype, dtypes.int32)
 
-  def test_unweighted(self):
-    tn_obj = metrics.TrueNegatives()
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    # Check save and restore config
+    hinge_obj2 = metrics.Hinge.from_config(hinge_obj.get_config())
+    self.assertEqual(hinge_obj2.name, 'hinge')
+    self.assertEqual(hinge_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    hinge_obj = metrics.Hinge()
+    self.evaluate(variables.variables_initializer(hinge_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = tn_obj.update_state(y_true, y_pred)
+    update_op = hinge_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tn_obj.result()
-    self.assertAllClose(3., result)
+    result = hinge_obj.result()
+    self.assertAllClose(0.65, result, atol=1e-5)
 
   def test_weighted(self):
-    tn_obj = metrics.TrueNegatives()
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    hinge_obj = metrics.Hinge()
+    self.evaluate(variables.variables_initializer(hinge_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(4., self.evaluate(result))
+    result = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.65714, self.evaluate(result), atol=1e-5)
 
-  def test_unweighted_with_thresholds(self):
-    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class SquaredHingeTest(test.TestCase):
 
-    update_op = tn_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = tn_obj.result()
-    self.assertAllClose([2., 5., 7.], result)
+  def test_config(self):
+    sq_hinge_obj = metrics.SquaredHinge(name='sq_hinge', dtype=dtypes.int32)
+    self.assertEqual(sq_hinge_obj.name, 'sq_hinge')
+    self.assertEqual(sq_hinge_obj._dtype, dtypes.int32)
 
-  def test_weighted_with_thresholds(self):
-    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    # Check save and restore config
+    sq_hinge_obj2 = metrics.SquaredHinge.from_config(sq_hinge_obj.get_config())
+    self.assertEqual(sq_hinge_obj2.name, 'sq_hinge')
+    self.assertEqual(sq_hinge_obj2._dtype, dtypes.int32)
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+  def test_unweighted(self):
+    sq_hinge_obj = metrics.SquaredHinge()
+    self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+    update_op = sq_hinge_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = sq_hinge_obj.result()
+    self.assertAllClose(0.65, result, atol=1e-5)
+
+  def test_weighted(self):
+    sq_hinge_obj = metrics.SquaredHinge()
+    self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.65714, self.evaluate(result), atol=1e-5)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class TruePositivesTest(test.TestCase):
+class CategoricalHingeTest(test.TestCase):
 
   def test_config(self):
-    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
-    self.assertEqual(tp_obj.name, 'my_tp')
-    self.assertEqual(len(tp_obj.variables), 1)
-    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+    cat_hinge_obj = metrics.CategoricalHinge(
+        name='cat_hinge', dtype=dtypes.int32)
+    self.assertEqual(cat_hinge_obj.name, 'cat_hinge')
+    self.assertEqual(cat_hinge_obj._dtype, dtypes.int32)
 
-  def test_unweighted(self):
-    tp_obj = metrics.TruePositives()
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    # Check save and restore config
+    cat_hinge_obj2 = metrics.CategoricalHinge.from_config(
+        cat_hinge_obj.get_config())
+    self.assertEqual(cat_hinge_obj2.name, 'cat_hinge')
+    self.assertEqual(cat_hinge_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    cat_hinge_obj = metrics.CategoricalHinge()
+    self.evaluate(variables.variables_initializer(cat_hinge_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = tp_obj.update_state(y_true, y_pred)
+    update_op = cat_hinge_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tp_obj.result()
-    self.assertAllClose(7., result)
+    result = cat_hinge_obj.result()
+    self.assertAllClose(0.5, result, atol=1e-5)
 
   def test_weighted(self):
-    tp_obj = metrics.TruePositives()
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    cat_hinge_obj = metrics.CategoricalHinge()
+    self.evaluate(variables.variables_initializer(cat_hinge_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(12., self.evaluate(result))
+    result = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.5, self.evaluate(result), atol=1e-5)
 
-  def test_unweighted_with_thresholds(self):
-    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class RootMeanSquaredErrorTest(test.TestCase):
+
+  def test_config(self):
+    rmse_obj = metrics.RootMeanSquaredError(name='rmse', dtype=dtypes.int32)
+    self.assertEqual(rmse_obj.name, 'rmse')
+    self.assertEqual(rmse_obj._dtype, dtypes.int32)
 
-    update_op = tp_obj.update_state(y_true, y_pred)
+    rmse_obj2 = metrics.RootMeanSquaredError.from_config(rmse_obj.get_config())
+    self.assertEqual(rmse_obj2.name, 'rmse')
+    self.assertEqual(rmse_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    rmse_obj = metrics.RootMeanSquaredError()
+    self.evaluate(variables.variables_initializer(rmse_obj.variables))
+    y_true = constant_op.constant((2, 4, 6))
+    y_pred = constant_op.constant((1, 3, 2))
+
+    update_op = rmse_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tp_obj.result()
-    self.assertAllClose([6., 3., 1.], result)
+    result = rmse_obj.result()
+    # error = [-1, -1, -4], square(error) = [1, 1, 16], mean = 18/3 = 6
+    self.assertAllClose(math.sqrt(6), result, atol=1e-3)
+
+  def test_weighted(self):
+    rmse_obj = metrics.RootMeanSquaredError()
+    self.evaluate(variables.variables_initializer(rmse_obj.variables))
+    y_true = constant_op.constant((2, 4, 6, 8))
+    y_pred = constant_op.constant((1, 3, 2, 3))
+    sample_weight = constant_op.constant((0, 1, 0, 1))
+    result = rmse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(math.sqrt(13), self.evaluate(result), atol=1e-3)
 
-  def test_weighted_with_thresholds(self):
-    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class TopKCategoricalAccuracyTest(test.TestCase):
+
+  def test_config(self):
+    a_obj = metrics.TopKCategoricalAccuracy(name='topkca', dtype=dtypes.int32)
+    self.assertEqual(a_obj.name, 'topkca')
+    self.assertEqual(a_obj._dtype, dtypes.int32)
+
+    a_obj2 = metrics.TopKCategoricalAccuracy.from_config(a_obj.get_config())
+    self.assertEqual(a_obj2.name, 'topkca')
+    self.assertEqual(a_obj2._dtype, dtypes.int32)
+
+  def test_correctness(self):
+    a_obj = metrics.TopKCategoricalAccuracy()
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    y_true = constant_op.constant([[0, 0, 1], [0, 1, 0]])
+    y_pred = constant_op.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+    # With `k` < 5.
+    a_obj = metrics.TopKCategoricalAccuracy(k=1)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+    # With `k` > 5.
+    y_true = constant_op.constant([[0, 0, 1, 0, 0, 0, 0],
+                                   [0, 1, 0, 0, 0, 0, 0]])
+    y_pred = constant_op.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4],
+                                   [0.05, 0.95, 0, 0, 0, 0, 0]])
+    a_obj = metrics.TopKCategoricalAccuracy(k=6)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only 1 sample matches.
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseTopKCategoricalAccuracyTest(test.TestCase):
 
-    result = tp_obj(y_true, y_pred, sample_weight=37.)
-    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+  def test_config(self):
+    a_obj = metrics.SparseTopKCategoricalAccuracy(
+        name='stopkca', dtype=dtypes.int32)
+    self.assertEqual(a_obj.name, 'stopkca')
+    self.assertEqual(a_obj._dtype, dtypes.int32)
+
+    a_obj2 = metrics.SparseTopKCategoricalAccuracy.from_config(
+        a_obj.get_config())
+    self.assertEqual(a_obj2.name, 'stopkca')
+    self.assertEqual(a_obj2._dtype, dtypes.int32)
+
+  def test_correctness(self):
+    a_obj = metrics.SparseTopKCategoricalAccuracy()
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    y_true = constant_op.constant([2, 1])
+    y_pred = constant_op.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+    # With `k` < 5.
+    a_obj = metrics.SparseTopKCategoricalAccuracy(k=1)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+    # With `k` > 5.
+    y_pred = constant_op.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4],
+                                   [0.05, 0.95, 0, 0, 0, 0, 0]])
+    a_obj = metrics.SparseTopKCategoricalAccuracy(k=6)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only 1 sample matches.
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class PrecisionTest(test.TestCase):
+class LogCoshErrorTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    error = y_pred - y_true
+    self.expected_results = np.log((np.exp(error) + np.exp(-error)) / 2)
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    p_obj = metrics.Precision(name='my_precision', thresholds=[0.4, 0.9])
-    self.assertEqual(p_obj.name, 'my_precision')
-    self.assertEqual(len(p_obj.variables), 2)
-    self.assertEqual([v.name for v in p_obj.variables],
-                     ['true_positives:0', 'false_positives:0'])
-    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
-
-  def test_value_is_idempotent(self):
-    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
-    y_pred = random_ops.random_uniform(shape=(10, 3))
-    y_true = random_ops.random_uniform(shape=(10, 3))
-    update_op = p_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_precision = self.evaluate(p_obj.result())
-    for _ in range(10):
-      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
-                           1e-3)
+    logcosh_obj = metrics.LogCoshError(name='logcosh', dtype=dtypes.int32)
+    self.assertEqual(logcosh_obj.name, 'logcosh')
+    self.assertEqual(logcosh_obj._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    p_obj = metrics.Precision()
-    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.5, self.evaluate(result))
-
-  def test_unweighted_all_incorrect(self):
-    p_obj = metrics.Precision(thresholds=[0.5])
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs)
-    y_true = constant_op.constant(1 - inputs)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertAlmostEqual(0, self.evaluate(result))
+    self.setup()
+    logcosh_obj = metrics.LogCoshError()
+    self.evaluate(variables.variables_initializer(logcosh_obj.variables))
+
+    update_op = logcosh_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = logcosh_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
 
   def test_weighted(self):
-    p_obj = metrics.Precision()
-    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
-    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(
-        y_true,
-        y_pred,
-        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
-    weighted_tp = 3.0 + 4.0
-    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
-    expected_precision = weighted_tp / weighted_positives
-    self.assertAlmostEqual(expected_precision, self.evaluate(result))
-
-  def test_div_by_zero(self):
-    p_obj = metrics.Precision()
-    y_pred = constant_op.constant([0, 0, 0, 0])
-    y_true = constant_op.constant([0, 0, 0, 0])
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertEqual(0, self.evaluate(result))
-
-  def test_unweighted_with_threshold(self):
-    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
-    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
-
-  def test_weighted_with_threshold(self):
-    p_obj = metrics.Precision(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[4, 0], [3, 1]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred, sample_weight=weights)
-    weighted_tp = 0 + 3.
-    weighted_positives = (0 + 3.) + (4. + 0.)
-    expected_precision = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
-
-  def test_multiple_updates(self):
-    p_obj = metrics.Precision(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[4, 0], [3, 1]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
-    for _ in range(2):
-      self.evaluate(update_op)
+    self.setup()
+    logcosh_obj = metrics.LogCoshError()
+    self.evaluate(variables.variables_initializer(logcosh_obj.variables))
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    result = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
 
-    weighted_tp = (0 + 3.) + (0 + 3.)
-    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
-    expected_precision = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
-                         1e-3)
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / np.sum(sample_weight)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class RecallTest(test.TestCase):
+class PoissonTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_results = y_pred - np.multiply(y_true, np.log(y_pred))
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    r_obj = metrics.Recall(name='my_recall', thresholds=[0.4, 0.9])
-    self.assertEqual(r_obj.name, 'my_recall')
-    self.assertEqual(len(r_obj.variables), 2)
-    self.assertEqual([v.name for v in r_obj.variables],
-                     ['true_positives:0', 'false_negatives:0'])
-    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
-
-  def test_value_is_idempotent(self):
-    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
-    y_pred = random_ops.random_uniform(shape=(10, 3))
-    y_true = random_ops.random_uniform(shape=(10, 3))
-    update_op = r_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_recall = self.evaluate(r_obj.result())
-    for _ in range(10):
-      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+    poisson_obj = metrics.Poisson(name='poisson', dtype=dtypes.int32)
+    self.assertEqual(poisson_obj.name, 'poisson')
+    self.assertEqual(poisson_obj._dtype, dtypes.int32)
+
+    poisson_obj2 = metrics.Poisson.from_config(poisson_obj.get_config())
+    self.assertEqual(poisson_obj2.name, 'poisson')
+    self.assertEqual(poisson_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    r_obj = metrics.Recall()
-    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.5, self.evaluate(result))
-
-  def test_unweighted_all_incorrect(self):
-    r_obj = metrics.Recall(thresholds=[0.5])
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs)
-    y_true = constant_op.constant(1 - inputs)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertAlmostEqual(0, self.evaluate(result))
+    self.setup()
+    poisson_obj = metrics.Poisson()
+    self.evaluate(variables.variables_initializer(poisson_obj.variables))
+
+    update_op = poisson_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = poisson_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
 
   def test_weighted(self):
-    r_obj = metrics.Recall()
-    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
-    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(
-        y_true,
-        y_pred,
-        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
-    weighted_tp = 3.0 + 1.0
-    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
-    expected_recall = weighted_tp / weighted_t
-    self.assertAlmostEqual(expected_recall, self.evaluate(result))
-
-  def test_div_by_zero(self):
-    r_obj = metrics.Recall()
-    y_pred = constant_op.constant([0, 0, 0, 0])
-    y_true = constant_op.constant([0, 0, 0, 0])
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertEqual(0, self.evaluate(result))
-
-  def test_unweighted_with_threshold(self):
-    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
-    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
-
-  def test_weighted_with_threshold(self):
-    r_obj = metrics.Recall(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[1, 4], [3, 2]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred, sample_weight=weights)
-    weighted_tp = 0 + 3.
-    weighted_positives = (0 + 3.) + (4. + 0.)
-    expected_recall = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
-
-  def test_multiple_updates(self):
-    r_obj = metrics.Recall(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[1, 4], [3, 2]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
-    for _ in range(2):
-      self.evaluate(update_op)
+    self.setup()
+    poisson_obj = metrics.Poisson()
+    self.evaluate(variables.variables_initializer(poisson_obj.variables))
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
 
-    weighted_tp = (0 + 3.) + (0 + 3.)
-    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
-    expected_recall = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
-                         1e-3)
+    result = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / np.sum(sample_weight)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
+class KLDivergenceTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
+    y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3))
+
+    self.batch_size = 2
+    self.expected_results = np.multiply(y_true, np.log(y_true / y_pred))
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    s_obj = metrics.SensitivityAtSpecificity(
-        0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
-    self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
-    self.assertLen(s_obj.variables, 4)
-    self.assertEqual(s_obj.value, 0.4)
-    self.assertLen(s_obj.thresholds, 100)
-
-  def test_value_is_idempotent(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.7)
-    y_pred = random_ops.random_uniform((10, 3),
-                                       maxval=1,
-                                       dtype=dtypes.float32,
-                                       seed=1)
-    y_true = random_ops.random_uniform((10, 3),
-                                       maxval=2,
-                                       dtype=dtypes.int64,
-                                       seed=1)
-    update_op = s_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_sensitivity = self.evaluate(s_obj.result())
-    for _ in range(10):
-      self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()),
-                             1e-3)
-
-  def test_unweighted_all_correct(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.7)
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
-    y_true = constant_op.constant(inputs)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(1, self.evaluate(result))
-
-  def test_unweighted_high_specificity(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.8)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.8, self.evaluate(result))
-
-  def test_unweighted_low_specificity(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.6, self.evaluate(result))
-
-  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
-  def test_weighted(self, label_dtype):
-    s_obj = metrics.SensitivityAtSpecificity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = math_ops.cast(label_values, dtype=label_dtype)
-    weights = constant_op.constant(weight_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred, sample_weight=weights)
-    self.assertAlmostEqual(0.675, self.evaluate(result))
-
-  def test_invalid_specificity(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'`specificity` must be in the range \[0, 1\].'):
-      metrics.SensitivityAtSpecificity(-1)
+    k_obj = metrics.KLDivergence(name='kld', dtype=dtypes.int32)
+    self.assertEqual(k_obj.name, 'kld')
+    self.assertEqual(k_obj._dtype, dtypes.int32)
+
+    k_obj2 = metrics.KLDivergence.from_config(k_obj.get_config())
+    self.assertEqual(k_obj2.name, 'kld')
+    self.assertEqual(k_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    self.setup()
+    k_obj = metrics.KLDivergence()
+    self.evaluate(variables.variables_initializer(k_obj.variables))
+
+    update_op = k_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = k_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
 
-  def test_invalid_num_thresholds(self):
-    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
-      metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+  def test_weighted(self):
+    self.setup()
+    k_obj = metrics.KLDivergence()
+    self.evaluate(variables.variables_initializer(k_obj.variables))
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    result = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / (1.2 + 3.4)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
+class MeanRelativeErrorTest(test.TestCase):
 
   def test_config(self):
-    s_obj = metrics.SpecificityAtSensitivity(
-        0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
-    self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
-    self.assertLen(s_obj.variables, 4)
-    self.assertEqual(s_obj.value, 0.4)
-    self.assertLen(s_obj.thresholds, 100)
-
-  def test_value_is_idempotent(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.7)
-    y_pred = random_ops.random_uniform((10, 3),
-                                       maxval=1,
-                                       dtype=dtypes.float32,
-                                       seed=1)
-    y_true = random_ops.random_uniform((10, 3),
-                                       maxval=2,
-                                       dtype=dtypes.int64,
-                                       seed=1)
-    update_op = s_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_specificity = self.evaluate(s_obj.result())
-    for _ in range(10):
-      self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()),
-                             1e-3)
-
-  def test_unweighted_all_correct(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.7)
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
-    y_true = constant_op.constant(inputs)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(1, self.evaluate(result))
-
-  def test_unweighted_high_sensitivity(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.8)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.4, self.evaluate(result))
-
-  def test_unweighted_low_sensitivity(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.6, self.evaluate(result))
-
-  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
-  def test_weighted(self, label_dtype):
-    s_obj = metrics.SpecificityAtSensitivity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = math_ops.cast(label_values, dtype=label_dtype)
-    weights = constant_op.constant(weight_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred, sample_weight=weights)
-    self.assertAlmostEqual(0.4, self.evaluate(result))
-
-  def test_invalid_sensitivity(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'`sensitivity` must be in the range \[0, 1\].'):
-      metrics.SpecificityAtSensitivity(-1)
+    normalizer = constant_op.constant([1, 3], dtype=dtypes.float32)
+    mre_obj = metrics.MeanRelativeError(normalizer=normalizer, name='mre')
+    self.assertEqual(mre_obj.name, 'mre')
+    self.assertArrayNear(self.evaluate(mre_obj.normalizer), [1, 3], 1e-1)
+
+    mre_obj2 = metrics.MeanRelativeError.from_config(mre_obj.get_config())
+    self.assertEqual(mre_obj2.name, 'mre')
+    self.assertArrayNear(self.evaluate(mre_obj2.normalizer), [1, 3], 1e-1)
+
+  def test_unweighted(self):
+    np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32)
+    np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32)
+    expected_error = np.mean(
+        np.divide(np.absolute(np_y_pred - np_y_true), np_y_true))
+
+    y_pred = constant_op.constant(np_y_pred, shape=(1, 4), dtype=dtypes.float32)
+    y_true = constant_op.constant(np_y_true, shape=(1, 4))
+
+    mre_obj = metrics.MeanRelativeError(normalizer=y_true)
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(y_true, y_pred)
+    self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3)
+
+  def test_weighted(self):
+    np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32)
+    np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32)
+    sample_weight = np.asarray([0.2, 0.3, 0.5, 0], dtype=np.float32)
+    rel_errors = np.divide(np.absolute(np_y_pred - np_y_true), np_y_true)
+    expected_error = np.sum(rel_errors * sample_weight)
+
+    y_pred = constant_op.constant(np_y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(np_y_true)
+
+    mre_obj = metrics.MeanRelativeError(normalizer=y_true)
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+    self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3)
+
+  def test_zero_normalizer(self):
+    y_pred = constant_op.constant([2, 4], dtype=dtypes.float32)
+    y_true = constant_op.constant([1, 3])
+
+    mre_obj = metrics.MeanRelativeError(normalizer=array_ops.zeros_like(y_true))
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
 
-  def test_invalid_num_thresholds(self):
-    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
-      metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+    result = mre_obj(y_true, y_pred)
+    self.assertEqual(self.evaluate(result), 0)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class MeanIoUTest(test.TestCase):
 
   def test_config(self):
-    cosine_obj = metrics.CosineProximity(name='my_cos', dtype=dtypes.int32)
-    self.assertEqual(cosine_obj.name, 'my_cos')
-    self.assertEqual(cosine_obj._dtype, dtypes.int32)
+    m_obj = metrics.MeanIoU(num_classes=2, name='mean_iou')
+    self.assertEqual(m_obj.name, 'mean_iou')
+    self.assertEqual(m_obj.num_classes, 2)
+
+    m_obj2 = metrics.MeanIoU.from_config(m_obj.get_config())
+    self.assertEqual(m_obj2.name, 'mean_iou')
+    self.assertEqual(m_obj2.num_classes, 2)
 
   def test_unweighted(self):
-    cosine_obj = metrics.CosineProximity()
-    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    y_pred = constant_op.constant([0, 1, 0, 1], dtype=dtypes.float32)
+    y_true = constant_op.constant([0, 0, 1, 1])
 
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+
+    result = m_obj(y_true, y_pred)
+
+    # cm = [[1, 1],
+    #       [1, 1]]
+    # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+  def test_weighted(self):
+    y_pred = constant_op.constant([0, 1, 0, 1], dtype=dtypes.float32)
+    y_true = constant_op.constant([0, 0, 1, 1])
+    sample_weight = constant_op.constant([0.2, 0.3, 0.4, 0.1])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+
+    result = m_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # cm = [[0.2, 0.3],
+    #       [0.4, 0.1]]
+    # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+  def test_multi_dim_input(self):
+    y_pred = constant_op.constant([[0, 1], [0, 1]], dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 0], [1, 1]])
+    sample_weight = constant_op.constant([[0.2, 0.3], [0.4, 0.1]])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+
+    result = m_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # cm = [[0.2, 0.3],
+    #       [0.4, 0.1]]
+    # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+  def test_zero_valid_entries(self):
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+    self.assertAllClose(self.evaluate(m_obj.result()), 0, atol=1e-3)
 
-    update_op = cosine_obj.update_state(y_true, y_pred)
+  def test_zero_and_non_zero_entries(self):
+    y_pred = constant_op.constant([1], dtype=dtypes.float32)
+    y_true = constant_op.constant([1])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+    result = m_obj(y_true, y_pred)
+
+    # cm = [[0, 0],
+    #       [0, 1]]
+    # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (0 + 1 / (1 + 1 - 1)) / 1
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+
+class MeanTensorTest(keras_parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_config(self):
+    m = metrics.MeanTensor(name='mean_by_element')
+
+    # check config
+    self.assertEqual(m.name, 'mean_by_element')
+    self.assertTrue(m.stateful)
+    self.assertEqual(m.dtype, dtypes.float32)
+    self.assertEqual(len(m.variables), 0)
+
+    with self.assertRaisesRegexp(ValueError, 'does not have any result yet'):
+      m.result()
+
+    self.evaluate(m([[3], [5], [3]]))
+    self.assertAllEqual(m._shape, [3, 1])
+
+    m2 = metrics.MeanTensor.from_config(m.get_config())
+    self.assertEqual(m2.name, 'mean_by_element')
+    self.assertTrue(m2.stateful)
+    self.assertEqual(m2.dtype, dtypes.float32)
+    self.assertEqual(len(m2.variables), 0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_unweighted(self):
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+
+    # check __call__()
+    self.assertAllClose(self.evaluate(m([100, 40])), [100, 40])
+    self.assertAllClose(self.evaluate(m.total), [100, 40])
+    self.assertAllClose(self.evaluate(m.count), [1, 1])
+
+    # check update_state() and result() + state accumulation + tensor input
+    update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
     self.evaluate(update_op)
-    result = cosine_obj.result()
-    self.assertAllClose(-0.60723, result, atol=1e-5)
+    self.assertAllClose(self.evaluate(m.result()), [50.5, 22.5])
+    self.assertAllClose(self.evaluate(m.total), [101, 45])
+    self.assertAllClose(self.evaluate(m.count), [2, 2])
+
+    # check reset_states()
+    m.reset_states()
+    self.assertAllClose(self.evaluate(m.total), [0, 0])
+    self.assertAllClose(self.evaluate(m.count), [0, 0])
 
+  @test_util.run_in_graph_and_eager_modes
   def test_weighted(self):
-    cosine_obj = metrics.CosineProximity()
-    self.evaluate(variables.variables_initializer(cosine_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
-    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(-0.59916, self.evaluate(result), atol=1e-5)
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
+
+    # check scalar weight
+    result_t = m([100, 30], sample_weight=0.5)
+    self.assertAllClose(self.evaluate(result_t), [100, 30])
+    self.assertAllClose(self.evaluate(m.total), [50, 15])
+    self.assertAllClose(self.evaluate(m.count), [0.5, 0.5])
+
+    # check weights not scalar and weights rank matches values rank
+    result_t = m([1, 5], sample_weight=[1, 0.2])
+    result = self.evaluate(result_t)
+    self.assertAllClose(result, [51 / 1.5, 16 / 0.7], 2)
+    self.assertAllClose(self.evaluate(m.total), [51, 16])
+    self.assertAllClose(self.evaluate(m.count), [1.5, 0.7])
+
+    # check weights broadcast
+    result_t = m([1, 2], sample_weight=0.5)
+    self.assertAllClose(self.evaluate(result_t), [51.5 / 2, 17 / 1.2])
+    self.assertAllClose(self.evaluate(m.total), [51.5, 17])
+    self.assertAllClose(self.evaluate(m.count), [2, 1.2])
+
+    # check weights squeeze
+    result_t = m([1, 5], sample_weight=[[1], [0.2]])
+    self.assertAllClose(self.evaluate(result_t), [52.5 / 3, 18 / 1.4])
+    self.assertAllClose(self.evaluate(m.total), [52.5, 18])
+    self.assertAllClose(self.evaluate(m.count), [3, 1.4])
+
+    # check weights expand
+    m = metrics.MeanTensor((2, 1), dtype=dtypes.float64)
+    self.evaluate(variables.variables_initializer(m.variables))
+    result_t = m([[1], [5]], sample_weight=[1, 0.2])
+    self.assertAllClose(self.evaluate(result_t), [[1], [5]])
+    self.assertAllClose(self.evaluate(m.total), [[1], [1]])
+    self.assertAllClose(self.evaluate(m.count), [[1], [0.2]])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_value_shape(self):
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+    m([1])
+    with self.assertRaisesRegexp(
+        ValueError, 'MeanTensor input values must always have the same shape'):
+      m([1, 5])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_build_in_tf_function(self):
+    """Ensure that variables are created correctly in a tf function."""
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+
+    @eager_function.defun
+    def call_metric(x):
+      return m(x)
+
+    self.assertAllClose(self.evaluate(call_metric([100, 40])), [100, 40])
+    self.assertAllClose(self.evaluate(m.total), [100, 40])
+    self.assertAllClose(self.evaluate(m.count), [1, 1])
+    self.assertAllClose(self.evaluate(call_metric([20, 2])), [60, 21])
+
+  def test_in_keras_model(self):
+    with context.eager_mode():
+      class ModelWithMetric(Model):
+
+        def __init__(self):
+          super(ModelWithMetric, self).__init__()
+          self.dense1 = layers.Dense(
+              3, activation='relu', kernel_initializer='ones')
+          self.dense2 = layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones')
+          self.mean_tensor = metrics.MeanTensor()
+
+        def call(self, x):
+          x = self.dense1(x)
+          x = self.dense2(x)
+          self.mean_tensor(self.dense1.kernel)
+          return x
+
+      model = ModelWithMetric()
+      model.compile(
+          loss='mae',
+          optimizer='rmsprop',
+          run_eagerly=True)
+
+      x = np.ones((100, 4))
+      y = np.zeros((100, 1))
+      model.evaluate(x, y, batch_size=50)
+      self.assertAllClose(self.evaluate(model.mean_tensor.result()),
+                          np.ones((4, 3)))
+      self.assertAllClose(self.evaluate(model.mean_tensor.total),
+                          np.full((4, 3), 2))
+      self.assertAllClose(self.evaluate(model.mean_tensor.count),
+                          np.full((4, 3), 2))
+
+      model.evaluate(x, y, batch_size=25)
+      self.assertAllClose(self.evaluate(model.mean_tensor.result()),
+                          np.ones((4, 3)))
+      self.assertAllClose(self.evaluate(model.mean_tensor.total),
+                          np.full((4, 3), 4))
+      self.assertAllClose(self.evaluate(model.mean_tensor.count),
+                          np.full((4, 3), 4))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BinaryCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    bce_obj = metrics.BinaryCrossentropy(
+        name='bce', dtype=dtypes.int32, label_smoothing=0.2)
+    self.assertEqual(bce_obj.name, 'bce')
+    self.assertEqual(bce_obj._dtype, dtypes.int32)
+
+    old_config = bce_obj.get_config()
+    self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3)
+
+    # Check save and restore config
+    bce_obj2 = metrics.BinaryCrossentropy.from_config(old_config)
+    self.assertEqual(bce_obj2.name, 'bce')
+    self.assertEqual(bce_obj2._dtype, dtypes.int32)
+    new_config = bce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_unweighted(self):
+    bce_obj = metrics.BinaryCrossentropy()
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
+    result = bce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #        = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #           -log(Y_MAX + EPSILON), -log(1)]
+    #        = [(0 + 15.33) / 2, (0 + 0) / 2]
+    # Reduced metric = 7.665 / 2
+
+    self.assertAllClose(self.evaluate(result), 3.833, atol=1e-3)
+
+  def test_unweighted_with_logits(self):
+    bce_obj = metrics.BinaryCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    y_pred = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
+    result = bce_obj(y_true, y_pred)
+
+    # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #              (where x = logits and z = y_true)
+    #        = [((100 - 100 * 1 + log(1 + exp(-100))) +
+    #            (0 + 100 * 0 + log(1 + exp(-100))) +
+    #            (100 - 100 * 1 + log(1 + exp(-100))),
+    #           ((100 - 100 * 0 + log(1 + exp(-100))) +
+    #            (100 - 100 * 1 + log(1 + exp(-100))) +
+    #            (0 + 100 * 1 + log(1 + exp(-100))))]
+    #        = [(0 + 0 + 0) / 3, 200 / 3]
+    # Reduced metric = (0 + 66.666) / 2
+
+    self.assertAllClose(self.evaluate(result), 33.333, atol=1e-3)
+
+  def test_weighted(self):
+    bce_obj = metrics.BinaryCrossentropy()
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #        = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #           -log(Y_MAX + EPSILON), -log(1)]
+    #        = [(0 + 15.33) / 2, (0 + 0) / 2]
+    # Weighted metric = [7.665 * 1.5, 0]
+    # Reduced metric = 7.665 * 1.5 / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 3.285, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    bce_obj = metrics.BinaryCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    y_pred = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
+    sample_weight = constant_op.constant([2., 2.5])
+    result = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #              (where x = logits and z = y_true)
+    #        = [(0 + 0 + 0) / 3, 200 / 3]
+    # Weighted metric = [0, 66.666 * 2.5]
+    # Reduced metric = 66.666 * 2.5 / (2 + 2.5)
+
+    self.assertAllClose(self.evaluate(result), 37.037, atol=1e-3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant(((100., -100., -100.)))
+    y_true = constant_op.constant(((1, 0, 1)))
+    label_smoothing = 0.1
+    # Metric: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #             (where x = logits and z = y_true)
+    # Label smoothing: z' = z * (1 - L) + 0.5L
+    # After label smoothing, label 1 becomes 1 - 0.5L
+    #                        label 0 becomes 0.5L
+    # Applying the above two fns to the given input:
+    # (100 - 100 * (1 - 0.5 L)  + 0 +
+    #  0   + 100 * (0.5 L)      + 0 +
+    #  0   + 100 * (1 - 0.5 L)  + 0) * (1/3)
+    #  = (100 + 50L) * 1/3
+    bce_obj = metrics.BinaryCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    result = bce_obj(y_true, logits)
+    expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
+    self.assertAllClose(expected_value, self.evaluate(result), atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    cce_obj = metrics.CategoricalCrossentropy(
+        name='cce', dtype=dtypes.int32, label_smoothing=0.2)
+    self.assertEqual(cce_obj.name, 'cce')
+    self.assertEqual(cce_obj._dtype, dtypes.int32)
+
+    old_config = cce_obj.get_config()
+    self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3)
+
+    # Check save and restore config
+    cce_obj2 = metrics.CategoricalCrossentropy.from_config(old_config)
+    self.assertEqual(cce_obj2.name, 'cce')
+    self.assertEqual(cce_obj2._dtype, dtypes.int32)
+    new_config = cce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_unweighted(self):
+    cce_obj = metrics.CategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    result = cce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+    # Metric = -sum(y * log(y'), axis = -1)
+    #        = -((log 0.95), (log 0.1))
+    #        = [0.051, 2.302]
+    # Reduced metric = (0.051 + 2.302) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+  def test_unweighted_from_logits(self):
+    cce_obj = metrics.CategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    result = cce_obj(y_true, logits)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+
+    # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]]
+    # sum(exp(logits), axis=-1) = [8106.802, 2986.394]
+    # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]]
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # labels * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]]
+    # xent = [0.00045, 7.00182]
+    # Reduced xent = (0.00045 + 7.00182) / 2
+
+    self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3)
+
+  def test_weighted(self):
+    cce_obj = metrics.CategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+    # Metric = -sum(y * log(y'), axis = -1)
+    #        = -((log 0.95), (log 0.1))
+    #        = [0.051, 2.302]
+    # Weighted metric = [0.051 * 1.5, 2.302 * 2.]
+    # Reduced metric = (0.051 * 1.5 + 2.302 * 2.) / 3.5
+
+    self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    cce_obj = metrics.CategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = cce_obj(y_true, logits, sample_weight=sample_weight)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+    # xent = [0.00045, 7.00182]
+    # weighted xent = [0.000675, 14.00364]
+    # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3)
+
+  def test_label_smoothing(self):
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    label_smoothing = 0.1
+
+    # Label smoothing: z' = z * (1 - L) + L/n,
+    #     where L = label smoothing value and n = num classes
+    # Label value 1 becomes: 1 - L + L/n
+    # Label value 0 becomes: L/n
+    # y_true with label_smoothing = [[0.0333, 0.9333, 0.0333],
+    #                               [0.0333, 0.0333, 0.9333]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # labels * log(softmax) = [[-0.26641, -0.00042, -0.29971],
+    #                          [-0.23316, -0.00006, -6.53479]]
+    # xent = [0.56654, 6.76801]
+    # Reduced xent = (0.56654 + 6.76801) / 2
+
+    cce_obj = metrics.CategoricalCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose(self.evaluate(loss), 3.667, atol=1e-3)
 
 
 def _get_model(compile_metrics):
@@ -984,7 +1572,7 @@ def _get_model(compile_metrics):
   model.compile(
       loss='mae',
       metrics=compile_metrics,
-      optimizer=RMSPropOptimizer(learning_rate=0.001),
+      optimizer='rmsprop',
       run_eagerly=testing_utils.should_run_eagerly())
   return model
 
@@ -1039,11 +1627,11 @@ class ResetStatesTest(keras_parameterized.TestCase):
     x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
     y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
     model.evaluate(x, y)
-    self.assertEqual(self.evaluate(p_obj.tp), 50.)
-    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+    self.assertEqual(self.evaluate(p_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(p_obj.false_positives), 50.)
     model.evaluate(x, y)
-    self.assertEqual(self.evaluate(p_obj.tp), 50.)
-    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+    self.assertEqual(self.evaluate(p_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(p_obj.false_positives), 50.)
 
   def test_reset_states_recall(self):
     r_obj = metrics.Recall()
@@ -1051,11 +1639,11 @@ class ResetStatesTest(keras_parameterized.TestCase):
     x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
     y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
     model.evaluate(x, y)
-    self.assertEqual(self.evaluate(r_obj.tp), 50.)
-    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+    self.assertEqual(self.evaluate(r_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(r_obj.false_negatives), 50.)
     model.evaluate(x, y)
-    self.assertEqual(self.evaluate(r_obj.tp), 50.)
-    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+    self.assertEqual(self.evaluate(r_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(r_obj.false_negatives), 50.)
 
   def test_reset_states_sensitivity_at_specificity(self):
     s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
@@ -1064,16 +1652,13 @@ class ResetStatesTest(keras_parameterized.TestCase):
                         np.ones((25, 4))))
     y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
                         np.zeros((25, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+    for _ in range(2):
+      model.evaluate(x, y)
+      self.assertEqual(self.evaluate(s_obj.true_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_negatives), 25.)
+      self.assertEqual(self.evaluate(s_obj.true_negatives), 25.)
 
   def test_reset_states_specificity_at_sensitivity(self):
     s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
@@ -1082,16 +1667,41 @@ class ResetStatesTest(keras_parameterized.TestCase):
                         np.ones((25, 4))))
     y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
                         np.zeros((25, 1))))
+
+    for _ in range(2):
+      model.evaluate(x, y)
+      self.assertEqual(self.evaluate(s_obj.true_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_negatives), 25.)
+      self.assertEqual(self.evaluate(s_obj.true_negatives), 25.)
+
+  def test_reset_states_auc(self):
+    auc_obj = metrics.AUC(num_thresholds=3)
+    model = _get_model([auc_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+
+    for _ in range(2):
+      model.evaluate(x, y)
+      self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.)
+      self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.)
+      self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.)
+      self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.)
+
+  def test_reset_states_mean_iou(self):
+    m_obj = metrics.MeanIoU(num_classes=2)
+    model = _get_model([m_obj])
+    x = np.asarray([[0, 0, 0, 0], [1, 1, 1, 1], [1, 0, 1, 0], [0, 1, 0, 1]],
+                   dtype=np.float32)
+    y = np.asarray([[0], [1], [1], [1]], dtype=np.float32)
     model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1)
     model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index cf64e00d20cb34058ad872581a11fb174d3f2119..6d8ff9d847bafe8a6632741dd8ccb09295db3057 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -572,7 +571,7 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
                             use_bn=True)
     model.compile(
         loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         metrics=['acc', keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
 
@@ -590,10 +589,11 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     model = MultiIOTestModel(num_classes=num_classes,
                              use_dp=True,
                              use_bn=True)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x1 = np.ones((num_samples, input_dim))
     x2 = np.ones((num_samples, input_dim))
@@ -611,7 +611,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     with self.cached_session():
       model = SimpleTestModel(num_classes=num_classes, use_dp=True, use_bn=True)
       model.compile(
-          loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+          loss='mse',
+          optimizer='rmsprop',
           run_eagerly=testing_utils.should_run_eagerly())
 
       x = np.ones((num_samples, input_dim), dtype=np.float32)
@@ -643,7 +644,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     self.assertEqual(len(model.weights), 0)
 
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([x1, x2], [y1, y2])
 
@@ -675,7 +677,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
 
     model = BNNet()
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     y_ref = model.predict(x)
 
@@ -707,7 +710,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     y = model.predict(x)
     self.assertEqual(np.sum(y), np.sum(x))
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
@@ -727,7 +731,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     model.fit({'input_1': x1, 'input_2': x2},
@@ -738,7 +743,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([x1, x2], [y1, y2])
     model.train_on_batch({'input_1': x1, 'input_2': x2},
@@ -758,7 +764,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     model.evaluate([x1, x2], [y1, y2])
     model.test_on_batch([x1, x2], [y1, y2])
@@ -782,7 +789,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     y_ref_1, y_ref_2 = model.predict([x1, x2])
@@ -818,10 +826,11 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     input_dim = 50
 
     model = NestedTestModel1(num_classes=num_classes)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -841,10 +850,11 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     input_dim = 50
 
     model = NestedTestModel2(num_classes=num_classes)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -864,10 +874,11 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     input_dim = 50
 
     model = get_nested_model_3(input_dim=input_dim, num_classes=num_classes)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -898,10 +909,11 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
         return self.bn(x)
 
     model = keras.Sequential([Inner()])
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -938,7 +950,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase):
     y = model.predict(x)
     self.assertEqual(np.sum(y), np.sum(x))
     model.compile(
-        loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
@@ -956,7 +969,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model = SimpleTestModel(num_classes=num_classes,
                               use_dp=True,
                               use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer='rmsprop')
 
       x = array_ops.ones((num_samples, input_dim))
       y = array_ops.zeros((num_samples, num_classes))
@@ -974,7 +987,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model = MultiIOTestModel(num_classes=num_classes,
                                use_dp=True,
                                use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer='rmsprop')
 
       x1 = array_ops.ones((num_samples, input_dim))
       x2 = array_ops.ones((num_samples, input_dim))
@@ -1062,7 +1075,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model = MultiIOTestModel(num_classes=num_classes,
                                use_dp=True,
                                use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer='rmsprop')
 
       x1 = np.ones((num_samples, input_dim))
       x2 = np.ones((num_samples, input_dim))
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 45af953c61ee1200067aed5ec68cb0ff9b3dd3eb..9bc5aa2be5628d05b97ac58058f0183b5375b7d3 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -22,25 +22,28 @@ from __future__ import print_function
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.saving import hdf5_format
+from tensorflow.python.keras.saving import model_config
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
+
 # API entries importable from `keras.models`:
 Model = training.Model  # pylint: disable=invalid-name
 Sequential = sequential.Sequential  # pylint: disable=invalid-name
-save_model = saving.save_model
-load_model = saving.load_model
-model_from_config = saving.model_from_config
-model_from_yaml = saving.model_from_yaml
-model_from_json = saving.model_from_json
+save_model = hdf5_format.save_model
+load_model = hdf5_format.load_model
+model_from_config = model_config.model_from_config
+model_from_yaml = model_config.model_from_yaml
+model_from_json = model_config.model_from_json
 
 
 def _clone_layer(layer):
@@ -97,15 +100,14 @@ def _clone_functional_model(model, input_tensors=None, share_weights=False):
       # Cache newly created input layer.
       newly_created_input_layer = input_tensor._keras_history[0]
       layer_map[layer] = newly_created_input_layer
+
     for original_input_layer, cloned_input_layer in zip(model._input_layers,
                                                         input_layers):
       layer_map[original_input_layer] = cloned_input_layer
   else:
     # Make sure that all input tensors come from a Keras layer.
     # If tensor comes from an input layer: cache the input layer.
-    if isinstance(input_tensors, tuple):
-      input_tensors = list(input_tensors)
-    input_tensors = generic_utils.to_list(input_tensors)
+    input_tensors = nest.flatten(input_tensors)
     input_tensors_ = []
     for i in range(len(input_tensors)):
       input_tensor = input_tensors[i]
@@ -114,6 +116,7 @@ def _clone_functional_model(model, input_tensors=None, share_weights=False):
         name = original_input_layer.name
         input_tensor = Input(tensor=input_tensor,
                              name='input_wrapper_for_' + name)
+
         input_tensors_.append(input_tensor)
         # Cache newly created input layer.
         newly_created_input_layer = input_tensor._keras_history[0]
@@ -148,34 +151,18 @@ def _clone_functional_model(model, input_tensors=None, share_weights=False):
         if isinstance(layer, InputLayer):
           continue
 
-      # Gather inputs to call the new layer.
-      reference_input_tensors = node.input_tensors
-      reference_output_tensors = node.output_tensors
-
       # If all previous input tensors are available in tensor_map,
       # then call node.inbound_layer on them.
-      computed_tensors = []
-      for x in reference_input_tensors:
-        if x in tensor_map:
-          computed_tensors.append(tensor_map[x])
-
-      if len(computed_tensors) == len(reference_input_tensors):
+      if all(
+          tensor in tensor_map for tensor in nest.flatten(node.input_tensors)):
+        computed_tensors = nest.map_structure(lambda t: tensor_map[t],
+                                              node.input_tensors)
         # Call layer.
-        if node.arguments:
-          kwargs = node.arguments
-        else:
-          kwargs = {}
-        if len(computed_tensors) == 1:
-          computed_tensor = computed_tensors[0]
-          output_tensors = generic_utils.to_list(layer(computed_tensor,
-                                                       **kwargs))
-          computed_tensors = [computed_tensor]
-        else:
-          computed_tensors = computed_tensors
-          output_tensors = generic_utils.to_list(layer(computed_tensors,
-                                                       **kwargs))
+        kwargs = node.arguments or {}
+        output_tensors = layer(computed_tensors, **kwargs)
 
-        for x, y in zip(reference_output_tensors, output_tensors):
+        for x, y in zip(
+            nest.flatten(node.output_tensors), nest.flatten(output_tensors)):
           tensor_map[x] = y
 
   # Check that we did compute the model outputs,
@@ -184,6 +171,9 @@ def _clone_functional_model(model, input_tensors=None, share_weights=False):
   for x in model.outputs:
     assert x in tensor_map, 'Could not compute output ' + str(x)
     output_tensors.append(tensor_map[x])
+
+  input_tensors = nest.pack_sequence_as(model._nested_inputs, input_tensors)
+  output_tensors = nest.pack_sequence_as(model._nested_outputs, output_tensors)
   return Model(input_tensors, output_tensors, name=model.name)
 
 
@@ -292,8 +282,6 @@ def clone_model(model, input_tensors=None):
 
 
 # "Clone" a subclassed model by reseting all of the attributes.
-
-
 def _in_place_subclassed_model_reset(model):
   """Substitute for model cloning that works for subclassed models.
 
@@ -393,11 +381,30 @@ def _in_place_subclassed_model_reset(model):
       for name in attributes_to_cache:
         attributes_cache[name] = getattr(model, name)
   model._original_attributes_cache = attributes_cache
-  # Reset built state
+  _reset_build_compile_trackers(model)
+  model._setattr_tracking = setattr_tracking
+
+
+def _reset_build_compile_trackers(model):
+  """Reset state trackers for model.
+
+  Note that we do not actually zero out attributes such as optimizer,
+  but instead rely on the expectation that all of the attrs will be
+  over-written on calling build/compile/etc. This is somewhat fragile,
+  insofar as we check elsewhere for the presence of these attributes as
+  evidence of having been built/compiled/etc. Pending a better way to do this,
+  we reset key attributes here to allow building and compiling.
+
+  Args:
+    model: the model that is being reset
+  """
+  # Reset build state
   model.built = False
   model.inputs = None
   model.outputs = None
-  model._setattr_tracking = setattr_tracking
+  # Reset compile state
+  model._is_compiled = False  # pylint:disable=protected-access
+  model.optimizer = None
 
 
 def in_place_subclassed_model_state_restoration(model):
@@ -429,9 +436,7 @@ def in_place_subclassed_model_state_restoration(model):
     model._setattr_tracking = setattr_tracking
   else:
     # Restore to the state of a never-called model.
-    model.built = False
-    model.inputs = None
-    model.outputs = None
+    _reset_build_compile_trackers(model)
 
 
 def clone_and_build_model(
@@ -473,7 +478,10 @@ def clone_and_build_model(
       - cloning a subclassed model with `in_place_reset` set to False.
       - compiling the clone when the original model has not been compiled.
   """
-  if compile_clone and not model.optimizer:
+  # Grab optimizer now, as we reset-in-place for subclassed models, but
+  # want to maintain access to the original optimizer.
+  orig_optimizer = model.optimizer
+  if compile_clone and not orig_optimizer:
     raise ValueError(
         'Error when cloning model: compile_clone was set to True, but the '
         'original model has not been compiled.')
@@ -509,14 +517,14 @@ def clone_and_build_model(
         input_tensors = input_tensors[0]
       clone._set_inputs(input_tensors)
 
-  if compile_clone and model.optimizer:
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
+  if compile_clone:
+    if isinstance(orig_optimizer, optimizers.TFOptimizer):
       optimizer = optimizers.TFOptimizer(
-          model.optimizer.optimizer, optimizer_iterations)
+          orig_optimizer.optimizer, optimizer_iterations)
       K.track_tf_optimizer(optimizer)
     else:
-      optimizer_config = model.optimizer.get_config()
-      optimizer = model.optimizer.__class__.from_config(optimizer_config)
+      optimizer_config = orig_optimizer.get_config()
+      optimizer = orig_optimizer.__class__.from_config(optimizer_config)
       if optimizer_iterations is not None:
         optimizer.iterations = optimizer_iterations
 
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 0a5f9a7bea03dba27e9c9cef1609b5c469f7147d..f429aba498d90b3afc9d18925543c88b48c5ffd9 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -19,18 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import functools
 import os
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import models
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -52,158 +55,181 @@ class TestModel(keras.Model):
     return self.layer1(x)
 
 
-def sequential_model(add_input_layer, include_input_shape=True):
-  model = keras.models.Sequential()
+def _get_layers(input_shape=(4,), add_input_layer=False):
   if add_input_layer:
-    model.add(keras.layers.InputLayer(input_shape=(4,)))
-    model.add(keras.layers.Dense(4))
-  elif include_input_shape:
-    model.add(keras.layers.Dense(4, input_shape=(4,)))
+    model_layers = [keras.layers.InputLayer(input_shape=input_shape),
+                    keras.layers.Dense(4)]
+  elif input_shape:
+    model_layers = [keras.layers.Dense(4, input_shape=input_shape)]
   else:
-    model.add(keras.layers.Dense(4))
-  model.add(keras.layers.BatchNormalization())
-  model.add(keras.layers.Dropout(0.5))
-  model.add(keras.layers.Dense(4))
-  return model
-
-
-class TestModelCloning(test.TestCase):
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_sequential_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      model = sequential_model(False)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models.clone_model(model)
+    model_layers = [keras.layers.Dense(4)]
+
+  model_layers += [
+      keras.layers.BatchNormalization(),
+      keras.layers.Dropout(0.5),
+      keras.layers.Dense(4)]
+
+  return model_layers
+
+
+def _get_model(input_shape=(4,)):
+  model_layers = _get_layers(input_shape=None, add_input_layer=False)
+  return testing_utils.get_model_from_layers(
+      model_layers, input_shape=input_shape)
+
+
+class TestModelCloning(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'has_input_layer',
+       'input_shape': (4,),
+       'add_input_layer': True,
+       'share_weights': False},
+      {'testcase_name': 'no_input_layer',
+       'input_shape': None,
+       'add_input_layer': False,
+       'share_weights': False},
+      {'testcase_name': 'has_input_layer_share_weights',
+       'input_shape': (4,),
+       'add_input_layer': True,
+       'share_weights': True},
+      {'testcase_name': 'no_input_layer_share_weights',
+       'input_shape': None,
+       'add_input_layer': False,
+       'share_weights': True},
+  ])
+  def test_clone_sequential_model(
+      self, input_shape, add_input_layer, share_weights):
+
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_sequential_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    val_a = np.random.random((10, 4))
+    model = models.Sequential(_get_layers(input_shape, add_input_layer))
+    # Sanity check
+    self.assertEqual(
+        isinstance(model._layers[0], keras.layers.InputLayer),
+        add_input_layer)
+    self.assertEqual(model._is_graph_network, add_input_layer)
+
+    # With placeholder creation -- clone model should have an InputLayer
+    # if the original model has one.
+    new_model = clone_fn(model)
+    self.assertEqual(
+        isinstance(new_model._layers[0], keras.layers.InputLayer),
+        add_input_layer)
+    self.assertEqual(new_model._is_graph_network, model._is_graph_network)
+    if input_shape:
       # update ops from batch norm needs to be included
       self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new tensor
-      input_a = keras.Input(shape=(4,))
-      new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
 
-      # On top of new, non-Keras tensor
+    # On top of new tensor  -- clone model should always have an InputLayer.
+    input_a = keras.Input(shape=(4,))
+    new_model = clone_fn(model, input_tensors=input_a)
+    self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+    self.assertTrue(new_model._is_graph_network)
+
+    # On top of new, non-Keras tensor  -- clone model should always have an
+    # InputLayer.
+    if not context.executing_eagerly():
+      # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error
+      # saying they should not be used with EagerTensors
       input_a = keras.backend.variable(val_a)
-      new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_sequential_model_input_layer(self):
-
-    def test_input_layer(include_inputs):
-      with self.cached_session():
-        val_a = np.random.random((10, 4))
-        model = sequential_model(include_inputs, include_inputs)
-        # Sanity check
-        self.assertEqual(
-            isinstance(model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(model._is_graph_network, include_inputs)
-
-      keras.backend.clear_session()
-      with self.cached_session():
-        # With placeholder creation -- clone model should have an InputLayer
-        # if the original model has one.
-        new_model = keras.models.clone_model(model)
-        self.assertEqual(
-            isinstance(new_model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
-
-        # On top of new tensor  -- clone model should always have an InputLayer.
-        input_a = keras.Input(shape=(4,))
-        new_model = keras.models.clone_model(model, input_tensors=input_a)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-        # On top of new, non-Keras tensor  -- clone model should always have an
-        # InputLayer.
-        input_a = keras.backend.variable(val_a)
-        new_model = keras.models.clone_model(model, input_tensors=input_a)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-    test_input_layer(True)
-    test_input_layer(False)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_functional_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_b = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      input_a = keras.Input(shape=(4,))
-      input_b = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_b = dense_1(input_b)
-      x_a = dense_2(x_a)
-      outputs = keras.layers.add([x_a, x_b])
-      model = keras.models.Model([input_a, input_b], outputs)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models.clone_model(model)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new tensors
-      input_a = keras.Input(shape=(4,), name='a')
-      input_b = keras.Input(shape=(4,), name='b')
-      new_model = keras.models.clone_model(
-          model, input_tensors=[input_a, input_b])
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new, non-Keras tensors
+      new_model = clone_fn(model, input_tensors=input_a)
+      self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+      self.assertTrue(new_model._is_graph_network)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'clone_weights', 'share_weights': False},
+      {'testcase_name': 'share_weights', 'share_weights': True},
+  ])
+  def test_clone_functional_model(self, share_weights):
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_functional_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    val_a = np.random.random((10, 4))
+    val_b = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    input_a = keras.Input(shape=(4,))
+    input_b = keras.Input(shape=(4,))
+    dense_1 = keras.layers.Dense(4,)
+    dense_2 = keras.layers.Dense(4,)
+
+    x_a = dense_1(input_a)
+    x_a = keras.layers.Dropout(0.5)(x_a)
+    x_a = keras.layers.BatchNormalization()(x_a)
+    x_b = dense_1(input_b)
+    x_a = dense_2(x_a)
+    outputs = keras.layers.add([x_a, x_b])
+    model = keras.models.Model([input_a, input_b], outputs)
+
+    # With placeholder creation
+    new_model = clone_fn(model)
+    self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch([val_a, val_b], val_out)
+
+    # On top of new tensors
+    input_a = keras.Input(shape=(4,), name='a')
+    input_b = keras.Input(shape=(4,), name='b')
+    new_model = keras.models.clone_model(
+        model, input_tensors=[input_a, input_b])
+    self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch([val_a, val_b], val_out)
+
+    # On top of new, non-Keras tensors
+    if not context.executing_eagerly():
+      # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error
+      # saying they should not be used with EagerTensors
       input_a = keras.backend.variable(val_a)
       input_b = keras.backend.variable(val_b)
-      new_model = keras.models.clone_model(
-          model, input_tensors=[input_a, input_b])
+      new_model = clone_fn(model, input_tensors=[input_a, input_b])
       self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
+      new_model.compile(
+          testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+          run_eagerly=testing_utils.should_run_eagerly())
       new_model.train_on_batch(None, val_out)
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_clone_functional_model_with_masking(self):
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      inputs = keras.Input((2, 1))
-      outputs = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = keras.layers.TimeDistributed(
-          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
-      model = keras.Model(inputs, outputs)
-
-      model = keras.models.clone_model(model)
-      model.compile(loss='mse', optimizer=adam.AdamOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'clone_weights', 'share_weights': False},
+      {'testcase_name': 'share_weights', 'share_weights': True},
+  ])
+  def test_clone_functional_with_masking(self, share_weights):
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_functional_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    x = np.array([[[1.], [1.]], [[0.], [0.]]])
+    inputs = keras.Input((2, 1))
+    outputs = keras.layers.Masking(mask_value=0)(inputs)
+    outputs = keras.layers.TimeDistributed(
+        keras.layers.Dense(1, kernel_initializer='one'))(outputs)
+    model = keras.Model(inputs, outputs)
+
+    model = clone_fn(model)
+    model.compile(
+        loss='mse', optimizer=testing_utils.get_v2_optimizer('adam'),
+        run_eagerly=testing_utils.should_run_eagerly())
+    y = np.array([[[1], [1]], [[1], [1]]])
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(float(loss), 0.)
 
   def test_model_cloning_invalid_use_cases(self):
     seq_model = keras.models.Sequential()
@@ -249,168 +275,26 @@ class TestModelCloning(test.TestCase):
       self.assertFalse(has_placeholder)
 
 
-class TestModelCloningLayerPreserveWeights(test.TestCase):
-
-  @test_util.run_deprecated_v1
-  def test_clone_sequential_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      model = sequential_model(False)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models._clone_sequential_model(
-          model, share_weights=True)
-      # update ops from batch norm needs to be included
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new tensor
-      input_a = keras.Input(shape=(4,))
-      new_model = keras.models._clone_sequential_model(
-          model, input_tensors=input_a, share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new, non-Keras tensor
-      input_a = keras.backend.variable(val_a)
-      new_model = keras.models._clone_sequential_model(
-          model, input_tensors=input_a, share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_deprecated_v1
-  def test_clone_sequential_model_input_layer(self):
-
-    @test_util.run_deprecated_v1
-    def test_input_layer(include_inputs):
-      with self.cached_session():
-        val_a = np.random.random((10, 4))
-        model = sequential_model(include_inputs, include_inputs)
-        # Sanity check
-        self.assertEqual(
-            isinstance(model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(model._is_graph_network, include_inputs)
-
-      keras.backend.clear_session()
-      with self.cached_session():
-        # With placeholder creation -- clone model should have an InputLayer
-        # if the original model has one.
-        new_model = keras.models._clone_sequential_model(
-            model, share_weights=True)
-        self.assertEqual(
-            isinstance(new_model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
-
-        # On top of new tensor  -- clone model should always have an InputLayer.
-        input_a = keras.Input(shape=(4,))
-        new_model = keras.models._clone_sequential_model(
-            model, input_tensors=input_a, share_weights=True)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-        # On top of new, non-Keras tensor  -- clone model should always have an
-        # InputLayer.
-        input_a = keras.backend.variable(val_a)
-        new_model = keras.models._clone_sequential_model(
-            model, input_tensors=input_a, share_weights=True)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-    test_input_layer(True)
-    test_input_layer(False)
-
-  @test_util.run_deprecated_v1
-  def test_clone_functional_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_b = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      input_a = keras.Input(shape=(4,))
-      input_b = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_b = dense_1(input_b)
-      x_a = dense_2(x_a)
-      outputs = keras.layers.add([x_a, x_b])
-      model = keras.models.Model([input_a, input_b], outputs)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models._clone_functional_model(
-          model, share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new tensors
-      input_a = keras.Input(shape=(4,), name='a')
-      input_b = keras.Input(shape=(4,), name='b')
-      new_model = keras.models._clone_functional_model(
-          model, input_tensors=[input_a, input_b], share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new, non-Keras tensors
-      input_a = keras.backend.variable(val_a)
-      input_b = keras.backend.variable(val_b)
-      new_model = keras.models._clone_functional_model(
-          model, input_tensors=[input_a, input_b], share_weights=True)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_clone_functional_model_with_masking(self):
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      inputs = keras.Input((2, 1))
-      outputs = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = keras.layers.TimeDistributed(
-          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
-      model = keras.Model(inputs, outputs)
-
-      model = keras.models._clone_functional_model(
-          model, share_weights=True)
-      model.compile(loss='mse', optimizer=adam.AdamOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
-
-
 def _has_placeholder(graph):
   ops_types = [op.type for op in graph.get_operations()]
   return any('Placeholder' in s for s in ops_types)
 
 
-class CheckpointingTests(test.TestCase):
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class CheckpointingTests(keras_parameterized.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_optimizer_dependency(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(1, input_shape=(4,)))
-    opt = adam.AdamOptimizer(0.01)
-    model.compile(optimizer=opt, loss='mse')
-    model.fit(x=np.array([[1., 2., 3., 4.]]), y=[1.], epochs=2)
+    model = _get_model()
+    opt = adam.AdamOptimizer(.01)
+    model.compile(
+        optimizer=opt, loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    model.fit(
+        x=np.array([[1., 2., 3., 4.]]),
+        y=np.array([[1., 1., 1., 1.]]),
+        epochs=2)
     save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
     beta1_power, _ = opt._get_beta_accumulators()
     self.evaluate(beta1_power.assign(12.))
@@ -420,7 +304,8 @@ class CheckpointingTests(test.TestCase):
     self.assertEqual(12., self.evaluate(beta1_power))
 
 
-class TestModelBackend(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class TestModelBackend(keras_parameterized.TestCase):
 
   def test_model_backend_float64_use_cases(self):
     # Test case for GitHub issue 19318
@@ -430,7 +315,9 @@ class TestModelBackend(test.TestCase):
     x = keras.Input((5,))
     y = keras.layers.Dense(1)(x)
     model = keras.models.Model(x, y)
-    model.compile('rmsprop', 'mse')
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     keras.backend.set_floatx(floatx)
 
@@ -465,48 +352,46 @@ class TestModelDeepCopy(test.TestCase):
                       model_copy.get_weights()[0]))
 
 
-@test_util.run_v1_only('b/120545219')
-class TestCloneAndBuildModel(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class TestCloneAndBuildModel(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types
   def test_clone_and_build_non_compiled_model(self):
-    with self.cached_session():
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
 
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
-        models.clone_and_build_model(model, compile_clone=True)
-
-      # With placeholder creation
-      new_model = models.clone_and_build_model(model, compile_clone=False)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.evaluate(inp, out)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.train_on_batch(inp, out)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(inp, out)
+    model = _get_model()
+
+    with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
+      models.clone_and_build_model(model, compile_clone=True)
 
-      # Create new tensors for inputs and targets
-      input_a = keras.Input(shape=(4,))
-      target_a = keras.Input(shape=(4,))
-      new_model = models.clone_and_build_model(model, input_tensors=input_a,
-                                               target_tensors=[target_a],
-                                               compile_clone=False)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.evaluate(inp, out)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.train_on_batch(inp, out)
-      new_model.compile('rmsprop', 'mse')
+    is_subclassed = (testing_utils.get_model_type() == 'subclass')
+    # With placeholder creation
+    new_model = models.clone_and_build_model(
+        model, compile_clone=False, in_place_reset=is_subclassed)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.evaluate(inp, out)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.train_on_batch(inp, out)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch(inp, out)
+
+    # Create new tensors for inputs and targets
+    input_a = keras.Input(shape=(4,))
+    target_a = keras.Input(shape=(4,))
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, target_tensors=[target_a],
+        compile_clone=False, in_place_reset=is_subclassed)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.evaluate(inp, out)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
       new_model.train_on_batch(inp, out)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch(inp, out)
 
   def _assert_same_compile_params(self, model):
     """Assert that two models have the same compile parameters."""
@@ -519,134 +404,88 @@ class TestCloneAndBuildModel(test.TestCase):
     self.assertEqual(['acc', metrics.categorical_accuracy],
                      model._compile_metrics)
 
-  def _clone_and_build_test_helper(self, model, is_subclassed=False):
+  def _clone_and_build_test_helper(self, model, model_type):
     inp = np.random.random((10, 4))
     out = np.random.random((10, 4))
 
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = models.clone_and_build_model(
-          model, compile_clone=True, in_place_reset=is_subclassed)
+    is_subclassed = (model_type == 'subclass')
+
+    # With placeholder creation
+    new_model = models.clone_and_build_model(
+        model, compile_clone=True, in_place_reset=is_subclassed)
+
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+    # Create new tensors for inputs and targets
+    input_a = keras.Input(shape=(4,), name='a')
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, compile_clone=True,
+        in_place_reset=is_subclassed)
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+    target_a = keras.Input(shape=(4,), name='b')
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, target_tensors=[target_a],
+        compile_clone=True, in_place_reset=is_subclassed)
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_clone_and_build_compiled(self):
+    model = _get_model()
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    self._clone_and_build_test_helper(model, testing_utils.get_model_type())
+
+  def test_clone_and_build_sequential_without_inputs_defined(self):
+    model = models.Sequential(_get_layers(input_shape=None))
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'),
+        'mse', metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
+    self._clone_and_build_test_helper(model, 'sequential')
 
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-      # Create new tensors for inputs and targets
-      input_a = keras.Input(shape=(4,), name='a')
-      new_model = models.clone_and_build_model(
-          model, input_tensors=input_a, compile_clone=True,
-          in_place_reset=is_subclassed)
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-      target_a = keras.Input(shape=(4,), name='b')
-      new_model = models.clone_and_build_model(
-          model, input_tensors=input_a, target_tensors=[target_a],
-          compile_clone=True, in_place_reset=is_subclassed)
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-  def test_clone_and_build_compiled_sequential_model(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-
-    self._clone_and_build_test_helper(model)
-
-  def test_clone_and_build_functional_model(self):
-    with self.cached_session():
-      input_a = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_a = dense_2(x_a)
-      model = keras.models.Model(input_a, x_a)
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-
-    self._clone_and_build_test_helper(model)
-
-  def test_clone_and_build_subclassed_model(self):
-    class SubclassedModel(keras.Model):
-
-      def __init__(self):
-        super(SubclassedModel, self).__init__()
-        self.layer1 = keras.layers.Dense(4)
-        self.layer2 = keras.layers.Dense(4)
-
-      def call(self, inp):
-        out = self.layer1(inp)
-        out = keras.layers.BatchNormalization()(out)
-        out = keras.layers.Dropout(0.5)(out)
-        out = self.layer2(out)
-        return out
-
-    with self.cached_session():
-      model = SubclassedModel()
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-    self._clone_and_build_test_helper(model, True)
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+    model.train_on_batch(inp, out)
+    self._clone_and_build_test_helper(model, 'sequential')
 
   def assert_optimizer_iterations_increases(self, optimizer):
-    with self.cached_session():
-      input_a = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_a = dense_2(x_a)
-      model = keras.models.Model(input_a, x_a)
-      model.compile(optimizer, 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
+    model = _get_model()
+    model.compile(
+        optimizer, 'mse', metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      global_step = keras.backend.variable(123, dtype=dtypes.int64)
-      clone_model = models.clone_and_build_model(
-          model, compile_clone=True, optimizer_iterations=global_step)
+    global_step = keras.backend.variable(123, dtype=dtypes.int64)
+    clone_model = models.clone_and_build_model(
+        model, compile_clone=True, optimizer_iterations=global_step,
+        in_place_reset=(testing_utils.get_model_type() == 'subclass'))
 
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
-      clone_model.train_on_batch(inp, out)
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+    clone_model.train_on_batch(inp, out)
 
-      self.assertEqual(K.eval(global_step), 124)
+    self.assertEqual(K.eval(global_step), 124)
 
+  @keras_parameterized.run_with_all_model_types
   def test_replace_tf_optimizer_iterations_variable(self):
     self.assert_optimizer_iterations_increases(adam.AdamOptimizer(0.01))
 
+  @keras_parameterized.run_with_all_model_types
   def test_replace_keras_optimizer_iterations_variable(self):
-    self.assert_optimizer_iterations_increases('adam')
+    if testing_utils.should_run_eagerly():
+      # This needs to be updated to run with v2 optimizers.
+      self.skipTest('b/120991591')
 
-  def test_replace_keras_optimizer_v2_iterations_variable(self):
-    self.assert_optimizer_iterations_increases(
-        keras.optimizer_v2.adam.Adam(0.01))
-
-  def test_clone_and_build_sequential_model_without_inputs_defined(self):
-    with self.cached_session():
-      model = sequential_model(False, False)
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-    self._clone_and_build_test_helper(model, False)
-
-    with self.cached_session():
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
-      model.train_on_batch(inp, out)
-    self._clone_and_build_test_helper(model, False)
+    self.assert_optimizer_iterations_increases('adam')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/ops.py b/tensorflow/python/keras/ops.py
index dca076eea1f8e9e526957870c531b540369bfa0d..bc14eef505853723dc494e0f8c6b764bf5d297d0 100644
--- a/tensorflow/python/keras/ops.py
+++ b/tensorflow/python/keras/ops.py
@@ -19,37 +19,76 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import keras_export
 
 
-keras_export("keras.initializers.Initializer")(
+# pylint: disable=bad-continuation
+keras_export(v1=["keras.initializers.Initializer"])(
     init_ops.Initializer)
-keras_export("keras.initializers.Zeros", "keras.initializers.zeros")(
+keras_export(v1=["keras.initializers.Zeros", "keras.initializers.zeros"])(
     init_ops.Zeros)
-keras_export("keras.initializers.Ones", "keras.initializers.ones")(
+keras_export(v1=["keras.initializers.Ones", "keras.initializers.ones"])(
     init_ops.Ones)
-keras_export("keras.initializers.Constant", "keras.initializers.constant")(
+keras_export(v1=["keras.initializers.Constant", "keras.initializers.constant"])(
     init_ops.Constant)
-keras_export("keras.initializers.VarianceScaling")(
+keras_export(v1=["keras.initializers.VarianceScaling"])(
     init_ops.VarianceScaling)
-keras_export("keras.initializers.Orthogonal", "keras.initializers.orthogonal")(
+keras_export(v1=["keras.initializers.Orthogonal",
+                 "keras.initializers.orthogonal"])(
     init_ops.Orthogonal)
-keras_export("keras.initializers.Identity", "keras.initializers.identity")(
+keras_export(v1=["keras.initializers.Identity",
+                 "keras.initializers.identity"])(
     init_ops.Identity)
-keras_export("keras.initializers.glorot_uniform")(
+keras_export(v1=["keras.initializers.glorot_uniform"])(
     init_ops.GlorotUniform)
-keras_export("keras.initializers.glorot_normal")(
+keras_export(v1=["keras.initializers.glorot_normal"])(
     init_ops.GlorotNormal)
-keras_export("keras.initializers.lecun_normal")(
+keras_export(v1=["keras.initializers.lecun_normal"])(
     init_ops.lecun_normal)
-keras_export("keras.initializers.lecun_uniform")(
+keras_export(v1=["keras.initializers.lecun_uniform"])(
     init_ops.lecun_uniform)
-keras_export("keras.initializers.he_normal")(
+keras_export(v1=["keras.initializers.he_normal"])(
     init_ops.he_normal)
-keras_export("keras.initializers.he_uniform")(
+keras_export(v1=["keras.initializers.he_uniform"])(
     init_ops.he_uniform)
 
+keras_export("keras.initializers.Initializer", v1=[])(
+    init_ops_v2.Initializer)
+keras_export("keras.initializers.Zeros", v1=[])(
+    init_ops_v2.Zeros)
+keras_export("keras.initializers.Ones", v1=[])(
+    init_ops_v2.Ones)
+keras_export("keras.initializers.Constant", v1=[])(
+    init_ops_v2.Constant)
+keras_export("keras.initializers.VarianceScaling", v1=[])(
+    init_ops_v2.VarianceScaling)
+keras_export("keras.initializers.Orthogonal", v1=[])(
+    init_ops_v2.Orthogonal)
+keras_export("keras.initializers.Identity", v1=[])(
+    init_ops_v2.Identity)
+keras_export("keras.initializers.GlorotUniform", v1=[])(
+    init_ops_v2.GlorotUniform)
+keras_export("keras.initializers.GlorotNormal", v1=[])(
+    init_ops_v2.GlorotNormal)
+keras_export("keras.initializers.lecun_normal", v1=[])(
+    init_ops_v2.lecun_normal)
+keras_export("keras.initializers.lecun_uniform", v1=[])(
+    init_ops_v2.lecun_uniform)
+keras_export("keras.initializers.he_normal", v1=[])(
+    init_ops_v2.he_normal)
+keras_export("keras.initializers.he_uniform", v1=[])(
+    init_ops_v2.he_uniform)
+keras_export("keras.initializers.RandomNormal", v1=[])(
+    init_ops_v2.RandomNormal)
+keras_export("keras.initializers.RandomUniform", v1=[])(
+    init_ops_v2.RandomUniform)
+keras_export("keras.initializers.TruncatedNormal", v1=[])(
+    init_ops_v2.TruncatedNormal)
+# pylint: enable=bad-continuation
+
+
 keras_export("keras.backend.name_scope")(ops.name_scope)
 
 keras_export("keras.losses.Reduction", v1=[])(
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index b8f01249419c595a735442310c735bc10648cba6..45afe2a134cdfb5a6bae1f0c5be760433602f65b 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -34,6 +34,8 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/keras:backend_config",
     ],
 )
 
@@ -172,9 +174,9 @@ cuda_py_test(
 
 py_test(
     name = "optimizer_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["optimizer_v2_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     tags = [
         "no_windows",
     ],
@@ -212,4 +214,5 @@ cuda_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
+    shard_count = 2,
 )
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index 8f485b2440e497b708c4f8a40f2b1fe60a612257..a3d5538ea86a0e0ed86e5ee70df69248ec76ba48 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -20,12 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.optimizers.Adadelta', v1=[])
+@keras_export('keras.optimizers.Adadelta')
 class Adadelta(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the Adadelta algorithm.
 
@@ -77,7 +78,11 @@ class Adadelta(optimizer_v2.OptimizerV2):
                to better conditioning the grad update.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adadelta".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     @compatibility(eager)
     When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
@@ -86,6 +91,8 @@ class Adadelta(optimizer_v2.OptimizerV2):
     invocations of optimizer functions.
     @end_compatibility
     """
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     super(Adadelta, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
index c95af6a8ad5308c357d96532f6599342b16aa276..06ff975212d9e405ff9bc4c6283e2e115ce4c1d2 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -170,11 +170,25 @@ class AdadeltaOptimizerTest(test.TestCase):
 
   def testConstructAdadeltaWithLR(self):
     opt = adadelta.Adadelta(lr=1.0, rho=0.9, epsilon=1.)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1., lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1.)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdadeltaWithEpsilonValues(self):
+    opt = adadelta.Adadelta(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adadelta.Adadelta(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
index af359b5f591186641e483aa0dc30a734b3aee62f..0840aa6fae5be0b698de69827f483ec55b9ea37a 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -30,7 +31,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.optimizers.Adagrad', v1=[])
+@keras_export('keras.optimizers.Adagrad')
 class Adagrad(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the Adagrad algorithm.
 
@@ -70,7 +71,11 @@ class Adagrad(optimizer_v2.OptimizerV2):
         Starting value for the accumulators, must be positive.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adagrad".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
@@ -85,6 +90,8 @@ class Adagrad(optimizer_v2.OptimizerV2):
     if initial_accumulator_value < 0.0:
       raise ValueError('initial_accumulator_value must be non-negative: %s' %
                        initial_accumulator_value)
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     if epsilon < 1e-7:
       raise ValueError('epsilon must be larger than 1e-7: %s' % epsilon)
     super(Adagrad, self).__init__(name, **kwargs)
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
index cf6f6a7832c56cd36d4b99ac88e26ce5c09ac7f6..864aefaf70def4249b8d93cfad34f0a594e03ba9 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -400,11 +400,29 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testConstructAdagradWithLR(self):
     opt = adagrad.Adagrad(lr=1.0)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = adagrad.Adagrad(learning_rate=0.1, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = adagrad.Adagrad(learning_rate=0.1)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdagradWithEpsilonValues(self):
+    opt = adagrad.Adagrad(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adagrad.Adagrad(epsilon=1e-6)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-6)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "epsilon must be larger than 1e-7"):
+      opt = adagrad.Adagrad(epsilon=1e-8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 292323be60a769e8330085b89627c66ec027bd87..4fa7c7361543d5389872ebd70cb0df261325a9c4 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -18,16 +18,16 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.optimizers.Adam', v1=[])
+@keras_export('keras.optimizers.Adam')
 class Adam(optimizer_v2.OptimizerV2):
   """Optimizer that implements the Adam algorithm.
 
@@ -125,9 +125,15 @@ class Adam(optimizer_v2.OptimizerV2):
         a callable that takes no arguments and returns the actual value to use.
         This can be useful for changing these values across different
         invocations of optimizer functions. @end_compatibility
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
 
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     super(Adam, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
@@ -240,11 +246,6 @@ class Adam(optimizer_v2.OptimizerV2):
           use_locking=self._use_locking)
       return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
 
-  def _resource_scatter_add(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
-      return x.value()
-
   def get_config(self):
     config = super(Adam, self).get_config()
     config.update({
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
index 49a9de41cdf8fd6391c31b2e75b9eb116eeabfbd..7918c09b7e04aa0a558b7ebc30ee0120eb358b9f 100644
--- a/tensorflow/python/keras/optimizer_v2/adam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -505,11 +505,25 @@ class AdamOptimizerTest(test.TestCase):
 
   def testConstructAdamWithLR(self):
     opt = adam.Adam(lr=1.0)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = adam.Adam(learning_rate=0.1, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = adam.Adam(learning_rate=0.1)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdamWithEpsilonValues(self):
+    opt = adam.Adam(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adam.Adam(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
index 8ee5c2a9f890141a87651d712b727a4cfa4e5696..3102e28cffcc846e12f72c8f2dd03662a99e2ed3 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -19,17 +19,17 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras import backend_config
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.optimizers.Adamax', v1=[])
-class Adamax(adam.Adam):
+@keras_export('keras.optimizers.Adamax')
+class Adamax(optimizer_v2.OptimizerV2):
   """Optimizer that implements the Adamax algorithm.
 
   It is a variant of Adam based on the infinity norm.
@@ -90,18 +90,27 @@ class Adamax(adam.Adam):
       epsilon: A small constant for numerical stability.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
-    # pylint: disable=useless-super-delegation
-    super(Adamax, self).__init__(
-        learning_rate=learning_rate,
-        beta_1=beta_1,
-        beta_2=beta_2,
-        epsilon=epsilon,
-        amsgrad=False,
-        name=name,
-        **kwargs)
-    # pylint: enable=useless-super-delegation
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
+    super(Adamax, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      self.add_slot(var, 'm')  # Create slots for the first moments.
+    for var in var_list:
+      self.add_slot(var, 'v')  # Create slots for the second moments.
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
@@ -154,8 +163,13 @@ class Adamax(adam.Adam):
       var_update = self._resource_scatter_add(var, indices, var_slice)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
-  def _resource_scatter_update(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_update(
-            x.handle, i, v)]):
-      return x.value()
+  def get_config(self):
+    config = super(Adamax, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adamax_test.py b/tensorflow/python/keras/optimizer_v2/adamax_test.py
index 339c0fe6e6dbc5d9fc90aa29b212b5e0c2a290f1..6934f1590eb32dc2626efa65fcdfb56d4dace4bb 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax_test.py
@@ -364,11 +364,25 @@ class AdamaxOptimizerTest(test.TestCase):
 
   def testConstructAdamaxWithLR(self):
     opt = adamax.Adamax(lr=1.0)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = adamax.Adamax(learning_rate=0.1, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = adamax.Adamax(learning_rate=0.1)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdamaxWithEpsilonValues(self):
+    opt = adamax.Adamax(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adamax.Adamax(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
index 5783fb12b36081fee62d5a693eccc4cab676e6d8..a86fd8d89dbc824cc35a4a6585c85e1794a6aa5c 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -24,7 +24,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.optimizers.Ftrl', v1=[])
+@keras_export('keras.optimizers.Ftrl')
 class Ftrl(optimizer_v2.OptimizerV2):
   """Optimizer that implements the FTRL algorithm.
 
@@ -72,7 +72,11 @@ class Ftrl(optimizer_v2.OptimizerV2):
                   2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
         where lr_t is the learning rate at t.
         When input is sparse shrinkage will only happen on the active weights.\
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If one of the arguments is invalid.
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index 2e64e080954fc64b86a8ce8be750369e228f43fa..c444f969f6492dd3de646f8bab80ba65d3da625d 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -24,7 +24,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export("keras.optimizers.SGD", v1=[])
+@keras_export("keras.optimizers.SGD")
 class SGD(optimizer_v2.OptimizerV2):
   """Stochastic gradient descent and momentum optimizer.
 
@@ -74,7 +74,11 @@ class SGD(optimizer_v2.OptimizerV2):
       nesterov: boolean. Whether to apply Nesterov momentum.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to 'SGD'.
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
     super(SGD, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
index 9a4178db46981afb7fe841e5b8d2506db7692cfe..333a6f288eaeda7313e07002d8fad229d372ebec 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -263,11 +263,16 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testConstructSGDWithLR(self):
     opt = gradient_descent.SGD(lr=1.0)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = gradient_descent.SGD(learning_rate=0.1, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = gradient_descent.SGD(learning_rate=0.1)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
 
 
 class MomentumOptimizerTest(test.TestCase):
@@ -667,11 +672,16 @@ class MomentumOptimizerTest(test.TestCase):
       opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
       config = opt.get_config()
       opt2 = gradient_descent.SGD.from_config(config)
-      # assert both are equal float values.
-      self.assertEqual(
-          opt._get_hyper("learning_rate"), opt2._get_hyper("learning_rate"))
-      self.assertEqual(opt._get_hyper("momentum"), opt2._get_hyper("momentum"))
-      # self.assertEqual(opt._get_hyper("decay"), opt2._get_hyper("decay"))
+      lr = opt.lr
+      lr2 = opt2.lr
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
+      self.assertAllClose(
+          self.evaluate(opt._get_hyper("momentum")),
+          self.evaluate(opt2._get_hyper("momentum")))
+      self.assertAllClose(
+          self.evaluate(opt._get_hyper("decay")),
+          self.evaluate(opt2._get_hyper("decay")))
       var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
       loss = lambda: 3 * var0
       # learning rate variable created when calling minimize.
@@ -679,14 +689,15 @@ class MomentumOptimizerTest(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
       config = opt.get_config()
       opt3 = gradient_descent.SGD.from_config(config)
-      self.assertEqual(
-          self.evaluate(opt._get_hyper("learning_rate")),
-          opt3._get_hyper("learning_rate"))
-      self.assertEqual(
+      lr3 = opt3.lr
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
+      self.assertAllClose(
           self.evaluate(opt._get_hyper("momentum")),
-          opt3._get_hyper("momentum"))
-      # self.assertEqual(
-      #     self.evaluate(opt._get_hyper("decay")), opt3._get_hyper("decay"))
+          self.evaluate(opt3._get_hyper("momentum")))
+      self.assertAllClose(
+          self.evaluate(opt._get_hyper("decay")),
+          self.evaluate(opt3._get_hyper("decay")))
       self.assertTrue(opt3.nesterov)
 
   def testNesterovWithoutMomentum(self):
@@ -695,11 +706,16 @@ class MomentumOptimizerTest(test.TestCase):
 
   def testConstructMomentumWithLR(self):
     opt = gradient_descent.SGD(lr=1.0, momentum=0.9)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
index afa74c8de37665ea217fa55cbdea3dda86908f55..d515f987251f26cd46c2358f068b325cb29fa5cc 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -18,15 +18,17 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras import backend_config
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
-class Nadam(adam.Adam):
+@keras_export('keras.optimizers.Nadam')
+class Nadam(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the NAdam algorithm.
 
   Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
@@ -34,17 +36,21 @@ class Nadam(adam.Adam):
 
   Initialization:
 
-  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
-  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$m_0 := 0 \text{(Initialize 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize 2nd moment vector)}$$
+  $$mu_0 := 1$$
   $$t := 0 \text{(Initialize timestep)}$$
 
   Computes:
   $$t := t + 1$$
-  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-  $$m_bar_t := beta_1 * v_t + (1 - beta_1) * g$$
-  $$theta_t := theta_{t-1} - lr_t * m_bar_t / (\sqrt{v_t} + \epsilon)$$
+  $$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
+  $$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
+  $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
+  $$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
+  $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
+  $$v' := v_t / (1 - \beta_2^t)$$
+  $$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
+  $$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$
 
   gradient is evaluated at theta(t) + momentum * v(t), and the variables always
   store theta + beta_1 * m / sqrt(v) instead of theta.
@@ -71,57 +77,89 @@ class Nadam(adam.Adam):
       epsilon: A small constant for numerical stability.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
 
     # Backwards compatiblity with keras NAdam optimizer.
-    if 'schedule_decay' in kwargs:
-      kwargs['decay'] = kwargs.pop('schedule_decay')
-    # pylint: disable=useless-super-delegation
-    super(Nadam, self).__init__(
-        learning_rate=learning_rate,
-        beta_1=beta_1,
-        beta_2=beta_2,
-        epsilon=epsilon,
-        amsgrad=False,
-        name=name,
-        **kwargs)
-    # pylint: enable=useless-super-delegation
+    kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
+    super(Nadam, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+    self._m_cache = None
+
+  def _create_slots(self, var_list):
+    var_dtype = var_list[0].dtype.base_dtype
+    if self._m_cache is None:
+      self._m_cache = self.add_weight(
+          'momentum_cache',
+          shape=[],
+          dtype=var_dtype,
+          initializer='ones',
+          trainable=False)
+      self._weights.append(self._m_cache)
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      # Create slots for the first moments.
+      self.add_slot(var, 'm')
+    for var in var_list:
+      # Create slots for the second moments.
+      self.add_slot(var, 'v')
+
+  def _prepare(self, var_list):
+    var_dtype = var_list[0].dtype.base_dtype
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    decay_base = math_ops.cast(0.96, var_dtype)
+    self.m_cache_t = beta_1_t * (
+        1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * local_step)))
+    self.m_cache_t_1 = beta_1_t * (
+        1. - 0.5 *
+        (math_ops.pow(decay_base, self._initial_decay * (local_step + 1))))
+    m_schedule_new = self._m_cache * self.m_cache_t
+    self.m_schedule_new = state_ops.assign(
+        self._m_cache, m_schedule_new, use_locking=self._use_locking)
+    self.m_schedule_next = self.m_schedule_new * self.m_cache_t_1
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
-    lr_t = self._decayed_lr(var_dtype)
+    lr_t = self._get_hyper('learning_rate', var_dtype)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
     m = self.get_slot(var, 'm')
     v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
-    beta_1_power = math_ops.pow(beta_1_t, local_step)
-    beta_2_power = math_ops.pow(beta_2_t, local_step)
-    return training_ops.resource_apply_adam(
-        var.handle,
-        m.handle,
-        v.handle,
-        beta_1_power,
-        beta_2_power,
-        lr_t,
-        beta_1_t,
-        beta_2_t,
-        self._get_hyper('epsilon', var_dtype),
-        grad,
-        use_locking=self._use_locking,
-        use_nesterov=True)
+
+    g_prime = grad / (1. - self.m_schedule_new)
+    m_t = beta_1_t * m + (1 - beta_1_t) * grad
+    m_t = state_ops.assign(m, m_t, use_locking=self._use_locking)
+    m_t_prime = m_t / (1. - self.m_schedule_next)
+    v_t = beta_2_t * v + (1 - beta_2_t) * math_ops.square(grad)
+    v_t = state_ops.assign(v, v_t, use_locking=self._use_locking)
+    v_t_prime = v_t / (1. - math_ops.pow(beta_2_t, local_step))
+    m_t_bar = (1. - self.m_cache_t) * g_prime + self.m_cache_t_1 * m_t_prime
+    var_t = var - lr_t * m_t_bar / (math_ops.sqrt(v_t_prime) + epsilon_t)
+    return state_ops.assign(var, var_t, use_locking=self._use_locking).op
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
-    lr_t = self._decayed_lr(var_dtype)
+    lr_t = self._get_hyper('learning_rate', var_dtype)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
-    beta_1_power = math_ops.pow(beta_1_t, local_step)
-    beta_2_power = math_ops.pow(beta_2_t, local_step)
-    epsilon_t = self._get_hyper('epsilon', var_dtype)
-    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    g_prime = grad / (1. - self.m_schedule_new)
 
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, 'm')
@@ -129,8 +167,10 @@ class Nadam(adam.Adam):
     m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
-      # m_bar = (1 - beta1) * g_t + beta1 * m_t
-      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)
+      m_t_slice = array_ops.gather(m_t, indices)
+
+    m_t_prime = m_t_slice / (1. - self.m_schedule_next)
+    m_t_bar = (1. - self.m_cache_t) * g_prime + self.m_cache_t_1 * m_t_prime
 
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, 'v')
@@ -138,9 +178,22 @@ class Nadam(adam.Adam):
     v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
     with ops.control_dependencies([v_t]):
       v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
-
-    v_t_slice = array_ops.gather(v_t, indices)
-    v_sqrt = math_ops.sqrt(v_t_slice)
-    var_update = self._resource_scatter_add(var, indices,
-                                            -lr * m_bar / (v_sqrt + epsilon_t))
-    return control_flow_ops.group(*[var_update, m_bar, v_t])
+      v_t_slice = array_ops.gather(v_t, indices)
+
+    v_t_prime = v_t_slice / (1. - math_ops.pow(beta_2_t, local_step))
+    v_prime_sqrt = math_ops.sqrt(v_t_prime)
+
+    var_update = self._resource_scatter_add(
+        var, indices, -lr_t * m_t_bar / (v_prime_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_t_bar, v_t])
+
+  def get_config(self):
+    config = super(Nadam, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
index 73568e81f0c6ae680226a123c0098e56a131e826..8dd61956f6f1efcbf11c8e8379ac0c5eac2cc5ef 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -40,45 +40,54 @@ def get_beta_accumulators(opt, dtype):
   return (beta_1_power, beta_2_power)
 
 
+def update_m_cache(m_cache, t, beta1=0.9):
+  mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
+  m_cache_t = m_cache * mu_t
+  return m_cache_t
+
+
 def nadam_update_numpy(param,
                        g_t,
                        t,
                        m,
                        v,
+                       m_cache,
                        alpha=0.001,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=1e-8):
-  alpha_t = alpha * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
 
+  mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
+  mu_t_1 = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 2)))
+  m_cache_t_1 = m_cache * mu_t_1
+  g_prime_t = g_t / (1 - m_cache)
   m_t = beta1 * m + (1 - beta1) * g_t
   v_t = beta2 * v + (1 - beta2) * g_t * g_t
 
-  m_bar = (1 - beta1) * g_t + beta1 * m_t
+  m_prime_t = m_t / (1 - m_cache_t_1)
+  v_prime_t = v_t / (1 - beta2**(t + 1))
+  m_bar_t = (1 - mu_t) * g_prime_t + mu_t_1 * m_prime_t
 
-  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  param_t = param - alpha * m_bar_t / (np.sqrt(v_prime_t) + epsilon)
   return param_t, m_t, v_t
 
 
 class NadamOptimizerTest(test.TestCase):
 
-  def doTestSparse(self, use_resource=False):
+  @test_util.run_deprecated_v1
+  def testSparse(self):
     sparse_epsilon = 1e-7
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
         var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
         grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
 
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
         grads0_np_indices = np.array([0, 2], dtype=np.int32)
         grads0 = ops.IndexedSlices(
             constant_op.constant(grads0_np[grads0_np_indices]),
@@ -103,74 +112,22 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
           update.run()
 
+          mcache = update_m_cache(mcache, t)
           var0_np, m0, v0 = nadam_update_numpy(
-              var0_np, grads0_np, t, m0, v0, epsilon=sparse_epsilon)
+              var0_np, grads0_np, t, m0, v0, mcache, epsilon=sparse_epsilon)
           var1_np, m1, v1 = nadam_update_numpy(
-              var1_np, grads1_np, t, m1, v1, epsilon=sparse_epsilon)
-
-          # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
-  @test_util.run_deprecated_v1
-  def testSparse(self):
-    self.doTestSparse(use_resource=False)
-
-  @test_util.run_deprecated_v1
-  def testResourceSparse(self):
-    self.doTestSparse(use_resource=True)
-
-  def doTestBasic(self, use_resource=False):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
-
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-        grads0 = constant_op.constant(grads0_np)
-        grads1 = constant_op.constant(grads1_np)
-        opt = nadam.Nadam()
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-
-        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
-
-        # Run 3 steps of Nadam
-        for t in range(3):
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
-          update.run()
-
-          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
-          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+              var1_np, grads1_np, t, m1, v1, mcache, epsilon=sparse_epsilon)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
   @test_util.run_deprecated_v1
-  def testResourceBasic(self):
-    self.doTestBasic(use_resource=True)
-
-  @test_util.run_deprecated_v1
-  def testBasicWithLearningRateDecay(self):
+  def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
@@ -180,9 +137,7 @@ class NadamOptimizerTest(test.TestCase):
         var1 = resource_variable_ops.ResourceVariable(var1_np)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
-        learning_rate = 0.001
-        decay = 0.5
-        opt = nadam.Nadam(learning_rate=learning_rate, decay=decay)
+        opt = nadam.Nadam()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
@@ -190,19 +145,15 @@ class NadamOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], var0.eval())
         self.assertAllClose([3.0, 4.0], var1.eval())
 
-        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
-
         # Run 3 steps of Nadam
         for t in range(3):
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
           update.run()
 
-          lr = learning_rate / (1 + decay * t)
-          var0_np, m0, v0 = nadam_update_numpy(
-              var0_np, grads0_np, t, m0, v0, alpha=lr)
-          var1_np, m1, v1 = nadam_update_numpy(
-              var1_np, grads1_np, t, m1, v1, alpha=lr)
+          mcache = update_m_cache(mcache, t)
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0,
+                                               mcache)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1,
+                                               mcache)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
@@ -210,15 +161,31 @@ class NadamOptimizerTest(test.TestCase):
 
   def testConstructNAdamWithLR(self):
     opt = nadam.Nadam(lr=1.0)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = nadam.Nadam(learning_rate=0.1, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = nadam.Nadam(learning_rate=0.1)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
 
   def testConstructNAdamWithScheduleDecay(self):
     opt = nadam.Nadam(schedule_decay=0.2)
-    self.assertEqual(opt.decay, 0.2)
+    self.assertIsInstance(opt.decay, variables.Variable)
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.decay), (0.2))
+
+  def testConstructNAdamWithEpsilonValues(self):
+    opt = nadam.Nadam(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = nadam.Nadam(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index d0f16f0b4f340b5dbb088171427b3823894d6e34..6cd6cf0a8d934a04e04fab8ba7bd1810b304de12 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -28,6 +28,7 @@ import six
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
+from tensorflow.python.distribute import values as distributed_values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -39,6 +40,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -67,8 +69,8 @@ def _deduplicate_indexed_slices(values, indices):
 
 
 @six.add_metaclass(abc.ABCMeta)
-@keras_export("keras.optimizers.Optimizer", v1=[])
-class OptimizerV2(checkpointable.CheckpointableBase):
+@keras_export("keras.optimizers.Optimizer")
+class OptimizerV2(checkpointable.Checkpointable):
   """Updated base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -93,6 +95,11 @@ class OptimizerV2(checkpointable.CheckpointableBase):
   opt_op.run()
   ```
 
+  ### Thread Compatibility
+
+  The entire optimizer is currently thread compatible, not thread-safe. The user
+  needs to perform synchronization if necessary.
+
   ### Processing gradients before applying them.
 
   Calling `minimize()` takes care of both computing the gradients and
@@ -155,13 +162,26 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     Args:
       name: A non-empty string.  The name to use for accumulators created
         for the optimizer.
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If name is malformed.
       RuntimeError: If _create_slots has been overridden instead of
           _create_vars.
     """
+    allowed_kwargs = {"clipnorm", "clipvalue", "lr", "decay"}
+    for k in kwargs:
+      if k not in allowed_kwargs:
+        raise TypeError("Unexpected keyword argument "
+                        "passed to optimizer: " + str(k))
+      # checks that all keyword arguments are non-negative.
+      if kwargs[k] < 0:
+        raise ValueError("Expected {} >= 0, received: {}".format(k, kwargs[k]))
+
     self._use_locking = True
     self._name = name
     self._hyper = {}
@@ -183,9 +203,12 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     if decay < 0.:
       raise ValueError("decay cannot be less than 0: {}".format(decay))
     self._initial_decay = decay
-    self.__dict__.update(kwargs)
+    if "clipnorm" in kwargs:
+      self.clipnorm = kwargs.pop("clipnorm")
+    if "clipvalue" in kwargs:
+      self.clipvalue = kwargs.pop("clipvalue")
 
-    self._prepared = False
+    self._hypers_created = False
 
   def minimize(self, loss, var_list, grad_loss=None, name=None):
     """Add operations to minimize `loss` by updating `var_list`.
@@ -272,8 +295,7 @@ class OptimizerV2(checkpointable.CheckpointableBase):
   @staticmethod
   def _scale_loss(loss_value):
     if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= (1. / num_replicas)
     return loss_value
@@ -331,15 +353,17 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     """
     grads_and_vars = _filter_grads(grads_and_vars)
     var_list = [v for (_, v) in grads_and_vars]
-    if distribute_ctx.has_distribution_strategy():
+    if distribute_ctx.has_strategy():
       reduced_grads = merge_grads(grads_and_vars)
       grads_and_vars = zip(reduced_grads, var_list)
 
-    self._prepare()
+    self._create_hypers()
     with ops.init_scope():
       self._create_slots(var_list)
     update_ops = []
 
+    self._prepare(var_list)
+
     def update_grad_to_var(grad, var):
       """Apply gradient to variable."""
       if isinstance(var, ops.Tensor):
@@ -392,6 +416,8 @@ class OptimizerV2(checkpointable.CheckpointableBase):
         backend.set_value(self._hyper[name], value)
 
   def _get_hyper(self, name, dtype=None):
+    if not self._hypers_created:
+      self._create_hypers()
     value = self._hyper[name]
     if callable(value):
       value = value()
@@ -412,7 +438,7 @@ class OptimizerV2(checkpointable.CheckpointableBase):
       if name == "lr":
         name = "learning_rate"
       if name in self._hyper:
-        return self._hyper[name]
+        return self._get_hyper(name)
       raise e
 
   def __setattr__(self, name, value):
@@ -461,8 +487,11 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     slot_dict = self._slots[var_key]
     return slot_dict[slot_name]
 
-  def _prepare(self):
-    if self._prepared:
+  def _prepare(self, var_list):
+    pass
+
+  def _create_hypers(self):
+    if self._hypers_created:
       return
     if self._iterations is None:
       with ops.device("cpu:0"):
@@ -483,18 +512,18 @@ class OptimizerV2(checkpointable.CheckpointableBase):
             trainable=False,
             initializer=value,
             aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
-    self._prepared = True
+    self._hypers_created = True
 
   @property
   def iterations(self):
     """Variable. The number of training steps this Optimizer has run."""
-    if not self._prepared:
-      self._prepare()
+    if not self._hypers_created:
+      self._create_hypers()
     return self._iterations
 
   @iterations.setter
   def iterations(self, variable):
-    if self._prepared:
+    if self._hypers_created:
       raise RuntimeError("Cannot set `iterations` to a new Variable after"
                          "the Optimizer weights have been created")
     self._iterations = variable
@@ -551,10 +580,11 @@ class OptimizerV2(checkpointable.CheckpointableBase):
 
   def _serialize_hyperparameter(self, hyperparameter_name):
     """Serialize a hyperparameter that can be a float, callable, or Tensor."""
-    value = self._get_hyper(hyperparameter_name)
+    value = self._hyper[hyperparameter_name]
     if callable(value):
       return value()
-    if isinstance(value, (ops.Tensor, tf_variables.Variable)):
+    if isinstance(value, (ops.Tensor, tf_variables.Variable,
+                          distributed_values.TPUMirroredVariable)):
       return backend.get_value(value)
     return value
 
@@ -723,6 +753,16 @@ class OptimizerV2(checkpointable.CheckpointableBase):
     """
     raise NotImplementedError()
 
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(x.handle, i, v)]):
+      return x.value()
+
   # ---------------
   # For implementing the checkpointable interface
   # ---------------
@@ -843,8 +883,8 @@ def merge_grads(grads_and_vars):
   """Merge gradients from different replicas."""
 
   def merge_grad_fn(strategy, grads_and_vars):
-    reduced_grads = strategy.batch_reduce(ds_reduce_util.ReduceOp.SUM,
-                                          grads_and_vars)
+    reduced_grads = strategy.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     return reduced_grads
 
   return distribute_ctx.get_replica_context().merge_call(
@@ -866,8 +906,7 @@ def _var_key(var):
   """
 
   # pylint: disable=protected-access
-  if distribute_ctx.has_distribution_strategy() and hasattr(
-      var, "_primary_var"):
+  if distribute_ctx.has_strategy() and hasattr(var, "_primary_var"):
     var = var._primary_var
   if hasattr(var, "op"):
     return var._shared_name
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 42f9fcaea8ecab213029e56d56c5854b527ef95c..8069703b7a2ba2fb94c319be5b64dbd98ece2da6 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -18,38 +18,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import tempfile
-
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.keras.optimizer_v2 import adagrad
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import adamax
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
-from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import momentum
 from tensorflow.python.training import training_util
@@ -262,42 +260,25 @@ class OptimizerTest(test.TestCase):
       self.evaluate(sgd.iterations.initializer)
       self.assertEqual(0, self.evaluate(sgd.iterations))
 
-  @test_util.run_in_graph_and_eager_modes
-  def testSerializationWithinDefun(self):
-    with self.cached_session():
-      sgd = gradient_descent.SGD(3.0)
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
-                                                    dtype=dtypes.float32)
-      loss = lambda: 3 * var0
-      sgd.minimize(loss, [var0])
-
-      def serialize():
-        config = sgd.get_config()
-        gradient_descent.SGD.from_config(config)
-
-      compiled_serialize = function.defun(serialize)
-      with self.assertRaisesRegexp(RuntimeError, 'inside Tensorflow graph'):
-        compiled_serialize()
-
   @test_util.run_in_graph_and_eager_modes
   def testConfig(self):
     with self.cached_session():
       opt = gradient_descent.SGD(learning_rate=1.0)
       config = opt.get_config()
       opt2 = gradient_descent.SGD.from_config(config)
+      lr = opt._get_hyper('learning_rate')
+      lr2 = opt2._get_hyper('learning_rate')
+      self.evaluate(variables.global_variables_initializer())
       # assert both are equal float values.
-      self.assertEqual(
-          opt._get_hyper('learning_rate'), opt2._get_hyper('learning_rate'))
+      self.assertEqual(self.evaluate(lr), self.evaluate(lr2))
       var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
       loss = lambda: 3 * var0
       # learning rate variable created when calling minimize.
       opt.minimize(loss, [var0])
-      self.evaluate(variables.global_variables_initializer())
-      config = opt.get_config()
       opt3 = gradient_descent.SGD.from_config(config)
-      self.assertEqual(
-          self.evaluate(opt._get_hyper('learning_rate')),
-          opt3._get_hyper('learning_rate'))
+      lr3 = opt3._get_hyper('learning_rate')
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(self.evaluate(lr), self.evaluate(lr3))
 
   @test_util.run_in_graph_and_eager_modes
   def testGradClipValue(self):
@@ -321,6 +302,16 @@ class OptimizerTest(test.TestCase):
       self.evaluate(opt_op)
       self.assertAllClose([0.], self.evaluate(var))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidClipNorm(self):
+    with self.assertRaisesRegexp(ValueError, '>= 0'):
+      gradient_descent.SGD(learning_rate=1.0, clipnorm=-1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidKwargs(self):
+    with self.assertRaisesRegexp(TypeError, 'Unexpected keyword argument'):
+      gradient_descent.SGD(learning_rate=1.0, invalidkwargs=1.0)
+
   @test_util.run_in_graph_and_eager_modes
   def testWeights(self):
     with self.cached_session():
@@ -396,6 +387,31 @@ class OptimizerTest(test.TestCase):
     with self.assertRaises(AttributeError):
       opt.not_an_attr += 3
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGettingHyperParametersWithLrInConstructor(self):
+    opt = gradient_descent.SGD(lr=3.0)
+    var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                 dtype=dtypes.float32)
+    loss = lambda: 3 * var
+    opt_op = opt.minimize(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt_op)
+
+    self.assertTrue(isinstance(opt.lr, resource_variable_ops.ResourceVariable))
+    self.assertTrue(
+        isinstance(opt.learning_rate, resource_variable_ops.ResourceVariable))
+
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(3.0, lr)
+
+    opt.lr = 2.0
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(2.0, lr)
+
+    self.evaluate(opt.lr.assign(4.0))
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(4.0, lr)
+
   @test_util.run_in_graph_and_eager_modes
   def testOptimizerWithKerasModel(self):
     a = input_layer.Input(shape=(3,), name='input_a')
@@ -475,25 +491,20 @@ class OptimizerTest(test.TestCase):
     opt.iterations = global_step
     var = resource_variable_ops.ResourceVariable([1.0, 2.0],
                                                  dtype=dtypes.float32)
+    self.evaluate(variables.global_variables_initializer())
+    init_step_value = self.evaluate(global_step)
     loss = lambda: 3 * var
     opt_op = opt.minimize(loss, [var])
     self.evaluate(variables.global_variables_initializer())
-    init_step_value = self.evaluate(global_step)
     self.evaluate(opt_op)
     new_step_value = self.evaluate(global_step)
     self.assertEqual(new_step_value, init_step_value + 1)
 
 
-class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
+@keras_parameterized.run_with_all_model_types
+class OptimizersCompatibilityTest(keras_parameterized.TestCase):
 
-  # TODO(tanzheny): remove test_numeric after algorithm for Momentum, Adam and
-  # NAdam has been unified: currently these three algorithms behave differently.
-  @parameterized.named_parameters(
-      ('adadelta', 'adadelta', True, True), ('adagrad', 'adagrad', True, True),
-      ('adam', 'adam', True, True), ('adamax', 'adamax', True, True),
-      ('nadam', 'nadam', True, False), ('momentum', 'momentum', True, True),
-      ('sgd', 'sgd', False, True))
-  def testOptimizersCompatibility(self, opt_str, test_weights, test_numeric):
+  def _testOptimizersCompatibility(self, opt_v1, opt_v2, test_weights=True):
     np.random.seed(1331)
     with self.cached_session():
       train_samples = 20
@@ -507,43 +518,65 @@ class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
       y = keras.utils.to_categorical(y)
 
       num_hidden = 5
-      model = testing_utils.get_small_sequential_mlp(
+      model_v1 = testing_utils.get_small_sequential_mlp(
           num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_v1.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
+      model_v1.fit(x, y, batch_size=5, epochs=1)
 
-      old_mode = os.environ.get('TF2_BEHAVIOR', None)
-      # Disable tf2 to create V1 optimizer.
-      disable_tf2()
-      if opt_str == 'momentum':
-        opt_v1 = optimizers.SGD(momentum=0.9)
-      else:
-        opt_v1 = optimizers.get(opt_str)
-
-      # Test compile and fit with v1 optimizer.
-      model.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
-      model.fit(x, y, batch_size=5, epochs=1)
-      model_dir = tempfile.mkdtemp()
-      gfile.MakeDirs(model_dir)
-      file_name = os.path.join(model_dir, 'model.h5')
-      model.save(file_name)
-
-      enable_tf2()
-      # Test load and fit with v2 optimizer.
-      model_2 = saving.load_model(file_name)
-      opt_v2 = model_2.optimizer
-      self.assertIsInstance(opt_v2, optimizer_v2.OptimizerV2)
-      # set_weights is called inside load_model but exception is swallowed,
-      # this call checks the weights can be set correctly.
+      model_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_v2.set_weights(model_v1.get_weights())
+      model_v2.compile(opt_v2, loss='categorical_crossentropy', metrics=[])
+      model_v2._make_train_function()
       if test_weights:
         opt_v2.set_weights(opt_v1.get_weights())
-      if test_numeric:
-        hist_1 = model.fit(x, y, batch_size=5, epochs=1, shuffle=False)
-        hist_2 = model_2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
-        self.assertAllClose(model.get_weights(), model_2.get_weights())
-        self.assertAllClose(model.get_weights(), model_2.get_weights())
-        self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
 
-      if old_mode is not None:
-        os.environ['TF2_BEHAVIOR'] = old_mode
+      hist_1 = model_v1.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+      hist_2 = model_v2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+      self.assertAllClose(model_v1.get_weights(), model_v2.get_weights(),
+                          rtol=1e-5, atol=1e-5)
+      self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'],
+                          rtol=1e-5, atol=1e-5)
+
+  def testAdadeltaCompatibility(self):
+    opt_v1 = optimizers.Adadelta(lr=0.01)
+    opt_v2 = adadelta.Adadelta(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdagradCompatibility(self):
+    opt_v1 = optimizers.Adagrad(lr=0.01)
+    opt_v2 = adagrad.Adagrad(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdamCompatibility(self):
+    opt_v1 = optimizers.Adam()
+    opt_v2 = adam.Adam()
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdamaxCompatibility(self):
+    opt_v1 = optimizers.Adamax(lr=0.01)
+    opt_v2 = adamax.Adamax(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testNadamCompatibility(self):
+    opt_v1 = optimizers.Nadam(lr=0.001)
+    opt_v2 = nadam.Nadam(learning_rate=0.001)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testMomentumCompatibility(self):
+    opt_v1 = optimizers.SGD(lr=0.01, momentum=0.9)
+    opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testRMSpropCompatibility(self):
+    opt_v1 = optimizers.RMSprop()
+    opt_v2 = rmsprop.RMSprop()
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testSGDCompatibility(self):
+    opt_v1 = optimizers.SGD(lr=0.01)
+    opt_v2 = gradient_descent.SGD(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2, False)
 
   def testNumericEquivalenceForNesterovMomentum(self):
     np.random.seed(1331)
@@ -621,15 +654,6 @@ class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
 
 
-def disable_tf2():
-  if 'TF2_BEHAVIOR' in os.environ:
-    del os.environ['TF2_BEHAVIOR']
-
-
-def enable_tf2():
-  os.environ['TF2_BEHAVIOR'] = 'enabled'
-
-
 # Note: These tests are kept in a separate class to avoid bugs in some
 # distributions of Python that break AutoGraph which is used by tf.function.
 class OptimizerWithFunctionTest(test.TestCase):
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
index b52ac4524676bd5f92e56317387b501984fc1ae1..e55e6375a3ea8e89cb377f1f9ac9291c2d098142 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -17,13 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export("keras.optimizers.RMSprop", v1=[])
+@keras_export("keras.optimizers.RMSprop")
 class RMSprop(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the RMSprop algorithm.
 
@@ -90,8 +97,14 @@ class RMSprop(optimizer_v2.OptimizerV2):
         `epsilon` can each be a callable that takes no arguments and returns the
         actual value to use. This can be useful for changing these values across
         different invocations of optimizer functions. @end_compatibility
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     super(RMSprop, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)
@@ -110,77 +123,122 @@ class RMSprop(optimizer_v2.OptimizerV2):
   def _create_slots(self, var_list):
     for var in var_list:
       self.add_slot(var, "rms")
-      self.add_slot(var, "momentum")
-      if self.centered:
+    if self._momentum:
+      for var in var_list:
+        self.add_slot(var, "momentum")
+    if self.centered:
+      for var in var_list:
         self.add_slot(var, "mg")
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
-    mom = self.get_slot(var, "momentum")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self.centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          use_locking=self._use_locking)
+    if self._momentum:
+      mom = self.get_slot(var, "momentum")
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        return training_ops.resource_apply_centered_rms_prop(
+            var.handle,
+            mg.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            use_locking=self._use_locking)
+      else:
+        return training_ops.resource_apply_rms_prop(
+            var.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            use_locking=self._use_locking)
     else:
-      return training_ops.resource_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          use_locking=self._use_locking)
+      rms_t = rho * rms + (1. - rho) * math_ops.square(grad)
+      rms_t = state_ops.assign(rms, rms_t, use_locking=self._use_locking)
+      denom_t = rms_t
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        mg_t = rho * mg + (1. - rho) * grad
+        mg_t = state_ops.assign(mg, mg_t, use_locking=self._use_locking)
+        denom_t = rms_t - math_ops.square(mg_t)
+      var_t = var - lr_t * grad / (math_ops.sqrt(denom_t) + epsilon)
+      return state_ops.assign(var, var_t, use_locking=self._use_locking).op
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
-    mom = self.get_slot(var, "momentum")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self.centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_sparse_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+    if self._momentum:
+      mom = self.get_slot(var, "momentum")
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        return training_ops.resource_sparse_apply_centered_rms_prop(
+            var.handle,
+            mg.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            indices,
+            use_locking=self._use_locking)
+      else:
+        return training_ops.resource_sparse_apply_rms_prop(
+            var.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            indices,
+            use_locking=self._use_locking)
     else:
-      return training_ops.resource_sparse_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+      rms_scaled_g_values = (grad * grad) * (1. - rho)
+      rms_t = state_ops.assign(rms, rms * rho, use_locking=self._use_locking)
+      with ops.control_dependencies([rms_t]):
+        rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values)
+        rms_slice = array_ops.gather(rms_t, indices)
+      denom_slice = rms_slice
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        mg_scaled_g_values = grad * (1. - rho)
+        mg_t = state_ops.assign(mg, mg * rho, use_locking=self._use_locking)
+        with ops.control_dependencies([mg_t]):
+          mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values)
+          mg_slice = array_ops.gather(mg_t, indices)
+          denom_slice = rms_slice - math_ops.square(mg_slice)
+      var_update = self._resource_scatter_add(
+          var, indices, -lr_t * grad / (math_ops.sqrt(denom_slice) + epsilon))
+      if self.centered:
+        return control_flow_ops.group(*[var_update, rms_t, mg_t])
+      return control_flow_ops.group(*[var_update, rms_t])
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(RMSprop, self).set_weights(weights)
 
   def get_config(self):
     config = super(RMSprop, self).get_config()
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
index 4d61cfbbc52789db172445f9286fdb848c0a7bc6..a9ddc2155a63e4030e56104293fc1a92b11de5d1 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -58,14 +58,18 @@ class RMSpropOptimizerTest(test.TestCase):
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
                             epsilon, centered):
     rms_t = rms * rho + (1 - rho) * g * g
-    denom_t = rms_t + epsilon
     if centered:
       mg_t = mg * rho + (1 - rho) * g
-      denom_t -= mg_t * mg_t
+      denom_t = rms_t - mg_t * mg_t
     else:
       mg_t = mg
-    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
-    var_t = var - mom_t
+      denom_t = rms_t
+    if momentum > 0.:
+      mom_t = momentum * mom + lr * g / (np.sqrt(denom_t + epsilon))
+      var_t = var - mom_t
+    else:
+      mom_t = mom
+      var_t = var - lr * g / (np.sqrt(denom_t) + epsilon)
     return var_t, mg_t, rms_t, mom_t
 
   def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
@@ -78,12 +82,18 @@ class RMSpropOptimizerTest(test.TestCase):
       gindex = gindexs[i]
       gvalue = gvalues[i]
       rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
-      denom_t = rms_t[gindex] + epsilon
       if centered:
         mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
-        denom_t -= mg_t[gindex] * mg_t[gindex]
-      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
-      var_t[gindex] = var[gindex] - mom_t[gindex]
+        denom_t = rms_t[gindex] - mg_t[gindex] * mg_t[gindex]
+      else:
+        denom_t = rms_t[gindex]
+      if momentum > 0.:
+        mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t +
+                                                                       epsilon)
+        var_t[gindex] = var[gindex] - mom_t[gindex]
+      else:
+        mom_t[gindex] = mom[gindex]
+        var_t[gindex] = var[gindex] - lr * gvalue / (np.sqrt(denom_t) + epsilon)
     return var_t, mg_t, rms_t, mom_t
 
   @test_util.run_deprecated_v1
@@ -117,14 +127,17 @@ class RMSpropOptimizerTest(test.TestCase):
           mg0 = None
           mg1 = None
 
+        if momentum > 0.:
+          mom0 = opt.get_slot(var0, "momentum")
+          mom1 = opt.get_slot(var1, "momentum")
+        else:
+          mom0 = None
+          mom1 = None
+
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
         rms1 = opt.get_slot(var1, "rms")
         self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -137,8 +150,8 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        # Run 4 steps of RMSprop
-        for _ in range(1, 5):
+        # Run 3 steps of RMSprop
+        for _ in range(1, 4):
           self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
@@ -152,10 +165,11 @@ class RMSpropOptimizerTest(test.TestCase):
           if centered:
             self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
             self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          if momentum > 0.:
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
           self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -191,10 +205,12 @@ class RMSpropOptimizerTest(test.TestCase):
     self.assertTrue(rms0 is not None)
     rms1 = opt.get_slot(var1, "rms")
     self.assertTrue(rms1 is not None)
-    mom0 = opt.get_slot(var0, "momentum")
-    self.assertTrue(mom0 is not None)
-    mom1 = opt.get_slot(var1, "momentum")
-    self.assertTrue(mom1 is not None)
+    if momentum > 0.:
+      mom0 = opt.get_slot(var0, "momentum")
+      mom1 = opt.get_slot(var1, "momentum")
+    else:
+      mom0 = None
+      mom1 = None
 
     mg0_np = np.array([0.0, 0.0])
     mg1_np = np.array([0.0, 0.0])
@@ -222,8 +238,9 @@ class RMSpropOptimizerTest(test.TestCase):
       # Validate updated params
       self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
       self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-      self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-      self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      if momentum > 0.:
+        self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+        self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
       self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
       self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -325,10 +342,12 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertTrue(rms0 is not None)
         rms1 = opt.get_slot(var1, "rms")
         self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
+        if momentum > 0.:
+          mom0 = opt.get_slot(var0, "momentum")
+          mom1 = opt.get_slot(var1, "momentum")
+        else:
+          mom0 = None
+          mom1 = None
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -341,8 +360,8 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        # Run 4 steps of RMSprop
-        for _ in range(1, 5):
+        # Run 3 steps of RMSprop
+        for _ in range(1, 4):
           self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
@@ -358,8 +377,9 @@ class RMSpropOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
           self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
           self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          if momentum > 0.:
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -414,11 +434,51 @@ class RMSpropOptimizerTest(test.TestCase):
 
   def testConstructRMSpropWithLR(self):
     opt = rmsprop.RMSprop(lr=1.0)
-    self.assertEqual(opt.lr, 1.0)
     opt_2 = rmsprop.RMSprop(learning_rate=0.1, lr=1.0)
-    self.assertEqual(opt_2.lr, 1.0)
     opt_3 = rmsprop.RMSprop(learning_rate=0.1)
-    self.assertEqual(opt_3.lr, 0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = variables.Variable(1.)
+      v2 = variables.Variable(1.)
+
+      opt = rmsprop.RMSprop(1., momentum=0., centered=False)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and one unique slot variable for v1 and v2.
+      self.assertEqual(3, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+      opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=False)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+      opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=True)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and three unique slot variables for v1 and v2
+      self.assertEqual(7, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+  def testConstructRMSpropWithEpsilonValues(self):
+    opt = rmsprop.RMSprop(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = rmsprop.RMSprop(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index a8544199fd14e3083f072caeb13750e17690dc21..b704b885cb967997a7a8735b31f08a1537cf4a1c 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
-from tensorflow.python import tf2
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
@@ -45,7 +44,6 @@ from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export(v1=['keras.optimizers.Optimizer'])
 class Optimizer(object):
   """Abstract optimizer base class.
 
@@ -159,7 +157,6 @@ class Optimizer(object):
     return cls(**config)
 
 
-@keras_export(v1=['keras.optimizers.SGD'])
 class SGD(Optimizer):
   """Stochastic gradient descent optimizer.
 
@@ -224,7 +221,6 @@ class SGD(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export(v1=['keras.optimizers.RMSprop'])
 class RMSprop(Optimizer):
   """RMSProp optimizer.
 
@@ -291,7 +287,6 @@ class RMSprop(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export(v1=['keras.optimizers.Adagrad'])
 class Adagrad(Optimizer):
   """Adagrad optimizer.
 
@@ -358,7 +353,6 @@ class Adagrad(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export(v1=['keras.optimizers.Adadelta'])
 class Adadelta(Optimizer):
   """Adadelta optimizer.
 
@@ -442,7 +436,6 @@ class Adadelta(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export(v1=['keras.optimizers.Adam'])
 class Adam(Optimizer):
   """Adam optimizer.
 
@@ -539,7 +532,6 @@ class Adam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export(v1=['keras.optimizers.Adamax'])
 class Adamax(Optimizer):
   """Adamax optimizer from Adam paper's Section 7.
 
@@ -575,7 +567,7 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
     lr = self.lr
     if self.initial_decay > 0:
@@ -583,7 +575,8 @@ class Adamax(Optimizer):
           1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                 K.dtype(self.decay))))
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
     lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
@@ -622,7 +615,6 @@ class Adamax(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export('keras.optimizers.Nadam')
 class Nadam(Optimizer):
   """Nesterov Adam optimizer.
 
@@ -661,9 +653,10 @@ class Nadam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
@@ -680,7 +673,7 @@ class Nadam(Optimizer):
     ms = [K.zeros(shape) for shape in shapes]
     vs = [K.zeros(shape) for shape in shapes]
 
-    self.weights = [self.iterations] + ms + vs
+    self.weights = [self.iterations, self.m_schedule] + ms + vs
 
     for p, g, m, v in zip(params, grads, ms, vs):
       # the following equations given in [1]
@@ -717,7 +710,7 @@ class Nadam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class TFOptimizer(Optimizer, checkpointable.CheckpointableBase):
+class TFOptimizer(Optimizer, checkpointable.Checkpointable):
   """Wrapper class for native TensorFlow optimizers.
   """
 
@@ -738,7 +731,7 @@ class TFOptimizer(Optimizer, checkpointable.CheckpointableBase):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribution_strategy_context.has_strategy():
       self.updates = []
 
       if not params:
@@ -806,27 +799,15 @@ def deserialize(config, custom_objects=None):
   Returns:
       A Keras Optimizer instance.
   """
-  if tf2.enabled():
-    all_classes = {
-        'adadelta': adadelta_v2.Adadelta,
-        'adagrad': adagrad_v2.Adagrad,
-        'adam': adam_v2.Adam,
-        'adamax': adamax_v2.Adamax,
-        'nadam': nadam_v2.Nadam,
-        'rmsprop': rmsprop_v2.RMSprop,
-        'sgd': gradient_descent_v2.SGD
-    }
-  else:
-    all_classes = {
-        'adadelta': Adadelta,
-        'adagrad': Adagrad,
-        'adam': Adam,
-        'adamax': Adamax,
-        'nadam': Nadam,
-        'rmsprop': RMSprop,
-        'sgd': SGD,
-        'tfoptimizer': TFOptimizer
-    }
+  all_classes = {
+      'adadelta': adadelta_v2.Adadelta,
+      'adagrad': adagrad_v2.Adagrad,
+      'adam': adam_v2.Adam,
+      'adamax': adamax_v2.Adamax,
+      'nadam': nadam_v2.Nadam,
+      'rmsprop': rmsprop_v2.RMSprop,
+      'sgd': gradient_descent_v2.SGD
+  }
 
   # Make deserialization case-insensitive for built-in optimizers.
   if config['class_name'].lower() in all_classes:
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 18a20567ce9db90725a1cb05c34ae6baeacbcd7c..606e711483137e3e7eb5336029edcb52b3cfe916 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -44,116 +44,119 @@ def _get_model(input_dim, num_hidden, output_dim):
   return model
 
 
-def _test_optimizer(optimizer, target=0.75):
-  np.random.seed(1337)
-  (x_train, y_train), _ = testing_utils.get_test_data(train_samples=1000,
-                                                      test_samples=200,
-                                                      input_shape=(10,),
-                                                      num_classes=2)
-  y_train = keras.utils.to_categorical(y_train)
-  model = _get_model(x_train.shape[1], 20, y_train.shape[1])
-  model.compile(loss='categorical_crossentropy',
-                optimizer=optimizer,
-                metrics=['accuracy'])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          0)
-  history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          126)  # 63 steps per epoch
-  assert history.history['acc'][-1] >= target
-  config = keras.optimizers.serialize(optimizer)
-  optim = keras.optimizers.deserialize(config)
-  new_config = keras.optimizers.serialize(optim)
-  new_config['class_name'] = new_config['class_name'].lower()
-  new_config['config'].pop('name', None)
-  if 'amsgrad' not in config['config']:
-    new_config['config'].pop('amsgrad', None)
-  if 'decay' in new_config['config'] and 'schedule_decay' in config['config']:
-    new_config['config']['schedule_decay'] = new_config['config'].pop('decay')
-  if 'momentum' not in config['config']:
-    new_config['config'].pop('momentum', None)
-  if 'centered' not in config['config']:
-    new_config['config'].pop('centered', None)
-  assert config == new_config
-
-  # Test constraints.
-  model = keras.models.Sequential()
-  dense = keras.layers.Dense(10,
-                             input_shape=(x_train.shape[1],),
-                             kernel_constraint=lambda x: 0. * x + 1.,
-                             bias_constraint=lambda x: 0. * x + 2.,
-                             activation='relu')
-  model.add(dense)
-  model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
-  model.compile(loss='categorical_crossentropy',
-                optimizer=optimizer,
-                metrics=['accuracy'])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          126)  # Using same optimizer from before
-  model.train_on_batch(x_train[:10], y_train[:10])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          127)
-  kernel, bias = dense.get_weights()
-  np.testing.assert_allclose(kernel, 1., atol=1e-3)
-  np.testing.assert_allclose(bias, 2., atol=1e-3)
-
-
 class KerasOptimizersTest(test.TestCase):
 
+  def _test_optimizer(self, optimizer, target=0.75):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=1000, test_samples=200, input_shape=(10,), num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+    model = _get_model(x_train.shape[1], 20, y_train.shape[1])
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=optimizer,
+        metrics=['accuracy'])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations), 0)
+    history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations),
+        126)  # 63 steps per epoch
+    self.assertGreaterEqual(history.history['acc'][-1], target)
+    config = keras.optimizers.serialize(optimizer)
+    optim = keras.optimizers.deserialize(config)
+    new_config = keras.optimizers.serialize(optim)
+    new_config['class_name'] = new_config['class_name'].lower()
+    new_config['config'].pop('name', None)
+    if 'amsgrad' not in config['config']:
+      new_config['config'].pop('amsgrad', None)
+    if 'decay' in new_config['config'] and 'schedule_decay' in config['config']:
+      new_config['config']['schedule_decay'] = new_config['config'].pop('decay')
+    if 'momentum' not in config['config']:
+      new_config['config'].pop('momentum', None)
+    if 'centered' not in config['config']:
+      new_config['config'].pop('centered', None)
+    self.assertDictEqual(config, new_config)
+
+    # Test constraints.
+    model = keras.models.Sequential()
+    dense = keras.layers.Dense(
+        10,
+        input_shape=(x_train.shape[1],),
+        kernel_constraint=lambda x: 0. * x + 1.,
+        bias_constraint=lambda x: 0. * x + 2.,
+        activation='relu')
+    model.add(dense)
+    model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=optimizer,
+        metrics=['accuracy'])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations),
+        126)  # Using same optimizer from before
+    model.train_on_batch(x_train[:10], y_train[:10])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations), 127)
+    kernel, bias = dense.get_weights()
+    np.testing.assert_allclose(kernel, 1., atol=1e-3)
+    np.testing.assert_allclose(bias, 2., atol=1e-3)
+
   def test_sgd(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           nesterov=True))
+      self._test_optimizer(keras.optimizers.SGD())
+
+  def test_momentum(self):
+    with self.cached_session():
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True))
 
   def test_rmsprop(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.RMSprop())
-      _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
+      self._test_optimizer(keras.optimizers.RMSprop())
+      self._test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
 
   def test_adagrad(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adagrad())
-      _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
+      self._test_optimizer(keras.optimizers.Adagrad())
+      self._test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
 
   def test_adadelta(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
+      self._test_optimizer(keras.optimizers.Adadelta(), target=0.6)
       # Accuracy seems dependent on the initialization. Even adding tf.Print
       # nodes in the graph seemed to affect the initialization seed, and hence
       # the accuracy.
-      _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
+      self._test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
   def test_adam(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adam())
+      self._test_optimizer(keras.optimizers.Adam())
       # Accuracy seems dependent on the seed initialization.
       # TODO(b/121051441): fix test flakiness.
-      _test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
-      _test_optimizer(keras.optimizers.Adam(amsgrad=True))
+      self._test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
+      self._test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
   def test_adamax(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adamax())
-      _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
+      self._test_optimizer(keras.optimizers.Adamax())
+      self._test_optimizer(keras.optimizers.Adamax(decay=1e-3))
 
   def test_nadam(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Nadam())
+      self._test_optimizer(keras.optimizers.Nadam())
 
   def test_clipnorm(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           clipnorm=0.5))
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
 
   def test_clipvalue(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           clipvalue=0.5))
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
 
-  def test_tfoptimizer(self):
+  def test_tf_optimizer(self):
     optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(
@@ -187,8 +190,7 @@ class KerasOptimizersTest(test.TestCase):
     self.assertIs(graph_weak(), None)
     self.assertIs(optimizer_weak(), None)
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_tfoptimizer_iterations(self):
+  def test_tf_optimizer_iterations(self):
     with self.cached_session():
       optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
       model = keras.models.Sequential()
diff --git a/tensorflow/python/keras/saving/BUILD b/tensorflow/python/keras/saving/BUILD
deleted file mode 100644
index 1ab7aca58eba4fe12b507b2cc53fd1892f62d6ee..0000000000000000000000000000000000000000
--- a/tensorflow/python/keras/saving/BUILD
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# Description:
-#   Keras saving and loading libraries.
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "saving",
-    srcs = ["__init__.py"],
-    deps = [":saved_model"],
-)
-
-py_library(
-    name = "saved_model",
-    srcs = ["saved_model.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:mode_keys",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:saver",
-        "//tensorflow/python:util",
-        "//tensorflow/python/keras:engine",
-        "//tensorflow/python/saved_model",
-        "//tensorflow/python/saved_model/model_utils",
-    ],
-)
-
-py_test(
-    name = "saved_model_test",
-    size = "medium",
-    srcs = ["saved_model_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # TODO(b/119349471): Re-enable
-        "no_windows",
-    ],
-    deps = [
-        ":saved_model",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:mode_keys",
-        "//tensorflow/python/keras",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
diff --git a/tensorflow/python/keras/saving/__init__.py b/tensorflow/python/keras/saving/__init__.py
index 8ff9f3b74e8ff253506cde18e60a01bbc9fac3ff..bb4db681248e8f25672cacd2d80dc65ea43a4113 100644
--- a/tensorflow/python/keras/saving/__init__.py
+++ b/tensorflow/python/keras/saving/__init__.py
@@ -12,10 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utils for saving a Keras Model or Estimator to the SavedModel format."""
+"""Utils for saving and loading Keras Models."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.saving.hdf5_format import load_attributes_from_hdf5_group
+from tensorflow.python.keras.saving.hdf5_format import load_model
+from tensorflow.python.keras.saving.hdf5_format import load_weights_from_hdf5_group
+from tensorflow.python.keras.saving.hdf5_format import load_weights_from_hdf5_group_by_name
+from tensorflow.python.keras.saving.hdf5_format import preprocess_weights_for_loading
+from tensorflow.python.keras.saving.hdf5_format import save_attributes_to_hdf5_group
+from tensorflow.python.keras.saving.hdf5_format import save_model
+from tensorflow.python.keras.saving.hdf5_format import save_weights_to_hdf5_group
+from tensorflow.python.keras.saving.model_config import model_from_config
+from tensorflow.python.keras.saving.model_config import model_from_json
+from tensorflow.python.keras.saving.model_config import model_from_yaml
 from tensorflow.python.keras.saving.saved_model import export
 from tensorflow.python.keras.saving.saved_model import load_from_saved_model
+from tensorflow.python.keras.saving.saving_utils import trace_model_call
+
+
diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..973cdcac8a079bb7c19f4e004a6315e2da94ff86
--- /dev/null
+++ b/tensorflow/python/keras/saving/hdf5_format.py
@@ -0,0 +1,911 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Functions for saving and loading a Keras Model from HDF5 format.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.saving import model_config as model_config_lib
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import serialization
+from tensorflow.python.util.tf_export import keras_export
+
+# pylint: disable=g-import-not-at-top
+try:
+  import h5py
+  HDF5_OBJECT_HEADER_LIMIT = 64512
+except ImportError:
+  h5py = None
+# pylint: enable=g-import-not-at-top
+
+
+@keras_export('keras.models.save_model')
+def save_model(model, filepath, overwrite=True, include_optimizer=True):
+  """Saves a model to a HDF5 file.
+
+  The saved model contains:
+      - the model's configuration (topology)
+      - the model's weights
+      - the model's optimizer's state (if any)
+
+  Thus the saved model can be reinstantiated in
+  the exact same state, without any of the code
+  used for model definition or training.
+
+  Arguments:
+      model: Keras model instance to be saved.
+      filepath: One of the following:
+          - String, path where to save the model
+          - `h5py.File` object where to save the model
+      overwrite: Whether we should overwrite any existing
+          model at the target location, or instead
+          ask the user with a manual prompt.
+      include_optimizer: If True, save optimizer's state together.
+
+  Raises:
+      ImportError: if h5py is not available.
+  """
+
+  if h5py is None:
+    raise ImportError('`save_model` requires h5py.')
+
+  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  # TODO(psv) Add warning when we save models that contain non-serializable
+  # entities like metrics added using `add_metric` and losses added using
+  # `add_loss.`
+
+  if not isinstance(filepath, h5py.File):
+    # If file exists and should not be overwritten.
+    if not overwrite and os.path.isfile(filepath):
+      proceed = ask_to_proceed_with_overwrite(filepath)
+      if not proceed:
+        return
+
+    f = h5py.File(filepath, mode='w')
+    opened_new_file = True
+  else:
+    f = filepath
+    opened_new_file = False
+
+  try:
+    f.attrs['keras_version'] = str(keras_version).encode('utf8')
+    f.attrs['backend'] = K.backend().encode('utf8')
+    f.attrs['model_config'] = json.dumps(
+        {
+            'class_name': model.__class__.__name__,
+            'config': model.get_config()
+        },
+        default=serialization.get_json_type).encode('utf8')
+
+    model_weights_group = f.create_group('model_weights')
+    model_layers = model.layers
+    save_weights_to_hdf5_group(model_weights_group, model_layers)
+
+    if include_optimizer and model.optimizer:
+      if isinstance(model.optimizer, optimizers.TFOptimizer):
+        logging.warning(
+            'TensorFlow optimizers do not '
+            'make it possible to access '
+            'optimizer attributes or optimizer state '
+            'after instantiation. '
+            'As a result, we cannot save the optimizer '
+            'as part of the model save file. '
+            'You will have to compile your model again after loading it. '
+            'Prefer using a Keras optimizer instead '
+            '(see keras.io/optimizers).')
+      else:
+        f.attrs['training_config'] = json.dumps(
+            {
+                'optimizer_config': {
+                    'class_name': model.optimizer.__class__.__name__,
+                    'config': model.optimizer.get_config()
+                },
+                'loss': model.loss,
+                'metrics': model._compile_metrics,
+                'weighted_metrics': model._compile_weighted_metrics,
+                'sample_weight_mode': model.sample_weight_mode,
+                'loss_weights': model.loss_weights,
+            },
+            default=serialization.get_json_type).encode('utf8')
+
+        # Save optimizer weights.
+        save_optimizer_weights_to_hdf5_group(f, model.optimizer)
+    f.flush()
+  finally:
+    if opened_new_file:
+      f.close()
+
+
+@keras_export('keras.models.load_model')
+def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
+  """Loads a model saved via `save_model`.
+
+  Arguments:
+      filepath: One of the following:
+          - String, path to the saved model
+          - `h5py.File` object from which to load the model
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+      compile: Boolean, whether to compile the model
+          after loading.
+
+  Returns:
+      A Keras model instance. If an optimizer was found
+      as part of the saved model, the model is already
+      compiled. Otherwise, the model is uncompiled and
+      a warning will be displayed. When `compile` is set
+      to False, the compilation is omitted without any
+      warning.
+
+  Raises:
+      ImportError: if h5py is not available.
+      ValueError: In case of an invalid savefile.
+  """
+  if h5py is None:
+    raise ImportError('`load_model` requires h5py.')
+
+  if not custom_objects:
+    custom_objects = {}
+
+  def convert_custom_objects(obj):
+    """Handles custom object lookup.
+
+    Arguments:
+        obj: object, dict, or list.
+
+    Returns:
+        The same structure, where occurrences
+            of a custom object name have been replaced
+            with the custom object.
+    """
+    if isinstance(obj, list):
+      deserialized = []
+      for value in obj:
+        deserialized.append(convert_custom_objects(value))
+      return deserialized
+    if isinstance(obj, dict):
+      deserialized = {}
+      for key, value in obj.items():
+        deserialized[key] = convert_custom_objects(value)
+      return deserialized
+    if obj in custom_objects:
+      return custom_objects[obj]
+    return obj
+
+  opened_new_file = not isinstance(filepath, h5py.File)
+  if opened_new_file:
+    f = h5py.File(filepath, mode='r')
+  else:
+    f = filepath
+
+  model = None
+  try:
+    # instantiate model
+    model_config = f.attrs.get('model_config')
+    if model_config is None:
+      raise ValueError('No model found in config file.')
+    model_config = json.loads(model_config.decode('utf-8'))
+    model = model_config_lib.model_from_config(model_config,
+                                               custom_objects=custom_objects)
+
+    # set weights
+    load_weights_from_hdf5_group(f['model_weights'], model.layers)
+
+    if compile:
+      # instantiate optimizer
+      training_config = f.attrs.get('training_config')
+      if training_config is None:
+        logging.warning('No training configuration found in save file: '
+                        'the model was *not* compiled. Compile it manually.')
+        return model
+      training_config = json.loads(training_config.decode('utf-8'))
+      optimizer_config = training_config['optimizer_config']
+      optimizer = optimizers.deserialize(
+          optimizer_config, custom_objects=custom_objects)
+
+      # Recover loss functions and metrics.
+      loss = convert_custom_objects(training_config['loss'])
+      metrics = convert_custom_objects(training_config['metrics'])
+      weighted_metrics = convert_custom_objects(
+          training_config.get('weighted_metrics', None))
+      sample_weight_mode = training_config['sample_weight_mode']
+      loss_weights = training_config['loss_weights']
+
+      # Compile model.
+      model.compile(
+          optimizer=optimizer,
+          loss=loss,
+          metrics=metrics,
+          weighted_metrics=weighted_metrics,
+          loss_weights=loss_weights,
+          sample_weight_mode=sample_weight_mode)
+
+      # Set optimizer weights.
+      if 'optimizer_weights' in f:
+        # Build train function (to get weight updates).
+        # Models that aren't graph networks must wait until they are called
+        # with data to _make_train_function() and so can't load optimizer
+        # weights.
+        if model._is_graph_network:  # pylint: disable=protected-access
+          model._make_train_function()
+          optimizer_weight_values = load_optimizer_weights_from_hdf5_group(f)
+          try:
+            model.optimizer.set_weights(optimizer_weight_values)
+          except ValueError:
+            logging.warning('Error in loading the saved optimizer '
+                            'state. As a result, your model is '
+                            'starting with a freshly initialized '
+                            'optimizer.')
+        else:
+          logging.warning('Sequential models without an `input_shape` '
+                          'passed to the first layer cannot reload their '
+                          'optimizer state. As a result, your model is'
+                          'starting with a freshly initialized optimizer.')
+
+  finally:
+    if opened_new_file:
+      f.close()
+  return model
+
+
+def preprocess_weights_for_loading(layer,
+                                   weights,
+                                   original_keras_version=None,
+                                   original_backend=None):
+  """Preprocess layer weights between different Keras formats.
+
+  Converts layers weights from Keras 1 format to Keras 2 and also weights of
+  CuDNN layers in Keras 2.
+
+  Arguments:
+      layer: Layer instance.
+      weights: List of weights values (Numpy arrays).
+      original_keras_version: Keras version for the weights, as a string.
+      original_backend: Keras backend the weights were trained with,
+          as a string.
+
+  Returns:
+      A list of weights values (Numpy arrays).
+  """
+  def convert_nested_bidirectional(weights):
+    """Converts layers nested in `Bidirectional` wrapper.
+
+    This function uses `preprocess_weights_for_loading()` for converting
+    layers.
+
+    Arguments:
+        weights: List of weights values (Numpy arrays).
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+    num_weights_per_layer = len(weights) // 2
+    forward_weights = preprocess_weights_for_loading(
+        layer.forward_layer, weights[:num_weights_per_layer],
+        original_keras_version, original_backend)
+    backward_weights = preprocess_weights_for_loading(
+        layer.backward_layer, weights[num_weights_per_layer:],
+        original_keras_version, original_backend)
+    return forward_weights + backward_weights
+
+  def convert_nested_time_distributed(weights):
+    """Converts layers nested in `TimeDistributed` wrapper.
+
+    This function uses `preprocess_weights_for_loading()` for converting nested
+    layers.
+
+    Arguments:
+        weights: List of weights values (Numpy arrays).
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+    return preprocess_weights_for_loading(
+        layer.layer, weights, original_keras_version, original_backend)
+
+  def convert_nested_model(weights):
+    """Converts layers nested in `Model` or `Sequential`.
+
+    This function uses `preprocess_weights_for_loading()` for converting nested
+    layers.
+
+    Arguments:
+        weights: List of weights values (Numpy arrays).
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+    new_weights = []
+    # trainable weights
+    for sublayer in layer.layers:
+      num_weights = len(sublayer.trainable_weights)
+      if num_weights > 0:
+        new_weights.extend(preprocess_weights_for_loading(
+            layer=sublayer,
+            weights=weights[:num_weights],
+            original_keras_version=original_keras_version,
+            original_backend=original_backend))
+        weights = weights[num_weights:]
+
+    # non-trainable weights
+    for sublayer in layer.layers:
+      num_weights = len([l for l in sublayer.weights
+                         if l not in sublayer.trainable_weights])
+      if num_weights > 0:
+        new_weights.extend(preprocess_weights_for_loading(
+            layer=sublayer,
+            weights=weights[:num_weights],
+            original_keras_version=original_keras_version,
+            original_backend=original_backend))
+        weights = weights[num_weights:]
+    return new_weights
+
+  # Convert layers nested in Bidirectional/Model/Sequential.
+  # Both transformation should be ran for both Keras 1->2 conversion
+  # and for conversion of CuDNN layers.
+  if layer.__class__.__name__ == 'Bidirectional':
+    weights = convert_nested_bidirectional(weights)
+  if layer.__class__.__name__ == 'TimeDistributed':
+    weights = convert_nested_time_distributed(weights)
+  elif layer.__class__.__name__ in ['Model', 'Sequential']:
+    weights = convert_nested_model(weights)
+
+  if original_keras_version == '1':
+    if layer.__class__.__name__ == 'TimeDistributed':
+      weights = preprocess_weights_for_loading(
+          layer.layer, weights, original_keras_version, original_backend)
+
+    if layer.__class__.__name__ == 'Conv1D':
+      shape = weights[0].shape
+      # Handle Keras 1.1 format
+      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
+        # Legacy shape:
+        # (filters, input_dim, filter_length, 1)
+        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
+                                                           1)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+      weights[0] = weights[0][:, 0, :, :]
+
+    if layer.__class__.__name__ == 'Conv2D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+
+    if layer.__class__.__name__ == 'Conv2DTranspose':
+      if layer.data_format == 'channels_last':
+        # old: (kernel_rows, kernel_cols, stack_size, filters)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
+
+    if layer.__class__.__name__ == 'Conv3D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, ...)
+        # new: (..., stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
+
+    if layer.__class__.__name__ == 'GRU':
+      if len(weights) == 9:
+        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[4], weights[7]], axis=-1)
+        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'LSTM':
+      if len(weights) == 12:
+        # old: i, c, f, o
+        # new: i, f, c, o
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'ConvLSTM2D':
+      if len(weights) == 12:
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        if layer.data_format == 'channels_first':
+          # old: (filters, stack_size, kernel_rows, kernel_cols)
+          # new: (kernel_rows, kernel_cols, stack_size, filters)
+          kernel = np.transpose(kernel, (2, 3, 1, 0))
+          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
+        weights = [kernel, recurrent_kernel, bias]
+
+  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
+  if layer.__class__.__name__ in conv_layers:
+    if original_backend == 'theano':
+      weights[0] = conv_utils.convert_kernel(weights[0])
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = conv_utils.convert_kernel(weights[1])
+    if K.int_shape(layer.weights[0]) != weights[0].shape:
+      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
+
+  # convert CuDNN layers
+  return _convert_rnn_weights(layer, weights)
+
+
+def _convert_rnn_weights(layer, weights):
+  """Converts weights for RNN layers between native and CuDNN format.
+
+  Input kernels for each gate are transposed and converted between Fortran
+  and C layout, recurrent kernels are transposed. For LSTM biases are summed/
+  split in half, for GRU biases are reshaped.
+
+  Weights can be converted in both directions between `LSTM` and`CuDNNSLTM`
+  and between `CuDNNGRU` and `GRU(reset_after=True)`. Default `GRU` is not
+  compatible with `CuDNNGRU`.
+
+  For missing biases in `LSTM`/`GRU` (`use_bias=False`) no conversion is made.
+
+  Arguments:
+      layer: Target layer instance.
+      weights: List of source weights values (input kernels, recurrent
+          kernels, [biases]) (Numpy arrays).
+
+  Returns:
+      A list of converted weights values (Numpy arrays).
+
+  Raises:
+      ValueError: for incompatible GRU layer/weights or incompatible biases
+  """
+
+  def transform_kernels(kernels, func, n_gates):
+    """Transforms kernel for each gate separately using given function.
+
+    Arguments:
+        kernels: Stacked array of kernels for individual gates.
+        func: Function applied to kernel of each gate.
+        n_gates: Number of gates (4 for LSTM, 3 for GRU).
+
+    Returns:
+        Stacked array of transformed kernels.
+    """
+    return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)])
+
+  def transpose_input(from_cudnn):
+    """Makes a function that transforms input kernels from/to CuDNN format.
+
+    It keeps the shape, but changes between the layout (Fortran/C). Eg.:
+
+    ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+    ```
+
+    It can be passed to `transform_kernels()`.
+
+    Arguments:
+        from_cudnn: `True` if source weights are in CuDNN format, `False`
+            if they're in plain Keras format.
+
+    Returns:
+        Function that converts input kernel to the other format.
+    """
+    order = 'F' if from_cudnn else 'C'
+
+    def transform(kernel):
+      return kernel.T.reshape(kernel.shape, order=order)
+
+    return transform
+
+  target_class = layer.__class__.__name__
+
+  # convert the weights between CuDNNLSTM and LSTM
+  if target_class in ['LSTM', 'CuDNNLSTM'] and len(weights) == 3:
+    # determine if we're loading a CuDNNLSTM layer
+    # from the number of bias weights:
+    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
+    # if there's no bias weight in the file, skip this conversion
+    units = weights[1].shape[0]
+    bias_shape = weights[2].shape
+    n_gates = 4
+
+    if bias_shape == (2 * units * n_gates,):
+      source = 'CuDNNLSTM'
+    elif bias_shape == (units * n_gates,):
+      source = 'LSTM'
+    else:
+      raise ValueError('Invalid bias shape: ' + str(bias_shape))
+
+    def convert_lstm_weights(weights, from_cudnn=True):
+      """Converts the weights between CuDNNLSTM and LSTM.
+
+      Arguments:
+        weights: Original weights.
+        from_cudnn: Indicates whether original weights are from CuDNN layer.
+
+      Returns:
+        Updated weights compatible with LSTM.
+      """
+
+      # Transpose (and reshape) input and recurrent kernels
+      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
+                                  n_gates)
+      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
+      if from_cudnn:
+        # merge input and recurrent biases into a single set
+        biases = np.sum(np.split(weights[2], 2, axis=0), axis=0)
+      else:
+        # Split single set of biases evenly to two sets. The way of
+        # splitting doesn't matter as long as the two sets sum is kept.
+        biases = np.tile(0.5 * weights[2], 2)
+      return [kernels, recurrent_kernels, biases]
+
+    if source != target_class:
+      weights = convert_lstm_weights(weights, from_cudnn=source == 'CuDNNLSTM')
+
+  # convert the weights between CuDNNGRU and GRU(reset_after=True)
+  if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3:
+    # We can determine the source of the weights from the shape of the bias.
+    # If there is no bias we skip the conversion since
+    # CuDNNGRU always has biases.
+
+    units = weights[1].shape[0]
+    bias_shape = weights[2].shape
+    n_gates = 3
+
+    def convert_gru_weights(weights, from_cudnn=True):
+      """Converts the weights between CuDNNGRU and GRU.
+
+      Arguments:
+        weights: Original weights.
+        from_cudnn: Indicates whether original weights are from CuDNN layer.
+
+      Returns:
+        Updated weights compatible with GRU.
+      """
+
+      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
+                                  n_gates)
+      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
+      biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1)
+      return [kernels, recurrent_kernels, biases]
+
+    if bias_shape == (2 * units * n_gates,):
+      source = 'CuDNNGRU'
+    elif bias_shape == (2, units * n_gates):
+      source = 'GRU(reset_after=True)'
+    elif bias_shape == (units * n_gates,):
+      source = 'GRU(reset_after=False)'
+    else:
+      raise ValueError('Invalid bias shape: ' + str(bias_shape))
+
+    if target_class == 'CuDNNGRU':
+      target = 'CuDNNGRU'
+    elif layer.reset_after:
+      target = 'GRU(reset_after=True)'
+    else:
+      target = 'GRU(reset_after=False)'
+
+    # only convert between different types
+    if source != target:
+      types = (source, target)
+      if 'GRU(reset_after=False)' in types:
+        raise ValueError('%s is not compatible with %s' % types)
+      if source == 'CuDNNGRU':
+        weights = convert_gru_weights(weights, from_cudnn=True)
+      elif source == 'GRU(reset_after=True)':
+        weights = convert_gru_weights(weights, from_cudnn=False)
+
+  return weights
+
+
+def save_optimizer_weights_to_hdf5_group(hdf5_group, optimizer):
+  """Saves optimizer weights of a optimizer to a HDF5 group.
+
+  Arguments:
+      hdf5_group: HDF5 group.
+      optimizer: optimizer instance.
+  """
+
+  symbolic_weights = getattr(optimizer, 'weights')
+  if symbolic_weights:
+    weights_group = hdf5_group.create_group('optimizer_weights')
+    weight_names = [str(w.name).encode('utf8') for w in symbolic_weights]
+    save_attributes_to_hdf5_group(weights_group, 'weight_names', weight_names)
+    weight_values = K.batch_get_value(symbolic_weights)
+    for name, val in zip(weight_names, weight_values):
+      param_dset = weights_group.create_dataset(
+          name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def load_optimizer_weights_from_hdf5_group(hdf5_group):
+  """Load optimizer weights from a HDF5 group.
+
+  Arguments:
+      hdf5_group: A pointer to a HDF5 group.
+
+  Returns:
+      data: List of optimizer weight names.
+  """
+  weights_group = hdf5_group['optimizer_weights']
+  optimizer_weight_names = load_attributes_from_hdf5_group(
+      weights_group, 'weight_names')
+  return [weights_group[weight_name] for weight_name in optimizer_weight_names]
+
+
+def save_weights_to_hdf5_group(f, layers):
+  """Saves the weights of a list of layers to a HDF5 group.
+
+  Arguments:
+      f: HDF5 group.
+      layers: List of layer instances.
+  """
+  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  save_attributes_to_hdf5_group(
+      f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
+  f.attrs['backend'] = K.backend().encode('utf8')
+  f.attrs['keras_version'] = str(keras_version).encode('utf8')
+
+  # On TPUs, modifying the graph between session.runs() triggers some expensive
+  # recompilation overhead. To avoid this, we build up the full set of tensors
+  # to save before fetching weights, thus only modifying the graph once.
+  layer_weights_dict = {}
+  for layer in layers:
+    layer_weights_dict[layer.name] = [ops.convert_to_tensor(w)
+                                      for w in layer.weights]
+
+  for layer in layers:
+    g = f.create_group(layer.name)
+    symbolic_weights = layer_weights_dict[layer.name]
+    weight_values = K.batch_get_value(symbolic_weights)
+    weight_names = []
+    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
+      if hasattr(w, 'name') and w.name:
+        name = str(w.name)
+      else:
+        name = 'param_' + str(i)
+      weight_names.append(name.encode('utf8'))
+    save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
+    for name, val in zip(weight_names, weight_values):
+      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def load_weights_from_hdf5_group(f, layers):
+  """Implements topological (order-based) weight loading.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  filtered_layers = []
+  for layer in layers:
+    weights = layer.weights
+    if weights:
+      filtered_layers.append(layer)
+
+  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
+  filtered_layer_names = []
+  for name in layer_names:
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    if weight_names:
+      filtered_layer_names.append(name)
+  layer_names = filtered_layer_names
+  if len(layer_names) != len(filtered_layers):
+    raise ValueError('You are trying to load a weight file '
+                     'containing ' + str(len(layer_names)) +
+                     ' layers into a model with ' + str(len(filtered_layers)) +
+                     ' layers.')
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
+    layer = filtered_layers[k]
+    symbolic_weights = layer.weights
+    weight_values = preprocess_weights_for_loading(
+        layer, weight_values, original_keras_version, original_backend)
+    if len(weight_values) != len(symbolic_weights):
+      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                       '" in the current model) was found to '
+                       'correspond to layer ' + name + ' in the save file. '
+                       'However the new layer ' + layer.name + ' expects ' +
+                       str(len(symbolic_weights)) +
+                       ' weights, but the saved weights have ' +
+                       str(len(weight_values)) + ' elements.')
+    weight_value_tuples += zip(symbolic_weights, weight_values)
+  K.batch_set_value(weight_value_tuples)
+
+
+def load_weights_from_hdf5_group_by_name(f, layers):
+  """Implements name-based weight loading.
+
+  (instead of topological weight loading).
+
+  Layers that have no matching name are skipped.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  # New file format.
+  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
+
+  # Reverse index of layer name to list of layers with name.
+  index = {}
+  for layer in layers:
+    if layer.name:
+      index.setdefault(layer.name, []).append(layer)
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
+
+    for layer in index.get(name, []):
+      symbolic_weights = layer.weights
+      weight_values = preprocess_weights_for_loading(
+          layer, weight_values, original_keras_version, original_backend)
+      if len(weight_values) != len(symbolic_weights):
+        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                         '") expects ' + str(len(symbolic_weights)) +
+                         ' weight(s), but the saved weights' + ' have ' +
+                         str(len(weight_values)) + ' element(s).')
+      # Set values.
+      for i in range(len(weight_values)):
+        if K.int_shape(symbolic_weights[i]) != weight_values[i].shape:
+          raise ValueError('Layer #' + str(k) +' (named "' + layer.name +
+                           '"), weight ' + str(symbolic_weights[i]) +
+                           ' has shape {}'.format(K.int_shape(
+                               symbolic_weights[i])) +
+                           ', but the saved weight has shape ' +
+                           str(weight_values[i].shape) + '.')
+
+        else:
+          weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
+  K.batch_set_value(weight_value_tuples)
+
+
+def save_attributes_to_hdf5_group(group, name, data):
+  """Saves attributes (data) of the specified name into the HDF5 group.
+
+  This method deals with an inherent problem of HDF5 file which is not
+  able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+  Arguments:
+      group: A pointer to a HDF5 group.
+      name: A name of the attributes to save.
+      data: Attributes data to store.
+
+  Raises:
+    RuntimeError: If any single attribute is too large to be saved.
+  """
+  # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
+  # because in that case even chunking the array would not make the saving
+  # possible.
+  bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
+
+  # Expecting this to never be true.
+  if bad_attributes:
+    raise RuntimeError('The following attributes cannot be saved to HDF5 '
+                       'file because they are larger than %d bytes: %s' %
+                       (HDF5_OBJECT_HEADER_LIMIT,
+                        ', '.join([x for x in bad_attributes])))
+
+  data_npy = np.asarray(data)
+
+  num_chunks = 1
+  chunked_data = np.array_split(data_npy, num_chunks)
+
+  # This will never loop forever thanks to the test above.
+  while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
+    num_chunks += 1
+    chunked_data = np.array_split(data_npy, num_chunks)
+
+  if num_chunks > 1:
+    for chunk_id, chunk_data in enumerate(chunked_data):
+      group.attrs['%s%d' % (name, chunk_id)] = chunk_data
+  else:
+    group.attrs[name] = data
+
+
+def load_attributes_from_hdf5_group(group, name):
+  """Loads attributes of the specified name from the HDF5 group.
+
+  This method deals with an inherent problem
+  of HDF5 file which is not able to store
+  data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+  Arguments:
+      group: A pointer to a HDF5 group.
+      name: A name of the attributes to load.
+
+  Returns:
+      data: Attributes data.
+  """
+  if name in group.attrs:
+    data = [n.decode('utf8') for n in group.attrs[name]]
+  else:
+    data = []
+    chunk_id = 0
+    while '%s%d' % (name, chunk_id) in group.attrs:
+      data.extend(
+          [n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
+      chunk_id += 1
+  return data
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py
similarity index 97%
rename from tensorflow/python/keras/engine/saving_test.py
rename to tensorflow/python/keras/saving/hdf5_format_test.py
index 92fac6f24285017422d4daa5d1524d6787227bba..c51eecf4a3cbb5ab091e1306a221fc25fcd37b80 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/saving/hdf5_format_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #,============================================================================
-"""Tests for model saving."""
+"""Tests for model saving in the HDF5 format."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -31,8 +31,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.saving import hdf5_format
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
@@ -174,17 +174,17 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
     ]
     for layer, weights, input_shape in cases:
       layer.build(input_shape)
-      _ = keras.engine.saving.preprocess_weights_for_loading(
+      _ = hdf5_format.preprocess_weights_for_loading(
           layer, weights, original_keras_version='1')
 
     model = keras.models.Sequential([keras.layers.Dense(2, input_dim=2)])
-    _ = keras.engine.saving.preprocess_weights_for_loading(
+    _ = hdf5_format.preprocess_weights_for_loading(
         model, model.weights, original_keras_version='1')
 
     x = keras.Input((2,))
     y = keras.layers.Dense(2)(x)
     model = keras.models.Model(x, y)
-    _ = keras.engine.saving.preprocess_weights_for_loading(
+    _ = hdf5_format.preprocess_weights_for_loading(
         model, model.weights, original_keras_version='1')
 
   @parameterized.named_parameters(
@@ -215,7 +215,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
       layer = layer_class(**layer_args)
       layer.build(input_shape=layer_args.get('input_shape'))
       weights1 = layer.get_weights()
-      weights2 = keras.engine.saving.preprocess_weights_for_loading(
+      weights2 = hdf5_format.preprocess_weights_for_loading(
           layer, weights1)
       _ = [
           self.assertAllClose(x, y, rtol=1e-05)
@@ -274,7 +274,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                         metrics=[keras.metrics.categorical_accuracy])
 
       f_ref_model = h5py.File(h5_path, 'w')
-      saving.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
+      hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
 
       f_model = h5py.File(h5_path, 'r')
       model = keras.models.Sequential()
@@ -288,7 +288,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                                  r'Layer #0 \(named \"d1\"\) expects 1 '
                                  r'weight\(s\), but the saved weights have 2 '
                                  r'element\(s\)\.'):
-      saving.load_weights_from_hdf5_group_by_name(f_model, model.layers)
+      hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model.layers)
 
   @test_util.run_deprecated_v1
   def test_sequential_weight_loading_group_name_with_incorrect_shape(self):
@@ -312,7 +312,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                         metrics=[keras.metrics.categorical_accuracy])
 
       f_ref_model = h5py.File(h5_path, 'w')
-      saving.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
+      hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
 
       f_model = h5py.File(h5_path, 'r')
       model = keras.models.Sequential()
@@ -328,7 +328,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                                    r'shape=\(3, 10\) dtype=float32> has '
                                    r'shape \(3, 10\), but the saved weight has '
                                    r'shape \(3, 5\)\.'):
-        saving.load_weights_from_hdf5_group_by_name(f_model, model.layers)
+        hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model.layers)
 
 
 class TestWholeModelSaving(test.TestCase):
@@ -348,13 +348,16 @@ class TestWholeModelSaving(test.TestCase):
           optimizer=keras.optimizers.RMSprop(lr=0.0001),
           metrics=[
               keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.CategoricalCrossentropy(
+                  name='cce', label_smoothing=constant_op.constant(0.2)),
           ],
           weighted_metrics=[
-              keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.categorical_crossentropy,
+              keras.metrics.CategoricalCrossentropy(
+                  name='cce', label_smoothing=constant_op.constant(0.2)),
           ],
           sample_weight_mode='temporal')
+
       x = np.random.random((1, 3))
       y = np.random.random((1, 3, 3))
       model.train_on_batch(x, y)
@@ -640,7 +643,6 @@ class TestWholeModelSaving(test.TestCase):
       os.remove(fname)
 
   def test_saving_model_with_long_weights_names(self):
-    self.skipTest('b/120921503')
     if h5py is None:
       self.skipTest('h5py required to run this test')
 
diff --git a/tensorflow/python/keras/saving/model_config.py b/tensorflow/python/keras/saving/model_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f59ecd7df53d794abf9db0dee15f410f4453951
--- /dev/null
+++ b/tensorflow/python/keras/saving/model_config.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Functions that save the model's config into different formats.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.python.util.tf_export import keras_export
+
+# pylint: disable=g-import-not-at-top
+try:
+  import yaml
+except ImportError:
+  yaml = None
+# pylint: enable=g-import-not-at-top
+
+
+@keras_export('keras.models.model_from_config')
+def model_from_config(config, custom_objects=None):
+  """Instantiates a Keras model from its config.
+
+  Arguments:
+      config: Configuration dictionary.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      TypeError: if `config` is not a dictionary.
+  """
+  if isinstance(config, list):
+    raise TypeError('`model_from_config` expects a dictionary, not a list. '
+                    'Maybe you meant to use '
+                    '`Sequential.from_config(config)`?')
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+@keras_export('keras.models.model_from_yaml')
+def model_from_yaml(yaml_string, custom_objects=None):
+  """Parses a yaml model configuration file and returns a model instance.
+
+  Arguments:
+      yaml_string: YAML string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      ImportError: if yaml module is not found.
+  """
+  if yaml is None:
+    raise ImportError('Requires yaml module installed (`pip install pyyaml`).')
+  config = yaml.load(yaml_string)
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+@keras_export('keras.models.model_from_json')
+def model_from_json(json_string, custom_objects=None):
+  """Parses a JSON model configuration file and returns a model instance.
+
+  Arguments:
+      json_string: JSON string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+  """
+  config = json.loads(json_string)
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py
index d22c4ee5d34c24b71937133ada0ed64b9cc80320..fbf0bf68ef7a6abf2130fe9c8f959e15dc11983b 100644
--- a/tensorflow/python/keras/saving/saved_model.py
+++ b/tensorflow/python/keras/saving/saved_model.py
@@ -24,13 +24,9 @@ import six
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import models as models_lib
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import sequential
-from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.metrics import Metric
-from tensorflow.python.keras.models import model_from_json
-from tensorflow.python.keras.utils import metrics_utils
+from tensorflow.python.keras.saving import model_from_json
+from tensorflow.python.keras.saving import saving_utils
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -135,7 +131,7 @@ def export(
   if serving_only:
     save_lib.save(
         model, export_dir,
-        signatures=training_utils.trace_model_call(model, input_signature))
+        signatures=saving_utils.trace_model_call(model, input_signature))
   else:
     _save_v1_format(model, export_dir, custom_objects, as_text, input_signature)
 
@@ -167,6 +163,8 @@ def _export_model_variables(model, saved_model_path):
 
 def _save_v1_format(model, path, custom_objects, as_text, input_signature):
   """Exports model to v1 SavedModel format."""
+  from tensorflow.python.keras.engine import sequential  # pylint: disable=g-import-not-at-top
+
   if not model._is_graph_network:
     if isinstance(model, sequential.Sequential):
       # If input shape is not directly set in the model, the exported model
@@ -250,15 +248,15 @@ def _export_mode(
     ValueError: If the train/eval mode is being exported, but the model does
       not have an optimizer.
   """
+  from tensorflow.python.keras import models as models_lib  # pylint: disable=g-import-not-at-top
   compile_clone = (mode != mode_keys.ModeKeys.PREDICT)
   if compile_clone and not model.optimizer:
     raise ValueError(
         'Model does not have an optimizer. Cannot export mode %s' % mode)
 
   model_graph = ops.get_default_graph()
-  with ops.Graph().as_default() as g:
-
-    K.set_learning_phase(mode == mode_keys.ModeKeys.TRAIN)
+  with ops.Graph().as_default() as g, K.learning_phase_scope(
+      mode == mode_keys.ModeKeys.TRAIN):
 
     if input_signature is None:
       input_tensors = None
@@ -331,7 +329,7 @@ def _create_signature_def_map(model, mode):
     inputs_dict.update(targets_dict)
   outputs_dict = {name: x
                   for name, x in zip(model.output_names, model.outputs)}
-  metrics = metrics_utils.extract_model_metrics_as_v1_metrics(model)
+  metrics = saving_utils.extract_model_metrics(model)
 
   # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
   # are by default not added to any collections. We are doing this here, so
@@ -339,6 +337,7 @@ def _create_signature_def_map(model, mode):
   local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
   vars_to_add = set()
   if metrics is not None:
+    from tensorflow.python.keras.metrics import Metric  # pylint: disable=g-import-not-at-top
     for key, value in six.iteritems(metrics):
       if isinstance(value, Metric):
         vars_to_add.update(value.variables)
diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..95da169e82367c7e6ee7ef17fcb22295f8b0242b
--- /dev/null
+++ b/tensorflow/python/keras/saving/saving_utils.py
@@ -0,0 +1,103 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utils related to keras model saving."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.util import nest
+
+
+def extract_model_metrics(model):
+  """Convert metrics from a Keras model to (value, update) ops.
+
+  This is used for converting Keras models to Estimators and SavedModels.
+
+  Args:
+    model: A `tf.keras.Model` object.
+
+  Returns:
+    Dictionary mapping metric names to tuples of (value, update) ops. May return
+    `None` if the model does not contain any metrics.
+  """
+  from tensorflow.python.keras import metrics  # pylint: disable=g-import-not-at-top
+  if not getattr(model, '_compile_metrics', None):
+    return None
+
+  # TODO(psv/kathywu): use this implementation in model to estimator flow.
+  eval_metric_ops = {}
+  for metric_name in model.metrics_names[1:]:  # Index 0 is `loss`.
+    m = metrics.Mean()
+    m(model._compile_metrics_tensors[metric_name])
+    eval_metric_ops[metric_name] = m
+  return eval_metric_ops
+
+
+def trace_model_call(model, input_signature=None):
+  """Trace the model call to create a tf.function for exporting a Keras model.
+
+  Args:
+    model: A Keras model.
+    input_signature: optional, a list of tf.TensorSpec objects specifying the
+      inputs to the model.
+
+  Returns:
+    A tf.function wrapping the model's call function with input signatures set.
+
+  Raises:
+    ValueError: if input signature cannot be inferred from the model.
+  """
+  if input_signature is None:
+    if isinstance(model.call, def_function.Function):
+      input_signature = model.call.input_signature
+
+  if input_signature is None:
+    try:
+      inputs = model.inputs
+      input_names = model.input_names
+    except AttributeError:
+      raise ValueError(
+          'Model {} cannot be saved because the input shapes have not been '
+          'set. Usually, input shapes are automatically determined from calling'
+          ' .fit() or .predict(). To manually set the shapes, call '
+          'model._set_inputs(inputs).'.format(model))
+    input_specs = []
+    for input_tensor, input_name in zip(inputs, input_names):
+      input_specs.append(tensor_spec.TensorSpec(
+          shape=input_tensor.shape, dtype=input_tensor.dtype,
+          name=input_name))
+    # The input signature of the call function is a list with one element, since
+    # all tensor inputs must be passed in as the first argument.
+    input_signature = [input_specs] if len(input_specs) > 1 else input_specs
+
+  # TODO(mdan): Should the model's call be autographed by default?
+  @def_function.function(input_signature=input_signature, autograph=False)
+  def _wrapped_model(*args):
+    """A concrete tf.function that wraps the model's call function."""
+    # When given a single input, Keras models will call the model on the tensor
+    # rather than a list consisting of the single tensor.
+    inputs = args[0] if len(input_signature) == 1 else list(args)
+    outputs_list = nest.flatten(model(inputs=inputs))
+    try:
+      output_names = model.output_names
+    except AttributeError:
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      output_names = training_utils.generic_output_names(outputs_list)
+    return {name: output for name, output in zip(output_names, outputs_list)}
+
+  return _wrapped_model
diff --git a/tensorflow/python/keras/saving/saving_utils_test.py b/tensorflow/python/keras/saving/saving_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd14c085298861c091b0da0a15ba2743471117f
--- /dev/null
+++ b/tensorflow/python/keras/saving/saving_utils_test.py
@@ -0,0 +1,241 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for saving utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+
+from tensorflow.python import keras
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.saving import saving_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import save as save_lib
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import rmsprop
+
+
+class TraceModelCallTest(keras_parameterized.TestCase):
+
+  def _assert_all_close(self, expected, actual):
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        K._initialize_variables(sess)
+        self.assertAllClose(expected, actual)
+    else:
+      self.assertAllClose(expected, actual)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_model_outputs(self):
+    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    inputs = array_ops.ones((8, 5))
+
+    if input_dim is None:
+      with self.assertRaisesRegexp(ValueError,
+                                   'input shapes have not been set'):
+        saving_utils.trace_model_call(model)
+      model._set_inputs(inputs)
+
+    fn = saving_utils.trace_model_call(model)
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_model_outputs_after_fitting(self):
+    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    model.compile(optimizer='sgd', loss='mse')
+    model.fit(x=np.random.random((8, 5)),
+              y=np.random.random((8, 3)), epochs=2)
+
+    inputs = array_ops.ones((8, 5))
+
+    fn = saving_utils.trace_model_call(model)
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_multi_io_model_outputs(self):
+    input_dim = 5
+    num_classes = 3
+    num_classes_b = 4
+    input_a = keras.layers.Input(shape=(input_dim,), name='input_a')
+    input_b = keras.layers.Input(shape=(input_dim,), name='input_b')
+
+    dense = keras.layers.Dense(num_classes, name='dense')
+    dense2 = keras.layers.Dense(num_classes_b, name='dense2')
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dense2, dropout]
+
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+
+    input_a_np = np.random.random((10, input_dim)).astype(np.float32)
+    input_b_np = np.random.random((10, input_dim)).astype(np.float32)
+
+    if testing_utils.get_model_type() == 'subclass':
+      with self.assertRaisesRegexp(ValueError,
+                                   'input shapes have not been set'):
+        saving_utils.trace_model_call(model)
+
+    model.compile(optimizer='sgd', loss='mse')
+    model.fit(x=[np.random.random((8, input_dim)).astype(np.float32),
+                 np.random.random((8, input_dim)).astype(np.float32)],
+              y=[np.random.random((8, num_classes)).astype(np.float32),
+                 np.random.random((8, num_classes_b)).astype(np.float32)],
+              epochs=2)
+
+    fn = saving_utils.trace_model_call(model)
+    signature_outputs = fn([input_a_np, input_b_np])
+    outputs = model([input_a_np, input_b_np])
+    expected_outputs = {model.output_names[0]: outputs[0],
+                        model.output_names[1]: outputs[1]}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_specify_input_signature(self):
+    model = testing_utils.get_small_sequential_mlp(10, 3, None)
+    inputs = array_ops.ones((8, 5))
+
+    with self.assertRaisesRegexp(ValueError, 'input shapes have not been set'):
+      saving_utils.trace_model_call(model)
+
+    fn = saving_utils.trace_model_call(
+        model, [tensor_spec.TensorSpec(shape=[None, 5], dtype=dtypes.float32)])
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_subclassed_model_with_input_signature(self):
+
+    class Model(keras.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.dense = keras.layers.Dense(3, name='dense')
+
+      @def_function.function(
+          input_signature=[[tensor_spec.TensorSpec([None, 5], dtypes.float32),
+                            tensor_spec.TensorSpec([None], dtypes.float32)]],)
+      def call(self, inputs, *args):
+        x, y = inputs
+        return self.dense(x) + y
+
+    model = Model()
+    fn = saving_utils.trace_model_call(model)
+    x = array_ops.ones((8, 5), dtype=dtypes.float32)
+    y = array_ops.ones((3,), dtype=dtypes.float32)
+    expected_outputs = {'output_1': model([x, y])}
+    signature_outputs = fn([x, y])
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+
+def _import_and_infer(save_dir, inputs):
+  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
+  graph = ops.Graph()
+  with graph.as_default(), session_lib.Session() as session:
+    model = loader.load(session, [tag_constants.SERVING], save_dir)
+    signature = model.signature_def[
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    assert set(inputs.keys()) == set(signature.inputs.keys())
+    feed_dict = {}
+    for arg_name in inputs.keys():
+      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
+          inputs[arg_name])
+    output_dict = {}
+    for output_name, output_tensor_info in signature.outputs.items():
+      output_dict[output_name] = graph.get_tensor_by_name(
+          output_tensor_info.name)
+    return session.run(output_dict, feed_dict=feed_dict)
+
+
+class ModelSaveTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_model_save(self):
+    input_dim = 5
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    inputs = array_ops.ones((8, 5))
+
+    if testing_utils.get_model_type() == 'subclass':
+      model._set_inputs(inputs)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save_lib.save(model, save_dir)
+
+    self.assertAllClose(
+        {model.output_names[0]: model.predict_on_batch(inputs)},
+        _import_and_infer(save_dir, {model.input_names[0]: np.ones((8, 5))}))
+
+
+class ExtractModelMetricsTest(test.TestCase):
+
+  def test_extract_model_metrics(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+    extract_metrics = saving_utils.extract_model_metrics(model)
+    self.assertEqual(None, extract_metrics)
+
+    extract_metric_names = [
+        'dense_loss', 'dropout_loss', 'dense_binary_accuracy',
+        'dropout_binary_accuracy'
+    ]
+    model_metric_names = ['loss'] + extract_metric_names
+    model.compile(
+        loss='mae',
+        metrics=[keras.metrics.BinaryAccuracy()],
+        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01),
+        run_eagerly=None)
+    extract_metrics = saving_utils.extract_model_metrics(model)
+    self.assertEqual(set(model_metric_names), set(model.metrics_names))
+    self.assertEqual(set(extract_metric_names), set(extract_metrics.keys()))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index fd062b0ab337aa6fa62a7603a36749cde315c3da..eff0f39b6d006c60198a607e796e7619b968eaf3 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -25,7 +25,13 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
+from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 
@@ -162,10 +168,13 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   # See b/120160788 for more details. This should be mitigated after 2.0.
   model = keras.models.Model(x, layer(x))
   if _thread_local_data.run_eagerly is not None:
-    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'],
-                  run_eagerly=should_run_eagerly())
+    model.compile(
+        'rmsprop',
+        'mse',
+        weighted_metrics=['acc'],
+        run_eagerly=should_run_eagerly())
   else:
-    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
+    model.compile('rmsprop', 'mse', weighted_metrics=['acc'])
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
@@ -355,11 +364,20 @@ class _SubclassModel(keras.Model):
 
   def __init__(self, layers):
     super(_SubclassModel, self).__init__()
-    self.all_layers = layers
+    # Note that clone and build doesn't support lists of layers in subclassed
+    # models. Adding each layer directly here.
+    for i, layer in enumerate(layers):
+      setattr(self, self._layer_name_for_i(i), layer)
+
+    self.num_layers = len(layers)
+
+  def _layer_name_for_i(self, i):
+    return 'layer{}'.format(i)
 
   def call(self, inputs, **kwargs):
     x = inputs
-    for layer in self.all_layers:
+    for i in range(self.num_layers):
+      layer = getattr(self, self._layer_name_for_i(i))
       x = layer(x)
     return x
 
@@ -626,3 +644,39 @@ def get_multi_io_model(
     return keras.Model(inputs, outputs)
 
   raise ValueError('Unknown model type {}'.format(model_type))
+
+
+_V2_OPTIMIZER_MAP = {
+    'adadelta': adadelta_v2.Adadelta,
+    'adagrad': adagrad_v2.Adagrad,
+    'adam': adam_v2.Adam,
+    'adamax': adamax_v2.Adamax,
+    'nadam': nadam_v2.Nadam,
+    'rmsprop': rmsprop_v2.RMSprop,
+    'sgd': gradient_descent_v2.SGD
+}
+
+
+def get_v2_optimizer(name, **kwargs):
+  """Get the v2 optimizer requested.
+
+  This is only necessary until v2 are the default, as we are testing in Eager,
+  and Eager + v1 optimizers fail tests. When we are in v2, the strings alone
+  should be sufficient, and this mapping can theoretically be removed.
+
+  Args:
+    name: string name of Keras v2 optimizer.
+    **kwargs: any kwargs to pass to the optimizer constructor.
+
+  Returns:
+    Initialized Keras v2 optimizer.
+
+  Raises:
+    ValueError: if an unknown name was passed.
+  """
+  try:
+    return _V2_OPTIMIZER_MAP[name](**kwargs)
+  except KeyError:
+    raise ValueError(
+        'Could not find requested v2 optimizer: {}\nValid choices: {}'.format(
+            name, list(_V2_OPTIMIZER_MAP.keys())))
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 61940ad789c4009fca5462079014482fb8bfec1b..66d9817a6aecd28aafcf01896d089a342401fca7 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -34,10 +34,12 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
+from tensorflow.python.keras.utils.layer_utils import print_summary
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras.utils.vis_utils import model_to_dot
 from tensorflow.python.keras.utils.vis_utils import plot_model
 
 del absolute_import
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index f486e631e50e5beb8da606879f23cd67131389f5..ea7427f61a8cc234f69df28d111d26b87b326a48 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -194,9 +194,11 @@ def normalize_data_format(value):
 
 
 def normalize_padding(value):
+  if isinstance(value, (list, tuple)):
+    return value
   padding = value.lower()
   if padding not in {'valid', 'same', 'causal'}:
-    raise ValueError('The `padding` argument must be one of '
+    raise ValueError('The `padding` argument must be a list/tuple or one of '
                      '"valid", "same" (or "causal", only for `Conv1D). '
                      'Received: ' + str(padding))
   return padding
diff --git a/tensorflow/python/keras/utils/conv_utils_test.py b/tensorflow/python/keras/utils/conv_utils_test.py
index eb2a360bfdaf04d695a599b477c0d154bac062cd..ef7ad1b8c53edbc313d95382b248b159c6c2da1d 100644
--- a/tensorflow/python/keras/utils/conv_utils_test.py
+++ b/tensorflow/python/keras/utils/conv_utils_test.py
@@ -52,6 +52,114 @@ input_shapes = [
 ]
 
 
+class TestBasicConvUtilsTest(test.TestCase):
+
+  def test_convert_data_format(self):
+    self.assertEqual('NCDHW', conv_utils.convert_data_format(
+        'channels_first', 5))
+    self.assertEqual('NCHW', conv_utils.convert_data_format(
+        'channels_first', 4))
+    self.assertEqual('NCW', conv_utils.convert_data_format('channels_first', 3))
+    self.assertEqual('NHWC', conv_utils.convert_data_format('channels_last', 4))
+    self.assertEqual('NWC', conv_utils.convert_data_format('channels_last', 3))
+    self.assertEqual('NDHWC', conv_utils.convert_data_format(
+        'channels_last', 5))
+
+    with self.assertRaises(ValueError):
+      conv_utils.convert_data_format('invalid', 2)
+
+  def test_normalize_tuple(self):
+    self.assertEqual((2, 2, 2),
+                     conv_utils.normalize_tuple(2, n=3, name='strides'))
+    self.assertEqual((2, 1, 2),
+                     conv_utils.normalize_tuple((2, 1, 2), n=3, name='strides'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_tuple((2, 1), n=3, name='strides')
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_tuple(None, n=3, name='strides')
+
+  def test_normalize_data_format(self):
+    self.assertEqual('channels_last',
+                     conv_utils.normalize_data_format('Channels_Last'))
+    self.assertEqual('channels_first',
+                     conv_utils.normalize_data_format('CHANNELS_FIRST'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_data_format('invalid')
+
+  def test_normalize_padding(self):
+    self.assertEqual('same', conv_utils.normalize_padding('SAME'))
+    self.assertEqual('valid', conv_utils.normalize_padding('VALID'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_padding('invalid')
+
+  def test_conv_output_length(self):
+    self.assertEqual(4, conv_utils.conv_output_length(4, 2, 'same', 1, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'same', 2, 1))
+    self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'valid', 1, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'valid', 2, 1))
+    self.assertEqual(5, conv_utils.conv_output_length(4, 2, 'full', 1, 1))
+    self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'full', 2, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(5, 2, 'valid', 2, 2))
+
+  def test_conv_input_length(self):
+    self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'same', 1))
+    self.assertEqual(2, conv_utils.conv_input_length(2, 2, 'same', 2))
+    self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'valid', 1))
+    self.assertEqual(4, conv_utils.conv_input_length(2, 2, 'valid', 2))
+    self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'full', 1))
+    self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'full', 2))
+
+  def test_deconv_output_length(self):
+    self.assertEqual(4, conv_utils.deconv_output_length(4, 2, 'same', stride=1))
+    self.assertEqual(8, conv_utils.deconv_output_length(4, 2, 'same', stride=2))
+    self.assertEqual(5, conv_utils.deconv_output_length(
+        4, 2, 'valid', stride=1))
+    self.assertEqual(8, conv_utils.deconv_output_length(
+        4, 2, 'valid', stride=2))
+    self.assertEqual(3, conv_utils.deconv_output_length(4, 2, 'full', stride=1))
+    self.assertEqual(6, conv_utils.deconv_output_length(4, 2, 'full', stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=2, stride=1))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=1, stride=2))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=2, stride=1))
+    self.assertEqual(
+        9,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=1, stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=2, stride=1))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=1, stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=1, stride=1, dilation=2))
+    self.assertEqual(
+        12,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=2, stride=2, dilation=3))
+    self.assertEqual(
+        6,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=2, stride=2, dilation=3))
+
+
 @parameterized.parameters(input_shapes)
 class TestConvUtils(test.TestCase, parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index 454854618c742fd8dd2bd0abff76f4a3322e1b1a..8b5cdadd45ff89997af3ef4db6517e0b56a96e4d 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -391,9 +391,8 @@ class Progbar(object):
         sys.stdout.write('\n')
 
       if self.target is not None:
-        numdigits = int(np.floor(np.log10(self.target))) + 1
-        barstr = '%%%dd/%d [' % (numdigits, self.target)
-        bar = barstr % current
+        numdigits = int(np.log10(self.target)) + 1
+        bar = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
         prog = float(current) / self.target
         prog_width = int(self.width * prog)
         if prog_width > 0:
@@ -456,7 +455,10 @@ class Progbar(object):
       sys.stdout.flush()
 
     elif self.verbose == 2:
-      if self.target is None or current >= self.target:
+      if self.target is not None and current >= self.target:
+        numdigits = int(np.log10(self.target)) + 1
+        count = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
+        info = count + info
         for k in self._values_order:
           info += ' - %s:' % k
           avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
diff --git a/tensorflow/python/keras/utils/kernelized_utils.py b/tensorflow/python/keras/utils/kernelized_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e73cb2d4c63df2f1098802deffbcc899039d0cb
--- /dev/null
+++ b/tensorflow/python/keras/utils/kernelized_utils.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility methods related to kernelized layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _to_matrix(u):
+  """If input tensor is a vector (i.e., has rank 1), converts it to matrix."""
+  u_rank = len(u.shape)
+  if u_rank not in [1, 2]:
+    raise ValueError('The input tensor should have rank 1 or 2. Given rank: {}'
+                     .format(u_rank))
+  if u_rank == 1:
+    return array_ops.expand_dims(u, 0)
+  return u
+
+
+def _align_matrices(x, y):
+  """Aligns x and y tensors to allow computations over pairs of their rows."""
+  x_matrix = _to_matrix(x)
+  y_matrix = _to_matrix(y)
+  x_shape = x_matrix.shape
+  y_shape = y_matrix.shape
+  if y_shape[1] != x_shape[1]:  # dimensions do not match.
+    raise ValueError(
+        'The outermost dimensions of the input tensors should match. Given: {} '
+        'vs {}.'.format(y_shape[1], x_shape[1]))
+
+  x_tile = array_ops.tile(
+      array_ops.expand_dims(x_matrix, 1), [1, y_shape[0], 1])
+  y_tile = array_ops.tile(
+      array_ops.expand_dims(y_matrix, 0), [x_shape[0], 1, 1])
+  return x_tile, y_tile
+
+
+def inner_product(u, v):
+  u = _to_matrix(u)
+  v = _to_matrix(v)
+  return math_ops.matmul(u, v, transpose_b=True)
+
+
+def exact_gaussian_kernel(x, y, stddev):
+  """Computes exact Gaussian kernel value(s) for tensors x and y and stddev.
+
+  The Gaussian kernel for vectors u, v is defined as follows:
+       K(u, v) = exp(-||u-v||^2 / (2* stddev^2))
+  where the norm is the l2-norm. x, y can be either vectors or matrices. If they
+  are vectors, they must have the same dimension. If they are matrices, they
+  must have the same number of columns. In the latter case, the method returns
+  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
+  v is a row from y.
+
+  Args:
+    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
+    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
+    stddev: The width of the Gaussian kernel.
+
+  Returns:
+    A single value (scalar) with shape (1, 1) (if x, y are vectors) or a matrix
+      of shape (m, n) with entries K(u, v) (where K is the Gaussian kernel) for
+      all (u,v) pairs where u, v are rows from x and y respectively.
+
+  Raises:
+    InvalidShapeError: if the shapes of x, y are not compatible.
+  """
+  x_aligned, y_aligned = _align_matrices(x, y)
+  diff_squared_l2_norm = math_ops.reduce_sum(
+      math_ops.squared_difference(x_aligned, y_aligned), 2)
+  return math_ops.exp(-diff_squared_l2_norm / (2 * stddev * stddev))
+
+
+def exact_laplacian_kernel(x, y, stddev):
+  """Computes exact Laplacian kernel value(s) for tensors x and y using stddev.
+
+  The Laplacian kernel for vectors u, v is defined as follows:
+       K(u, v) = exp(-||u-v|| / stddev)
+  where the norm is the l1-norm. x, y can be either vectors or matrices. If they
+  are vectors, they must have the same dimension. If they are matrices, they
+  must have the same number of columns. In the latter case, the method returns
+  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
+  v is a row from y.
+
+  Args:
+    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
+    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
+    stddev: The width of the Gaussian kernel.
+
+  Returns:
+    A single value (scalar) with shape (1, 1)  if x, y are vectors or a matrix
+    of shape (m, n) with entries K(u, v) (where K is the Laplacian kernel) for
+    all (u,v) pairs where u, v are rows from x and y respectively.
+
+  Raises:
+    InvalidShapeError: if the shapes of x, y are not compatible.
+  """
+  x_aligned, y_aligned = _align_matrices(x, y)
+  diff_l1_norm = math_ops.reduce_sum(
+      math_ops.abs(math_ops.subtract(x_aligned, y_aligned)), 2)
+  return math_ops.exp(-diff_l1_norm / stddev)
diff --git a/tensorflow/python/keras/utils/kernelized_utils_test.py b/tensorflow/python/keras/utils/kernelized_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9a72493ddee5cf1d0f310c06d0fa1860b2a61f
--- /dev/null
+++ b/tensorflow/python/keras/utils/kernelized_utils_test.py
@@ -0,0 +1,116 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for kernelized_utils.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras.utils import kernelized_utils
+from tensorflow.python.platform import test
+
+
+def _exact_gaussian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+
+
+def _exact_laplacian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+
+
+class KernelizedUtilsTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
+      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
+  def test_equal_vectors(self, exact_kernel_fn, expected_values):
+    """Identical vectors give exactly the identity kernel value."""
+    x = constant_op.constant([0.5, -0.5, -0.5, 0.5])
+    y = constant_op.constant([0.5, -0.5, -0.5, 0.5])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are identical and therefore K(x, y) will be precisely equal to
+    # the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-6)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
+      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
+  def test_almost_identical_vectors(self, exact_kernel_fn, expected_values):
+    """Almost identical vectors give the identity kernel value."""
+    x = constant_op.constant([1.0, 0.4, -2.1, -1.1])
+    y = constant_op.constant([1.01, 0.39, -2.099, -1.101])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are almost identical and therefore K(x, y) will be almost equal to
+    # the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-3)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=1.0), [[0.99], [0.977]]),
+      ('laplacian', _exact_laplacian(stddev=5.0), [[0.96], [0.94]]))
+  def test_similar_matrices(self, exact_kernel_fn, expected_values):
+    """Pairwise "close" vectors give high kernel values (similarity scores)."""
+    x = constant_op.constant([1.0, 3.4, -2.1, 0.9, 3.3, -2.0], shape=[2, 3])
+    y = constant_op.constant([1.1, 3.35, -2.05])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # The 2 rows of x are close to y. The pairwise kernel values (similarity
+    # scores) are somewhat close to the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=2.0), [[.997, .279], [.251, 1.],
+                                                 [.164, 0.019]]),
+      ('laplacian', _exact_laplacian(stddev=2.0), [[.904, .128], [.116, 1.],
+                                                   [.07, 0.027]]))
+  def test_matrices_varying_similarity(self, exact_kernel_fn, expected_values):
+    """Test matrices with row vectors of varying pairwise similarity."""
+    x = constant_op.constant([1.0, 2., -2., 0.9, 3.3, -1.0], shape=[3, 2])
+    y = constant_op.constant([1.1, 2.1, -2., 0.9], shape=[2, 2])
+    exact_kernel = exact_kernel_fn(x, y)
+
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=1.0), [[0.0]]),
+      ('laplacian', _exact_laplacian(stddev=1.0), [[0.0]]))
+  def test_completely_dissimilar_vectors(self, exact_kernel_fn,
+                                         expected_values):
+    """Very dissimilar vectors give very low similarity scores."""
+    x = constant_op.constant([1.0, 3.4, -2.1, -5.1])
+    y = constant_op.constant([0.5, 2.1, 1.0, 3.0])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are very "far" from each other and so the corresponding kernel
+    # value will be very low.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index d7eed2e86da88953b768756aa59d536f747c332e..640462d5c63f459f59bb09d24edc1f78f7016c35 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.conv_utils import convert_kernel
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -53,14 +54,11 @@ def get_source_inputs(tensor, layer=None, node_index=None):
     node = layer._inbound_nodes[node_index]
     if not node.inbound_layers:
       # Reached an Input layer, stop recursion.
-      return node.input_tensors
+      return nest.flatten(node.input_tensors)
     else:
       source_tensors = []
-      for i in range(len(node.inbound_layers)):
-        x = node.input_tensors[i]
-        layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        previous_sources = get_source_inputs(x, layer, node_index)
+      for layer, node_index, _, tensor in node.iterate_inbound():
+        previous_sources = get_source_inputs(tensor, layer, node_index)
         # Avoid input redundancy.
         for x in previous_sources:
           if x not in source_tensors:
@@ -110,7 +108,8 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     nodes_by_depth = model._nodes_by_depth.values()
     nodes = []
     for v in nodes_by_depth:
-      if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
+      if (len(v) > 1) or (len(v) == 1 and
+                          len(nest.flatten(v[0].inbound_layers)) > 1):
         # if the model has multiple nodes
         # or if the nodes have multiple inbound_layers
         # the model is no longer sequential
@@ -159,6 +158,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       line += ' ' * (positions[i] - len(line))
     print_fn(line)
 
+  print_fn('Model: "{}"'.format(model.name))
   print_fn('_' * line_length)
   print_row(to_display, positions)
   print_fn('=' * line_length)
@@ -195,12 +195,10 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       if relevant_nodes and node not in relevant_nodes:
         # node is not part of the current network
         continue
-      for i in range(len(node.inbound_layers)):
-        inbound_layer = node.inbound_layers[i].name
-        inbound_node_index = node.node_indices[i]
-        inbound_tensor_index = node.tensor_indices[i]
-        connections.append(inbound_layer + '[' + str(inbound_node_index) +
-                           '][' + str(inbound_tensor_index) + ']')
+
+      for inbound_layer, node_index, tensor_index, _ in node.iterate_inbound():
+        connections.append('{}[{}][{}]'.format(inbound_layer.name, node_index,
+                                               tensor_index))
 
     name = layer.name
     cls_name = layer.__class__.__name__
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
index fc4b4ac7dfd0966af5f4c21d4b78ba8ecd6bf46a..d42b354fb140bc592ee1127c3789069365371bc4 100644
--- a/tensorflow/python/keras/utils/losses_utils.py
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.ops import array_ops
@@ -51,10 +52,31 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
     the last dimension squeezed,
     `sample_weight` could be extended by one dimension.
   """
+  y_pred_shape = y_pred.get_shape()
+  y_pred_rank = y_pred_shape.ndims
   if y_true is not None:
-    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
-    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
-        y_true, y_pred)
+
+    # If sparse matrix is provided as `y_true`, the last dimension in `y_pred`
+    # may be > 1. Eg: y_true = [0, 1, 2] (shape=(3,)),
+    # y_pred = [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]] (shape=(3, 3))
+    # In this case, we should not try to remove squeezable dimension.
+    y_true_shape = y_true.get_shape()
+    y_true_rank = y_true_shape.ndims
+    if (y_true_rank is not None) and (y_pred_rank is not None):
+      # Use static rank for `y_true` and `y_pred`.
+      if (y_pred_rank - y_true_rank != 1) or y_pred_shape[-1] == 1:
+        y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
+            y_true, y_pred)
+    else:
+      # Use dynamic rank.
+      rank_diff = array_ops.rank(y_pred) - array_ops.rank(y_true)
+      squeeze_dims = lambda: confusion_matrix.remove_squeezable_dimensions(  # pylint: disable=g-long-lambda
+          y_true, y_pred)
+      is_last_dim_1 = math_ops.equal(1, array_ops.shape(y_pred)[-1])
+      maybe_squeeze_dims = lambda: control_flow_ops.cond(  # pylint: disable=g-long-lambda
+          is_last_dim_1, squeeze_dims, lambda: (y_true, y_pred))
+      y_true, y_pred = control_flow_ops.cond(
+          math_ops.equal(1, rank_diff), maybe_squeeze_dims, squeeze_dims)
 
   if sample_weight is None:
     return y_pred, y_true, None
@@ -65,8 +87,6 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
   if weights_rank == 0:  # If weights is scalar, do nothing.
     return y_pred, y_true, sample_weight
 
-  y_pred_shape = y_pred.get_shape()
-  y_pred_rank = y_pred_shape.ndims
   if (y_pred_rank is not None) and (weights_rank is not None):
     # Use static rank.
     if weights_rank - y_pred_rank == 1:
@@ -167,8 +187,8 @@ def compute_weighted_loss(losses,
         losses, None, sample_weight)
     losses = ops.convert_to_tensor(losses)
     input_dtype = losses.dtype
-    losses = math_ops.to_float(losses)
-    sample_weight = math_ops.to_float(sample_weight)
+    losses = math_ops.cast(losses, dtypes.float32)
+    sample_weight = math_ops.cast(sample_weight, dtypes.float32)
 
     try:
       # Broadcast weights if possible.
diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py
index 431d107091e90c8ecf7be38a465443aaede11936..c5ff617ad75c57da7f6f0b1f2ccdfd82b1215fd1 100644
--- a/tensorflow/python/keras/utils/metrics_utils.py
+++ b/tensorflow/python/keras/utils/metrics_utils.py
@@ -19,59 +19,328 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
+import functools
+import weakref
 
-from tensorflow.python.keras import metrics
-from tensorflow.python.ops import metrics as metrics_module
+from enum import Enum
 
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.utils.generic_utils import to_list
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.util import tf_decorator
 
-def extract_model_metrics_as_v1_metrics(model):
-  """Convert metrics from a Keras model to (value, update) ops.
+NEG_INF = -1e10
 
-  This is used for converting Keras models to Estimators and SavedModels.
+
+class Reduction(Enum):
+  """Types of metrics reduction.
+
+  Contains the following values:
+
+  * `SUM`: Scalar sum of weighted values.
+  * `SUM_OVER_BATCH_SIZE`: Scalar sum of weighted values divided by
+        number of elements.
+  * `WEIGHTED_MEAN`: Scalar sum of weighted values divided by sum of weights.
+  """
+  SUM = 'sum'
+  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
+  WEIGHTED_MEAN = 'weighted_mean'
+
+
+def update_state_wrapper(update_state_fn):
+  """Decorator to wrap metric `update_state()` with `add_update()`.
 
   Args:
-    model: A `tf.keras.Model` object.
+    update_state_fn: function that accumulates metric statistics.
 
   Returns:
-    Dictionary mapping metric names to tuples of (value, update) ops. May return
-    `None` if the model does not contain any metrics.
+    Decorated function that wraps `update_state_fn()` with `add_update()`.
   """
-  if not getattr(model, 'metrics', None):
-    return None
-
-  eval_metric_ops = {}
-
-  def get_metric_name(metric):
-    if isinstance(metric, metrics.Metric):
-      return metric.name
-    if callable(metric):
-      return metric.__name__
-    assert isinstance(metric, six.string_types)
-    return metric
-
-  # When each metric maps to an output
-  if isinstance(model.metrics, dict):
-    for i, output_name in enumerate(model.metrics.keys()):
-      # `metric` is the user given metric value in `compile`. This can be
-      # metric name (`acc`), metric function (binary_accuracy) or a metric
-      # object (BinaryAccuracy()).
-      metric = model.metrics[output_name]
-      metric_name = get_metric_name(metric)
-      # When some outputs use the same metric
-      if list(model.metrics.values()).count(metric_name) > 1:
-        metric_name += '_' + output_name
-      if isinstance(metric, metrics.Metric):
-        eval_metric_ops[metric_name] = metric
-      else:
-        eval_metric_ops[metric_name] = metrics_module.mean(
-            model.metrics_tensors[i - len(model.metrics)])
+
+  def decorated(metric_obj, *args, **kwargs):
+    """Decorated function with `add_update()`."""
+
+    update_op = update_state_fn(*args, **kwargs)
+    if update_op is not None:  # update_op will be None in eager execution.
+      metric_obj.add_update(update_op, inputs=True)
+    return update_op
+
+  return tf_decorator.make_decorator(update_state_fn, decorated)
+
+
+def result_wrapper(result_fn):
+  """Decorator to wrap metric `result()` function in `merge_call()`.
+
+  Result computation is an idempotent operation that simply calculates the
+  metric value using the state variables.
+
+  If metric state variables are distributed across replicas/devices and
+  `result()` is requested from the context of one device - This function wraps
+  `result()` in a distribution strategy `merge_call()`. With this,
+  the metric state variables will be aggregated across devices.
+
+  Args:
+    result_fn: function that computes the metric result.
+
+  Returns:
+    Decorated function that wraps `result_fn()` in distribution strategy
+    `merge_call()`.
+  """
+
+  def decorated(_, *args):
+    """Decorated function with merge_call."""
+    replica_context = distribution_strategy_context.get_replica_context()
+    if replica_context is None:  # if in cross replica context already
+      result_t = result_fn(*args)
+    else:
+      # TODO(psv): Test distribution of metrics using different distribution
+      # strategies.
+
+      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
+      # with distribution object as the first parameter. We create a wrapper
+      # here so that the result function need not have that parameter.
+      def merge_fn_wrapper(distribution, merge_fn, *args):
+        # We will get `PerDevice` merge function. Taking the first one as all
+        # are identical copies of the function that we had passed below.
+        return distribution.unwrap(merge_fn)[0](*args)
+
+      # Wrapping result in merge_call. merge_call is used when we want to leave
+      # replica mode and compute a value in cross replica mode.
+      result_t = replica_context.merge_call(
+          merge_fn_wrapper, args=(result_fn,) + args)
+    return result_t
+
+  return tf_decorator.make_decorator(result_fn, decorated)
+
+
+def weakmethod(method):
+  """Creates a weak reference to the bound method."""
+
+  cls = method.im_class
+  func = method.im_func
+  instance_ref = weakref.ref(method.im_self)
+
+  @functools.wraps(method)
+  def inner(*args, **kwargs):
+    return func.__get__(instance_ref(), cls)(*args, **kwargs)
+
+  del method
+  return inner
+
+
+def assert_thresholds_range(thresholds):
+  if thresholds is not None:
+    invalid_thresholds = [t for t in thresholds if t is None or t < 0 or t > 1]
+    if invalid_thresholds:
+      raise ValueError(
+          'Threshold values must be in [0, 1]. Invalid values: {}'.format(
+              invalid_thresholds))
+
+
+def parse_init_thresholds(thresholds, default_threshold=0.5):
+  if thresholds is not None:
+    assert_thresholds_range(to_list(thresholds))
+  thresholds = to_list(default_threshold if thresholds is None else thresholds)
+  return thresholds
+
+
+class ConfusionMatrix(Enum):
+  TRUE_POSITIVES = 'tp'
+  FALSE_POSITIVES = 'fp'
+  TRUE_NEGATIVES = 'tn'
+  FALSE_NEGATIVES = 'fn'
+
+
+class AUCCurve(Enum):
+  ROC = 'ROC'
+  PR = 'PR'
+
+
+class AUCSummationMethod(Enum):
+  INTERPOLATION = 'interpolation'
+  MAJORING = 'majoring'
+  MINORING = 'minoring'
+
+
+def update_confusion_matrix_variables(variables_to_update,
+                                      y_true,
+                                      y_pred,
+                                      thresholds,
+                                      top_k=None,
+                                      class_id=None,
+                                      sample_weight=None):
+  """Returns op to update the given confusion matrix variables.
+
+  For every pair of values in y_true and y_pred:
+
+  true_positive: y_true == True and y_pred > thresholds
+  false_negatives: y_true == True and y_pred <= thresholds
+  true_negatives: y_true == False and y_pred <= thresholds
+  false_positive: y_true == False and y_pred > thresholds
+
+  The results will be weighted and added together. When multiple thresholds are
+  provided, we will repeat the same for every threshold.
+
+  For estimation of these metrics over a stream of data, the function creates an
+  `update_op` operation that updates the given variables.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use weights of 0 to mask values.
+
+  Args:
+    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
+      and corresponding variables to update as values.
+    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
+    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
+      the range `[0, 1]`.
+    thresholds: A float value or a python list or tuple of float thresholds in
+      `[0, 1]`, or NEG_INF (used when top_k is set).
+    top_k: Optional int, indicates that the positive labels should be limited to
+      the top k predictions.
+    class_id: Optional int, limits the prediction and labels to the class
+      specified by this argument.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `y_true` dimension).
+
+  Returns:
+    Update op.
+
+  Raises:
+    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
+      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
+      `variables_to_update` contains invalid keys.
+  """
+  if variables_to_update is None:
+    return
+  y_true = ops.convert_to_tensor(y_true)
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_pred.shape.assert_is_compatible_with(y_true.shape)
+
+  if not any(
+      key for key in variables_to_update if key in list(ConfusionMatrix)):
+    raise ValueError(
+        'Please provide at least one valid confusion matrix '
+        'variable to update. Valid variable key options are: "{}". '
+        'Received: "{}"'.format(
+            list(ConfusionMatrix), variables_to_update.keys()))
+
+  invalid_keys = [
+      key for key in variables_to_update if key not in list(ConfusionMatrix)
+  ]
+  if invalid_keys:
+    raise ValueError(
+        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
+            invalid_keys, list(ConfusionMatrix)))
+
+  with ops.control_dependencies([
+      check_ops.assert_greater_equal(
+          y_pred,
+          math_ops.cast(0.0, dtype=y_pred.dtype),
+          message='predictions must be >= 0'),
+      check_ops.assert_less_equal(
+          y_pred,
+          math_ops.cast(1.0, dtype=y_pred.dtype),
+          message='predictions must be <= 1')
+  ]):
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        math_ops.cast(y_pred, dtype=dtypes.float32),
+        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
+
+  if top_k is not None:
+    y_pred = _filter_top_k(y_pred, top_k)
+  if class_id is not None:
+    y_true = y_true[..., class_id]
+    y_pred = y_pred[..., class_id]
+
+  thresholds = to_list(thresholds)
+  num_thresholds = len(thresholds)
+  num_predictions = array_ops.size(y_pred)
+
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(y_pred, [1, -1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
+
+  # Tile the thresholds for every prediction.
+  thresh_tiled = array_ops.tile(
+      array_ops.expand_dims(array_ops.constant(thresholds), 1),
+      array_ops.stack([1, num_predictions]))
+
+  # Tile the predictions for every threshold.
+  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
+
+  # Compare predictions and threshold.
+  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
+
+  # Tile labels by number of thresholds
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
+
+  if sample_weight is not None:
+    weights = weights_broadcast_ops.broadcast_weights(
+        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
   else:
-    for i, metric in enumerate(model.metrics):
-      metric_name = get_metric_name(metric)
-      if isinstance(metric, metrics.Metric):
-        eval_metric_ops[metric_name] = metric
-      else:
-        eval_metric_ops[metric_name] = metrics_module.mean(
-            model.metrics_tensors[i])
-  return eval_metric_ops
+    weights_tiled = None
+
+  update_ops = []
+
+  def weighted_assign_add(label, pred, weights, var):
+    label_and_pred = math_ops.cast(
+        math_ops.logical_and(label, pred), dtype=dtypes.float32)
+    if weights is not None:
+      label_and_pred *= weights
+    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
+
+  loop_vars = {
+      ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
+  }
+  update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
+  update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
+  update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
+
+  if update_fn or update_tn:
+    pred_is_neg = math_ops.logical_not(pred_is_pos)
+    loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
+
+  if update_fp or update_tn:
+    label_is_neg = math_ops.logical_not(label_is_pos)
+    loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
+    if update_tn:
+      loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
+
+  for matrix_cond, (label, pred) in loop_vars.items():
+    if matrix_cond in variables_to_update:
+      update_ops.append(
+          weighted_assign_add(label, pred, weights_tiled,
+                              variables_to_update[matrix_cond]))
+  return control_flow_ops.group(update_ops)
+
+
+def _filter_top_k(x, k):
+  """Filters top-k values in the last dim of x and set the rest to NEG_INF.
+
+  Used for computing top-k prediction values in dense labels (which has the same
+  shape as predictions) for recall and precision top-k metrics.
+
+  Args:
+    x: tensor with any dimensions.
+    k: the number of values to keep.
+
+  Returns:
+    tensor with same shape and dtype as x.
+  """
+  _, top_k_idx = nn_ops.top_k(x, k, sorted=False)
+  top_k_mask = math_ops.reduce_sum(
+      array_ops.one_hot(top_k_idx, x.shape[-1], axis=-1), axis=-2)
+  return x * top_k_mask + NEG_INF * (1 - top_k_mask)
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 8c1abd632484273a01fd99cbd72ee73b66e46f27..9c711bd2a28395279c1e8cd726084d6b9ab4e188 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -148,7 +148,6 @@ class TestMultiGPUModel(test.TestCase):
       input_shape = (num_samples,) + shape
       x_train = np.random.randint(0, 255, input_shape)
       y_train = np.random.randint(0, num_classes, (input_shape[0],))
-      keras.backend.set_learning_phase(True)
 
       y_train = keras.utils.to_categorical(y_train, num_classes)
 
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index 7b4c9e7239e2f097e0351b160bd7520ee587a8b3..dc5c4f1d905a5cd7f11e9f1b7a9ea4328207f479 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
@@ -102,6 +104,7 @@ def get_reachable_from_inputs(inputs, targets=None):
   Returns:
     A set of tensors reachable from the inputs (includes the inputs themselves).
   """
+  inputs = nest.flatten(inputs)
   reachable = set(inputs)
   if targets:
     targets = set(targets)
@@ -129,6 +132,140 @@ def get_reachable_from_inputs(inputs, targets=None):
   return reachable
 
 
+# This function needs access to private functions of `nest`.
+#  pylint: disable=protected-access
+def map_structure_with_atomic(is_atomic_fn, map_fn, nested):
+  """Maps the atomic elements of a nested structure.
+
+  Arguments:
+    is_atomic_fn: A function that determines if an element of `nested` is
+      atomic.
+    map_fn: The function to apply to atomic elements of `nested`.
+    nested: A nested structure.
+
+  Returns:
+    The nested structure, with atomic elements mapped according to `map_fn`.
+
+  Raises:
+    ValueError: If an element that is neither atomic nor a sequence is
+      encountered.
+  """
+  if is_atomic_fn(nested):
+    return map_fn(nested)
+
+  # Recursively convert.
+  if not nest.is_sequence(nested):
+    raise ValueError(
+        'Received non-atomic and non-sequence element: {}'.format(nested))
+  if nest._is_mapping(nested):
+    values = [nested[k] for k in nest._sorted(nested)]
+  else:
+    values = nested
+  mapped_values = [
+      map_structure_with_atomic(is_atomic_fn, map_fn, ele) for ele in values
+  ]
+  return nest._sequence_like(nested, mapped_values)
+
+
+#  pylint: enable=protected-access
+
+
+def convert_shapes(input_shape, to_tuples=True):
+  """Converts nested shape representations  to desired format.
+
+  Performs:
+
+  TensorShapes -> tuples if `to_tuples=True`.
+  tuples of int or None -> TensorShapes if `to_tuples=False`.
+
+  Valid objects to be converted are:
+  - TensorShapes
+  - tuples with elements of type int or None.
+  - ints
+  - None
+
+  Arguments:
+    input_shape: A nested structure of objects to be converted to TensorShapes.
+    to_tuples: If `True`, converts all TensorShape to tuples. Otherwise converts
+      all tuples representing shapes to TensorShapes.
+
+  Returns:
+    Nested structure of shapes in desired format.
+  """
+
+  def _is_shape_component(element):
+    value = tensor_shape.as_dimension(element).value
+    return value is None or isinstance(value, int)
+
+  def _is_atomic_shape(input_shape):
+    # Ex: TensorShape or (None, 10, 32) or 5 or `None`
+    if input_shape is None or isinstance(input_shape, int):
+      return True
+    if isinstance(input_shape, tensor_shape.TensorShape):
+      return True
+    if (isinstance(input_shape, tuple) and
+        all(_is_shape_component(ele) for ele in input_shape)):
+      return True
+    return False
+
+  def _convert_shape(input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if to_tuples:
+      input_shape = tuple(input_shape.as_list())
+    return input_shape
+
+  return map_structure_with_atomic(_is_atomic_shape, _convert_shape,
+                                   input_shape)
+
+
+class ListWrapper(object):
+  """A wrapper for lists to be treated as elements for `nest`."""
+
+  def __init__(self, list_to_wrap):
+    self._list = list_to_wrap
+
+  def as_list(self):
+    return self._list
+
+
+def convert_inner_node_data(nested, wrap=False):
+  """Either wraps or unwraps innermost node data lists in `ListWrapper` objects.
+
+  Arguments:
+    nested: A nested data structure.
+    wrap: If `True`, wrap innermost lists in `ListWrapper` objects. If `False`,
+      unwraps `ListWrapper` objects into lists.
+
+  Returns:
+    Strucutre of same type as nested, with lists wrapped/unwrapped.
+  """
+
+  def _is_atomic_nested(nested):
+    """Returns `True` if `nested` is a list representing node data."""
+    if isinstance(nested, ListWrapper):
+      return True
+    # Node data can be of form `[layer_name, node_id, tensor_id]` or
+    # `[layer_name, node_id, tensor_id, kwargs]`.
+    if (isinstance(nested, list) and (len(nested) in [3, 4]) and
+        isinstance(nested[0], six.string_types)):
+      return True
+    return False
+
+  def _convert_object_or_list(nested):
+    """Convert b/t `ListWrapper` object and list representations."""
+    if wrap:
+      if isinstance(nested, ListWrapper):
+        return nested
+      return ListWrapper(nested)
+    else:
+      if isinstance(nested, ListWrapper):
+        return nested.as_list()
+      return nested
+
+  return map_structure_with_atomic(_is_atomic_nested, _convert_object_or_list,
+                                   nested)
+
+
 def shape_type_conversion(fn):
   """Decorator that handles tuple/TensorShape conversion.
 
@@ -142,17 +279,15 @@ def shape_type_conversion(fn):
   """
 
   def wrapper(instance, input_shape):
+    # Pass shapes as tuples to `fn`
+    # This preserves compatibility with external Keras.
     if input_shape is not None:
-      if isinstance(input_shape, list):
-        input_shape = [
-            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
-      else:
-        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+      input_shape = convert_shapes(input_shape, to_tuples=True)
     output_shape = fn(instance, input_shape)
+    # Return shapes from `fn` as TensorShapes.
     if output_shape is not None:
-      if isinstance(output_shape, list):
-        return [tensor_shape.TensorShape(x) for x in output_shape]
-      return tensor_shape.TensorShape(output_shape)
+      output_shape = convert_shapes(output_shape, to_tuples=False)
+    return output_shape
 
   return wrapper
 
diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py
index 9833a492993feb3a989d09160919fbf85c3a21e7..9c478af4ecbbe1bb976c982e596f82ac56e2045d 100644
--- a/tensorflow/python/keras/utils/tf_utils_test.py
+++ b/tensorflow/python/keras/utils/tf_utils_test.py
@@ -130,5 +130,17 @@ class TestIsSymbolicTensor(test.TestCase):
     self.assertIsInstance(y, Foo)
 
 
+class ConvertInnerNodeDataTest(test.TestCase):
+
+  def test_convert_inner_node_data(self):
+    data = tf_utils.convert_inner_node_data((tf_utils.ListWrapper(['l', 2, 3]),
+                                             tf_utils.ListWrapper(['l', 5, 6])))
+    self.assertEqual(data, (['l', 2, 3], ['l', 5, 6]))
+
+    data = tf_utils.convert_inner_node_data(((['l', 2, 3], ['l', 5, 6])),
+                                            wrap=True)
+    self.assertTrue(all(isinstance(ele, tf_utils.ListWrapper) for ele in data))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index 82bc2755bdc7cc49c2f79fbbfbc964f3c9dd51f3..9394cc114e9e9b8366acc8de37a90b972bbd96ed 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -67,6 +67,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   """
   from tensorflow.python.keras.layers.wrappers import Wrapper
   from tensorflow.python.keras.models import Sequential
+  from tensorflow.python.util import nest
 
   _check_pydot()
   dot = pydot.Dot()
@@ -77,7 +78,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   if isinstance(model, Sequential):
     if not model.built:
       model.build()
-  layers = model.layers
+  layers = model._layers
 
   # Create graph nodes.
   for layer in layers:
@@ -120,7 +121,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
     for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
       if node_key in model._network_nodes:  # pylint: disable=protected-access
-        for inbound_layer in node.inbound_layers:
+        for inbound_layer in nest.flatten(node.inbound_layers):
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
           dot.add_edge(pydot.Edge(inbound_layer_id, layer_id))
diff --git a/tensorflow/python/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/wrappers/scikit_learn.py
index 566f9db5d4459b92b0e707df4bc8a5c391a2e9ae..b5fe7669ab89f5c2cb3e92be60673fd7029d871a 100644
--- a/tensorflow/python/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/wrappers/scikit_learn.py
@@ -23,6 +23,7 @@ import types
 
 import numpy as np
 
+from tensorflow.python.keras import losses
 from tensorflow.python.keras.models import Sequential
 from tensorflow.python.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras.utils.np_utils import to_categorical
@@ -155,10 +156,8 @@ class BaseWrapper(object):
     else:
       self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
 
-    loss_name = self.model.loss
-    if hasattr(loss_name, '__name__'):
-      loss_name = loss_name.__name__
-    if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
+    if (losses.is_categorical_crossentropy(self.model.loss) and
+        len(y.shape) != 2):
       y = to_categorical(y)
 
     fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index bd5c103b38dc1561fbcb19b326052bd4f3c6f293..aef4354ee081cb04b6544ebb5ecc2863408aaade 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -131,6 +131,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     grpc_enabled = True,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -145,6 +146,7 @@ cuda_py_test(
         "//tensorflow/python:platform_benchmark",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -161,6 +163,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:platform_benchmark",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -226,6 +229,7 @@ cuda_py_test(
     ],
     shard_count = 5,
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -282,6 +286,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -387,6 +392,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -645,6 +651,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["optonly"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -659,6 +666,7 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -671,6 +679,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -684,6 +693,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -695,6 +705,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -713,6 +724,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -795,6 +807,7 @@ cuda_py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -853,6 +866,7 @@ cuda_py_test(
         "//tensorflow/python:resource_variable_ops",
     ],
     tags = ["noasan"],  # http://b/32635055
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1159,6 +1173,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1174,6 +1189,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1290,6 +1306,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1311,6 +1328,7 @@ cuda_py_test(
         "noguitar",
         "notap",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1326,6 +1344,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["notsan"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1342,6 +1361,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1374,6 +1394,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1385,6 +1406,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1413,6 +1435,7 @@ cuda_py_test(
         "noasan",  # times out
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1425,6 +1448,7 @@ cuda_py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1441,6 +1465,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 10,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1455,6 +1480,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1468,6 +1494,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1482,11 +1509,12 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
     name = "bias_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["bias_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1497,6 +1525,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1509,6 +1538,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1527,6 +1557,7 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1542,6 +1573,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1560,6 +1592,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1600,6 +1633,7 @@ cuda_py_test(
     ],
     shard_count = 16,
     tags = ["no_gpu"],  # TODO(b/117928656)
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1640,6 +1674,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1654,6 +1689,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1668,6 +1704,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1679,6 +1716,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1692,6 +1730,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1707,6 +1746,7 @@ cuda_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1721,6 +1761,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1733,6 +1774,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["manual"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1748,6 +1790,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1762,6 +1805,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1774,6 +1818,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1786,6 +1831,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1812,6 +1858,7 @@ cuda_py_test(
     grpc_enabled = True,
     shard_count = 2,
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1827,6 +1874,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1834,12 +1882,14 @@ cuda_py_test(
     size = "medium",
     srcs = ["gather_op_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1853,6 +1903,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1880,6 +1931,7 @@ cuda_py_test(
         "noasan",
         "notap",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1897,6 +1949,7 @@ cuda_py_test(
         "//tensorflow/python/ops/linalg",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1912,6 +1965,7 @@ cuda_py_test(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_grad",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1928,6 +1982,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python/ops/linalg",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1941,6 +1996,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1958,6 +2014,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1971,6 +2028,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1986,6 +2044,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:numerics",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1999,6 +2058,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2013,6 +2073,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2025,6 +2086,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2039,6 +2101,7 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2057,6 +2120,7 @@ cuda_py_test(
         "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2070,6 +2134,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:string_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2085,6 +2150,7 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2105,6 +2171,7 @@ cuda_py_test(
         "noguitar",
         "notap",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2123,6 +2190,7 @@ cuda_py_test(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python:tf2",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2135,6 +2203,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2147,6 +2216,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2159,6 +2229,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2177,6 +2248,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:sparse_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2190,6 +2262,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2202,6 +2275,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2218,6 +2292,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2232,6 +2307,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2245,6 +2321,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2258,6 +2335,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2273,6 +2351,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2287,6 +2366,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -2331,6 +2411,7 @@ cuda_py_test(
         "//tensorflow/python:sparse_grad",
         "//tensorflow/python:sparse_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2355,6 +2436,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2369,6 +2451,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2384,6 +2467,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2396,6 +2480,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:string_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2408,6 +2493,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:parsing_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2421,6 +2507,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:summary",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2436,6 +2523,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:summary",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2466,6 +2554,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     flaky = 1,  # create_local_cluster sometimes times out.
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2478,6 +2567,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2496,6 +2586,7 @@ cuda_py_test(
         "no_oss",
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2508,6 +2599,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2525,6 +2617,7 @@ cuda_py_test(
         "//tensorflow/python:state_ops_gen",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2541,6 +2634,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2552,6 +2646,7 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2571,6 +2666,7 @@ cuda_py_test(
     tags = [
         "no_gpu",  #  Flaky: b/80127739
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2586,6 +2682,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
     ],
     tags = ["manual"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2599,6 +2696,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2617,6 +2715,7 @@ cuda_py_test(
     tags = [
         "optonly",  # flaky timeouts unless optimized
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2630,6 +2729,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2654,10 +2754,9 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = [
-        # TODO(b/118887316): Re-enable this test in Kokoro.
-        "no_oss",
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2675,6 +2774,7 @@ cuda_py_test(
     ],
     # TODO(b/118842098): Re-enable this test in Kokoro.
     tags = ["no_oss"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -2703,6 +2803,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["manual"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2716,6 +2817,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2734,6 +2836,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops_gen",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2763,6 +2866,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2778,6 +2882,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = ["optonly"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2792,6 +2897,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2808,6 +2914,7 @@ cuda_py_test(
     tags = [
         "no_oss",  # Requires 4GB+ RAM
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2821,6 +2928,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2844,6 +2952,7 @@ cuda_py_test(
         "noasan",
         "optonly",  # b/77589990
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2863,6 +2972,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 # TODO(gpapan): Revisit the gradient of extract_image_patches_op to resolve
@@ -2879,6 +2989,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["notap"],  # http://b/31080670
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2893,6 +3004,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2907,6 +3019,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2924,6 +3037,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2939,6 +3053,7 @@ cuda_py_test(
         "nomsan",
         "notsan",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2952,6 +3067,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
     ],
     shard_count = 30,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2972,6 +3088,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2992,6 +3109,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3012,6 +3130,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3037,6 +3156,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3054,6 +3174,7 @@ cuda_py_test(
         "//tensorflow/python/ops/linalg",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3067,6 +3188,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3084,6 +3206,7 @@ cuda_py_test(
     data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3099,6 +3222,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3118,6 +3242,7 @@ cuda_py_test(
         "no_oss",  # b/117185141.
         "nomsan",  # TODO(b/117236102): Re-enable in msan build.
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3137,6 +3262,7 @@ cuda_py_test(
         "no_windows_gpu",
         "nomsan",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3152,6 +3278,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 sycl_py_test(
@@ -3398,6 +3525,7 @@ cuda_py_test(
         "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3408,6 +3536,7 @@ cuda_py_test(
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -3423,4 +3552,5 @@ cuda_py_test(
         "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index f4c442b7b1932c3ddab0d255f57c3fac5a23954a..5ed549f268f61072d5ae1f54d6172a1871e6633c 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1050,10 +1050,12 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[None] = [6]  # new axis
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testSliceAssign(self):
     self.doTestSliceAssign(use_resource=False)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testSliceAssignResource(self):
     self.doTestSliceAssign(use_resource=True)
 
diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
index a13e325835cfd343eda61037b8392e83bed0f1c2..b84e76472399943279c1f9b680332f69f8ed48d8 100644
--- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py
+++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
@@ -160,7 +160,7 @@ class AtrousConv2DTest(test.TestCase):
                                                       [x_shape, f_shape],
                                                       output, y_shape)
         print("atrous_conv2d gradient err = %g " % err)
-        err_tolerance = 1e-3
+        err_tolerance = 4e-3 if test_util.is_xla_enabled() else 1e-3
         self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index bffa5e6e8f4d9125f5021eb531319f67fd6e77bb..a91f96cf952252bf162e9e708d7b8e5808aad38c 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -26,6 +26,7 @@ import numpy as np
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import gfile
@@ -125,6 +126,7 @@ class BenchmarkTest(test.TestCase):
     self.assertFalse(_ran_somebenchmark_2[0])
     self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testReportingBenchmark(self):
     tempdir = test.get_temp_dir()
     try:
diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py
index 66f442dbddb5f609e7525ba0db9809dc3943ee25..e89e2400f4f13f561615e1137884294dc37f2ba6 100644
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@@ -196,9 +196,7 @@ class BiasAddTest(test.TestCase):
       self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
 
   @test_util.run_deprecated_v1
-  def testGradientTensor(self):
-    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
-    # all dimensions are supported.
+  def testGradientTensor2D(self):
     for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         np_input = np.array(
@@ -207,9 +205,19 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
+  def testGradientTensor3D(self):
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        np_input = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+                            dtype=dtype.as_numpy_dtype).reshape(1, 3, 2)
+        bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
+        self._testGradient(np_input, bias, dtype, data_format, use_gpu)
+
   @test_util.run_deprecated_v1
   def testGradientTensor4D(self):
-    # BiasAddGrad with NCHW support 4D so all are enabled.
     for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
                                    ("NCHW", False), ("NCHW", True)]:
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
@@ -219,6 +227,18 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
+  def testGradientTensor5D(self):
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        np_input = np.arange(
+            1.0, 49.0, dtype=dtype.as_numpy_dtype).reshape(
+                [1, 2, 3, 4, 2]).astype(np.float32)
+        bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
+        self._testGradient(np_input, bias, dtype, data_format, use_gpu)
+
   @test_util.run_deprecated_v1
   def testEmpty(self):
     np.random.seed(7)
@@ -226,11 +246,17 @@ class BiasAddTest(test.TestCase):
       self._testAll(np.random.randn(*shape), np.random.randn(shape[-1]))
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testEmptyGradient(self):
-    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
-    # all dimensions are supported.
     for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
-      for shape in (0, 0), (2, 0), (0, 2), (4, 3, 0), (4, 0, 3), (0, 4, 3):
+      for shape in (0, 0), (2, 0), (0, 2):
+        self._testGradient(
+            np.random.randn(*shape), np.random.randn(shape[-1]), dtypes.float64,
+            data_format, use_gpu)
+
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for shape in (4, 3, 0), (4, 0, 3), (0, 4, 3):
         self._testGradient(
             np.random.randn(*shape),
             np.random.randn(shape[-1]), dtypes.float64, data_format, use_gpu)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index 37a60fa0e38c6d45a4ff40fcc3863226ca98e6be..0315456447dec43264e48d918b74ba3bf0e119c5 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -145,7 +145,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
 
@@ -164,7 +164,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
       save.save(sess, save_path)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
       save = saver.Saver()
@@ -177,7 +177,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
 
@@ -195,7 +195,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
       save = saver.Saver()
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index b3187e1637193a8b34f7f3668220d94d783b6170..e9be8e7d5f73c9ea6f7a0fe15d84ecba7201156b 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -157,7 +157,7 @@ class CastOpTest(test.TestCase):
       # np.float64("np.inf").astype(np.int32) is negative on x86 but positive on ppc64le
       # Numpy link to relevant discussion - https://github.com/numpy/numpy/issues/9040
       # Tensorflow link to relevant discussion - https://github.com/tensorflow/tensorflow/issues/9360
-      if platform.machine() == "ppc64le":
+      if platform.machine() == "ppc64le" or platform.machine() == "aarch64":
         self._compare(-np.inf, np.int32, i4.min, False)
         self._compare(-np.inf, np.int64, i8.min, False)
       else:
@@ -169,8 +169,13 @@ class CastOpTest(test.TestCase):
     self._compare(-np.inf, np.int64, i8.min, False)
     self.assertAllEqual(np.isnan(self._cast(np.nan, np.float32, False)), True)
     self.assertAllEqual(np.isnan(self._cast(np.nan, np.float64, False)), True)
-    self._compare(np.nan, np.int32, i4.min, False)
-    self._compare(np.nan, np.int64, i8.min, False)
+    # np.float64(np.nan).astype(np.int32) is 0 on ARM
+    if platform.machine() == "aarch64":
+      self._compare(np.nan, np.int32, 0, False)
+      self._compare(np.nan, np.int64, 0, False)
+    else:
+      self._compare(np.nan, np.int32, i4.min, False)
+      self._compare(np.nan, np.int64, i8.min, False)
 
     self._compare(np.inf, np.float32, np.inf, True)
     self._compare(np.inf, np.float64, np.inf, True)
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 95bac85027bd1709420dcfc7f96f92195f8f2472..d5f3696a9dc8a86e8a6fb75a4c59f9accf279ba9 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -889,6 +889,8 @@ class EnsureShapeTest(test.TestCase):
 
   # Dynamic shape check
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA"
+                        )  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesError(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = math_ops.divide(placeholder, 3, name="MyDivide")
@@ -902,6 +904,8 @@ class EnsureShapeTest(test.TestCase):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA"
+                        )  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesErrorDimUnknown(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index a08cfe960d005451ab5a02aff02e90a0fbcb92a0..abb71a672c13dd62eda24f0b0e31c7625ea6727a 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -163,6 +163,7 @@ class CholeskyOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(tensor3)
 
+  @test_util.disable_xla("This test never passed for XLA")  # all nan on XLA
   def testNotInvertibleCPU(self):
     # The input should be invertible.
     with self.session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 474760a93ff84be698388a7784f66445c21cd8ca..a968b061270ae00ddcb056f73cad3b215e413d1d 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.disable_all_xla("This test never passed for XLA")
 class ConcatOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -640,6 +641,8 @@ class ConcatOpTest(test.TestCase):
         output = self.evaluate(c)
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
+
+@test_util.disable_all_xla("This test never passed for XLA")
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
@@ -683,6 +686,8 @@ class ConcatOffsetTest(test.TestCase):
       self.evaluate(off)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla(
+      "This test never passed for XLA")  # Different error message on XLA
   def testSizeMismatch(self):
     cdim = constant_op.constant(1, dtypes.int32)
     s0 = constant_op.constant([2, 3, 5], dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 8fe3ba41e27aa101fd4f2e3b41b0a0b226471047..244b0bdd7fd48d8e0b4b7fb5a778123dede5fef2 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -20,7 +20,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -145,6 +147,22 @@ class CondV2Test(test.TestCase):
     self.assertEqual(cond_op.type, "If")
     return output, cond_op
 
+  def _createNestedCond(self, name):
+    """Like _createCond but creates a nested cond_v2 call as well."""
+    pred = constant_op.constant(True, name="pred")
+    x = constant_op.constant(1.0, name="x")
+
+    def true_fn():
+      return cond_v2.cond_v2(pred, lambda: x, lambda: x + 1)
+
+    def false_fn():
+      return x + 2
+
+    output = cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
+    cond_op = output.op.inputs[0].op
+    self.assertEqual(cond_op.type, "If")
+    return output, cond_op
+
   def testDefaultName(self):
     with ops.Graph().as_default():
       _, cond_op = self._createCond(None)
@@ -612,6 +630,26 @@ class CondV2Test(test.TestCase):
         # d2[x]/dx2 = 0
         self.assertEqual(false_val, [0.0])
 
+  def testGradientTapeOfCondWithResourceVariableInFunction(self):
+    with context.eager_mode():
+      v = variables.Variable(2.)
+
+      @def_function.function
+      def fnWithCond():  # pylint: disable=invalid-name
+        with backprop.GradientTape() as tape:
+          pred = constant_op.constant(True, dtype=dtypes.bool)
+
+          def true_fn():
+            return math_ops.pow(v, 3)
+
+          def false_fn():
+            return v
+
+          cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond")
+        return tape.gradient(cond, v)
+
+      self.assertAllEqual(fnWithCond(), 12.0)
+
   def testLowering(self):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
@@ -645,9 +683,14 @@ class CondV2Test(test.TestCase):
       # Build the cond_v2 in an XLA context
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
-      cond_output, _ = self._createCond("cond")
+      cond_output, cond_op = self._createCond("cond")
       xla_context.Exit()
 
+      # Check lowering attr is not set.
+      with self.assertRaises(ValueError):
+        cond_op.get_attr("_lower_using_switch_merge")
+
+      # Check the actual graph that is run.
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       run_metadata = config_pb2.RunMetadata()
       sess.run(cond_output, options=run_options, run_metadata=run_metadata)
@@ -672,6 +715,29 @@ class CondV2Test(test.TestCase):
           if_found,
           "An `If` op was not found, but the graph should not be lowered.")
 
+  @test_util.run_deprecated_v1
+  def testNestedLoweringDisabledInXLA(self):
+    # Build the cond_v2 in an XLA context
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    _, cond_op = self._createNestedCond("cond")
+    xla_context.Exit()
+
+    # Check lowering attr is not set for either If node.
+    with self.assertRaises(ValueError):
+      cond_op.get_attr("_lower_using_switch_merge")
+
+    nested_if_ops = []
+    for func in ops.get_default_graph()._functions.values():
+      nested_if_ops.extend(op for op in func._graph.get_operations()
+                           if op.type == "If")
+    self.assertEqual(len(nested_if_ops), 1)
+    with self.assertRaises(ValueError):
+      nested_if_ops[0].get_attr("_lower_using_switch_merge")
+
+    # TODO(skyewm): check the actual graphs that are run once we have a way to
+    # programmatically access those graphs.
+
   @test_util.run_deprecated_v1
   def testLoweringDisabledWithSingleThreadedExecutorContext(self):
     with self.session(graph=ops.Graph()) as sess:
@@ -719,8 +785,8 @@ class CondV2Test(test.TestCase):
       return ((x,), y * 3.0)
 
     with self.assertRaisesRegexp(
-        ValueError, "Outputs of true_fn and false_fn must"
-        " have the same structure"):
+        TypeError, "true_fn and false_fn arguments to tf.cond must have the "
+        "same number, type, and overall structure of return values."):
       control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
 
   @test_util.enable_control_flow_v2
@@ -1040,7 +1106,7 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
                 self.evaluate(cond_v2.cond_v2(constant_op.constant(True),
                                               fn2, fn2)))
         else:
-          self.skipTest("Test requrires a GPU to check GPU device placement.")
+          self.skipTest("Test requires a GPU to check GPU device placement.")
 
   def testDeviceInAndOutOfCond(self):
     with ops.Graph().as_default() as g:
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index ae13c8e32e5ed5c8f3e6b670835db66d1e7dad0f..0ea5b1f5d8c35a1d5f7e883872475fdeb97688c6 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -470,9 +470,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
       }
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Can not squeeze dim\[2\]"):
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   r"Can not squeeze dim\[2\]"):
         dynamic_labels.eval(feed_dict=feed_dict)
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
@@ -498,9 +497,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       }
       self.assertAllEqual(
           label_values, dynamic_labels.eval(feed_dict=feed_dict))
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Can not squeeze dim\[2\]"):
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   r"Can not squeeze dim\[2\]"):
         dynamic_predictions.eval(feed_dict=feed_dict)
 
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 457766c62252bfaa29c4aaaa239219492c7fa441..3f0b1b3bbc587c3134f1e7f428693a2008c66a46 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -53,6 +53,7 @@ from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
@@ -63,6 +64,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
@@ -70,6 +72,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import while_v2  # pylint: disable=unused-import
 # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_factory_ops
 import tensorflow.python.ops.tensor_array_grad
 # pylint: enable=unused-import
 from tensorflow.python.platform import test
@@ -427,23 +430,49 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
-  @test_util.run_v1_only("b/120545219")
+  def testCondMismatchedIndexedSlices(self):
+    @def_function.function
+    def foo():
+      values = constant_op.constant(10)
+      indices = constant_op.constant(0)
+      x = ops.IndexedSlices(values, indices)
+      v1_msg = "The two structures don't have the same nested structure"
+      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
+                "number, type, and overall structure of return values.")
+      with self.assertRaisesRegexp(
+          TypeError,
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
+        control_flow_ops.cond(
+            constant_op.constant(True),
+            lambda: ops.IndexedSlices(math_ops.add(x.values, 1), indices),
+            lambda: math_ops.add(x.values, 1), indices)
+    foo()
+
   def testCondSparseTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2.0, 4.0], name="values")
-      indices = constant_op.constant(
-          [[0], [3]], dtype=dtypes.int64, name="indices")
-      shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
-      x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
-      pred = math_ops.less(1, 2)
-      fn1 = lambda: sparse_tensor.SparseTensor(
-          indices + 1, x.values + 1, dense_shape=shape)
-      fn2 = lambda: sparse_tensor.SparseTensor(
-          indices, x.values - 1, dense_shape=shape)
-      r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([3.0, 5.0], r.values)
-      self.assertAllEqual([[1], [4]], r.indices)
-      self.assertAllEqual(r.values.get_shape(), (2,))
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: sparse_tensor.SparseTensor(
+        indices + 1, x.values + 1, dense_shape=shape)
+    fn2 = lambda: sparse_tensor.SparseTensor(
+        indices, x.values - 1, dense_shape=shape)
+    r = control_flow_ops.cond(pred, fn1, fn2)
+    self.assertAllEqual([3.0, 5.0], r.values)
+    self.assertAllEqual([[1], [4]], r.indices)
+    self.assertAllEqual(r.values.get_shape(), (2,))
+
+  def testCondRaggedTensor(self):
+    rt = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: array_ops.concat([rt + 2, [[100]]], axis=0)
+    fn2 = lambda: rt[:2] - 2
+    result = control_flow_ops.cond(pred, fn1, fn2)
+    self.assertAllEqual([3, 4, 5, 6, 7, 8, 100], result.values)
+    self.assertAllEqual([0, 2, 3, 6, 7], result.row_splits)
 
   @test_util.run_v1_only("b/120545219")
   def testCondResource(self):
@@ -704,12 +733,12 @@ class ControlFlowTest(test.TestCase):
       fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
       fn2 = lambda: {"c": y, "d": y}
       v1_msg = "The two structures don't have the same nested structure"
-      v2_msg = "Outputs of true_fn and false_fn must have the same structure"
+      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
+                "number, type, and overall structure of return values.")
       with self.assertRaisesRegexp(
-          ValueError,
+          TypeError if control_flow_util.ENABLE_CONTROL_FLOW_V2 else ValueError,
           v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
-        r = control_flow_ops.cond(pred, fn1, fn2)
-        self.evaluate(r)
+        control_flow_ops.cond(pred, fn1, fn2)
 
   @test_util.run_deprecated_v1
   def testCondRef(self):
@@ -914,6 +943,68 @@ class ControlFlowTest(test.TestCase):
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  @test_util.run_deprecated_v1
+  def testCondGrad_ResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(
+        np.ones((4, 2), dtype=np.float32))
+    x = constant_op.constant(1.0)
+    r = control_flow_ops.cond(
+        constant_op.constant(True),
+        lambda: x * math_ops.reduce_sum(var.sparse_read([1, 2])),
+        lambda: constant_op.constant(np.zeros((2, 3)),
+                                     dtype=dtypes.float32))
+    grad = gradients_impl.gradients(r, var)[0]
+
+    self.evaluate(variables.global_variables_initializer())
+    grad_val = self.evaluate(grad)
+    self.assertIsInstance(grad_val, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad_val), [[0., 0.],
+                                                                  [1., 1.],
+                                                                  [1., 1.],
+                                                                  [0., 0.]])
+
+  def testCondGrad_MultiGather(self):
+    # NOTE(skyewm): this test is interesting because the array_ops.gather and
+    # ResourceVariable.sparse_read gradient functions returns IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(
+        np.ones((4, 2), dtype=np.float32))
+    x1 = constant_op.constant(np.ones((3, 3), dtype=np.float32))
+    x2 = constant_op.constant(2.0)
+
+    def true_fn():
+      y1 = var.sparse_read([1, 2])
+      y2 = array_ops.gather(x1, [2]) * x2
+      y3 = x2 * [1., 1., 1.]
+      return y1, y2, y3
+
+    def false_fn():
+      y1 = np.zeros((2, 2), dtype=np.float32)
+      y2 = array_ops.gather(x1, [2]) * x2
+      y3 = array_ops.gather(x1, [2])
+      return y1, y2, y3
+
+    @def_function.function
+    def foo():
+      r = control_flow_ops.cond(constant_op.constant(True), true_fn, false_fn)
+      return gradients_impl.gradients(r, [var, x1, x2])
+
+    grad = foo()
+    self.evaluate(variables.global_variables_initializer())
+    var_grad, x1_grad, x2_grad = self.evaluate(grad)
+    self.assertIsInstance(var_grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var_grad), [[0., 0.],
+                                                                  [1., 1.],
+                                                                  [1., 1.],
+                                                                  [0., 0]])
+    self.assertIsInstance(x1_grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(x1_grad), [[0., 0., 0.],
+                                                                 [0., 0., 0.],
+                                                                 [2., 2., 2.]])
+    self.assertIsInstance(x1_grad, ops.IndexedSlicesValue)
+    self.assertEqual(gradient_checker_v2._to_numpy(x2_grad), 6.)
+
   @test_util.run_v1_only("b/120545219")
   def testCondPredicateTensor(self):
     """Regression test for lowering predicate from non-first output of an op."""
@@ -1065,7 +1156,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(printed.contents(), "D\nD\n")
 
   # Microbenchmark: 256,000 iterations/s.
-  @test_util.disable_control_flow_v2("b/116630618 (Times out)")
   def testWhile_1(self):
     with self.cached_session():
       n = constant_op.constant(0)
@@ -1080,7 +1170,7 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       v = variables.Variable(0.0)
       v.initializer.run()
-      increment = v.assign_add(1.0)
+      increment = v.assign_add(1.0).read_value()
 
       def body_fn(i):
         with ops.control_dependencies([increment]):
@@ -1097,7 +1187,8 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       v = variables.Variable(0.0)
       v.initializer.run()
-      increment = v.assign_add(1.0)
+      # TODO(apassos): figure out why the reading is necessary here.
+      increment = v.assign_add(1.0).read_value()
 
       def body_fn(unused_i):
         with ops.control_dependencies([increment]):
@@ -1182,6 +1273,8 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+      self.skipTest("WhileV2 does lazy evaluation of maximum_iterations")
     v = constant_op.constant(1.0)
 
     def inner_body(i, x):
@@ -1202,44 +1295,27 @@ class ControlFlowTest(test.TestCase):
     gs = gradients_impl.gradients(loop_no_xla, v)
     self.evaluate(gs)  # This should execute without error.
 
-    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
-      xla_context = control_flow_ops.XLAControlFlowContext()
-      xla_context.Enter()
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations is None. It is required and must be statically "
-          r"known \(e.g. a constant value or known shape dimension\) when "
-          r"building while_loop in XLA context."):
-        loop_no_maxiter = create_while_loop()
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations must be statically "
-          r"known \(e.g. a constant value or known shape dimension\) when "
-          r"building while_loop in XLA context."):
-        loop_with_maxiter = create_while_loop(maximum_iterations=2)
-      xla_context.Exit()
-    else:
-      xla_context = control_flow_ops.XLAControlFlowContext()
-      xla_context.Enter()
-      loop_no_maxiter = create_while_loop()
-      loop_with_maxiter = create_while_loop(maximum_iterations=2)
-      xla_context.Exit()
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    loop_no_maxiter = create_while_loop()
+    loop_with_maxiter = create_while_loop(maximum_iterations=2)
+    xla_context.Exit()
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"Cannot create a gradient accumulator for tensor '.+' inside "
-          r"XLA while_loop because maximum_iterations was not passed to "
-          r"the tf.while_loop call \('.+'\)."):
-        _ = gradients_impl.gradients(loop_no_maxiter, v)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside "
+        r"XLA while_loop because maximum_iterations was not passed to "
+        r"the tf.while_loop call \('.+'\)."):
+      _ = gradients_impl.gradients(loop_no_maxiter, v)
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-          r"while_loop. maximum_iterations tensor '.+' for while_loop context "
-          r"'.+' must be statically known \(e.g. a constant value or known "
-          r"shape dimension\), or be defined at or outside the while loop "
-          r"context '.*' \(currently defined in '.*'\)"):
-        _ = gradients_impl.gradients(loop_with_maxiter, v)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+        r"while_loop. maximum_iterations tensor '.+' for while_loop context "
+        r"'.+' must be statically known \(e.g. a constant value or known "
+        r"shape dimension\), or be defined at or outside the while loop "
+        r"context '.*' \(currently defined in '.*'\)"):
+      _ = gradients_impl.gradients(loop_with_maxiter, v)
 
   @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
@@ -1264,10 +1340,7 @@ class ControlFlowTest(test.TestCase):
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
       with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations must be statically known \(e.g. a constant value"
-          r" or known shape dimension\) when building while_loop in XLA "
-          r"context."):
+          ValueError, r"Tensor.*Placeholder:0.* must be from the same graph.*"):
         loop = create_while_loop()
       xla_context.Exit()
     else:
@@ -1537,35 +1610,95 @@ class ControlFlowTest(test.TestCase):
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
   @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2.0, 4.0], name="values")
-      indices = constant_op.constant(
-          [[0], [3]], dtype=dtypes.int64, name="indices")
-      shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
-      i = constant_op.constant(0)
-      x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
-
-      def c(i, _):
-        return i < 10
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    i = constant_op.constant(0)
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+
+    def c(i, _):
+      return i < 10
+
+    def b1(i, x):  # modifies values.  (shape of components is not changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
+      ]
 
-      def b(i, x):
-        return [
-            i + 1,
-            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
-        ]
+    def b2(i, x):  # adds new values.  (shape of components is changed.)
+      return [
+          i + 1,
+          sparse_ops.sparse_add(
+              x,
+              sparse_tensor.SparseTensor(
+                  indices=math_ops.cast(
+                      array_ops.fill([1, 1], i), dtypes.int64),
+                  values=array_ops.fill([1], 1.0),
+                  dense_shape=x.dense_shape))
+      ]
 
-      _, r = control_flow_ops.while_loop(c, b, [i, x])
-      self.assertEqual(r.dense_shape.get_shape()[0], 1)
+    def b3(i, x):  # modifies rank.  (shape of all components is changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(
+              array_ops.concat([x.indices, [[i], [i]]], axis=1), x.values * 2.0,
+              array_ops.concat([x.dense_shape, [10]], axis=0))
+      ]
 
+    # Default shape invariant; b1 only modifies values.
+    _, r = control_flow_ops.while_loop(c, b1, [i, x])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+
+    # Default shape invariant; b2 adds new values
+    _, r = control_flow_ops.while_loop(c, b2, [i, x])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+
+    # Default shape invariant; b3 modifies rank (which is not allowed).
+    with self.assertRaises(ValueError):
+      _, r = control_flow_ops.while_loop(c, b3, [i, x])
+
+    # Explicit shape invariant, allowing any rank; b1 only modifies values.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None])])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Explicit shape invariant, allowing any rank; b3 modifies rank.
+    _, r = control_flow_ops.while_loop(
+        c, b3, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None])])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Shape invariant with ndims=None.  Technically, this isn't supported
+    # according to the docs, but we support it for backwards compatibility.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape(None)])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    _, r = control_flow_ops.while_loop(
+        c, b3, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape(None)])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Explicit shape invariant, with a specific (incompatible) rank.
+    with self.assertRaisesRegexp(ValueError, "is not compatible with"):
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x],
-          [i.get_shape(), tensor_shape.TensorShape([None])])
-      self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
-
-      with self.assertRaisesRegexp(ValueError, "is not compatible with"):
-        _, r = control_flow_ops.while_loop(
-            c, b, [i, x],
-            [i.get_shape(), tensor_shape.TensorShape([5])])
+          c, b1, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([5])])
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
   @test_util.run_v1_only("b/120545219")
@@ -1601,6 +1734,65 @@ class ControlFlowTest(test.TestCase):
             c, b, [i, x],
             [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
+  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+  def testWhileShapeInferenceRaggedTensor(self):
+    i = constant_op.constant(0)
+    x = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
+    c = lambda i, _: i < 10
+
+    def b1(i, x):  # Adds new values to rows (but doesn't create new rows)
+      return [
+          i + 1,
+          array_ops.concat([x, x], axis=1)
+      ]
+
+    def b2(i, x):  # Adds new rows.
+      return [
+          i + 1,
+          array_ops.concat([x, x], axis=0)
+      ]
+
+    # Default shape invariant; b1 adds new values to rows.
+    _, r = control_flow_ops.while_loop(c, b1, [i, x])
+    self.assertEqual(r.row_splits.shape.as_list(), [4])
+
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+    # Default shape invariant; b2 adds new rows (not allowed).
+    if not context.executing_eagerly():
+      with self.assertRaises(ValueError):
+        _, r = control_flow_ops.while_loop(c, b2, [i, x])
+
+    # Explicit shape invariant; b1 adds new values to rows.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None, None])])
+    self.assertTrue(r.row_splits.shape.as_list() in ([4], [None]))
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+    # Explicit shape invariant; b2 adds new rows.
+    _, r = control_flow_ops.while_loop(
+        c, b2, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None, None])])
+    self.assertTrue(r.row_splits.shape.as_list() in ([3 * 2**10 + 1], [None]))
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+  def testWhileShapeInferenceRaggedTensorRaggedRank2(self):
+    i = constant_op.constant(0)
+    x = ragged_factory_ops.constant([[[1, 2], [3], [4, 5, 6]],
+                                     [[], [8, 9, 10]]])
+    c = lambda i, _: i < 10
+    def b(i, x):
+      return [
+          i + 1,
+          array_ops.concat([x, x[..., i:i+1]], axis=-1)
+      ]
+    _, r = control_flow_ops.while_loop(c, b, [i, x])
+    self.assertEqual(r.row_splits.shape.as_list(), [3])
+    self.assertTrue(r.values.row_splits.shape.as_list() in ([6], [None]))
+    self.assertTrue(r.values.values.shape.as_list() in ([49], [None]))
+
   def _testNestedWhile_1(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
@@ -1834,7 +2026,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(0.0, sess.run(r, {p: False}))
       self.assertEqual([2.0], sess.run(r1, {p: False}))
 
-  @test_util.disable_control_flow_v2("b/116743589")
   @test_util.run_deprecated_v1
   def testCondWhile_3(self):
     self._testCondWhile_3(use_gpu=False)
@@ -2262,7 +2453,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(216.0, grad_a_val)
       self.assertAllClose(81.0, grad_v_val)
 
-  @test_util.disable_control_flow_v2("b/116630618 (parallel_iters: times out)")
   @test_util.run_deprecated_v1
   def testWhileGrad_Mul(self):
     self._testWhileGrad_Mul(use_gpu=False, p_iters=1)
@@ -2399,6 +2589,8 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
+  @test_util.disable_xla("This test never passed for XLA"
+                        )  # Resource variable issue for ControlFlowV2
   @test_util.run_gpu_only
   def testGpuResourceAccess(self):
     with ops.device(test.gpu_device_name()):
@@ -2772,13 +2964,11 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(1024.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/117519152")
   @test_util.run_deprecated_v1
   def testWhileCondGrad_Simple(self):
     self._testWhileCondGrad_Simple(use_gpu=False)
     self._testWhileCondGrad_Simple(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/117276490")
   @test_util.run_deprecated_v1
   def testWhileCondGrad_UnknownShape(self):
     with self.cached_session() as sess:
@@ -2899,7 +3089,6 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r.values, values)[0]
       self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   @test_util.run_v1_only("b/120545219")
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
@@ -3062,7 +3251,6 @@ class ControlFlowTest(test.TestCase):
       all_ops = x.graph.get_operations()
       self.assertFalse(any(name in op.name for op in all_ops))
 
-  @test_util.disable_control_flow_v2("b/117954949")
   @test_util.run_deprecated_v1
   def testWhileGradGradFail(self):
     theta = variables.Variable(initial_value=1.)
@@ -3133,7 +3321,6 @@ class ControlFlowTest(test.TestCase):
       self.evaluate(q.initializer)
       self.assertAllClose([1., 1.], self.evaluate(dy_dq))
 
-  @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   @test_util.run_v1_only("b/120545219")
   def testIssue16504(self):
     c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
@@ -3654,6 +3841,21 @@ class ControlFlowTest(test.TestCase):
       result = func(qint)
       self.evaluate(result)
 
+  def testSparseIdentity(self):
+    st1 = sparse_tensor.SparseTensor([[0, 5]], ['x'], [10, 10])
+    st2 = control_flow_ops._Identity(st1)
+    self.assertAllEqual(st1.indices, st2.indices)
+    self.assertAllEqual(st1.values, st2.values)
+    self.assertAllEqual(st1.dense_shape, st2.dense_shape)
+
+  def testSparseEnterExit(self):
+    st1 = sparse_tensor.SparseTensor([[0, 5]], ['x'], [10, 10])
+    st2 = control_flow_ops._Enter(st1, "foo_1")
+    st3 = control_flow_ops.exit(st2)
+    self.assertAllEqual(st1.indices, st3.indices)
+    self.assertAllEqual(st1.values, st3.values)
+    self.assertAllEqual(st1.dense_shape, st3.dense_shape)
+
 
 class ControlFlowContextCheckTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 2f6f3bb383b381de1dac78cc72882fe5fe4291c9..d9b908be1e7d813b6a37a0e4e26fd008e3d20526 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -26,13 +26,18 @@ import numpy as np
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import layers
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
@@ -165,6 +170,12 @@ class Conv2DTest(test.TestCase):
       # as we will be using its gradients as reference for fp16 gradients.
       return [dtypes.float32, dtypes.float16, dtypes.float64]
 
+  def _CreateNumpyTensor(self, shape):
+    total_size = 1
+    for s in shape:
+      total_size *= s
+    return np.arange(1, total_size + 1, dtype=np.float32).reshape(shape)
+
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
                             strides, padding, data_format, dtype, use_gpu):
     """Verifies the output values of the convolution function.
@@ -183,26 +194,22 @@ class Conv2DTest(test.TestCase):
     Returns:
       Symbolic tensor value that can be used to execute the computation
     """
-    total_size_1 = 1
-    total_size_2 = 1
-    for s in tensor_in_sizes:
-      total_size_1 *= s
-    for s in filter_in_sizes:
-      total_size_2 *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    x1 = self._CreateNumpyTensor(tensor_in_sizes)
+    x2 = self._CreateNumpyTensor(filter_in_sizes)
 
     with test_util.device(use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
       strides = [1] + strides + [1]
       dilations = [1] + dilations + [1]
+      if isinstance(padding, (list, tuple)):
+        padding = [(0, 0)] + padding + [(0, 0)]
       if data_format == "NCHW":
         t1 = test_util.NHWCToNCHW(t1)
         strides = test_util.NHWCToNCHW(strides)
         dilations = test_util.NHWCToNCHW(dilations)
+        if isinstance(padding, (list, tuple)):
+          padding = test_util.NHWCToNCHW(padding)
       conv = nn_ops.conv2d(
           t1,
           t2,
@@ -249,22 +256,13 @@ class Conv2DTest(test.TestCase):
       tensors.append(_SetupVal(data_format, use_gpu))
     values = self.evaluate(tensors)
     for i in range(1, len(values)):
-      self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
+      self.assertAllClose(values[0], values[i], rtol=1e-3, atol=1e-3)
 
   def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
                                    stride, dilation, padding, data_format,
                                    use_gpu):
-    total_size_1 = 1
-    total_size_2 = 1
-    for s in tensor_in_sizes:
-      total_size_1 *= s
-    for s in filter_in_sizes:
-      total_size_2 *= s
-
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    x1 = self._CreateNumpyTensor(tensor_in_sizes)
+    x2 = self._CreateNumpyTensor(filter_in_sizes)
     with test_util.device(use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
@@ -299,7 +297,7 @@ class Conv2DTest(test.TestCase):
     return expected, computed
 
   def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, strides,
-                               padding, dilations):
+                               padding, dilations, rtol=1e-4):
     expected_results = []
     computed_results = []
     for data_format, use_gpu in GetTestConfigs():
@@ -312,16 +310,29 @@ class Conv2DTest(test.TestCase):
       expected_values = self.evaluate(expected_results)
       computed_values = self.evaluate(computed_results)
       for e_value, c_value in zip(expected_values, computed_values):
-        tf_logging.info("expected = ", e_value)
-        tf_logging.info("actual = ", c_value)
+        tf_logging.debug("expected = %s", e_value)
+        tf_logging.debug("actual = %s", c_value)
         self.assertAllClose(
-            e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
+            e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=rtol)
 
-  def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, strides, padding,
-                    expected):
+  def _VerifyValues(self,
+                    tensor_in_sizes,
+                    filter_in_sizes,
+                    strides,
+                    padding,
+                    expected,
+                    dilations=(1, 1),
+                    gpu_only=False,
+                    test_grappler_layout_optimizer=False,
+                    tol=1e-5,
+                    fp16_tol=1e-3):
+    if gpu_only and not test.is_gpu_available(cuda_only=True):
+      return
     tensors = []
-    dilations = [1, 1]
+    dilations = list(dilations)
     for (data_format, use_gpu) in GetTestConfigs():
+      if gpu_only and not use_gpu:
+        continue
       for dtype in self._DtypesToTest(use_gpu):
         result = self._SetupValuesForDevice(
             tensor_in_sizes,
@@ -332,19 +343,71 @@ class Conv2DTest(test.TestCase):
             data_format,
             dtype,
             use_gpu=use_gpu)
+        if test_grappler_layout_optimizer and data_format == "NHWC" and use_gpu:
+          # Grappler's layout optimizer will not optimize a fetch node, so
+          # this identity allows Grappler to optimize the Conv2D node.
+          result = array_ops.identity(result)
         tensors.append(result)
       values = self.evaluate(tensors)
       for i in range(len(tensors)):
         conv = tensors[i]
         value = values[i]
-        tf_logging.info("expected = ", expected)
-        tf_logging.info("actual = ", value)
-        tol = 1e-5
-        if value.dtype == np.float16:
-          tol = 1e-3
-        self.assertAllClose(expected, np.ravel(value), atol=tol, rtol=tol)
+        tf_logging.debug("expected = %s", expected)
+        tf_logging.debug("actual = %s", value)
+        tol_to_use = fp16_tol if value.dtype == np.float16 else tol
+        self.assertAllClose(expected, np.ravel(value), atol=tol_to_use,
+                            rtol=tol_to_use)
         self.assertShapeEqual(value, conv)
 
+  def _VerifyExplicitPaddings(self,
+                              tensor_in_sizes,
+                              filter_in_sizes,
+                              strides,
+                              padding,
+                              dilations=(1, 1),
+                              test_grappler_layout_optimizer=False,
+                              tol=1e-5,
+                              fp16_tol=1e-3):
+    """Verifies Conv2D with explicit padding generates correct values.
+
+    It does this by comparing with Conv2D without explicit padding. This
+    function assumes Conv2D without explicit padding works correctly.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in [batch, input_rows,
+        input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in [kernel_rows, kernel_cols,
+        input_depth, output_depth].
+      strides: [row_stride, col_stride] for the convolution;
+      padding: Explicit padding amounts.
+      dilations: Dilation values
+      test_grappler_layout_optimizer: If True, allow the Grappler layout
+        optimizer to run, which turns NHWC Conv2Ds on the GPU to NCHW Conv2Ds.
+      tol: The absolute and relative tolerance for non-fp16 dtypes.
+      fp16_tol: The absolute and relative tolerance for fp16.
+    """
+    input_tensor = self._CreateNumpyTensor(tensor_in_sizes)
+    filter_tensor = self._CreateNumpyTensor(filter_in_sizes)
+    input_tensor = array_ops.pad(input_tensor, [(0, 0)] + padding + [(0, 0)])
+    dilations = list(dilations)
+    conv2d_result = nn_ops.conv2d(
+        input_tensor,
+        filter_tensor, [1] + list(strides) + [1],
+        "VALID",
+        dilations=[1] + dilations + [1])
+    expected = list(self.evaluate(array_ops.reshape(conv2d_result, [-1])))
+    self._VerifyValues(
+        tensor_in_sizes,
+        filter_in_sizes,
+        strides,
+        padding,
+        expected,
+        dilations,
+        gpu_only=True,
+        test_grappler_layout_optimizer=test_grappler_layout_optimizer,
+        tol=tol,
+        fp16_tol=fp16_tol)
+
   @test_util.run_in_graph_and_eager_modes
   def testConv2D1x1Filter(self):
     expected_output = [
@@ -510,6 +573,134 @@ class Conv2DTest(test.TestCase):
         dilations=[2, 2],
         padding="VALID")
 
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D0x0Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        padding=[[0, 0], [0, 0]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[3, 4, 3, 2],
+        filter_in_sizes=[1, 1, 2, 1],
+        strides=[2, 2],
+        padding=[[0, 0], [0, 0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D1x1Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        strides=[1, 1],
+        padding=[[1, 1], [1, 1]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 2, 1],
+        filter_in_sizes=[1, 1, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 1], [1, 1]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 2],
+        filter_in_sizes=[2, 1, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 2], [2, 2]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 2],
+        filter_in_sizes=[1, 1, 2, 1],
+        strides=[2, 1],
+        padding=[[2, 2], [2, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2DOnlyBottomPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 2],
+        strides=[1, 1],
+        padding=[[0, 3], [0, 0]], tol=2e-5)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[2, 2, 4, 3],
+        filter_in_sizes=[1, 2, 3, 2],
+        strides=[2, 2],
+        padding=[[0, 3], [0, 0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2DOnlyTopRightPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 2]],
+        tol=5e-5)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 4, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        strides=[1, 3],
+        padding=[[1, 0], [0, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2DLotsPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 1, 1, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        padding=[[3, 4], [4, 2]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 1],
+        filter_in_sizes=[2, 2, 1, 3],
+        strides=[2, 1],
+        padding=[[3, 4], [4, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2DExplicitPaddingWithDilations(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 3, 2, 1],
+        filter_in_sizes=[1, 2, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 1]],
+        dilations=[2, 1])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[3, 2, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 1], [1, 2]],
+        dilations=[2, 3])
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2DExplicitPaddingWithLayoutOptimizer(self):
+    # Test with Grappler's layout optimizer, to ensure the layout optimizer
+    # handles explicit padding correctly.
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 3, 2, 1],
+        filter_in_sizes=[1, 2, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 1]],
+        dilations=[2, 1],
+        test_grappler_layout_optimizer=True)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[3, 2, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 1], [1, 2]],
+        dilations=[2, 3],
+        test_grappler_layout_optimizer=True)
+
   # TODO(yzhwang): this currently fails.
   # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
   #                   filter_in_sizes=[2, 2, 1, 1],
@@ -517,19 +708,22 @@ class Conv2DTest(test.TestCase):
   #                   expected=[72, 112, 392, 432])
 
   # Testing for backprops
-  def _RunAndVerifyBackpropInput(self, input_sizes, filter_sizes, output_sizes,
-                                 strides, padding, expected, data_format,
-                                 use_gpu, err):
-    total_output_size = 1
-    total_filter_size = 1
-    for s in output_sizes:
-      total_output_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_filter_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_output_size + 1)]
+  def _RunAndVerifyBackpropInput(self,
+                                 input_sizes,
+                                 filter_sizes,
+                                 output_sizes,
+                                 strides,
+                                 padding,
+                                 expected,
+                                 data_format,
+                                 use_gpu,
+                                 err,
+                                 dilations=(1, 1)):
+    if use_gpu and not test.is_gpu_available(cuda_only=True):
+      return
+    x1 = self._CreateNumpyTensor(filter_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
     with test_util.device(use_gpu):
       if data_format == "NCHW":
         input_sizes = test_util.NHWCToNCHW(input_sizes)
@@ -537,18 +731,30 @@ class Conv2DTest(test.TestCase):
       t1 = constant_op.constant(x1, shape=filter_sizes)
       t2 = constant_op.constant(x2, shape=output_sizes)
       strides = [1] + strides + [1]
+      dilations = [1] + dilations + [1]
+      if isinstance(padding, (list, tuple)):
+        padding = [(0, 0)] + padding + [(0, 0)]
       if data_format == "NCHW":
         t2 = test_util.NHWCToNCHW(t2)
         strides = test_util.NHWCToNCHW(strides)
+        dilations = test_util.NHWCToNCHW(dilations)
+        if isinstance(padding, (list, tuple)):
+          padding = test_util.NHWCToNCHW((padding))
       conv = nn_ops.conv2d_backprop_input(
-          t0, t1, t2, strides=strides, padding=padding, data_format=data_format)
+          t0,
+          t1,
+          t2,
+          strides=strides,
+          padding=padding,
+          data_format=data_format,
+          dilations=dilations)
       if data_format == "NCHW":
         conv = test_util.NCHWToNHWC(conv)
       # "values" consists of two tensors for two backprops
       value = self.evaluate(conv)
       self.assertShapeEqual(value, conv)
-    tf_logging.info("expected = ", expected)
-    tf_logging.info("actual = ", value)
+    tf_logging.debug("expected = %s", expected)
+    tf_logging.debug("actual = %s", value)
     self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes,
@@ -691,41 +897,51 @@ class Conv2DTest(test.TestCase):
           err=1e-5)
 
   # Testing for backprops
-  def _RunAndVerifyBackpropFilter(self, input_sizes, filter_sizes, output_sizes,
-                                  strides, padding, expected, data_format,
-                                  use_gpu):
-    total_input_size = 1
-    total_output_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in output_sizes:
-      total_output_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x0 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_output_size + 1)]
+  def _RunAndVerifyBackpropFilter(self,
+                                  input_sizes,
+                                  filter_sizes,
+                                  output_sizes,
+                                  strides,
+                                  padding,
+                                  expected,
+                                  data_format,
+                                  use_gpu,
+                                  dilations=(1, 1),
+                                  err=1e-5):
+    x0 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+    explicit_strides = [1] + strides + [1]
+    new_padding = padding
+    new_dilations = [1] + dilations + [1]
+    if isinstance(new_padding, (list, tuple)):
+      new_padding = [(0, 0)] + new_padding + [(0, 0)]
+    if data_format == "NCHW":
+      explicit_strides = test_util.NHWCToNCHW(explicit_strides)
+      new_dilations = test_util.NHWCToNCHW(new_dilations)
+      if isinstance(padding, (list, tuple)):
+        new_padding = test_util.NHWCToNCHW(new_padding)
     for dtype in self._DtypesToTest(use_gpu=use_gpu):
       with test_util.device(use_gpu):
         t0 = constant_op.constant(x0, shape=input_sizes, dtype=dtype)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = constant_op.constant(x2, shape=output_sizes, dtype=dtype)
-        explicit_strides = [1] + strides + [1]
         if data_format == "NCHW":
           t0 = test_util.NHWCToNCHW(t0)
           t2 = test_util.NHWCToNCHW(t2)
-          explicit_strides = test_util.NHWCToNCHW(explicit_strides)
         conv = nn_ops.conv2d_backprop_filter(
             t0,
             t1,
             t2,
             strides=explicit_strides,
-            padding=padding,
+            padding=new_padding,
+            dilations=new_dilations,
             data_format=data_format)
         value = self.evaluate(conv)
         self.assertShapeEqual(value, conv)
-      tf_logging.info("expected = ", expected)
-      tf_logging.info("actual = ", value)
-      self.assertArrayNear(expected, value.flatten(), 1e-5)
+      tf_logging.debug("expected = %s", expected)
+      tf_logging.debug("actual = %s", value)
+      self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes,
                          conv_strides, padding):
@@ -866,16 +1082,8 @@ class Conv2DTest(test.TestCase):
   def _RunAndVerifyBackpropInputDilation(self, input_sizes, filter_sizes,
                                          output_sizes, strides, dilations,
                                          padding, data_format, use_gpu, err):
-    total_input_size = 1
-    total_filter_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    x1 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(filter_sizes)
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -912,24 +1120,16 @@ class Conv2DTest(test.TestCase):
         value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      tf_logging.debug("expected = %s", value_2)
+      tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   # Testing for backprops
   def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes,
                                           output_sizes, strides, dilations,
                                           padding, data_format, use_gpu, err):
-    total_input_size = 1
-    total_filter_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    x1 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(filter_sizes)
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -965,8 +1165,8 @@ class Conv2DTest(test.TestCase):
         value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      tf_logging.debug("expected = %s", value_2)
+      tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
@@ -1111,20 +1311,357 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  def _RunAndVerifyBackpropInputExplicitPadding(self,
+                                                input_sizes,
+                                                filter_sizes,
+                                                output_sizes,
+                                                strides,
+                                                padding,
+                                                data_format,
+                                                dilations=(1, 1),
+                                                err=2e-5):
+    x1 = self._CreateNumpyTensor(filter_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+    padded_input_sizes = input_sizes[:]
+    padded_input_sizes[1] += padding[0][0] + padding[0][1]
+    padded_input_sizes[2] += padding[1][0] + padding[1][1]
+    c = nn_ops.conv2d_backprop_input(
+        padded_input_sizes,
+        x1,
+        x2,
+        strides=[1] + strides + [1],
+        padding="VALID",
+        dilations=[1] + dilations + [1])
+    c = c[:, padding[0][0]:(c.shape[1] - padding[0][1]), padding[1][0]:(
+        c.shape[2] - padding[1][1]), :]
+    expected = list(self.evaluate(array_ops.reshape(c, [-1])))
+    self._RunAndVerifyBackpropInput(
+        input_sizes,
+        filter_sizes,
+        output_sizes,
+        strides,
+        padding,
+        expected,
+        data_format,
+        use_gpu=True,
+        err=err,
+        dilations=dilations)
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding0x0BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 4, 2],
+            filter_sizes=[2, 2, 2, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[2, 2],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding1x1BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 3, 4, 2],
+            strides=[1, 1],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format, err=1e-4)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 2],
+            filter_sizes=[1, 1, 2, 1],
+            output_sizes=[1, 4, 3, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 4, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 4, 2, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            dilations=[2, 2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding2x2BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[2, 3, 1, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[2, 2, 5, 1],
+            strides=[3, 1],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 3, 4, 1],
+            strides=[1, 2],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding_1_8_4_1_BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 10, 8, 1],
+            strides=[1, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format, err=5e-5)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 5, 3, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 4, 8, 1],
+            strides=[3, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding_5_0_2_2_BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[1, 7, 7, 1],
+            strides=[1, 1],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            err=5e-5)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 4, 2, 1],
+            filter_sizes=[3, 3, 1, 1],
+            output_sizes=[1, 5, 2, 1],
+            strides=[1, 2],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 1])
+
+  def _RunAndVerifyBackpropFilterExplicitPadding(self,
+                                                 input_sizes,
+                                                 filter_sizes,
+                                                 output_sizes,
+                                                 strides,
+                                                 padding,
+                                                 data_format,
+                                                 dilations=(1, 1),
+                                                 err=1e-5):
+    x0 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+
+    x0 = np.pad(x0, [(0, 0)] + padding + [(0, 0)], "constant")
+    c = nn_ops.conv2d_backprop_filter(
+        x0,
+        filter_sizes,
+        x2,
+        strides=[1] + strides + [1],
+        padding="VALID",
+        dilations=[1] + dilations + [1])
+    expected = list(self.evaluate(array_ops.reshape(c, [-1])))
+    self._RunAndVerifyBackpropFilter(
+        input_sizes,
+        filter_sizes,
+        output_sizes,
+        strides,
+        padding,
+        expected,
+        data_format,
+        use_gpu=True,
+        dilations=dilations,
+        err=err)
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding0x0BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 4, 2],
+            filter_sizes=[2, 2, 2, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[2, 2],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding1x1BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 3, 4, 2],
+            strides=[1, 1],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            err=5e-5)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 2],
+            filter_sizes=[1, 1, 2, 1],
+            output_sizes=[1, 4, 3, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 4, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 4, 2, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            dilations=[2, 2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding2x2BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[2, 3, 1, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[2, 2, 5, 1],
+            strides=[3, 1],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 3, 4, 1],
+            strides=[1, 2],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding_1_8_4_1_BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 10, 8, 1],
+            strides=[1, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format,
+            err=1e-4)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 5, 3, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 4, 8, 1],
+            strides=[3, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  @test_util.disable_xla("This test never passed for XLA")
+  def testConv2D2x2Depth1Padding_5_0_2_2_BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[1, 7, 7, 1],
+            strides=[1, 1],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            err=1e-4)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 4, 2, 1],
+            filter_sizes=[3, 3, 1, 1],
+            output_sizes=[1, 5, 2, 1],
+            strides=[1, 2],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 1])
+
   # Gradient checkers
   def ConstructAndTestGradient(self, batch, input_rows, input_cols, filter_rows,
                                filter_cols, in_depth, out_depth, stride_rows,
                                stride_cols, padding, test_input, data_format,
-                               use_gpu):
+                               use_gpu, max_err=0.002):
     input_shape = [batch, input_rows, input_cols, in_depth]
     filter_shape = [filter_rows, filter_cols, in_depth, out_depth]
     # TODO(yangke): re-factor the computation of output shape.
     if padding == "VALID":
       output_rows = (input_rows - filter_rows + stride_rows) // stride_rows
       output_cols = (input_cols - filter_cols + stride_cols) // stride_cols
-    else:
+    elif padding == "SAME":
       output_rows = (input_rows + stride_rows - 1) // stride_rows
       output_cols = (input_cols + stride_cols - 1) // stride_cols
+    else:
+      self.assertIsInstance(padding, (list, tuple))
+      output_rows = (input_rows + padding[1][0] + padding[1][1] - filter_rows +
+                     stride_rows) // stride_rows
+      output_cols = (input_cols + padding[2][0] + padding[2][1] - filter_cols +
+                     stride_cols) // stride_cols
     output_shape = [batch, output_rows, output_cols, out_depth]
     input_size = 1
     for x in input_shape:
@@ -1145,16 +1682,19 @@ class Conv2DTest(test.TestCase):
         filter_tensor = constant_op.constant(
             filter_data, shape=filter_shape, dtype=dtype, name="filter")
         strides = [1, stride_rows, stride_cols, 1]
+        new_padding = padding
         if data_format == "NCHW":
           new_input_tensor = test_util.NHWCToNCHW(input_tensor)
           strides = test_util.NHWCToNCHW(strides)
+          if isinstance(padding, (list, tuple)):
+            new_padding = test_util.NHWCToNCHW(padding)
         else:
           new_input_tensor = input_tensor
         conv = nn_ops.conv2d(
             new_input_tensor,
             filter_tensor,
             strides,
-            padding,
+            new_padding,
             data_format=data_format,
             name="conv")
         if data_format == "NCHW":
@@ -1178,8 +1718,8 @@ class Conv2DTest(test.TestCase):
           # since fp16 numerical gradients are too imprecise.
           err = np.fabs(jacob_t - reference_jacob_t).max()
 
-        tf_logging.info("conv_2d gradient error = ", err)
-        self.assertLess(err, 0.002)
+        tf_logging.debug("conv_2d gradient error = %s", err)
+        self.assertLess(err, max_err)
 
   def testInputGradientValidPaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1436,6 +1976,260 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.disable_xla("This test never passed for XLA")
+  def testInputGradient1x1PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu,
+            max_err=0.0025)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testFilterGradient1x1PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testInputGradient1x1PaddingStrideTwo(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=4,
+            input_cols=5,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=2,
+            stride_cols=2,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testFilterGradient1x1PaddingStrideTwo(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=4,
+            input_cols=5,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=2,
+            stride_cols=2,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testInputGradient2x2PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testFilterGradient2x2PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu,
+            max_err=0.003)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testInputGradient1_2_3_4PaddingStride3x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=8,
+            input_cols=5,
+            filter_rows=4,
+            filter_cols=2,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=3,
+            stride_cols=2,
+            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testFilterGradient1_2_3_4PaddingStride3x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=8,
+            input_cols=5,
+            filter_rows=4,
+            filter_cols=2,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=3,
+            stride_cols=2,
+            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testInputGradient4_3_2_1PaddingStride2x1(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=3,
+            input_rows=5,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=2,
+            in_depth=1,
+            out_depth=2,
+            stride_rows=2,
+            stride_cols=1,
+            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testFilterGradient4_3_2_1PaddingStride2x1(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=3,
+            input_rows=5,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=2,
+            in_depth=1,
+            out_depth=2,
+            stride_rows=2,
+            stride_cols=1,
+            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testInputGradient0_0_0_5PaddingStride1x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=6,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=4,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=1,
+            stride_cols=2,
+            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  @test_util.disable_xla("This test never passed for XLA")
+  def testFilterGradient0_0_0_5PaddingStride1x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=6,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=4,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=1,
+            stride_cols=2,
+            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
   def testShapeFunctionEdgeCases(self):
     # All shapes unknown.
     c1 = nn_ops.conv2d(
@@ -1473,6 +2267,56 @@ class Conv2DTest(test.TestCase):
           strides=[1, 1, 1, 1],
           padding="SAME")
 
+    # Negative padding.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, -1], [1, 2], [0, 0]])
+
+    # Nonzero padding in nonspatial dimension.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[1, 0], [0, 0], [0, 0], [0, 0]])
+
+    # Nonzero NCHW padding in nonspatial dimension.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, 1], [0, 0], [0, 0]],
+          data_format="NCHW")
+
+    # Wrong amount of padding
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, 0], [0, 0]])
+
+    # Only specify one padding amount per dimension
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0], [0], [0], [0]])
+
+    # Explicit padding elements are not lists
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[0, 0, 0, 0])
+
+  @test_util.disable_xla("This test never passed for XLA")
   def testOpEdgeCases(self):
     with self.cached_session() as sess:
       # Illegal strides.
@@ -1513,6 +2357,41 @@ class Conv2DTest(test.TestCase):
                 strides=[1, 1, 1, 1],
                 padding="VALID"))
 
+      # Filter larger than input + padding.
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            nn_ops.conv2d(
+                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                array_ops.placeholder(dtypes.float32, shape=[24, 25, 3, 2]),
+                strides=[1, 1, 1, 1],
+                padding=[[0, 0], [2, 2], [2, 2], [0, 0]]))
+
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        # Negative padding during backprop.
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     "nonnegative"):
+          sess.run(
+              nn_ops.conv2d_backprop_input([32, 20, 20, 3],
+                                           array_ops.placeholder(
+                                               dtypes.float32,
+                                               shape=[18, 18, 3, 2]),
+                                           array_ops.placeholder(
+                                               dtypes.float32,
+                                               shape=[32, 3, 2, 2]),
+                                           strides=[1, 1, 1, 1],
+                                           padding=[[0, 0], [-1, 0], [0, 0],
+                                                    [0, 0]]))
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     "nonnegative"):
+          sess.run(
+              nn_ops.conv2d_backprop_filter(
+                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                  [18, 18, 3, 2],
+                  array_ops.placeholder(dtypes.float32, shape=[32, 3, 2, 2]),
+                  strides=[1, 1, 1, 1],
+                  padding=[[0, 0], [-1, 0], [0, 0], [0, 0]]))
+
 
 class DepthwiseConv2DTest(test.TestCase):
 
@@ -1546,7 +2425,7 @@ class DepthwiseConv2DTest(test.TestCase):
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
       value = self.evaluate(conv)
-    tf_logging.info("value = ", value)
+    tf_logging.debug("value = %s", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1668,7 +2547,7 @@ class SeparableConv2DTest(test.TestCase):
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
       value = self.evaluate(conv)
-    tf_logging.info("value = ", value)
+    tf_logging.debug("value = %s", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-3)
     self.assertShapeEqual(value, conv)
 
@@ -1828,6 +2707,194 @@ class Conv2DBenchmark(test.Benchmark):
             name="conv_stack_iter_%d" % iter_index, wall_time=wall_time)
         tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
 
+  def _bench_op(self, name, op, burn_iters, num_iters):
+    config = config_pb2.ConfigProto()
+    # Prevent Grappler from optimizing away the entire graph.
+    config.graph_options.rewrite_options.dependency_optimization = (
+        rewriter_config_pb2.RewriterConfig.OFF)
+    with session_lib.Session(config=config) as session:
+      variables.global_variables_initializer().run()
+      self.run_op_benchmark(
+          session, op, burn_iters=burn_iters, min_iters=num_iters, name=name)
+
+  def benchmarkExplicitVsManualPadding(self):
+    """Compare performance of EXPLICIT padding and calling tf.pad.
+
+    A Conv2D op with EXPLICIT padding is benchmarked, and a tf.pad with the same
+    padding followed by an equivalent Conv2D op is benchmarked.
+    """
+    if not test.is_gpu_available():
+      return
+
+    with ops.Graph().as_default():
+      burn_iters = 15
+      num_iters = 300
+      batch_size = 64
+      # The input and filter correspond to the first layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              3,
+              224,
+              224
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([7, 7, 3, 64]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 2, 2]
+      padding = [(0, 0), (0, 0), (3, 3), (3, 3)]
+      output_explicit_pad = nn_ops.conv2d(
+          input, filter, strides, padding=padding, data_format="NCHW")
+      input_padded = array_ops.pad(input, padding)
+      output_manual_pad = nn_ops.conv2d(
+          input_padded, filter, strides, padding="VALID", data_format="NCHW")
+      # Benchmark just the forward pass.
+      self._bench_op("explicit_pad_forward", output_explicit_pad.op, burn_iters,
+                     num_iters)
+      self._bench_op("manual_pad_forward", output_manual_pad.op, burn_iters,
+                     num_iters)
+
+      # Benchmark both the forward and backwards passes.
+      input_grad_explicit_pad, filter_grad_explicit_pad = (
+          gradients_impl.gradients(output_explicit_pad, [input, filter]))
+      self._bench_op(
+          "explicit_pad_backward",
+          control_flow_ops.group(input_grad_explicit_pad,
+                                 filter_grad_explicit_pad), burn_iters,
+          num_iters)
+      input_grad_manual_pad, filter_grad_manual_pad = gradients_impl.gradients(
+          output_manual_pad, [input, filter])
+      self._bench_op(
+          "manual_pad_backward",
+          control_flow_ops.group(input_grad_manual_pad, filter_grad_manual_pad),
+          burn_iters, num_iters)
+
+  def benchmarkExplicitVsSamePaddingGraph(self):
+    """Compare performance of EXPLICIT and SAME padding in graph mode.
+
+    A Conv2D op with SAME padding is benchmarked, and an equivalent Conv2D op
+    with explicit padding is benchmarked, where the padding is the same as in
+    the SAME case. The purpose is to ensure EXPLICIT padding is just as
+    efficient as the SAME case
+    """
+    if not test.is_gpu_available():
+      return
+
+    with ops.Graph().as_default():
+      burn_iters = 15
+      num_convs = 20
+      num_iters = 50
+      batch_size = 64
+      # The input and filter correspond to a middle layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              256,
+              14,
+              14
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([3, 3, 256, 256]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 1, 1]
+      padding = [(0, 0), (0, 0), (1, 1), (1, 1)]
+      output_explicit_pad = input
+      output_same_pad = input
+
+      for _ in range(num_convs):
+        output_explicit_pad = nn_ops.conv2d(
+            output_explicit_pad,
+            filter,
+            strides,
+            padding=padding,
+            data_format="NCHW")
+        output_same_pad = nn_ops.conv2d(
+            output_same_pad,
+            filter,
+            strides,
+            padding="SAME",
+            data_format="NCHW")
+      grad_explicit_pad, = gradients_impl.gradients(output_explicit_pad, filter)
+      grad_same_pad, = gradients_impl.gradients(output_same_pad, filter)
+      self._bench_op("graph_explicit_pad", grad_explicit_pad.op, burn_iters,
+                     num_iters)
+      self._bench_op("graph_same_pad", grad_same_pad.op, burn_iters, num_iters)
+
+  def benchmarkExplicitVsSamePaddingEager(self):
+    """Compare performance of EXPLICIT and SAME padding in eager mode.
+
+    A Conv2D op with SAME padding is benchmarked, and an equivalent Conv2D op
+    with explicit padding is benchmarked, where the padding is the same as in
+    the SAME case. Currently, EXPLICIT padding is slightly slower, due to the
+    fact the Python padding list must be checked and processed before the Conv2D
+    op can run.
+    """
+    # TODO(reedwm): Make EXPLICIT padding as fast as SAME padding.
+    if not test.is_gpu_available():
+      return
+
+    with context.eager_mode():
+      burn_iters = 15
+      num_convs = 20
+      num_iters = 50
+      batch_size = 64
+      # The input and filter correspond to a middle layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              256,
+              14,
+              14
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([3, 3, 256, 256]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 1, 1]
+      padding = [(0, 0), (0, 0), (1, 1), (1, 1)]
+      output_explicit_pad = input
+      output_same_pad = input
+      for _ in range(burn_iters):
+        output_explicit_pad = nn_ops.conv2d(
+            output_explicit_pad,
+            filter,
+            strides,
+            padding=padding,
+            data_format="NCHW")
+        output_same_pad = nn_ops.conv2d(
+            output_same_pad,
+            filter,
+            strides,
+            padding="SAME",
+            data_format="NCHW")
+
+      start = time.time()
+      for _ in range(num_iters):
+        with backprop.GradientTape() as tape:
+          for _ in range(num_convs):
+            output_explicit_pad = nn_ops.conv2d(
+                output_explicit_pad,
+                filter,
+                strides,
+                padding=padding,
+                data_format="NCHW")
+          tape.gradient(output_explicit_pad, filter)
+      end = time.time()
+      self.report_benchmark(
+          name="eager_explicit_pad",
+          wall_time=(end - start) / num_iters,
+          iters=num_iters)
+
+      start = time.time()
+      for _ in range(num_iters):
+        with backprop.GradientTape() as tape:
+          for _ in range(num_convs):
+            output_same_pad = nn_ops.conv2d(
+                output_same_pad,
+                filter,
+                strides,
+                padding="SAME",
+                data_format="NCHW")
+          tape.gradient(output_same_pad, filter)
+      end = time.time()
+      self.report_benchmark(
+          name="eager_same_pad",
+          wall_time=(end - start) / num_iters,
+          iters=num_iters)
+
 
 def GetInceptionFwdTest(input_size, filter_size, stride, padding,
                         gpu_only=False):
@@ -1855,7 +2922,8 @@ def GetInceptionFwdDilatedConvTest(input_size, filter_size, stride, padding):
           filter_in_sizes=filter_size,
           strides=[stride, stride],
           dilations=[2, 2],
-          padding=padding)
+          padding=padding,
+          rtol=5e-4)
 
   return Test
 
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 96c9b5258e2a4a103a3d981a3340f67a01bbec94..97d3645b617947c2ced88ac52207ced98c59c877 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -295,6 +295,7 @@ class DepthToSpaceTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(3, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index ed2a9e8e47e961549dbaa99a78624e22af146937..b8139918c597fa455ce9b726d165ec685c959fb3 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -65,6 +65,7 @@ class MatrixDiagTest(test.TestCase):
       array_ops.matrix_diag(0)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -269,6 +270,7 @@ class MatrixDiagPartTest(test.TestCase):
       array_ops.matrix_diag_part(0)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index a83622ac9ada908d4dffea8c10f75374c2e1d5bd..22c98201dd1847586af6a30eed8004757a21b335 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -23,6 +23,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -42,6 +43,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 3,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -54,6 +56,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -71,6 +74,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -86,6 +90,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -105,6 +110,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -119,6 +125,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -139,6 +146,7 @@ cuda_py_test(
         "noguitar",  # b/110489471
         "notap",  # b/110489471
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -154,6 +162,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -169,6 +178,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -184,6 +194,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -200,6 +211,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     tags = ["manual"],  # b/69001419
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -218,12 +230,14 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     tags = [
-        # TODO(b/121223043): Re-enable this test on mac after fixing "mean not
-        # defined" errors.
+        # TODO(b/121223043): Re-enable this test after fixing "mean not defined"
+        # errors.
         "no_mac",
+        "no_oss",
         # disable to avoid false positives from scipy.
         "nomsan",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -240,6 +254,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -258,6 +273,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -274,6 +290,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -291,4 +308,5 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 0d6a3cbd3527ac409ddf5c1c851c8993f404d029..92f4e7b39e047a6a6b95a34f09161f4828535663 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -57,6 +58,7 @@ def simple_scoped_fn(a, x):
 
 
 @test_util.with_control_flow_v2
+@test_util.disable_all_xla("This test never passed for XLA")
 class FunctionalOpsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -200,6 +202,13 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(
         np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
 
+  def testMapDtypeEager(self):
+    with context.eager_mode():
+      dtype = functional_ops.map_fn(lambda x: constant_op.constant(""),
+                                    constant_op.constant([]),
+                                    dtype=dtypes.string).dtype
+      self.assertEqual(dtype, dtypes.string)
+
   def testMapSparseTensor(self):
     with self.cached_session():
       with self.assertRaises(TypeError):
@@ -762,6 +771,26 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(Run(sess, 20.), 210.)
           self.assertAllEqual(Run(sess, 100.), 5050.)
 
+  # Like above, but using int32 in order to ensure that int32 tensors don't get
+  # copied to the GPU during the application of the while.
+  def testWhileInt32(self):
+    with ops.Graph().as_default() as g:
+
+      @function.Defun(*[dtypes.int32] * 2)
+      def Cond(n, unused_x):
+        return n > 0
+
+      @function.Defun(*[dtypes.int32] * 2)
+      def Body(n, x):
+        return n - 1, x + n
+
+      def Run(sess, n):
+        return sess.run(functional_ops.While([n, 0], Cond, Body))[1]
+
+      with self.session(graph=g, use_gpu=True) as sess:
+        self.assertAllEqual(Run(sess, 20), 210)
+        self.assertAllEqual(Run(sess, 100), 5050)
+
   @test_util.run_deprecated_v1
   def testWhileLowering(self):
 
@@ -798,6 +827,8 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(Run(100., True), 5050.)
 
   @test_util.run_v1_only("b/120545219")
+  @test_util.disable_xla(
+      "This test never passed for XLA")  # Different error message
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -1074,6 +1105,7 @@ class FunctionalOpsTest(test.TestCase):
 
 # TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
 # below test cases.
+@test_util.disable_all_xla("This test never passed for XLA")
 class PartitionedCallTest(test.TestCase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 320ffc9674bd2e0ce601084ab8fc375c4cbdc3e2..76ae2fcb72f606d95a6d4523f08ecad3514eb974 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.disable_all_xla("This test never passed for XLA")
 class GatherNdTest(test.TestCase):
 
   def _testSimpleDtype(self, dtype):
@@ -56,6 +57,7 @@ class GatherNdTest(test.TestCase):
     self._testSimpleDtype("|S")  # byte strings in python2 + 3
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
     with self.session(use_gpu=True):
       params = np.ones((3, 3), dtype=np.float32)
@@ -358,6 +360,7 @@ class GatherNdTest(test.TestCase):
       self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
 
+@test_util.disable_all_xla("This test never passed for XLA")
 class GatherNdOpBenchmark(test.Benchmark):
 
   def benchmark_gather_nd_op(self):
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index fc86068c3fc08d1ad01ba8dfa9bb4c5bc6c429f2..1d6f7955aa0cb002df7fc9d344066972820c9664 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -18,8 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,7 +34,7 @@ _TEST_TYPES = (dtypes.int64, dtypes.float32,
                dtypes.complex64, dtypes.complex128)
 
 
-class GatherTest(test.TestCase):
+class GatherTest(test.TestCase, parameterized.TestCase):
 
   def _buildParams(self, data, dtype):
     data = data.astype(dtype.as_numpy_dtype)
@@ -248,6 +250,244 @@ class GatherTest(test.TestCase):
           gather = array_ops.gather(params, indices, axis=2)
           self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2)))
 
+  @parameterized.parameters([
+      # batch_dims=0 (equivalent to tf.gather)
+      dict(  # 2D indices
+          batch_dims=0,
+          params=[6, 7, 8, 9],
+          indices=[[2, 1], [0, 3]],
+          expected=[[8, 7], [6, 9]]),
+      dict(  # 3D indices
+          batch_dims=0,
+          params=[6, 7, 8, 9],
+          indices=[[[3, 1], [2, 0]], [[0, 3], [2, 2]]],
+          expected=[[[9, 7], [8, 6]], [[6, 9], [8, 8]]]),
+      dict(  # 4D indices
+          batch_dims=0,
+          params=[8, 9],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[8, 9], [9, 8]], [[8, 8], [9, 9]]],
+                    [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
+
+      # batch_dims=indices.shape.ndims - 1 (equivalent to tf.batch_gather)
+      dict(  # 2D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[2, 1], [0, 3]],
+          expected=[[12, 11], [20, 23]]),
+      dict(  # 3D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[100, 101], [110, 111]], [[200, 201], [210, 211]]],
+          indices=[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+          expected=[[[100, 101], [111, 110]], [[200, 200], [211, 211]]]),
+      dict(  # 2D indices (1 batch dim)
+          batch_dims=-1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[2, 1], [0, 3]],
+          expected=[[12, 11], [20, 23]]),
+      dict(  # 3D indices (2 batch dims)
+          batch_dims=-1,
+          params=[[[100, 101], [110, 111]], [[200, 201], [210, 211]]],
+          indices=[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+          expected=[[[100, 101], [111, 110]], [[200, 200], [211, 211]]]),
+
+      # 0 < batch_dims < indices.shape.ndims - 1
+      dict(  # 3D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[[3, 1], [2, 0]], [[0, 3], [2, 2]]],
+          expected=[[[13, 11], [12, 10]], [[20, 23], [22, 22]]]),
+      dict(  # 4D indices (1 batch dim)
+          batch_dims=1,
+          params=[[6, 7], [8, 9]],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[6, 7], [7, 6]], [[6, 6], [7, 7]]],
+                    [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
+      dict(  # 4D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[2, 3], [4, 5]], [[6, 7], [8, 9]]],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[2, 3], [3, 2]], [[4, 4], [5, 5]]],
+                    [[[7, 7], [6, 6]], [[8, 9], [9, 8]]]]),
+
+      # axis > 0
+      dict(  # 3D indices, batch_dims=1, axis=2
+          # params.shape  = [I1, J1, J2] = [2, 2, 3]
+          # indices.shape = [I1, K1, K2] = [2, 1, 5]
+          # result.shape  = [I1, J1, K1, K2] = [2, 2, 1, 5]
+          batch_dims=1,
+          axis=2,
+          params=[[[10, 11, 12], [13, 14, 15]], [[20, 21, 22], [23, 24, 25]]],
+          indices=[[[0, 1, 2, 1, 0]], [[0, 1, 2, 1, 0]]],
+          expected=[[[[10, 11, 12, 11, 10]], [[13, 14, 15, 14, 13]]],
+                    [[[20, 21, 22, 21, 20]], [[23, 24, 25, 24, 23]]]]),
+      dict(  # 3D indices, batch_dims=None, axis=1
+          batch_dims=None,
+          axis=1,
+          params=[[10, 11, 12], [13, 14, 15]],
+          indices=[1, 0],
+          expected=[[11, 10], [14, 13]]),
+  ])
+  @test_util.run_in_graph_and_eager_modes
+  def testBatchDims(self, params, indices, batch_dims, expected=None,
+                    axis=None):
+    result = array_ops.gather(params, indices, axis=axis, batch_dims=batch_dims)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=2,
+          output_shape=[2, 3, 8, 9, 10, 5, 6, 7]
+          # = params.shape[:2] + indices.shape[2:] + params.shape[3:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=3,
+          output_shape=[2, 3, 4, 8, 9, 10, 6, 7]
+          # = params.shape[:3] + indices.shape[2:] + params.shape[4:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=4,
+          output_shape=[2, 3, 4, 5, 8, 9, 10, 7]
+          # = params.shape[:4] + indices.shape[2:] + params.shape[5:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=5,
+          output_shape=[2, 3, 4, 5, 6, 8, 9, 10]
+          # = params.shape[:5] + indices.shape[2:] + params.shape[6:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-4,
+          output_shape=[2, 3, 8, 9, 10, 5, 6, 7]
+          # = params.shape[:2] + indices.shape[2:] + params.shape[3:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-3,
+          output_shape=[2, 3, 4, 8, 9, 10, 6, 7]
+          # = params.shape[:3] + indices.shape[2:] + params.shape[4:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-2,
+          output_shape=[2, 3, 4, 5, 8, 9, 10, 7]
+          # = params.shape[:4] + indices.shape[2:] + params.shape[5:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-1,
+          output_shape=[2, 3, 4, 5, 6, 8, 9, 10]
+          # = params.shape[:5] + indices.shape[2:] + params.shape[6:]
+          ),
+  ])
+  @test_util.run_in_graph_and_eager_modes
+  def testBatchDimsMatchesPythonBatching(self, params_shape, indices_shape,
+                                         batch_dims, axis, output_shape):
+    """Checks that batch_dims matches multiple calls to tf.gather()."""
+    # Generate a `params` tensor with the indicated shape.
+    params_size = np.prod(params_shape)
+    params = np.reshape(np.arange(params_size), params_shape)
+
+    # Generate an `indices` tensor with the indicated shape, where each index
+    # is within the appropriate range.
+    indices_size = np.prod(indices_shape)
+    indices = np.reshape(np.arange(indices_size), indices_shape)
+    indices = indices % params_shape[axis]
+
+    # Perform repeated (batched) gather operations with numpy, to find the
+    # expected result.
+    expected = self._batchNumpyGather(params, indices, axis, batch_dims)
+
+    # On Windows, we get an exception if we pass in the transformed numpy
+    # arrays ("Failed to convert numpy ndarray to a Tensor (Unsupported
+    # feed type)."); so convert them back to lists before calling tf.gather.
+    params = params.tolist()
+    indices = indices.tolist()
+
+    result = array_ops.gather(params, indices, axis=axis, batch_dims=batch_dims)
+    self.assertAllEqual(output_shape, result.shape.as_list())
+    self.assertAllEqual(expected, result)
+
+  def _batchNumpyGather(self, params, indices, axis, batch_dims):
+    """Performs a batch gather by making recursive calls to np.take().
+
+    This is used by testBatchDims() to construct the expected value.
+
+    Args:
+      params: A numpy array
+      indices: A numpy array
+      axis: An integer
+      batch_dims: An integer
+    Returns:
+      A numpy array
+    """
+    if batch_dims == 0:
+      return np.take(params, indices, axis=axis)
+    self.assertEqual(params.shape[0], indices.shape[0])
+    if axis > 0:
+      axis -= 1
+    return np.stack([
+        self._batchNumpyGather(params[i], indices[i], axis, batch_dims - 1)
+        for i in range(params.shape[0])
+    ])
+
+  def testSkipEagerErrors(self):
+    if context.executing_eagerly():
+      return
+    with self.assertRaisesRegexp(ValueError, r"tf\.gather does not allow.*"):
+      array_ops.gather(
+          params=[1, 2],
+          batch_dims=1,
+          indices=array_ops.placeholder(dtypes.int32))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testErrors(self):
+
+    with self.assertRaisesRegexp(
+        ValueError, r"batch_dims = 2 must be less than ndims\(indices\) = 2"):
+      array_ops.gather(
+          params=[[1, 2], [3, 4]], indices=[[1, 2], [3, 4]], batch_dims=2)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"batch_dims = 1 must be less than ndims\(params\) = 1"):
+      array_ops.gather(
+          params=[1, 2, 3, 4], indices=[[1, 2], [3, 4]], batch_dims=1)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"batch_dims = 1 must be less than or equal to axis = 0"):
+      array_ops.gather(
+          params=[[1, 2], [3, 4]],
+          indices=[[1, 2], [3, 4]],
+          batch_dims=1,
+          axis=0)
+
+    one = array_ops.ones((), dtypes.int32)
+    with self.assertRaisesRegexp(TypeError, "batch_dims must be an int"):
+      array_ops.gather(params=[[1]], indices=[[1]], batch_dims=one)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 09b9944baa1d92bfbcd484f5dba45cea28e6eafe..4b9681afd2cac5660107ca8072770f66944ec2a4 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -592,6 +592,22 @@ class LinSpaceTest(test.TestCase):
       self.assertArrayNear(self._LinSpace(5., 5., 3), np.array([5.] * 3), 1e-5)
       self.assertArrayNear(self._LinSpace(5., 5., 4), np.array([5.] * 4), 1e-5)
 
+  def testEndpointsAreExact(self):
+    for self.force_gpu in self._gpu_modes():
+      # Test some cases that produce last values not equal to "stop" when
+      # computed via start + (num - 1) * ((stop - start) / (num - 1)), since
+      # float arithmetic will introduce error through precision loss.
+      self.assertAllEqual(
+          self._LinSpace(0., 1., 42)[[0, -1]], np.array([0., 1.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(-1., 0., 42)[[0, -1]], np.array([-1., 0.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(.1, .2, 4)[[0, -1]], np.array([.1, .2], np.float32))
+      # Check a case for float64 error too.
+      self.assertAllEqual(
+          self._LinSpace(np.array(0., np.float64), .1, 12)[[0, -1]],
+          np.array([0., .1], np.float64))
+
 
 class DeviceTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index ba9e64979a48ccce82a283e74a1a024c4bcceda8..53815858e4c8fc9c9dad0246f9ff9933a47459bc 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -22,6 +22,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -38,6 +39,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -60,6 +62,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -76,6 +79,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -98,6 +102,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -120,6 +125,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -144,6 +150,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out, b/79171797
     ],
+    xla_enable_strict_auto_jit = False,
 )
 
 cuda_py_test(
@@ -166,6 +173,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -187,6 +195,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -209,6 +218,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -229,6 +239,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -251,6 +262,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -270,6 +282,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -290,6 +303,7 @@ cuda_py_test(
         "noasan",  # times out
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -310,6 +324,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -328,4 +343,5 @@ cuda_py_test(
     ],
     shard_count = 5,
     tags = ["optonly"],  # Test is flaky without optimization.
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
index 8e296c026c09b36afd39b891befb767a222f5f19..ec78a3ffe0b2ae1ff5c5f6c4d73480f2ad92fd26 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -30,6 +30,8 @@ _CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
 _MATMUL = linear_operator_algebra._MATMUL
 _registered_cholesky = linear_operator_algebra._registered_cholesky
 _registered_matmul = linear_operator_algebra._registered_matmul
+_INVERSES = linear_operator_algebra._INVERSES
+_registered_inverse = linear_operator_algebra._registered_inverse
 # pylint: enable=protected-access
 
 
@@ -129,5 +131,51 @@ class MatmulTest(test.TestCase):
       self.assertEqual(v, _registered_matmul(k[0], k[1]))
 
 
+class InverseTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Inverse to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterInverse(CustomLinOp)
+    def _inverse(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    with self.assertRaisesRegexp(ValueError, "singular"):
+      CustomLinOp(dtype=None, is_non_singular=False).inverse()
+
+    self.assertEqual("OK", CustomLinOp(
+        dtype=None, is_non_singular=True).inverse())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterInverse(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterInverse(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterInverse(CustomLinOp)(lambda a: None)
+
+  def testExactRegistrationsAllMatch(self):
+    for (k, v) in _INVERSES.items():
+      self.assertEqual(v, _registered_inverse(k[0]))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index f0cc5d709f9bfec2e3dcfadecc8f949bb6ce6e6d..96e6e3c04c77e2a32d11d72feea02c177cfa3e61 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -155,20 +155,38 @@ class SquareLinearOperatorBlockDiagTest(
         is_self_adjoint=True,
     )
     cholesky_factor = operator.cholesky()
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         cholesky_factor,
-        block_diag.LinearOperatorBlockDiag))
+        block_diag.LinearOperatorBlockDiag)
     self.assertEqual(2, len(cholesky_factor.operators))
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[0],
-            lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[0],
+        lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[1],
+        lower_triangular.LinearOperatorLowerTriangular
     )
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[1],
-            lower_triangular.LinearOperatorLowerTriangular)
+
+  def test_block_diag_inverse_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+        ],
+        is_non_singular=True,
     )
+    inverse = operator.inverse()
+    self.assertIsInstance(
+        inverse,
+        block_diag.LinearOperatorBlockDiag)
+    self.assertEqual(2, len(inverse.operators))
 
   def test_is_non_singular_auto_set(self):
     # Matrix with two positive eigenvalues, 11 and 8.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index dcbc0dd7c97184df150fc7094a28441fcfaa1257..4d7a31be87cf5f51d952704ee585d140c3147a3f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -194,9 +194,12 @@ class LinearOperatorDiagTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
-        operator.cholesky(),
-        linalg.LinearOperatorDiag))
+    self.assertIsInstance(operator.cholesky(), linalg.LinearOperatorDiag)
+
+  def test_diag_inverse_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
+    self.assertIsInstance(operator.inverse(), linalg.LinearOperatorDiag)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 2da5e712d77b88ca6bb20a5f0920335f00c7b594..ea9ee99a582fee6441207a5d9710571bc5fd6804 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -265,9 +265,14 @@ class LinearOperatorIdentityTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
-        operator.cholesky(),
-        linalg_lib.LinearOperatorIdentity))
+    self.assertIsInstance(
+        operator.cholesky(), linalg_lib.LinearOperatorIdentity)
+
+  def test_identity_inverse_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2, is_non_singular=True)
+    self.assertIsInstance(
+        operator.inverse(), linalg_lib.LinearOperatorIdentity)
 
 
 class LinearOperatorScaledIdentityTest(
@@ -458,7 +463,7 @@ class LinearOperatorScaledIdentityTest(
         is_positive_definite=False, is_non_singular=True)
     self.assertFalse(operator.is_positive_definite)
     self.assertTrue(operator.is_non_singular)
-    self.assertTrue(operator.is_self_adjoint is None)
+    self.assertTrue(operator.is_self_adjoint)  # Auto-set due to real multiplier
 
   def test_identity_matmul(self):
     operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
@@ -491,9 +496,19 @@ class LinearOperatorScaledIdentityTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         operator.cholesky(),
-        linalg_lib.LinearOperatorScaledIdentity))
+        linalg_lib.LinearOperatorScaledIdentity)
+
+  def test_scaled_identity_inverse_type(self):
+    operator = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2,
+        multiplier=3.,
+        is_non_singular=True,
+    )
+    self.assertIsInstance(
+        operator.inverse(),
+        linalg_lib.LinearOperatorScaledIdentity)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 513b246803233f1117b48f1a3d413be42f15238a..54ccc0c5f642ad98c04174d01d9fca0c0fc056d6 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -100,7 +100,7 @@ class SquareLinearOperatorKroneckerTest(
 
   @property
   def _tests_to_skip(self):
-    return ["det", "solve", "solve_with_broadcast"]
+    return ["det", "inverse", "solve", "solve_with_broadcast"]
 
   def _operator_and_matrix(
       self, build_info, dtype, use_placeholder,
@@ -211,20 +211,33 @@ class SquareLinearOperatorKroneckerTest(
         is_self_adjoint=True,
     )
     cholesky_factor = operator.cholesky()
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         cholesky_factor,
-        kronecker.LinearOperatorKronecker))
+        kronecker.LinearOperatorKronecker)
     self.assertEqual(2, len(cholesky_factor.operators))
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[0],
-            lower_triangular.LinearOperatorLowerTriangular)
-    )
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[1],
-            lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[0],
+        lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[1],
+        lower_triangular.LinearOperatorLowerTriangular)
+
+  def test_kronecker_inverse_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+        ],
+        is_non_singular=True,
     )
+    inverse = operator.inverse()
+    self.assertIsInstance(
+        inverse,
+        kronecker.LinearOperatorKronecker)
+    self.assertEqual(2, len(inverse.operators))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index eb0b8ef127749e9e5709861d14b143877790bffd..10651d3c8afa0e29766d20c3dc8177af94678336 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -36,7 +36,8 @@ class LinearOperatorZerosTest(
 
   @property
   def _tests_to_skip(self):
-    return ["cholesky", "log_abs_det", "solve", "solve_with_broadcast"]
+    return [
+        "cholesky", "log_abs_det", "inverse", "solve", "solve_with_broadcast"]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 489f6c9b00471e6c10a8a04830613e9c5b99661a..e203d1b9094f8497ad454ecafea5d99dd0b53c0a 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_list_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -89,6 +90,58 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       l = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       self.evaluate(l)
 
+  def testPopUninitializedTensorUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[2, 3], num_elements=3)
+    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, np.zeros((2, 3)))
+
+  def testPopUninitializedTensorUseSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    _, e = gen_list_ops.tensor_list_pop_back(
+        l, element_dtype=dtypes.float32, element_shape=[4, 3])
+    self.assertAllEqual(e, np.zeros((4, 3)))
+
+  def testPopUninitializedTensorWithInvalidElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to read an uninitialized tensor but "
+        "element_shape is not fully defined"):
+      _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.evaluate(e)
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 2], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1,3\] vs. \[\?,2\]"):
+      _, e = gen_list_ops.tensor_list_pop_back(
+          l, element_dtype=dtypes.float32, element_shape=[1, 3])
+      self.evaluate(e)
+
+  def testPushGetGrad(self):
+    with backprop.GradientTape() as tape:
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=None)
+      c0 = constant_op.constant(5.0)
+      c1 = constant_op.constant([10.0, 20.0])
+      tape.watch(c0)
+      tape.watch(c1)
+      l = list_ops.tensor_list_push_back(l, c0)
+      l = list_ops.tensor_list_push_back(l, c1)
+      t1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t1), [10.0, 20.0])
+      # t1 == c1 so the gradient should be [0., [1., 1.]]
+      # This tests that the gradient of push_back correctly converts DT_INVALID
+      # tensors to zeros. The list returned by the gradient of GetItem will
+      # have only have tensor at index 1 set and others set to DT_INVALID.
+      dt0, dt1 = tape.gradient(t1, [c0, c1])
+      self.assertAllEqual(self.evaluate(dt1), [1.0, 1.0])
+      self.assertEqual(self.evaluate(dt0), 0.0)
+
   def _testStack(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
@@ -130,7 +183,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the element tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Incompatible ranks during merge: 0 vs. 1"):
       l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0]))
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -151,7 +205,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the element tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1\] vs. \[2\]"):
       l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -189,6 +245,54 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def _testStackWithUninitializedTensors(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [0., 0., 0.])
+
+  def testStackWithUninitializedTensors(self):
+    self._testStackWithUninitializedTensors()
+
+  def testStackWithUninitializedTensorsGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStackWithUninitializedTensors()
+
+  def _testStackWithUninitializedTensorsInferShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [[0., 0.], [1., 2.], [0., 0.]])
+
+  def testStackWithUninitializedTensorsInferShape(self):
+    self._testStackWithUninitializedTensorsInferShape()
+
+  def testStackWithUninitializedTensorsInferShapeGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStackWithUninitializedTensorsInferShape()
+
+  def testStackReservedListWithNoElementsAndPartialElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Tried to stack list which only contains "
+                                 "uninitialized tensors and has a "
+                                 "non-fully-defined element_shape: <unknown>"):
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testStackUsingSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = gen_list_ops.tensor_list_stack(
+        l, element_dtype=dtypes.float32, element_shape=[])
+    self.assertAllEqual(self.evaluate(t), np.zeros((3,)))
+
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 2))
   def testGatherGrad(self, max_num_elements):
@@ -227,7 +331,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the requested tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Incompatible ranks during merge: 0 vs. 1"):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -251,7 +356,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the requested tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1\] vs. \[2\]"):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -290,6 +397,118 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def testGatherGradWithNonContiguousIndices(self):
+    with backprop.GradientTape(persistent=True) as tape:
+      t = constant_op.constant([1.0, 2.0, 3.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      c = constant_op.constant(5.0)
+      tape.watch(c)
+      l = list_ops.tensor_list_set_item(l, 1, c)
+      t = list_ops.tensor_list_gather(l, [1], element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t), [5.0])
+      s = t[0] * t[0]
+    dt = tape.gradient(s, c)
+    self.assertAllEqual(self.evaluate(dt), 10.0)
+    dl = tape.gradient(t, l)
+    dl_length = list_ops.tensor_list_length(dl)
+    self.assertAllEqual(self.evaluate(dl_length), 3)
+
+  def _testGatherWithUninitializedTensors(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [0., 0.])
+
+  def testGatherWithUninitializedTensors(self):
+    self._testGatherWithUninitializedTensors()
+
+  def testGatherWithUninitializedTensorsGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testGatherWithUninitializedTensors()
+
+  def _testGatherWithUninitializedTensorsInferShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
+    t = list_ops.tensor_list_gather(l, [1, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[1., 2.], [0., 0.]])
+
+  def testGatherWithUninitializedTensorsInferShape(self):
+    self._testGatherWithUninitializedTensorsInferShape()
+
+  def testGatherWithUninitializedTensorsInferShapeGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testGatherWithUninitializedTensorsInferShape()
+
+  def testGatherReservedListWithNoElementsAndPartialElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tried to gather uninitialized tensors from a"
+        " list with non-fully-defined element_shape"):
+      t = list_ops.tensor_list_gather(l, [0], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testGatherUsingSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = gen_list_ops.tensor_list_gather(
+        l, [0, 1, 2], element_dtype=dtypes.float32, element_shape=[])
+    self.assertAllEqual(self.evaluate(t), np.zeros((3,)))
+
+  def testScatterOutputListSize(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_scatter(c0, [1, 3], [])
+    # TensorListScatter should return a list with size largest index + 1.
+    self.assertAllEqual(list_ops.tensor_list_length(l), 4)
+
+  def testScatterOutputListSizeWithNumElementsSpecified(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    l = gen_list_ops.tensor_list_scatter_v2(
+        c0, [1, 3], list_ops._build_element_shape([]), num_elements=5)
+    # TensorListScatter should return a list with size num_elements.
+    self.assertAllEqual(list_ops.tensor_list_length(l), 5)
+
+  def testScatterFailsWhenIndexLargerThanNumElements(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListScatter: Trying to scatter at index 3 in list with size 3"):
+      l = gen_list_ops.tensor_list_scatter_v2(
+          c0, [1, 3], list_ops._build_element_shape([]), num_elements=3)
+      self.evaluate(l)
+
+  def testScatterFailsWithInvalidNumElements(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListScatter expects num_elements >= -1, found: -2"):
+      l = gen_list_ops.tensor_list_scatter_v2(
+          c0, [1, 3], list_ops._build_element_shape([]), num_elements=-2)
+      self.evaluate(l)
+
+  def testScatterWithInvalidRowsInInputTensorFails(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Invalid number of rows in input tensor. Expected: 3 Actual: 2"):
+      l = list_ops.tensor_list_scatter(c0, [1, 0, 2], [])
+      self.evaluate(l)
+
+  def testScatterWithNegativeIndicesFails(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Indices in TensorListScatter must all be non-negative."):
+      l = list_ops.tensor_list_scatter(
+          c0, [-1, -2], ops.convert_to_tensor([], dtype=dtypes.int32))
+      self.evaluate(l)
+
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
@@ -304,14 +523,28 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     dt = tape.gradient(loss, c0)
     self.assertAllEqual(self.evaluate(dt), [2., 4.])
 
+  def testScatterWithPartialReadGrad(self):
+    with backprop.GradientTape() as tape:
+      c0 = constant_op.constant([1.0, 2.0])
+      tape.watch(c0)
+      l = list_ops.tensor_list_scatter(
+          c0, [1, 0], ops.convert_to_tensor([], dtype=dtypes.int32))
+      t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t0), 2.0)
+      loss = t0 * t0
+    dt = tape.gradient(loss, c0)
+    self.assertAllEqual(self.evaluate(dt), [0., 4.])
+
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+    e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, 1.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(e), 2.0)
+    self.assertAllEqual(e, 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(e), 1.0)
-    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
+    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
 
   def testFromTensorGPU(self):
     if not context.num_gpus():
@@ -319,7 +552,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
-  def testGetSetItem(self):
+  def testGetSet(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
@@ -332,7 +565,22 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
-      self.testGetSetItem()
+      self.testGetSet()
+
+  def testGetSetReserved(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+    e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e0, 0.0)
+    l = list_ops.tensor_list_set_item(l, 0, 3.0)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [3.0, 0.0])
+
+  def testGetSetReservedGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testGetSetReserved()
 
   def testSetGetGrad(self):
     with backprop.GradientTape() as tape:
@@ -345,6 +593,64 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertAllEqual(self.evaluate(e), 10.0)
     self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
 
+  def testGetUninitializedTensorUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 0, 5.)
+    e1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+    e2 = list_ops.tensor_list_get_item(l, 2, element_dtype=dtypes.float32)
+    self.assertEqual(self.evaluate(e1), 0.)
+    self.assertEqual(self.evaluate(e2), 0.)
+
+  def testGetUninitializedTensorUseSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    e0 = gen_list_ops.tensor_list_get_item(
+        l, 0, element_shape=[], element_dtype=dtypes.float32)
+    e1 = gen_list_ops.tensor_list_get_item(
+        l, 1, element_shape=[2, 3], element_dtype=dtypes.float32)
+    self.assertEqual(self.evaluate(e0), 0.)
+    self.assertAllEqual(self.evaluate(e1), np.zeros((2, 3)))
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    e1 = gen_list_ops.tensor_list_get_item(
+        l, 1, element_shape=[2, 3], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e1), np.zeros((2, 3)))
+
+  def testGetUninitializedTensorWithInvalidElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to read an uninitialized tensor but "
+        "element_shape is not fully defined"):
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.evaluate(e0)
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 2], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1,3\] vs. \[\?,2\]"):
+      e0 = gen_list_ops.tensor_list_get_item(
+          l, 0, element_dtype=dtypes.float32, element_shape=[1, 3])
+      self.evaluate(e0)
+
+  @test_util.run_deprecated_v1
+  @test_util.enable_control_flow_v2
+  def testSkipEagerSetItemIndexOutOfBounds(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[])
+    e0 = constant_op.constant(5.)
+    l = list_ops.tensor_list_set_item(
+        l, 0, 2. * e0, resize_if_index_out_of_bounds=True)
+    l = list_ops.tensor_list_set_item(
+        l, 1, 1., resize_if_index_out_of_bounds=True)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients(t, e0)[0]
+    self.assertAllEqual(self.evaluate(grad), 2.)
+
   @test_util.run_deprecated_v1
   def testSetOnEmptyListWithMaxNumElementsFails(self):
     l = list_ops.empty_tensor_list(
@@ -666,16 +972,25 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
               list_ops.empty_tensor_list([], dtypes.float32),
               element_dtype=dtypes.float32))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "element shapes are not identical at index 0"):
+    if context.executing_eagerly():
+      expected_error = (
+          errors.InvalidArgumentError,
+          "element shapes are not identical at index 0")
+    else:
+      expected_error = (ValueError, "Shapes must be equal rank")
+    with self.assertRaisesRegexp(*expected_error):
       l_batch_of_vec_tls = array_ops.stack(
           [list_ops.tensor_list_from_tensor([[1.0]], element_shape=[1])] * 2)
       self.evaluate(
           list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_vec_tls,
                                             element_dtype=dtypes.float32))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 r"input_b\[0\].dtype != element_dtype."):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError,
+                        r"input_b\[0\].dtype != element_dtype.")
+    else:
+      expected_error = (ValueError, "input_b.type != element_dtype")
+    with self.assertRaisesRegexp(*expected_error):
       l_batch_of_int_tls = array_ops.stack(
           [list_ops.tensor_list_from_tensor([1], element_shape=[])] * 2)
       self.evaluate(
@@ -720,8 +1035,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.evaluate(
           list_ops.tensor_list_push_back_batch(l_batch, [[3.0], [4.0]]))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "Invalid data type at index 0"):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError, "Invalid data type")
+    else:
+      expected_error = (ValueError, "wrong element dtype")
+    with self.assertRaisesRegexp(*expected_error):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
   def testZerosLike(self):
@@ -1096,6 +1414,47 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                    element_shape=[1],
                                    lengths=[1, 1])
 
+  def testResizeGrow(self):
+    l = list_ops.tensor_list_from_tensor([1., 2.], element_shape=[])
+    l = list_ops.tensor_list_resize(l, 4)
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 4)
+    self.assertEqual(
+        self.evaluate(
+            list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)),
+        1.)
+    self.assertEqual(
+        self.evaluate(
+            list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)),
+        2.)
+
+  def testResizeShrink(self):
+    l = list_ops.tensor_list_from_tensor([1., 2., 3.], element_shape=[])
+    l = list_ops.tensor_list_resize(l, 2)
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 2)
+    self.assertAllEqual(
+        self.evaluate(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)),
+        [1., 2.])
+
+  def testResizeWithInvalidSizeFails(self):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListSlice expects size to be non-negative"):
+      l = list_ops.tensor_list_from_tensor([1., 2., 3.], element_shape=[])
+      l = list_ops.tensor_list_resize(l, -1)
+      self.evaluate(l)
+
+  @test_util.run_deprecated_v1
+  @test_util.enable_control_flow_v2
+  def testSkipEagerResizeGrad(self):
+    t = constant_op.constant([1., 2., 3.])
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+    l = list_ops.tensor_list_set_item(
+        l, 3, 4., resize_if_index_out_of_bounds=True)
+    t1 = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients(t1, t)[0]
+    self.assertAllEqual(self.evaluate(grad), [1., 1., 1.])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 85035e5f7d308c323786bc9fd9017fda89dbec13..3896b138c9462250475c77ccec300a122e3b0a8c 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import string
 import sys
 import tempfile
 
@@ -37,6 +38,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
+
 class LoggingOpsTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -80,6 +82,17 @@ class PrintV2Test(test.TestCase):
       expected = "[0 1 2 ... 7 8 9]"
       self.assertTrue((expected + "\n") in printed.contents())
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testPrintOneStringTensor(self):
+    with self.cached_session():
+      tensor = ops.convert_to_tensor([char for char in string.ascii_lowercase])
+      with self.captureWritesToStream(sys.stderr) as printed:
+        print_op = logging_ops.print_v2(tensor)
+        self.evaluate(print_op)
+
+      expected = "[\"a\" \"b\" \"c\" ... \"x\" \"y\" \"z\"]"
+      self.assertIn((expected + "\n"), printed.contents())
+
   @test_util.run_in_graph_and_eager_modes()
   def testPrintOneTensorVarySummarize(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index ad81e0be649f17fe97691b1c5739dbe0bf4a63d2..8711923491134b4620b735641e08d022809ef3fb 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -35,9 +35,8 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
 
-class HashTableOpTest(test.TestCase):
+class HashTableTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def testHashTable(self):
     with self.cached_session():
       default_val = -1
@@ -45,9 +44,9 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -62,7 +61,6 @@ class HashTableOpTest(test.TestCase):
                             self.evaluate(exported_keys_tensor))
       self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
 
-  @test_util.run_deprecated_v1
   def testHashTableFindHighRank(self):
     with self.cached_session():
       default_val = -1
@@ -70,18 +68,17 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["tank", "tarkus"]])
       output = table.lookup(input_string)
 
       result = self.evaluate(output)
       self.assertAllEqual([[0, 1], [-1, -1]], result)
 
-  @test_util.run_deprecated_v1
   def testHashTableInitWithPythonArrays(self):
     with self.cached_session():
       default_val = -1
@@ -90,9 +87,9 @@ class HashTableOpTest(test.TestCase):
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(
               keys, values, value_dtype=dtypes.int64), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -100,7 +97,6 @@ class HashTableOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testHashTableInitWithNumPyArrays(self):
     with self.cached_session():
       default_val = -1
@@ -108,9 +104,9 @@ class HashTableOpTest(test.TestCase):
       values = np.array([0, 1, 2], dtype=np.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -118,9 +114,9 @@ class HashTableOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testMultipleHashTables(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
+
       default_val = -1
       keys = constant_op.constant(["brain", "salad", "surgery"])
       values = constant_op.constant([0, 1, 2], dtypes.int64)
@@ -132,10 +128,12 @@ class HashTableOpTest(test.TestCase):
       table3 = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
 
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual(3, table1.size().eval())
-      self.assertAllEqual(3, table2.size().eval())
-      self.assertAllEqual(3, table3.size().eval())
+      self.evaluate(table1.initializer)
+      self.evaluate(table2.initializer)
+      self.evaluate(table3.initializer)
+      self.assertAllEqual(3, self.evaluate(table1.size()))
+      self.assertAllEqual(3, self.evaluate(table2.size()))
+      self.assertAllEqual(3, self.evaluate(table3.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output1 = table1.lookup(input_string)
@@ -147,7 +145,6 @@ class HashTableOpTest(test.TestCase):
       self.assertAllEqual([0, 1, -1], out2)
       self.assertAllEqual([0, 1, -1], out3)
 
-  @test_util.run_deprecated_v1
   def testHashTableWithTensorDefault(self):
     with self.cached_session():
       default_val = constant_op.constant(-1, dtypes.int64)
@@ -155,7 +152,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -163,15 +160,14 @@ class HashTableOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testHashTableWithSparseTensorInput(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       default_val = constant_op.constant(-1, dtypes.int64)
       keys = constant_op.constant(["brain", "salad", "surgery"])
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       sp_indices = [[0, 0], [0, 1], [1, 0]]
       sp_shape = [2, 2]
@@ -187,7 +183,6 @@ class HashTableOpTest(test.TestCase):
       self.assertAllEqual(sp_indices, out_indices)
       self.assertAllEqual(sp_shape, out_shape)
 
-  @test_util.run_deprecated_v1
   def testSignatureMismatch(self):
     with self.cached_session():
       default_val = -1
@@ -195,12 +190,12 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       # Ref types do not produce a lookup signature mismatch.
       input_string_ref = variables.Variable("brain")
-      variables.global_variables_initializer().run()
-      self.assertEqual(0, table.lookup(input_string_ref).eval())
+      self.evaluate(input_string_ref.initializer)
+      self.assertEqual(0, self.evaluate(table.lookup(input_string_ref)))
 
       input_string = constant_op.constant([1, 2, 3], dtypes.int64)
       with self.assertRaises(TypeError):
@@ -223,8 +218,9 @@ class HashTableOpTest(test.TestCase):
     with self.cached_session():
       default_val = -1
       table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(
-              ["a"], [1], value_dtype=dtypes.int64), default_val)
+          lookup_ops.KeyValueTensorInitializer(["a"], [1],
+                                               value_dtype=dtypes.int64),
+          default_val)
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
@@ -240,10 +236,10 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       with self.assertRaisesOpError("Table already initialized"):
-        table.initializer.run()
+        self.evaluate(table.initializer)
 
   @test_util.run_deprecated_v1
   def testInitializationWithInvalidDimensions(self):
@@ -259,10 +255,9 @@ class HashTableOpTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     # Start a server
-    server = server_lib.Server(
-        {
-            "local0": ["localhost:0"]
-        }, protocol="grpc", start=True)
+    server = server_lib.Server({"local0": ["localhost:0"]},
+                               protocol="grpc",
+                               start=True)
     # Create two sessions sharing the same state
     session1 = session.Session(server.target)
     session2 = session.Session(server.target)
@@ -277,16 +272,15 @@ class HashTableOpTest(test.TestCase):
 
     # Init the table in the first session.
     with session1:
-      table.initializer.run()
-      self.assertAllEqual(3, table.size().eval())
+      self.evaluate(table.initializer)
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
     # Init the table in the second session and verify that we do not get a
     # "Table already initialized" error.
     with session2:
       table.initializer.run()
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def testHashTableInt32String(self):
     with self.cached_session():
       default_val = "n/a"
@@ -294,7 +288,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant(["brain", "salad", "surgery"])
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       input_tensor = constant_op.constant([0, 1, -1])
       output = table.lookup(input_tensor)
@@ -311,7 +305,6 @@ class IndexTableFromFile(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_file(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -319,12 +312,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -336,12 +329,12 @@ class IndexTableFromFile(test.TestCase):
           value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -354,12 +347,12 @@ class IndexTableFromFile(test.TestCase):
           delimiter=" ")
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_file_tensor_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -368,14 +361,16 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
-      self.assertEqual(1,
-                       len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
+      if not context.executing_eagerly():
+        self.assertEqual(1,
+                         len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("placeholder usage")
   def test_string_index_table_from_file_placeholder_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -393,7 +388,6 @@ class IndexTableFromFile(test.TestCase):
       self.assertEqual(0,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
-  @test_util.run_deprecated_v1
   def test_int32_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab2.txt", values=("42", "1", "-1000"))
@@ -405,12 +399,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_int64_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab3.txt", values=("42", "1", "-1000"))
@@ -422,12 +416,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_default_value(self):
     default_value = -42
     vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
@@ -436,12 +430,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_oov_buckets(self):
     vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
     with self.cached_session():
@@ -450,9 +444,10 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual(
           (
               1,  # From vocabulary file.
@@ -490,7 +485,6 @@ class IndexTableFromFile(test.TestCase):
         vocabulary_file=vocabulary_file,
         vocab_size=0)
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_small(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
@@ -498,22 +492,22 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=2)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, -1, -1), self.evaluate(ids))
-      self.assertEqual(2, table.size().eval())
+      self.assertEqual(2, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=4)
-      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", table.initializer.run)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Invalid vocab_size"):
+        table = lookup_ops.index_table_from_file(
+            vocabulary_file=vocabulary_file, vocab_size=4)
+        self.evaluate(table.initializer)
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
 
@@ -528,11 +522,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=3)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, -1), self.evaluate(ids))
-      self.assertEqual(3, table.size().eval())
+      self.assertEqual(3, self.evaluate(table.size()))
 
   def test_index_table_from_file_with_invalid_hashers(self):
     vocabulary_file = self._createVocabFile("invalid_hasher.txt")
@@ -574,6 +569,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer(
           ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
@@ -583,6 +579,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
         init1 = lookup_ops.KeyValueTensorInitializer(
             ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
             dtypes.int64)
+        self.assertEqual("", init1._shared_name)
         table1 = lookup_ops.HashTable(init1, default_value=-1)
         self.assertEquals("hash_table", table1.name)
         self.assertEquals("table_scope/hash_table",
@@ -590,6 +587,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
         init2 = lookup_ops.KeyValueTensorInitializer(
             ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
             dtypes.int64)
+        self.assertEqual("", init2._shared_name)
         table2 = lookup_ops.HashTable(init2, default_value=-1)
         self.assertEquals("hash_table_1", table2.name)
         self.assertEquals("table_scope/hash_table_1",
@@ -599,24 +597,24 @@ class KeyValueTensorInitializerTest(test.TestCase):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int64, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
-  @test_util.run_deprecated_v1
   def test_int32(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int32, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
-      with self.assertRaisesRegexp(
-          errors_impl.OpError, "No OpKernel was registered"):
+      with self.assertRaisesRegexp(errors_impl.OpError,
+                                   "No OpKernel was registered"):
         table.initializer.run()
 
 
 class IndexTableFromTensor(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_tensor_init(self):
     table = lookup_ops.index_table_from_tensor(
         vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
@@ -633,7 +631,6 @@ class IndexTableFromTensor(test.TestCase):
     ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
     self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_int32_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -641,12 +638,12 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_int64_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -654,12 +651,12 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_default_value(self):
     default_value = -42
     with self.cached_session():
@@ -668,9 +665,10 @@ class IndexTableFromTensor(test.TestCase):
           default_value=default_value)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
   def test_index_table_from_tensor_missing_vocabulary_list(self):
@@ -680,17 +678,13 @@ class IndexTableFromTensor(test.TestCase):
         lookup_ops.index_table_from_tensor(
             vocabulary_list=None, num_oov_buckets=1)
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_empty_vocabulary_list(self):
     with self.cached_session():
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
-        lookup_ops.tables_initializer().run()
+        _ = lookup_ops.index_table_from_tensor(
+            vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
+        self.evaluate(lookup_ops.tables_initializer())
 
   def test_index_table_from_tensor_with_invalid_hashers(self):
     with self.cached_session():
@@ -717,7 +711,6 @@ class IndexToStringTableFromFileTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table(self):
     vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
     # vocabulary_file supports string and tensor
@@ -729,13 +722,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
             vocabulary_file=vocabulary_file)
         features = table.lookup(
             constant_op.constant([0, 1, 2, 3], dtypes.int64))
-        with self.assertRaises(errors_impl.OpError):
-          self.evaluate(features)
-        lookup_ops.tables_initializer().run()
+        if not context.executing_eagerly():
+          with self.assertRaises(errors_impl.OpError):
+            self.evaluate(features)
+        self.evaluate(lookup_ops.tables_initializer())
         self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                             self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -745,13 +738,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
           value_column_index=0)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -762,13 +755,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           value_column_index=0,
           delimiter=" ")
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_default_value(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -776,13 +769,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
       table = lookup_ops.index_to_string_table_from_file(
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_small(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -792,27 +785,22 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocab_size=2,
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", default_value, default_value),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=4)
-      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      init = lookup_ops.tables_initializer()
-      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", init.run)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Invalid vocab_size"):
+        _ = lookup_ops.index_to_string_table_from_file(
+            vocabulary_file=vocabulary_file, vocab_size=4)
+        self.evaluate(lookup_ops.tables_initializer())
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
@@ -820,16 +808,16 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=3)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
 
 class IndexToStringTableFromTensorTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_tensor(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
@@ -838,14 +826,14 @@ class IndexToStringTableFromTensorTest(test.TestCase):
 
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_duplicate_entries(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["hello", "hello"])
@@ -853,10 +841,9 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           vocabulary_list=vocabulary_list)
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
-      lookup_ops.tables_initializer().run()
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
     with self.cached_session():
@@ -865,10 +852,10 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           vocabulary_list=vocabulary_list, default_value=default_value)
       indices = constant_op.constant([1, 2, 4], dtypes.int64)
       features = table.lookup(indices)
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           self.evaluate(features))
 
@@ -885,10 +872,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
   def testInitializeStringTable(self):
     vocabulary_file = self._createVocabFile("one_column_1.txt")
     default_value = -1
-    table = lookup_ops.HashTable(
-        lookup_ops.TextFileInitializer(
-            vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-            dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+    init = lookup_ops.TextFileInitializer(
+        vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+        dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+    self.assertTrue("one_column_1.txt_-2_-1" in init._shared_name)
+    table = lookup_ops.HashTable(init, default_value)
     self.evaluate(table.initializer)
 
     output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
@@ -896,19 +884,18 @@ class InitializeTableFromFileOpTest(test.TestCase):
     result = self.evaluate(output)
     self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testInitializeInt64Table(self):
     vocabulary_file = self._createVocabFile(
         "one_column_int64.txt", values=("42", "1", "-1000"))
 
     with self.cached_session():
       default_value = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file, dtypes.int64,
-              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_int64.txt_-2_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       output = table.lookup(
           constant_op.constant((42, 1, 11), dtype=dtypes.int64))
@@ -916,7 +903,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testInitializeIndexTable(self):
     vocabulary_file = self._createVocabFile("one_column_2.txt")
 
@@ -924,11 +910,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = "UNK"
       key_index = lookup_ops.TextFileIndex.LINE_NUMBER
       value_index = lookup_ops.TextFileIndex.WHOLE_LINE
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                         key_index, dtypes.string, value_index),
-          default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, key_index, dtypes.string, value_index)
+      self.assertTrue("one_column_2.txt_-1_-2" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       output = table.lookup(input_values)
@@ -936,7 +922,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
 
-  @test_util.run_deprecated_v1
   def testMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -947,11 +932,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       key_index = 1
       value_index = 2
 
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertTrue("three_columns.txt_1_2" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
@@ -959,7 +944,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([1, 5, 6], result)
 
-  @test_util.run_deprecated_v1
   def testInvalidDataTypeInMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -969,12 +953,12 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = -1
       key_index = 2
       value_index = 1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertTrue("three_columns.txt_2_1" in init._shared_name)
       with self.assertRaisesOpError("is not a valid"):
-        table.initializer.run()
+        table = lookup_ops.HashTable(init, default_value)
+        self.evaluate(table.initializer)
 
   def testInvalidDataType(self):
     vocabulary_file = self._createVocabFile("one_column_3.txt")
@@ -985,56 +969,48 @@ class InitializeTableFromFileOpTest(test.TestCase):
       value_index = lookup_ops.TextFileIndex.LINE_NUMBER
 
       with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                           key_index, dtypes.string,
-                                           value_index), default_value)
+        init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                              key_index, dtypes.string,
+                                              value_index)
+        self.assertTrue("one_column_3.txt_-2_-1" in init._shared_name)
+        lookup_ops.HashTable(init, default_value)
 
-  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     vocabulary_file = self._createVocabFile("one_column_4.txt")
     with self.cached_session():
       default_value = -1
       key_index = 1  # second column of the line
       value_index = lookup_ops.TextFileIndex.LINE_NUMBER
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertTrue("one_column_4.txt_1_-1" in init._shared_name)
 
       with self.assertRaisesOpError("Invalid number of columns"):
-        table.initializer.run()
+        table = lookup_ops.HashTable(init, default_value)
+        self.evaluate(table.initializer)
 
-  @test_util.run_deprecated_v1
   def testInitializeSameTableWithMultipleNodes(self):
     vocabulary_file = self._createVocabFile("one_column_5.txt")
 
-    with self.cached_session() as sess:
-      shared_name = "shared-one-columm"
+    with self.cached_session():
       default_value = -1
-      table1 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-      table2 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-      table3 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-
-      lookup_ops.tables_initializer().run()
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init1._shared_name)
+      table1 = lookup_ops.HashTable(init1, default_value)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init2._shared_name)
+      table2 = lookup_ops.HashTable(init2, default_value)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init3._shared_name)
+      table3 = lookup_ops.HashTable(init3, default_value)
+
+      self.evaluate(lookup_ops.tables_initializer())
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
 
@@ -1057,64 +1033,66 @@ class InitializeTableFromFileOpTest(test.TestCase):
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
 
-  @test_util.run_deprecated_v1
   def testInitializeWithVocabSize(self):
     with self.cached_session():
       default_value = -1
       vocab_size = 3
       vocabulary_file1 = self._createVocabFile("one_column6.txt")
-      table1 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file1,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file1,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column6.txt_3_-2_-1" in init1._shared_name)
+      table1 = lookup_ops.HashTable(init1, default_value)
 
       # Initialize from file.
-      table1.initializer.run()
-      self.assertEquals(vocab_size, table1.size().eval())
+      self.evaluate(table1.initializer)
+      self.assertEquals(vocab_size, self.evaluate(table1.size()))
 
       vocabulary_file2 = self._createVocabFile("one_column7.txt")
       vocab_size = 5
-      table2 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file2,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file2,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column7.txt_5_-2_-1" in init2._shared_name)
       with self.assertRaisesOpError("Invalid vocab_size"):
-        table2.initializer.run()
+        table2 = lookup_ops.HashTable(init2, default_value)
+        self.evaluate(table2.initializer)
 
       vocab_size = 1
       vocabulary_file3 = self._createVocabFile("one_column3.txt")
-      table3 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file3,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file3,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column3.txt_1_-2_-1" in init3._shared_name)
+      table3 = lookup_ops.HashTable(init3, default_value)
 
       # Smaller vocab size reads only vocab_size records.
-      table3.initializer.run()
-      self.assertEquals(vocab_size, table3.size().eval())
+      self.evaluate(table3.initializer)
+      self.assertEquals(vocab_size, self.evaluate(table3.size()))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("placeholder usage")
   def testFeedVocabularyName(self):
     vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
 
     with self.cached_session():
       default_value = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              "old_file.txt", dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      init = lookup_ops.TextFileInitializer(
+          "old_file.txt", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("old_file.txt_-2_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
       # Initialize with non existing file (old_file.txt) should fail.
       # TODO(yleon): Update message, which might change per FileSystem.
@@ -1131,7 +1109,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testInvalidFilenames(self):
     vocabulary_file = self._createVocabFile("filename_shape.txt")
 
@@ -1149,66 +1126,73 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
       # Non-scalar filename
       filenames = constant_op.constant([vocabulary_file, vocabulary_file])
-      with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(
-                filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
-            default_value)
+      if not context.executing_eagerly():
+        with self.assertRaises(ValueError):
+          lookup_ops.HashTable(
+              lookup_ops.TextFileInitializer(
+                  filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                  dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+              default_value)
+      else:
+        with self.assertRaises(errors_impl.InvalidArgumentError):
+          lookup_ops.HashTable(
+              lookup_ops.TextFileInitializer(
+                  filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                  dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+              default_value)
 
-  @test_util.run_deprecated_v1
   def testIdToStringTable(self):
     vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
       default_value = "UNK"
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileStringTableInitializer(
-              vocab_file, vocab_size=vocab_size), default_value)
+      init = lookup_ops.TextFileStringTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_1.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
       out = table.lookup(input_values)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
                           self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertEquals(vocab_size, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def testStringToIdTable(self):
     vocab_file = self._createVocabFile("feat_to_id_2.txt")
     with self.cached_session():
       default_value = -1
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileIdTableInitializer(
-              vocab_file, vocab_size=vocab_size), default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_2.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
       self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertEquals(vocab_size, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def testInt64ToIdTable(self):
     vocab_file = self._createVocabFile(
         "feat_to_id_3.txt", values=("42", "1", "-1000"))
     with self.cached_session():
       default_value = -1
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileIdTableInitializer(
-              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
-          default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64)
+      self.assertTrue("feat_to_id_3.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       out = table.lookup(
           constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
       self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertEquals(vocab_size, self.evaluate(table.size()))
 
 
 class IdTableWithHashBucketsTest(test.TestCase):
@@ -1385,7 +1369,6 @@ class IdTableWithHashBucketsTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsInitializationAcrossSessions(self):
     vocab_file = self._createVocabFile("feat_to_id_5.txt")
-    shared_name = "across-sessions"
     with self.cached_session():
       default_value = -1
       vocab_size = 3
@@ -1393,9 +1376,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       table1 = lookup_ops.IdTableWithHashBuckets(
           lookup_ops.HashTable(
               lookup_ops.TextFileIdTableInitializer(
-                  vocab_file, vocab_size=vocab_size),
-              default_value,
-              shared_name=shared_name), oov_buckets)
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
 
       table1.initializer.run()
 
@@ -1417,9 +1399,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       table2 = lookup_ops.IdTableWithHashBuckets(
           lookup_ops.HashTable(
               lookup_ops.TextFileIdTableInitializer(
-                  vocab_file, vocab_size=vocab_size),
-              default_value,
-              shared_name=shared_name), oov_buckets)
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
 
       input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
 
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index 7b1b054ae0656ef8ae988c1a3220a2a643afbcab..6fb8a4b5d8678e54623d194ef97ae65f2e494b15 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -223,7 +223,7 @@ class PadOpTest(test.TestCase):
   def testIntTypes(self):
     # TODO(touts): Figure out why the padding tests do not work on GPU
     # for int types and rank > 2.
-    for t in [np.int8, np.int32, np.int64]:
+    for t in [np.int8, np.uint8, np.int32, np.int64]:
       self._testAll(
           np.random.randint(-100, 100, (4, 4, 3)).astype(t),
           [[1, 0], [2, 3], [0, 2]], 0)
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 78e786f01ca9c167b5b175fcd833a83281c078de..367c94dd1e689ade820a96322c786d416bd2b1b1 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -303,6 +303,7 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")  # Much larger error
   def testGradient1D(self):
     with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index c33b59bb99b716b7164c82f6e640a8a3f4680351..27a71904340a1058bec5b9f993f78c5766345f01 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -730,6 +730,7 @@ class PoolingTest(test.TestCase):
         t = nn_ops.max_pool(
             t, ksize=ksize, strides=strides, padding="SAME").eval()
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testDepthwiseMaxPoolInvalidConfigs(self):
     self._testDepthwiseMaxPoolInvalidConfig(
         [1, 2, 2, 4], [1, 2, 2, 2], [1, 1, 1, 2],
@@ -1174,6 +1175,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1210,6 +1212,7 @@ class PoolingTest(test.TestCase):
                      [1, window_rows, window_cols, 1],
                      [1, row_stride, col_stride, 1], padding)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def _testMaxPoolGradDirect(self, input_data, output_backprop,
                              expected_input_backprop, input_sizes, output_sizes,
                              window_rows, window_cols, row_stride, col_stride,
@@ -1625,6 +1628,7 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testMaxPoolGradGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradGradValidPadding1_1(data_format, use_gpu)
@@ -1659,6 +1663,7 @@ class PoolingTest(test.TestCase):
         [1, row_stride, col_stride, 1], padding)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1818,6 +1823,7 @@ class PoolingTest(test.TestCase):
             padding="SAME")
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testOpEdgeCases(self):
     with self.session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
@@ -1893,9 +1899,17 @@ if __name__ == "__main__":
        padding_) in GetShrunkInceptionMaxPoolShapes():
     setattr(PoolingTest, "testMaxPoolFwd_" + name_,
             GetMaxPoolFwdTest(input_size_, filter_size_, stride_, padding_))
-    setattr(PoolingTest, "testMaxPoolGrad_" + name_,
-            GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
-                               padding_))
+    if name_ == "maxpool5":
+      setattr(
+          PoolingTest, "testMaxPoolGrad_" + name_,
+          test_util.disable_xla("maxpool5 fails while all others pass")(
+              GetMaxPoolGradTest(input_size_, filter_size_, output_size_,
+                                 stride_, padding_)))
+    else:
+      setattr(
+          PoolingTest, "testMaxPoolGrad_" + name_,
+          GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
+                             padding_))
     setattr(PoolingTest, "testMaxPoolGradGrad_" + name_,
             GetMaxPoolGradGradTest(input_size_, filter_size_, output_size_,
                                    stride_, padding_))
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index dd81306db05aafac0d041320a193c7d92437a5fd..8452982a447ff5eaa1b4eaa11c5d6f8cbd6a7e8c 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -14,6 +14,14 @@ load("//tensorflow:tensorflow.bzl", "sycl_py_test")
 # Please avoid the py_tests and cuda_py_tests (plural) while we
 # fix the shared/overbroad dependencies.
 
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    deps = [
+        "//third_party/py/numpy",
+    ],
+)
+
 tf_py_test(
     name = "random_shuffle_queue_test",
     size = "small",
@@ -45,6 +53,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -64,6 +73,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
     shard_count = 3,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -75,6 +85,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -88,6 +99,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -103,6 +115,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:stateless_random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -110,6 +123,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["random_gamma_test.py"],
     additional_deps = [
+        ":util",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -120,6 +134,7 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = ["nozapfhahn"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -137,6 +152,7 @@ cuda_py_test(
         "//tensorflow/python:random_grad",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -144,6 +160,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["random_poisson_test.py"],
     additional_deps = [
+        ":util",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -152,4 +169,5 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index a5952a21968c79c8bfbcbfef2b09852f24f29923..5cc13f67777aef07ab40e8926effc3a2a0d6430b 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -27,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
+from tensorflow.python.kernel_tests.random import util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -69,16 +68,6 @@ class RandomGammaTest(test.TestCase):
       tf_logging.warn("Cannot test moments: %s" % e)
       return
 
-    # Check the given array of samples matches the given theoretical moment
-    # function at different orders. The test is considered passing if the
-    # z-tests of all statistical moments are all below z_limit.
-    # Parameters:
-    #   max_moments: the largest moments of the distribution to be tested
-    #   stride: the distance between samples to check for statistical properties
-    #       0 means the n-th moment of each sample
-    #       any other strides tests for spatial correlation between samples;
-    #   z_limit: the maximum z-test we would consider the test to pass;
-
     # The moments test is a z-value test.  This is the largest z-value
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
@@ -94,46 +83,13 @@ class RandomGammaTest(test.TestCase):
           max_moment = min(6, scale // 2)
           sampler = self._Sampler(
               20000, alpha, 1 / scale, dt, use_gpu=False, seed=12345)
-          moments = [0] * (max_moment + 1)
-          moments_sample_count = [0] * (max_moment + 1)
-          x = np.array(sampler().flat)  # sampler does 10x samples
-          for k in range(len(x)):
-            moment = 1.
-            for i in range(max_moment + 1):
-              index = k + i * stride
-              if index >= len(x):
-                break
-              moments[i] += moment
-              moments_sample_count[i] += 1
-              moment *= x[index]
-          for i in range(max_moment + 1):
-            moments[i] /= moments_sample_count[i]
-          for i in range(1, max_moment + 1):
-            g = stats.gamma(alpha, scale=scale)
-            if stride == 0:
-              moments_i_mean = g.moment(i)
-              moments_i_squared = g.moment(2 * i)
-            else:
-              moments_i_mean = pow(g.moment(1), i)
-              moments_i_squared = pow(g.moment(2), i)
-            # Calculate moment variance safely:
-            # This is just
-            #  (moments_i_squared - moments_i_mean**2) / moments_sample_count[i]
-            normalized_moments_i_var = (
-                moments_i_mean / moments_sample_count[i] *
-                (moments_i_squared / moments_i_mean - moments_i_mean))
-            # Assume every operation has a small numerical error.
-            # It takes i multiplications to calculate one i-th moment.
-            error_per_moment = i * np.finfo(dt.as_numpy_dtype).eps
-            total_variance = (normalized_moments_i_var + error_per_moment)
-            tiny = np.finfo(dt.as_numpy_dtype).tiny
-            self.assertGreaterEqual(total_variance, 0)
-            if total_variance < tiny:
-              total_variance = tiny
-            # z_test is approximately a unit normal distribution.
-            z_test = abs(
-                (moments[i] - moments_i_mean) / math.sqrt(total_variance))
-            self.assertLess(z_test, z_limit)
+          z_scores = util.test_moment_matching(
+              sampler(),
+              max_moment,
+              stats.gamma(alpha, scale=scale),
+              stride=stride,
+          )
+          self.assertAllLess(z_scores, z_limit)
 
   def _testZeroDensity(self, alpha):
     """Zero isn't in the support of the gamma distribution.
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 1384c3f446f97a76792a27cfc7f679e80402cbf0..68672a04bbdc48e066d90ceb5ff94ea705f75fd9 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -257,6 +257,7 @@ class TruncatedNormalTest(test.TestCase):
       self.assertAllEqual(rnd1, rnd2)
 
 
+@test_util.disable_all_xla("This never passed on XLA")
 class RandomUniformTest(RandomOpTestCommon):
 
   def _Sampler(self, num, minv, maxv, dtype, use_gpu, seed=None):
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index 0a6b004d682e5d810a5a3e09ca6dce867e5f41f1..51dd4cb47ca8561dfd01e20031651047fb2b70b9 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.kernel_tests.random import util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -49,14 +50,13 @@ class RandomPoissonTest(test.TestCase):
 
     return func
 
-  # TODO(srvasude): Factor this out along with the corresponding moment testing
-  # method in random_gamma_test into a single library.
   def testMoments(self):
     try:
       from scipy import stats  # pylint: disable=g-import-not-at-top
     except ImportError as e:
       tf_logging.warn("Cannot test moments: %s", e)
       return
+
     # The moments test is a z-value test.  This is the largest z-value
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
@@ -67,41 +67,13 @@ class RandomPoissonTest(test.TestCase):
         for lam in (3., 20):
           max_moment = 5
           sampler = self._Sampler(10000, lam, dt, use_gpu=False, seed=12345)
-          moments = [0] * (max_moment + 1)
-          moments_sample_count = [0] * (max_moment + 1)
-          x = np.array(sampler().flat)  # sampler does 10x samples
-          for k in range(len(x)):
-            moment = 1.
-            for i in range(max_moment + 1):
-              index = k + i * stride
-              if index >= len(x):
-                break
-              moments[i] += moment
-              moments_sample_count[i] += 1
-              moment *= x[index]
-          for i in range(max_moment + 1):
-            moments[i] /= moments_sample_count[i]
-          for i in range(1, max_moment + 1):
-            g = stats.poisson(lam)
-            if stride == 0:
-              moments_i_mean = g.moment(i)
-              moments_i_squared = g.moment(2 * i)
-            else:
-              moments_i_mean = pow(g.moment(1), i)
-              moments_i_squared = pow(g.moment(2), i)
-            moments_i_var = (
-                moments_i_squared - moments_i_mean * moments_i_mean)
-            # Assume every operation has a small numerical error.
-            # It takes i multiplications to calculate one i-th moment.
-            error_per_moment = i * 1e-6
-            total_variance = (
-                moments_i_var / moments_sample_count[i] + error_per_moment)
-            if not total_variance:
-              total_variance = 1e-10
-            # z_test is approximately a unit normal distribution.
-            z_test = abs(
-                (moments[i] - moments_i_mean) / np.sqrt(total_variance))
-            self.assertLess(z_test, z_limit)
+          z_scores = util.test_moment_matching(
+              sampler(),
+              max_moment,
+              stats.poisson(lam),
+              stride=stride,
+          )
+          self.assertAllLess(z_scores, z_limit)
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
diff --git a/tensorflow/python/kernel_tests/random/util.py b/tensorflow/python/kernel_tests/random/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..67805c7f262480e18fd296e15fc4a436e70c0c58
--- /dev/null
+++ b/tensorflow/python/kernel_tests/random/util.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for testing random variables."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def test_moment_matching(
+    samples,
+    number_moments,
+    dist,
+    stride=0):
+  """Return z-test scores for sample moments to match analytic moments.
+
+  Given `samples`, check that the first sample `number_moments` match
+  the given  `dist` moments by doing a z-test.
+
+  Args:
+    samples: Samples from target distribution.
+    number_moments: Python `int` describing how many sample moments to check.
+    dist: SciPy distribution object that provides analytic moments.
+    stride: Distance between samples to check for statistical properties.
+      A stride of 0 means to use all samples, while other strides test for
+      spatial correlation.
+  Returns:
+    Array of z_test scores.
+  """
+
+  sample_moments = []
+  expected_moments = []
+  variance_sample_moments = []
+  x = samples.flat
+  for i in range(1, number_moments + 1):
+    strided_range = x[::(i - 1) * stride + 1]
+    sample_moments.append(np.mean(strided_range ** i))
+    expected_moments.append(dist.moment(i))
+    variance_sample_moments.append(
+        (dist.moment(2 * i) - dist.moment(i) ** 2) / len(strided_range))
+
+  z_test_scores = []
+  for i in range(1, number_moments + 1):
+    # Assume every operation has a small numerical error.
+    # It takes i multiplications to calculate one i-th moment.
+    total_variance = (
+        variance_sample_moments[i - 1] +
+        i * np.finfo(samples.dtype).eps)
+    tiny = np.finfo(samples.dtype).tiny
+    assert np.all(total_variance > 0)
+    if total_variance < tiny:
+      total_variance = tiny
+    # z_test is approximately a unit normal distribution.
+    z_test_scores.append(abs(
+        (sample_moments[i - 1] - expected_moments[i - 1]) / np.sqrt(
+            total_variance)))
+  return z_test_scores
+
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index d4ba1ad77d5547ccb9fe4e2154d145751cf63514..3b8924904c5eb20670a2d61fe1f5a3af470eebde 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -86,6 +86,7 @@ class ReluTest(test.TestCase):
     self.assertAllClose(np_relu, tf_relu)
     self.assertShapeEqual(np_relu, tf_relu)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testReluInt8x4BadShape(self):
     if not test.is_gpu_available(cuda_only=True):
       self.skipTest("No GPU available")
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index df7b68616522f58633da9a1df174e370a5e73144..163d5a316ce088015ac00e9fc582a5b71865b3c0 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -24,11 +24,15 @@ import pickle
 
 import numpy as np
 
+from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -286,12 +290,19 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       tmp_dir = self.get_temp_dir()
       fname = os.path.join(tmp_dir, "var.pickle")
       with open(fname, "wb") as f:
-        v = resource_variable_ops.ResourceVariable(10.0)
+        v = resource_variable_ops.ResourceVariable(
+            10.0,
+            dtype=dtypes.float16,
+            name="v")
         pickle.dump(v, f)
 
       with open(fname, "rb") as f:
-        v = pickle.load(f)
-        self.assertAllEqual(v.numpy(), 10.0)
+        new_v = pickle.load(f)
+        self.assertEqual(new_v.name, v.name)
+        self.assertEqual(new_v.shape, v.shape)
+        self.assertEqual(new_v.dtype, v.dtype)
+        self.assertEqual(new_v.trainable, v.trainable)
+        self.assertAllEqual(new_v.numpy(), v.numpy())
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterDiv(self):
@@ -629,7 +640,6 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           variable_def=other_v_def)
       self.assertTrue(other_v_prime._cached_value is not None)
 
-  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -967,16 +977,33 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           list_ops.tensor_list_get_item(v[0], 0, element_dtype=dtypes.float32),
           1.)
 
+  def testGroupDoesntForceRead(self):
+    with ops.Graph().as_default():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      assign = v.assign_add(1.0)
+      g = control_flow_ops.group([assign])
+      self.assertEqual(g.control_inputs[0].type, "AssignAddVariableOp")
+
   def testScatterNdAddStateOps(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable(
-          [1, 1, 1, 1, 1, 1, 1, 1], dtype=dtypes.float32, name="add")
+          [1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.float32, name="add")
       indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
       updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
-      expected = np.array([1, 12, 1, 11, 10, 1, 1, 13])
+      expected = np.array([1, 13, 3, 14, 14, 6, 7, 20])
       state_ops.scatter_nd_add(v, indices, updates)
       self.assertAllClose(expected, v.numpy())
 
+  def testScatterNdSubStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable(
+          [1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.float32, name="sub")
+      indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+      updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+      expected = np.array([1, -9, 3, -6, -4, 6, 7, -4])
+      state_ops.scatter_nd_sub(v, indices, updates)
+      self.assertAllClose(expected, v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
@@ -1011,6 +1038,59 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           session.run(copied.initializer)
 
+  def create_variant_shape_and_type_data(self):
+    variant_shape_and_type_data = (
+        cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData())
+    variant_shape_and_type_data.is_set = True
+    stored_shape = tensor_shape.TensorShape([None, 4]).as_proto()
+    stored_dtype = dtypes.float32.as_datatype_enum
+    # NOTE(ebrevdo): shape_and_type lacks append() in some versions of protobuf.
+    variant_shape_and_type_data.shape_and_type.extend([
+        cpp_shape_inference_pb2.CppShapeInferenceResult.HandleShapeAndType(
+            shape=stored_shape, dtype=stored_dtype)])
+    return variant_shape_and_type_data
+
+  @def_function.function
+  def create_constant_variant(self, value):
+    value = constant_op.constant(
+        tensor_pb2.TensorProto(
+            dtype=dtypes.variant.as_datatype_enum,
+            tensor_shape=tensor_shape.TensorShape([]).as_proto(),
+            variant_val=[
+                tensor_pb2.VariantTensorDataProto(
+                    # Match registration in variant_op_registry.cc
+                    type_name=b"int",
+                    metadata=np.array(value, dtype=np.int32).tobytes())
+            ]))
+    return value
+
+  # TODO(ebrevdo): Add run_in_graph_and_eager_modes once we can create
+  # EagerTensor constants with TensorProto inputs.
+  @test_util.run_in_graph_and_eager_modes()
+  def testVariantInitializer(self):
+    variant_shape_and_type_data = self.create_variant_shape_and_type_data()
+    value = self.create_constant_variant(3)
+    initializer = array_ops.fill([3], value)
+    resource_variable_ops._set_handle_shapes_and_types(  # pylint: disable=protected-access
+        initializer, variant_shape_and_type_data,
+        graph_mode=not context.executing_eagerly())
+    v = resource_variable_ops.ResourceVariable(initializer)
+    read = array_ops.identity(v)
+    read_variant_shape_and_type = (
+        resource_variable_ops.get_eager_safe_handle_data(read))
+    self.assertEqual(
+        read_variant_shape_and_type, variant_shape_and_type_data)
+    gather = v.sparse_read([0])
+    gather_variant_shape_and_type = (
+        resource_variable_ops.get_eager_safe_handle_data(gather))
+    self.assertEqual(
+        gather_variant_shape_and_type, variant_shape_and_type_data)
+    # Make sure initializer runs.
+    if not context.executing_eagerly():
+      self.evaluate(v.initializer)
+      self.evaluate(read.op)
+      self.evaluate(gather.op)
+
 
 class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
 
@@ -1068,6 +1148,11 @@ class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
     self.assertEqual(NotImplemented,
                      v._dense_var_to_tensor(dtype=dtypes.float16, as_ref=True))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testDistributeStrategy(self):
+    v = resource_variable_ops.ResourceVariable(1, dtype=dtypes.int32)
+    self.assertIsNone(v._distribute_strategy)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index 33e491fee1dadbcce225dfa70310d47a21b6893c..4e15894fb4aa8a90d8dd9914ba25dcfd27d5fe95 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -70,6 +70,7 @@ def handle_options(func, x, axis, exclusive, reverse):
   return x
 
 
+@test_util.disable_all_xla("This test never passed for XLA")
 class CumsumTest(test.TestCase):
 
   valid_dtypes = [
@@ -193,6 +194,7 @@ class CumsumTest(test.TestCase):
           self._compareGradient([5, 10], axis, exclusive, reverse)
 
 
+@test_util.disable_all_xla("This test never passed for XLA")
 class CumprodTest(test.TestCase):
 
   valid_dtypes = [
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 8510a08f0c96dd9ae08a2ca3e782cc7d28e86264..88f7b27b77ee24c732b84a674587b63638b2c903 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -217,7 +218,7 @@ class StatefulScatterNdTest(test.TestCase):
   def testVariableRankAdd(self):
     self._VariableRankTests(_NumpyAdd, state_ops.scatter_nd_add)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testVariableRankSub(self):
     self._VariableRankTests(_NumpySub, state_ops.scatter_nd_sub)
 
@@ -295,6 +296,7 @@ class StatefulScatterNdTest(test.TestCase):
                                     updates).get_shape().as_list(), shape)
 
   @test_util.run_v1_only("b/120545219")
+  @test_util.disable_xla("This test never passed for XLA")
   def testResVarInvalidOutputShape(self):
     res = variables.Variable(
         initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
@@ -325,7 +327,7 @@ class StatefulScatterNdTest(test.TestCase):
     shape = np.array([2, 2, 2])
     ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The outer \\d+ dimensions of indices\\.shape="):
+        ValueError, r"The outer \d+ dimensions of indices\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
   @test_util.run_deprecated_v1
@@ -335,7 +337,7 @@ class StatefulScatterNdTest(test.TestCase):
     shape = np.array([2, 2, 2])
     ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The inner \\d+ dimensions of input\\.shape="):
+        ValueError, r"The inner \d+ dimensions of input\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
   @test_util.run_deprecated_v1
@@ -539,7 +541,7 @@ class ScatterNdTest(test.TestCase):
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
     shape = np.array([2, 2, 2])
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The outer \\d+ dimensions of indices\\.shape="):
+        ValueError, r"The outer \d+ dimensions of indices\.shape="):
       self.scatter_nd(indices, updates, shape)
 
   @test_util.run_deprecated_v1
@@ -548,7 +550,7 @@ class ScatterNdTest(test.TestCase):
     updates = array_ops.zeros([2, 2], dtypes.int32)
     shape = np.array([2, 2, 2])
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
+        ValueError, r"The inner \d+ dimensions of (input|output)\.shape="):
       self.scatter_nd(indices, updates, shape)
 
   @test_util.run_deprecated_v1
@@ -749,6 +751,16 @@ class ScatterNdTensorTest(test.TestCase):
       self.assertLess(err_added_wrt_updates, 2e-4)
       self.assertLess(err_subbed_wrt_updates, 2e-4)
 
+  def testTensorScatterUpdateWithForwarding(self):
+    @def_function.function
+    def _TestFn():
+      indices = constant_op.constant([[4], [3], [1], [7]])
+      updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+      t = array_ops.ones([8], dtype=dtypes.float32)
+
+      return array_ops.tensor_scatter_update(t, indices, updates)
+
+    self.assertAllEqual(_TestFn(), [1, 11, 1, 10, 9, 1, 1, 12])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
index 8f4e31abe3c90af01029be719ee83c7c7dc42f0c..554bf38029473bb9ff204a09556a182b378dd549 100644
--- a/tensorflow/python/kernel_tests/signal/BUILD
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -29,6 +29,7 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -45,6 +46,7 @@ cuda_py_tests(
     ],
     shard_count = 4,
     tags = ["optonly"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -56,6 +58,7 @@ cuda_py_tests(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/ops/signal",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -70,6 +73,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:spectral_ops_test_util",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -87,6 +91,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -104,6 +109,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -125,6 +131,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
     ],
     tags = ["nomac"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -140,4 +147,5 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/signal/dct_ops_test.py b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
index a3ac15bab8a7b8223bd1ea085386b965b7fdd62e..e698afde635d58e159340bac20bdd12d8cf3e711 100644
--- a/tensorflow/python/kernel_tests/signal/dct_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
@@ -134,6 +134,7 @@ class DCTOpsTest(parameterized.TestCase, test.TestCase):
   @parameterized.parameters([
       [[2]], [[3]], [[10]], [[2, 20]], [[2, 3, 25]]])
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def test_random(self, shape):
     """Test randomly generated batches of data."""
     with spectral_ops_test_util.fft_kernel_label_map():
diff --git a/tensorflow/python/kernel_tests/signal/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
index 5b1053428c0096c15fce7c4fa7b46d5999602057..f3bee87a65f484788aa027fd51f7d006c5dcf24f 100644
--- a/tensorflow/python/kernel_tests/signal/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -159,6 +159,7 @@ class FFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -169,6 +170,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 1e-8)):
@@ -178,6 +180,7 @@ class FFTOpsTest(BaseFFTOpsTest):
                 np.mod(np.arange(np.power(4, dims)), 10).reshape(
                     (4,) * dims).astype(np_type), rank, rtol=tol, atol=tol)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testLargeBatch(self):
     if test.is_gpu_available(cuda_only=True):
       rank = 1
@@ -209,6 +212,7 @@ class FFTOpsTest(BaseFFTOpsTest):
                 rank, use_placeholder=True, rtol=tol, atol=tol)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 5e-6)):
@@ -224,6 +228,7 @@ class FFTOpsTest(BaseFFTOpsTest):
                           rtol=tol, atol=tol)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testRandom1D(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -340,6 +345,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -350,6 +356,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -364,6 +371,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
             self._compareBackward(
                 c2r.astype(np.complex64), rank, (size,) * rank)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testLargeBatch(self):
     if test.is_gpu_available(cuda_only=True):
       rank = 1
@@ -397,6 +405,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 rank, (size,) * rank,
                 use_placeholder=True)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testFftLength(self):
     if test.is_gpu_available(cuda_only=True):
       with spectral_ops_test_util.fft_kernel_label_map():
@@ -440,6 +449,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                   use_placeholder=True)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       def gen_real(shape):
@@ -465,6 +475,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 gen_complex(complex_dims), rank, (size,) * rank)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
diff --git a/tensorflow/python/kernel_tests/signal/spectral_ops_test.py b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
index 7b9748c7f260b60d7322a6de68e35970513ac969..a72a836592667889b7c54665c2277eaffd40846c 100644
--- a/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -115,6 +116,7 @@ class SpectralOpsTest(test.TestCase):
       self.assertAllClose(
           expected_inverse_stft, actual_inverse_stft, 1e-4, 1e-4)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def test_shapes(self):
     with spectral_ops_test_util.fft_kernel_label_map(), (
         self.session(use_gpu=True)):
@@ -150,6 +152,7 @@ class SpectralOpsTest(test.TestCase):
       self.assertAllEqual([256], inverse_stft.shape.as_list())
       self.assertAllEqual([expected_length], self.evaluate(inverse_stft).shape)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def test_stft_and_inverse_stft(self):
     """Test that spectral_ops.stft/inverse_stft match a NumPy implementation."""
     # Tuples of (signal_length, frame_length, frame_step, fft_length).
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index e96bc09f3652aaa4d41bddac6ad06daaff8bfbd6..7f3c381fa161bd59b1956e880d82e62d6b051b25 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -285,6 +285,7 @@ class SpaceToDepthTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
+  @test_util.disable_xla("This test never passed for XLA")
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 7598991489ce6019352e19cb6c50819d91085b0d..ede12d1c83fb559f2164c0e7f46640315d0ced62 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -72,11 +72,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtype),
         constant_op.constant(shape, dtypes.int64))
 
-  @test_util.run_deprecated_v1
   def testInt32(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6(dtypes.int32)
-      output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 50)
 
       expected_output = np.zeros((5, 50), dtype=np.bool)
       expected_trues = ((0, 0), (1, 10), (1, 13), (1, 14), (3, 32), (3, 33))
@@ -85,11 +84,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
-  @test_util.run_deprecated_v1
   def testInt64(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6(dtypes.int64)
-      output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 50)
 
       expected_output = np.zeros((5, 50), dtype=np.bool)
       expected_trues = [(0, 0), (1, 10), (1, 13), (1, 14), (3, 32), (3, 33)]
@@ -98,11 +96,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
-  @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x3x4(dtypes.int64)
-      output = sparse_ops.sparse_to_indicator(sp_input, 200).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 200)
 
       expected_output = np.zeros((2, 3, 200), dtype=np.bool)
       expected_trues = [(0, 0, 1), (0, 1, 10), (0, 1, 12), (1, 0, 103),
@@ -151,7 +148,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt32AndFloat32(self):
     vocab_size = 50
     indices_v, values_v = self._SparseTensorValue_3x50(np.int32, np.float32)
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for indices in (indices_v,
                       sparse_tensor.SparseTensor.from_value(indices_v)):
         for values in (values_v,
@@ -163,7 +160,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -172,7 +169,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -181,7 +178,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt32AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int32, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
@@ -191,7 +188,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
@@ -202,7 +199,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt64AndFloat64NonCanonicalOrder(self):
     vocab_size = 50
     vocab_size_tensor = constant_op.constant(vocab_size, dtypes.int64)
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size_tensor, already_sorted=True)
@@ -261,7 +258,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = [50, 31]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -270,7 +267,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64(self):
     vocab_size = [50, 31]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -279,7 +276,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64Shape(self):
     vocab_size = [50, 30]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -300,9 +297,8 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
   def _SparseTensor_5x6(self):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_5x6())
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         to_retain = np.array([1, 0, 0, 1, 1, 0], dtype=np.bool)
         sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
@@ -314,7 +310,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testRetainNone(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6()
       to_retain = np.zeros((6,), dtype=np.bool)
       sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
@@ -326,7 +322,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testMismatchedRetainShape(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6()
       to_retain = np.array([1, 0, 0, 1, 0], dtype=np.bool)
       with self.assertRaises(ValueError):
@@ -358,16 +354,14 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
-  @test_util.run_deprecated_v1
   def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 6, 7], dtype=np.int64)
     sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
     self.assertAllEqual([3, 6, 7], sp_output.get_shape())
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -379,9 +373,8 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
-  @test_util.run_deprecated_v1
   def testInputUnavailableInGraphConstructionOk(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensorValue_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -409,7 +402,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
   def testTightBoundingBox(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
@@ -421,7 +414,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [2, 4, 5])
 
   def testTightBoundingBoxEmpty(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6_Empty()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
@@ -431,9 +424,8 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.values.shape, [0])
       self.assertAllEqual(output.dense_shape, [0, 0, 0])
 
-  @test_util.run_deprecated_v1
   def testInvalidRank(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 7], dtype=np.int64)
 
@@ -450,7 +442,6 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
-  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 7, 5], dtype=np.int64)
@@ -510,14 +501,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtypes.int32),
         constant_op.constant(shape, dtypes.int64))
 
-  @test_util.run_deprecated_v1
   def testFillNumber(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         sp_output, empty_row_indicator = (
             sparse_ops.sparse_fill_empty_rows(sp_input, -1))
 
-        output, empty_row_indicator_out = sess.run(
+        output, empty_row_indicator_out = self.evaluate(
             [sp_output, empty_row_indicator])
 
         self.assertAllEqual(
@@ -530,7 +520,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testFillFloat(self):
-    with self.session(use_gpu=False) as sess:
+    with self.session(use_gpu=False):
       values = constant_op.constant(
           [0.0, 10.0, 13.0, 14.0, 32.0, 33.0], dtype=dtypes.float64)
       default_value = constant_op.constant(-1.0, dtype=dtypes.float64)
@@ -540,7 +530,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
           dense_shape=np.array([5, 6]))
       sp_output, empty_row_indicator = (sparse_ops.sparse_fill_empty_rows(
           sp_input, default_value))
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4],
@@ -563,14 +553,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertGreater(default_value_grad_err, 0)
       self.assertLess(default_value_grad_err, 1e-8)
 
-  @test_util.run_deprecated_v1
   def testFillString(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_String5x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, ""))
 
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(
@@ -582,14 +571,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out,
                           np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
-  @test_util.run_deprecated_v1
   def testNoEmptyRows(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, -1))
 
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4]])
@@ -600,7 +588,6 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
 class SparseAddTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_deprecated_v1
   def testValuesInVariable(self):
     indices = constant_op.constant([[1]], dtype=dtypes.int64)
     values = variables.Variable([1], trainable=False, dtype=dtypes.float32)
@@ -609,7 +596,7 @@ class SparseAddTest(test_util.TensorFlowTestCase):
     sp_input = sparse_tensor.SparseTensor(indices, values, shape)
     sp_output = sparse_ops.sparse_add(sp_input, sp_input)
 
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       self.evaluate(variables.global_variables_initializer())
       output = self.evaluate(sp_output)
       self.assertAllEqual(output.values, [2])
@@ -625,7 +612,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
   dense_shape = np.array([2, 3]).astype(np.int64)
 
   def _compare(self, sp_t, reduction_axes, ndims, keep_dims, do_sum):
-    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+    densified = self.evaluate(sparse_ops.sparse_tensor_to_dense(sp_t))
 
     np_ans = densified
     if reduction_axes is None:
@@ -665,7 +652,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
                                                             reduction_axes,
                                                             keep_dims)
       # Convert to dense for comparison purposes.
-      out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans).eval()
+      out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans)
 
     self.assertAllClose(np_ans, out_dense)
     self.assertAllClose(np_ans, out_sparse)
@@ -676,14 +663,13 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
     self._compare(sp_t, reduction_axes, ndims, True, False)
     self._compare(sp_t, reduction_axes, ndims, True, True)
 
-  @test_util.run_deprecated_v1
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       self._compare_all(sp_t, None, ndims=2)
       self._compare_all(sp_t, 0, ndims=2)
       self._compare_all(sp_t, [1], ndims=2)
@@ -694,7 +680,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
     np.random.seed(1618)
     test_dims = [(1618, 1, 11, 7, 1), (1,), (1, 1, 1)]
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for dims in test_dims:
         sp_t, unused_nnz = _sparsify(np.random.randn(*dims))
         # reduce all using None
@@ -706,15 +692,15 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def testInvalidAxes(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
+        self.evaluate(sparse_ops.sparse_reduce_sum(sp_t, -3))
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_sum(sp_t, 2).eval()
+        self.evaluate(sparse_ops.sparse_reduce_sum(sp_t, 2))
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_max(sp_t, -3).eval()
+        self.evaluate(sparse_ops.sparse_reduce_max(sp_t, -3))
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_max(sp_t, 2).eval()
+        self.evaluate(sparse_ops.sparse_reduce_max(sp_t, 2))
 
   @test_util.run_deprecated_v1
   def testGradient(self):
@@ -745,7 +731,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def _testSparseReduceShape(self, sp_t, reduction_axes, ndims, keep_dims,
                              do_sum):
-    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+    densified = self.evaluate(sparse_ops.sparse_tensor_to_dense(sp_t))
 
     np_op = np.sum
     tf_op = sparse_ops.sparse_reduce_sum
@@ -773,7 +759,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
   def testSparseReduceSumOrMaxShape(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for do_sum in [True, False]:
         for keep_dims in [True, False]:
           self._testSparseReduceShape(sp_t, None, 2, keep_dims, do_sum)
@@ -790,19 +776,17 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
   def _check(self, result_tensor, result_np, input_sp_t):
     self.assertTrue(isinstance(result_tensor, sparse_tensor.SparseTensor))
     self.assertTrue(isinstance(input_sp_t, sparse_tensor.SparseTensor))
-    self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval())
-    self.assertAllEqual(input_sp_t.dense_shape.eval(),
-                        result_tensor.dense_shape.eval())
+    self.assertAllEqual(input_sp_t.indices, result_tensor.indices)
+    self.assertAllEqual(input_sp_t.dense_shape, result_tensor.dense_shape)
 
-    res_densified = sparse_ops.sparse_to_dense(result_tensor.indices,
-                                               result_tensor.dense_shape,
-                                               result_tensor.values).eval()
+    res_densified = sparse_ops.sparse_to_dense(
+        result_tensor.indices, result_tensor.dense_shape, result_tensor.values)
     self.assertAllEqual(result_np, res_densified)
 
   @test_util.run_deprecated_v1
   def testCwiseShapeValidation(self):
     # Test case for GitHub 24072.
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       a = array_ops.ones([3, 4, 1], dtype=dtypes.int32)
       b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20],
                                      [1, 1, 4, 2])
@@ -810,21 +794,20 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "broadcasts dense to sparse only; got incompatible shapes"):
-        c.eval()
+        self.evaluate(c)
 
-  @test_util.run_deprecated_v1
   def testCwiseDivAndMul(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
     dense_shapes = [(10, 10, 1), (5, 5), (1,), (1, 7)]
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for dtype in [np.float32, np.float64, np.int32, np.int64]:
         for sp_shape, dense_shape in zip(sp_shapes, dense_shapes):
           sp_vals_np = np.random.rand(*sp_shape).astype(dtype) + 1
           dense_vals_np = np.random.rand(*dense_shape).astype(dtype) + 1
           sp_t, unused_nnz = _sparsify(sp_vals_np, thresh=1.5)
-          sp_t_densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+          sp_t_densified = sparse_ops.sparse_tensor_to_dense(sp_t)
           dense_t = constant_op.constant(dense_vals_np)
 
           self._check(sp_t / dense_t, sp_t_densified / dense_vals_np, sp_t)
@@ -834,11 +817,10 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
           if dtype in [np.int32, np.int64]:
             res = sp_t / dense_t  # should invoke "__truediv__"
-            self.assertEqual(res.values.eval().dtype, np.float64)
+            self.assertEqual(res.values.dtype, np.float64)
 
-  @test_util.run_deprecated_v1
   def testCwiseAdd(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # Identity(2) + AllOnes(2,2).  Should be equal to 2 * Identity(2).
       indices = [[0, 0], [1, 1]]
       vals = [1, 1]
@@ -901,16 +883,15 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       batched_sp_t, unused_nnz1 = _sparsify(
           sp_vals_np.reshape((1, n, m)), thresh=0.)  # No masking.
 
-      with self.cached_session(use_gpu=False):
+      with test_util.force_cpu():
         densified = constant_op.constant(sp_vals_np)
 
-        sp_result = sparse_ops.sparse_softmax(batched_sp_t).eval(
-        ).values.reshape((n, m))
+        sp_result = self.evaluate(
+            sparse_ops.sparse_softmax(batched_sp_t)).values.reshape((n, m))
         dense_result = nn_ops.softmax(densified)
 
-        self.assertAllClose(dense_result.eval(), sp_result)
+        self.assertAllClose(dense_result, sp_result)
 
-  @test_util.run_deprecated_v1
   def testHigherRanks(self):
     # For the first shape:
     # First batch:
@@ -933,11 +914,11 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       sp_t, unused_nnz = _sparsify(values, thresh=1e-2)
       expected_values = [1., 1., 1., .5, .5]
 
-      with self.cached_session(use_gpu=False):
-        result = sparse_ops.sparse_softmax(sp_t).eval()
+      with test_util.force_cpu():
+        result = sparse_ops.sparse_softmax(sp_t)
 
         self.assertAllEqual(expected_values, result.values)
-        self.assertAllEqual(sp_t.indices.eval(), result.indices)
+        self.assertAllEqual(sp_t.indices, result.indices)
         self.assertAllEqual(shape, result.dense_shape)
 
   @test_util.run_deprecated_v1
@@ -960,25 +941,24 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.cached_session(use_gpu=False):
+    with test_util.force_cpu():
       # 1-D, values at index 0.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [7])
-      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
-      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_one).eval()
-      self._assertSparseTensorValueEqual(sp_one.eval(), max_tf)
-      self._assertSparseTensorValueEqual(sp_zero.eval(), min_tf)
+      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_one)
+      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_one)
+      self._assertSparseTensorValueEqual(sp_one, max_tf)
+      self._assertSparseTensorValueEqual(sp_zero, min_tf)
 
       # Values at different indices.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_zero_2 = sparse_tensor.SparseTensor([[1]], [0], [7])
       expected = sparse_tensor.SparseTensor([[0], [1]], [0, 0], [7])
-      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_zero_2).eval()
-      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_zero_2).eval()
-      self._assertSparseTensorValueEqual(expected.eval(), max_tf)
-      self._assertSparseTensorValueEqual(expected.eval(), min_tf)
+      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_zero_2)
+      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_zero_2)
+      self._assertSparseTensorValueEqual(expected, max_tf)
+      self._assertSparseTensorValueEqual(expected, min_tf)
 
   @test_util.run_deprecated_v1
   def testRandom(self):
@@ -1008,37 +988,36 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
             np.minimum(a_densified, b_densified), minimum_tf_densified)
 
   def testMismatchedShapes(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_zero = sparse_tensor.SparseTensor([[0, 0]], [0], [1, 1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands do not have the same ranks"):
-        sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
+        self.evaluate(sparse_ops.sparse_maximum(sp_zero, sp_one))
 
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands' shapes do not match"):
-        sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
+        self.evaluate(sparse_ops.sparse_maximum(sp_zero, sp_one))
 
 
 class SparseTransposeTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def testTranspose(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       np.random.seed(1618)
       shapes = [np.random.randint(1, 10, size=rank) for rank in range(1, 6)]
       for shape in shapes:
         for dtype in [np.int32, np.int64, np.float32, np.float64]:
           dn_input = np.random.randn(*shape).astype(dtype)
-          rank = array_ops.rank(dn_input).eval()
+          rank = self.evaluate(array_ops.rank(dn_input))
           perm = np.random.choice(rank, rank, False)
           sp_input, unused_a_nnz = _sparsify(dn_input)
           sp_trans = sparse_ops.sparse_transpose(sp_input, perm=perm)
-          dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans).eval()
-          expected_trans = array_ops.transpose(dn_input, perm=perm).eval()
+          dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans)
+          expected_trans = array_ops.transpose(dn_input, perm=perm)
           self.assertAllEqual(expected_trans.shape, sp_trans.get_shape())
           self.assertAllEqual(dn_trans, expected_trans)
 
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 517db3450f3c43ea0989b59db5ccc7c089e9cec3..80004db833c50ef460ed16237f4f775eb80b6877 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -373,6 +373,7 @@ class SplitOpTest(test.TestCase):
     assert s1.shape.as_list() == [1]
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 83e06ba48bdbbe3189eafde7d0f42c2e4ced68ab..29cd00b78923cf7413114f858fe4c23a379a5af5 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -166,6 +166,7 @@ class StageTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testCapacity(self):
+    self.skipTest('b/123423516 this test is flaky on gpu.')
     capacity = 3
 
     with ops.Graph().as_default() as G:
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 3b2a56bd1ff6ef81ae17773fd5a23bc96778ce63..f587a7ec4329a1b9a4df5bbfb3d8edcc1773cbcb 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -160,6 +160,21 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(result))
     self.assertNotEqual(len(first), len(result))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_template_with_empty_name(self):
+    tpl = template.make_template("", variable_scoped_function)
+    with variable_scope.variable_scope("outer"):
+      x = variable_scope.get_variable("x", [])
+      v = tpl()
+    self.assertEqual("outer/", tpl.variable_scope_name)
+    self.assertEqual("outer//dummy:0", v.name)
+    if context.executing_eagerly():
+      # In eager mode `x` is not visible to the template since the template does
+      # not rely on global collections.
+      self.assertEqual([v], tpl.variables)
+    else:
+      self.assertEqual([x, v], tpl.variables)
+
   @test_util.run_in_graph_and_eager_modes
   def test_template_with_name(self):
     tmpl1 = template.make_template("s1", variable_scoped_function)
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 147e7fde5793d4ac0b85696715aa7645f8e79bb2..d0efb47a3896205547c1e1ded7904bb2e319705a 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -185,8 +185,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_v1_only("b/118890905")
+  @test_util.disable_control_flow_v2("b/122324791")
+  @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -202,8 +202,8 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_v1_only("b/118890905")
+  @test_util.disable_control_flow_v2("b/122324791")
+  @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
@@ -1000,13 +1000,11 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
 
-  @test_util.disable_control_flow_v2("b/119323158")
   def testGradSerialTwoLoops(self):
     with self.session(use_gpu=True):
       def loop(x):
@@ -1207,11 +1205,14 @@ class TensorArrayTest(test.TestCase):
       c1 = constant_op.constant([4.0, 5.0])
       w1 = w0.write(3, c1)
 
-      with self.assertRaisesOpError(
-          r"Could not read index 0 twice because it was cleared after a "
-          r"previous read \(perhaps try setting clear_after_read = false\?\)"):
-        with ops.control_dependencies([r0]):
-          self.evaluate(w1.read(0))
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        # TensorArray v2 does not support clear_after_read.
+        with self.assertRaisesOpError(
+            r"Could not read index 0 twice because it was cleared after a "
+            r"previous read \(perhaps try setting clear_after_read = false\?\)"
+        ):
+          with ops.control_dependencies([r0]):
+            self.evaluate(w1.read(0))
 
       r1 = w1.read(1)
       self.assertAllEqual(c1.get_shape(), r1.shape)
@@ -1220,7 +1221,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w1.write(4, c2)
 
-  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   @test_util.run_v1_only("b/117943489")
   def testUnpackShape(self):
     self._testUnpackShape()
@@ -1281,13 +1281,23 @@ class TensorArrayTest(test.TestCase):
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
 
-  # TODO(srbs): Figure out how to enable this. This is probably failing
-  # because we are trying to stack a TensorList with invalid tensors.
-  # That is because we do not receive gradients for all list indices.
-  # Figure out how TensorArray handles this.
-  def disabletestGradientWhenNotAllComponentsRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGradientWhenNotAllComponentsRead(self):
     self._testGradientWhenNotAllComponentsRead()
 
+  @test_util.run_deprecated_v1
+  def testSkipEagerWriteButNotAllComponentsReadGrad(self):
+    with self.cached_session(use_gpu=True) as session:
+      x0 = constant_op.constant(5.0)
+      x1 = constant_op.constant(10.0)
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, size=2).write(0, x0).write(1, x1)
+      r0 = ta.read(0)
+      # calculate (dr0/dx0, dr0/dx1).  since r0 = x0, gradients are (1, 0).
+      grad_r0_x1 = gradients_impl.gradients(ys=[r0], xs=[x0, x1], grad_ys=[1.0])
+      grad_r0_x1_vals = session.run(grad_r0_x1)
+      self.assertAllEqual(grad_r0_x1_vals, [1.0, 0.0])
+
   def _testTensorArrayUnpackDynamic(self):
     with self.cached_session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
@@ -1300,12 +1310,10 @@ class TensorArrayTest(test.TestCase):
       grad = gradients_impl.gradients(ys=[r], xs=[x])
       self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArraySplitDynamic(self):
     with self.session(use_gpu=True) as sess:
@@ -1323,8 +1331,8 @@ class TensorArrayTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      v2_msg = ("Tried to stack elements of a empty list with "
-                "non-fully-defined shape")
+      v2_msg = ("Tried to stack elements of an empty list with "
+                "non-fully-defined element_shape")
       v1_msg = (
           "TensorArray has size zero, but element shape <unknown> is not "
           "fully defined. Currently only static shapes are supported when "
@@ -1345,7 +1353,10 @@ class TensorArrayTest(test.TestCase):
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
       # Don't actually perform the pack.  This stores the static shape.
-      ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        ta = ta.unstack(array_ops.zeros([0, 3, 5]))
+      else:
+        ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       concatenated = ta.concat()
       self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
@@ -1353,12 +1364,10 @@ class TensorArrayTest(test.TestCase):
       # first dimension of zero
       self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayScatterReadAndGradients(self):
     with self.session(use_gpu=True) as session:
@@ -1386,8 +1395,30 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.disable_control_flow_v2("b/117943286")
-  @test_util.run_v1_only("b/117943286")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayScatterPartialReadAndGradients(self):
+    with self.session(use_gpu=True) as session:
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=0,
+          dynamic_size=True)
+
+      indices = constant_op.constant([1, 8])
+      value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
+
+      w = ta.scatter(indices, value)
+      r0 = w.read(1)
+
+      # Test combined gradients + aggregation of read(0)
+      grad = gradients_impl.gradients(
+          ys=[r0], xs=[value], grad_ys=[[2.0, 3.0]])[0]
+      read_val, grad_val = session.run([r0, grad])
+
+      self.assertAllEqual([1.0, -1.0], read_val)
+      self.assertAllEqual([[2.0, 3.0], [0.0, 0.0]], grad_val)
+
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 07807e89d0e60bf5e053e75618112e266a3ca882..028ef11fc496725fd6535dd28196e9fadcf2fee4 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -43,6 +43,11 @@ from tensorflow.python.util import compat
 
 class VariablesTestCase(test.TestCase):
 
+  @test_util.run_deprecated_v1
+  def testDistributeStrategy(self):
+    v = variables.VariableV1(0.0)
+    self.assertIsNone(v._distribute_strategy)
+
   @test_util.run_v1_only("b/120545219")
   def testInitialization(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index 6567ac9429ad6b2afb7f985de4e1c161fcea80cf..061d787760889cb344967fee861147d1d4674ad2 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -22,6 +22,9 @@ from absl.testing import parameterized
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
@@ -33,6 +36,7 @@ from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.ops import while_v2
 from tensorflow.python.ops.control_flow_ops import while_loop as while_loop_v1
 from tensorflow.python.ops.while_v2 import while_loop as while_loop_v2
@@ -64,6 +68,21 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertEqual(16., eval_result[0])
       self.assertSequenceEqual(sess.run(grad), [32.])
 
+  def testGradientTapeResourceVariable(self):
+    with context.eager_mode():
+      v = variables.Variable(1.)
+
+      @def_function.function
+      def fnWithLoop():  # pylint: disable=invalid-name
+        with backprop.GradientTape() as tape:
+          _, x = while_loop_v2(
+              lambda i, _: i < 2,
+              lambda i, x: (i + 1, x * v),
+              [0, 2.])
+        return tape.gradient(x, v)
+
+      self.assertAllEqual(fnWithLoop(), 4.0)
+
   @test_util.run_deprecated_v1
   def testMultipleLoopVarsBasic(self):
     x = constant_op.constant(5.)
@@ -116,6 +135,18 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertSequenceEqual(self.evaluate(grady_1), [6.])
       self.assertSequenceEqual(self.evaluate(grady_2), [61.])
 
+  @test_util.run_deprecated_v1
+  def testGradientTape(self):
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(2.)
+      t.watch(x)
+      ret = while_loop_v2(
+          lambda v: v < 4., lambda v: v * v, [x],
+          return_same_structure=False)  # x**2
+    grad = t.gradient(ret, x)
+    with self.cached_session() as sess:
+      self.assertAllEqual(sess.run(grad), 4.0)
+
   @test_util.run_deprecated_v1
   def testMultipleWhileLoops(self):
     x = constant_op.constant(2.)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 5354d437b481195f81dba8f4c1bbf3d12e67d1a7..1b84ec1f69ed55a5c86c7767e986c7bc542e1841 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -554,7 +554,7 @@ class Layer(base_layer.Layer):
 
   def __setattr__(self, value, name):
     # By-pass the automatic dependency tracking performed by the parent Layer.
-    super(checkpointable.CheckpointableBase, self).__setattr__(value, name)
+    super(checkpointable.Checkpointable, self).__setattr__(value, name)
 
 
 def _add_elements_to_collection(elements, collection_list):
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 5d4805e245e17376e8719466868326b34d7cf12d..03344c844d35aa74c09ccc9cc308fa921b4d1789 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -117,7 +117,7 @@ class Conv1D(keras_layers.Conv1D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv1d instead.')
+    instructions='Use tf.keras.layers.Conv1D instead.')
 @tf_export(v1=['layers.conv1d'])
 def conv1d(inputs,
            filters,
@@ -316,7 +316,7 @@ class Conv2D(keras_layers.Conv2D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv2d instead.')
+    instructions='Use tf.keras.layers.Conv2D instead.')
 @tf_export(v1=['layers.conv2d'])
 def conv2d(inputs,
            filters,
@@ -523,7 +523,7 @@ class Conv3D(keras_layers.Conv3D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv3d instead.')
+    instructions='Use tf.keras.layers.Conv3D instead.')
 @tf_export(v1=['layers.conv3d'])
 def conv3d(inputs,
            filters,
@@ -853,7 +853,7 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.separable_conv1d instead.')
+    instructions='Use tf.keras.layers.SeparableConv1D instead.')
 @tf_export(v1=['layers.separable_conv1d'])
 def separable_conv1d(inputs,
                      filters,
@@ -973,7 +973,7 @@ def separable_conv1d(inputs,
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.separable_conv2d instead.')
+    instructions='Use tf.keras.layers.SeparableConv2D instead.')
 @tf_export(v1=['layers.separable_conv2d'])
 def separable_conv2d(inputs,
                      filters,
@@ -1183,7 +1183,7 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv2d_transpose instead.')
+    instructions='Use tf.keras.layers.Conv2DTranspose instead.')
 @tf_export(v1=['layers.conv2d_transpose'])
 def conv2d_transpose(inputs,
                      filters,
@@ -1363,7 +1363,7 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv3d_transpose instead.')
+    instructions='Use tf.keras.layers.Conv3DTranspose instead.')
 @tf_export(v1=['layers.conv3d_transpose'])
 def conv3d_transpose(inputs,
                      filters,
diff --git a/tensorflow/python/module/BUILD b/tensorflow/python/module/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8aa3a199b562a382f4e74c0df15c711f7312b035
--- /dev/null
+++ b/tensorflow/python/module/BUILD
@@ -0,0 +1,31 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "module",
+    srcs = ["module.py"],
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/checkpointable:tracking",
+        "@six_archive//:six",
+    ],
+)
+
+tf_py_test(
+    name = "module_test",
+    srcs = ["module_test.py"],
+    additional_deps = [
+        ":module",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/compat:v2_compat",
+        "//tensorflow/python:variables",
+    ],
+)
diff --git a/tensorflow/python/module/module.py b/tensorflow/python/module/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..62d1bd46c4180c30930ca25d01981828a8da9893
--- /dev/null
+++ b/tensorflow/python/module/module.py
@@ -0,0 +1,410 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Modules encapsulate building stateful components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import sys
+
+import six
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
+
+
+class ModuleMetaclass(type):
+  """Metaclass for `tf.Module`."""
+
+  def __new__(mcs, name, bases, clsdict):
+    for key, value in clsdict.items():
+      if key in ("__init__", "name_scope"):
+        continue
+
+      elif tf_inspect.isfunction(value):
+        if getattr(value, "_no_module_name_scope", False):
+          # The function has been annotated to say that no autoscoping should
+          # be applied, so do not patch it.
+          continue
+        clsdict[key] = with_name_scope(value)
+
+      elif isinstance(value, property):
+        clsdict[key] = property(
+            value.fget if not value.fget else with_name_scope(value.fget),
+            value.fset if not value.fset else with_name_scope(value.fset),
+            value.fdel if not value.fdel else with_name_scope(value.fdel),
+            doc=value.__doc__)
+
+    return type.__new__(mcs, name, bases, clsdict)
+
+  def __call__(cls, *args, **kwargs):
+    # Call new such that we have an un-initialized module instance that we can
+    # still reference even if there is an exception during __init__. This is
+    # needed such that we can make sure the name_scope constructed in __init__
+    # is closed even if there is an exception.
+    module = cls.__new__(cls, *args, **kwargs)
+
+    # Now attempt to initialize the object.
+    try:
+      module.__init__(*args, **kwargs)
+    except:
+      # We must explicitly catch so that in Python 2 sys.exc_info() is populated
+      # before entering the finally block.
+      raise
+
+    finally:
+      # The base Module constructor enters the modules name scope before
+      # returning such that other functionality in the ctor happens within the
+      # modules name scope.
+      scope = getattr(module, "_ctor_name_scope", None)
+      exc_info = sys.exc_info()
+      if scope is None:
+        if exc_info[0] is None:
+          raise ValueError(
+              "Constructing a tf.Module without calling the super constructor "
+              "is not supported. Add the following as the first line in your "
+              "__init__ method:\n\n"
+              "super(%s, self).__init__()" % cls.__name__)
+      else:
+        scope.__exit__(*exc_info)
+        del module._ctor_name_scope
+
+    return module
+
+
+def with_name_scope(unbound_method):
+  """Patches the given method so it enters the modules name scope."""
+  def enter_name_scope(self, *args, **kwargs):
+    """Decorator that calls the given function in the module name scope.
+
+    Args:
+      self: Module instance.
+      *args: Positional arguments to `unbound_method`.
+      **kwargs: Keyword arguments to `unbound_method`.
+
+    Returns:
+      `with self.name_scope: return unbound_method(self, *args, **kwargs)`
+    """
+    try:
+      module_name_scope = self.name_scope
+    except AttributeError as exc_value_from:
+      exc_value = AttributeError(
+          "The super constructor must be called before any other methods in "
+          "your constructor. If this is not possible then annotate all the "
+          "methods called with `@no_module_name_scope`.")
+      six.raise_from(exc_value, exc_value_from)
+
+    with module_name_scope:
+      # tf.Module enters the module name scope for all methods. To disable this
+      # for a particular method annotate it with `@no_module_name_scope`.
+      return unbound_method(self, *args, **kwargs)
+
+  return tf_decorator.make_decorator(unbound_method, enter_name_scope)
+
+
+@tf_export("experimental.Module")
+class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
+  """Base neural network module class.
+
+  A module is a named container for `tf.Variable`s, other `tf.Module`s and
+  functions which apply to user input. For example a dense layer in a neural
+  network might be implemented as a `tf.Module`:
+
+  >>> class Dense(tf.Module):
+  ...   def __init__(self, in_features, output_features):
+  ...     super(Linear, self).__init__()
+  ...     self.w = tf.Variable(
+  ...         tf.random_normal([input_features, output_features]), name='w')
+  ...     self.b = tf.Variable(tf.zeros([output_features]), name='b')
+  ...
+  ...   def __call__(self, x):
+  ...     x = tf.convert_to_tensor(x, name='x')
+  ...     y = tf.matmul(x, self.w) + self.b
+  ...     return tf.nn.relu(y)
+
+  You can use the dense layer as you would expect:
+
+  >>> d = Dense(input_features=64, output_features=10)
+  >>> d(tf.ones([100, 64]))
+  <tf.Tensor: ...>
+
+  By subclassing `tf.Module` instead of `object` any variables created inside
+  the module are automatically created within the modules name scope:
+
+  >> d.w.name
+  "dense/w:0"
+
+  In eager mode this is useful for debugging, and when used with `@tf.function`
+  the use of name scopes gives operations (e.g. matmul) useful names as well.
+
+  As well as automatic naming, the Dense module inherits methods for tracking
+  its variables:
+
+  >>> d.variables
+  (<tf.Variable 'dense/b:0' ...>, <tf.Variable 'dense/w:0' ...>)
+  """
+
+  def __init__(self, name=None):
+    if name is None:
+      name = camel_to_snake(type(self).__name__)
+    else:
+      if not valid_identifier(name):
+        raise ValueError(
+            "%r is not a valid module name. Module names must be valid Python "
+            "identifiers (e.g. a valid class name)." % name)
+
+    self._name = name
+    with ops.name_scope(name) as scope_name:
+      self._scope_name = scope_name
+
+    # Enter the name scope so subsequent code in the contructor (e.g. creating
+    # submodules) happens inside the modules name scope. This is exited when
+    # the subclass __init__ returns (this is implemented in ModuleMetaclass).
+    self._ctor_name_scope = self.name_scope
+    self._ctor_name_scope.__enter__()
+
+  @property
+  def name(self):
+    """Returns the name of this module as passed or determined in the ctor.
+
+    NOTE: This is not the same as the `self.name_scope.name` which includes
+    parent module names.
+    """
+    return self._name
+
+  @property
+  def name_scope(self):
+    """Returns a `tf.name_scope` instance for this class."""
+    # TODO(tomhennigan) Memoize once name scopes are re-entrant.
+    return ops.name_scope(self._scope_name)
+
+  @property
+  def variables(self):
+    """Collection of variables owned by this module and it's submodules.
+
+    Note: this method uses reflection to find variables on the current instance
+    and submodules. For performance reasons you may wish to cache the result
+    of calling this method if you don't expect the return value to change.
+
+    Returns:
+      A collection of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (depth first).
+    """
+    return tuple(walk(self, recurse_if=_IS_MODULE, predicate=_IS_VARIABLE))
+
+  @property
+  def owned_variables(self):
+    """Collection of variables that are attributes of the current module.
+
+    See `variables` for a property which returns all variables from the current
+    module and all it's submodules recursively.
+
+    Returns:
+      A collection of variables which are attributes of the current module. Will
+      yield variables inside nested structures (lists etc) but not in other
+      modules.
+    """
+    return tuple(walk(self, predicate=_IS_VARIABLE))
+
+  @property
+  def trainable_variables(self):
+    """Collection of variables owned by this module and it's submodules.
+
+    Note: this method uses reflection to find variables on the current instance
+    and submodules. For performance reasons you may wish to cache the result
+    of calling this method if you don't expect the return value to change.
+
+    Returns:
+      A collection of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (depth first).
+    """
+    return tuple(
+        walk(self, recurse_if=_IS_MODULE, predicate=_IS_TRAINABLE_VARIABLE))
+
+  @property
+  def owned_trainable_variables(self):
+    """Collection of variables that are attributes of the current module.
+
+    See `variables` for a property which returns all variables from the current
+    module and all it's submodules recursively.
+
+    Returns:
+      A collection of variables which are attributes of the current module. Will
+      yield variables inside nested structures (lists etc) but not in other
+      modules.
+    """
+    return tuple(walk(self, predicate=_IS_TRAINABLE_VARIABLE))
+
+  @property
+  def owned_submodules(self):
+    """Collection of immediate child modules.
+
+    Child modules are modules which are found as properties of the current
+    module.
+
+    >>> a = tf.experimental.Module()
+    >>> b = tf.experimental.Module()
+    >>> c = tf.experimental.Module()
+    >>> a.b = b
+    >>> b.c = c
+    >>> assert list(a.owned_submodules) == [b]
+    >>> assert list(b.owned_submodules) == [c]
+    >>> assert list(c.owned_submodules) == []
+
+    Returns:
+      A collection of all child modules.
+    """
+    return tuple(walk(self, predicate=_IS_MODULE))
+
+  @property
+  def submodules(self):
+    """Collection of all sub-modules.
+
+    Submodules are modules which are properties of this module, or found as
+    properties of modules which are properties of this module (and so on).
+
+    >>> a = tf.experimental.Module()
+    >>> b = tf.experimental.Module()
+    >>> c = tf.experimental.Module()
+    >>> a.b = b
+    >>> b.c = c
+    >>> assert list(a.submodules) == [b, c]
+    >>> assert list(b.submodules) == [c]
+    >>> assert list(c.submodules) == []
+
+    Returns:
+      A collection of all submodules.
+    """
+    return tuple(walk(self, recurse_if=_IS_MODULE, predicate=_IS_MODULE))
+
+  @classmethod
+  def no_name_scope(cls, method):
+    """Decorator to wrap a method, preventing automatic name scope wrapping.
+
+    By default, any method on a module is considered as a forwards function, and
+    so any variables / modules created by the method will be scoped as belonging
+    to the module. In some cases this is undesirable, for example when
+    implementing .clone() / .transpose(), as in those cases we want the new
+    module to have the scope of wherever the .transpose() call is made. To
+    allow this, decorate any methods with `no_module_name_scope`.
+
+    This logic is tied to ModuleMetaclass.__new__, if anything is
+    changed here corresponding changes will be needed there.
+
+    Args:
+      method: the method to wrap.
+
+    Returns:
+      The method, with a flag indicating no name scope wrapping should occur.
+    """
+    setattr(method, "_no_module_name_scope", True)
+    return method
+
+_IS_VARIABLE = lambda o: isinstance(o, variables.Variable)
+_IS_TRAINABLE_VARIABLE = lambda o: (_IS_VARIABLE(o) and o.trainable)
+_IS_MODULE = lambda o: isinstance(o, Module)
+_CAMEL_TO_SNAKE_R = re.compile(r"((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))")
+_VALID_IDENTIFIER = re.compile(r"^[a-zA-Z_]([a-zA-Z0-9_])*$")
+
+
+def valid_identifier(name):
+  return bool(_VALID_IDENTIFIER.match(name))
+
+
+def camel_to_snake(value):
+  return _CAMEL_TO_SNAKE_R.sub(r"_\1", value).lower()
+
+
+def walk(o, recurse_if=None, predicate=None):
+  """Flattened attributes of `o` in sorted order by attribute name.
+
+  >>> class Foo(object):
+  ...   def __init__(self, prefix=''):
+  ...     self.z = prefix + 'c'
+  ...     self.a = [prefix + 'a', prefix + 'b']
+
+  >>> tuple(walk(Foo()))
+  ('a', 'b', 'c')
+
+  If `predicate` is not None, then only values matching predicate are returned:
+
+  >>> tuple(walk(Foo(), predicate=lambda v: v != 'a'))
+  ('b', 'c')
+
+  If `recurse_if` is not None then it should be a callable which tests if the
+  given leaf should be expanded:
+
+  >>> is_string = lambda v: isinstance(v, str)
+  >>> is_foo = lambda l: isinstance(l, Foo)
+  >>> o = Foo(prefix='root_')
+  >>> o.b = Foo(prefix='child_')
+  >>> tuple(walk(o, predicate=is_string))
+  ('root_a', 'root_b', 'root_c')
+  >>> tuple(walk(o, recurse_if=is_foo, predicate=is_string))
+  ('root_a', 'root_b', 'root_c', 'child_a', 'child_b', 'child_c')
+
+  Args:
+    o: An object who's attributes are walked.
+    recurse_if: (Optional) Visited items of this type will be walked to extract
+      more leaves. If `None`, it will not recurse into leaves.
+    predicate: (Optional) If set then only values matching predicate are
+      yielded.
+
+  Returns:
+    Attributes of `o` in name order. If `recurse_if` is not `None` then
+    attributes for which `recurse_if(attribute) == True` will be walked
+    recursively. If `predicate` is not `None` then only attributes for which
+    `predicate(attribute) == True` will be yielded.
+  """
+  if predicate is None:
+    predicate = lambda _: True
+  return _walk_internal(
+      o, recurse_if=recurse_if, predicate=predicate, seen=set())
+
+
+def _walk_internal(o, recurse_if, predicate, seen):
+  """Implementation of `walk`."""
+  if seen is None:
+    seen = set([id(o)])
+
+  o_dict = vars(o)
+  to_walk = []
+
+  for key in sorted(o_dict):
+    values = nest.flatten(o_dict[key])
+    for value in values:
+      value_id = id(value)
+      if value_id in seen:
+        continue
+
+      seen.add(value_id)
+      if predicate(value):
+        yield value
+
+      if recurse_if is not None and recurse_if(value):
+        # Walk direct properties first then recurse.
+        to_walk.append(value)
+
+  for value in to_walk:
+    for subvalue in _walk_internal(value, recurse_if, predicate, seen):
+      # Predicate is already tested for these values.
+      yield subvalue
diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4e1959c371c35f6f71bf2b73e4adfa75053c71
--- /dev/null
+++ b/tensorflow/python/module/module_test.py
@@ -0,0 +1,363 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.Module`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from absl.testing import parameterized
+
+from tensorflow.python.compat import v2_compat
+from tensorflow.python.framework import ops
+from tensorflow.python.module import module
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class TestModuleNaming(test.TestCase):
+
+  def test_single_name(self):
+    mod = module.Module(name="simple")
+    self.assertEqual(mod.name, "simple")
+    self.assertEqual(mod.name_scope.name, "simple/")
+
+  def test_construct_in_scope(self):
+    with ops.name_scope("foo"):
+      mod = module.Module(name="bar")
+    self.assertEqual(mod.name, "bar")
+    self.assertEqual(mod.name_scope.name, "foo/bar/")
+
+  def test_enters_name_scope_in_call(self):
+    mod = ReturnsNameScopeModule()
+    for _ in range(3):
+      self.assertEqual(mod(), mod.name_scope.name)
+
+  def test_enters_name_scope_in_other_method(self):
+    mod = ReturnsNameScopeModule()
+    for _ in range(3):
+      self.assertEqual(mod.alternative_forward(), mod.name_scope.name)
+
+  def test_subclassed_module(self):
+    mod = SubclassedReturnsNameScopeModule()
+    for _ in range(3):
+      self.assertEqual(mod.alternative_forward(), mod.name_scope.name)
+      self.assertEqual(mod.alternative_alternative_forward(),
+                       mod.name_scope.name)
+
+  def test_submodule_created_late(self):
+    m = TreeModule()
+    self.assertEqual(m.name, "tree_module")
+    self.assertEqual(m.name_scope.name, "tree_module/")
+    leaf1 = m.new_leaf()
+    self.assertEqual(leaf1.name, "tree_module")
+    self.assertEqual(leaf1.name_scope.name, "tree_module/tree_module/")
+
+  def test_does_not_evaluate_property_methods(self):
+    mod = PropertyThrowsWhenCalledModule()
+    with self.assertRaises(AssertionError):
+      mod.raise_assertion_error  # pylint: disable=pointless-statement
+
+  def test_overridden_name_scope(self):
+    mod = ModuleOverridingNameScope()
+    self.assertEqual(mod(), mod.name_scope.name)
+    self.assertEqual(mod.alternative_forward(), mod.name_scope.name)
+
+  def test_patched_callable(self):
+    with ops.name_scope("foo"):
+      mod = module.Module(name="bar")
+    mod.foo = get_name_scope
+    # `foo` is not a method so we do not re-enter the name scope.
+    self.assertEqual(mod.foo(), "")
+
+  def test_invalid_name(self):
+    msg = ".* is not a valid module name"
+    with self.assertRaisesRegexp(ValueError, msg):
+      module.Module(name="$Foo")
+
+  def test_modules_not_numbered_in_eager(self):
+    mod = RecursiveModule(2)
+    self.assertEqual(mod.name_scope.name, "badger/")
+    self.assertEqual(mod.child.name_scope.name, "badger/badger/")
+
+    mod = RecursiveModule(2)
+    self.assertEqual(mod.name_scope.name, "badger/")
+    self.assertEqual(mod.child.name_scope.name, "badger/badger/")
+
+  def test_module_numbering_in_graph(self):
+    with ops.Graph().as_default():
+      mod = RecursiveModule(2)
+      self.assertEqual(mod.name_scope.name, "badger/")
+      self.assertEqual(mod.child.name_scope.name, "badger/badger/")
+
+      mod = RecursiveModule(2)
+      self.assertEqual(mod.name_scope.name, "badger_1/")
+      self.assertEqual(mod.child.name_scope.name, "badger_1/badger/")
+
+  def test_ctor_error_closes_name_scope(self):
+    with self.assertRaises(ErrorModuleError):
+      # If super constructor is called then a name scope is opened then an error
+      # is thrown. The metaclass should handle this and close the namescope
+      # before re-throwing the exception.
+      ErrorModule(call_super=True)
+
+    self.assertEqual("", get_name_scope())
+
+  def test_ctor_error_handles_ctor_not_opening_name_scope(self):
+    with self.assertRaises(ErrorModuleError):
+      # If super ctor is not called then the name scope isn't opened. We need to
+      # ensure that this doesn't trigger an exception (e.g. the metaclass trying
+      # to __exit__ a non-existant name scope).
+      ErrorModule(call_super=False)
+
+    self.assertEqual("", get_name_scope())
+
+  def test_forward_method_closes_name_scope(self):
+    mod = ErrorModule(call_super=True, raise_in_constructor=False)
+    with self.assertRaises(ErrorModuleError):
+      mod()
+
+    self.assertEqual("", get_name_scope())
+
+
+class VariableNamingTest(test.TestCase):
+
+  def test_variable_names(self):
+    mod = RecursiveModule(3)
+    self.assertEqual(mod.w.name, "badger/mushroom:0")
+    self.assertEqual(mod.child.w.name, "badger/badger/mushroom:0")
+    self.assertEqual(mod.child.child.w.name, "badger/badger/badger/mushroom:0")
+
+
+class VariableTrackingTest(test.TestCase):
+
+  def test_variables(self):
+    m = RecursiveModule(3)
+    self.assertEqual(m.variables, (m.w, m.child.w, m.child.child.w))
+    self.assertEqual(m.child.variables, (m.child.w, m.child.child.w))
+    self.assertEqual(m.child.child.variables, (m.child.child.w,))
+
+  def test_owned_variables(self):
+    m = RecursiveModule(3)
+    self.assertEqual(m.owned_variables, (m.w,))
+    self.assertEqual(m.child.owned_variables, (m.child.w,))
+    self.assertEqual(m.child.child.owned_variables, (m.child.child.w,))
+
+  def test_trainable_variables(self):
+    m = RecursiveModule(3)
+    self.assertEqual(m.trainable_variables,
+                     (m.w, m.child.w, m.child.child.w))
+    self.assertEqual(m.child.trainable_variables,
+                     (m.child.w, m.child.child.w))
+    self.assertEqual(m.child.child.trainable_variables, (m.child.child.w,))
+
+  def test_trainable_variables_ignores_non_trainable(self):
+    m = RecursiveModule(3, trainable=False)
+    self.assertEqual(len(m.trainable_variables), 0)
+    self.assertEqual(len(m.child.trainable_variables), 0)
+    self.assertEqual(len(m.child.child.trainable_variables), 0)
+
+  def test_owned_trainable_variables(self):
+    m = RecursiveModule(3)
+    self.assertEqual(m.owned_trainable_variables, (m.w,))
+    self.assertEqual(m.child.owned_trainable_variables, (m.child.w,))
+    self.assertEqual(m.child.child.owned_trainable_variables,
+                     (m.child.child.w,))
+
+  def test_owned_trainable_variables_ignores_non_trainable(self):
+    m = RecursiveModule(3, trainable=False)
+    self.assertEqual(len(m.owned_trainable_variables), 0)
+    self.assertEqual(len(m.child.owned_trainable_variables), 0)
+    self.assertEqual(len(m.child.child.owned_trainable_variables), 0)
+
+
+class ModuleTrackingTest(test.TestCase):
+
+  def test_owned_submodules(self):
+    m = RecursiveModule(3)
+    self.assertEqual(list(m.owned_submodules), [m.child])
+    self.assertEqual(list(m.child.owned_submodules), [m.child.child])
+    self.assertEqual(list(m.child.child.owned_submodules), [])
+
+  def test_submodules(self):
+    m = RecursiveModule(3)
+    self.assertEqual(list(m.submodules), [m.child, m.child.child])
+    self.assertEqual(list(m.child.submodules), [m.child.child])
+    self.assertEqual(list(m.child.child.submodules), [])
+
+  def test_non_ctor_submodule(self):
+    m = TreeModule()
+    leaf1 = m.new_leaf()
+    self.assertEqual(set(m.submodules), {leaf1})
+    leaf2 = m.new_leaf()
+    self.assertEqual(set(m.submodules), {leaf1, leaf2})
+
+
+class CommonErrorsTest(test.TestCase):
+
+  def test_not_calling_super_constructor(self):
+    msg = ("Constructing a tf.Module without calling the super constructor is "
+           "not supported")
+    with self.assertRaisesRegexp(ValueError, msg):
+      DoesNotCallSuperConstructorModule()
+
+  def test_calls_method_before_super(self):
+    msg = "super constructor must be called before any other methods"
+    with self.assertRaisesRegexp(AttributeError, msg):
+      CallsMethodBeforeSuperConstructorModule(allowed_method=False)
+
+  def test_annotated_method_is_allowed(self):
+    self.assertIsNotNone(
+        CallsMethodBeforeSuperConstructorModule(allowed_method=True))
+
+
+def get_name_scope():
+  with ops.name_scope("x") as ns:
+    return ns[:-2]
+
+
+class ErrorModuleError(Exception):
+  pass
+
+
+class ErrorModule(module.Module):
+
+  def __init__(self, call_super, raise_in_constructor=True):
+    if call_super:
+      super(ErrorModule, self).__init__()
+    if raise_in_constructor:
+      raise ErrorModuleError("Deliberate error!")
+
+  def __call__(self):
+    raise ErrorModuleError("Deliberate error!")
+
+
+class RecursiveModule(module.Module):
+
+  def __init__(self, depth, trainable=True):
+    super(RecursiveModule, self).__init__(name="badger")
+    self.child = None
+    if depth > 1:
+      self.child = RecursiveModule(depth - 1, trainable=trainable)
+    self.w = variables.Variable(1.0, trainable=trainable, name="mushroom")
+
+
+class TreeModule(module.Module):
+
+  def __init__(self, name=None):
+    super(TreeModule, self).__init__(name=name)
+    self._leaves = []
+
+  def new_leaf(self, name=None):
+    leaf = TreeModule(name=name)
+    self._leaves.append(leaf)
+    return leaf
+
+
+class ReturnsNameScopeModule(module.Module):
+
+  def alternative_forward(self):
+    return get_name_scope()
+
+  def __call__(self):
+    return get_name_scope()
+
+
+class SubclassedReturnsNameScopeModule(ReturnsNameScopeModule):
+
+  def alternative_alternative_forward(self):
+    return get_name_scope()
+
+
+class PropertyThrowsWhenCalledModule(module.Module):
+
+  @property
+  def raise_assertion_error(self):
+    raise AssertionError
+
+
+class ModuleOverridingNameScope(ReturnsNameScopeModule):
+
+  @property
+  def name_scope(self):
+    return ops.name_scope("yolo/")
+
+
+class DoesNotCallSuperConstructorModule(module.Module):
+
+  def __init__(self):
+    # NOTE: Intentionally does not call super constructor.
+    pass
+
+
+class CallsMethodBeforeSuperConstructorModule(module.Module):
+
+  def __init__(self, allowed_method):
+    if allowed_method:
+      self.no_name_scope()
+    else:
+      self.with_name_scope()
+    super(CallsMethodBeforeSuperConstructorModule, self).__init__()
+
+  @module.Module.no_name_scope
+  def no_name_scope(self):
+    pass
+
+  def with_name_scope(self):
+    pass
+
+NamedPair = collections.namedtuple("NamedPair", ("first", "second"))
+mk_index_dict = lambda v: dict(enumerate(v))
+
+
+class WalkTest(parameterized.TestCase, test.TestCase):
+
+  @parameterized.parameters(lambda v: NamedPair(*v), list, tuple, mk_index_dict)
+  def test_walk(self, container_type):
+    parent = SimpleModule(container_type=container_type)
+    child = parent.c
+
+    self.assertEqual(
+        list(module.walk(parent, predicate=IS_MEMBER)),
+        [parent.a[0], parent.a[1], parent.z])
+
+    self.assertEqual(
+        list(module.walk(parent, recurse_if=IS_MODULE, predicate=IS_MEMBER)),
+        [parent.a[0], parent.a[1], parent.z, child.a[0], child.a[1], child.z])
+
+
+class MemberType(object):
+  """A simple type to search for."""
+  pass
+
+
+class SimpleModule(module.Module):
+
+  def __init__(self, create_child=True, container_type=list):
+    super(SimpleModule, self).__init__()
+    self.z = MemberType()
+    self.a = container_type([MemberType(), MemberType()])
+    if create_child:
+      self.c = SimpleModule(create_child=False)
+
+
+IS_MEMBER = lambda v: isinstance(v, MemberType)
+IS_MODULE = lambda v: isinstance(v, module.Module)
+
+if __name__ == "__main__":
+  v2_compat.enable_v2_behavior()
+  test.main()
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index d4e35ca77b2b903ad7da6ad2ffeea0ba43b9f5a4..287510342e1ec7468d50b3b311bcf17bf483e548 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -74,9 +74,13 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
     context_device = context.context().device_name
     if not context_device:
       context_device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    if context_device != in_device:
-      return input._copy()  # pylint: disable=protected-access
-    return input
+    if context_device == in_device:
+      return input
+    else:
+      copied = input._copy()  # pylint: disable=protected-access
+      if hasattr(copied, "_handle_data"):
+        copied._handle_data = input._handle_data  # pylint: disable=protected-access
+      return copied
   else:
     ret = gen_array_ops.identity(input, name=name)
     # Propagate handle data for happier shape inference for resource variables.
@@ -357,12 +361,14 @@ def shape_n(input, out_type=dtypes.int32, name=None):
 
 
 @tf_export("size", v1=[])
+@dispatch.add_dispatch_support
 def size_v2(input, out_type=dtypes.int32, name=None):
   # pylint: disable=redefined-builtin
   return size(input, name, out_type)
 
 
 @tf_export(v1=["size"])
+@dispatch.add_dispatch_support
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -430,6 +436,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
 
 
 @tf_export("rank")
+@dispatch.add_dispatch_support
 def rank(input, name=None):
   # pylint: disable=redefined-builtin
   """Returns the rank of a tensor.
@@ -3190,7 +3197,7 @@ def where(condition, x=None, y=None, name=None):
 
   Returns:
     A `Tensor` with the same type and shape as `x`, `y` if they are non-None.
-    A `Tensor` with shape `(num_true, dim_size(condition))`.
+    Otherwise, a `Tensor` with shape `(num_true, rank(condition))`.
 
   Raises:
     ValueError: When exactly one of `x` or `y` is non-None.
@@ -3258,8 +3265,83 @@ reverse_sequence_v2.__doc__ = deprecation.rewrite_argument_docstring(
 
 @tf_export(v1=["gather"])
 @dispatch.add_dispatch_support
-def gather(params, indices, validate_indices=None, name=None, axis=0):
+def gather(params,
+           indices,
+           validate_indices=None,
+           name=None,
+           axis=None,
+           batch_dims=0):
+  r"""Gather slices from params axis axis according to indices.
+
+  Gather slices from params axis axis according to indices.  `indices` must be
+  an integer tensor of any dimension (usually 0-D or 1-D).
+
+  For 0-D (scalar) `indices`:
+
+  > `output`$$[p_0,          ..., p_{axis-1},        \hspace{5.1em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$ =\
+  > `params`$$[p_0,          ..., p_{axis-1},        \hspace{1em}
+  >            indices,                              \hspace{1em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$.
+
+  For 1-D (vector) `indices` with `batch_dims=0`:
+
+  > `output`$$[p_0,          ..., p_{axis-1},        \hspace{2.6em}
+  >            i,                                    \hspace{2.6em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$ =\
+  > `params`$$[p_0,          ..., p_{axis-1},        \hspace{1em}
+  >            indices[i],                           \hspace{1em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$.
+
+  In the general case, produces an output tensor where:
+
+  > `output`$$[p_0,             ..., p_{axis-1},     \hspace{1.2em}
+  >            i_{batch\_dims}, ..., i_{M-1},        \hspace{1.3em}
+  >            p_{axis + 1},    ..., p_{N-1}]$$ =\
+  > `params`$$[p_0,             ..., p_{axis-1},     \hspace{1em}
+  >            indices[i_0,     ..., i_{M-1}],       \hspace{1em}
+  >            p_{axis + 1},    ..., p_{N-1}]$$.
+
+  Where $$N$$=`ndims(params)` and $$M$$=`ndims(indices)`.
+  The shape of the output tensor is:
+
+  > `output.shape = params.shape[:axis] + indices.shape[batch_dims:] +
+  > params.shape[axis + 1:]`.
+
+  Note that on CPU, if an out of bound index is found, an error is returned.
+  On GPU, if an out of bound index is found, a 0 is stored in the corresponding
+  output value.
+
+  See also `tf.gather_nd`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png"
+  alt>
+  </div>
+
+  Args:
+    params: The `Tensor` from which to gather values. Must be at least rank
+      `axis + 1`.
+    indices: The index `Tensor`.  Must be one of the following types: `int32`,
+      `int64`. Must be in range `[0, params.shape[axis])`.
+    validate_indices: Deprecated, does nothing.
+    name: A name for the operation (optional).
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
+      `axis` in `params` to gather `indices` from. Must be greater than or equal
+      to `batch_dims`.  Defaults to the first non-batch dimension. Supports
+      negative indexes.
+    batch_dims: An `integer`.  The number of batch dimensions.  Must be less
+      than `ndims(inices)`.
+
+  Returns:
+    A `Tensor`. Has the same type as `params`.
+  """
   del validate_indices
+  if axis is None:
+    axis = batch_dims
+  if batch_dims != 0:
+    with ops.name_scope(name, "Gather", [params, indices, axis]):
+      return _batch_gather(params, indices, batch_dims, axis)
   if axis != 0:
     # Note that we do a sparse_read here to avoid snapshotting the entire
     # resource variable and doing a gather, which can be inefficient and lead to
@@ -3275,41 +3357,50 @@ def gather(params, indices, validate_indices=None, name=None, axis=0):
 
 @tf_export("gather", v1=[])
 @dispatch.add_dispatch_support
-def gather_v2(params, indices, validate_indices=None, axis=0, name=None):
+def gather_v2(params, indices, validate_indices=None, axis=None,
+              batch_dims=0, name=None):
   return gather(params, indices, validate_indices=validate_indices, name=name,
-                axis=axis)
+                axis=axis, batch_dims=batch_dims)
 
 
 gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
 
 
-
-@tf_export("batch_gather")
+@tf_export(v1=["batch_gather"])
 @dispatch.add_dispatch_support
+@deprecation.deprecated(
+    "2017-10-25", "`tf.batch_gather` is deprecated, please use `tf.gather` "
+    "with `batch_dims=-1` instead.")  # pylint: disable=missing-docstring
 def batch_gather(params, indices, name=None):
-  """Gather slices from `params` according to `indices` with leading batch dims.
-
-  This operation assumes that the leading dimensions of `indices` are dense,
-  and the gathers on the axis corresponding to the last dimension of `indices`.
-  More concretely it computes:
-
-  result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]]
+  """Gather slices from params according to indices with leading batch dims."""
+  with ops.name_scope(name, "BatchGather", [params, indices]):
+    indices = ops.convert_to_tensor(indices, name="indices")
+    params = ops.convert_to_tensor(params, name="params")
+    if indices.shape.ndims is None:
+      raise ValueError(
+          "batch_gather does not allow indices with unknown shape.")
+    return _batch_gather(params, indices, batch_dims=indices.shape.ndims - 1)
 
-  Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM],
-  `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be
-  a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`.
 
-  In the case in which indices is a 1D tensor, this operation is equivalent to
-  `tf.gather`.
+def _batch_gather(params, indices, batch_dims, axis=None):
+  r"""Gather slices from params according to indices with leading batch dims.
 
-  See also `tf.gather` and `tf.gather_nd`.
+  This operation assumes that the leading `batch_dims` dimensions of `indices`
+  and `params` are batch dimensions; and performs a `tf.gather` operation within
+  each batch. (If `batch_dims` is not specified, then it defaults to
+  `ndims(indices) - 1`.)  In the case in which `batch_dims==0`, this operation
+  is equivalent to `tf.gather`.
 
   Args:
     params: A Tensor. The tensor from which to gather values.
     indices: A Tensor. Must be one of the following types: int32, int64. Index
-        tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the
-        last dimension of `indices` itself.
-    name: A name for the operation (optional).
+      tensor. Must be in range `[0, params.shape[batch_dims]]`.
+    batch_dims: An integer.  The number of batch dimensions.  Must be less than
+      ndims(inices).  Defaults to `ndims(indices) - 1` if not specified.
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
+      `axis` in `params` to gather `indices` from. Must be greater than or equal
+      to `batch_dims`.  Defaults to the first non-batch dimension. Supports
+      negative indexes.
 
   Returns:
     A Tensor. Has the same type as `params`.
@@ -3317,48 +3408,100 @@ def batch_gather(params, indices, name=None):
   Raises:
     ValueError: if `indices` has an unknown shape.
   """
-
-  with ops.name_scope(name):
-    indices = ops.convert_to_tensor(indices, name="indices")
-    params = ops.convert_to_tensor(params, name="params")
-    indices_shape = shape(indices)
-    params_shape = shape(params)
-
-    ndims = indices.shape.ndims
-    if ndims is None:
-      raise ValueError("batch_gather does not allow indices with unknown "
-                       "shape.")
-    batch_indices = indices
-    indices_dtype = indices.dtype.base_dtype
-    accum_dim_value = ones((), dtype=indices_dtype)
-    # Use correct type for offset index computation
-    casted_params_shape = gen_math_ops.cast(params_shape, indices_dtype)
-    for dim in range(ndims-1, 0, -1):
-      dim_value = casted_params_shape[dim-1]
-      accum_dim_value *= casted_params_shape[dim]
-      start = zeros((), dtype=indices_dtype)
-      step = ones((), dtype=indices_dtype)
-      dim_indices = gen_math_ops._range(start, dim_value, step)
-      dim_indices *= accum_dim_value
-      dim_shape = stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim),
-                        axis=0)
-      batch_indices += reshape(dim_indices, dim_shape)
-
-    flat_indices = reshape(batch_indices, [-1])
-    outer_shape = params_shape[ndims:]
-    flat_inner_shape = gen_math_ops.prod(
-        params_shape[:ndims], [0], False)
-
-    flat_params = reshape(
-        params, concat([[flat_inner_shape], outer_shape], axis=0))
-    flat_result = gather(flat_params, flat_indices)
-    result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
-    final_shape = indices.get_shape()[:ndims-1].merge_with(
-        params.get_shape()[:ndims -1])
-    final_shape = final_shape.concatenate(indices.get_shape().dims[ndims-1])
-    final_shape = final_shape.concatenate(params.get_shape()[ndims:])
-    result.set_shape(final_shape)
-    return result
+  if batch_dims is not None and not isinstance(batch_dims, int):
+    raise TypeError("batch_dims must be an int; got %r" % batch_dims)
+  indices = ops.convert_to_tensor(indices, name="indices")
+  params = ops.convert_to_tensor(params, name="params")
+
+  indices_ndims = indices.shape.ndims
+  if indices_ndims is None:
+    raise ValueError("tf.gather does not allow indices with unknown "
+                     "rank when batch_dims is specified.")
+  if batch_dims is None:
+    batch_dims = indices_ndims - 1
+  if batch_dims < 0:
+    batch_dims += indices_ndims
+  if batch_dims < 0 or batch_dims >= indices_ndims:
+    raise ValueError("batch_dims = %d must be less than ndims(indices) = %d" %
+                     (batch_dims, indices_ndims))
+  if params.shape.ndims is not None and batch_dims >= params.shape.ndims:
+    raise ValueError("batch_dims = %d must be less than ndims(params) = %d" %
+                     (batch_dims, params.shape.ndims))
+
+  # Handle axis by transposing the axis dimension to be the first non-batch
+  # dimension, recursively calling batch_gather with axis=0, and then
+  # transposing the result to put the pre-axis dimensions before the indices
+  # dimensions.
+  if axis is not None and axis != batch_dims:
+    # Adjust axis to be positive.
+    if not isinstance(axis, int):
+      axis = tf.where(axis < 0, axis + array_ops.rank(params), axis)
+    elif axis < 0 and params.shape.ndims is None:
+      axis = axis + array_ops.rank(params)
+    else:
+      if (axis < -params.shape.ndims) or (axis >= params.shape.ndims):
+        raise ValueError("axis (%d) out of range [%d, %d)" %
+                         (axis, -params.shape.ndims, params.shape.ndims))
+      if axis < 0:
+        axis += params.shape.ndims
+      if axis < batch_dims:
+        raise ValueError("batch_dims = %d must be less than or equal to "
+                         "axis = %d" % (batch_dims, axis))
+
+    # Move params[axis] up to params[batch_dims].
+    perm = [
+        list(range(batch_dims)), [axis],
+        gen_math_ops._range(batch_dims, axis, 1),
+        gen_math_ops._range(axis + 1, rank(params), 1)
+    ]
+    params = transpose(params, concat(perm, axis=0))
+
+    result = _batch_gather(params, indices, batch_dims=batch_dims)
+
+    # Move the result dimensions corresponding to params[batch_dims:axis]
+    # to just before the dimensions corresponding to indices[batch_dims:].
+    params_start = indices_ndims + axis - batch_dims
+    perm = [
+        list(range(batch_dims)),
+        gen_math_ops._range(indices_ndims, params_start, 1),
+        list(range(batch_dims, indices_ndims)),
+        gen_math_ops._range(params_start, rank(result), 1)
+    ]
+    return transpose(result, perm=concat(perm, axis=0))
+
+  indices_shape = shape(indices)
+  params_shape = shape(params)
+  batch_indices = indices
+  indices_dtype = indices.dtype.base_dtype
+  accum_dim_value = ones((), dtype=indices_dtype)
+  # Use correct type for offset index computation
+  casted_params_shape = gen_math_ops.cast(params_shape, indices_dtype)
+  for dim in range(batch_dims, 0, -1):
+    dim_value = casted_params_shape[dim - 1]
+    accum_dim_value *= casted_params_shape[dim]
+    start = zeros((), dtype=indices_dtype)
+    step = ones((), dtype=indices_dtype)
+    dim_indices = gen_math_ops._range(start, dim_value, step)
+    dim_indices *= accum_dim_value
+    dim_shape = stack(
+        [1] * (dim - 1) + [dim_value] + [1] * (indices_ndims - dim), axis=0)
+    batch_indices += reshape(dim_indices, dim_shape)
+
+  flat_indices = reshape(batch_indices, [-1])
+  outer_shape = params_shape[batch_dims + 1:]
+  flat_inner_shape = gen_math_ops.prod(params_shape[:batch_dims + 1], [0],
+                                       False)
+
+  flat_params = reshape(params, concat([[flat_inner_shape], outer_shape],
+                                       axis=0))
+  flat_result = gather(flat_params, flat_indices)
+  result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
+  final_shape = indices.get_shape()[:batch_dims].merge_with(
+      params.get_shape()[:batch_dims])
+  final_shape = final_shape.concatenate(indices.get_shape().dims[batch_dims:])
+  final_shape = final_shape.concatenate(params.get_shape()[batch_dims + 1:])
+  result.set_shape(final_shape)
+  return result
 
 
 # Define quantize_v2 here in order to make name the second-to-last attribute,
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index d154b6759bfbc50ad2e5ea34e4f04b945ef2d397..c182874c7f2d77b317f42a0cbfadb7435534f747 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -35,6 +35,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
     super(BitwiseOpTest, self).__init__(method_name)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -72,6 +73,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(truth, popcnt_result)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -97,6 +99,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(inverted, expected)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testShiftsWithPositiveLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64,
                   np.uint8, np.uint16, np.uint32, np.uint64]
diff --git a/tensorflow/python/ops/clustering_ops.py b/tensorflow/python/ops/clustering_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d48b89cbacce34781819010addbcbd0ba66f9873
--- /dev/null
+++ b/tensorflow/python/ops/clustering_ops.py
@@ -0,0 +1,770 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Clustering Operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_clustering_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.embedding_ops import embedding_lookup
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_clustering_ops import *
+# pylint: enable=wildcard-import
+
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
+SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
+# Cosine distance between vectors U and V is defined as
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
+COSINE_DISTANCE = 'cosine'
+
+RANDOM_INIT = 'random'
+KMEANS_PLUS_PLUS_INIT = 'kmeans_plus_plus'
+KMC2_INIT = 'kmc2'
+
+# The name of the variable holding the cluster centers. Used by the Estimator.
+CLUSTERS_VAR_NAME = 'clusters'
+
+
+class KMeans(object):
+  """Creates the graph for k-means clustering."""
+
+  def __init__(self,
+               inputs,
+               num_clusters,
+               initial_clusters=RANDOM_INIT,
+               distance_metric=SQUARED_EUCLIDEAN_DISTANCE,
+               use_mini_batch=False,
+               mini_batch_steps_per_iteration=1,
+               random_seed=0,
+               kmeans_plus_plus_num_retries=2,
+               kmc2_chain_length=200):
+    """Creates an object for generating KMeans clustering graph.
+
+    This class implements the following variants of K-means algorithm:
+
+    If use_mini_batch is False, it runs standard full batch K-means. Each step
+    runs a single iteration of K-Means. This step can be run sharded across
+    multiple workers by passing a list of sharded inputs to this class. Note
+    however that a single step needs to process the full input at once.
+
+    If use_mini_batch is True, it runs a generalization of the mini-batch
+    K-means algorithm. It runs multiple iterations, where each iteration is
+    composed of mini_batch_steps_per_iteration steps. Two copies of cluster
+    centers are maintained: one that is updated at the end of each iteration,
+    and one that is updated every step. The first copy is used to compute
+    cluster allocations for each step, and for inference, while the second copy
+    is the one updated each step using the mini-batch update rule. After each
+    iteration is complete, this second copy is copied back the first copy.
+
+    Note that for use_mini_batch=True, when mini_batch_steps_per_iteration=1,
+    the algorithm reduces to the standard mini-batch algorithm. Also by setting
+    mini_batch_steps_per_iteration = num_inputs / batch_size, the algorithm
+    becomes an asynchronous version of the full-batch algorithm. Note however
+    that there is no guarantee by this implementation that each input is seen
+    exactly once per iteration. Also, different updates are applied
+    asynchronously without locking. So this asynchronous version may not behave
+    exactly like a full-batch version.
+
+    Args:
+      inputs: An input tensor or list of input tensors. It is assumed that the
+        data points have been previously randomly permuted.
+      num_clusters: An integer tensor specifying the number of clusters. This
+        argument is ignored if initial_clusters is a tensor or numpy array.
+      initial_clusters: Specifies the clusters used during initialization. One
+        of the following:
+        - a tensor or numpy array with the initial cluster centers.
+        - a function f(inputs, k) that returns up to k centers from `inputs`.
+        - "random": Choose centers randomly from `inputs`.
+        - "kmeans_plus_plus": Use kmeans++ to choose centers from `inputs`.
+        - "kmc2": Use the fast k-MC2 algorithm to choose centers from `inputs`.
+        In the last three cases, one batch of `inputs` may not yield
+        `num_clusters` centers, in which case initialization will require
+        multiple batches until enough centers are chosen. In the case of
+        "random" or "kmeans_plus_plus", if the input size is <= `num_clusters`
+        then the entire batch is chosen to be cluster centers.
+      distance_metric: Distance metric used for clustering. Supported options:
+        "squared_euclidean", "cosine".
+      use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
+        full batch.
+      mini_batch_steps_per_iteration: Number of steps after which the updated
+        cluster centers are synced back to a master copy.
+      random_seed: Seed for PRNG used to initialize seeds.
+      kmeans_plus_plus_num_retries: For each point that is sampled during
+        kmeans++ initialization, this parameter specifies the number of
+        additional points to draw from the current distribution before selecting
+        the best. If a negative value is specified, a heuristic is used to
+        sample O(log(num_to_sample)) additional points.
+      kmc2_chain_length: Determines how many candidate points are used by the
+        k-MC2 algorithm to produce one new cluster centers. If a (mini-)batch
+        contains less points, one new cluster center is generated from the
+        (mini-)batch.
+
+    Raises:
+      ValueError: An invalid argument was passed to initial_clusters or
+        distance_metric.
+    """
+    if isinstance(initial_clusters, str) and initial_clusters not in [
+        RANDOM_INIT, KMEANS_PLUS_PLUS_INIT, KMC2_INIT
+    ]:
+      raise ValueError(
+          "Unsupported initialization algorithm '%s'" % initial_clusters)
+    if distance_metric not in [SQUARED_EUCLIDEAN_DISTANCE, COSINE_DISTANCE]:
+      raise ValueError("Unsupported distance metric '%s'" % distance_metric)
+    self._inputs = inputs if isinstance(inputs, list) else [inputs]
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._use_mini_batch = use_mini_batch
+    self._mini_batch_steps_per_iteration = int(mini_batch_steps_per_iteration)
+    self._random_seed = random_seed
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._kmc2_chain_length = kmc2_chain_length
+
+  @classmethod
+  def _distance_graph(cls, inputs, clusters, distance_metric):
+    """Computes distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: cluster Tensor.
+      distance_metric: distance metric used for clustering
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inputs.
+      The value is the distance of each row to all the cluster centers.
+      Currently only Euclidean distance and cosine distance are supported.
+    """
+    assert isinstance(inputs, list)
+    if distance_metric == SQUARED_EUCLIDEAN_DISTANCE:
+      return cls._compute_euclidean_distance(inputs, clusters)
+    elif distance_metric == COSINE_DISTANCE:
+      return cls._compute_cosine_distance(
+          inputs, clusters, inputs_normalized=True)
+    else:
+      assert False, str(distance_metric)
+
+  @classmethod
+  def _compute_euclidean_distance(cls, inputs, clusters):
+    """Computes Euclidean distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: cluster Tensor.
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inputs.
+      The value is the distance of each row to all the cluster centers.
+    """
+    output = []
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        # Computes Euclidean distance. Note the first and third terms are
+        # broadcast additions.
+        squared_distance = (
+            math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
+            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
+            array_ops.transpose(
+                math_ops.reduce_sum(
+                    math_ops.square(clusters), 1, keepdims=True)))
+        output.append(squared_distance)
+
+    return output
+
+  @classmethod
+  def _compute_cosine_distance(cls, inputs, clusters, inputs_normalized=True):
+    """Computes cosine distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensor.
+      clusters: cluster Tensor
+      inputs_normalized: if True, it assumes that inp and clusters are
+      normalized and computes the dot product which is equivalent to the cosine
+      distance. Else it L2 normalizes the inputs first.
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inp.
+      The value is the distance of each row to all the cluster centers.
+    """
+    output = []
+    if not inputs_normalized:
+      with ops.colocate_with(clusters, ignore_existing=True):
+        clusters = nn_impl.l2_normalize(clusters, dim=1)
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        if not inputs_normalized:
+          inp = nn_impl.l2_normalize(inp, dim=1)
+        output.append(1 - math_ops.matmul(inp, clusters, transpose_b=True))
+    return output
+
+  def _infer_graph(self, inputs, clusters):
+    """Maps input to closest cluster and the score.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: Tensor of cluster centers.
+
+    Returns:
+      List of tuple, where each value in tuple corresponds to a value in inp.
+      The tuple has following three elements:
+      all_scores: distance of each input to each cluster center.
+      score: distance of each input to closest cluster center.
+      cluster_idx: index of cluster center closest to the corresponding input.
+    """
+    assert isinstance(inputs, list)
+    # Pairwise distances are used only by transform(). In all other cases, this
+    # sub-graph is not evaluated.
+    scores = self._distance_graph(inputs, clusters, self._distance_metric)
+    output = []
+    if (self._distance_metric == COSINE_DISTANCE and
+        not self._clusters_l2_normalized()):
+      # The cosine distance between normalized vectors x and y is the same as
+      # 2 * squared_euclidean_distance. We are using this fact and reusing the
+      # nearest_neighbors op.
+      # TODO(ands): Support COSINE distance in nearest_neighbors and remove
+      # this.
+      with ops.colocate_with(clusters, ignore_existing=True):
+        clusters = nn_impl.l2_normalize(clusters, dim=1)
+    for inp, score in zip(inputs, scores):
+      with ops.colocate_with(inp, ignore_existing=True):
+        (indices, distances) = gen_clustering_ops.nearest_neighbors(
+            inp, clusters, 1)
+        if self._distance_metric == COSINE_DISTANCE:
+          distances *= 0.5
+        output.append((score, array_ops.squeeze(distances, [-1]),
+                       array_ops.squeeze(indices, [-1])))
+    return zip(*output)
+
+  def _clusters_l2_normalized(self):
+    """Returns True if clusters centers are kept normalized."""
+    return (self._distance_metric == COSINE_DISTANCE and
+            (not self._use_mini_batch or
+             self._mini_batch_steps_per_iteration > 1))
+
+  def _create_variables(self, num_clusters):
+    """Creates variables.
+
+    Args:
+      num_clusters: an integer Tensor providing the number of clusters.
+
+    Returns:
+      Tuple with following elements:
+      - cluster_centers: a Tensor for storing cluster centers
+      - cluster_centers_initialized: bool Variable indicating whether clusters
+            are initialized.
+      - cluster_counts: a Tensor for storing counts of points assigned to this
+            cluster. This is used by mini-batch training.
+      - cluster_centers_updated: Tensor representing copy of cluster centers
+            that are updated every step.
+      - update_in_steps: numbers of steps left before we sync
+            cluster_centers_updated back to cluster_centers.
+    """
+    init_value = array_ops.constant([], dtype=dtypes.float32)
+    cluster_centers = variable_scope.variable(
+        init_value, name=CLUSTERS_VAR_NAME, validate_shape=False)
+    cluster_centers_initialized = variable_scope.variable(
+        False, dtype=dtypes.bool, name='initialized')
+
+    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
+      # Copy of cluster centers actively updated each step according to
+      # mini-batch update rule.
+      cluster_centers_updated = variable_scope.variable(
+          init_value, name='clusters_updated', validate_shape=False)
+      # How many steps till we copy the updated clusters to cluster_centers.
+      update_in_steps = variable_scope.variable(
+          self._mini_batch_steps_per_iteration,
+          dtype=dtypes.int64,
+          name='update_in_steps')
+      # Count of points assigned to cluster_centers_updated.
+      cluster_counts = variable_scope.variable(
+          array_ops.zeros([num_clusters], dtype=dtypes.int64))
+    else:
+      cluster_centers_updated = cluster_centers
+      update_in_steps = None
+      cluster_counts = (
+          variable_scope.variable(
+              array_ops.ones([num_clusters], dtype=dtypes.int64))
+          if self._use_mini_batch else None)
+    return (cluster_centers, cluster_centers_initialized, cluster_counts,
+            cluster_centers_updated, update_in_steps)
+
+  @classmethod
+  def _l2_normalize_data(cls, inputs):
+    """Normalized the input data."""
+    output = []
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        output.append(nn_impl.l2_normalize(inp, dim=1))
+    return output
+
+  def training_graph(self):
+    """Generate a training graph for kmeans algorithm.
+
+    This returns, among other things, an op that chooses initial centers
+    (init_op), a boolean variable that is set to True when the initial centers
+    are chosen (cluster_centers_initialized), and an op to perform either an
+    entire Lloyd iteration or a mini-batch of a Lloyd iteration (training_op).
+    The caller should use these components as follows. A single worker should
+    execute init_op multiple times until cluster_centers_initialized becomes
+    True. Then multiple workers may execute training_op any number of times.
+
+    Returns:
+      A tuple consisting of:
+      all_scores: A matrix (or list of matrices) of dimensions (num_input,
+        num_clusters) where the value is the distance of an input vector and a
+        cluster center.
+      cluster_idx: A vector (or list of vectors). Each element in the vector
+        corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      scores: Similar to cluster_idx but specifies the distance to the
+        assigned cluster instead.
+      cluster_centers_initialized: scalar indicating whether clusters have been
+        initialized.
+      init_op: an op to initialize the clusters.
+      training_op: an op that runs an iteration of training.
+    """
+    # Implementation of kmeans.
+    if (isinstance(self._initial_clusters, str) or
+        callable(self._initial_clusters)):
+      initial_clusters = self._initial_clusters
+      num_clusters = ops.convert_to_tensor(self._num_clusters)
+    else:
+      initial_clusters = ops.convert_to_tensor(self._initial_clusters)
+      num_clusters = array_ops.shape(initial_clusters)[0]
+
+    inputs = self._inputs
+    (cluster_centers_var, cluster_centers_initialized, total_counts,
+     cluster_centers_updated,
+     update_in_steps) = self._create_variables(num_clusters)
+    init_op = _InitializeClustersOpFactory(
+        self._inputs, num_clusters, initial_clusters, self._distance_metric,
+        self._random_seed, self._kmeans_plus_plus_num_retries,
+        self._kmc2_chain_length, cluster_centers_var, cluster_centers_updated,
+        cluster_centers_initialized).op()
+    cluster_centers = cluster_centers_var
+
+    if self._distance_metric == COSINE_DISTANCE:
+      inputs = self._l2_normalize_data(inputs)
+      if not self._clusters_l2_normalized():
+        cluster_centers = nn_impl.l2_normalize(cluster_centers, dim=1)
+
+    all_scores, scores, cluster_idx = self._infer_graph(inputs, cluster_centers)
+    if self._use_mini_batch:
+      sync_updates_op = self._mini_batch_sync_updates_op(
+          update_in_steps, cluster_centers_var, cluster_centers_updated,
+          total_counts)
+      assert sync_updates_op is not None
+      with ops.control_dependencies([sync_updates_op]):
+        training_op = self._mini_batch_training_op(
+            inputs, cluster_idx, cluster_centers_updated, total_counts)
+    else:
+      assert cluster_centers == cluster_centers_var
+      training_op = self._full_batch_training_op(
+          inputs, num_clusters, cluster_idx, cluster_centers_var)
+
+    return (all_scores, cluster_idx, scores, cluster_centers_initialized,
+            init_op, training_op)
+
+  def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
+                                  cluster_centers_updated, total_counts):
+    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
+      assert update_in_steps is not None
+      with ops.colocate_with(update_in_steps, ignore_existing=True):
+
+        def _f():
+          # Note that there is a race condition here, so we do a best effort
+          # updates here. We reset update_in_steps first so that other workers
+          # don't duplicate the updates. Also we update cluster_center_vars
+          # before resetting total_counts to avoid large updates to
+          # cluster_centers_updated based on partially updated
+          # cluster_center_vars.
+          with ops.control_dependencies([
+              state_ops.assign(update_in_steps,
+                               self._mini_batch_steps_per_iteration - 1)
+          ]):
+            with ops.colocate_with(
+                cluster_centers_updated, ignore_existing=True):
+              if self._distance_metric == COSINE_DISTANCE:
+                cluster_centers = nn_impl.l2_normalize(
+                    cluster_centers_updated, dim=1)
+              else:
+                cluster_centers = cluster_centers_updated
+            with ops.colocate_with(cluster_centers_var, ignore_existing=True):
+              with ops.control_dependencies(
+                  [state_ops.assign(cluster_centers_var, cluster_centers)]):
+                with ops.colocate_with(None, ignore_existing=True):
+                  with ops.control_dependencies([
+                      state_ops.assign(total_counts,
+                                       array_ops.zeros_like(total_counts))
+                  ]):
+                    return array_ops.identity(update_in_steps)
+
+        return control_flow_ops.cond(
+            update_in_steps <= 0, _f,
+            lambda: state_ops.assign_sub(update_in_steps, 1))
+    else:
+      return control_flow_ops.no_op()
+
+  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
+                              total_counts):
+    """Creates an op for training for mini batch case.
+
+    Args:
+      inputs: list of input Tensors.
+      cluster_idx_list: A vector (or list of vectors). Each element in the
+        vector corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      cluster_centers: Tensor Ref of cluster centers.
+      total_counts: Tensor Ref of cluster counts.
+
+    Returns:
+      An op for doing an update of mini-batch k-means.
+    """
+    update_ops = []
+    for inp, cluster_idx in zip(inputs, cluster_idx_list):
+      with ops.colocate_with(inp, ignore_existing=True):
+        assert total_counts is not None
+        cluster_idx = array_ops.reshape(cluster_idx, [-1])
+        # Dedupe the unique ids of cluster_centers being updated so that updates
+        # can be locally aggregated.
+        unique_ids, unique_idx = array_ops.unique(cluster_idx)
+        num_unique_cluster_idx = array_ops.size(unique_ids)
+        # Fetch the old values of counts and cluster_centers.
+        with ops.colocate_with(total_counts, ignore_existing=True):
+          old_counts = array_ops.gather(total_counts, unique_ids)
+        # TODO(agarwal): This colocation seems to run into problems. Fix it.
+        with ops.colocate_with(cluster_centers, ignore_existing=True):
+          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
+        # Locally aggregate the increment to counts.
+        count_updates = math_ops.unsorted_segment_sum(
+            array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
+            unique_idx, num_unique_cluster_idx)
+        # Locally compute the sum of inputs mapped to each id.
+        # For a cluster with old cluster value x, old count n, and with data
+        # d_1,...d_k newly assigned to it, we recompute the new value as
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
+        cluster_center_updates = math_ops.unsorted_segment_sum(
+            inp, unique_idx, num_unique_cluster_idx)
+        # Shape to enable broadcasting count_updates and learning_rate to inp.
+        # It extends the shape with 1's to match the rank of inp.
+        broadcast_shape = array_ops.concat([
+            array_ops.reshape(num_unique_cluster_idx, [1]),
+            array_ops.ones(
+                array_ops.reshape(array_ops.rank(inp) - 1, [1]),
+                dtype=dtypes.int32)
+        ], 0)
+        # Subtract k * x, see comment above.
+        cluster_center_updates -= math_ops.cast(
+            array_ops.reshape(count_updates, broadcast_shape),
+            inp.dtype) * old_cluster_centers
+        learning_rate = math_ops.reciprocal(
+            math_ops.cast(old_counts + count_updates, inp.dtype))
+        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
+        # scale by 1 / (n + k), see comment above.
+        cluster_center_updates *= learning_rate
+        # Apply the updates.
+      update_counts = state_ops.scatter_add(total_counts, unique_ids,
+                                            count_updates)
+      update_cluster_centers = state_ops.scatter_add(
+          cluster_centers, unique_ids, cluster_center_updates)
+      update_ops.extend([update_counts, update_cluster_centers])
+    return control_flow_ops.group(*update_ops)
+
+  def _full_batch_training_op(self, inputs, num_clusters, cluster_idx_list,
+                              cluster_centers):
+    """Creates an op for training for full batch case.
+
+    Args:
+      inputs: list of input Tensors.
+      num_clusters: an integer Tensor providing the number of clusters.
+      cluster_idx_list: A vector (or list of vectors). Each element in the
+        vector corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      cluster_centers: Tensor Ref of cluster centers.
+
+    Returns:
+      An op for doing an update of mini-batch k-means.
+    """
+    cluster_sums = []
+    cluster_counts = []
+    epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
+    for inp, cluster_idx in zip(inputs, cluster_idx_list):
+      with ops.colocate_with(inp, ignore_existing=True):
+        cluster_sums.append(
+            math_ops.unsorted_segment_sum(inp, cluster_idx, num_clusters))
+        cluster_counts.append(
+            math_ops.unsorted_segment_sum(
+                array_ops.reshape(
+                    array_ops.ones(
+                        array_ops.reshape(array_ops.shape(inp)[0], [-1])),
+                    [-1, 1]), cluster_idx, num_clusters))
+    with ops.colocate_with(cluster_centers, ignore_existing=True):
+      new_clusters_centers = math_ops.add_n(cluster_sums) / (
+          math_ops.cast(math_ops.add_n(cluster_counts), cluster_sums[0].dtype) +
+          epsilon)
+      if self._clusters_l2_normalized():
+        new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1)
+    return state_ops.assign(cluster_centers, new_clusters_centers)
+
+
+class _InitializeClustersOpFactory(object):
+  """Internal class to create the op to initialize the clusters.
+
+    The op performs this algorithm (see constructor args):
+
+    num_remaining = num_clusters - length(cluster_centers)
+    if num_remaining == 0:
+      assert that cluster_centers_initialized is true
+    else:
+      assert that num_remaining > 0
+      new_centers = choose up to num_remaining initial centers
+      l2-normalize new_centers if using cosine distance
+      all_centers = concat(cluster_centers, new_centers)
+      cluster_centers := all_centers
+      if there is a cluster_centers_updated variable:
+        cluster_centers_updated := cluster_centers
+      num_now_remaining = num_clusters - length(cluster_centers)
+      if num_now_remaining == 0:
+        cluster_centers_initialized := true
+  """
+
+  # TODO(ccolby): Refactor this class so that kmc2 isn't so much a special case.
+
+  def __init__(self, inputs, num_clusters, initial_clusters, distance_metric,
+               random_seed, kmeans_plus_plus_num_retries, kmc2_chain_length,
+               cluster_centers, cluster_centers_updated,
+               cluster_centers_initialized):
+    """Creates an op factory.
+
+    Args:
+      inputs: See KMeans constructor.
+      num_clusters: An integer Tensor providing the number of clusters.
+      initial_clusters: See KMeans constructor.
+      distance_metric: See KMeans constructor.
+      random_seed: See KMeans constructor.
+      kmeans_plus_plus_num_retries: See KMeans constructor.
+      kmc2_chain_length: See KMeans constructor.
+      cluster_centers: The TF variable holding the initial centers. It may
+          already contain some centers when the op is executed.
+      cluster_centers_updated: A second TF variable to hold a copy of the
+          initial centers, used for full-batch mode. In mini-batch mode,
+          cluster_centers_updated is the same variable as cluster_centers.
+      cluster_centers_initialized: A boolean TF variable that will be set
+          to true when all the initial centers have been chosen.
+    """
+    # All of these instance variables are constants.
+    self._inputs = inputs
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._random_seed = random_seed
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._kmc2_chain_length = kmc2_chain_length
+    self._cluster_centers = cluster_centers
+    self._cluster_centers_updated = cluster_centers_updated
+    self._cluster_centers_initialized = cluster_centers_initialized
+
+    self._num_selected = array_ops.shape(self._cluster_centers)[0]
+    self._num_remaining = self._num_clusters - self._num_selected
+    self._num_data = math_ops.add_n(
+        [array_ops.shape(i)[0] for i in self._inputs])
+
+  def _random(self):
+    indices = random_ops.random_uniform(
+        array_ops.reshape(self._num_remaining, [-1]),
+        minval=0,
+        maxval=math_ops.cast(self._num_data, dtypes.int64),
+        seed=self._random_seed,
+        dtype=dtypes.int64)
+    return embedding_lookup(self._inputs, indices, partition_strategy='div')
+
+  def _kmeans_plus_plus(self):
+    # Points from only the first shard are used for initializing centers.
+    # TODO(ands): Use all points.
+    inp = self._inputs[0]
+    if self._distance_metric == COSINE_DISTANCE:
+      inp = nn_impl.l2_normalize(inp, dim=1)
+    return gen_clustering_ops.kmeans_plus_plus_initialization(
+        inp,
+        math_ops.to_int64(self._num_remaining), self._random_seed,
+        self._kmeans_plus_plus_num_retries)
+
+  def _kmc2_multiple_centers(self):
+    """Adds new initial cluster centers using the k-MC2 algorithm.
+
+    In each call to the op, the provided batch is split into subsets based on
+    the specified `kmc2_chain_length`. On each subset, a single Markov chain of
+    the k-MC2 algorithm is used to add *one* new center cluster center. If there
+    are less than `kmc2_chain_length` points in the subset, a single center is
+    added using one Markov chain on the full input. It is assumed that the
+    provided batch has previously been randomly permuted. Otherwise, k-MC2 may
+    return suboptimal centers.
+
+    Returns:
+      An op that adds new cluster centers.
+    """
+    # The op only operates on the first shard of data.
+    first_shard = self._inputs[0]
+    # Number of points in the input that can be used.
+    batch_size = array_ops.shape(first_shard)[0]
+    # Maximum number of subsets such that the size of each subset is at least
+    # `kmc2_chain_length`. Final subsets may be larger.
+    max_to_sample = math_ops.cast(
+        batch_size / self._kmc2_chain_length, dtype=dtypes.int32)
+    # We sample at least one new center and at most all remaining centers.
+    num_to_sample = math_ops.maximum(
+        math_ops.minimum(self._num_remaining, max_to_sample), 1)
+
+    def _cond(i, _):
+      """Stopping condition for the while loop."""
+      return math_ops.less(i, num_to_sample)
+
+    def _body(i, _):
+      """Body that adds a single new center based on a subset."""
+
+      def _sample_random():
+        """Returns a random point as a cluster center."""
+        # By assumption the batch is reshuffled and _sample_random is always
+        # called for i=0. Hence, we simply return the first point.
+        new_center = array_ops.reshape(first_shard[0], [1, -1])
+        if self._distance_metric == COSINE_DISTANCE:
+          new_center = nn_impl.l2_normalize(new_center, dim=1)
+        return new_center
+
+      def _sample_kmc2_chain():
+        """Returns previous centers as well as a new center sampled using k-MC2.
+        """
+        # Extract the subset from the underlying batch.
+        start = i * self._kmc2_chain_length
+        end = start + self._kmc2_chain_length
+        subset = first_shard[start:end]
+        # Compute the distances from points in the subset to previous centers.
+        _, distances = gen_clustering_ops.nearest_neighbors(
+            subset, self._cluster_centers, 1)
+        # Sample index of new center using k-MC2 Markov chain.
+        new_center_index = gen_clustering_ops.kmc2_chain_initialization(
+            array_ops.squeeze(distances), self._random_seed)
+        # Extract actual new center.
+        newly_sampled_center = array_ops.reshape(subset[new_center_index],
+                                                 [1, -1])
+        # Return concatenation with previously sampled centers.
+        if self._distance_metric == COSINE_DISTANCE:
+          newly_sampled_center = nn_impl.l2_normalize(
+              newly_sampled_center, dim=1)
+        return array_ops.concat([self._cluster_centers, newly_sampled_center],
+                                0)
+
+      # Obtain a random point if there are no previously sampled centers.
+      # Otherwise, construct a k-MC2 Markov chain.
+      new_centers = control_flow_ops.cond(
+          math_ops.equal(self._num_selected, 0), _sample_random,
+          _sample_kmc2_chain)
+      # Assign new cluster centers to underlying variable.
+      assigned_centers = state_ops.assign(
+          self._cluster_centers, new_centers, validate_shape=False)
+      if self._cluster_centers_updated is not self._cluster_centers:
+        assigned_centers = state_ops.assign(
+            self._cluster_centers_updated,
+            assigned_centers,
+            validate_shape=False)
+      return i + 1, self._num_clusters - array_ops.shape(assigned_centers)[0]
+
+    # Add num_to_sample new data points.
+    _, num_remaining = control_flow_ops.while_loop(_cond, _body, [0, 0])
+    return num_remaining
+
+  def _greedy_batch_sampler(self, sampler):
+    # If the input dataset size is smaller than the number of centers
+    # remaining, choose the entire input dataset as centers. This can happen
+    # with mini-batch. Otherwise, sample the batch according to the provided
+    # sampler.
+    return control_flow_ops.cond(self._num_data <= self._num_remaining,
+                                 lambda: array_ops.concat(self._inputs, 0),
+                                 sampler)
+
+  def _single_batch_sampler(self, sampler):
+    # Enforce that there are at least as many data points as centers
+    # remaining. This gives the provided sampler the chance to select all
+    # remaining centers from a single batch.
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(self._num_data, self._num_remaining)]):
+      return sampler()
+
+  def _choose_initial_centers(self):
+    if isinstance(self._initial_clusters, str):
+      if self._initial_clusters == RANDOM_INIT:
+        return self._greedy_batch_sampler(self._random)
+      else:  # self._initial_clusters == KMEANS_PLUS_PLUS_INIT
+        return self._single_batch_sampler(self._kmeans_plus_plus)
+    elif callable(self._initial_clusters):
+      return self._initial_clusters(self._inputs, self._num_remaining)
+    else:
+      with ops.control_dependencies([
+          check_ops.assert_equal(self._num_remaining,
+                                 array_ops.shape(self._initial_clusters)[0])
+      ]):
+        return self._initial_clusters
+
+  def _add_new_centers(self):
+    """Adds some centers and returns the number of centers remaining."""
+    new_centers = self._choose_initial_centers()
+    if self._distance_metric == COSINE_DISTANCE:
+      new_centers = nn_impl.l2_normalize(new_centers, dim=1)
+    # If cluster_centers is empty, it doesn't have the right shape for concat.
+    all_centers = control_flow_ops.cond(
+        math_ops.equal(self._num_selected, 0), lambda: new_centers,
+        lambda: array_ops.concat([self._cluster_centers, new_centers], 0))
+    # TODO(ccolby): De-dupe all_centers?
+    a = state_ops.assign(
+        self._cluster_centers, all_centers, validate_shape=False)
+    if self._cluster_centers_updated is not self._cluster_centers:
+      a = state_ops.assign(
+          self._cluster_centers_updated, a, validate_shape=False)
+    return self._num_clusters - array_ops.shape(a)[0]
+
+  def _initialize(self):
+    with ops.control_dependencies([
+        check_ops.assert_positive(self._num_remaining),
+    ]):
+      if self._initial_clusters == KMC2_INIT:
+        num_now_remaining = self._kmc2_multiple_centers()
+      else:
+        num_now_remaining = self._add_new_centers()
+      return control_flow_ops.cond(
+          math_ops.equal(num_now_remaining, 0),
+          lambda: state_ops.assign(self._cluster_centers_initialized, True),
+          control_flow_ops.no_op)
+
+  def op(self):
+    """Returns the cluster initializer op."""
+    return control_flow_ops.cond(
+        math_ops.equal(self._num_remaining, 0),
+        lambda: check_ops.assert_equal(self._cluster_centers_initialized, True),
+        self._initialize)
diff --git a/tensorflow/python/ops/clustering_ops_test.py b/tensorflow/python/ops/clustering_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5804c660e67eedf09b0dec6e599d1cf644156a9d
--- /dev/null
+++ b/tensorflow/python/ops/clustering_ops_test.py
@@ -0,0 +1,212 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for clustering_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import clustering_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KmeansPlusPlusInitializationTest(test.TestCase):
+
+  # All but one input point are close to (101, 1). With uniform random sampling,
+  # it is highly improbable for (-1, -1) to be selected.
+  def setUp(self):
+    self._points = np.array([[100., 0.],
+                             [101., 2.],
+                             [102., 0.],
+                             [100., 1.],
+                             [100., 2.],
+                             [101., 0.],
+                             [101., 0.],
+                             [101., 1.],
+                             [102., 0.],
+                             [-1., -1.]]).astype(np.float32)
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      sampled_points = clustering_ops.kmeans_plus_plus_initialization(
+          self._points, 3, seed, (seed % 5) - 1)
+      self.assertAllClose(
+          sorted(self.evaluate(sampled_points).tolist()),
+          [[-1., -1.], [101., 1.], [101., 1.]],
+          atol=1.0)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationTest(test.TestCase):
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      distances = np.zeros(1000).astype(np.float32)
+      distances[6] = 10e7
+      distances[4] = 10e3
+
+      sampled_point = clustering_ops.kmc2_chain_initialization(distances, seed)
+      self.assertAllEqual(sampled_point, 6)
+      distances[6] = 0.0
+      sampled_point = clustering_ops.kmc2_chain_initialization(distances, seed)
+      self.assertAllEqual(sampled_point, 4)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationLargeTest(test.TestCase):
+
+  def setUp(self):
+    self._distances = np.zeros(1001)
+    self._distances[500] = 100.0
+    self._distances[1000] = 50.0
+
+  def testBasic(self):
+    with self.cached_session():
+      counts = {}
+      seed = 0
+      for i in range(50):
+        sample = self.evaluate(
+            clustering_ops.kmc2_chain_initialization(self._distances, seed + i))
+        counts[sample] = counts.get(sample, 0) + 1
+      self.assertEquals(len(counts), 2)
+      self.assertTrue(500 in counts)
+      self.assertTrue(1000 in counts)
+      self.assertGreaterEqual(counts[500], 5)
+      self.assertGreaterEqual(counts[1000], 5)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationCornercaseTest(test.TestCase):
+
+  def setUp(self):
+    self._distances = np.zeros(10)
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      sampled_point = clustering_ops.kmc2_chain_initialization(
+          self._distances, seed)
+      self.assertAllEqual(sampled_point, 0)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+# A simple test that can be verified by hand.
+class NearestCentersTest(test.TestCase):
+
+  def setUp(self):
+    self._points = np.array([[100., 0.],
+                             [101., 2.],
+                             [99., 2.],
+                             [1., 1.]]).astype(np.float32)
+
+    self._centers = np.array([[100., 0.],
+                              [99., 1.],
+                              [50., 50.],
+                              [0., 0.],
+                              [1., 1.]]).astype(np.float32)
+
+  def testNearest1(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 1)
+      self.assertAllClose(indices, [[0], [0], [1], [4]])
+      self.assertAllClose(distances, [[0.], [5.], [1.], [0.]])
+
+  def testNearest2(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 2)
+      self.assertAllClose(indices, [[0, 1], [0, 1], [1, 0], [4, 3]])
+      self.assertAllClose(distances, [[0., 2.], [5., 5.], [1., 5.], [0., 2.]])
+
+
+@test_util.run_all_in_graph_and_eager_modes
+# A test with large inputs.
+class NearestCentersLargeTest(test.TestCase):
+
+  def setUp(self):
+    num_points = 1000
+    num_centers = 2000
+    num_dim = 100
+    max_k = 5
+    # Construct a small number of random points and later tile them.
+    points_per_tile = 10
+    assert num_points % points_per_tile == 0
+    points = np.random.standard_normal(
+        [points_per_tile, num_dim]).astype(np.float32)
+    # Construct random centers.
+    self._centers = np.random.standard_normal(
+        [num_centers, num_dim]).astype(np.float32)
+
+    # Exhaustively compute expected nearest neighbors.
+    def squared_distance(x, y):
+      return np.linalg.norm(x - y, ord=2)**2
+
+    nearest_neighbors = [
+        sorted([(squared_distance(point, self._centers[j]), j)
+                for j in range(num_centers)])[:max_k] for point in points
+    ]
+    expected_nearest_neighbor_indices = np.array(
+        [[i for _, i in nn] for nn in nearest_neighbors])
+    expected_nearest_neighbor_squared_distances = np.array(
+        [[dist for dist, _ in nn] for nn in nearest_neighbors])
+    # Tile points and expected results to reach requested size (num_points)
+    (self._points, self._expected_nearest_neighbor_indices,
+     self._expected_nearest_neighbor_squared_distances) = (
+         np.tile(x, (int(num_points / points_per_tile), 1))
+         for x in (points, expected_nearest_neighbor_indices,
+                   expected_nearest_neighbor_squared_distances))
+
+  def testNearest1(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 1)
+      self.assertAllClose(
+          indices,
+          self._expected_nearest_neighbor_indices[:, [0]])
+      self.assertAllClose(
+          distances,
+          self._expected_nearest_neighbor_squared_distances[:, [0]])
+
+  def testNearest5(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 5)
+      self.assertAllClose(
+          indices,
+          self._expected_nearest_neighbor_indices[:, 0:5])
+      self.assertAllClose(
+          distances,
+          self._expected_nearest_neighbor_squared_distances[:, 0:5])
+
+
+if __name__ == "__main__":
+  np.random.seed(0)
+  test.main()
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 0fd9368d2194e875aa5c4ddfb716f0898d6a9c49..9c5a39b90e0e163f559524e33f7deb04794c1d0d 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -50,6 +50,24 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
 
+  def _testMultipleConcurrentCollectiveReduce(self, t0, t1, expected):
+    group_key = 1
+    group_size = 2
+    num_instances = 2
+    all_reduces = []
+    config = config_pb2.ConfigProto(device_count={'CPU': group_size})
+    config.experimental.collective_deterministic_sequential_execution = True
+    with self.session(config=config) as sess:
+      for cpu in range(group_size):
+        with ops.device('/CPU:%d' % cpu):
+          in_tensor = constant_op.constant(t0 if cpu == 0 else t1)
+          for instance in range(num_instances):
+            all_reduces.append(collective_ops.all_reduce(
+                in_tensor, group_size, group_key, instance, 'Add', 'Div'))
+      results = sess.run(all_reduces)
+    for i in range(group_size * num_instances):
+      self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
+
   @test_util.run_deprecated_v1
   def testCollectiveReduce(self):
     self._testCollectiveReduce([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
@@ -62,6 +80,13 @@ class CollectiveOpTest(test.TestCase):
                                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
                                [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2], False)
 
+  @test_util.run_deprecated_v1
+  def testCollectiveMultipleConcurrentReduce(self):
+    self._testMultipleConcurrentCollectiveReduce(
+        [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+        [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
+        [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2])
+
   @test_util.run_deprecated_v1
   def testCollectiveReduceScalar(self):
     self._testCollectiveReduce(0.1, 0.3, 0.2, True)
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 7d09e32e241d55f064239bbfd4c4af45ac329c4b..a0a13fbde39f30e42b1d6843afb68dda89198383 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util import nest
 
 
@@ -68,34 +69,33 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
         true_name,
         true_fn, [], {},
         func_graph=util.CondBranchFuncGraph(
-            true_name, read_only_collections=False),
+            true_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
     false_graph = func_graph_module.func_graph_from_py_func(
         false_name,
         false_fn, [], {},
         func_graph=util.CondBranchFuncGraph(
-            false_name, read_only_collections=False),
+            false_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
 
-    outputs = _build_cond(pred, true_graph, false_graph,
-                          true_graph.external_captures,
-                          false_graph.external_captures,
-                          name=scope)
-
-    return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
-                                              outputs)
+    return _build_cond(pred, true_graph, false_graph,
+                       true_graph.external_captures,
+                       false_graph.external_captures,
+                       name=scope)
 
 
 @ops.RegisterGradient("If")
 def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of an If op produced by cond_v2."""
-  true_graph, false_graph = _get_func_graphs(op)
+  # Get the if operator (this logic handles the case where op is a MockOp)
+  if_op = op.outputs[0].op
+  true_graph, false_graph = _get_func_graphs(if_op)
   # Note: op.graph != ops.get_default_graph() when we are computing the gradient
   # of a nested cond.
-  assert true_graph.outer_graph == op.graph
-  assert false_graph.outer_graph == op.graph
+  assert true_graph.outer_graph == if_op.graph
+  assert false_graph.outer_graph == if_op.graph
 
   # Create grad functions that compute the gradient of the true/false forward
   # graphs. These functions will capture tensors from the forward pass
@@ -105,9 +105,6 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   false_grad_graph = _create_grad_func(
       false_graph, grads, util.unique_grad_fn_name(false_graph.name))
 
-  assert ([t.dtype for t in true_grad_graph.outputs] ==
-          [t.dtype for t in false_grad_graph.outputs])
-
   if (true_grad_graph.if_op_needs_rewrite or
       false_grad_graph.if_op_needs_rewrite):
     # Modify 'op' to output the intermediates needed by the grad functions. Note
@@ -140,11 +137,12 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
     true_graph.name += "_rewritten"
     false_graph.name += "_rewritten"
 
-    op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
-    op._set_func_attr("else_branch", util.create_new_tf_function(false_graph))
-    op._set_type_list_attr("Tout", true_graph.output_types)
-    op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
-    op._add_outputs(
+    if_op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
+    if_op._set_func_attr("else_branch",
+                         util.create_new_tf_function(false_graph))
+    if_op._set_type_list_attr("Tout", true_graph.output_types)
+    if_op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
+    if_op._add_outputs(
         [t.dtype for t in extra_true_outputs],
         [t.shape for t in extra_true_outputs])
 
@@ -153,7 +151,10 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
   false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
 
-  outputs = _build_cond(op.inputs[0], true_grad_graph, false_grad_graph,
+  # This modifies true_grad_graph and false_grad_graph.
+  _make_output_composite_tensors_match(true_grad_graph, false_grad_graph)
+
+  outputs = _build_cond(if_op.inputs[0], true_grad_graph, false_grad_graph,
                         true_grad_inputs, false_grad_inputs)
 
   # The predicate has no gradient.
@@ -216,7 +217,8 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
 
   # Prevent fetching since the variant outputs can't be fetched directly.
   if_op.graph.prevent_fetching(if_op)
-  return tensors
+  return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
+                                            tensors)
 
 
 def _get_func_graphs(if_op):
@@ -472,6 +474,50 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   return new_inputs
 
 
+def _make_output_composite_tensors_match(true_graph, false_graph):
+  """Rewrites {true,false}_graph's outputs to use the same _TensorLike classes.
+
+  Currently the only transformation implemented is turning a Tensor into an
+  equivalent IndexedSlices if the other branch returns an IndexedSlices.
+  Updates {true,false}_graph.{outputs,structured_outputs}.
+
+  Args:
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+
+  Raises:
+    TypeError: if a pair of outputs cannot be rewritten.
+  """
+  # Note: since this is only used for gradient graphs, we do not expect the
+  # outputs to be structured (e.g. nested lists), and thus do not need to use
+  # nest.flatten, etc.
+  true_outputs = list(true_graph.structured_outputs)
+  false_outputs = list(false_graph.structured_outputs)
+  assert len(true_outputs) == len(false_outputs)
+
+  for idx, (true_out, false_out) in enumerate(zip(true_outputs, false_outputs)):
+    if type(true_out) == type(false_out):  # pylint: disable=unidiomatic-typecheck
+      continue
+    if (isinstance(true_out, ops.IndexedSlices) and
+        isinstance(false_out, ops.Tensor)):
+      with false_graph.as_default():
+        false_outputs[idx] = math_ops._as_indexed_slices(false_out)
+    elif (isinstance(true_out, ops.Tensor) and
+          isinstance(false_out, ops.IndexedSlices)):
+      with true_graph.as_default():
+        true_outputs[idx] = math_ops._as_indexed_slices(true_out)
+    else:
+      raise TypeError(
+          "Cannot reconcile tf.cond %i-th outputs:\n"
+          "  true_fn returned:  %s\n"
+          "  false_fn returned: %s" % (idx, true_out, false_out))
+
+  true_graph.structured_outputs = true_outputs
+  true_graph.outputs = func_graph_module.flatten(true_outputs)
+  false_graph.structured_outputs = false_outputs
+  false_graph.outputs = func_graph_module.flatten(false_outputs)
+
+
 def _wrap_intermediates(func_graph, intermediates):
   with func_graph.as_default():
     return [gen_dataset_ops.optional_from_value([t]) for t in intermediates]
@@ -515,23 +561,30 @@ def _create_fakeparams(func_graph, template_tensors):
 
 def _check_same_outputs(true_graph, false_graph):
   """Raises an error if true_graph and false_graph have different outputs."""
-  true_output_types = [t.dtype for t in true_graph.outputs]
-  false_output_types = [t.dtype for t in false_graph.outputs]
-  if (len(true_graph.outputs) != len(false_graph.outputs) or
-      true_output_types != false_output_types):
+
+  def error(error_detail):
     raise TypeError(
-        "true_fn() and false_fn() must return the same number and type of "
-        "arguments, got:\n"
-        "  true_fn: %s\n"
-        "  false_fn: %s" % (true_output_types, false_output_types))
+        "true_fn and false_fn arguments to tf.cond must have the same number, "
+        "type, and overall structure of return values.\n"
+        "\n"
+        "true_fn output:  %s\n"
+        "false_fn output: %s\n"
+        "\n"
+        "Error details:\n"
+        "%s" % (true_graph.structured_outputs, false_graph.structured_outputs,
+                error_detail))
 
-  # Make sure `structured_outputs` for both graphs have the same structure.
   try:
     nest.assert_same_structure(true_graph.structured_outputs,
-                               false_graph.structured_outputs)
+                               false_graph.structured_outputs,
+                               expand_composites=True)
   except (ValueError, TypeError) as e:
-    raise ValueError("Outputs of true_fn and false_fn must have the same "
-                     "structure: %s" % str(e))
+    error(str(e))
+
+  assert len(true_graph.outputs) == len(false_graph.outputs)
+  for true_out, false_out in zip(true_graph.outputs, false_graph.outputs):
+    if true_out.dtype != false_out.dtype:
+      error("%s and %s have different types" % (true_out, false_out))
 
 
 def _get_output_shapes(true_graph_outputs, false_graph_outputs):
@@ -554,7 +607,8 @@ class _CondGradFuncGraph(util.CondBranchFuncGraph):
   """
 
   def __init__(self, name, forward_graph):
-    super(_CondGradFuncGraph, self).__init__(name, read_only_collections=False)
+    super(_CondGradFuncGraph, self).__init__(
+        name, collections=ops.get_default_graph()._collections)  # pylint: disable=protected-access
     self.if_op_needs_rewrite = False
     self._forward_graph = forward_graph
     # Maps from forward intermediate tensor -> the unwrapped captured
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 99216d7fb15ff865ba70d01995606c6a5e3ab7c4..cfdbe63c2fe9e492af1ae9dcfeaeb9bcd0383402 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -30,11 +30,11 @@ import six
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -181,47 +181,29 @@ def _Identity(data, name=None):
   Returns:
     A Tensor with the same type and value as the input Tensor.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return gen_array_ops.ref_identity(data, name=name)
     else:
       return array_ops.identity(data, name=name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(_Identity, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _Identity(data.values, name=name)
-    indices = array_ops.identity(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = array_ops.identity(dense_shape, name="dense_shape")
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = array_ops.identity(data.dense_shape, name="dense_shape")
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def _NextIteration(data, name=None):
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return ref_next_iteration(data, name=name)
     else:
       return next_iteration(data, name=name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(_NextIteration, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _NextIteration(data.values, name=name)
-    indices = next_iteration(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = next_iteration(dense_shape, name="dense_shape")
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = next_iteration(data.dense_shape, name="dense_shape")
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def _Enter(data,
@@ -244,12 +226,13 @@ def _Enter(data,
     is_constant: If true, the output is constant within the child frame.
     parallel_iterations: The number of iterations allowed to run in parallel.
     use_ref: If true, use ref_enter if data is of ref type.
+    use_input_shape: If true, set the result's shape based on data's shape.
     name: A name for this operation (optional).
 
   Returns:
     The same tensor as `data`.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
       result = gen_control_flow_ops.ref_enter(
@@ -260,46 +243,13 @@ def _Enter(data,
     if use_input_shape:
       result.set_shape(data.get_shape())
     return result
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    def enter_component(t):
+      return _Enter(t, frame_name, is_constant, parallel_iterations,
+                    use_ref, use_input_shape)
+    return nest.map_structure(enter_component, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _Enter(
-        data.values,
-        frame_name,
-        is_constant,
-        parallel_iterations=parallel_iterations,
-        use_input_shape=use_input_shape,
-        name=name)
-    indices = gen_control_flow_ops.enter(
-        data.indices,
-        frame_name,
-        is_constant,
-        parallel_iterations,
-        name="indices")
-    if use_input_shape:
-      indices.set_shape(data.indices.get_shape())
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = gen_control_flow_ops.enter(
-            dense_shape,
-            frame_name,
-            is_constant,
-            parallel_iterations,
-            name="dense_shape")
-        if use_input_shape:
-          dense_shape.set_shape(data.dense_shape.get_shape())
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = gen_control_flow_ops.enter(
-          data.dense_shape,
-          frame_name,
-          is_constant,
-          parallel_iterations,
-          name="dense_shape")
-      if use_input_shape:
-        dense_shape.set_shape(data.dense_shape.get_shape())
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def exit(data, name=None):  # pylint: disable=redefined-builtin
@@ -314,25 +264,16 @@ def exit(data, name=None):  # pylint: disable=redefined-builtin
   Returns:
     The same tensor as `data`.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return gen_control_flow_ops.ref_exit(data, name)
     else:
       return gen_control_flow_ops._exit(data, name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(exit, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = exit(data.values, name=name)
-    indices = gen_control_flow_ops._exit(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = gen_control_flow_ops._exit(dense_shape, name)
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = gen_control_flow_ops._exit(data.dense_shape, name)
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def switch(data, pred, dtype=None, name=None):
@@ -355,32 +296,19 @@ def switch(data, pred, dtype=None, name=None):
     to `output_true`, otherwise it goes to `output_false`.
   """
   with ops.name_scope(name, "Switch", [data, pred]) as name:
-    data = ops.internal_convert_to_tensor_or_indexed_slices(
+    data = ops.internal_convert_to_tensor_or_composite(
         data, dtype=dtype, name="data", as_ref=True)
     pred = ops.convert_to_tensor(pred, name="pred")
     if isinstance(data, ops.Tensor):
       return gen_control_flow_ops.switch(data, pred, name=name)
     else:
-      if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
+      if not isinstance(data, composite_tensor.CompositeTensor):
         raise TypeError("Type %s not supported" % type(data))
-      val, ind = data.values, data.indices
-      val_f, val_t = gen_control_flow_ops.switch(val, pred, name=name)
-      ind_f, ind_t = gen_control_flow_ops.switch(ind, pred, name="indices")
-      if isinstance(data, ops.IndexedSlices):
-        dense_shape = data.dense_shape
-        if dense_shape is not None:
-          dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
-              dense_shape, pred, name="dense_shape")
-        else:
-          dense_shape_f, dense_shape_t = None, None
-        return (ops.IndexedSlices(val_f, ind_f, dense_shape_f),
-                ops.IndexedSlices(val_t, ind_t, dense_shape_t))
-      else:
-        dense_shape = data.dense_shape
-        dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
-            data.dense_shape, pred, name="dense_shape")
-        return (sparse_tensor.SparseTensor(ind_f, val_f, dense_shape_f),
-                sparse_tensor.SparseTensor(ind_t, val_t, dense_shape_t))
+      tensors = nest.flatten(data, expand_composites=True)
+      mapped = [gen_control_flow_ops.switch(tensor, pred) for tensor in tensors]
+      mapped_f, mapped_t = zip(*mapped)
+      return (nest.pack_sequence_as(data, mapped_f, expand_composites=True),
+              nest.pack_sequence_as(data, mapped_t, expand_composites=True))
 
 
 def _SwitchRefOrTensor(data, pred, name="Switch"):
@@ -403,7 +331,7 @@ def _SwitchRefOrTensor(data, pred, name="Switch"):
   Raises:
     TypeError: if data is not a Tensor or IndexedSlices
   """
-  data = ops.convert_to_tensor_or_indexed_slices(data, name="data")
+  data = ops.convert_to_tensor_or_composite(data, name="data")
   # NOTE(vrv): ops.colocate_with(data, ignore_existing=True) below
   # addresses the following scenario.
   #
@@ -456,7 +384,7 @@ def merge(inputs, name=None):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
     inputs = [
-        ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
+        ops.internal_convert_to_tensor_or_composite(inp, as_ref=True)
         for inp in inputs
     ]
     if all(isinstance(v, ops.Tensor) for v in inputs):
@@ -464,30 +392,27 @@ def merge(inputs, name=None):
         return gen_control_flow_ops.ref_merge(inputs, name)
       else:
         return gen_control_flow_ops.merge(inputs, name)
-    elif all(isinstance(v, sparse_tensor.SparseTensor) for v in inputs):
-      # Only handle the case when all inputs are SparseTensor.
-      values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops.merge(
-          [inp.indices for inp in inputs], name="indices")
-      dense_shape, _ = gen_control_flow_ops.merge(
-          [inp.dense_shape for inp in inputs], name="dense_shape")
-      return (sparse_tensor.SparseTensor(indices, values, dense_shape),
-              chosen_index)
     else:
-      # For now convert all the inputs as IndexedSlices.
-      inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
-      values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops.merge(
-          [inp.indices for inp in inputs], name="indices")
-      if any(inp.dense_shape is not None for inp in inputs):
-        if any(inp.dense_shape is None for inp in inputs):
-          raise ValueError("Either all merged IndexedSlices must have a "
-                           "dense_shape, or none must have a dense_shape.")
-        dense_shape, _ = gen_control_flow_ops.merge(
-            [inp.dense_shape for inp in inputs], name="dense_shape")
-      else:
-        dense_shape = None
-      return ops.IndexedSlices(values, indices, dense_shape), chosen_index
+      # If there is a mix of tensors and indexed slices, then convert the
+      # tensors to indexed slices.
+      if all(isinstance(v, (ops.IndexedSlices, ops.Tensor)) for v in inputs):
+        inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
+
+      for v in inputs:
+        if not isinstance(v, composite_tensor.CompositeTensor):
+          raise TypeError("Type %s not supported" % type(v))
+
+      for v in inputs[1:]:
+        nest.assert_same_structure(inputs[0], v, expand_composites=True)
+
+      flat_inputs = [nest.flatten(v, expand_composites=True) for v in inputs]
+      merged_results = [gen_control_flow_ops.merge(component)
+                        for component in zip(*flat_inputs)]
+      flat_merged = [tensor for (tensor, _) in merged_results]
+      chosen_index = merged_results[0][1]
+      merged_inputs = nest.pack_sequence_as(inputs[0], flat_merged,
+                                            expand_composites=True)
+      return (merged_inputs, chosen_index)
 
 
 # pylint: enable=protected-access
@@ -537,6 +462,30 @@ def _ShapeLessThanOrEqual(shape1, shape2):
   return True
 
 
+def _get_shape_invariant(var, shape=None):
+  """Returns a shape invariant for the given variable.
+
+  If `var` is a `CompositeTensor`, then this uses
+  `_shape_invariant_to_components()` to get shape invariants for the
+  component tensors.
+
+  Args:
+    var: The tensor whose shape is described.
+    shape: The shape invariant for the tensor.  If not specified, then a default
+      shape invariant for `var` is returned.
+
+  Returns:
+    The shape invariant for `var` (if it is a `Tensor`), or the shape invariants
+    for the components that comprise `var` (if it is a `CompositeTensor`).
+  """
+  if isinstance(var, composite_tensor.CompositeTensor):
+    return var._shape_invariant_to_components(shape)  # pylint: disable=protected-access
+  elif shape is None:
+    return var.shape
+  else:
+    return shape
+
+
 def _SetShapeInvariants(input_vars, enter_vars, shapes):
   """Set the shapes of the tensors in `enter_vars` to `shapes`.
 
@@ -566,31 +515,7 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
             (inp.name, inp.get_shape(), shape))
       var.set_shape(shape)
     else:
-      if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-        raise TypeError("Type %s not supported" % type(var))
-      if isinstance(var, ops.IndexedSlices):
-        if not _ShapeLessThanOrEqual(inp.values.get_shape(), shape):
-          raise ValueError(
-              "The shape invariant specified for %s is not compatible with "
-              "the initial shape of the values tensor of this IndexedSlices. "
-              "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s." % (inp.values.name, inp.values.get_shape(),
-                                    shape))
-        var.values.set_shape(shape)
-        var.indices.set_shape(tensor_shape.TensorShape([shape[0]]))
-        if var.dense_shape is not None:
-          var.dense_shape.set_shape(tensor_shape.TensorShape([shape.ndims]))
-      else:
-        if not _ShapeLessThanOrEqual(inp.dense_shape.get_shape(), shape):
-          raise ValueError(
-              "The shape invariant specified for %s is not compatible with "
-              "the initial shape of the shape tensor of this SparseTensor. "
-              "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s." % (inp.dense_shape.name,
-                                    inp.dense_shape.get_shape(), shape))
-        var.values.set_shape(tensor_shape.TensorShape([None]))
-        var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims]))
-        var.dense_shape.set_shape(shape)
+      raise TypeError("Type %s not supported" % type(var))
 
 
 def _EnforceShapeInvariant(merge_var, next_var):
@@ -619,49 +544,7 @@ def _EnforceShapeInvariant(merge_var, next_var):
           "use the `shape_invariants` argument of tf.while_loop to specify a "
           "less-specific shape." % (input_t.name, input_t.shape, n_shape))
   else:
-    if not isinstance(merge_var,
-                      (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(merge_var))
-    if isinstance(merge_var, ops.IndexedSlices):
-      m_values_shape = merge_var.values.get_shape()
-      m_indices_shape = merge_var.indices.get_shape()
-      m_shape_shape = tensor_shape.TensorShape(None)
-      if merge_var.dense_shape is not None:
-        m_shape_shape = merge_var.dense_shape.get_shape()
-      n_values_shape = next_var.values.get_shape()
-      n_indices_shape = next_var.indices.get_shape()
-      n_shape_shape = tensor_shape.TensorShape(None)
-      if next_var.dense_shape is not None:
-        n_shape_shape = next_var.dense_shape.get_shape()
-      if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or
-          not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape)):
-        if not _ShapeLessThanOrEqual(n_values_shape, m_values_shape):
-          raise ValueError(
-              "The shape for %s is not an invariant for the loop. It enters "
-              "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-              "after one iteration. Provide shape invariants using either the "
-              "`shape_invariants` argument of tf.while_loop or set_shape() "
-              "on the loop variables." %
-              (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-               n_values_shape, n_indices_shape, n_shape_shape))
-    else:
-      m_values_shape = merge_var.values.get_shape()
-      m_indices_shape = merge_var.indices.get_shape()
-      m_shape_shape = merge_var.dense_shape.get_shape()
-      n_values_shape = next_var.values.get_shape()
-      n_indices_shape = next_var.indices.get_shape()
-      n_shape_shape = next_var.dense_shape.get_shape()
-      if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or
-          not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or
-          not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)):
-        raise ValueError(
-            "The shape for %s is not an invariant for the loop. It enters "
-            "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-            "after one iteration. Provide shape invariants using either "
-            "the `shape_invariants` argument of tf.while_loop or set_shape() "
-            "on the loop variables." %
-            (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-             n_values_shape, n_indices_shape, n_shape_shape))
+    raise TypeError("Type %s not supported" % type(merge_var))
 
 
 def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
@@ -676,26 +559,15 @@ def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
       # TODO(skyewm): call this for other cases below (needs testing)
       _EnforceShapeInvariant(m, v)
     m.op._update_input(1, v)  # pylint: disable=protected-access
-  elif isinstance(m, ops.IndexedSlices):
+  elif isinstance(m, composite_tensor.CompositeTensor):
     # pylint: disable=protected-access
-    v = math_ops._as_indexed_slices(v, optimize=False)
-    v = _NextIteration(v)
-    m.values.op._update_input(1, v.values)
-    m.indices.op._update_input(1, v.indices)
+    def update_component(m_component, v_component):
+      m_component.op._update_input(1, v_component)
+    if isinstance(m, ops.IndexedSlices):
+      v = math_ops._as_indexed_slices(v, optimize=False)
     # pylint: enable=protected-access
-    if m.dense_shape is not None:
-      if v.dense_shape is None:
-        raise ValueError("Must have dense shape: %s" % v.name)
-      m.dense_shape.op._update_input(1, v.dense_shape)
-  elif isinstance(m, sparse_tensor.SparseTensor):
-    if not isinstance(v, sparse_tensor.SparseTensor):
-      raise ValueError("Must be a sparse tensor: %s" % v.name)
     v = _NextIteration(v)
-    # pylint: disable=protected-access
-    m.values.op._update_input(1, v.values)
-    m.indices.op._update_input(1, v.indices)
-    m.dense_shape.op._update_input(1, v.dense_shape)
-    # pylint: enable=protected-access
+    return nest.map_structure(update_component, m, v, expand_composites=True)
   else:
     raise TypeError("Type %s not supported" % type(m))
   return v
@@ -1613,7 +1485,8 @@ class ControlFlowContext(object):
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
-      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result)
+      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result,
+                         expand_composites=True)
 
   def GetWhileContext(self):
     """Return the while context containing this context."""
@@ -1920,19 +1793,9 @@ class CondContext(ControlFlowContext):
     if isinstance(v, ops.Operation):
       # Use pivot as the proxy for this op.
       return with_dependencies([v], self._pivot)
-    elif isinstance(v, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      values = self._ProcessOutputTensor(v.values)
-      indices = self._ProcessOutputTensor(v.indices)
-      if isinstance(v, ops.IndexedSlices):
-        dense_shape = v.dense_shape
-        if dense_shape is not None:
-          dense_shape = self._ProcessOutputTensor(dense_shape)
-        return ops.IndexedSlices(values, indices, dense_shape)
-      else:
-        dense_shape = self._ProcessOutputTensor(v.dense_shape)
-        return sparse_tensor.SparseTensor(indices, values, dense_shape)
     else:
-      v = nest.map_structure(_convert_tensorarray_to_flow, v)
+      v = nest.map_structure(_convert_tensorarray_to_flow, v,
+                             expand_composites=True)
       return self._ProcessOutputTensor(ops.convert_to_tensor(v))
 
   def BuildCondBranch(self, fn):
@@ -1949,11 +1812,13 @@ class CondContext(ControlFlowContext):
           return no_op(), None
         else:
           original_result = nest.map_structure(array_ops.identity,
-                                               original_result)
+                                               original_result,
+                                               expand_composites=True)
     if original_result is None:
       return None, None
 
-    result = nest.map_structure(self._BuildCondTensor, original_result)
+    result = nest.map_structure(self._BuildCondTensor, original_result,
+                                expand_composites=True)
     if not isinstance(result, (list, _basetuple)):
       result = [result]
     return original_result, result
@@ -2047,7 +1912,9 @@ def cond(pred,
   ```
 
   """
-  if util.ENABLE_CONTROL_FLOW_V2 and not context.executing_eagerly():
+  # Always enable control flow v2 if building a function, regardless of toggle.
+  if (util.EnableControlFlowV2(ops.get_default_graph()) and
+      not context.executing_eagerly()):
     return cond_v2.cond_v2(pred, true_fn, false_fn, name)
 
   # We needed to make true_fn/false_fn keyword arguments for
@@ -2118,7 +1985,8 @@ def cond(pred,
 
     # Check that the return values of the two branches have the same structure.
     try:
-      nest.assert_same_structure(orig_res_t, orig_res_f)
+      nest.assert_same_structure(orig_res_t, orig_res_f,
+                                 expand_composites=True)
     except TypeError as e:
       raise TypeError(
           "Incompatible return types of true_fn and false_fn: {}".format(e))
@@ -2130,24 +1998,21 @@ def cond(pred,
     if not res_t:
       raise ValueError("true_fn and false_fn must return at least one result.")
 
-    res_t_flat = nest.flatten(res_t)
-    res_f_flat = nest.flatten(res_f)
-
-    for x, y in zip(res_t_flat, res_f_flat):
-      assert ((isinstance(x, ops.IndexedSlices) and
-               isinstance(y, ops.IndexedSlices)) or
-              (isinstance(x, sparse_tensor.SparseTensor) and
-               isinstance(y, sparse_tensor.SparseTensor)) or
-              (isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor)))
-      val_x = x if isinstance(x, ops.Tensor) else x.values
-      val_y = y if isinstance(y, ops.Tensor) else y.values
-      if val_x.dtype.base_dtype != val_y.dtype.base_dtype:
-        raise ValueError(
-            "Outputs of true_fn and false_fn must have the same type: %s, %s" %
-            (val_x.dtype.name, val_y.dtype.name))
+    res_t_flat = nest.flatten(res_t, expand_composites=True)
+    res_f_flat = nest.flatten(res_f, expand_composites=True)
+
+    for i, (x, y) in enumerate(zip(res_t_flat, res_f_flat)):
+      assert isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor)
+      if x.dtype.base_dtype != y.dtype.base_dtype:
+        _cast_indexed_slice_indices(res_t, res_t_flat, res_f_flat)
+        if res_t_flat[i].dtype.base_dtype != res_f_flat[i].dtype.base_dtype:
+          raise ValueError(
+              "Outputs of true_fn and false_fn must have the same type: "
+              "%s, %s" % (x.dtype.name, y.dtype.name))
 
     merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
-    merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
+    merges = _convert_flows_to_tensorarrays(
+        nest.flatten(orig_res_t, expand_composites=True), merges)
 
     # Only add non-nested conds to the collection. Any nested control flow will
     # be encapsulated in the root context.
@@ -2156,7 +2021,8 @@ def cond(pred,
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
-    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges)
+    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges,
+                                   expand_composites=True)
 
     # Singleton lists and tuples are automatically unpacked if strict == False.
     if not strict:
@@ -2164,6 +2030,48 @@ def cond(pred,
     return merges
 
 
+def _cast_indexed_slice_indices(structure, flat_a, flat_b):
+  """Cast IndexedSlice.indices from int32 to int64 where necessary.
+
+  For each `IndexedSlices` in the nested structure `structure`, find its
+  indices `Tensor` in the corresponding flattened lists `flat_a` and `flat_b`
+  (where composites have been expanded); and if those indices tensors have
+  different dtypes (i.e., if one is int64 but the other is int32), then cast
+  them to both be int64.
+
+  Args:
+    structure: The nested structure that was flattened.
+    flat_a: A flattened list of `Tensors` whose structure matches
+        `structure`.  Will be modified in place to cast `IndexedSlices`
+        indices tensors to int64, where necessary.
+    flat_a: A flattened list of `Tensors` whose structure matches
+        `structure`.  Will be modified in place to cast `IndexedSlices`
+        indices tensors to int64, where necessary.
+  """
+  # Find the locations (in flat_a and flat_b) of the IndexedSlices'
+  # indices tensors.
+  indexed_slice_indices = []
+  current_index = 0
+  for item in nest.flatten(structure, expand_composites=False):
+    if isinstance(item, ops.IndexedSlices):
+      # indices is the second component of the composite tensor.
+      indexed_slice_indices.append(current_index + 1)
+    if nest.is_sequence_or_composite(item):
+      current_index += len(nest.flatten(item, expand_composites=True))
+    else:
+      current_index += 1
+  assert current_index == len(flat_a)
+
+  for index in indexed_slice_indices:
+    assert flat_a[index].dtype in (dtypes.int32, dtypes.int64)
+    assert flat_b[index].dtype in (dtypes.int32, dtypes.int64)
+    if flat_a[index].dtype != flat_b[index].dtype:
+      if flat_b[index].dtype == dtypes.int32:
+        flat_b[index] = math_ops.cast(flat_b[index], dtypes.int64)
+      else:
+        flat_a[index] = math_ops.cast(flat_a[index], dtypes.int64)
+
+
 # pylint: enable=g-doc-args
 # pylint: enable=redefined-outer-name
 
@@ -2937,21 +2845,12 @@ class WhileContext(ControlFlowContext):
       if isinstance(x, ops.Tensor):
         self._values.add(x.name)
       else:
-        self._values.add(x.values.name)
-        self._values.add(x.indices.name)
-        if isinstance(x, ops.IndexedSlices):
-          dense_shape = x.dense_shape
-        elif isinstance(x, sparse_tensor.SparseTensor):
-          dense_shape = x.dense_shape
-        else:
-          raise TypeError("Type %s not supported" % type(x))
-        if dense_shape is not None:
-          self._values.add(dense_shape.name)
+        raise TypeError("Type %s not supported" % type(x))
 
   def _BuildLoop(self, pred, body, original_loop_vars, loop_vars,
                  shape_invariants):
     """Core: Add the loop termination condition and body to the graph."""
-    flat_loop_vars = nest.flatten(original_loop_vars)
+    flat_loop_vars = nest.flatten(original_loop_vars, expand_composites=True)
 
     # Let the context know the loop variables so the loop variables
     # would be added in the outer contexts properly.
@@ -3003,7 +2902,8 @@ class WhileContext(ControlFlowContext):
         _convert_flows_to_tensorarrays(flat_loop_vars, merge_vars))
     packed_vars = nest.pack_sequence_as(
         structure=original_loop_vars,
-        flat_sequence=merge_vars_with_tensor_arrays)
+        flat_sequence=merge_vars_with_tensor_arrays,
+        expand_composites=True)
     c = ops.convert_to_tensor(pred(*packed_vars))
     self._pivot = loop_cond(c, name="LoopCond")
     switch_vars = [_SwitchRefOrTensor(x, self._pivot) for x in merge_vars]
@@ -3017,11 +2917,12 @@ class WhileContext(ControlFlowContext):
         _convert_flows_to_tensorarrays(flat_loop_vars, vars_for_body))
     packed_vars_for_body = nest.pack_sequence_as(
         structure=original_loop_vars,
-        flat_sequence=vars_for_body_with_tensor_arrays)
+        flat_sequence=vars_for_body_with_tensor_arrays,
+        expand_composites=True)
     pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     body_result = body(*packed_vars_for_body)
     post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
-    if not nest.is_sequence(body_result):
+    if not nest.is_sequence_or_composite(body_result):
       body_result = [body_result]
     if len(post_summaries) > len(pre_summaries):
       new_summaries = post_summaries[len(pre_summaries):]
@@ -3035,20 +2936,24 @@ class WhileContext(ControlFlowContext):
             return x
           return array_ops.identity(x)
 
-        body_result = nest.map_structure(map_fn, body_result)
+        body_result = nest.map_structure(map_fn, body_result,
+                                         expand_composites=True)
 
     # Compare the structure types of input and output of body.
     # For backwards compatibility, the first layer is forced to a list
     # during this comparison, because inputs are typically lists and
     # outputs of the body are typically tuples.
-    nest.assert_same_structure(list(packed_vars_for_body), list(body_result))
+    nest.assert_same_structure(list(packed_vars_for_body), list(body_result),
+                               expand_composites=True)
 
     # Store body_result to keep track of TensorArrays returned by body
     original_body_result = body_result
     # Convert TensorArrays returned by body into their flow variables
-    result = nest.map_structure(_convert_tensorarray_to_flow,
-                                nest.flatten(body_result))
-    result = ops.convert_n_to_tensor_or_indexed_slices(result)
+    result = nest.map_structure(
+        _convert_tensorarray_to_flow,
+        nest.flatten(body_result, expand_composites=True),
+        expand_composites=True)
+    result = ops.convert_n_to_tensor_or_composite(result)
 
     # Add NextIteration and the back edges to complete the loop.
     if len(merge_vars) != len(result):
@@ -3074,9 +2979,15 @@ class WhileContext(ControlFlowContext):
     # Keep original_loop_vars to identify which are TensorArrays
     original_loop_vars = loop_vars
     # Convert TensorArrays to their flow variables
-    loop_vars = nest.map_structure(_convert_tensorarray_to_flow,
-                                   nest.flatten(loop_vars))
-    loop_vars = ops.convert_n_to_tensor_or_indexed_slices(loop_vars)
+    loop_vars = nest.map_structure(
+        _convert_tensorarray_to_flow,
+        nest.flatten(loop_vars, expand_composites=False),
+        expand_composites=True)
+    loop_vars = ops.convert_n_to_tensor_or_composite(loop_vars)
+    if shape_invariants is None:
+      shape_invariants = nest.map_structure(
+          _get_shape_invariant, loop_vars, expand_composites=False)
+    loop_vars = nest.flatten(loop_vars, expand_composites=True)
     try:
       self.Enter()
       # _BuildLoop calls _update_input in several places. _mutation_lock()
@@ -3088,14 +2999,15 @@ class WhileContext(ControlFlowContext):
     finally:
       self.Exit()
 
-    flat_result = nest.flatten(original_body_result)
+    flat_result = nest.flatten(original_body_result, expand_composites=True)
     # Convert TensorArray flow variables outside the context back into
     # their associated TensorArrays for returning to caller.
     exit_vars_with_tensor_arrays = (
         _convert_flows_to_tensorarrays(flat_result, exit_vars))
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
-        flat_sequence=exit_vars_with_tensor_arrays)
+        flat_sequence=exit_vars_with_tensor_arrays,
+        expand_composites=True)
 
     if return_same_structure:
       return packed_exit_vars
@@ -3109,12 +3021,7 @@ class WhileContext(ControlFlowContext):
       if isinstance(e, ops.Tensor):
         xs = [e]
       else:
-        if not isinstance(e, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-          raise TypeError("Type %s not supported" % type(e))
-        xs = [e.values, e.indices]
-        shape = e.dense_shape
-        if shape is not None:
-          xs.append(shape)
+        raise TypeError("Type %s not supported" % type(e))
       for x in xs:
         inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
@@ -3482,12 +3389,15 @@ def while_loop(cond,
   ```
 
   """
-  if util.ENABLE_CONTROL_FLOW_V2 and not context.executing_eagerly():
+  # Always enable control flow v2 if building a function, regardless of toggle.
+  if (util.EnableControlFlowV2(ops.get_default_graph()) and
+      not context.executing_eagerly()):
     return while_v2.while_loop(
         cond,
         body,
         loop_vars,
         shape_invariants=shape_invariants,
+        parallel_iterations=parallel_iterations,
         maximum_iterations=maximum_iterations,
         name=name,
         return_same_structure=return_same_structure)
@@ -3541,7 +3451,12 @@ def while_loop(cond,
     if shape_invariants is not None:
       if maximum_iterations is not None:
         shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
-      nest.assert_same_structure(loop_vars, shape_invariants)
+
+      nest.assert_same_structure(loop_vars, shape_invariants,
+                                 expand_composites=False)
+      shape_invariants = nest.map_structure(
+          _get_shape_invariant, loop_vars, shape_invariants,
+          expand_composites=False)
 
     loop_context = WhileContext(
         maximum_iterations=maximum_iterations,
@@ -3583,7 +3498,7 @@ def _AsTensorList(x, p):
   for v in x:
     if isinstance(v, ops.Operation):
       v = with_dependencies([v], p)
-    v = ops.convert_to_tensor_or_indexed_slices(v)
+    v = ops.convert_to_tensor_or_composite(v)
     if isinstance(v, ops.Tensor):
       l.append(array_ops.identity(v))
     else:
@@ -3631,7 +3546,7 @@ def with_dependencies(dependencies, output_tensor, name=None):
                       list(dependencies) + [output_tensor]) as name:
     with ops.colocate_with(output_tensor):
       with ops.control_dependencies(dependencies):
-        output_tensor = ops.convert_to_tensor_or_indexed_slices(output_tensor)
+        output_tensor = ops.convert_to_tensor_or_composite(output_tensor)
         if isinstance(output_tensor, ops.Tensor):
           return _Identity(output_tensor, name=name)
         else:
@@ -3682,7 +3597,7 @@ def group(*inputs, **kwargs):
 
     # Sorts *inputs according to their devices.
     ops_on_device = {}  # device -> operations specified on the device.
-    for inp in nest.flatten(inputs):
+    for inp in nest.flatten(inputs, expand_composites=True):
       if not hasattr(inp, "device"):
         raise TypeError("Expected tf.group() expected Tensor arguments not "
                         "'%s' with type '%s'" % (inp, type(inp)))
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index 1747f06109daa1e7092fd1bbbcd2e2cc5762fc6c..ff0dff0042e409cc12131ca4e97731a210c6203b 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -26,16 +26,22 @@ from __future__ import print_function
 import os
 import traceback
 
-from tensorflow.python import tf2
 from tensorflow.python.platform import tf_logging as logging
 
-ENABLE_CONTROL_FLOW_V2 = (tf2.enabled() or
-                          os.getenv("TF_ENABLE_CONTROL_FLOW_V2", "0") != "0" or
+ENABLE_CONTROL_FLOW_V2 = (os.getenv("TF_ENABLE_CONTROL_FLOW_V2", "0") != "0" or
                           os.getenv("TF_ENABLE_COND_V2", "0") != "0" or
                           os.getenv("TF_ENABLE_WHILE_V2", "0") != "0" or
                           os.getenv("TF_ENABLE_TENSOR_ARRAY_V2", "0") != "0")
 
 
+def EnableControlFlowV2(graph):
+  """Returns whether control flow v2 should be used in `graph`."""
+  # Enable new control flow in FuncGraphs (but not legacy _FuncGraphs).
+  # TODO(skyewm): do something better than hasattr without messing up imports.
+  return ENABLE_CONTROL_FLOW_V2 or (
+      graph.building_function and not hasattr(graph, "_captured"))
+
+
 def IsInXLAContext(op):
   try:
     xla_compile = op.get_attr("_XlaCompile")
@@ -51,6 +57,15 @@ def InXlaContext(graph):
   return GetContainingXLAContext(ctxt) is not None
 
 
+def GraphOrParentsInXlaContext(graph):
+  while True:
+    if InXlaContext(graph): return True
+    try:
+      graph = graph.outer_graph
+    except AttributeError:
+      return False
+
+
 def IsInWhileLoop(op):
   ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
   return GetContainingWhileContext(ctxt) is not None
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
index 5f56850884a5e9e424c77515406ef8c9b513e972..58917ad264a56578bb4c98ff9a3ef0b63a3cbf12 100644
--- a/tensorflow/python/ops/control_flow_util_v2.py
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -114,7 +114,7 @@ def maybe_set_lowering_attr(op):
   Args:
     op: An `If` or `While` Operation.
   """
-  if (not control_flow_util.IsInXLAContext(op) and
+  if (not control_flow_util.GraphOrParentsInXlaContext(op.graph) and
       context.context().get_function_call_options().executor_type
       != "SINGLE_THREADED_EXECUTOR"):
     # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index d96601ac21c7d7d62423b65a2e43d08449e23129..5fc3fdcc3e108d3cc90dd118d9a6274b4f8e4704 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -56,7 +56,9 @@ def copy_handle_data(source_t, target_t):
       handle_data = source_t._handle_data  # pylint: disable=protected-access
     else:
       handle_data = resource_variable_ops.get_resource_handle_data(source_t)
-    if handle_data is not None and handle_data.is_set:
+    if (handle_data is not None
+        and handle_data.is_set
+        and handle_data.shape_and_type):
       # pylint: disable=protected-access
       pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
                                               target_t._as_tf_output(),
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 9c63385dd0152aae48b1f92fd8d350fc910fe564..a347cfdec1585f87ba0bf5e2e6fa604367604c7b 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -462,7 +462,7 @@ class Bijector(object):
 
 
   ```python
-  abs = tf.contrib.distributions.bijectors.AbsoluteValue()
+  abs = tfp.distributions.bijectors.AbsoluteValue()
 
   abs.forward(-1.)
   ==> 1.
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 9acc0469885c2463e84f875314f07d1f3d55481a..0b36054db2f15538037c2f5f64a2b762c58e5105 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -291,5 +291,5 @@ def _kl_normal_normal(n_a, n_b, name=None):
     s_a_squared = math_ops.square(n_a.scale)
     s_b_squared = math_ops.square(n_b.scale)
     ratio = s_a_squared / s_b_squared
-    return (math_ops.square(n_a.loc - n_b.loc) / (two * s_b_squared) +
-            half * (ratio - one - math_ops.log(ratio)))
+    return (math_ops.squared_difference(n_a.loc, n_b.loc) / (two * s_b_squared)
+            + half * (ratio - one - math_ops.log(ratio)))
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1becfc18778e998d1a84594273e1637e580f2aad..3c6476864a0bb05feec828d69de8fb8bc138a74b 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -167,7 +167,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   distribution:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   log_normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Exp(),
@@ -177,7 +177,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   A `LogNormal` made from callables:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   log_normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Inline(
@@ -191,7 +191,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   Another example constructing a Normal from a StandardNormal:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Affine(
@@ -209,7 +209,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   multivariate Normal as a `TransformedDistribution`.
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   # We will create two MVNs with batch_shape = event_shape = 2.
   mean = [[-1., 0],      # batch:0
           [0., 1]]       # batch:1
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index df4be1d65a042f35eacfaae924af197600ece702..b240d1e465c21450fba5ead6ca957e7d4482ea1d 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -40,6 +39,7 @@ from tensorflow.python.ops.gen_functional_ops import remote_call
 from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -1024,17 +1024,6 @@ def For(start,
   return ret
 # pylint: enable=invalid-name,protected-access
 
-_rewriter_config_optimizer_disabled = None
-
-def _get_disabled_rewriter_config():
-  global _rewriter_config_optimizer_disabled
-  if _rewriter_config_optimizer_disabled is None:
-    config = config_pb2.ConfigProto()
-    rewriter_config = config.graph_options.rewrite_options
-    rewriter_config.disable_meta_optimizer = True
-    _rewriter_config_optimizer_disabled = config.SerializeToString()
-  return _rewriter_config_optimizer_disabled
-
 
 def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
                      executor_type=None):
@@ -1071,7 +1060,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
     executing_eagerly = context.executing_eagerly()
 
   if config is None:
-    config = _get_disabled_rewriter_config()
+    config = function_utils.get_disabled_rewriter_config()
 
   if executor_type is None:
     executor_type = ""
@@ -1103,7 +1092,8 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
   # When running in graph mode, the graph and function graphs are optimized
   # (i.e. run through grappler) per the session options, so we can disable any
   # eager-specific rewriting.
-  config_proto = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
+  config_proto = attr_value_pb2.AttrValue(
+      s=function_utils.get_disabled_rewriter_config())
 
   graph = ops.get_default_graph()
   f.add_to_graph(graph)
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
index 5d473eeb5f4f00087672da53c5fef3ab63bdbd08..41fcaaca6824611fb4212df1f444e72bffdf0ea4 100644
--- a/tensorflow/python/ops/gradient_checker_v2.py
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -66,20 +66,31 @@ def _eval_indexed_slices(a):
 
 
 def _to_numpy(a):
-  """Converts Tensors and EagerTensors to numpy arrays.
+  """Converts Tensors, EagerTensors, and IndexedSlicesValue to numpy arrays.
 
   Args:
     a: any value.
 
   Returns:
     If a is EagerTensor or Tensor, returns the evaluation of a by calling
-    numpy() or run(). Otherwise returns a unchanged.
+    numpy() or run(). If a is IndexedSlicesValue, constructs the corresponding
+    dense numpy array. Otherwise returns a unchanged.
   """
   if isinstance(a, ops.EagerTensor):
     return a.numpy()
   if isinstance(a, ops.Tensor):
     sess = ops.get_default_session()
     return sess.run(a)
+  if isinstance(a, ops.IndexedSlicesValue):
+    arr = np.zeros(a.dense_shape)
+    assert len(a.values) == len(a.indices), (
+        "IndexedSlicesValue has %s value slices but %s indices\n%s" %
+        (a.values, a.indices, a))
+    for values_slice, index in zip(a.values, a.indices):
+      assert 0 <= index < len(arr), (
+          "IndexedSlicesValue has invalid index %s\n%s" % (index, a))
+      arr[index] += values_slice
+    return arr
   return a
 
 
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 24d049b726fb93401d916d60c0d37fe85de30719..8047743cfa2ef719461e85b904d38beeb040979b 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1250,14 +1250,14 @@ def random_brightness(image, max_delta, seed=None):
   interval `[-max_delta, max_delta)`.
 
   Args:
-    image: An image.
+    image: An image or images to adjust.
     max_delta: float, must be non-negative.
     seed: A Python integer. Used to create a random seed. See
       `tf.set_random_seed`
       for behavior.
 
   Returns:
-    The brightness-adjusted image.
+    The brightness-adjusted image(s).
 
   Raises:
     ValueError: if `max_delta` is negative.
@@ -1271,7 +1271,7 @@ def random_brightness(image, max_delta, seed=None):
 
 @tf_export('image.random_contrast')
 def random_contrast(image, lower, upper, seed=None):
-  """Adjust the contrast of an image by a random factor.
+  """Adjust the contrast of an image or images by a random factor.
 
   Equivalent to `adjust_contrast()` but uses a `contrast_factor` randomly
   picked in the interval `[lower, upper]`.
@@ -1281,11 +1281,10 @@ def random_contrast(image, lower, upper, seed=None):
     lower: float.  Lower bound for the random contrast factor.
     upper: float.  Upper bound for the random contrast factor.
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.set_random_seed` for behavior.
 
   Returns:
-    The contrast-adjusted tensor.
+    The contrast-adjusted image(s).
 
   Raises:
     ValueError: if `upper <= lower` or if `lower < 0`.
@@ -1305,19 +1304,19 @@ def random_contrast(image, lower, upper, seed=None):
 def adjust_brightness(image, delta):
   """Adjust the brightness of RGB or Grayscale images.
 
-  This is a convenience method that converts an RGB image to float
-  representation, adjusts its brightness, and then converts it back to the
-  original data type. If several adjustments are chained it is advisable to
+  This is a convenience method that converts RGB images to float
+  representation, adjusts their brightness, and then converts them back to the
+  original data type. If several adjustments are chained, it is advisable to
   minimize the number of redundant conversions.
 
-  The value `delta` is added to all components of the tensor `image`. Both
-  `image` and `delta` are converted to `float` before adding (and `image` is
-  scaled appropriately if it is in fixed-point representation). For regular
+  The value `delta` is added to all components of the tensor `image`. `image` is
+  converted to `float` and scaled appropriately if it is in fixed-point
+  representation, and `delta` is converted to the same data type. For regular
   images, `delta` should be in the range `[0,1)`, as it is added to the image in
   floating point representation, where pixel values are in the `[0,1)` range.
 
   Args:
-    image: A tensor.
+    image: RGB image or images to adjust.
     delta: A scalar. Amount to add to the pixel values.
 
   Returns:
@@ -1327,10 +1326,14 @@ def adjust_brightness(image, delta):
     image = ops.convert_to_tensor(image, name='image')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = image.dtype
-    flt_image = convert_image_dtype(image, dtypes.float32)
+
+    if orig_dtype in [dtypes.float16, dtypes.float32]:
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
 
     adjusted = math_ops.add(
-        flt_image, math_ops.cast(delta, dtypes.float32), name=name)
+        flt_image, math_ops.cast(delta, flt_image.dtype), name=name)
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
 
@@ -1339,9 +1342,9 @@ def adjust_brightness(image, delta):
 def adjust_contrast(images, contrast_factor):
   """Adjust contrast of RGB or grayscale images.
 
-  This is a convenience method that converts an RGB image to float
-  representation, adjusts its contrast, and then converts it back to the
-  original data type. If several adjustments are chained it is advisable to
+  This is a convenience method that converts RGB images to float
+  representation, adjusts their contrast, and then converts them back to the
+  original data type. If several adjustments are chained, it is advisable to
   minimize the number of redundant conversions.
 
   `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
@@ -1366,7 +1369,11 @@ def adjust_contrast(images, contrast_factor):
     images = ops.convert_to_tensor(images, name='images')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = images.dtype
-    flt_images = convert_image_dtype(images, dtypes.float32)
+
+    if orig_dtype in (dtypes.float16, dtypes.float32):
+      flt_images = images
+    else:
+      flt_images = convert_image_dtype(images, dtypes.float32)
 
     adjusted = gen_image_ops.adjust_contrastv2(
         flt_images, contrast_factor=contrast_factor, name=name)
@@ -1560,7 +1567,7 @@ def grayscale_to_rgb(images, name=None):
 # pylint: disable=invalid-name
 @tf_export('image.random_hue')
 def random_hue(image, max_delta, seed=None):
-  """Adjust the hue of an RGB image by a random factor.
+  """Adjust the hue of RGB images by a random factor.
 
   Equivalent to `adjust_hue()` but uses a `delta` randomly
   picked in the interval `[-max_delta, max_delta]`.
@@ -1570,10 +1577,10 @@ def random_hue(image, max_delta, seed=None):
   Args:
     image: RGB image or images. Size of the last dimension must be 3.
     max_delta: float.  Maximum value for the random delta.
-    seed: An operation-specific seed. It will be used in conjunction
-      with the graph-level seed to determine the real seeds that will be
-      used in this operation. Please see the documentation of
-      set_random_seed for its interaction with the graph-level random seed.
+    seed: An operation-specific seed. It will be used in conjunction with the
+      graph-level seed to determine the real seeds that will be used in this
+      operation. Please see the documentation of set_random_seed for its
+      interaction with the graph-level random seed.
 
   Returns:
     Adjusted image(s), same shape and DType as `image`.
@@ -1593,7 +1600,7 @@ def random_hue(image, max_delta, seed=None):
 
 @tf_export('image.adjust_hue')
 def adjust_hue(image, delta, name=None):
-  """Adjust hue of an RGB image.
+  """Adjust hue of RGB images.
 
   This is a convenience method that converts an RGB image to float
   representation, converts it to HSV, add an offset to the hue channel, converts
@@ -1601,7 +1608,7 @@ def adjust_hue(image, delta, name=None):
   are chained it is advisable to minimize the number of redundant conversions.
 
   `image` is an RGB image.  The image hue is adjusted by converting the
-  image to HSV and rotating the hue channel (H) by
+  image(s) to HSV and rotating the hue channel (H) by
   `delta`.  The image is then converted back to RGB.
 
   `delta` must be in the interval `[-1, 1]`.
@@ -1618,7 +1625,10 @@ def adjust_hue(image, delta, name=None):
     image = ops.convert_to_tensor(image, name='image')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = image.dtype
-    flt_image = convert_image_dtype(image, dtypes.float32)
+    if orig_dtype in (dtypes.float16, dtypes.float32):
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
 
     rgb_altered = gen_image_ops.adjust_hue(flt_image, delta)
 
@@ -1696,7 +1706,7 @@ def adjust_jpeg_quality(image, jpeg_quality, name=None):
 
 @tf_export('image.random_saturation')
 def random_saturation(image, lower, upper, seed=None):
-  """Adjust the saturation of an RGB image by a random factor.
+  """Adjust the saturation of RGB images by a random factor.
 
   Equivalent to `adjust_saturation()` but uses a `saturation_factor` randomly
   picked in the interval `[lower, upper]`.
@@ -1705,10 +1715,10 @@ def random_saturation(image, lower, upper, seed=None):
     image: RGB image or images. Size of the last dimension must be 3.
     lower: float.  Lower bound for the random saturation factor.
     upper: float.  Upper bound for the random saturation factor.
-    seed: An operation-specific seed. It will be used in conjunction
-      with the graph-level seed to determine the real seeds that will be
-      used in this operation. Please see the documentation of
-      set_random_seed for its interaction with the graph-level random seed.
+    seed: An operation-specific seed. It will be used in conjunction with the
+      graph-level seed to determine the real seeds that will be used in this
+      operation. Please see the documentation of set_random_seed for its
+      interaction with the graph-level random seed.
 
   Returns:
     Adjusted image(s), same shape and DType as `image`.
@@ -1729,17 +1739,17 @@ def random_saturation(image, lower, upper, seed=None):
 
 @tf_export('image.adjust_saturation')
 def adjust_saturation(image, saturation_factor, name=None):
-  """Adjust saturation of an RGB image.
+  """Adjust saturation of RGB images.
 
-  This is a convenience method that converts an RGB image to float
-  representation, converts it to HSV, add an offset to the saturation channel,
+  This is a convenience method that converts RGB images to float
+  representation, converts them to HSV, add an offset to the saturation channel,
   converts back to RGB and then back to the original data type. If several
   adjustments are chained it is advisable to minimize the number of redundant
   conversions.
 
-  `image` is an RGB image.  The image saturation is adjusted by converting the
-  image to HSV and multiplying the saturation (S) channel by
-  `saturation_factor` and clipping. The image is then converted back to RGB.
+  `image` is an RGB image or images.  The image saturation is adjusted by
+  converting the images to HSV and multiplying the saturation (S) channel by
+  `saturation_factor` and clipping. The images are then converted back to RGB.
 
   Args:
     image: RGB image or images. Size of the last dimension must be 3.
@@ -1753,11 +1763,14 @@ def adjust_saturation(image, saturation_factor, name=None):
     image = ops.convert_to_tensor(image, name='image')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = image.dtype
-    flt_image = convert_image_dtype(image, dtypes.float32)
+    if orig_dtype in (dtypes.float16, dtypes.float32):
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
+
+    adjusted = gen_image_ops.adjust_saturation(flt_image, saturation_factor)
 
-    return convert_image_dtype(
-        gen_image_ops.adjust_saturation(flt_image, saturation_factor),
-        orig_dtype)
+    return convert_image_dtype(adjusted, orig_dtype)
 
 
 @tf_export('io.is_jpeg', 'image.is_jpeg', v1=['io.is_jpeg', 'image.is_jpeg'])
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index e7249333bd35d07821004a39c3c78e52c1ee904d..361befabce7725f44f44dcda1c6d2c487f704030 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -886,44 +886,6 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
       y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
-  def _adjust_saturation(self, image, saturation_factor):
-    image = ops.convert_to_tensor(image, name="image")
-    orig_dtype = image.dtype
-    flt_image = image_ops.convert_image_dtype(image, dtypes.float32)
-    saturation_adjusted_image = gen_image_ops.adjust_saturation(
-        flt_image, saturation_factor)
-    return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype)
-
-  def testHalfSaturationFused(self):
-    x_shape = [2, 2, 3]
-    x_rgb_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
-    x_np = np.array(x_rgb_data, dtype=np.uint8).reshape(x_shape)
-
-    saturation_factor = 0.5
-    y_rgb_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
-    y_np = np.array(y_rgb_data, dtype=np.uint8).reshape(x_shape)
-
-    with self.test_session(use_gpu=True):
-      x = constant_op.constant(x_np, shape=x_shape)
-      y = self._adjust_saturation(x, saturation_factor)
-      y_tf = self.evaluate(y)
-      self.assertAllEqual(y_tf, y_np)
-
-  def testTwiceSaturationFused(self):
-    x_shape = [2, 2, 3]
-    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
-    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
-
-    saturation_factor = 2.0
-    y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0]
-    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
-
-    with self.test_session(use_gpu=True):
-      x = constant_op.constant(x_np, shape=x_shape)
-      y = self._adjust_saturation(x, saturation_factor)
-      y_tf = self.evaluate(y)
-      self.assertAllEqual(y_tf, y_np)
-
   def _adjustSaturationNp(self, x_np, scale):
     self.assertEqual(x_np.shape[-1], 3)
     x_v = x_np.reshape([-1, 3])
@@ -977,7 +939,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
           else:
             raise AssertionError("Invalid test style: %s" % (test_style))
           y_baseline = self._adjustSaturationNp(x_np, scale)
-          y_fused = self._adjust_saturation(x_np, scale).eval()
+          y_fused = image_ops.adjust_saturation(x_np, scale).eval()
           self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5)
 
 
@@ -1438,12 +1400,12 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
 
 class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
-  def _testBrightness(self, x_np, y_np, delta):
+  def _testBrightness(self, x_np, y_np, delta, tol=1e-6):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_brightness(x, delta)
       y_tf = self.evaluate(y)
-      self.assertAllClose(y_tf, y_np, 1e-6)
+      self.assertAllClose(y_tf, y_np, tol)
 
   def testPositiveDeltaUint8(self):
     x_shape = [2, 2, 3]
@@ -1455,7 +1417,7 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
     self._testBrightness(x_np, y_np, delta=10. / 255.)
 
-  def testPositiveDeltaFloat(self):
+  def testPositiveDeltaFloat32(self):
     x_shape = [2, 2, 3]
     x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
     x_np = np.array(x_data, dtype=np.float32).reshape(x_shape) / 255.
@@ -1465,6 +1427,16 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
     self._testBrightness(x_np, y_np, delta=10. / 255.)
 
+  def testPositiveDeltaFloat16(self):
+    x_shape = [2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.float16).reshape(x_shape) / 255.
+
+    y_data = [10, 15, 23, 64, 145, 236, 47, 18, 244, 100, 265, 11]
+    y_np = np.array(y_data, dtype=np.float16).reshape(x_shape) / 255.
+
+    self._testBrightness(x_np, y_np, delta=10. / 255., tol=1e-3)
+
   def testNegativeDelta(self):
     x_shape = [2, 2, 3]
     x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 395d53b3286aaf82f035c4f031b521d35be1347b..a4cebc8d5891da23e9c1042b478dcabe9b7994a0 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -46,6 +46,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import  deprecated_arg_values
+from tensorflow.python.util.deprecation import  deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -95,10 +96,15 @@ class Initializer(object):
     return cls(**config)
 
 
-@tf_export("initializers.zeros", "zeros_initializer")
+@tf_export(v1=["initializers.zeros", "zeros_initializer"])
+@deprecation.deprecated_endpoints("initializers.zeros")
 class Zeros(Initializer):
   """Initializer that generates tensors initialized to 0."""
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, dtype=dtypes.float32):
     self.dtype = dtypes.as_dtype(dtype)
 
@@ -111,10 +117,15 @@ class Zeros(Initializer):
     return {"dtype": self.dtype.name}
 
 
-@tf_export("initializers.ones", "ones_initializer")
+@tf_export(v1=["initializers.ones", "ones_initializer"])
+@deprecation.deprecated_endpoints("initializers.ones", "ones_initializer")
 class Ones(Initializer):
   """Initializer that generates tensors initialized to 1."""
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, dtype=dtypes.float32):
     self.dtype = dtypes.as_dtype(dtype)
 
@@ -127,7 +138,8 @@ class Ones(Initializer):
     return {"dtype": self.dtype.name}
 
 
-@tf_export("initializers.constant", "constant_initializer")
+@tf_export(v1=["initializers.constant", "constant_initializer"])
+@deprecation.deprecated_endpoints("constant_initializer")
 class Constant(Initializer):
   """Initializer that generates tensors with constant values.
 
@@ -207,6 +219,14 @@ class Constant(Initializer):
   ```
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
+  @deprecated_args(None,
+                   "Objects must now be the required shape or no shape "
+                   "can be specified",
+                   "verify_shape")
   def __init__(self, value=0, dtype=dtypes.float32, verify_shape=False):
     if not (np.isscalar(value) or isinstance(value, (list, tuple, np.ndarray))):
       raise TypeError(
@@ -233,7 +253,8 @@ class Constant(Initializer):
     return {"value": self.value, "dtype": self.dtype.name}
 
 
-@tf_export("initializers.random_uniform", "random_uniform_initializer")
+@tf_export(v1=["initializers.random_uniform", "random_uniform_initializer"])
+@deprecation.deprecated_endpoints("initializers.random_uniform")
 class RandomUniform(Initializer):
   """Initializer that generates tensors with a uniform distribution.
 
@@ -249,6 +270,10 @@ class RandomUniform(Initializer):
       calling the initializer.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32):
     self.minval = minval
     self.maxval = maxval
@@ -270,7 +295,8 @@ class RandomUniform(Initializer):
     }
 
 
-@tf_export("initializers.random_normal", "random_normal_initializer")
+@tf_export(v1=["initializers.random_normal", "random_normal_initializer"])
+@deprecation.deprecated_endpoints("initializers.random_normal")
 class RandomNormal(Initializer):
   """Initializer that generates tensors with a normal distribution.
 
@@ -286,6 +312,10 @@ class RandomNormal(Initializer):
       calling the initializer. Only floating point types are supported.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
     self.mean = mean
     self.stddev = stddev
@@ -307,7 +337,9 @@ class RandomNormal(Initializer):
     }
 
 
-@tf_export("initializers.truncated_normal", "truncated_normal_initializer")
+@tf_export(v1=["initializers.truncated_normal", "truncated_normal_initializer"])
+@deprecation.deprecated_endpoints("initializers.truncated_normal",
+                                  "truncated_normal_initializer")
 class TruncatedNormal(Initializer):
   """Initializer that generates a truncated normal distribution.
 
@@ -328,6 +360,10 @@ class TruncatedNormal(Initializer):
       calling the initializer. Only floating point types are supported.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
     self.mean = mean
     self.stddev = stddev
@@ -349,12 +385,10 @@ class TruncatedNormal(Initializer):
     }
 
 
-@tf_export(
-    "initializers.uniform_unit_scaling",
-    v1=[
-        "initializers.uniform_unit_scaling", "uniform_unit_scaling_initializer"
-    ])
-@deprecation.deprecated_endpoints("uniform_unit_scaling_initializer")
+@tf_export(v1=["initializers.uniform_unit_scaling",
+               "uniform_unit_scaling_initializer"])
+@deprecation.deprecated_endpoints("uniform_unit_scaling_initializer",
+                                  "initializers.uniform_unit_scaling")
 class UniformUnitScaling(Initializer):
   """Initializer that generates tensors without scaling variance.
 
@@ -386,6 +420,10 @@ class UniformUnitScaling(Initializer):
       ([pdf](http://arxiv.org/pdf/1412.6558.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   @deprecated(None,
               "Use tf.initializers.variance_scaling instead with distribution="
               "uniform to get equivalent behavior.")
@@ -417,13 +455,9 @@ class UniformUnitScaling(Initializer):
     return {"factor": self.factor, "seed": self.seed, "dtype": self.dtype.name}
 
 
-@tf_export(
-    "initializers.variance_scaling",
-    v1=[
-        "initializers.variance_scaling",
-        "variance_scaling_initializer"
-    ])
-@deprecation.deprecated_endpoints("variance_scaling_initializer")
+@tf_export(v1=["initializers.variance_scaling", "variance_scaling_initializer"])
+@deprecation.deprecated_endpoints("initializers.variance_scaling",
+                                  "variance_scaling_initializer")
 class VarianceScaling(Initializer):
   """Initializer capable of adapting its scale to the shape of weights tensors.
 
@@ -454,6 +488,10 @@ class VarianceScaling(Initializer):
       "distribution" arguments.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   @deprecated_arg_values(
       None,
       "`normal` is a deprecated alias for `truncated_normal`",
@@ -516,13 +554,9 @@ class VarianceScaling(Initializer):
     }
 
 
-@tf_export(
-    "initializers.orthogonal",
-    v1=[
-        "initializers.orthogonal",
-        "orthogonal_initializer",
-    ])
-@deprecation.deprecated_endpoints("orthogonal_initializer")
+@tf_export(v1=["initializers.orthogonal", "orthogonal_initializer"])
+@deprecation.deprecated_endpoints("initializers.orthogonal",
+                                  "orthogonal_initializer")
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
 
@@ -550,6 +584,10 @@ class Orthogonal(Initializer):
       ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
     self.gain = gain
     self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
@@ -568,8 +606,10 @@ class Orthogonal(Initializer):
     for dim in shape[:-1]:
       num_rows *= dim
     num_cols = shape[-1]
-    flat_shape = (num_cols, num_rows) if num_rows < num_cols else (num_rows,
-                                                                   num_cols)
+    if num_rows < num_cols:
+      flat_shape = (num_cols, num_rows)
+    else:
+      flat_shape = (num_rows, num_cols)
 
     # Generate a random matrix
     a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
@@ -586,6 +626,8 @@ class Orthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
+# Note these haven't been ported to TF2.0. They are not currently visible and
+# the tests are non trivial to port
 class ConvolutionDeltaOrthogonal(Initializer):
   """Initializer that generates a delta orthogonal kernel for ConvNets.
 
@@ -1137,7 +1179,8 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
     return self._dict_to_tensor(p, ksize, ksize, ksize)
 
 
-@tf_export("initializers.identity")
+@tf_export(v1=["initializers.identity"])
+@deprecation.deprecated_endpoints("initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
 
@@ -1149,6 +1192,10 @@ class Identity(Initializer):
       calling the initializer. Only floating point types are supported.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, gain=1.0, dtype=dtypes.float32):
     self.gain = gain
     self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
@@ -1170,7 +1217,9 @@ class Identity(Initializer):
     return {"gain": self.gain, "dtype": self.dtype.name}
 
 
-@tf_export("glorot_uniform_initializer", "initializers.glorot_uniform")
+@tf_export(v1=["glorot_uniform_initializer", "initializers.glorot_uniform"])
+@deprecation.deprecated_endpoints("glorot_uniform_initializer",
+                                  "initializers.glorot_uniform")
 class GlorotUniform(VarianceScaling):
   """The Glorot uniform initializer, also called Xavier uniform initializer.
 
@@ -1191,6 +1240,10 @@ class GlorotUniform(VarianceScaling):
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotUniform, self).__init__(
         scale=1.0,
@@ -1203,13 +1256,9 @@ class GlorotUniform(VarianceScaling):
     return {"seed": self.seed, "dtype": self.dtype.name}
 
 
-@tf_export(
-    "initializers.glorot_normal",
-    v1=[
-        "glorot_normal_initializer",
-        "initializers.glorot_normal"
-    ])
-@deprecation.deprecated_endpoints("glorot_normal_initializer")
+@tf_export(v1=["glorot_normal_initializer", "initializers.glorot_normal"])
+@deprecation.deprecated_endpoints("glorot_normal_initializer",
+                                  "initializers.glorot_normal")
 class GlorotNormal(VarianceScaling):
   """The Glorot normal initializer, also called Xavier normal initializer.
 
@@ -1229,6 +1278,10 @@ class GlorotNormal(VarianceScaling):
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotNormal, self).__init__(
         scale=1.0,
@@ -1263,7 +1316,7 @@ convolutional_orthogonal_3d = ConvolutionOrthogonal3D
 # pylint: enable=invalid-name
 
 
-@tf_export("initializers.lecun_normal")
+@tf_export(v1=["initializers.lecun_normal"])
 def lecun_normal(seed=None):
   """LeCun normal initializer.
 
@@ -1279,7 +1332,7 @@ def lecun_normal(seed=None):
 
   References:
       - Self-Normalizing Neural Networks,
-      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)  # pylint: disable=line-too-long
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
       [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
@@ -1288,7 +1341,7 @@ def lecun_normal(seed=None):
       scale=1., mode="fan_in", distribution="truncated_normal", seed=seed)
 
 
-@tf_export("initializers.lecun_uniform")
+@tf_export(v1=["initializers.lecun_uniform"])
 def lecun_uniform(seed=None):
   """LeCun uniform initializer.
 
@@ -1304,7 +1357,7 @@ def lecun_uniform(seed=None):
 
   References:
       - Self-Normalizing Neural Networks,
-      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)  # pylint: disable=line-too-long
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
       [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
@@ -1313,7 +1366,7 @@ def lecun_uniform(seed=None):
       scale=1., mode="fan_in", distribution="uniform", seed=seed)
 
 
-@tf_export("initializers.he_normal")
+@tf_export(v1=["initializers.he_normal"])
 def he_normal(seed=None):
   """He normal initializer.
 
@@ -1328,14 +1381,15 @@ def he_normal(seed=None):
       An initializer.
 
   References:
-      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      [He et al., 2015]
+      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)  # pylint: disable=line-too-long
       ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
       scale=2., mode="fan_in", distribution="truncated_normal", seed=seed)
 
 
-@tf_export("initializers.he_uniform")
+@tf_export(v1=["initializers.he_uniform"])
 def he_uniform(seed=None):
   """He uniform variance scaling initializer.
 
@@ -1350,7 +1404,8 @@ def he_uniform(seed=None):
       An initializer.
 
   References:
-      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      [He et al., 2015]
+      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)  # pylint: disable=line-too-long
       ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index 1f22248004697438d2c8c05dc0c6762a20902d31..b3cdec9dd407c26277ed2d710397a0a831d75e16 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -24,13 +24,14 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class InitializersTest(test.TestCase):
 
   def _runner(self,
@@ -40,13 +41,8 @@ class InitializersTest(test.TestCase):
               target_std=None,
               target_max=None,
               target_min=None):
-    variable = resource_variable_ops.ResourceVariable(init(shape))
-    if context.executing_eagerly():
-      output = variable.numpy()
-    else:
-      sess = ops.get_default_session()
-      self.evaluate(variable.initializer)
-      output = self.evaluate(variable)
+    output = self.evaluate(init(shape))
+    self.assertEqual(output.shape, shape)
     lim = 3e-2
     if target_std is not None:
       self.assertGreater(lim, abs(output.std() - target_std))
diff --git a/tensorflow/python/ops/init_ops_v2.py b/tensorflow/python/ops/init_ops_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e614e0012a279a2c4257a850579bc63577207b7
--- /dev/null
+++ b/tensorflow/python/ops/init_ops_v2.py
@@ -0,0 +1,764 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations often used for initializing tensors.
+
+All variable initializers returned by functions in this file should have the
+following signature:
+
+def _initializer(shape, dtype=dtypes.float32):
+  Args:
+    shape: List of `int` representing the shape of the output `Tensor`. Some
+      initializers may also be able to accept a `Tensor`.
+    dtype: (Optional) Type of the output `Tensor`.
+  Returns:
+    A `Tensor` of type `dtype` and `shape`.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class Initializer(object):
+  """Initializer base class: all initializers inherit from this class.
+  """
+
+  def __call__(self, shape, dtype=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided will return tensor
+       of `tf.float32`.
+    """
+    raise NotImplementedError
+
+  def get_config(self):
+    """Returns the configuration of the initializer as a JSON-serializable dict.
+
+    Returns:
+      A JSON-serializable Python dict.
+    """
+    return {}
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates an initializer from a configuration dictionary.
+
+    Example:
+
+    ```python
+    initializer = RandomUniform(-1, 1)
+    config = initializer.get_config()
+    initializer = RandomUniform.from_config(config)
+    ```
+
+    Args:
+      config: A Python dictionary.
+        It will typically be the output of `get_config`.
+
+    Returns:
+      An Initializer instance.
+    """
+    config.pop("dtype", None)
+    return cls(**config)
+
+
+@tf_export("zeros_initializer", v1=[])
+class Zeros(Initializer):
+  """Initializer that generates tensors initialized to 0."""
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    dtype = dtypes.as_dtype(dtype)
+    return array_ops.zeros(shape, dtype)
+
+
+@tf_export("ones_initializer", v1=[])
+class Ones(Initializer):
+  """Initializer that generates tensors initialized to 1."""
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only numeric or boolean dtypes are
+       supported.
+
+    Raises:
+      ValuesError: If the dtype is not numeric or boolean.
+    """
+    dtype = dtypes.as_dtype(dtype)
+    if not dtype.is_numpy_compatible or dtype == dtypes.string:
+      raise ValueError("Expected numeric or boolean dtype, got %s." % dtype)
+    return array_ops.ones(shape, dtype)
+
+
+@tf_export("constant_initializer", v1=[])
+class Constant(Initializer):
+  """Initializer that generates tensors with constant values.
+
+  The resulting tensor is populated with values of type `dtype`, as
+  specified by arguments `value` following the desired `shape` of the
+  new tensor (see examples below).
+
+  The argument `value` can be a constant value, or a list of values of type
+  `dtype`. If `value` is a list, then the length of the list must be less
+  than or equal to the number of elements implied by the desired shape of the
+  tensor. In the case where the total number of elements in `value` is less
+  than the number of elements required by the tensor shape, the last element
+  in `value` will be used to fill the remaining entries. If the total number of
+  elements in `value` is greater than the number of elements required by the
+  tensor shape, the initializer will raise a `ValueError`.
+
+  Args:
+    value: A Python scalar, list or tuple of values, or a N-dimensional numpy
+      array. All elements of the initialized variable will be set to the
+      corresponding value in the `value` argument.
+
+  Raises:
+    TypeError: If the input `value` is not one of the expected types.
+
+  Examples:
+    The following example can be rewritten using a numpy.ndarray instead
+    of the `value` list, even reshaped, as shown in the two commented lines
+    below the `value` list initialization.
+
+  ```python
+    >>> import numpy as np
+    >>> import tensorflow as tf
+
+    >>> value = [0, 1, 2, 3, 4, 5, 6, 7]
+    >>> # value = np.array(value)
+    >>> # value = value.reshape([2, 4])
+    >>> init = tf.constant_initializer(value)
+
+    >>> print('fitting shape:')
+    >>> with tf.Session():
+    >>>   x = tf.get_variable('x', shape=[2, 4], initializer=init)
+    >>>   x.initializer.run()
+    >>>   print(x.eval())
+
+    fitting shape:
+    [[ 0.  1.  2.  3.]
+     [ 4.  5.  6.  7.]]
+
+    >>> print('larger shape:')
+    >>> with tf.Session():
+    >>>   x = tf.get_variable('x', shape=[3, 4], initializer=init)
+    >>>   x.initializer.run()
+    >>>   print(x.eval())
+
+    larger shape:
+    [[ 0.  1.  2.  3.]
+     [ 4.  5.  6.  7.]
+     [ 7.  7.  7.  7.]]
+
+    >>> print('smaller shape:')
+    >>> with tf.Session():
+    >>>   x = tf.get_variable('x', shape=[2, 3], initializer=init)
+
+    ValueError: Too many elements provided. Needed at most 6, but received 8
+  ```
+  """
+
+  def __init__(self, value=0):
+    if not (np.isscalar(value) or isinstance(value, (list, tuple, np.ndarray))):
+      raise TypeError(
+          "Invalid type for initial value: %s (expected Python scalar, list or "
+          "tuple of values, or numpy.ndarray)." % type(value))
+    self.value = value
+
+  def __call__(self, shape, dtype=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided the dtype of the
+       tensor created will be the type of the inital value.
+
+    Raises:
+      TypeError: If the initializer cannot create a tensor of the requested
+       dtype.
+    """
+    if dtype is not None:
+      dtype = dtypes.as_dtype(dtype)
+    return constant_op.constant(
+        self.value, dtype=dtype, shape=shape)
+
+  def get_config(self):
+    return {"value": self.value}
+
+
+@tf_export("random_uniform_initializer", v1=[])
+class RandomUniform(Initializer):
+  """Initializer that generates tensors with a uniform distribution.
+
+  Args:
+    minval: A python scalar or a scalar tensor. Lower bound of the range
+      of random values to generate.
+    maxval: A python scalar or a scalar tensor. Upper bound of the range
+      of random values to generate.  Defaults to 1 for float types.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+  """
+
+  def __init__(self, minval=-0.05, maxval=0.05, seed=None):
+    self.minval = minval
+    self.maxval = maxval
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point and integer
+      types are supported.
+
+    Raises:
+      ValueError: If the dtype is not numeric.
+    """
+    dtype = dtypes.as_dtype(dtype)
+    if not dtype.is_floating and not dtype.is_integer:
+      raise ValueError("Expected float or integer dtype, got %s." % dtype)
+    return random_ops.random_uniform(
+        shape, self.minval, self.maxval, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "minval": self.minval,
+        "maxval": self.maxval,
+        "seed": self.seed
+    }
+
+
+@tf_export("random_normal_initializer", v1=[])
+class RandomNormal(Initializer):
+  """Initializer that generates tensors with a normal distribution.
+
+  Args:
+    mean: a python scalar or a scalar tensor. Mean of the random values
+      to generate.
+    stddev: a python scalar or a scalar tensor. Standard deviation of the
+      random values to generate.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+  """
+
+  def __init__(self, mean=0.0, stddev=0.05, seed=None):
+    self.mean = mean
+    self.stddev = stddev
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    dtype = _assert_float_dtype(dtype)
+    return random_ops.random_normal(
+        shape, self.mean, self.stddev, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "mean": self.mean,
+        "stddev": self.stddev,
+        "seed": self.seed
+    }
+
+
+class TruncatedNormal(Initializer):
+  """Initializer that generates a truncated normal distribution.
+
+  These values are similar to values from a `random_normal_initializer`
+  except that values more than two standard deviations from the mean
+  are discarded and re-drawn. This is the recommended initializer for
+  neural network weights and filters.
+
+  Args:
+    mean: a python scalar or a scalar tensor. Mean of the random values
+      to generate.
+    stddev: a python scalar or a scalar tensor. Standard deviation of the
+      random values to generate.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+  """
+
+  def __init__(self, mean=0.0, stddev=0.05, seed=None):
+    self.mean = mean
+    self.stddev = stddev
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    dtype = _assert_float_dtype(dtype)
+    return random_ops.truncated_normal(
+        shape, self.mean, self.stddev, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "mean": self.mean,
+        "stddev": self.stddev,
+        "seed": self.seed
+    }
+
+
+class VarianceScaling(Initializer):
+  """Initializer capable of adapting its scale to the shape of weights tensors.
+
+  With `distribution="truncated_normal" or "untruncated_normal"`,
+  samples are drawn from a truncated/untruncated normal
+  distribution with a mean of zero and a standard deviation (after truncation,
+  if used) `stddev = sqrt(scale / n)`
+  where n is:
+    - number of input units in the weight tensor, if mode = "fan_in"
+    - number of output units, if mode = "fan_out"
+    - average of the numbers of input and output units, if mode = "fan_avg"
+
+  With `distribution="uniform"`, samples are drawn from a uniform distribution
+  within [-limit, limit], with `limit = sqrt(3 * scale / n)`.
+
+  Args:
+    scale: Scaling factor (positive float).
+    mode: One of "fan_in", "fan_out", "fan_avg".
+    distribution: Random distribution to use. One of "truncated_normal",
+      "untruncated_normal" and  "uniform".
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+
+  Raises:
+    ValueError: In case of an invalid value for the "scale", mode" or
+      "distribution" arguments.
+  """
+
+  def __init__(self,
+               scale=1.0,
+               mode="fan_in",
+               distribution="truncated_normal",
+               seed=None):
+    if scale <= 0.:
+      raise ValueError("`scale` must be positive float.")
+    if mode not in {"fan_in", "fan_out", "fan_avg"}:
+      raise ValueError("Invalid `mode` argument:", mode)
+    distribution = distribution.lower()
+    if distribution not in {"uniform", "truncated_normal",
+                            "untruncated_normal"}:
+      raise ValueError("Invalid `distribution` argument:", distribution)
+    self.scale = scale
+    self.mode = mode
+    self.distribution = distribution
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    partition_info = None  # Keeps logic so can be readded later if necessary
+    dtype = _assert_float_dtype(dtype)
+    scale = self.scale
+    scale_shape = shape
+    if partition_info is not None:
+      scale_shape = partition_info.full_shape
+    fan_in, fan_out = _compute_fans(scale_shape)
+    if self.mode == "fan_in":
+      scale /= max(1., fan_in)
+    elif self.mode == "fan_out":
+      scale /= max(1., fan_out)
+    else:
+      scale /= max(1., (fan_in + fan_out) / 2.)
+    if self.distribution == "truncated_normal":
+      # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+      stddev = math.sqrt(scale) / .87962566103423978
+      return random_ops.truncated_normal(
+          shape, 0.0, stddev, dtype, seed=self.seed)
+    elif self.distribution == "untruncated_normal":
+      stddev = math.sqrt(scale)
+      return random_ops.random_normal(
+          shape, 0.0, stddev, dtype, seed=self.seed)
+    else:
+      limit = math.sqrt(3.0 * scale)
+      return random_ops.random_uniform(
+          shape, -limit, limit, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "scale": self.scale,
+        "mode": self.mode,
+        "distribution": self.distribution,
+        "seed": self.seed
+    }
+
+
+class Orthogonal(Initializer):
+  """Initializer that generates an orthogonal matrix.
+
+  If the shape of the tensor to initialize is two-dimensional, it is initialized
+  with an orthogonal matrix obtained from the QR decomposition of a matrix of
+  random numbers drawn from a normal distribution.
+  If the matrix has fewer rows than columns then the output will have orthogonal
+  rows. Otherwise, the output will have orthogonal columns.
+
+  If the shape of the tensor to initialize is more than two-dimensional,
+  a matrix of shape `(shape[0] * ... * shape[n - 2], shape[n - 1])`
+  is initialized, where `n` is the length of the shape vector.
+  The matrix is subsequently reshaped to give a tensor of the desired shape.
+
+  Args:
+    gain: multiplicative factor to apply to the orthogonal matrix
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+    for behavior.
+
+  References:
+      [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
+      ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
+  """
+
+  def __init__(self, gain=1.0, seed=None):
+    self.gain = gain
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point or the input shape is not
+       valid.
+    """
+    dtype = _assert_float_dtype(dtype)
+    # Check the shape
+    if len(shape) < 2:
+      raise ValueError("The tensor to initialize must be "
+                       "at least two-dimensional")
+    # Flatten the input shape with the last dimension remaining
+    # its original shape so it works for conv2d
+    num_rows = 1
+    for dim in shape[:-1]:
+      num_rows *= dim
+    num_cols = shape[-1]
+    flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows))
+
+    # Generate a random matrix
+    a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
+    # Compute the qr factorization
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
+    # Make Q uniform
+    d = array_ops.diag_part(r)
+    q *= math_ops.sign(d)
+    if num_rows < num_cols:
+      q = array_ops.matrix_transpose(q)
+    return self.gain * array_ops.reshape(q, shape)
+
+  def get_config(self):
+    return {"gain": self.gain, "seed": self.seed}
+
+
+class Identity(Initializer):
+  """Initializer that generates the identity matrix.
+
+  Only use for 2D matrices.
+
+  Args:
+    gain: Multiplicative factor to apply to the identity matrix.
+  """
+
+  def __init__(self, gain=1.0):
+    self.gain = gain
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    partition_info = None  # Keeps logic so can be readded later if necessary
+    dtype = _assert_float_dtype(dtype)
+    full_shape = shape if partition_info is None else partition_info.full_shape
+    if len(full_shape) != 2:
+      raise ValueError(
+          "Identity matrix initializer can only be used for 2D matrices.")
+    initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
+    if partition_info is not None:
+      initializer = array_ops.slice(initializer, partition_info.var_offset,
+                                    shape)
+    return self.gain * initializer
+
+  def get_config(self):
+    return {"gain": self.gain}
+
+
+class GlorotUniform(VarianceScaling):
+  """The Glorot uniform initializer, also called Xavier uniform initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(6 / (fan_in + fan_out))`
+  where `fan_in` is the number of input units in the weight tensor
+  and `fan_out` is the number of output units in the weight tensor.
+
+  Args:
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
+  """
+
+  def __init__(self, seed=None):
+    super(GlorotUniform, self).__init__(
+        scale=1.0,
+        mode="fan_avg",
+        distribution="uniform",
+        seed=seed)
+
+  def get_config(self):
+    return {"seed": self.seed}
+
+
+class GlorotNormal(VarianceScaling):
+  """The Glorot normal initializer, also called Xavier normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(2 / (fan_in + fan_out))`
+  where `fan_in` is the number of input units in the weight tensor
+  and `fan_out` is the number of output units in the weight tensor.
+
+  Args:
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed` for behavior.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
+  """
+
+  def __init__(self, seed=None):
+    super(GlorotNormal, self).__init__(
+        scale=1.0,
+        mode="fan_avg",
+        distribution="truncated_normal",
+        seed=seed)
+
+  def get_config(self):
+    return {"seed": self.seed, "dtype": self.dtype.name}
+
+
+# Aliases.
+
+# pylint: disable=invalid-name
+zeros_initializer = Zeros
+ones_initializer = Ones
+constant_initializer = Constant
+random_uniform_initializer = RandomUniform
+random_normal_initializer = RandomNormal
+truncated_normal_initializer = TruncatedNormal
+variance_scaling_initializer = VarianceScaling
+glorot_uniform_initializer = GlorotUniform
+glorot_normal_initializer = GlorotNormal
+orthogonal_initializer = Orthogonal
+identity_initializer = Identity
+# pylint: enable=invalid-name
+
+
+def lecun_normal(seed=None):
+  """LeCun normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(1 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017]
+      (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      ([pdf]
+      (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+  """
+  return VarianceScaling(
+      scale=1., mode="fan_in", distribution="truncated_normal", seed=seed)
+
+
+def lecun_uniform(seed=None):
+  """LeCun uniform initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(3 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) # pylint: disable=line-too-long
+      ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+  """
+  return VarianceScaling(
+      scale=1., mode="fan_in", distribution="uniform", seed=seed)
+
+
+def he_normal(seed=None):
+  """He normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(2 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) # pylint: disable=line-too-long
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
+  """
+  return VarianceScaling(
+      scale=2., mode="fan_in", distribution="truncated_normal", seed=seed)
+
+
+def he_uniform(seed=None):
+  """He uniform variance scaling initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(6 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) # pylint: disable=line-too-long
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
+  """
+  return VarianceScaling(
+      scale=2., mode="fan_in", distribution="uniform", seed=seed)
+
+
+# Utility functions.
+
+
+def _compute_fans(shape):
+  """Computes the number of input and output units for a weight shape.
+
+  Args:
+    shape: Integer shape tuple or TF tensor shape.
+
+  Returns:
+    A tuple of scalars (fan_in, fan_out).
+  """
+  if len(shape) < 1:  # Just to avoid errors for constants.
+    fan_in = fan_out = 1
+  elif len(shape) == 1:
+    fan_in = fan_out = shape[0]
+  elif len(shape) == 2:
+    fan_in = shape[0]
+    fan_out = shape[1]
+  else:
+    # Assuming convolution kernels (2D, 3D, or more).
+    # kernel shape: (..., input_depth, depth)
+    receptive_field_size = 1.
+    for dim in shape[:-2]:
+      receptive_field_size *= dim
+    fan_in = shape[-2] * receptive_field_size
+    fan_out = shape[-1] * receptive_field_size
+  return fan_in, fan_out
+
+
+def _assert_float_dtype(dtype):
+  """Validate and return floating point type based on `dtype`.
+
+  `dtype` must be a floating point type.
+
+  Args:
+    dtype: The data type to validate.
+
+  Returns:
+    Validated type.
+
+  Raises:
+    ValueError: if `dtype` is not a floating point type.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  if not dtype.is_floating:
+    raise ValueError("Expected floating point type, got %s." % dtype)
+  return dtype
diff --git a/tensorflow/python/ops/init_ops_v2_test.py b/tensorflow/python/ops/init_ops_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fceba1d04a25867a835e398889748bb3c2d3de3b
--- /dev/null
+++ b/tensorflow/python/ops/init_ops_v2_test.py
@@ -0,0 +1,512 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for initializers in init_ops_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops_v2
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class InitializersTest(test.TestCase):
+
+  def _identical_test(self,
+                      init1,
+                      init2,
+                      assertion,
+                      shape=None,
+                      dtype=dtypes.float32):
+    if shape is None:
+      shape = [100]
+    t1 = self.evaluate(init1(shape, dtype))
+    t2 = self.evaluate(init2(shape, dtype))
+    self.assertEqual(tensor_shape.as_shape(shape), t1.shape)
+    self.assertEqual(tensor_shape.as_shape(shape), t2.shape)
+    self.assertEqual(assertion, np.allclose(t1, t2, rtol=1e-15, atol=1e-15))
+
+  def _duplicated_test(self,
+                       init,
+                       shape=None,
+                       dtype=dtypes.float32):
+    if shape is None:
+      shape = [100]
+    t1 = self.evaluate(init(shape, dtype))
+    t2 = self.evaluate(init(shape, dtype))
+    self.assertEqual(tensor_shape.as_shape(shape), t1.shape)
+    self.assertEqual(tensor_shape.as_shape(shape), t2.shape)
+    self.assertFalse(np.allclose(t1, t2, rtol=1e-15, atol=1e-15))
+
+  def _range_test(self,
+                  init,
+                  shape,
+                  target_mean=None,
+                  target_std=None,
+                  target_max=None,
+                  target_min=None):
+    output = self.evaluate(init(shape))
+    self.assertEqual(output.shape, shape)
+    lim = 3e-2
+    if target_std is not None:
+      self.assertGreater(lim, abs(output.std() - target_std))
+    if target_mean is not None:
+      self.assertGreater(lim, abs(output.mean() - target_mean))
+    if target_max is not None:
+      self.assertGreater(lim, abs(output.max() - target_max))
+    if target_min is not None:
+      self.assertGreater(lim, abs(output.min() - target_min))
+
+
+class ConstantInitializersTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testZeros(self):
+    self._range_test(init_ops_v2.Zeros(), shape=(4, 5),
+                     target_mean=0., target_max=0.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOnes(self):
+    self._range_test(init_ops_v2.Ones(), shape=(4, 5),
+                     target_mean=1., target_max=1.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantInt(self):
+    self._range_test(
+        init_ops_v2.Constant(2),
+        shape=(5, 6, 4),
+        target_mean=2,
+        target_max=2,
+        target_min=2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantTuple(self):
+    init = init_ops_v2.constant_initializer((10, 20, 30))
+    tensor = init(shape=[3])
+    self.assertAllEqual(self.evaluate(tensor), [10, 20, 30])
+    self.assertEqual(tensor.shape, [3])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantInvalidValue(self):
+    c = constant_op.constant([1.0, 2.0, 3.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Invalid type for initial value: .*Tensor.*"):
+      init_ops_v2.constant_initializer(c)
+    v = variables.Variable([3.0, 2.0, 1.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Invalid type for initial value: .*Variable.*"):
+      init_ops_v2.constant_initializer(v)
+
+  def _testNDimConstantInitializer(self, value, shape, expected):
+    with test_util.use_gpu():
+      init = init_ops_v2.constant_initializer(value)
+      x = init(shape)
+
+      actual = self.evaluate(array_ops.reshape(x, [-1]))
+      self.assertEqual(len(actual), len(expected))
+      for a, e in zip(actual, expected):
+        self.assertEqual(a, e)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNDimConstantInitializer(self):
+    value = [0, 1, 2, 3, 4, 5]
+    shape = [2, 3]
+    expected = list(value)
+
+    self._testNDimConstantInitializer(value, shape, expected)
+    self._testNDimConstantInitializer(np.asarray(value), shape, expected)
+    self._testNDimConstantInitializer(np.asarray(value).reshape(tuple(shape)),
+                                      shape, expected)
+
+  def _testNDimConstantInitializerIncorrectNumberValues(self, value, shape):
+    with test_util.use_gpu():
+      init = init_ops_v2.constant_initializer(value)
+      self.assertRaises(TypeError,
+                        init,
+                        shape=shape)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNDimConstantInitializerIncorrectNumberValues(self):
+    value = [0, 1, 2, 3, 4, 5]
+
+    for shape in [[2, 4], [2, 2]]:
+      self._testNDimConstantInitializerIncorrectNumberValues(value, shape)
+      self._testNDimConstantInitializerIncorrectNumberValues(np.asarray(value),
+                                                             shape)
+      self._testNDimConstantInitializerIncorrectNumberValues(
+          np.asarray(value).reshape(tuple([2, 3])), shape)
+
+
+class RandomUniformInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    shape = (9, 6, 7)
+    self._range_test(
+        init_ops_v2.RandomUniform(minval=-1, maxval=1, seed=124),
+        shape,
+        target_mean=0.,
+        target_max=1,
+        target_min=-1)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.RandomUniform(0, 7, seed=1)
+    init2 = init_ops_v2.RandomUniform(0, 7, seed=1)
+    self._identical_test(init1, init2, True)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.RandomUniform(0, 7, seed=1)
+    init2 = init_ops_v2.RandomUniform(0, 7, seed=2)
+    self._identical_test(init1, init2, False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.RandomUniform(0.0, 1.0)
+    self._duplicated_test(init)
+
+
+class RandomNormalInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    self._range_test(
+        init_ops_v2.RandomNormal(mean=0, stddev=1, seed=153),
+        shape=(8, 12, 99),
+        target_mean=0.,
+        target_std=1)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.RandomNormal(0, 7, seed=1)
+    init2 = init_ops_v2.RandomNormal(0, 7, seed=1)
+    self._identical_test(init1, init2, True)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.RandomNormal(0, 7, seed=1)
+    init2 = init_ops_v2.RandomNormal(0, 7, seed=2)
+    self._identical_test(init1, init2, False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.RandomNormal(0.0, 1.0)
+    self._duplicated_test(init)
+
+
+class TruncatedNormalInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    self._range_test(
+        init_ops_v2.TruncatedNormal(mean=0, stddev=1, seed=126),
+        shape=(12, 99, 7),
+        target_mean=0.,
+        target_max=2,
+        target_min=-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Not seeming to work in Eager mode")
+    init1 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=1)
+    init2 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=1)
+    self._identical_test(init1, init2, True)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=1)
+    init2 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=2)
+    self._identical_test(init1, init2, False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.TruncatedNormal(0.0, 1.0)
+    self._duplicated_test(init)
+
+  def testInvalidDataType(self):
+    init = init_ops_v2.TruncatedNormal(0.0, 1.0)
+    with self.assertRaises(ValueError):
+      init([1], dtype=dtypes.int32)
+
+
+class VarianceScalingInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testTruncatedNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(distribution="truncated_normal")
+
+    with test_util.use_gpu(), test.mock.patch.object(
+        random_ops, "truncated_normal",
+        wraps=random_ops.truncated_normal) as mock_truncated_normal:
+      x = self.evaluate(init(shape))
+      self.assertTrue(mock_truncated_normal.called)
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(distribution="truncated_normal")
+
+    with test_util.use_gpu(), test.mock.patch.object(
+        random_ops, "truncated_normal",
+        wraps=random_ops.truncated_normal) as mock_truncated_normal:
+      x = self.evaluate(init(shape))
+      self.assertTrue(mock_truncated_normal.called)
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUntruncatedNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(
+        distribution="untruncated_normal")
+
+    with test_util.use_gpu(), test.mock.patch.object(
+        random_ops, "random_normal",
+        wraps=random_ops.random_normal) as mock_random_normal:
+      x = self.evaluate(init(shape))
+      self.assertTrue(mock_random_normal.called)
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUniformDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(distribution="uniform")
+
+    with test_util.use_gpu():
+      x = self.evaluate(init(shape))
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+
+class OrthogonalInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    self._range_test(init_ops_v2.Orthogonal(seed=123), shape=(20, 20),
+                     target_mean=0.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.Orthogonal(seed=1)
+    init2 = init_ops_v2.Orthogonal(seed=1)
+    self._identical_test(init1, init2, True, (10, 10))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.Orthogonal(seed=1)
+    init2 = init_ops_v2.Orthogonal(seed=2)
+    self._identical_test(init1, init2, False, (10, 10))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.Orthogonal()
+    self._duplicated_test(init, (10, 10))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidDataType(self):
+    init = init_ops_v2.Orthogonal()
+    self.assertRaises(ValueError, init, shape=(10, 10), dtype=dtypes.string)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidShape(self):
+    init = init_ops_v2.Orthogonal()
+    with test_util.use_gpu():
+      self.assertRaises(ValueError, init, shape=[5])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGain(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.Orthogonal(seed=1)
+    init2 = init_ops_v2.Orthogonal(gain=3.14, seed=1)
+    with test_util.use_gpu():
+      t1 = self.evaluate(init1(shape=(10, 10)))
+      t2 = self.evaluate(init2(shape=(10, 10)))
+    self.assertAllClose(t1, t2 / 3.14)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testShapesValues(self):
+    for shape in [(10, 10), (10, 9, 8), (100, 5, 5), (50, 40), (40, 50)]:
+      init = init_ops_v2.Orthogonal()
+      tol = 1e-5
+      with test_util.use_gpu():
+        # Check the shape
+        t = self.evaluate(init(shape))
+        self.assertAllEqual(shape, t.shape)
+        # Check orthogonality by computing the inner product
+        t = t.reshape((np.prod(t.shape[:-1]), t.shape[-1]))
+        if t.shape[0] > t.shape[1]:
+          self.assertAllClose(
+              np.dot(t.T, t), np.eye(t.shape[1]), rtol=tol, atol=tol)
+        else:
+          self.assertAllClose(
+              np.dot(t, t.T), np.eye(t.shape[0]), rtol=tol, atol=tol)
+
+
+class IdentityInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRange(self):
+    with self.assertRaises(ValueError):
+      shape = (3, 4, 5)
+      self._range_test(
+          init_ops_v2.Identity(),
+          shape=shape,
+          target_mean=1. / shape[0],
+          target_max=1.)
+
+    shape = (3, 3)
+    self._range_test(
+        init_ops_v2.Identity(),
+        shape=shape,
+        target_mean=1. / shape[0],
+        target_max=1.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidDataType(self):
+    init = init_ops_v2.Identity()
+    self.assertRaises(ValueError, init, shape=[10, 5], dtype=dtypes.int32)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidShape(self):
+    init = init_ops_v2.Identity()
+    with test_util.use_gpu():
+      self.assertRaises(ValueError, init, shape=[5, 7, 7])
+      self.assertRaises(ValueError, init, shape=[5])
+      self.assertRaises(ValueError, init, shape=[])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNonSquare(self):
+    init = init_ops_v2.Identity()
+    shape = (10, 5)
+    with test_util.use_gpu():
+      self.assertAllClose(self.evaluate(init(shape)), np.eye(*shape))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGain(self):
+    shape = (10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init_default = init_ops_v2.Identity()
+      init_custom = init_ops_v2.Identity(gain=0.9)
+      with test_util.use_gpu():
+        self.assertAllClose(self.evaluate(init_default(shape, dtype=dtype)),
+                            np.eye(*shape))
+      with test_util.use_gpu():
+        self.assertAllClose(self.evaluate(init_custom(shape, dtype=dtype)),
+                            np.eye(*shape) * 0.9)
+
+
+class GlorotInitializersTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGlorotUniform(self):
+    shape = (5, 6, 4, 2)
+    fan_in, fan_out = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / (fan_in + fan_out))
+    self._range_test(
+        init_ops_v2.GlorotUniform(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_GlorotNormal(self):
+    shape = (5, 6, 4, 2)
+    fan_in, fan_out = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / (fan_in + fan_out))
+    self._range_test(
+        init_ops_v2.GlorotNormal(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+
+class MethodInitializers(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLecunUniform(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(1. / fan_in)
+    self._range_test(
+        init_ops_v2.lecun_uniform(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHeUniform(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / fan_in)
+    self._range_test(
+        init_ops_v2.he_uniform(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLecunNormal(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(1. / fan_in)
+    self._range_test(
+        init_ops_v2.lecun_normal(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHeNormal(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / fan_in)
+    self._range_test(
+        init_ops_v2.he_normal(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/linalg/inverse_registrations.py b/tensorflow/python/ops/linalg/inverse_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d1e7554cd59bf0f6d2754865090cf67e831da1
--- /dev/null
+++ b/tensorflow/python/ops/linalg/inverse_registrations.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.inverse."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+
+
+# By default, return LinearOperatorInversion which switched the .matmul
+# and .solve methods.
+@linear_operator_algebra.RegisterInverse(linear_operator.LinearOperator)
+def _inverse_linear_operator(linop):
+  return linear_operator_inversion.LinearOperatorInversion(
+      linop,
+      is_non_singular=linop.is_non_singular,
+      is_self_adjoint=linop.is_self_adjoint,
+      is_positive_definite=linop.is_positive_definite,
+      is_square=linop.is_square)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_inversion.LinearOperatorInversion)
+def _inverse_inverse_linear_operator(linop_inversion):
+  return linop_inversion.operator
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_diag.LinearOperatorDiag)
+def _inverse_diag(diag_operator):
+  return linear_operator_diag.LinearOperatorDiag(
+      1. / diag_operator.diag,
+      is_non_singular=diag_operator.is_non_singular,
+      is_self_adjoint=diag_operator.is_self_adjoint,
+      is_positive_definite=diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_identity.LinearOperatorIdentity)
+def _inverse_identity(identity_operator):
+  return identity_operator
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _inverse_scaled_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=1. / identity_operator.multiplier,
+      is_non_singular=identity_operator.is_non_singular,
+      is_self_adjoint=True,
+      is_positive_definite=identity_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _inverse_block_diag(block_diag_operator):
+  # We take the inverse of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.inverse() for operator in block_diag_operator.operators],
+      is_non_singular=block_diag_operator.is_non_singular,
+      is_self_adjoint=block_diag_operator.is_self_adjoint,
+      is_positive_definite=block_diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _inverse_kronecker(kronecker_operator):
+  # Inverse decomposition of a Kronecker product is the Kronecker product
+  # of inverse decompositions.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.inverse() for operator in kronecker_operator.operators],
+      is_non_singular=kronecker_operator.is_non_singular,
+      is_self_adjoint=kronecker_operator.is_self_adjoint,
+      is_positive_definite=kronecker_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_circulant.LinearOperatorCirculant)
+def _inverse_circulant(circulant_operator):
+  # Inverting the spectrum is sufficient to get the inverse.
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=1. / circulant_operator.spectrum,
+      is_non_singular=circulant_operator.is_non_singular,
+      is_self_adjoint=circulant_operator.is_self_adjoint,
+      is_positive_definite=circulant_operator.is_positive_definite,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index ac4fd4ebc6059a187828c757c852a470d8ee69a8..eebe741337d8eefae44e5206ce990edbf261bdd9 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
+from tensorflow.python.ops.linalg import inverse_registrations as _inverse_registrations
 from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
 from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index df2bd887cdde6f651db572c2bdfebd2bc0170716..2259eaa65cd1a857e369ee8673165c76c882df7e 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -104,6 +104,7 @@ def adjoint(matrix, name=None):
   tf.linalg.adjoint(x)  # [[1 - 1j, 4 - 4j],
                         #  [2 - 2j, 5 - 5j],
                         #  [3 - 3j, 6 - 6j]]
+  ```
 
   Args:
     matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 6be81f4b34191414d3c4c00ac7158bfa1539ef27..4c99e86dc59a8c39abb57494ae84bcfdc13faa1b 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -847,6 +847,31 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def inverse(self, name="inverse"):
+    """Returns the Inverse of this `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, return a `LinearOperator`
+    representing `A^-1`.
+
+    Args:
+      name: A name scope to use for ops added by this method.
+
+    Returns:
+      `LinearOperator` representing inverse of this matrix.
+
+    Raises:
+      ValueError: When the `LinearOperator` is not hinted to be `non_singular`.
+    """
+    if self.is_square is False:  # pylint: disable=g-bool-id-comparison
+      raise ValueError("Cannot take the Inverse: This operator represents "
+                       "a non square matrix.")
+    if self.is_non_singular is False:  # pylint: disable=g-bool-id-comparison
+      raise ValueError("Cannot take the Inverse: This operator represents "
+                       "a singular matrix.")
+
+    with self._name_scope(name):
+      return linear_operator_algebra.inverse(self)
+
   def cholesky(self, name="cholesky"):
     """Returns a Cholesky factor as a `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
index 7b99066e4c121ebd7546dfad1039c0dfa46bca11..c1513fdb38c6005c89f6994141797f7df5c65350 100644
--- a/tensorflow/python/ops/linalg/linear_operator_algebra.py
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -27,6 +27,7 @@ from tensorflow.python.util import tf_inspect
 
 _CHOLESKY_DECOMPS = {}
 _MATMUL = {}
+_INVERSES = {}
 
 
 def _registered_function(type_list, registry):
@@ -55,6 +56,11 @@ def _registered_matmul(type_a, type_b):
   return _registered_function([type_a, type_b], _MATMUL)
 
 
+def _registered_inverse(type_a):
+  """Get the Cholesky function registered for class a."""
+  return _registered_function([type_a], _INVERSES)
+
+
 def cholesky(lin_op_a, name=None):
   """Get the Cholesky factor associated to lin_op_a.
 
@@ -103,6 +109,29 @@ def matmul(lin_op_a, lin_op_b, name=None):
     return matmul_fn(lin_op_a, lin_op_b)
 
 
+def inverse(lin_op_a, name=None):
+  """Get the Inverse associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to decompose.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the inverse of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Inverse method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  inverse_fn = _registered_inverse(type(lin_op_a))
+  if inverse_fn is None:
+    raise ValueError("No inverse registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Inverse"):
+    return inverse_fn(lin_op_a)
+
+
 class RegisterCholesky(object):
   """Decorator to register a Cholesky implementation function.
 
@@ -189,3 +218,45 @@ class RegisterMatmul(object):
           self._key[1].__name__))
     _MATMUL[self._key] = matmul_fn
     return matmul_fn
+
+
+class RegisterInverse(object):
+  """Decorator to register an Inverse implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterInverse(lin_op.LinearOperatorIdentity)
+  def _inverse_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, inverse_fn):
+    """Perform the Inverse registration.
+
+    Args:
+      inverse_fn: The function to use for the Inverse.
+
+    Returns:
+      inverse_fn
+
+    Raises:
+      TypeError: if inverse_fn is not a callable.
+      ValueError: if a Inverse function has already been registered for
+        the given argument classes.
+    """
+    if not callable(inverse_fn):
+      raise TypeError(
+          "inverse_fn must be callable, received: {}".format(inverse_fn))
+    if self._key in _INVERSES:
+      raise ValueError("Inverse({}) has already been registered to: {}".format(
+          self._key[0].__name__, _INVERSES[self._key]))
+    _INVERSES[self._key] = inverse_fn
+    return inverse_fn
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 32b222cb2a685ee3254065dfc26a230482004182..694557e50ae62f15d66ef713aa8512f719f97b0b 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -588,12 +588,19 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     """
     self._assert_proper_shapes = assert_proper_shapes
 
-    if not is_square:
-      raise ValueError("A ScaledIdentity operator is always square.")
-
     with ops.name_scope(name, values=[multiplier, num_rows]):
       self._multiplier = ops.convert_to_tensor(multiplier, name="multiplier")
 
+      # Check and auto-set hints.
+      if not self._multiplier.dtype.is_complex:
+        if is_self_adjoint is False:  # pylint: disable=g-bool-id-comparison
+          raise ValueError("A real diagonal operator is always self adjoint.")
+        else:
+          is_self_adjoint = True
+
+      if not is_square:
+        raise ValueError("A ScaledIdentity operator is always square.")
+
       super(LinearOperatorScaledIdentity, self).__init__(
           dtype=self._multiplier.dtype,
           is_non_singular=is_non_singular,
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index f7e785caa5d8cc290f037944378f709633423a74..005b9b429b6327211feb9466bdca59b7a50870a7 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -71,7 +71,7 @@ class LinearOperatorKronecker(linear_operator.LinearOperator):
   `op1 x op2 x .. opJ` (we omit parentheses as the Kronecker product is
   associative).
 
-  If `opj` has shape `batch_shape_j` + [M_j, N_j`, then the composed operator
+  If `opj` has shape `batch_shape_j + [M_j, N_j]`, then the composed operator
   will have shape equal to `broadcast_batch_shape + [prod M_j, prod N_j]`,
   where the product is over all operators.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index e50f572b5f431ae8b7cf3470ee799f170e83656c..a957c84dc1ca6f26927ae3c39c6cb49caa2b19be 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -336,6 +336,22 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("solve_with_broadcast")
     self._test_solve(with_batch=False)
 
+  def _test_inverse(self):
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder)
+            op_inverse_v, mat_inverse_v = sess.run([
+                operator.inverse().to_dense(), linalg.inv(mat)])
+            self.assertAC(op_inverse_v, mat_inverse_v)
+
+  def test_inverse(self):
+    self._skip_if_tests_to_skip_contains("inverse")
+    self._test_inverse()
+
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
@@ -463,7 +479,14 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["cholesky", "solve", "solve_with_broadcast", "det", "log_abs_det"]
+    return [
+        "cholesky",
+        "inverse",
+        "solve",
+        "solve_with_broadcast",
+        "det",
+        "log_abs_det"
+    ]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 54d04e4a70bc65e0053575e7761680894e3702e5..6c18943dab03d434cb92d5510f48066f46615ba5 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -481,9 +481,9 @@ def _reshape_for_efficiency(a,
 
   # Permutation to put the extra dims at the end.
   perm = (
-      array_ops.concat(
-          (math_ops.range(b_extra_ndims, b.shape.ndims),
-           math_ops.range(0, b_extra_ndims)), 0))
+      np.concatenate(
+          (np.arange(b_extra_ndims, b.shape.ndims),
+           np.arange(0, b_extra_ndims)), 0))
   b_extra_on_end = array_ops.transpose(b, perm=perm)
 
   # Now squash this end into one long dim.
@@ -497,7 +497,7 @@ def _reshape_for_efficiency(a,
     y_extra_shape = array_ops.concat(
         (array_ops.shape(y)[:-1], [b_main_sh[-1]], b_extra_sh), 0)
     y_extra_on_end = array_ops.reshape(y, y_extra_shape)
-    return array_ops.transpose(
-        y_extra_on_end, perm=array_ops.invert_permutation(perm))
+    inverse_perm = np.argsort(perm)
+    return array_ops.transpose(y_extra_on_end, perm=inverse_perm)
 
   return a, b_squashed_end, reshape_inv, still_need_to_transpose
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index dbaae886d43e46ac193d1e7f28a6367192d2a640..fcd5b816fe9ab0268602c7bffd5ebbce3c88c4ac 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -28,6 +28,12 @@ from tensorflow.python.ops import gen_list_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# list_ops -> control_flow_ops -> tensor_array_ops -> list_ops
+control_flow_ops = LazyLoader(
+    "control_flow_ops", globals(),
+    "tensorflow.python.ops.control_flow_ops")
 
 
 ops.NotDifferentiable("TensorListConcatLists")
@@ -65,11 +71,57 @@ def tensor_list_from_tensor(tensor, element_shape, name=None):
       name=name)
 
 
-def tensor_list_concat(input_handle, element_dtype, name=None):
+def tensor_list_get_item(input_handle, index, element_dtype, name=None):
+  return gen_list_ops.tensor_list_get_item(
+      input_handle=input_handle,
+      index=index,
+      element_shape=-1,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_pop_back(input_handle, element_dtype, name=None):
+  return gen_list_ops.tensor_list_pop_back(
+      input_handle=input_handle,
+      element_shape=-1,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_gather(input_handle, indices, element_dtype, name=None):
+  return gen_list_ops.tensor_list_gather(
+      input_handle=input_handle,
+      indices=indices,
+      element_shape=-1,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_scatter(tensor, indices, element_shape, name=None):
+  return gen_list_ops.tensor_list_scatter_v2(
+      tensor=tensor,
+      indices=indices,
+      element_shape=_build_element_shape(element_shape),
+      num_elements=-1,
+      name=name)
+
+
+def tensor_list_stack(input_handle, element_dtype, num_elements=-1, name=None):
+  return gen_list_ops.tensor_list_stack(
+      input_handle=input_handle,
+      element_shape=-1,
+      element_dtype=element_dtype,
+      num_elements=num_elements,
+      name=name)
+
+
+def tensor_list_concat(input_handle, element_dtype, element_shape=None,
+                       name=None):
   # Ignore the lengths output of TensorListConcat. It is only used during
   # gradient computation.
   return gen_list_ops.tensor_list_concat(
-      input_handle=input_handle, element_dtype=element_dtype, name=name)[0]
+      input_handle=input_handle, element_dtype=element_dtype,
+      element_shape=element_shape, name=name)[0]
 
 
 def tensor_list_split(tensor, element_shape, lengths, name=None):
@@ -80,10 +132,31 @@ def tensor_list_split(tensor, element_shape, lengths, name=None):
       name=name)
 
 
+def tensor_list_set_item(input_handle,
+                         index,
+                         item,
+                         resize_if_index_out_of_bounds=False,
+                         name=None):
+  """Sets `item` at `index` in input list."""
+  if resize_if_index_out_of_bounds:
+    input_list_size = gen_list_ops.tensor_list_length(input_handle)
+    # TODO(srbs): This could cause some slowdown. Consider fusing resize
+    # functionality in the SetItem op.
+    input_handle = control_flow_ops.cond(
+        index >= input_list_size,
+        lambda: gen_list_ops.tensor_list_resize(  # pylint: disable=g-long-lambda
+            input_handle, index + 1),
+        lambda: input_handle)
+  return gen_list_ops.tensor_list_set_item(
+      input_handle=input_handle, index=index, item=item, name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
-      dresult, element_dtype=op.get_attr("element_dtype"))
+      dresult,
+      element_shape=array_ops.shape(op.inputs[1]),
+      element_dtype=op.get_attr("element_dtype"))
 
 
 @ops.RegisterGradient("TensorListPopBack")
@@ -93,12 +166,12 @@ def _PopBackGrad(op, dlist, delement):
         element_dtype=delement.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_push_back(dlist, delement)
+  return gen_list_ops.tensor_list_push_back(dlist, delement), None
 
 
 @ops.RegisterGradient("TensorListStack")
 def _TensorListStackGrad(unused_op, dtensor):
-  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
+  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:]), None
 
 
 @ops.RegisterGradient("TensorListConcat")
@@ -123,17 +196,21 @@ def _TensorListSplitGrad(op, dlist):
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape.dims and op.inputs[0].shape.dims[0].value is not None:
-    num_elements = op.inputs[0].shape.dims[0].value
+  t = op.inputs[0]
+  if t.shape.dims and t.shape.dims[0].value is not None:
+    num_elements = t.shape.dims[0].value
   else:
     num_elements = None
   if dlist is None:
     dlist = empty_tensor_list(
-        element_dtype=op.inputs[0].dtype,
+        element_dtype=t.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
   tensor_grad = gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+      dlist,
+      element_shape=array_ops.slice(array_ops.shape(t), [1], [-1]),
+      element_dtype=t.dtype,
+      num_elements=num_elements)
   shape_grad = None
   return tensor_grad, shape_grad
 
@@ -150,33 +227,60 @@ def _TensorListGetItemGrad(op, ditem):
       index=op.inputs[1],
       item=ditem)
   index_grad = None
-  return list_grad, index_grad
+  element_shape_grad = None
+  return list_grad, index_grad, element_shape_grad
 
 
 @ops.RegisterGradient("TensorListSetItem")
 def _TensorListSetItemGrad(op, dlist):
+  """Gradient function for TensorListSetItem."""
   _, index, item = op.inputs
   list_grad = gen_list_ops.tensor_list_set_item(
       dlist, index=index, item=array_ops.zeros_like(item))
   index_grad = None
   element_grad = gen_list_ops.tensor_list_get_item(
-      dlist, index, element_dtype=item.dtype)
+      dlist,
+      index,
+      element_shape=array_ops.shape(item),
+      element_dtype=item.dtype)
   return list_grad, index_grad, element_grad
 
 
+@ops.RegisterGradient("TensorListResize")
+def _TensorListResizeGrad(op, dlist):
+  input_list, _ = op.inputs
+  input_list_size = gen_list_ops.tensor_list_length(input_list)
+  return gen_list_ops.tensor_list_resize(dlist, input_list_size), None
+
+
 @ops.RegisterGradient("TensorListGather")
 def _TensorListGatherGrad(op, dtensor):
-  _, indices = op.inputs
-  return gen_list_ops.tensor_list_scatter(
-      tensor=dtensor, indices=indices,
-      element_shape=ops.convert_to_tensor(-1, dtype=dtypes.int32)), None
+  """Gradient function for TensorListGather."""
+  input_list, indices, _ = op.inputs
+  dlist = gen_list_ops.tensor_list_scatter_v2(
+      tensor=dtensor,
+      indices=indices,
+      element_shape=gen_list_ops.tensor_list_element_shape(
+          input_list, shape_type=dtypes.int32),
+      num_elements=gen_list_ops.tensor_list_length(input_list))
+  return dlist, None, None
 
 
 @ops.RegisterGradient("TensorListScatter")
+@ops.RegisterGradient("TensorListScatterV2")
 def _TensorListScatterGrad(op, dlist):
-  t, indices, _ = op.inputs
-  return gen_list_ops.tensor_list_gather(
-      dlist, indices, element_dtype=t.dtype), None
+  """Gradient function for TensorListScatter."""
+  tensor = op.inputs[0]
+  indices = op.inputs[1]
+  dtensor = gen_list_ops.tensor_list_gather(
+      dlist,
+      indices,
+      element_shape=array_ops.slice(array_ops.shape(tensor), [1], [-1]),
+      element_dtype=tensor.dtype)
+  if op.type == "TensorListScatterV2":
+    return dtensor, None, None, None
+  else:
+    return dtensor, None, None
 
 
 def _build_element_shape(shape):
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 5a948a21946d0b9ce867901a00425857e4f06b1f..3cb16eb81e8c0796e199edb9c97acd1c269c832b 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -263,7 +263,7 @@ def print_v2(*inputs, **kwargs):
   # If we are only printing a single string scalar, there is no need to format
   if (len(inputs) == 1 and tensor_util.is_tensor(inputs[0])
       and (not isinstance(inputs[0], sparse_tensor.SparseTensor))
-      and inputs[0].shape and (inputs[0].dtype == dtypes.string)):
+      and (inputs[0].shape.ndims == 0)and (inputs[0].dtype == dtypes.string)):
     formatted_string = inputs[0]
   # Otherwise, we construct an appropriate template for the tensors we are
   # printing, and format the template using those tensors.
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index e96c93c15c27ebbdf833c6b97dd9f2ce8c0e4faa..50e14297e3e391d791d3e95ac6f578d0092622db 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -64,6 +64,9 @@ def initialize_all_tables(name="init_all_tables"):
 @tf_export(v1=["initializers.tables_initializer", "tables_initializer"])
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
+  
+  See the [Low Level Intro](https://www.tensorflow.org/guide/low_level_intro#feature_columns)
+  guide, for an example of usage.
 
   Args:
     name: Optional name for the initialization op.
@@ -161,7 +164,7 @@ class InitializableLookupTableBase(LookupInterface):
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    if isinstance(initializer, checkpointable_base.CheckpointableBase):
+    if isinstance(initializer, checkpointable_base.Checkpointable):
       self._initializer = self._track_checkpointable(
           initializer, "_initializer")
     self._resource_handle = self.create_resource()
@@ -249,7 +252,7 @@ class HashTable(InitializableLookupTableBase):
   ```
   """
 
-  def __init__(self, initializer, default_value, shared_name=None, name=None):
+  def __init__(self, initializer, default_value, name=None):
     """Creates a non-initialized `HashTable` object.
 
     Creates a table, the type of its keys and values are specified by the
@@ -261,8 +264,6 @@ class HashTable(InitializableLookupTableBase):
       initializer: The table initializer to use. See `HashTable` kernel for
         supported key and value types.
       default_value: The value to use if a key is missing in the table.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
       name: A name for the operation (optional).
 
     Returns:
@@ -270,21 +271,22 @@ class HashTable(InitializableLookupTableBase):
     """
     self._initializer = initializer
     self._default_value = default_value
-    self._shared_name = shared_name
-    self._name = name
-    self._table_name = ""
+    self._shared_name = self._initializer._shared_name  # pylint: disable=protected-access
+    self._name = name or "hash_table"
+    self._table_name = None
     super(HashTable, self).__init__(default_value, initializer)
     self._value_shape = self._default_value.get_shape()
 
   def create_resource(self):
-    with ops.name_scope(self._name, "hash_table",
-                        (self._initializer, self._default_value)) as scope:
-      table_ref = gen_lookup_ops.hash_table_v2(
-          shared_name=self._shared_name,
-          key_dtype=self._initializer.key_dtype,
-          value_dtype=self._initializer.value_dtype,
-          name=scope)
-      self._table_name = scope.split("/")[-2]
+    table_ref = gen_lookup_ops.hash_table_v2(
+        shared_name=self._shared_name,
+        key_dtype=self._initializer.key_dtype,
+        value_dtype=self._initializer.value_dtype,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
     return table_ref
 
   @property
@@ -303,16 +305,15 @@ class HashTable(InitializableLookupTableBase):
     """
     with ops.name_scope(name, "%s_Export" % self.name,
                         [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
+      exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+          self.resource_handle, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
         self._value_shape))
     return exported_keys, exported_values
 
 
-class TableInitializerBase(checkpointable_base.CheckpointableBase):
+class TableInitializerBase(checkpointable_base.Checkpointable):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -339,6 +340,16 @@ class TableInitializerBase(checkpointable_base.CheckpointableBase):
     """Returns the table initialization op."""
     raise NotImplementedError
 
+  @property
+  def _shared_name(self):
+    """Returns a shared name to be used by the table."""
+    shared_name = ""
+    if context.executing_eagerly():
+      # Ensure a unique name when eager execution is enabled to avoid spurious
+      # sharing issues.
+      shared_name += str(ops.uid())
+    return shared_name
+
 
 class KeyValueTensorInitializer(TableInitializerBase):
   """Table initializers given `keys` and `values` tensors."""
@@ -498,6 +509,7 @@ class TextFileInitializer(TableInitializerBase):
     if not isinstance(filename, ops.Tensor) and not filename:
       raise ValueError("Filename required for %s." % name)
 
+    self._filename_arg = filename
     key_dtype = dtypes.as_dtype(key_dtype)
     value_dtype = dtypes.as_dtype(value_dtype)
 
@@ -569,6 +581,21 @@ class TextFileInitializer(TableInitializerBase):
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
     return init_op
 
+  @property
+  def _shared_name(self):
+    if self._vocab_size:
+      # Keep the shared_name:
+      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%d_%s_%s" % (
+          self._filename_arg, self._vocab_size, self._key_index,
+          self._value_index)
+    else:
+      # Keep the shared_name
+      # <table_type>_<filename>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%s_%s" % (self._filename_arg,
+                                             self._key_index, self._value_index)
+    return shared_name
+
 
 class TextFileStringTableInitializer(TextFileInitializer):
   """Table initializer for `int64` IDs to string tables from a text file."""
@@ -819,7 +846,10 @@ class IdTableWithHashBuckets(LookupInterface):
       raise TypeError(
           "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
     self._hasher_spec = hasher_spec
-    self._table_name = name.split("/")[-1]
+    if name:
+      self._table_name = name.split("/")[-1]
+    else:
+      self._table_name = None
     super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64)
 
   def create_resource(self):
@@ -1022,20 +1052,7 @@ def index_table_from_file(vocabulary_file=None,
 
   with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
     table = None
-    shared_name = ""
     with ops.name_scope(None, "hash_table") as hash_table_scope:
-      if vocab_size:
-        # Keep the shared_name:
-        # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-        shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                  key_column_index,
-                                                  value_column_index)
-      else:
-        # Keep the shared_name
-        # <table_type>_<filename>_<key_index>_<value_index>
-        shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
-                                               key_column_index,
-                                               value_column_index)
       init = TextFileIdTableInitializer(
           vocabulary_file,
           vocab_size=vocab_size,
@@ -1045,8 +1062,7 @@ def index_table_from_file(vocabulary_file=None,
           value_column_index=value_column_index,
           delimiter=delimiter)
 
-      table = HashTable(
-          init, default_value, shared_name=shared_name, name=hash_table_scope)
+      table = HashTable(init, default_value, name=hash_table_scope)
     if num_oov_buckets:
       table = IdTableWithHashBuckets(
           table,
@@ -1136,12 +1152,7 @@ def index_table_from_tensor(vocabulary_list,
     num_elements = array_ops.size(keys)
     values = math_ops.to_int64(math_ops.range(num_elements))
 
-    shared_name = ""
     with ops.name_scope(None, "hash_table") as hash_table_scope:
-      if context.executing_eagerly():
-        # Ensure a unique name when eager execution is enabled to avoid spurious
-        # sharing issues.
-        shared_name += str(ops.uid())
       table_keys = math_ops.to_int64(keys) if keys.dtype.is_integer else keys
       init = KeyValueTensorInitializer(
           table_keys,
@@ -1149,8 +1160,7 @@ def index_table_from_tensor(vocabulary_list,
           table_keys.dtype.base_dtype,
           dtypes.int64,
           name="table_init")
-      table = HashTable(
-          init, default_value, shared_name=shared_name, name=hash_table_scope)
+      table = HashTable(init, default_value, name=hash_table_scope)
     if num_oov_buckets:
       table = IdTableWithHashBuckets(
           table,
@@ -1239,17 +1249,6 @@ def index_to_string_table_from_file(vocabulary_file,
     raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
 
   with ops.name_scope(name, "index_to_string") as scope:
-    shared_name = ""
-    if vocab_size:
-      # Keep a shared_name
-      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                key_column_index,
-                                                value_column_index)
-    else:
-      # Keep a shared_name <table_type>_<filename>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file, key_column_index,
-                                             value_column_index)
     init = TextFileStringTableInitializer(
         vocabulary_file,
         vocab_size=vocab_size,
@@ -1259,7 +1258,7 @@ def index_to_string_table_from_file(vocabulary_file,
         delimiter=delimiter)
 
     # TODO(yleon): Use a more effienct structure.
-    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+    return HashTable(init, default_value, name=scope)
 
 
 def index_to_string_table_from_tensor(vocabulary_list,
@@ -1317,11 +1316,10 @@ def index_to_string_table_from_tensor(vocabulary_list,
     num_elements = array_ops.size(vocabulary_list)
     keys = math_ops.to_int64(math_ops.range(num_elements))
 
-    shared_name = ""
     init = KeyValueTensorInitializer(
         keys, vocabulary_list, dtypes.int64, dtypes.string, name="table_init")
     # TODO(yleon): Use a more effienct structure.
-    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+    return HashTable(init, default_value, name=scope)
 
 
 ops.NotDifferentiable("LookupTableFind")
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 7f88ccd879d09d57dc32c29dd4f28fa4389937a1..3393e75335af1caca18a79ba2302dee7a46d5662 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -794,7 +794,7 @@ def softmax_cross_entropy(
 
     if label_smoothing > 0:
       num_classes = math_ops.cast(
-          array_ops.shape(onehot_labels)[1], logits.dtype)
+          array_ops.shape(onehot_labels)[-1], logits.dtype)
       smooth_positives = 1.0 - label_smoothing
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 97bba46661d056fd336c68988e3bc17ef4232487..73f4c750b886e0548d0c008fb84058b9ddc8a39d 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.add_loss")
+@tf_export(v1=["losses.add_loss"])
 def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
   """Adds a externally defined loss to the collection of losses.
 
@@ -40,7 +40,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
     ops.add_to_collection(loss_collection, loss)
 
 
-@tf_export("losses.get_losses")
+@tf_export(v1=["losses.get_losses"])
 def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Gets the list of losses from the loss_collection.
 
@@ -54,7 +54,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   return ops.get_collection(loss_collection, scope)
 
 
-@tf_export("losses.get_regularization_losses")
+@tf_export(v1=["losses.get_regularization_losses"])
 def get_regularization_losses(scope=None):
   """Gets the list of regularization losses.
 
@@ -67,7 +67,7 @@ def get_regularization_losses(scope=None):
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
 
-@tf_export("losses.get_regularization_loss")
+@tf_export(v1=["losses.get_regularization_loss"])
 def get_regularization_loss(scope=None, name="total_regularization_loss"):
   """Gets the total regularization loss.
 
@@ -85,7 +85,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
     return constant_op.constant(0.0)
 
 
-@tf_export("losses.get_total_loss")
+@tf_export(v1=["losses.get_total_loss"])
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index c7ec1c57d1b07232e2bdb05fc30f5456b792890f..dc2340983afa21f9236708a77f50875fafd0699b 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -1343,3 +1343,20 @@ def _CumprodGrad(op, grad):
   out = math_ops.cumsum(
       prod * grad, axis, exclusive=exclusive, reverse=not reverse)
   return [out / x, None]
+
+
+@ops.RegisterGradient("NextAfter")
+def _NextAfterGrad(op, grad):
+  """Returns gradient of nextafter(x1, x2) with respect to x1 and x2."""
+  x1 = op.inputs[0]
+  x2 = op.inputs[1]
+  s_x1 = array_ops.shape(x1)
+  s_x2 = array_ops.shape(x2)
+  r_x1, r_x2 = gen_array_ops.broadcast_gradient_args(s_x1, s_x2)
+  with ops.control_dependencies([grad]):
+    partial_x1 = array_ops.ones(s_x1, dtype=x1.dtype)
+    partial_x2 = array_ops.zeros(s_x2, dtype=x2.dtype)
+    return (array_ops.reshape(
+        math_ops.reduce_sum(partial_x1 * grad, r_x1), s_x1),
+            array_ops.reshape(
+                math_ops.reduce_sum(partial_x2 * grad, r_x2), s_x2))
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index f415e65787d406e59725ec866845b0ab50f44d76..36b54b62cb44dae96e951279195282678f0dc637 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -395,5 +396,59 @@ class PowGradTest(test.TestCase):
       g = self.evaluate(g)
       self.assertAllClose([-2., 0., 2.], g)
 
+
+@test_util.run_all_in_graph_and_eager_modes
+class NextAfterTest(test.TestCase):
+
+  def _nextafter_gradient(self, x1, x2):
+    with backprop.GradientTape() as tape:
+      tape.watch(x1)
+      tape.watch(x2)
+      y = math_ops.nextafter(x1, x2)
+      return tape.gradient(y, [x1, x2])
+
+  def testBasic(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      x1 = constant_op.constant(0.1, dtype=dtype)
+      x2 = constant_op.constant(3.1, dtype=dtype)
+      dx1, dx2 = self._nextafter_gradient(x1, x2)
+      expected_dx1 = constant_op.constant(1, dtype=dtype)
+      expected_dx2 = constant_op.constant(0, dtype=dtype)
+      self.assertAllClose(expected_dx1, dx1)
+      self.assertAllClose(expected_dx2, dx2)
+
+  def testDynamicShapes(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      default_x1 = constant_op.constant(0.1, dtype=dtype)
+      default_x2 = constant_op.constant(3.1, dtype=dtype)
+      x1 = array_ops.placeholder_with_default(default_x1, shape=None)
+      x2 = array_ops.placeholder_with_default(default_x2, shape=None)
+      dx1, dx2 = self._nextafter_gradient(x1, x2)
+      expected_dx1 = constant_op.constant(1, dtype=dtype)
+      expected_dx2 = constant_op.constant(0, dtype=dtype)
+      self.assertAllClose(expected_dx1, dx1)
+      self.assertAllClose(expected_dx2, dx2)
+
+  def testWithGradientChecker(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        x1 = np.array([-1, 0, 1, 2, 3], dtype=dtype.as_numpy_dtype)
+        x2 = np.array([2, 2, 2, 2, 2], dtype=dtype.as_numpy_dtype)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(
+                lambda x: math_ops.nextafter(x, x2), [x1]))  # pylint: disable=cell-var-from-loop
+        self.assertLess(err, 1e-3)
+
+  def testBroadcastingWithGradientChecker(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        x1 = np.array([-1, 0, 1, 2, 3], dtype=dtype.as_numpy_dtype)
+        x2 = np.array([2], dtype=dtype.as_numpy_dtype)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(
+                lambda x: math_ops.nextafter(x, x2), [x1]))  # pylint: disable=cell-var-from-loop
+        self.assertLess(err, 1e-3)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 1467678f2943a6400836cb8bd77f7e6f661ce516..cb7fecf7c7b261c320f3fe3d5579642176b39514 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -49,6 +49,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
+nextafter = gen_math_ops.next_after
 
 arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
 arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
@@ -812,7 +813,8 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
         return func(x, y, name=name)
       elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
-          y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
+          y = ops.convert_to_tensor_v2(y, dtype_hint=x.dtype.base_dtype,
+                                       name="y")
         except TypeError:
           # If the RHS is not a tensor, it might be a tensor aware object
           # that can implement the operator with knowledge of itself
@@ -1337,6 +1339,8 @@ def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
 @tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_indices is deprecated, use axis instead", "axis")
 def count_nonzero(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -3063,8 +3067,8 @@ def reduced_shape(input_shape, axes):
     input_shape[axes] = 1
     return input_shape
 
-  input_shape = to_int32(input_shape)  # [2, 3, 5, 7]
-  axes = to_int32(axes)  # [1, 2]
+  input_shape = cast(input_shape, dtypes.int32)  # [2, 3, 5, 7]
+  axes = cast(axes, dtypes.int32)  # [1, 2]
 
   input_rank = array_ops.size(input_shape)  # 4
   axes = (axes + input_rank) % input_rank
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 4de56fce0ad4a0532d8d68668a91485a6e415514..b4832e09c084e7165143f4e918b9ba76842e2311 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -627,5 +628,59 @@ class XdivyTest(test_util.TensorFlowTestCase):
         self.assertAllClose(x_over_y, xdivy_tf_np[1])
 
 
+class NextAfterTest(test_util.TensorFlowTestCase):
+
+  # Basic NextAfter tests that replicate numpy nextafter tests.
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+
+    for dtype in [dtypes.float32, dtypes.float64]:
+      one = constant_op.constant([1], dtype=dtype)
+      two = constant_op.constant([2], dtype=dtype)
+      zero = constant_op.constant([0], dtype=dtype)
+      nan = constant_op.constant([np.nan], dtype=dtype)
+
+      eps = constant_op.constant([np.finfo(dtype.as_numpy_dtype).eps],
+                                 dtype=dtype)
+
+      self.assertAllEqual(math_ops.nextafter(one, two) - one, eps)
+      self.assertAllLess(math_ops.nextafter(one, zero) - one, 0)
+      self.assertAllEqual(
+          math_ops.is_nan(math_ops.nextafter(nan, one)), [True])
+      self.assertAllEqual(
+          math_ops.is_nan(math_ops.nextafter(one, nan)), [True])
+      self.assertAllEqual(math_ops.nextafter(one, one), one)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBroadcasting(self):
+
+    for dtype in [dtypes.float32, dtypes.float64]:
+      one = constant_op.constant([1, 1], dtype=dtype)
+      two = constant_op.constant([2], dtype=dtype)
+
+      eps = np.finfo(dtype.as_numpy_dtype).eps
+
+      eps_const = constant_op.constant([eps, eps], dtype=dtype)
+
+      self.assertAllEqual(math_ops.nextafter(one, two) - one, eps_const)
+
+
+class BinaryOpsTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testErrorReceivedIfDtypeMismatchFromOp(self):
+    if context.executing_eagerly():
+      error = errors_impl.InvalidArgumentError
+      error_message = (
+          r"cannot compute Add as input #0\(zero-based\) was expected to be a "
+          r"float tensor but is a int32 tensor \[Op:Add\] name: add/")
+    else:
+      error = TypeError
+      error_message = ("Input 'y' of 'Add' Op has type float32 that does not "
+                       "match type int32 of argument 'x'.")
+    with self.assertRaisesRegexp(error, error_message):
+      a = array_ops.ones([1], dtype=dtypes.int32) + 1.0
+      self.evaluate(a)
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index df9ca86ec77cfb99d083248783bed2209d281257..e3cefb2e92e24c79125f84ab743cb75ea56ab214 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -1295,7 +1295,7 @@ def mean_squared_error(labels,
 
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
-  squared_error = math_ops.square(labels - predictions)
+  squared_error = math_ops.squared_difference(labels, predictions)
   return mean(squared_error, weights, metrics_collections, updates_collections,
               name or 'mean_squared_error')
 
diff --git a/tensorflow/python/ops/nccl_ops.py b/tensorflow/python/ops/nccl_ops.py
index 6259ce0f948427cace576dbc3e21a410f531f4e2..6c8685cf63aeae5bb9f081a6a5282c472f724842 100644
--- a/tensorflow/python/ops/nccl_ops.py
+++ b/tensorflow/python/ops/nccl_ops.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import threading
 
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_nccl_ops
@@ -211,19 +213,27 @@ def _apply_all_reduce(reduction, tensors):
     raise ValueError('Must pass >0 tensors to all reduce operations')
 
   shared_name = _get_shared_name()
-  res = []
 
-  for t in tensors:
-    _check_device(t)
-    with ops.device(t.device):
-      res.append(
-          gen_nccl_ops.nccl_all_reduce(
-              input=t,
-              reduction=reduction,
-              num_devices=len(tensors),
-              shared_name=shared_name))
-
-  return res
+  def _all_reduce():
+    """Call nccl allreduce."""
+    res = []
+    for t in tensors:
+      _check_device(t)
+      with ops.device(t.device):
+        res.append(
+            gen_nccl_ops.nccl_all_reduce(
+                input=t,
+                reduction=reduction,
+                num_devices=len(tensors),
+                shared_name=shared_name))
+    return res
+
+  if context.executing_eagerly():
+    # Nccl ops will block unless they are executed concurrently such as in a
+    # graph or a defun.
+    return def_function.function(_all_reduce)()
+  else:
+    return _all_reduce()
 
 
 def _apply_reduce(reduction, tensors):
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index e978f1d32601890f8eb9b54fdd5738f626b7f863..fedf8e44c3ddfdac9739b88e019ed6d1e4485ab2 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -206,6 +206,7 @@ class BatchNormalizationTest(test.TestCase):
                                   2)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testBatchNormGradImpl(self):
     x_shape = [7, 5, 4, 6]
     param_shape = [6]
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 4bc33ff8bdb845510a9872db26c8adfdf1f50995..69e753aa956389a5dbfd132a09d6930fc5f4660c 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -436,6 +437,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape1(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 1]
@@ -463,6 +465,7 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape2(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 2]
@@ -483,6 +486,7 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape3(self):
     for is_training in [True, False]:
       x_shape = [1, 2, 1, 6]
@@ -496,6 +500,7 @@ class BatchNormalizationTest(test.TestCase):
               data_format='NCHW',
               is_training=is_training)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape4(self):
     for is_training in [True, False]:
       x_shape = [5, 7, 11, 4]
@@ -523,6 +528,8 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_xla('This test never passed for XLA')
   def testBatchNormGradShape5(self):
     for is_training in [True, False]:
       x_shape = [0, 7, 11, 4]
@@ -581,6 +588,7 @@ class BatchNormalizationTest(test.TestCase):
           is_training=is_training,
           err_tolerance=err_tolerance)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig1(self):
     config = {
         'shape': [2, 3, 4, 5],
@@ -589,6 +597,7 @@ class BatchNormalizationTest(test.TestCase):
     }
     self._testBatchNormGradGrad(config)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig2(self):
     config = {
         'shape': [2, 3, 2, 2],
@@ -597,6 +606,7 @@ class BatchNormalizationTest(test.TestCase):
     }
     self._testBatchNormGradGrad(config)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig3(self):
     config = {
         'shape': [2, 3, 4, 5],
@@ -605,6 +615,7 @@ class BatchNormalizationTest(test.TestCase):
     }
     self._testBatchNormGradGrad(config)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig4(self):
     config = {
         'shape': [2, 3, 2, 2],
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 34404edc9a1250710d4cd7a50e04ad8d187a5d7f..6ca2b2aafe3145978e6610cded32719173368eb8 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -314,10 +314,10 @@ def _BiasAddGradGrad(op, received_grad):
 
   if data_format == b"NCHW":
     expanded_shape = array_ops.concat([
-        array_ops.ones_like(shape[:-3]), bias_shape,
-        array_ops.ones_like(shape[-2:])
+        array_ops.ones_like(shape[:1]), bias_shape,
+        array_ops.ones_like(shape[2:])
     ], 0)
-    tile_mults = array_ops.concat([shape[:-3], [1], shape[-2:]], 0)
+    tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0)
   else:
     expanded_shape = array_ops.concat(
         [array_ops.ones_like(shape[:-1]), bias_shape], 0)
@@ -514,29 +514,40 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
 
 @ops.RegisterGradient("Conv2D")
 def _Conv2DGrad(op, grad):
+  """Gradient function for Conv2D."""
   dilations = op.get_attr("dilations")
   strides = op.get_attr("strides")
   padding = op.get_attr("padding")
+  explicit_paddings = op.get_attr("explicit_paddings")
   use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
   data_format = op.get_attr("data_format")
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
+
+  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
+  # functions for performance reasons in Eager mode. gen_nn_ops functions take a
+  # `explicit_paddings` parameter, but nn_ops functions do not. So if were were
+  # to use the nn_ops functions, we would have to convert `padding` and
+  # `explicit_paddings` into a single `padding` parameter, increasing overhead
+  # in Eager mode.
   return [
-      nn_ops.conv2d_backprop_input(
+      gen_nn_ops.conv2d_backprop_input(
           shape_0,
           op.inputs[1],
           grad,
           dilations=dilations,
           strides=strides,
           padding=padding,
+          explicit_paddings=explicit_paddings,
           use_cudnn_on_gpu=use_cudnn_on_gpu,
           data_format=data_format),
-      nn_ops.conv2d_backprop_filter(
+      gen_nn_ops.conv2d_backprop_filter(
           op.inputs[0],
           shape_1,
           grad,
           dilations=dilations,
           strides=strides,
           padding=padding,
+          explicit_paddings=explicit_paddings,
           use_cudnn_on_gpu=use_cudnn_on_gpu,
           data_format=data_format)
   ]
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 841bac8bea064f7dce8e9015745c89e7d98fc4d7..7abfde5149acfb3da6b27e03f5ddd95fec746db6 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -467,7 +467,7 @@ def depthwise_conv2d(input,
   to `channel_multiplier` channels for each), then concatenates the results
   together.  The output has `in_channels * channel_multiplier` channels.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
@@ -540,7 +540,7 @@ def depthwise_conv2d_v2(input,
   to `channel_multiplier` channels for each), then concatenates the results
   together.  The output has `in_channels * channel_multiplier` channels.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
@@ -599,7 +599,7 @@ def separable_conv2d(input,
   between dimensions `[1, 2]` and `3`, not spatial separability between
   dimensions `1` and `2`.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
@@ -699,7 +699,7 @@ def separable_conv2d_v2(
   between dimensions `[1, 2]` and `3`, not spatial separability between
   dimensions `1` and `2`.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
@@ -1380,6 +1380,8 @@ def _compute_sampled_logits(weights,
     # weights shape is [num_classes, dim]
     all_w = embedding_ops.embedding_lookup(
         weights, all_ids, partition_strategy=partition_strategy)
+    if all_w.dtype != inputs.dtype:
+      all_w = math_ops.cast(all_w, inputs.dtype)
 
     # true_w shape is [batch_size * num_true, dim]
     true_w = array_ops.slice(all_w, [0, 0],
@@ -1397,6 +1399,8 @@ def _compute_sampled_logits(weights,
     # add the biases to the true and sampled logits.
     all_b = embedding_ops.embedding_lookup(
         biases, all_ids, partition_strategy=partition_strategy)
+    if all_b.dtype != inputs.dtype:
+      all_b = math_ops.cast(all_b, inputs.dtype)
     # true_b is a [batch_size * num_true] tensor
     # sampled_b is a [num_sampled] float tensor
     true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 6f2d2c15bd40109b79e7497c6b279fd8edf23bd7..e0ef9e5e34ad3666540daf91552e9ccb16f1b46c 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -171,7 +171,7 @@ class _NonAtrousConvolution(object):
         raise ValueError("data_format must be \"NHWC\" or \"NCHW\".")
       self.strides = strides
       self.data_format = data_format
-      self.conv_op = gen_nn_ops.conv2d
+      self.conv_op = conv2d
     elif conv_dims == 3:
       if data_format is None or data_format == "NDHWC":
         strides = [1] + list(strides) + [1]
@@ -1373,6 +1373,44 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
+def _convert_padding(padding):
+  """Converts Python padding to C++ padding for ops which take EXPLICIT padding.
+
+  Args:
+    padding: the `padding` argument for a Python op which supports EXPLICIT
+      padding.
+
+  Returns:
+    (padding, explicit_paddings) pair, which should be passed as attributes to a
+    C++ op.
+
+  Raises:
+    ValueError: If padding is invalid.
+  """
+  explicit_paddings = []
+  if padding == "EXPLICIT":
+    # Give a better error message if EXPLICIT is passed.
+    raise ValueError('"EXPLICIT" is not a valid value for the padding '
+                     "parameter. To use explicit padding, the padding "
+                     "parameter must be a list.")
+  if isinstance(padding, (list, tuple)):
+    for i, dim_paddings in enumerate(padding):
+      if not isinstance(dim_paddings, (list, tuple)):
+        raise ValueError("When padding is a list, each element of padding must "
+                         "be a list/tuple of size 2. Element with index %d of "
+                         "padding is not a list/tuple" % i)
+      if len(dim_paddings) != 2:
+        raise ValueError("When padding is a list, each element of padding must "
+                         "be a list/tuple of size 2. Element with index %d of "
+                         "padding has size %d" % (i, len(dim_paddings)))
+      explicit_paddings.extend(dim_paddings)
+    if len(padding) != 4:
+      raise ValueError("When padding is a list, it must be of size 4. Got "
+                       "padding of size: %d" % len(padding))
+    padding = "EXPLICIT"
+  return padding, explicit_paddings
+
+
 @tf_export("nn.conv2d", v1=[])
 def conv2d_v2(input,  # pylint: disable=redefined-builtin
               filters,
@@ -1418,8 +1456,13 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
       1-D tensor of length 4.  The stride of the sliding window for each
       dimension of `input`. The dimension order is determined by the value of
       `data_format`, see below for details.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1441,15 +1484,98 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
   # pylint: enable=line-too-long
   if dilations is None:
     dilations = [1, 1, 1, 1]
+  return conv2d(input,  # pylint: disable=redefined-builtin
+                filters,
+                strides,
+                padding,
+                use_cudnn_on_gpu=True,
+                data_format=data_format,
+                dilations=dilations,
+                name=name)
+
+
+@tf_export(v1=["nn.conv2d"])
+def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input,
+    filter,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
+  r"""Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+
+  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+  and a filter / kernel tensor of shape
+  `[filter_height, filter_width, in_channels, out_channels]`, this op
+  performs the following:
+
+  1. Flattens the filter to a 2-D matrix with shape
+     `[filter_height * filter_width * in_channels, output_channels]`.
+  2. Extracts image patches from the input tensor to form a *virtual*
+     tensor of shape `[batch, out_height, out_width,
+     filter_height * filter_width * in_channels]`.
+  3. For each patch, right-multiplies the filter matrix and the image patch
+     vector.
+
+  In detail, with the default NHWC format,
+
+      output[b, i, j, k] =
+          sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q]
+                          * filter[di, dj, q, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      A 4-D tensor. The dimension order is interpreted according to the value
+      of `data_format`, see below for details.
+    filter: A `Tensor`. Must have the same type as `input`.
+      A 4-D tensor of shape
+      `[filter_height, filter_width, in_channels, out_channels]`
+    strides: A list of `ints`.
+      1-D tensor of length 4.  The stride of the sliding window for each
+      dimension of `input`. The dimension order is determined by the value of
+      `data_format`, see below for details.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by the
+      value of `data_format`, see above for details. Dilations in the batch and
+      depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  padding, explicit_paddings = _convert_padding(padding)
   return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
-                           filters,
+                           filter,
                            strides,
                            padding,
-                           use_cudnn_on_gpu=True,
+                           use_cudnn_on_gpu=use_cudnn_on_gpu,
+                           explicit_paddings=explicit_paddings,
                            data_format=data_format,
                            dilations=dilations,
                            name=name)
-tf_export(v1=["nn.conv2d"])(gen_nn_ops.conv2d)
 
 
 @tf_export("nn.conv2d_backprop_filter", v1=[])
@@ -1478,8 +1604,13 @@ def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
       The stride of the sliding window for each dimension of the input
       of the convolution. Must be in the same order as the dimension specified
       with format.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1500,17 +1631,75 @@ def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
   """
   if dilations is None:
     dilations = [1, 1, 1, 1]
-  return gen_nn_ops.conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
-                                           filter_sizes,
-                                           out_backprop,
-                                           strides,
-                                           padding,
-                                           use_cudnn_on_gpu=True,
-                                           data_format=data_format,
-                                           dilations=dilations,
-                                           name=name)
-tf_export(v1=["nn.conv2d_backprop_filter"])(
-    gen_nn_ops.conv2d_backprop_filter)
+  return conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
+                                filter_sizes,
+                                out_backprop,
+                                strides,
+                                padding,
+                                use_cudnn_on_gpu=True,
+                                data_format=data_format,
+                                dilations=dilations,
+                                name=name)
+
+
+@tf_export(v1=["nn.conv2d_backprop_filter"])
+def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input,
+    filter_sizes,
+    out_backprop,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
+  r"""Computes the gradients of convolution with respect to the filter.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape `[batch, in_height, in_width, in_channels]`.
+    filter_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the tensor shape of `filter`,
+      where `filter` is a 4-D
+      `[filter_height, filter_width, in_channels, out_channels]` tensor.
+    out_backprop: A `Tensor`. Must have the same type as `input`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  padding, explicit_paddings = _convert_padding(padding)
+  return gen_nn_ops.conv2d_backprop_filter(
+      input, filter_sizes, out_backprop, strides, padding, use_cudnn_on_gpu,
+      explicit_paddings, data_format, dilations, name)
 
 
 @tf_export("nn.conv2d_backprop_input", v1=[])
@@ -1539,8 +1728,13 @@ def conv2d_backprop_input_v2(input_sizes,
       The stride of the sliding window for each dimension of the input
       of the convolution. Must be in the same order as the dimension specified
       with format.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1561,17 +1755,75 @@ def conv2d_backprop_input_v2(input_sizes,
   """
   if dilations is None:
     dilations = [1, 1, 1, 1]
-  return gen_nn_ops.conv2d_backprop_input(input_sizes,
-                                          filters,
-                                          out_backprop,
-                                          strides,
-                                          padding,
-                                          use_cudnn_on_gpu=True,
-                                          data_format=data_format,
-                                          dilations=dilations,
-                                          name=name)
-tf_export(v1=["nn.conv2d_backprop_input"])(
-    gen_nn_ops.conv2d_backprop_input)
+  return conv2d_backprop_input(input_sizes,
+                               filters,
+                               out_backprop,
+                               strides,
+                               padding,
+                               use_cudnn_on_gpu=True,
+                               data_format=data_format,
+                               dilations=dilations,
+                               name=name)
+
+
+@tf_export(v1=["nn.conv2d_backprop_input"])
+def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input_sizes,
+    filter,
+    out_backprop,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
+  r"""Computes the gradients of convolution with respect to the input.
+
+  Args:
+    input_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the shape of `input`,
+      where `input` is a 4-D `[batch, height, width, channels]` tensor.
+    filter: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape
+      `[filter_height, filter_width, in_channels, out_channels]`.
+    out_backprop: A `Tensor`. Must have the same type as `filter`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `filter`.
+  """
+  padding, explicit_paddings = _convert_padding(padding)
+  return gen_nn_ops.conv2d_backprop_input(
+      input_sizes, filter, out_backprop, strides, padding, use_cudnn_on_gpu,
+      explicit_paddings, data_format, dilations, name)
 
 
 @tf_export(v1=["nn.conv2d_transpose"])
@@ -3040,15 +3292,19 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disa
         return x
 
     noise_shape = _get_noise_shape(x, noise_shape)
-
-    keep_prob = 1 - rate
-    # uniform [keep_prob, 1.0 + keep_prob)
-    random_tensor = keep_prob
-    random_tensor += random_ops.random_uniform(
+    # Sample a uniform distribution on [0.0, 1.0) and select values larger than
+    # rate.
+    #
+    # NOTE: Random uniform actually can only generate 2^23 floats on [1.0, 2.0)
+    # and subtract 1.0.
+    random_tensor = random_ops.random_uniform(
         noise_shape, seed=seed, dtype=x.dtype)
-    # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
-    binary_tensor = math_ops.floor(random_tensor)
-    ret = math_ops.divide(x, keep_prob) * binary_tensor
+    keep_prob = 1 - rate
+    scale = 1 / keep_prob
+    # NOTE: if (1.0 + rate) - 1 is equal to rate, then we want to consider that
+    # float to be selected, hence we use a >= comparison.
+    keep_mask = random_tensor >= rate
+    ret = x * scale * math_ops.cast(keep_mask, x.dtype)
     if not context.executing_eagerly():
       ret.set_shape(x.get_shape())
     return ret
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 82fab741830fddd4ee0ba5c8e2644702ec199b4d..74561349ed1de72037e2e2f3c5d16e4a7cb03ce5 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -41,6 +41,7 @@ from tensorflow.python.ops.nn_impl import _compute_sampled_logits
 from tensorflow.python.platform import test as test_lib
 
 
+@test_util.disable_all_xla("This test never passed for XLA")
 class ZeroFractionTest(test_lib.TestCase):
 
   def _ZeroFraction(self, x):
@@ -1017,6 +1018,7 @@ class LeakyReluTest(test_lib.TestCase):
 class SwishTest(test_lib.TestCase):
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testValues(self):
     np_values = np.array(
         [np.linspace(-10.0, 0.0, 100),
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index 07fc9433a2582225a8da687eb8c9563c8fcac9e2..fd89d8ad452487ce7b5d3e677bc36d63eb1111f6 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -15,11 +15,13 @@ py_library(
         "control_flow_ops.py",
         "gradients.py",
         "pfor.py",
+        "test_util.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":control_flow_ops",
         ":gradients",
+        ":test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
@@ -83,12 +85,25 @@ py_library(
     ],
 )
 
+py_library(
+    name = "test_util",
+    srcs = ["test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pfor_lib",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "control_flow_ops_test",
-    size = "large",
     srcs = ["control_flow_ops_test.py"],
     additional_deps = [
         ":control_flow_ops",
+        ":test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:gradients",
@@ -101,6 +116,34 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "array_test",
+    srcs = ["array_test.py"],
+    additional_deps = [
+        ":control_flow_ops",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:backprop",
+    ],
+)
+
+cuda_py_test(
+    name = "math_test",
+    srcs = ["math_test.py"],
+    additional_deps = [
+        ":control_flow_ops",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+    ],
+    tags = ["optonly"],  # Too slow in non-opt mode
+)
+
 py_library(
     name = "gradients",
     srcs = ["gradients.py"],
@@ -115,7 +158,6 @@ py_library(
 
 cuda_py_test(
     name = "gradients_test",
-    size = "large",
     srcs = ["gradients_test.py"],
     additional_deps = [
         ":control_flow_ops",
@@ -128,4 +170,6 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python/ops/losses",
     ],
+    tags = ["optonly"],  # Too slow in non-opt mode
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/ops/parallel_for/array_test.py b/tensorflow/python/ops/parallel_for/array_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f0c0f5b992b3f005dc8b75a6d0207237a5205bb
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/array_test.py
@@ -0,0 +1,274 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for vectorization of array kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ArrayTest(PForTestCase):
+
+  def test_gather(self):
+    x = random_ops.random_uniform([3, 3, 3])
+
+    def loop_fn(i):
+      outputs = []
+      x_i = array_ops.gather(x, i)
+      for y in [x, x_i]:
+        axes = [0, 2, -1] if y == x else [0]
+        for axis in axes:
+          outputs.append(array_ops.gather(y, 2, axis=axis))
+          outputs.append(array_ops.gather(y, i, axis=axis))
+          outputs.append(array_ops.gather(y, [i], axis=axis))
+          outputs.append(array_ops.gather(y, [i, 2], axis=axis))
+          outputs.append(array_ops.gather(y, [[2, i], [i, 1]], axis=axis))
+      return outputs
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 20)
+
+  def test_shape(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.shape(x_i), array_ops.shape(x_i, out_type=dtypes.int64)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
+  def test_size(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.size(x_i), array_ops.size(x_i, out_type=dtypes.int64)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
+  def test_rank(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.rank(x_i)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
+
+  def test_shape_n(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      y_i = array_ops.gather(y, i)
+      return array_ops.shape_n([x_i, x, y, y_i]), array_ops.shape_n(
+          [x_i, x, y, y_i], out_type=dtypes.int64)
+
+    self._test_loop_fn(
+        loop_fn, 3, loop_fn_dtypes=[dtypes.int32] * 4 + [dtypes.int64] * 4)
+
+  def test_reshape(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.reshape(x1, [-1]), array_ops.reshape(x1, [1, 3, 1, -1])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_expand_dims(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.expand_dims(
+          x1, axis=-1), array_ops.expand_dims(
+              x1, axis=1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_slice(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.slice(x1, begin=(0, 1), size=(2, 1))
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_tile(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.tile(x1, [2, 1])
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_tile_loop_dependent(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.tile(x1, [i, 1])
+
+    with self.assertRaisesRegexp(ValueError, "expected to be loop invariant"):
+      pfor_control_flow_ops.pfor(loop_fn, 2)
+
+  def test_pack(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.stack([x1, y], axis=-1)
+
+    self._test_loop_fn(loop_fn, 1)
+
+  def test_unpack(self):
+    x = random_ops.random_uniform([3, 2, 3, 4])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.unstack(
+          x_i, 4, axis=-1), array_ops.unstack(
+              x_i, 3, axis=1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 7)
+
+  def test_pad(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    padding = constant_op.constant([[1, 2], [3, 4]])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.pad(x1, padding, mode="CONSTANT")
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_split(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.split(x1, 2, axis=0), array_ops.split(x1, 3, axis=-1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
+
+  def test_split_v(self):
+    x = random_ops.random_uniform([3, 6, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return (array_ops.split(x1, [2, 1, 3], axis=0),
+              array_ops.split(x1, [3], axis=-1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
+
+  def test_transpose(self):
+    x = random_ops.random_uniform([3, 2, 3, 4])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.transpose(x1, [2, 1, 0])
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_zeros_like(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      z = array_ops.zeros_like(x1),
+      return z, z + x1
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_concat_v2(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.concat(
+          [x1, x1, y], axis=0), array_ops.concat(
+              [x1, x1, y], axis=-1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_unary_cwise_ops(self):
+    for op in [array_ops.identity, array_ops.stop_gradient]:
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        with g:
+          x1 = array_ops.gather(x, i)
+          y = op(x1) + x1
+          loss = nn.l2_loss(y)
+        return op(x), y, g.gradient(loss, x1)
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_identity_n(self):
+    x = random_ops.random_uniform([3, 4])
+
+    def loop_fn(i):
+      return array_ops.identity_n([x, array_ops.gather(x, i)])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_matrix_diag_part(self):
+    x = random_ops.random_uniform([3, 4, 2])
+
+    def loop_fn(i):
+      return array_ops.matrix_diag_part(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
+
+  def test_strided_slice(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+      g.watch(x)
+
+    def loop_fn(i):
+      with g:
+        x_i = array_ops.gather(x, i)
+        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
+        loss = nn.l2_loss(y)
+      return y, g.gradient(loss, x_i)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 933bddd8ccaa830a394c8d69e4f1b33311315c99..8a5830e28f34fe35258262241db2330b1f592614 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -34,7 +34,6 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -50,40 +49,13 @@ from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-im
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
 from tensorflow.python.platform import test
 from tensorflow.python.util import nest
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class PForTest(test.TestCase):
-
-  def _run_targets(self, targets1, targets2=None, run_init=True):
-    targets1 = nest.flatten(targets1)
-    targets2 = ([] if targets2 is None else nest.flatten(targets2))
-    assert len(targets1) == len(targets2) or not targets2
-    if run_init:
-      init = variables.global_variables_initializer()
-      self.evaluate(init)
-    return self.evaluate(targets1 + targets2)
-
-  def run_and_assert_equal(self, targets1, targets2):
-    outputs = self._run_targets(targets1, targets2)
-    outputs = nest.flatten(outputs)  # flatten SparseTensorValues
-    n = len(outputs) // 2
-    for i in range(n):
-      if outputs[i + n].dtype != np.object:
-        self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5)
-      else:
-        self.assertAllEqual(outputs[i + n], outputs[i])
-
-  def _test_loop_fn(self, loop_fn, iters,
-                    loop_fn_dtypes=dtypes.float32,
-                    parallel_iterations=None):
-    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
-                                    parallel_iterations=parallel_iterations)
-    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
-                                        parallel_iterations=parallel_iterations)
-    self.run_and_assert_equal(t1, t2)
+class PForTest(PForTestCase):
 
   def test_op_conversion_fallback_to_while_loop(self):
     # Note that we used top_k op for this test. If a converter gets defined for
@@ -129,246 +101,7 @@ class PForTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class ArrayTest(PForTest):
-
-  def test_gather(self):
-    x = random_ops.random_uniform([3, 3, 3])
-
-    def loop_fn(i):
-      outputs = []
-      x_i = array_ops.gather(x, i)
-      for y in [x, x_i]:
-        axes = [0, 2, -1] if y == x else [0]
-        for axis in axes:
-          outputs.append(array_ops.gather(y, 2, axis=axis))
-          outputs.append(array_ops.gather(y, i, axis=axis))
-          outputs.append(array_ops.gather(y, [i], axis=axis))
-          outputs.append(array_ops.gather(y, [i, 2], axis=axis))
-          outputs.append(array_ops.gather(y, [[2, i], [i, 1]], axis=axis))
-      return outputs
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 20)
-
-  def test_shape(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.shape(x_i), array_ops.shape(x_i, out_type=dtypes.int64)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
-
-  def test_size(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.size(x_i), array_ops.size(x_i, out_type=dtypes.int64)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
-
-  def test_rank(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.rank(x_i)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
-
-  def test_shape_n(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      y_i = array_ops.gather(y, i)
-      return array_ops.shape_n([x_i, x, y, y_i]), array_ops.shape_n(
-          [x_i, x, y, y_i], out_type=dtypes.int64)
-
-    self._test_loop_fn(
-        loop_fn, 3, loop_fn_dtypes=[dtypes.int32] * 4 + [dtypes.int64] * 4)
-
-  def test_reshape(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.reshape(x1, [-1]), array_ops.reshape(x1, [1, 3, 1, -1])
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_expand_dims(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.expand_dims(
-          x1, axis=-1), array_ops.expand_dims(
-              x1, axis=1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_slice(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.slice(x1, begin=(0, 1), size=(2, 1))
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_tile(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.tile(x1, [2, 1])
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_tile_loop_dependent(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.tile(x1, [i, 1])
-
-    with self.assertRaisesRegexp(ValueError, "expected to be loop invariant"):
-      pfor_control_flow_ops.pfor(loop_fn, 2)
-
-  def test_pack(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.stack([x1, y], axis=-1)
-
-    self._test_loop_fn(loop_fn, 1)
-
-  def test_unpack(self):
-    x = random_ops.random_uniform([3, 2, 3, 4])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.unstack(
-          x_i, 4, axis=-1), array_ops.unstack(
-              x_i, 3, axis=1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 7)
-
-  def test_pad(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    padding = constant_op.constant([[1, 2], [3, 4]])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.pad(x1, padding, mode="CONSTANT")
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_split(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.split(x1, 2, axis=0), array_ops.split(x1, 3, axis=-1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
-
-  def test_split_v(self):
-    x = random_ops.random_uniform([3, 6, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return (array_ops.split(x1, [2, 1, 3], axis=0),
-              array_ops.split(x1, [3], axis=-1))
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
-
-  def test_transpose(self):
-    x = random_ops.random_uniform([3, 2, 3, 4])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.transpose(x1, [2, 1, 0])
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_zeros_like(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      z = array_ops.zeros_like(x1),
-      return z, z + x1
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_concat_v2(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.concat(
-          [x1, x1, y], axis=0), array_ops.concat(
-              [x1, x1, y], axis=-1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_unary_cwise_ops(self):
-    for op in [array_ops.identity, array_ops.stop_gradient]:
-      with backprop.GradientTape(persistent=True) as g:
-        x = random_ops.random_uniform([3, 5])
-        g.watch(x)
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          x1 = array_ops.gather(x, i)
-          y = op(x1) + x1
-          loss = nn.l2_loss(y)
-        return op(x), y, g.gradient(loss, x1)
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
-
-  def test_identity_n(self):
-    x = random_ops.random_uniform([3, 4])
-
-    def loop_fn(i):
-      return array_ops.identity_n([x, array_ops.gather(x, i)])
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_matrix_diag_part(self):
-    x = random_ops.random_uniform([3, 4, 2])
-
-    def loop_fn(i):
-      return array_ops.matrix_diag_part(array_ops.gather(x, i))
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
-
-  def test_strided_slice(self):
-    with backprop.GradientTape(persistent=True) as g:
-      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
-      g.watch(x)
-
-    def loop_fn(i):
-      with g:
-        x_i = array_ops.gather(x, i)
-        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
-        loss = nn.l2_loss(y)
-      return y, g.gradient(loss, x_i)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class BitwiseTest(PForTest):
+class BitwiseTest(PForTestCase):
 
   def test_unary_cwise(self):
     for op in [bitwise_ops.invert]:
@@ -408,376 +141,7 @@ class BitwiseTest(PForTest):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class MathTest(PForTest):
-
-  def test_unary_cwise_ops(self):
-    complex_ops = [
-        math_ops.angle,
-        math_ops.imag,
-        math_ops.complex_abs,
-        math_ops.real,
-        math_ops.conj,
-    ]
-    real_ops = [
-        lambda x: math_ops.acosh(1 + math_ops.square(x)),
-        math_ops.abs,
-        math_ops.acos,
-        math_ops.asin,
-        math_ops.asinh,
-        math_ops.atan,
-        math_ops.atanh,
-        math_ops.bessel_i0e,
-        math_ops.bessel_i1e,
-        math_ops.cos,
-        math_ops.cosh,
-        math_ops.digamma,
-        math_ops.erf,
-        math_ops.erfc,
-        math_ops.exp,
-        math_ops.expm1,
-        math_ops.inv,
-        math_ops.is_finite,
-        math_ops.is_inf,
-        math_ops.lgamma,
-        math_ops.log,
-        math_ops.log1p,
-        math_ops.neg,
-        math_ops.negative,
-        math_ops.reciprocal,
-        math_ops.rint,
-        math_ops.round,
-        math_ops.rsqrt,
-        math_ops.sigmoid,
-        math_ops.sign,
-        math_ops.sin,
-        math_ops.sinh,
-        math_ops.sqrt,
-        math_ops.square,
-        math_ops.tan,
-        math_ops.tanh,
-        math_ops.tanh,
-        nn.elu,
-        nn.relu,
-        nn.relu6,
-        nn.selu,
-        nn.softplus,
-        nn.softsign,
-    ]
-    for op in complex_ops + real_ops:
-      with backprop.GradientTape(persistent=True) as g:
-        x = random_ops.random_uniform([3, 5])
-        g.watch(x)
-        if op in complex_ops:
-          y = random_ops.random_uniform([3, 5])
-          g.watch(y)
-          x = math_ops.complex(x, y)
-
-      # pylint: disable=cell-var-from-loop
-      output_dtypes = []
-      def loop_fn(i):
-        with g:
-          x1 = array_ops.gather(x, i)
-          y1 = op(x1)
-          outputs = [op(x), y1]
-          if y1.dtype == dtypes.float32:
-            loss = math_ops.reduce_sum(y1 * y1)
-          else:
-            loss = None
-        if loss is not None:
-          grad = g.gradient(loss, x1)
-          if grad is not None:
-            outputs.append(grad)
-        del output_dtypes[:]
-        output_dtypes.extend([t.dtype for t in outputs])
-        return outputs
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
-
-  def test_unary_cwise_no_grad(self):
-    for op in [math_ops.ceil,
-               math_ops.floor,
-               math_ops.logical_not]:
-      x = random_ops.random_uniform([3, 5])
-      if op == math_ops.logical_not:
-        x = x > 0
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        return op(array_ops.gather(x, i))
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=x.dtype)
-
-  def test_binary_cwise_ops(self):
-    logical_ops = [
-        math_ops.logical_and,
-        math_ops.logical_or,
-        math_ops.logical_xor
-    ]
-
-    # Wrapper functions restricting the range of inputs of zeta and polygamma.
-    def safe_polygamma(x, y):
-      return math_ops.polygamma(
-          math_ops.round(clip_ops.clip_by_value(y, 1, 10)),
-          x * x + 1)
-
-    def safe_zeta(x, y):
-      return math_ops.zeta(x * x + 1, y * y)
-
-    float_ops = [
-        math_ops.add,
-        math_ops.add_v2,
-        math_ops.atan2,
-        math_ops.complex,
-        math_ops.div,
-        math_ops.divide,
-        math_ops.div_no_nan,
-        math_ops.equal,
-        math_ops.floor_div,
-        math_ops.floor_mod,
-        math_ops.greater,
-        math_ops.greater_equal,
-        math_ops.igamma,
-        math_ops.igammac,
-        math_ops.igamma_grad_a,
-        math_ops.less,
-        math_ops.less_equal,
-        math_ops.maximum,
-        math_ops.minimum,
-        math_ops.mod,
-        math_ops.multiply,
-        math_ops.not_equal,
-        math_ops.pow,
-        math_ops.squared_difference,
-        math_ops.subtract,
-        math_ops.truncate_mod,
-        safe_polygamma,
-        safe_zeta,
-    ]
-    for op in logical_ops + float_ops:
-      x = random_ops.random_uniform([7, 3, 5])
-      y = random_ops.random_uniform([3, 5])
-      if op in logical_ops:
-        x = x > 0
-        y = y > 0
-
-      output_dtypes = []
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y1 = array_ops.gather(y, i)
-        outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)]
-        del output_dtypes[:]
-        output_dtypes.extend([t.dtype for t in outputs])
-        return outputs
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
-
-  def test_approximate_equal(self):
-    x = random_ops.random_uniform([3, 5])
-    y = random_ops.random_uniform([3, 5])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      y1 = array_ops.gather(y, i)
-      return math_ops.approximate_equal(x1, y1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.bool])
-
-  def test_addn(self):
-    x = random_ops.random_uniform([2, 3, 5])
-    y = random_ops.random_uniform([3, 5])
-    z = random_ops.random_uniform([3, 5])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return math_ops.add_n([x1, y, z])
-
-    self._test_loop_fn(loop_fn, 2)
-
-  def test_matmul(self):
-    for tr_a in (True, False):
-      for tr_b in (True, False):
-        for stack_a in (True, False):
-          for stack_b in (True, False):
-            shape_a = (5, 3) if tr_a else (3, 5)
-            if stack_a:
-              shape_a = (2,) + shape_a
-            shape_b = (7, 5) if tr_b else (5, 7)
-            if stack_b:
-              shape_b = (2,) + shape_b
-
-            x = random_ops.random_uniform(shape_a)
-            y = random_ops.random_uniform(shape_b)
-
-            # pylint: disable=cell-var-from-loop
-            def loop_fn(i):
-              a = array_ops.gather(x, i) if stack_a else x
-              b = array_ops.gather(y, i) if stack_b else y
-              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
-
-            # pylint: enable=cell-var-from-loop
-
-            self._test_loop_fn(loop_fn, 2)
-
-  def test_batch_matmul(self):
-    for tr_a in (True, False):
-      for tr_b in (True, False):
-        for stack_a in (True, False):
-          for stack_b in (True, False):
-            shape_a = (4, 5, 3) if tr_a else (4, 3, 5)
-            if stack_a:
-              shape_a = (2,) + shape_a
-            shape_b = (4, 7, 5) if tr_b else (4, 5, 7)
-            if stack_b:
-              shape_b = (2,) + shape_b
-
-            x = random_ops.random_uniform(shape_a)
-            y = random_ops.random_uniform(shape_b)
-
-            # pylint: disable=cell-var-from-loop
-            def loop_fn(i):
-              a = array_ops.gather(x, i) if stack_a else x
-              b = array_ops.gather(y, i) if stack_b else y
-              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
-
-            # pylint: enable=cell-var-from-loop
-
-            self._test_loop_fn(loop_fn, 2)
-
-  def test_reduction(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for op in [
-        math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
-        math_ops.reduce_min
-    ]:
-      for axis in ([1], None, [0, 2]):
-        for keepdims in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return op(a, axis=axis, keepdims=keepdims)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_cum_sum(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for axis in (1, -2):
-      for exclusive in (True, False):
-        for reverse in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return math_ops.cumsum(
-                a, axis=axis, exclusive=exclusive, reverse=reverse)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_cum_prod(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for axis in (1, -2):
-      for exclusive in (True, False):
-        for reverse in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return math_ops.cumprod(
-                a, axis=axis, exclusive=exclusive, reverse=reverse)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_bias_add(self):
-    x_shape = [2, 3, 4, 5, 6]
-    x = random_ops.random_uniform(x_shape)
-    for data_format in ("NCHW", "NHWC"):
-      with backprop.GradientTape(persistent=True) as g:
-        bias_dim = 2 if data_format == "NCHW" else -1
-        bias_shape = x_shape[bias_dim]
-        bias = random_ops.random_uniform([bias_shape])
-        g.watch(bias)
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          a = array_ops.gather(x, i)
-          y = nn.bias_add(a, bias, data_format=data_format)
-          loss = math_ops.reduce_sum(y * y)
-        return y, g.gradient(loss, bias)
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(
-          loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.float32])
-
-  def test_unsorted_segment_sum(self):
-    t = random_ops.random_uniform([3, 3, 2])
-    segment_ids = constant_op.constant([[0, 0, 2], [0, 1, 2], [2, 2, 2]])
-    num_segments = 3
-
-    def loop_fn(i):
-      data = array_ops.gather(t, i)
-      data_0 = array_ops.gather(t, 0)
-      seg_ids = array_ops.gather(segment_ids, i)
-      return (math_ops.unsorted_segment_sum(data, seg_ids, num_segments),
-              math_ops.unsorted_segment_sum(data_0, seg_ids, num_segments))
-
-    self._test_loop_fn(loop_fn, 3, [dtypes.float32] * 2)
-
-  def test_cast(self):
-    x = constant_op.constant([[1], [2]])
-    y = constant_op.constant([[1.0], [2.0]])
-
-    def loop_fn(i):
-      return (math_ops.cast(array_ops.gather(x, i), dtypes.float32),
-              math_ops.cast(array_ops.gather(y, i), dtypes.int32))
-
-    self._test_loop_fn(
-        loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
-
-  def test_tanh_axpy(self):
-    a = constant_op.constant(3.)
-    x = random_ops.random_uniform([4, 5])
-    y = random_ops.random_uniform([6, 5])
-    n = x.shape[0]
-
-    def loop_fn(i):
-      return math_ops.tanh(a * array_ops.gather(x, i) + array_ops.gather(y, i))
-
-    self._test_loop_fn(loop_fn, n)
-
-  def test_select(self):
-    cond = constant_op.constant([True, False])
-    a = random_ops.random_uniform([2, 3, 5])
-    b = random_ops.random_uniform([2, 3, 5])
-    for cond_shape in [2], [2, 3], [2, 3, 5]:
-      cond = random_ops.random_uniform(cond_shape) > 0.5
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        a_i = array_ops.gather(a, i)
-        b_i = array_ops.gather(b, i)
-        cond_i = array_ops.gather(cond, i)
-        return array_ops.where(cond_i, a_i, b_i)
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 2)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class NNTest(PForTest):
+class NNTest(PForTestCase):
 
   def test_conv2d(self):
     x = random_ops.random_uniform([3, 2, 12, 12, 3])
@@ -887,6 +251,7 @@ class NNTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def test_fused_batch_norm(self):
     data_formats = ["NHWC"]
     if test.is_gpu_available():
@@ -956,7 +321,7 @@ class NNTest(PForTest):
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
 
-class RandomTest(PForTest):
+class RandomTest(PForTestCase):
 
   # The random values generated in the two implementations are not guaranteed to
   # match. So we only check the returned shapes.
@@ -1009,8 +374,9 @@ class RandomTest(PForTest):
     self._test_loop_fn(loop_fn, 5)
 
 
-class LoggingTest(PForTest):
+class LoggingTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_print(self):
     x = random_ops.random_uniform([3, 5])
 
@@ -1031,8 +397,9 @@ class LoggingTest(PForTest):
       sess.run(pfor_control_flow_ops.pfor(loop_fn, 3))
 
 
-class TensorArrayTest(PForTest):
+class TensorArrayTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_read(self):
 
     ta = tensor_array_ops.TensorArray(
@@ -1043,6 +410,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_gather(self):
 
     ta = tensor_array_ops.TensorArray(
@@ -1053,6 +421,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_write_and_scatter(self):
 
     t = tensor_array_ops.TensorArray(dtypes.int32, 10, clear_after_read=False)
@@ -1074,6 +443,7 @@ class TensorArrayTest(PForTest):
     output2 = self._run_targets(out2)
     self.assertAllClose(output2, output1)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_write(self):
 
     def loop_fn(i):
@@ -1085,6 +455,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_scatter(self):
 
     def loop_fn(i):
@@ -1097,6 +468,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_read(self):
 
     def loop_fn(i):
@@ -1109,6 +481,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_gather(self):
 
     def loop_fn(i):
@@ -1121,6 +494,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_grad(self):
     x = random_ops.random_uniform([3, 2])
     ta = tensor_array_ops.TensorArray(
@@ -1140,8 +514,9 @@ class TensorArrayTest(PForTest):
       self.assertAllClose(actual_grad, computed_grad)
 
 
-class StackTest(PForTest):
+class StackTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_inside_loop_invariant(self):
 
     def loop_fn(_):
@@ -1157,6 +532,7 @@ class StackTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_inside_push_loop_dependent(self):
 
     def loop_fn(i):
@@ -1172,6 +548,7 @@ class StackTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_outside_pop(self):
     s = data_flow_ops.stack_v2(max_size=4, elem_type=dtypes.int32)
     op = data_flow_ops.stack_push_v2(s, 5)
@@ -1195,6 +572,7 @@ class StackTest(PForTest):
     self.assertAllEqual([6, 6], v2)
     self.assertAllEqual(5, v3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_outside_push(self):
     s = data_flow_ops.stack_v2(max_size=4, elem_type=dtypes.int32)
 
@@ -1207,7 +585,7 @@ class StackTest(PForTest):
 
 # TODO(agarwal): test nested while_loops. This currently requires converting a
 # tf.cond.
-class ControlFlowTest(PForTest):
+class ControlFlowTest(PForTestCase):
 
   def test_while_outside_loop(self):
 
@@ -1218,6 +596,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_invariant_while(self):
 
     def loop_fn(_):
@@ -1225,6 +604,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_invariant_while_with_control_dependency(self):
 
     def loop_fn(i):
@@ -1234,6 +614,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_with_stateful_ops(self):
 
     def loop_fn(_):
@@ -1243,6 +624,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_unstacked_condition(self):
 
     def loop_fn(i):
@@ -1251,6 +633,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while(self):
     x = random_ops.random_uniform([3, 5])
     lengths = constant_op.constant([4, 0, 2])
@@ -1266,6 +649,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_jacobian(self):
     x = random_ops.random_uniform([1, 3])
     y = random_ops.random_uniform([3, 3])
@@ -1293,6 +677,7 @@ class ControlFlowTest(PForTest):
       out, expected = sess.run([out, expected_output])
       self.assertAllClose(expected, out)
 
+  @test_util.run_v1_only("b/122612051")
   def test_tensor_array_as_loop_variable(self):
 
     def loop_fn(i):
@@ -1308,6 +693,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_read_tensor_array_partitioned_indices(self):
     # Note that tensor array values are pfor loop dependent, and the while loop
     # termination condition is also dependent on pfor iteration.
@@ -1325,6 +711,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_external_while_loop_grad(self):
     # Here we test that external while_loops that are extended from inside pfor
     # (due to gradient calls) are not actually converted. If the below was
@@ -1350,6 +737,7 @@ class ControlFlowTest(PForTest):
       self.assertAllEqual([1, 1, 1],
                           sess.run(pfor_control_flow_ops.pfor(loop_fn, 3)))
 
+  @test_util.run_v1_only("b/122612051")
   def test_tensor_array_grad(self):
     inp = constant_op.constant(np.random.rand(3, 4, 2), dtype=dtypes.float32)
     ta = tensor_array_ops.TensorArray(dtypes.float32, size=3)
@@ -1447,13 +835,15 @@ def create_dynamic_lstm(cell_fn, batch_size, state_size, max_steps):
   return pfor_output, tf_output
 
 
-class RNNTest(PForTest):
+class RNNTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_dynamic_rnn(self):
     pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicRNNCell,
                                                    3, 5, 7)
     self.run_and_assert_equal(pfor_outputs, tf_outputs)
 
+  @test_util.run_v1_only("b/122612051")
   def test_dynamic_lstm(self):
     pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicLSTMCell,
                                                    3, 5, 7)
@@ -1576,8 +966,9 @@ class Benchmarks(test.Benchmark):
       self._run(tf_outputs, 100, name="tf_rnn")
 
 
-class SparseTest(PForTest):
+class SparseTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_var_loop_len(self):
     num_iters = array_ops.placeholder(dtypes.int32)
 
@@ -1589,6 +980,7 @@ class SparseTest(PForTest):
     with self.cached_session() as sess:
       sess.run(pfor, feed_dict={num_iters: 3})
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_none_stacked(self):
     num_iters = 10
 
@@ -1605,6 +997,7 @@ class SparseTest(PForTest):
     manual = sparse_tensor.SparseTensor(indices, values, dense_shapes)
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_all_stacked(self):
     num_iters = 10
 
@@ -1620,6 +1013,7 @@ class SparseTest(PForTest):
                                         (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_indices_stacked(self):
     num_iters = 10
 
@@ -1634,6 +1028,7 @@ class SparseTest(PForTest):
                                         [1] * num_iters, (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_values_stacked(self):
     num_iters = 10
 
@@ -1648,6 +1043,7 @@ class SparseTest(PForTest):
                                         (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_shapes_stacked(self):
     num_iters = 10
 
@@ -1661,6 +1057,7 @@ class SparseTest(PForTest):
                                         [1] * num_iters, (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_shapes_stacked_2D(self):
     num_iters = 10
 
@@ -1677,7 +1074,7 @@ class SparseTest(PForTest):
     self.run_and_assert_equal(pfor, manual)
 
 
-class ParsingTest(PForTest):
+class ParsingTest(PForTestCase):
 
   def test_decode_csv(self):
     csv_tensor = constant_op.constant([["1:2:3"], ["::"], ["7:8:9"]])
@@ -1689,6 +1086,7 @@ class ParsingTest(PForTest):
 
     self._test_loop_fn(loop_fn, iters=3, loop_fn_dtypes=[dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_parse_single_example(self):
 
     def _int64_feature(*values):
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 4342833e3eb362e81ff9f60b4649cc5b8de6250f..69635c5a79c032514cdcd83af7e52b6953b2dc0b 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import layers as tf_layers
 from tensorflow.python.ops import array_ops
@@ -69,9 +70,10 @@ def fully_connected_model_fn(batch_size, activation_size, num_layers):
   return inp, model(inp)
 
 
-def lstm_model_fn(batch_size, state_size, steps):
+def lstm_model_fn(batch_size, state_size, steps, inputs_size=None):
+  inputs_size = inputs_size or state_size
   inputs = [
-      random_ops.random_normal([batch_size, state_size]) for _ in range(steps)
+      random_ops.random_normal([batch_size, inputs_size]) for _ in range(steps)
   ]
   cell = rnn_cell.BasicLSTMCell(state_size)
   init_state = cell.zero_state(batch_size, dtypes.float32)
@@ -107,8 +109,9 @@ def create_fc_batch_jacobian(batch_size, activation_size, num_layers):
   return pfor_jacobian, while_jacobian
 
 
-def create_lstm_batch_jacobian(batch_size, state_size, steps):
-  inp, output = lstm_model_fn(batch_size, state_size, steps)
+def create_lstm_batch_jacobian(batch_size, state_size, steps, inputs_size=None):
+  inp, output = lstm_model_fn(batch_size, state_size, steps,
+                              inputs_size=inputs_size)
   pfor_jacobian = gradients.batch_jacobian(output, inp, use_pfor=True)
   while_jacobian = gradients.batch_jacobian(output, inp, use_pfor=False)
   return pfor_jacobian, while_jacobian
@@ -180,9 +183,10 @@ def create_fc_per_eg_grad(batch_size, activation_size, num_layers):
   return pfor_outputs, while_outputs
 
 
-def create_lstm_per_eg_grad(batch_size, state_size, steps):
+def create_lstm_per_eg_grad(batch_size, state_size, steps, inputs_size=None):
+  inputs_size = inputs_size or state_size
   inputs = [
-      random_ops.random_normal([batch_size, state_size]) for _ in range(steps)
+      random_ops.random_normal([batch_size, inputs_size]) for _ in range(steps)
   ]
   cell = rnn_cell.BasicLSTMCell(state_size)
   init_state = cell.zero_state(batch_size, dtypes.float32)
@@ -297,6 +301,16 @@ def create_mnist_per_eg_grad(batch_size, data_format, training):
   return pfor_outputs, while_outputs
 
 
+def create_mnist_batch_jacobian(batch_size, data_format, training):
+  images = random_ops.random_uniform([batch_size, 28, 28])
+  model = Mnist(data_format)
+  logits = model(images, training=training)
+
+  pfor_jacobian = gradients.batch_jacobian(logits, images, use_pfor=True)
+  while_jacobian = gradients.batch_jacobian(logits, images, use_pfor=False)
+  return pfor_jacobian, while_jacobian
+
+
 def create_mnist_per_eg_jacobian(batch_size, data_format, training):
   images = random_ops.random_uniform([batch_size, 28, 28])
   model = Mnist(data_format)
@@ -338,6 +352,7 @@ def create_fc_per_eg_jacobians(batch_size, activation_size, num_layers):
   return jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while
 
 
+@test_util.run_v1_only("b/122612051")
 class GradientsTest(test.TestCase):
 
   def run_and_assert_equal(self, targets1, targets2, atol=1e-4, rtol=1e-4):
@@ -477,9 +492,11 @@ class GradientsTest(test.TestCase):
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
 
   def test_lstm_batch_jacobian(self):
-    pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(8, 4, 2)
+    pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(8, 4, 2,
+                                                               inputs_size=128)
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def test_dynamic_lstm_batch_jacobian(self):
     pfor_jacobian, while_gradients = create_dynamic_lstm_batch_jacobian(8, 4, 3)
     with session.Session() as sess:
@@ -566,7 +583,7 @@ class GradientsBenchmarks(test.Benchmark):
       for _ in range(iters):
         self.evaluate(targets)
       end = time.time()
-    avg_time_ms = 1000 * (end - begin) / iters
+    avg_time_ms = (1000 * (end - begin)) / iters
     self.report_benchmark(iters=iters, wall_time=avg_time_ms, name=name)
     return avg_time_ms
 
@@ -578,7 +595,8 @@ class GradientsBenchmarks(test.Benchmark):
 
   def benchmark_lstm_batch_jacobian(self):
     with ops.Graph().as_default():
-      pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(100, 32, 8)
+      pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(
+          100, 32, 8, inputs_size=128)
       self._run(pfor_jacobian, 100, name="lstm_batch_jacobian_pfor")
       self._run(while_jacobian, 20, name="lstm_batch_jacobian_while")
 
@@ -627,13 +645,26 @@ class GradientsBenchmarks(test.Benchmark):
 
   def benchmark_mnist_per_eg_jacobian(self):
     with ops.Graph().as_default():
-      data_format = ("channels_first"
-                     if test.is_gpu_available() else "channels_last")
+      if test.is_gpu_available():
+        data_format = "channels_first"
+      else:
+        data_format = "channels_last"
       pfor_outputs, while_outputs = create_mnist_per_eg_jacobian(
           16, data_format, training=True)
       self._run(pfor_outputs, 20, name="mnist_per_eg_jacobian_pfor")
       self._run(while_outputs, 20, name="mnist_per_eg_jacobian_while")
 
+  def benchmark_mnist_batch_jacobian(self):
+    with ops.Graph().as_default():
+      if test.is_gpu_available():
+        data_format = "channels_first"
+      else:
+        data_format = "channels_last"
+      pfor_outputs, while_outputs = create_mnist_batch_jacobian(
+          128, data_format, training=True)
+      self._run(pfor_outputs, 20, name="mnist_batch_jacobian_pfor")
+      self._run(while_outputs, 20, name="mnist_batch_jacobian_while")
+
   def benchmark_fc_per_eg_jacobian(self):
     with ops.Graph().as_default():
       jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while = (
diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..db88f4fe0332afe8de312da65b9643a24a056bcb
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -0,0 +1,405 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for vectorization of math kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MathTest(PForTestCase):
+
+  def test_unary_cwise_ops(self):
+    complex_ops = [
+        math_ops.angle,
+        math_ops.imag,
+        math_ops.complex_abs,
+        math_ops.real,
+        math_ops.conj,
+    ]
+    real_ops = [
+        lambda x: math_ops.acosh(1 + math_ops.square(x)),
+        math_ops.abs,
+        math_ops.acos,
+        math_ops.asin,
+        math_ops.asinh,
+        math_ops.atan,
+        math_ops.atanh,
+        math_ops.bessel_i0e,
+        math_ops.bessel_i1e,
+        math_ops.cos,
+        math_ops.cosh,
+        math_ops.digamma,
+        math_ops.erf,
+        math_ops.erfc,
+        math_ops.exp,
+        math_ops.expm1,
+        math_ops.inv,
+        math_ops.is_finite,
+        math_ops.is_inf,
+        math_ops.lgamma,
+        math_ops.log,
+        math_ops.log1p,
+        math_ops.neg,
+        math_ops.negative,
+        math_ops.reciprocal,
+        math_ops.rint,
+        math_ops.round,
+        math_ops.rsqrt,
+        math_ops.sigmoid,
+        math_ops.sign,
+        math_ops.sin,
+        math_ops.sinh,
+        math_ops.sqrt,
+        math_ops.square,
+        math_ops.tan,
+        math_ops.tanh,
+        math_ops.tanh,
+        nn.elu,
+        nn.relu,
+        nn.relu6,
+        nn.selu,
+        nn.softplus,
+        nn.softsign,
+    ]
+    for op in complex_ops + real_ops:
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+        if op in complex_ops:
+          y = random_ops.random_uniform([3, 5])
+          g.watch(y)
+          x = math_ops.complex(x, y)
+
+      # pylint: disable=cell-var-from-loop
+      output_dtypes = []
+      def loop_fn(i):
+        with g:
+          x1 = array_ops.gather(x, i)
+          y1 = op(x1)
+          outputs = [op(x), y1]
+          if y1.dtype == dtypes.float32:
+            loss = math_ops.reduce_sum(y1 * y1)
+          else:
+            loss = None
+        if loss is not None:
+          grad = g.gradient(loss, x1)
+          if grad is not None:
+            outputs.append(grad)
+        del output_dtypes[:]
+        output_dtypes.extend([t.dtype for t in outputs])
+        return outputs
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
+
+  def test_unary_cwise_no_grad(self):
+    for op in [math_ops.ceil,
+               math_ops.floor,
+               math_ops.logical_not]:
+      x = random_ops.random_uniform([3, 5])
+      if op == math_ops.logical_not:
+        x = x > 0
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return op(array_ops.gather(x, i))
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=x.dtype)
+
+  def test_binary_cwise_ops(self):
+    logical_ops = [
+        math_ops.logical_and,
+        math_ops.logical_or,
+        math_ops.logical_xor
+    ]
+
+    # Wrapper functions restricting the range of inputs of zeta and polygamma.
+    def safe_polygamma(x, y):
+      return math_ops.polygamma(
+          math_ops.round(clip_ops.clip_by_value(y, 1, 10)),
+          x * x + 1)
+
+    def safe_zeta(x, y):
+      return math_ops.zeta(x * x + 1, y * y)
+
+    float_ops = [
+        math_ops.add,
+        math_ops.add_v2,
+        math_ops.atan2,
+        math_ops.complex,
+        math_ops.div,
+        math_ops.divide,
+        math_ops.div_no_nan,
+        math_ops.equal,
+        math_ops.floor_div,
+        math_ops.floor_mod,
+        math_ops.greater,
+        math_ops.greater_equal,
+        math_ops.igamma,
+        math_ops.igammac,
+        math_ops.igamma_grad_a,
+        math_ops.less,
+        math_ops.less_equal,
+        math_ops.maximum,
+        math_ops.minimum,
+        math_ops.mod,
+        math_ops.multiply,
+        math_ops.not_equal,
+        math_ops.pow,
+        math_ops.squared_difference,
+        math_ops.subtract,
+        math_ops.truncate_mod,
+        safe_polygamma,
+        safe_zeta,
+    ]
+    for op in logical_ops + float_ops:
+      x = random_ops.random_uniform([7, 3, 5])
+      y = random_ops.random_uniform([3, 5])
+      if op in logical_ops:
+        x = x > 0
+        y = y > 0
+
+      output_dtypes = []
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        x1 = array_ops.gather(x, i)
+        y1 = array_ops.gather(y, i)
+        outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)]
+        del output_dtypes[:]
+        output_dtypes.extend([t.dtype for t in outputs])
+        return outputs
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
+
+  def test_approximate_equal(self):
+    x = random_ops.random_uniform([3, 5])
+    y = random_ops.random_uniform([3, 5])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      y1 = array_ops.gather(y, i)
+      return math_ops.approximate_equal(x1, y1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.bool])
+
+  def test_addn(self):
+    x = random_ops.random_uniform([2, 3, 5])
+    y = random_ops.random_uniform([3, 5])
+    z = random_ops.random_uniform([3, 5])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return math_ops.add_n([x1, y, z])
+
+    self._test_loop_fn(loop_fn, 2)
+
+  def test_matmul(self):
+    for tr_a in (True, False):
+      for tr_b in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (5, 3) if tr_a else (3, 5)
+            if stack_a:
+              shape_a = (2,) + shape_a
+            shape_b = (7, 5) if tr_b else (5, 7)
+            if stack_b:
+              shape_b = (2,) + shape_b
+
+            x = random_ops.random_uniform(shape_a)
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+  def test_batch_matmul(self):
+    for tr_a in (True, False):
+      for tr_b in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (4, 5, 3) if tr_a else (4, 3, 5)
+            if stack_a:
+              shape_a = (2,) + shape_a
+            shape_b = (4, 7, 5) if tr_b else (4, 5, 7)
+            if stack_b:
+              shape_b = (2,) + shape_b
+
+            x = random_ops.random_uniform(shape_a)
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+  def test_reduction(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for op in [
+        math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
+        math_ops.reduce_min
+    ]:
+      for axis in ([1], None, [0, 2]):
+        for keepdims in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return op(a, axis=axis, keepdims=keepdims)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_cum_sum(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for axis in (1, -2):
+      for exclusive in (True, False):
+        for reverse in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return math_ops.cumsum(
+                a, axis=axis, exclusive=exclusive, reverse=reverse)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_cum_prod(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for axis in (1, -2):
+      for exclusive in (True, False):
+        for reverse in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return math_ops.cumprod(
+                a, axis=axis, exclusive=exclusive, reverse=reverse)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_bias_add(self):
+    x_shape = [2, 3, 4, 5, 6]
+    x = random_ops.random_uniform(x_shape)
+    for data_format in ("NCHW", "NHWC"):
+      with backprop.GradientTape(persistent=True) as g:
+        bias_dim = 2 if data_format == "NCHW" else -1
+        bias_shape = x_shape[bias_dim]
+        bias = random_ops.random_uniform([bias_shape])
+        g.watch(bias)
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        with g:
+          a = array_ops.gather(x, i)
+          y = nn.bias_add(a, bias, data_format=data_format)
+          loss = math_ops.reduce_sum(y * y)
+        return y, g.gradient(loss, bias)
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(
+          loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.float32])
+
+  def test_unsorted_segment_sum(self):
+    t = random_ops.random_uniform([3, 3, 2])
+    segment_ids = constant_op.constant([[0, 0, 2], [0, 1, 2], [2, 2, 2]])
+    num_segments = 3
+
+    def loop_fn(i):
+      data = array_ops.gather(t, i)
+      data_0 = array_ops.gather(t, 0)
+      seg_ids = array_ops.gather(segment_ids, i)
+      return (math_ops.unsorted_segment_sum(data, seg_ids, num_segments),
+              math_ops.unsorted_segment_sum(data_0, seg_ids, num_segments))
+
+    self._test_loop_fn(loop_fn, 3, [dtypes.float32] * 2)
+
+  def test_cast(self):
+    x = constant_op.constant([[1], [2]])
+    y = constant_op.constant([[1.0], [2.0]])
+
+    def loop_fn(i):
+      return (math_ops.cast(array_ops.gather(x, i), dtypes.float32),
+              math_ops.cast(array_ops.gather(y, i), dtypes.int32))
+
+    self._test_loop_fn(
+        loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
+
+  def test_tanh_axpy(self):
+    a = constant_op.constant(3.)
+    x = random_ops.random_uniform([4, 5])
+    y = random_ops.random_uniform([6, 5])
+    n = x.shape[0]
+
+    def loop_fn(i):
+      return math_ops.tanh(a * array_ops.gather(x, i) + array_ops.gather(y, i))
+
+    self._test_loop_fn(loop_fn, n)
+
+  def test_select(self):
+    cond = constant_op.constant([True, False])
+    a = random_ops.random_uniform([2, 3, 5])
+    b = random_ops.random_uniform([2, 3, 5])
+    for cond_shape in [2], [2, 3], [2, 3, 5]:
+      cond = random_ops.random_uniform(cond_shape) > 0.5
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        a_i = array_ops.gather(a, i)
+        b_i = array_ops.gather(b, i)
+        cond_i = array_ops.gather(cond, i)
+        return array_ops.where(cond_i, a_i, b_i)
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/parallel_for/test_util.py b/tensorflow/python/ops/parallel_for/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4ef2239e5dc2eb7614d167777821437ae1e812
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/test_util.py
@@ -0,0 +1,59 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class PForTestCase(test.TestCase):
+  """Base class for test cases."""
+
+  def _run_targets(self, targets1, targets2=None, run_init=True):
+    targets1 = nest.flatten(targets1)
+    targets2 = ([] if targets2 is None else nest.flatten(targets2))
+    assert len(targets1) == len(targets2) or not targets2
+    if run_init:
+      init = variables.global_variables_initializer()
+      self.evaluate(init)
+    return self.evaluate(targets1 + targets2)
+
+  def run_and_assert_equal(self, targets1, targets2):
+    outputs = self._run_targets(targets1, targets2)
+    outputs = nest.flatten(outputs)  # flatten SparseTensorValues
+    n = len(outputs) // 2
+    for i in range(n):
+      if outputs[i + n].dtype != np.object:
+        self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5)
+      else:
+        self.assertAllEqual(outputs[i + n], outputs[i])
+
+  def _test_loop_fn(self, loop_fn, iters,
+                    loop_fn_dtypes=dtypes.float32,
+                    parallel_iterations=None):
+    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
+                                    parallel_iterations=parallel_iterations)
+    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
+                                        parallel_iterations=parallel_iterations)
+    self.run_and_assert_equal(t1, t2)
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index 89b8c4a2b305e7cd584d8bc215ae30490572f2e4..e3bdb74cfeb10beadaccb0d9447e08998696b4f1 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -24,10 +24,13 @@ py_library(
     tags = ["nofixdeps"],
     deps = [
         ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_concat_ops",
         ":ragged_conversion_ops",
         ":ragged_dispatch",
         ":ragged_factory_ops",
         ":ragged_functional_ops",
+        ":ragged_gather_ops",
         ":ragged_getitem",
         ":ragged_map_ops",
         ":ragged_math_ops",
@@ -37,6 +40,7 @@ py_library(
         ":ragged_tensor_shape",
         ":ragged_tensor_value",
         ":ragged_util",
+        ":ragged_where_op",
         ":segment_id_ops",
         "//tensorflow/python:util",
     ],
@@ -55,6 +59,7 @@ py_library(
         ":segment_id_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -64,6 +69,45 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_batch_gather_ops",
+    srcs = ["ragged_batch_gather_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_concat_ops",
+        ":ragged_conversion_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_util",
+        ":ragged_where_op",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_concat_ops",
+    srcs = ["ragged_concat_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
 py_library(
     name = "ragged_conversion_ops",
     srcs = ["ragged_conversion_ops.py"],
@@ -115,13 +159,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_gather_ops",
+    srcs = ["ragged_gather_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_array_ops_gen",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
 py_library(
     name = "ragged_getitem",
     srcs = ["ragged_getitem.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
@@ -244,6 +305,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_where_op",
+    srcs = ["ragged_where_op.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_concat_ops",
+        ":ragged_functional_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_library(
     name = "segment_id_ops",
     srcs = ["segment_id_ops.py"],
@@ -412,6 +489,7 @@ py_test(
     deps = [
         ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -430,6 +508,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
         ":ragged_factory_ops",
         ":ragged_tensor",
         ":ragged_test_util",
@@ -448,8 +527,8 @@ py_test(
     srcs = ["ragged_gather_nd_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
@@ -691,7 +770,7 @@ py_test(
     srcs = ["ragged_concat_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
+        ":ragged_concat_ops",
         ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
@@ -710,7 +789,7 @@ py_test(
     srcs = ["ragged_stack_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
+        ":ragged_concat_ops",
         ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:constant_op",
@@ -720,6 +799,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "ragged_rank_op_test",
+    srcs = ["ragged_rank_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "ragged_tile_op_test",
     srcs = ["ragged_tile_op_test.py"],
@@ -772,9 +864,9 @@ py_test(
     srcs = ["ragged_where_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
         ":ragged_test_util",
+        ":ragged_where_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -862,3 +954,18 @@ py_test(
         "@absl_py//absl/testing:parameterized",
     ],
 )
+
+py_test(
+    name = "ragged_size_op_test",
+    srcs = ["ragged_size_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
index 7806f5697852fa69cea46e930fa37a3477c8e380..a5ffd8a950dd07e3ba5c808c49a1e08cea2b6bc1 100644
--- a/tensorflow/python/ops/ragged/__init__.py
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -21,7 +21,7 @@ different lengths.  For example, the inner (column) dimension of
 `rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged, since the column slices
 (`rt[0, :]`, ..., `rt[4, :]`) have different lengths.  For a more detailed
 description of ragged tensors, see the `tf.RaggedTensor` class documentation
-and the [Ragged Tensor Guide](/guides/ragged_tensor).
+and the [Ragged Tensor Guide](/guide/ragged_tensors).
 """
 
 from __future__ import absolute_import
@@ -29,10 +29,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_dispatch
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_getitem
 from tensorflow.python.ops.ragged import ragged_map_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
@@ -41,6 +44,7 @@ from tensorflow.python.ops.ragged import ragged_string_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.ops.ragged import segment_id_ops
 
 # Add a list of the ops that support Ragged Tensors.
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index 8ba8c53212f250dd48e5ac6485000494e9726f38..8c62cc4a7286c13d9c6aaa0da2e5a70d2abf1d32 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -20,11 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
@@ -34,322 +31,6 @@ from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
 
 
-#===============================================================================
-# ragged_gather
-#===============================================================================
-# TODO(edloper): Add an `axis` argument
-def gather(params, indices, validate_indices=None, axis=0, name=None):
-  """Gathers ragged slices from `params` axis `0` according to `indices`.
-
-  Returns `RaggedTensor` output, such that:
-
-  ```python
-  output.shape = indices.shape + params.shape[1:]
-  output.ragged_rank = indices.shape.ndims + params.ragged_rank
-  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-  ```
-
-  `params` may be ragged.  `indices` may be ragged.
-  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
-  then an error is returned.
-
-  Examples:
-
-  ```python
-  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
-  >>> indices = tf.constant([3, 1, 2, 1, 0])
-  >>> ragged_params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-  >>> ragged_indices = tf.ragged.constant([[3, 1, 2], [1], [], [0]])
-
-  >>> print ragged.gather(params, ragged_indices)
-  [['d', 'b', 'c'], ['b'], [], ['a']]
-
-  >>> print ragged.gather(ragged_params, indices)
-  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
-
-  >>> print ragged.gather(ragged_params, ragged_indices)
-  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
-  ```
-
-  Args:
-    params: The potentially ragged tensor from which to gather values. Must be
-      at least rank 1.
-    indices: The potentially ragged tensor indicating which values to gather.
-      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
-      params.shape[0]]`.
-    validate_indices: Ignored.
-    axis: Must be zero.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `RaggedTensor`, where `output.dtype=params.dtype` and
-    `output.shape=indices.shape + params.shape[1:]` and
-    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
-
-  Raises:
-    ValueError: If indices.shape.ndims is not known statically.
-  """
-  del validate_indices
-  if not isinstance(axis, int) or axis != 0:
-    raise ValueError('axis>0 is not supported for ragged gather yet.')
-  with ops.name_scope(name, 'RaggedGather', [params, indices]):
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-
-    if ragged_tensor.is_ragged(indices):
-      return indices.with_values(gather(params, indices.values))
-
-    if not ragged_tensor.is_ragged(params):
-      return array_ops.gather(params, indices)
-
-    indices = ops.convert_to_tensor(indices)
-    if indices.shape.ndims is None:
-      raise ValueError('indices.shape.ndims must be known statically')
-
-    result = gen_ragged_array_ops.ragged_gather(
-        indices=indices,
-        params_dense_values=params.flat_values,
-        params_nested_splits=params.nested_row_splits,
-        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
-        1)
-
-    # Compose the RaggedTensor from splits & values.
-    return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        result.output_dense_values, result.output_nested_splits)
-
-
-#===============================================================================
-# ragged.batch_gather
-#===============================================================================
-def batch_gather(params, indices, name=None):
-  """Gathers slices from `params` according to `indices` with batch dims.
-
-  This operation is similar to `gather`, but it assumes that the leading `N`
-  dimensions of `indices` and `params` are batch dimensions, and performs a
-  gather within each batch.  In particular, when using this operation with `N`
-  batch dimensions `B1...BN`:
-
-  * `indices` has shape `[B1...BN, I]`
-  * `params` has shape `[B1...BN, P1...PM]`.
-  * `result` has shape `[B1...BN, I, P2...PM]`.
-  * `result[b1...bN, i, p2...pM] =
-    params[b1...bN, indices[b1...bN, i], p2...pM]`
-
-  Args:
-    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
-      `M>0`).
-    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
-    name: A name for the operation (optional).
-
-  Returns:
-    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
-    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
-
-  #### Example:
-    ```python
-    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
-    >>> ragged.batch_gather(params, indices)
-    [['b', 'c', 'a'], [], [], ['e', 'e']]
-    ```
-  """
-  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
-    return array_ops.batch_gather(params, indices, name)
-
-  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-    indices_ndims = indices.shape.ndims
-    if indices_ndims is None:
-      raise ValueError(
-          'batch_gather does not allow indices with unknown shape.')
-    if indices_ndims == 0:
-      raise ValueError('indices.rank must be at least 1.')
-
-    if ragged_tensor.is_ragged(indices):
-      # If the outermost ragged dimension is a batch dimension, recurse.
-      if indices_ndims > 2:
-        if not ragged_tensor.is_ragged(params):
-          raise ValueError('batch shape from indices does '
-                           'not match params shape')
-        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
-        with ops.control_dependencies(checks):
-          return ragged_tensor.RaggedTensor.from_row_splits(
-              batch_gather(params.values, indices.values), indices.row_splits)
-
-      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
-      else:
-        # Ensure that `params` is ragged and has at least 2 dimensions.
-        if not ragged_tensor.is_ragged(params):
-          if params.shape.ndims is not None and params.shape.ndims < 2:
-            raise ValueError('batch shape from indices does '
-                             'not match params shape')
-          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
-
-        # Adjust indices from within-batch to global (in params.values), and
-        # then use ragged.gather to gather them.
-        num_indices = indices.row_lengths()
-        params_starts = params.row_starts()
-        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
-        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
-        return ragged_tensor.RaggedTensor.from_row_splits(
-            gather(params.values, adjusted_index_values), indices.row_splits)
-
-    else:  # params is a RaggedTensor and indices is a Tensor.
-      if indices_ndims == 1:
-        return gather(params, indices)
-      elif indices_ndims == 2:
-        # Adjust indices from batch-local to global (in params.values)
-        adjustments = array_ops.expand_dims(params.row_starts(), 1)
-        adjusted_indices = math_ops.to_int64(indices) + adjustments
-        return gather(params.values, adjusted_indices)
-      else:
-        raise ValueError('batch shape from indices does not match params shape')
-
-
-#===============================================================================
-# ragged.gather_nd
-#===============================================================================
-def gather_nd(params, indices, name=None):
-  """Gather slices from `params` using `n`-dimensional indices.
-
-  This operation is similar to `gather`, but it uses the innermost dimension
-  of `indices` to define a slice into `params`.  In particular, if:
-
-  * `indices` has shape `[A1...AN, I]`
-  * `params` has shape `[B1...BM]`
-
-  Then:
-
-  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
-  * `result[a1...aN] = params[indices[a1...aN, :]]`
-
-  Args:
-    params: A potentially ragged tensor with shape `[A1...AN, I]`.
-    indices: A potentially ragged tensor with shape `[B1...BM]`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
-
-  #### Examples:
-    ```python
-    >>> params = tf.ragged.constant_value(
-    ...     [ [ ['000', '001'], ['010'              ]          ],
-    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
-    ...       [ [            ], ['210'              ]          ] ])
-
-    >>> # Gather 2D slices from a 3D tensor
-    >>> ragged.gather_nd(params, [[2], [0]])
-    [ [ [            ], ['210'] ]
-      [ ['000', '001'], ['010'] ] ]
-
-    >>> # Gather 1D slices from a 3D tensor
-    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
-    [['210'], ['000', '001']]
-
-    >>> # Gather scalars from a 3D tensor
-    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
-    ['001', '112']
-    ```
-  """
-  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
-    return array_ops.gather_nd(params, indices, name)
-
-  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
-
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-    indices_shape = indices.shape
-    indices_ndims = indices_shape.ndims
-    if indices_ndims is None:
-      raise ValueError('indices.rank be statically known.')
-    if indices_ndims == 0:
-      raise ValueError('indices.rank must be at least 1.')
-    if (ragged_tensor.is_ragged(indices) and
-        indices_ndims == indices.ragged_rank + 1):
-      raise ValueError('The innermost dimension of indices may not be ragged')
-
-    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
-    # that each index slices into.
-    index_size = tensor_shape.dimension_value(indices_shape[-1])
-    if index_size is None:
-      raise ValueError('indices.shape[-1] must be statically known.')
-
-    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
-    # dense, then we convert it to ragged before recursing, and then convert
-    # the result back to `dense` if appropriate.
-    if indices_ndims > 2:
-      indices_is_dense = not ragged_tensor.is_ragged(indices)
-      if indices_is_dense:
-        indices = ragged_conversion_ops.from_tensor(
-            indices, ragged_rank=indices_ndims - 2)
-      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
-      if (indices_is_dense and ragged_tensor.is_ragged(result) and
-          result.ragged_rank == indices_ndims - 2):
-        result = ragged_conversion_ops.to_tensor(result)
-      return result
-
-    # indices_ndims <= 2, and the innermost dimension of indices may not be
-    # ragged, so `indices` must not be ragged.
-    assert not ragged_tensor.is_ragged(indices)
-    assert ragged_tensor.is_ragged(params)
-
-    # Handle corner case: An empty index tuple selects the entire `params`
-    # value.  So if `index_size` is zero, then tile `params`.
-    if index_size == 0:
-      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
-      for dim in range(indices_ndims - 1):
-        params = expand_dims(params, axis=0)
-      multiples = array_ops.concat([
-          array_ops.shape(indices)[:-1],
-          array_ops.ones([params_ndims], dtypes.int32)
-      ],
-                                   axis=0)
-      return tile(params, multiples)
-
-    # When index_size=1, we can just flatten the index tuples and use gather.
-    elif index_size == 1:
-      flattened_index_tuples = array_ops.reshape(indices, [-1])
-      return gather(params, flattened_index_tuples)
-
-    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
-    # Flatten both the index tuples and the params, such that the flattened
-    # index tuples point to the correct values in the flattened params; and
-    # then use ragged.gather on the flattened index tuples & params.
-    else:
-      indices = math_ops.to_int64(indices)
-
-      # Flatten the outermost 2 dimensions of the index tuples & params.
-      flattened_index_tuples = array_ops.gather(params.row_splits,
-                                                indices[..., 0])
-      flattened_index_tuples += indices[..., 1]
-      flattened_params = params.values
-
-      # Flatten any remaining dimensions.
-      for dim in range(2, index_size):
-        if not ragged_tensor.is_ragged(flattened_params):
-          flattened_index_tuples = array_ops.expand_dims(
-              flattened_index_tuples, axis=1)
-          flattened_index_tuples = array_ops.concat(
-              [flattened_index_tuples, indices[..., dim:]], axis=1)
-          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
-
-        flattened_index_tuples = array_ops.gather(
-            flattened_params.row_starts(), flattened_index_tuples)
-        flattened_index_tuples += indices[..., dim]
-        flattened_params = flattened_params.values
-
-      # Gather using the flattened index tuples and params.
-      return gather(flattened_params, flattened_index_tuples)
-
-
 #===============================================================================
 # Masking
 #===============================================================================
@@ -444,7 +125,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
 
     # Get static rank of mask.
     if mask.shape.ndims is None:
-      raise ValueError('mask.shape.ndims must be kown statically.')
+      raise ValueError('mask.shape.ndims must be known statically.')
     elif mask.shape.ndims == 0:
       raise ValueError('mask cannot be scalar.')
 
@@ -543,260 +224,6 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       return masked_values
 
 
-#===============================================================================
-# Concatenation and Stacking
-#===============================================================================
-def concat(values, axis, name=None):
-  """Concatenates potentially ragged tensors along one dimension.
-
-  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
-  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  concatenation of `[rt[i0...iaxis] for rt in values]`.
-
-  Args:
-    values: A list of potentially ragged tensors.  May not be empty. All
-      `values` must have the same rank and the same dtype; but unlike
-      `tf.concat`, they can have arbitrary shapes.
-    axis: A python integer, indicating the dimension along which to concatenate.
-      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
-        Negative values are supported only if the rank of at least one
-        `values` value is statically known.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A `RaggedTensor` with rank `K`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
-
-  Raises:
-    ValueError: If `values` is empty, if `axis` is out of bounds or if
-      the input tensors have different ranks.
-
-  #### Example:
-    ```python
-    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
-    >>> ragged.concat([t1, t2], axis=0)
-    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
-    >>> ragged.concat([t1, t2], axis=1)
-    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
-    ```
-  """
-  if not isinstance(values, (list, tuple)):
-    values = [values]
-  with ops.name_scope(name, 'RaggedConcat', values):
-    return _ragged_stack_concat_helper(values, axis, stack_values=False)
-
-
-def stack(values, axis=0, name=None):
-  """Stacks potentially ragged tensors along one dimension.
-
-  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
-  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  list `[rt[i0...iaxis] for rt in values]`.
-
-  Args:
-    values: A list of potentially ragged tensors.  May not be empty. All
-      `values` must have the same rank and the same dtype; but unlike
-      `tf.concat`, they can have arbitrary shapes.
-    axis: A python integer, indicating the dimension along which to stack.
-      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
-        Negative values are supported only if the rank of at least one
-        `values` value is statically known.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A `RaggedTensor` with rank `K+1`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
-
-  Raises:
-    ValueError: If `values` is empty, if `axis` is out of bounds or if
-      the input tensors have different ranks.
-
-  #### Example:
-    ```python
-    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
-    >>> ragged.stack([t1, t2], axis=0)
-    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
-    >>> ragged.stack([t1, t2], axis=1)
-    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
-    ```
-  """
-  if not isinstance(values, (list, tuple)):
-    values = [values]
-  with ops.name_scope(name, 'RaggedConcat', values):
-    return _ragged_stack_concat_helper(values, axis, stack_values=True)
-
-
-def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
-  """Helper function to concatenate or stack ragged tensors.
-
-  Args:
-    rt_inputs: A list of RaggedTensors or Tensors to combine.
-    axis: The axis along which to concatenate or stack.
-    stack_values: A boolean -- if true, then stack values; otherwise,
-      concatenate them.
-
-  Returns:
-    A RaggedTensor.
-  Raises:
-    ValueError: If rt_inputs is empty, or if axis is out of range.
-  """
-  # Validate parameters.
-  if not rt_inputs:
-    raise ValueError('rt_inputs may not be empty.')
-
-  # Convert input tensors.
-  rt_inputs = [
-      ragged_tensor.convert_to_tensor_or_ragged_tensor(
-          rt_input, name='rt_input') for rt_input in rt_inputs
-  ]
-
-  # Special case: if there's only one input, then return it as-is.
-  if len(rt_inputs) == 1:
-    if stack_values:
-      return expand_dims(rt_inputs[0], axis=0)
-    else:
-      return rt_inputs[0]
-
-  # Check the rank (number of dimensions) of the input tensors.
-  ndims = None
-  for rt in rt_inputs:
-    if ndims is None:
-      ndims = rt.shape.ndims
-    else:
-      rt.shape.assert_has_rank(ndims)
-
-  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
-  axis = ragged_util.get_positive_axis(axis, out_ndims)
-
-  # If all the inputs are Tensors, and we're combining the final dimension,
-  # then we can delegate to the tf.stack/tf.concat operation, and return a
-  # Tensor.
-  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
-    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
-      if stack_values:
-        return array_ops.stack(rt_inputs, axis)
-      else:
-        return array_ops.concat(rt_inputs, axis)
-
-  # Convert any Tensor inputs to RaggedTensors.  This makes it
-  # possible to concatenate Tensors and RaggedTensors together.
-  for i in range(len(rt_inputs)):
-    if not ragged_tensor.is_ragged(rt_inputs[i]):
-      rt_inputs[i] = ragged_conversion_ops.from_tensor(
-          rt_inputs[i], ragged_rank=1)
-
-  # Convert the input tensors to all have the same ragged_rank.
-  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
-  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
-
-  if axis == 0:
-    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
-  elif axis == 1:
-    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
-  else:  # axis > 1: recurse.
-    values = [rt.values for rt in rt_inputs]
-    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
-    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
-      return ragged_tensor.RaggedTensor.from_row_splits(
-          _ragged_stack_concat_helper(values, axis - 1, stack_values),
-          splits[0][0])
-
-
-def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
-  """Helper function to concatenate or stack ragged tensors along axis 0.
-
-  Args:
-    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
-    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
-      them.
-
-  Returns:
-    A RaggedTensor.
-  """
-  # Concatenate the inner values together.
-  flat_values = [rt.flat_values for rt in rt_inputs]
-  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
-
-  # Concatenate the splits together for each ragged dimension (adjusting
-  # split offsets as necessary).
-  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
-  ragged_rank = rt_inputs[0].ragged_rank
-  concatenated_nested_splits = [
-      _concat_ragged_splits([ns[dim]
-                             for ns in nested_splits])
-      for dim in range(ragged_rank)
-  ]
-
-  # If we are performing a stack operation, then add another splits.
-  if stack_values:
-    stack_lengths = array_ops.stack([_nrows(rt) for rt in rt_inputs])
-    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
-    concatenated_nested_splits.insert(0, stack_splits)
-
-  return ragged_tensor.RaggedTensor.from_nested_row_splits(
-      concatenated_flat_values, concatenated_nested_splits)
-
-
-def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
-  """Helper function to concatenate or stack ragged tensors along axis 1.
-
-  Args:
-    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
-    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
-      them.
-
-  Returns:
-    A RaggedTensor.
-  """
-  num_inputs = len(rt_inputs)
-
-  rt_nrows = _nrows(rt_inputs[0])
-  nrows_msg = 'Input tensors have incompatible shapes.'
-  nrows_checks = [
-      check_ops.assert_equal(_nrows(rt), rt_nrows, message=nrows_msg)
-      for rt in rt_inputs[1:]
-  ]
-
-  with ops.control_dependencies(nrows_checks):
-    # Concatentate the inputs together to put them in a single ragged tensor.
-    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
-
-    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
-    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
-    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
-    #                      ...,
-    #                  rt_inputs[0][M], ..., rt_input[N][M]]
-    # where `N=num_inputs-1` and `M=rt_nrows-1`.
-    row_indices = math_ops.range(rt_nrows * num_inputs)
-    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
-    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
-    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
-    permuted_rt = gather(concatenated_rt, row_permutation)
-
-    if stack_values:
-      # Add a new splits tensor to group together the values.
-      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
-      _copy_row_shape(rt_inputs, stack_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
-                                                        stack_splits)
-    else:
-      # Merge together adjacent rows by dropping the row-split indices that
-      # separate them.
-      concat_splits = permuted_rt.row_splits[::num_inputs]
-      _copy_row_shape(rt_inputs, concat_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
-                                                        concat_splits)
-
-
-def _copy_row_shape(rt_inputs, splits):
-  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
-  for rt in rt_inputs:
-    if rt.shape[0] is not None:
-      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
-
-
 #===============================================================================
 # Tiling
 #===============================================================================
@@ -1058,136 +485,33 @@ def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
 
 
 #===============================================================================
-# ragged.where
+# RaggedTensor Size
 #===============================================================================
-def where(condition, x=None, y=None, name=None):
-  """Return the elements, either from `x` or `y`, depending on the `condition`.
-
-  : If both `x` and `y` are `None`:
-    Returns the coordinates of true elements of `condition`. The coordinates
-    are returned in a 2-D tensor with shape
-    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
-    coordinates of the `i`th true value (in row-major order).
-
-  : If both `x` and `y` are non-`None`:
-    Returns a tensor formed by selecting values from `x` where condition is
-    true, and from `y` when condition is false.  In particular:
 
-    : If `condition`, `x`, and `y` all have the same shape:
 
-      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
-      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+def size(input, out_type=dtypes.int32, name=None):  # pylint: disable=redefined-builtin
+  """Returns the size of a potentially ragged tensor.
 
-    : Otherwise:
-
-      * `condition` must be a vector.
-      * `x` and `y` must have the same number of dimensions.
-      * The outermost dimensions of `condition`, `x`, and `y` must all have the
-        same size.
-      * `result[i] = x[i]` if `condition[i]` is true.
-      * `result[i] = y[i]` if `condition[i]` is false.
+  The size of a ragged tensor is the size of its inner values.
 
   Args:
-    condition: A potentially ragged tensor of type `bool`
-    x: A potentially ragged tensor (optional).
-    y: A potentially ragged tensor (optional).  Must be specified if `x` is
-      specified.  Must have the same rank and type as `x`.
-    name: A name of the operation (optional)
+    input: A potentially ragged `Tensor`.
+    out_type: The numeric output type for the operation.
+    name: A name for the operation (optional).
 
   Returns:
-    : If both `x` and `y` are `None`:
-      A `Tensor` with shape `(num_true, dim_size(condition))`.
-    : Otherwise:
-      A potentially ragged tensor with the same type, rank, and outermost
-      dimension size as `x` and `y`.
-      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+    A Tensor of type `out_type`.
 
-  Raises:
-    ValueError: When exactly one of `x` or `y` is non-`None`; or when
-      `condition`, `x`, and `y` have incompatible shapes.
-
-  #### Examples:
+  #### Example:
     ```python
-    >>> # Coordinates where condition is true.
-    >>> condition = tf.ragged.constant_value(
-    ...     [[True, False, True], [False, True]])
-    >>> ragged.where(condition)
-    [[0, 0], [0, 2], [1, 1]]
-
-    >>> # Elementwise selection between x and y, based on condition.
-    >>> condition = tf.ragged.constant_value(
-    ...     [[True, False, True], [False, True]])
-    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
-    >>> ragged.where(condition, x, y)
-    [['A', 'b', 'C'], ['d', 'E']]
-
-    >>> # Row selection between x and y, based on condition.
-    >>> condition = [True, False]
-    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
-    >>> ragged.where(condition, x, y)
-    [['A', 'B', 'C'], ['d', 'e']]
+    >>> tf.size(tf.ragged.constant([[1, 2], [3]]))
+    3
     ```
   """
-  if (x is None) != (y is None):
-    raise ValueError('x and y must be either both None or both non-None')
-  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
-    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        condition, name='condition')
-    if x is None:
-      return _coordinate_where(condition)
-    else:
-      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
-      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
-      return _elementwise_where(condition, x, y)
-
-
-def _elementwise_where(condition, x, y):
-  """Ragged version of tf.where(condition, x, y)."""
-  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
-  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
-  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
-
-  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
-    return array_ops.where(condition, x, y)
-
-  elif condition_is_ragged and x_is_ragged and y_is_ragged:
-    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
-                                                 y)
-  elif not condition_is_ragged:
-    # Concatenate x and y, and then use `gather` to assemble the selected rows.
-    condition.shape.assert_has_rank(1)
-    x_nrows = _nrows(x)
-    x_and_y = concat([x, y], axis=0)
-    indices = array_ops.where(condition, math_ops.range(x_nrows),
-                              x_nrows + math_ops.range(_nrows(y)))
-    return gather(x_and_y, indices)
-
+  if ragged_tensor.is_ragged(input):
+    return array_ops.size(input.flat_values, out_type=out_type, name=name)
   else:
-    raise ValueError('Input shapes do not match.')
-
-
-def _coordinate_where(condition):
-  """Ragged version of tf.where(condition)."""
-  if not isinstance(condition, ragged_tensor.RaggedTensor):
-    return array_ops.where(condition)
-
-  # The coordinate for each `true` value in condition.values.
-  selected_coords = _coordinate_where(condition.values)
-
-  # Convert the first index in each coordinate to a row index and column index.
-  first_index = selected_coords[:, 0]
-  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
-  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
-  selected_cols = first_index - selected_row_starts
-
-  # Assemble the row & column index with the indices for inner dimensions.
-  return array_ops.concat([
-      array_ops.expand_dims(selected_rows, 1),
-      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
-  ],
-                          axis=1)
+    return array_ops.size(input, out_type=out_type, name=name)
 
 
 #===============================================================================
@@ -1222,3 +546,33 @@ def _nrows(rt_input, out_type=dtypes.int64, name=None):
   else:
     with ops.name_scope(name, 'RaggedNRows', [rt_input]):
       return array_ops.shape(rt_input, out_type=out_type)[0]
+
+
+#===============================================================================
+# ragged.rank
+#===============================================================================
+def rank(input, name=None):  # pylint: disable=redefined-builtin
+  """Returns the rank of a RaggedTensor.
+
+  Returns a 0-D `int32` `Tensor` representing the rank of `input`.
+
+  For example:
+
+  ```python
+  # shape of tensor 't' is [2, None, None]
+  t = tf.ragged.constant([[[1], [2, 2]], [[3, 3, 3], [4, 4, 4, 4]]])
+  tf.rank(t)  # 3
+  ```
+
+  Args:
+    input: A `RaggedTensor`
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `int32`.
+  """
+  with ops.name_scope(name, 'RaggedRank', [input]) as name:
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.rank(input, name)
+
+    return input.ragged_rank + array_ops.rank(input.flat_values)
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
index 431d350db8a5a266113df9a03e39a90643893d79..72692fcef024bd1bf8a45e449e142894e153cc30 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged_array_ops.batch_gather."""
+"""Tests for ragged_batch_gather_ops.batch_gather."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,7 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
@@ -146,7 +146,7 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
               [[[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]]], ragged_rank=2)),
   ])
   def testRaggedBatchGather(self, descr, params, indices, expected):
-    result = ragged_array_ops.batch_gather(params, indices)
+    result = ragged_batch_gather_ops.batch_gather(params, indices)
     self.assertRaggedEqual(result, expected)
 
   def testRaggedBatchGatherUnknownRankError(self):
@@ -159,11 +159,11 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged_array_ops.batch_gather(params, indices)
+      ragged_batch_gather_ops.batch_gather(params, indices)
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged_array_ops.batch_gather(params, ragged_indices)
+      ragged_batch_gather_ops.batch_gather(params, ragged_indices)
 
   @parameterized.parameters(
       [
@@ -208,7 +208,7 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
                                        message=None,
                                        error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged_array_ops.batch_gather(params, indices)
+      ragged_batch_gather_ops.batch_gather(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c57aead9192f657442d8f6c86be267f83317b87
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Batch gather operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+#===============================================================================
+# ragged.batch_gather
+#===============================================================================
+def batch_gather(params, indices, name=None):
+  """Gathers slices from `params` according to `indices` with batch dims.
+
+  This operation is similar to `gather`, but it assumes that the leading `N`
+  dimensions of `indices` and `params` are batch dimensions, and performs a
+  gather within each batch.  In particular, when using this operation with `N`
+  batch dimensions `B1...BN`:
+
+  * `indices` has shape `[B1...BN, I]`
+  * `params` has shape `[B1...BN, P1...PM]`.
+  * `result` has shape `[B1...BN, I, P2...PM]`.
+  * `result[b1...bN, i, p2...pM] =
+    params[b1...bN, indices[b1...bN, i], p2...pM]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
+    >>> tf.batch_gather(params, indices)
+    [['b', 'c', 'a'], [], [], ['e', 'e']]
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.batch_gather(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_ndims = indices.shape.ndims
+    if indices_ndims is None:
+      raise ValueError(
+          'batch_gather does not allow indices with unknown shape.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+
+    if ragged_tensor.is_ragged(indices):
+      # If the outermost ragged dimension is a batch dimension, recurse.
+      if indices_ndims > 2:
+        if not ragged_tensor.is_ragged(params):
+          raise ValueError('batch shape from indices does '
+                           'not match params shape')
+        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
+        with ops.control_dependencies(checks):
+          return ragged_tensor.RaggedTensor.from_row_splits(
+              batch_gather(params.values, indices.values), indices.row_splits)
+
+      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
+      else:
+        # Ensure that `params` is ragged and has at least 2 dimensions.
+        if not ragged_tensor.is_ragged(params):
+          if params.shape.ndims is not None and params.shape.ndims < 2:
+            raise ValueError('batch shape from indices does '
+                             'not match params shape')
+          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
+
+        # Adjust indices from within-batch to global (in params.values), and
+        # then use ragged.gather to gather them.
+        num_indices = indices.row_lengths()
+        params_starts = params.row_starts()
+        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
+        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
+        return ragged_tensor.RaggedTensor.from_row_splits(
+            ragged_gather_ops.gather(params.values, adjusted_index_values),
+            indices.row_splits)
+
+    else:  # params is a RaggedTensor and indices is a Tensor.
+      if indices_ndims == 1:
+        return ragged_gather_ops.gather(params, indices)
+      elif indices_ndims == 2:
+        # Adjust indices from batch-local to global (in params.values)
+        adjustments = array_ops.expand_dims(params.row_starts(), 1)
+        adjusted_indices = math_ops.to_int64(indices) + adjustments
+        return ragged_gather_ops.gather(params.values, adjusted_indices)
+      else:
+        raise ValueError('batch shape from indices does not match params shape')
diff --git a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
index 19f7d216d22e84958743bf771ecd346cd6b55b83..6f5fad13fb4afe9fdc0591dce71b5d33d0f005dd 100644
--- a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
@@ -316,7 +316,7 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
   def testErrors(self):
     if not context.executing_eagerly():
       self.assertRaisesRegexp(ValueError,
-                              r'mask\.shape\.ndims must be kown statically',
+                              r'mask\.shape\.ndims must be known statically',
                               ragged_array_ops.boolean_mask, [[1, 2]],
                               array_ops.placeholder(dtypes.bool))
 
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
index 254afdaa21b489f0c3ea4191b0b02990fd7334cf..62989d3025562db9af4b19d5a2922988591fe521 100644
--- a/tensorflow/python/ops/ragged/ragged_concat_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
@@ -235,7 +235,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
                        expected_ragged_rank=None,
                        expected_shape=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    concatenated = ragged_array_ops.concat(rt_inputs, axis)
+    concatenated = ragged_concat_ops.concat(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
@@ -276,7 +276,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
                       message=None,
                       ragged_ranks=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    self.assertRaisesRegexp(error, message, ragged_array_ops.concat, rt_inputs,
+    self.assertRaisesRegexp(error, message, ragged_concat_ops.concat, rt_inputs,
                             axis)
 
   @parameterized.parameters([
@@ -294,7 +294,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     rt_inputs = [
         array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
     ]
-    concatenated = ragged_array_ops.concat(rt_inputs, axis)
+    concatenated = ragged_concat_ops.concat(rt_inputs, axis)
     with self.assertRaisesRegexp(error, message):
       self.evaluate(concatenated)
 
@@ -307,7 +307,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     ]
     self.assertRaisesRegexp(
         ValueError, r'axis may only be negative if ndims is statically known.',
-        ragged_array_ops.concat, rt_inputs, -1)
+        ragged_concat_ops.concat, rt_inputs, -1)
 
   def testSingleTensorInput(self):
     """Tests ragged_concat with a single tensor input.
@@ -317,7 +317,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     returns that tensor.  This test exercises that path.
     """
     rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
-    concatenated = ragged_array_ops.concat(rt_inputs, 0)
+    concatenated = ragged_concat_ops.concat(rt_inputs, 0)
     self.assertRaggedEqual(concatenated, [[1, 2], [3, 4]])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f86b05e178a98f5c0afa9c201f83bb652ad8deb
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py
@@ -0,0 +1,302 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Concat and stack operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+def concat(values, axis, name=None):
+  """Concatenates potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  concatenation of `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to concatenate.
+      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.concat([t1, t2], axis=0)
+    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
+    >>> ragged.concat([t1, t2], axis=1)
+    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=False)
+
+
+def stack(values, axis=0, name=None):
+  """Stacks potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  list `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to stack.
+      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K+1`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.stack([t1, t2], axis=0)
+    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
+    >>> ragged.stack([t1, t2], axis=1)
+    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=True)
+
+
+def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
+  """Helper function to concatenate or stack ragged tensors.
+
+  Args:
+    rt_inputs: A list of RaggedTensors or Tensors to combine.
+    axis: The axis along which to concatenate or stack.
+    stack_values: A boolean -- if true, then stack values; otherwise,
+      concatenate them.
+
+  Returns:
+    A RaggedTensor.
+  Raises:
+    ValueError: If rt_inputs is empty, or if axis is out of range.
+  """
+  # Validate parameters.
+  if not rt_inputs:
+    raise ValueError('rt_inputs may not be empty.')
+
+  # Convert input tensors.
+  rt_inputs = [
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(
+          rt_input, name='rt_input') for rt_input in rt_inputs
+  ]
+
+  # Special case: if there's only one input, then return it as-is.
+  if len(rt_inputs) == 1:
+    if stack_values:
+      return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
+    else:
+      return rt_inputs[0]
+
+  # Check the rank (number of dimensions) of the input tensors.
+  ndims = None
+  for rt in rt_inputs:
+    if ndims is None:
+      ndims = rt.shape.ndims
+    else:
+      rt.shape.assert_has_rank(ndims)
+
+  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
+  axis = ragged_util.get_positive_axis(axis, out_ndims)
+
+  # If all the inputs are Tensors, and we're combining the final dimension,
+  # then we can delegate to the tf.stack/tf.concat operation, and return a
+  # Tensor.
+  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
+    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
+      if stack_values:
+        return array_ops.stack(rt_inputs, axis)
+      else:
+        return array_ops.concat(rt_inputs, axis)
+
+  # Convert any Tensor inputs to RaggedTensors.  This makes it
+  # possible to concatenate Tensors and RaggedTensors together.
+  for i in range(len(rt_inputs)):
+    if not ragged_tensor.is_ragged(rt_inputs[i]):
+      rt_inputs[i] = ragged_conversion_ops.from_tensor(
+          rt_inputs[i], ragged_rank=1)
+
+  # Convert the input tensors to all have the same ragged_rank.
+  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
+  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
+
+  if axis == 0:
+    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
+  elif axis == 1:
+    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
+  else:  # axis > 1: recurse.
+    values = [rt.values for rt in rt_inputs]
+    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
+    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
+      return ragged_tensor.RaggedTensor.from_row_splits(
+          _ragged_stack_concat_helper(values, axis - 1, stack_values),
+          splits[0][0])
+
+
+def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 0.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  # Concatenate the inner values together.
+  flat_values = [rt.flat_values for rt in rt_inputs]
+  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
+
+  # Concatenate the splits together for each ragged dimension (adjusting
+  # split offsets as necessary).
+  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
+  ragged_rank = rt_inputs[0].ragged_rank
+  concatenated_nested_splits = [
+      _concat_ragged_splits([ns[dim]
+                             for ns in nested_splits])
+      for dim in range(ragged_rank)
+  ]
+
+  # If we are performing a stack operation, then add another splits.
+  if stack_values:
+    stack_lengths = array_ops.stack([rt.nrows() for rt in rt_inputs])
+    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
+    concatenated_nested_splits.insert(0, stack_splits)
+
+  return ragged_tensor.RaggedTensor.from_nested_row_splits(
+      concatenated_flat_values, concatenated_nested_splits)
+
+
+def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 1.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  num_inputs = len(rt_inputs)
+
+  rt_nrows = rt_inputs[0].nrows()
+  nrows_msg = 'Input tensors have incompatible shapes.'
+  nrows_checks = [
+      check_ops.assert_equal(rt.nrows(), rt_nrows, message=nrows_msg)
+      for rt in rt_inputs[1:]
+  ]
+
+  with ops.control_dependencies(nrows_checks):
+    # Concatentate the inputs together to put them in a single ragged tensor.
+    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
+
+    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
+    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
+    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
+    #                      ...,
+    #                  rt_inputs[0][M], ..., rt_input[N][M]]
+    # where `N=num_inputs-1` and `M=rt_nrows-1`.
+    row_indices = math_ops.range(rt_nrows * num_inputs)
+    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
+    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
+    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
+    permuted_rt = ragged_gather_ops.gather(concatenated_rt, row_permutation)
+
+    if stack_values:
+      # Add a new splits tensor to group together the values.
+      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
+      _copy_row_shape(rt_inputs, stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
+                                                        stack_splits)
+    else:
+      # Merge together adjacent rows by dropping the row-split indices that
+      # separate them.
+      concat_splits = permuted_rt.row_splits[::num_inputs]
+      _copy_row_shape(rt_inputs, concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
+                                                        concat_splits)
+
+
+def _copy_row_shape(rt_inputs, splits):
+  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
+  for rt in rt_inputs:
+    if rt.shape[0] is not None:
+      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
+
+
+def _increase_ragged_rank_to(rt_input, ragged_rank):
+  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
+  if ragged_rank > 0:
+    if not ragged_tensor.is_ragged(rt_input):
+      rt_input = ragged_conversion_ops.from_tensor(rt_input)
+    if rt_input.ragged_rank < ragged_rank:
+      rt_input = rt_input.with_values(
+          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
+  return rt_input
+
+
+def _concat_ragged_splits(splits_list):
+  """Concatenates a list of RaggedTensor splits to form a single splits."""
+  pieces = [splits_list[0]]
+  splits_offset = splits_list[0][-1]
+  for splits in splits_list[1:]:
+    pieces.append(splits[1:] + splits_offset)
+    splits_offset += splits[-1]
+  return array_ops.concat(pieces, axis=0)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
index bc64f9cc9ed0c673dc6ba7b921e1a9d7d2a5d376..0c9e6efe8bcee5d4c3d3e9aceda1160709f9c00a 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -21,19 +21,25 @@ from __future__ import print_function
 import collections
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gen_bitwise_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.util import dispatch
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
@@ -282,6 +288,7 @@ _UNARY_ELEMENTWISE_OPS = [
     array_ops.zeros_like,
     array_ops.zeros_like_v2,
     clip_ops.clip_by_value,
+    gen_bitwise_ops.invert,
     math_ops.abs,
     math_ops.acos,
     math_ops.acosh,
@@ -348,6 +355,11 @@ _UNARY_LIST_ELEMENTWISE_OPS = [
 ]
 
 _BINARY_ELEMENTWISE_OPS = [
+    gen_bitwise_ops.bitwise_and,
+    gen_bitwise_ops.bitwise_or,
+    gen_bitwise_ops.bitwise_xor,
+    gen_bitwise_ops.left_shift,
+    gen_bitwise_ops.right_shift,
     math_ops.add,
     math_ops.atan2,
     math_ops.complex,
@@ -392,12 +404,13 @@ _V1_OPS_THAT_DELEGATE_TO_V2_OPS = [
 
 
 def _ragged_gather_v1(params, indices, validate_indices=None, name=None,
-                      axis=0):
-  return ragged_array_ops.gather(
+                      axis=0, batch_dims=0):
+  return ragged_gather_ops.gather(
       params=params,
       indices=indices,
       validate_indices=validate_indices,
       axis=axis,
+      batch_dims=batch_dims,
       name=name)
 
 
@@ -407,19 +420,26 @@ def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None):  # pylint: di
   return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
 
 
+def _ragged_size_v1(input, name=None, out_type=dtypes.int32):  # pylint: disable=redefined-builtin
+  return ragged_array_ops.size(input=input, out_type=out_type, name=name)
+
+
 # (original_op, ragged_op, ragged_args)
 _RAGGED_DISPATCH_OPS = [
-    (array_ops.batch_gather, ragged_array_ops.batch_gather,
+    (array_ops.batch_gather, ragged_batch_gather_ops.batch_gather,
      ['params', 'indices']),
-    (array_ops.concat, ragged_array_ops.concat, ['[values]']),
+    (array_ops.concat, ragged_concat_ops.concat, ['[values]']),
     (array_ops.expand_dims, _ragged_expand_dims_v1, ['input']),
     (array_ops.expand_dims_v2, ragged_array_ops.expand_dims, ['input']),
     (array_ops.gather, _ragged_gather_v1, ['params', 'indices']),
-    (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
-    (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
-    (array_ops.stack, ragged_array_ops.stack, ['[values]']),
+    (array_ops.gather_v2, ragged_gather_ops.gather, ['params', 'indices']),
+    (array_ops.gather_nd, ragged_gather_ops.gather_nd, ['params', 'indices']),
+    (array_ops.rank, ragged_array_ops.rank, ['input']),
+    (array_ops.size, _ragged_size_v1, ['input']),
+    (array_ops.size_v2, ragged_array_ops.size, ['input']),
+    (array_ops.stack, ragged_concat_ops.stack, ['[values]']),
     (array_ops.tile, ragged_array_ops.tile, ['input']),
-    (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
+    (array_ops.where, ragged_where_op.where, ['condition', 'x', 'y']),
     (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum,
      ['data', 'segment_ids']),
     (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod,
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
index 9d70470f05a292e09def389505779b92041f2e99..04ef0d7cd68a8f4e09d424584885c342d23a564c 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gen_bitwise_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import string_ops
@@ -121,9 +122,15 @@ BINARY_BOOL_OPS = [
     math_ops.logical_xor,
 ]
 UNARY_INT_OPS = [
+    gen_bitwise_ops.invert,
     string_ops.unicode_script,
 ]
 BINARY_INT_OPS = [
+    gen_bitwise_ops.bitwise_and,
+    gen_bitwise_ops.bitwise_or,
+    gen_bitwise_ops.bitwise_xor,
+    gen_bitwise_ops.left_shift,
+    gen_bitwise_ops.right_shift,
     math_ops.truncatediv,
     math_ops.truncatemod,
 ]
@@ -676,6 +683,18 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
                   1
           },
           expected=[False, True]),
+      dict(
+          op=array_ops.rank,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=2),
+      dict(
+          op=array_ops.size,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=3),
+      dict(
+          op=array_ops.size_v2,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=3),
   ])
   def testRaggedDispatch(self, op, expected, args=(), kwargs=None):
     if kwargs is None: kwargs = {}
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
index d4bffeb401656b02a48a36eb0383850656506fc4..8e44368d4752ed01410de762b7cbda134ebfaa60 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged_array_ops.gather_nd."""
+"""Tests for ragged_gather_ops.gather_nd."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,8 +26,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -201,7 +201,7 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
           expected=[[b'c', b'd'], [b'a', b'b'], [b'e', b'f']]),
   ])  # pyformat: disable
   def testRaggedGatherNd(self, descr, params, indices, expected):
-    result = ragged_array_ops.gather_nd(params, indices)
+    result = ragged_gather_ops.gather_nd(params, indices)
     self.assertRaggedEqual(result, expected)
 
   def testRaggedGatherNdUnknownRankError(self):
@@ -213,10 +213,10 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
 
     with self.assertRaisesRegexp(ValueError,
                                  'indices.rank be statically known.'):
-      ragged_array_ops.gather_nd(params, indices1)
+      ragged_gather_ops.gather_nd(params, indices1)
     with self.assertRaisesRegexp(
         ValueError, r'indices.shape\[-1\] must be statically known.'):
-      ragged_array_ops.gather_nd(params, indices2)
+      ragged_gather_ops.gather_nd(params, indices2)
 
   @parameterized.parameters([
       dict(
@@ -238,7 +238,7 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
                                     message=None,
                                     error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged_array_ops.gather_nd(params, indices)
+      ragged_gather_ops.gather_nd(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
index 9914b56448868b21058cdb50cda17d63676c4f23..eb64bb4ad1685dc1c9c850c4a9c9ef36e9ffa23f 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -17,7 +17,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -25,8 +24,8 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -41,35 +40,35 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
                                                  ['e']])
     ragged_indices = ragged_factory_ops.constant([[3, 1, 2], [1], [], [0]])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, ragged_indices),
+        ragged_gather_ops.gather(params, ragged_indices),
         [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(ragged_params, indices),
+        ragged_gather_ops.gather(ragged_params, indices),
         [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(ragged_params, ragged_indices),
+        ragged_gather_ops.gather(ragged_params, ragged_indices),
         [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
 
   def testTensorParamsAndTensorIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = [2, 0, 2, 1]
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices), [b'c', b'a', b'c', b'b'])
-    self.assertIsInstance(ragged_array_ops.gather(params, indices), ops.Tensor)
+        ragged_gather_ops.gather(params, indices), [b'c', b'a', b'c', b'b'])
+    self.assertIsInstance(ragged_gather_ops.gather(params, indices), ops.Tensor)
 
   def testRaggedParamsAndTensorIndices(self):
     params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
                                           [], ['g']])
     indices = [2, 0, 2, 1]
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
 
   def testTensorParamsAndRaggedIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
 
   def testRaggedParamsAndRaggedIndices(self):
@@ -77,7 +76,7 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
                                           [], ['g']])
     indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
          [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
          [[]]]                                        #  [p[3]            ]]
@@ -88,14 +87,14 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
                                           [], ['g']])
     indices = 1
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices), [b'c', b'd', b'e'])
+        ragged_gather_ops.gather(params, indices), [b'c', b'd', b'e'])
 
   def test3DRaggedParamsAnd2DTensorIndices(self):
     params = ragged_factory_ops.constant([[['a', 'b'], []],
                                           [['c', 'd'], ['e'], ['f']], [['g']]])
     indices = [[1, 2], [0, 1], [2, 2]]
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
          [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
          [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
@@ -109,7 +108,7 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
         inner_shape=(2,))
     params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[[b'd', b'e'], [b'a', b'g']], []],
          [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
          [[[b'b', b'a']]]])  # pyformat: disable
@@ -121,13 +120,13 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
     ragged_indices = ragged_factory_ops.constant([[0, 3]])
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[1\] = 3 is not in \[0, 3\)'):
-      self.evaluate(ragged_array_ops.gather(tensor_params, ragged_indices))
+      self.evaluate(ragged_gather_ops.gather(tensor_params, ragged_indices))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[2\] = 2 is not in \[0, 2\)'):
-      self.evaluate(ragged_array_ops.gather(ragged_params, tensor_indices))
+      self.evaluate(ragged_gather_ops.gather(ragged_params, tensor_indices))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[1\] = 3 is not in \[0, 2\)'):
-      self.evaluate(ragged_array_ops.gather(ragged_params, ragged_indices))
+      self.evaluate(ragged_gather_ops.gather(ragged_params, ragged_indices))
 
   def testUnknownIndicesRankError(self):
     if context.executing_eagerly():
@@ -137,7 +136,7 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
     indices = array_ops.placeholder_with_default(indices, None)
     self.assertRaisesRegexp(ValueError,
                             r'indices\.shape\.ndims must be known statically',
-                            ragged_array_ops.gather, params, indices)
+                            ragged_gather_ops.gather, params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_ops.py b/tensorflow/python/ops/ragged/ragged_gather_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b49e0e549ff8a3948c335e54a90deb5708d4b7cd
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_ops.py
@@ -0,0 +1,258 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gather operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_ragged_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+#===============================================================================
+# ragged_gather
+#===============================================================================
+# TODO(edloper): Add an `axis` argument
+def gather(params, indices, validate_indices=None, axis=0, batch_dims=0,
+           name=None):
+  """Gathers ragged slices from `params` axis `0` according to `indices`.
+
+  Returns `RaggedTensor` output, such that:
+
+  ```python
+  output.shape = indices.shape + params.shape[1:]
+  output.ragged_rank = indices.shape.ndims + params.ragged_rank
+  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+  ```
+
+  `params` may be ragged.  `indices` may be ragged.
+  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
+  then an error is returned.
+
+  Examples:
+
+  ```python
+  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
+  >>> indices = tf.constant([3, 1, 2, 1, 0])
+  >>> ragged_params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+  >>> ragged_indices = tf.ragged.constant([[3, 1, 2], [1], [], [0]])
+
+  >>> print ragged.gather(params, ragged_indices)
+  [['d', 'b', 'c'], ['b'], [], ['a']]
+
+  >>> print ragged.gather(ragged_params, indices)
+  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
+
+  >>> print ragged.gather(ragged_params, ragged_indices)
+  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
+  ```
+
+  Args:
+    params: The potentially ragged tensor from which to gather values. Must be
+      at least rank 1.
+    indices: The potentially ragged tensor indicating which values to gather.
+      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
+      params.shape[0]]`.
+    validate_indices: Ignored.
+    axis: Must be zero.
+    batch_dims: Must be zero.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor`, where `output.dtype=params.dtype` and
+    `output.shape=indices.shape + params.shape[1:]` and
+    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
+
+  Raises:
+    ValueError: If indices.shape.ndims is not known statically.
+  """
+  del validate_indices
+  if not isinstance(axis, int) or axis != 0:
+    raise ValueError('axis != 0 is not supported for ragged gather yet.')
+  if not isinstance(batch_dims, int) or batch_dims != 0:
+    raise ValueError('batch_dims != 0 is not supported for ragged gather yet.')
+  with ops.name_scope(name, 'RaggedGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+
+    if ragged_tensor.is_ragged(indices):
+      return indices.with_values(gather(params, indices.values))
+
+    if not ragged_tensor.is_ragged(params):
+      return array_ops.gather(params, indices)
+
+    indices = ops.convert_to_tensor(indices)
+    if indices.shape.ndims is None:
+      raise ValueError('indices.shape.ndims must be known statically')
+
+    result = gen_ragged_array_ops.ragged_gather(
+        indices=indices,
+        params_dense_values=params.flat_values,
+        params_nested_splits=params.nested_row_splits,
+        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
+        1)
+
+    # Compose the RaggedTensor from splits & values.
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        result.output_dense_values, result.output_nested_splits)
+
+
+#===============================================================================
+# ragged.gather_nd
+#===============================================================================
+def gather_nd(params, indices, name=None):
+  """Gather slices from `params` using `n`-dimensional indices.
+
+  This operation is similar to `gather`, but it uses the innermost dimension
+  of `indices` to define a slice into `params`.  In particular, if:
+
+  * `indices` has shape `[A1...AN, I]`
+  * `params` has shape `[B1...BM]`
+
+  Then:
+
+  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
+  * `result[a1...aN] = params[indices[a1...aN, :]]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[A1...AN, I]`.
+    indices: A potentially ragged tensor with shape `[B1...BM]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
+
+  #### Examples:
+    ```python
+    >>> params = tf.ragged.constant_value(
+    ...     [ [ ['000', '001'], ['010'              ]          ],
+    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
+    ...       [ [            ], ['210'              ]          ] ])
+
+    >>> # Gather 2D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2], [0]])
+    [ [ [            ], ['210'] ]
+      [ ['000', '001'], ['010'] ] ]
+
+    >>> # Gather 1D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
+    [['210'], ['000', '001']]
+
+    >>> # Gather scalars from a 3D tensor
+    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
+    ['001', '112']
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.gather_nd(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
+
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_shape = indices.shape
+    indices_ndims = indices_shape.ndims
+    if indices_ndims is None:
+      raise ValueError('indices.rank be statically known.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+    if (ragged_tensor.is_ragged(indices) and
+        indices_ndims == indices.ragged_rank + 1):
+      raise ValueError('The innermost dimension of indices may not be ragged')
+
+    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
+    # that each index slices into.
+    index_size = tensor_shape.dimension_value(indices_shape[-1])
+    if index_size is None:
+      raise ValueError('indices.shape[-1] must be statically known.')
+
+    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
+    # dense, then we convert it to ragged before recursing, and then convert
+    # the result back to `dense` if appropriate.
+    if indices_ndims > 2:
+      indices_is_dense = not ragged_tensor.is_ragged(indices)
+      if indices_is_dense:
+        indices = ragged_conversion_ops.from_tensor(
+            indices, ragged_rank=indices_ndims - 2)
+      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
+      if (indices_is_dense and ragged_tensor.is_ragged(result) and
+          result.ragged_rank == indices_ndims - 2):
+        result = ragged_conversion_ops.to_tensor(result)
+      return result
+
+    # indices_ndims <= 2, and the innermost dimension of indices may not be
+    # ragged, so `indices` must not be ragged.
+    assert not ragged_tensor.is_ragged(indices)
+    assert ragged_tensor.is_ragged(params)
+
+    # Handle corner case: An empty index tuple selects the entire `params`
+    # value.  So if `index_size` is zero, then tile `params`.
+    if index_size == 0:
+      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
+      for dim in range(indices_ndims - 1):
+        params = ragged_array_ops.expand_dims(params, axis=0)
+      multiples = array_ops.concat([
+          array_ops.shape(indices)[:-1],
+          array_ops.ones([params_ndims], dtypes.int32)
+      ],
+                                   axis=0)
+      return ragged_array_ops.tile(params, multiples)
+
+    # When index_size=1, we can just flatten the index tuples and use gather.
+    elif index_size == 1:
+      flattened_index_tuples = array_ops.reshape(indices, [-1])
+      return gather(params, flattened_index_tuples)
+
+    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
+    # Flatten both the index tuples and the params, such that the flattened
+    # index tuples point to the correct values in the flattened params; and
+    # then use ragged.gather on the flattened index tuples & params.
+    else:
+      indices = math_ops.to_int64(indices)
+
+      # Flatten the outermost 2 dimensions of the index tuples & params.
+      flattened_index_tuples = array_ops.gather(params.row_splits,
+                                                indices[..., 0])
+      flattened_index_tuples += indices[..., 1]
+      flattened_params = params.values
+
+      # Flatten any remaining dimensions.
+      for dim in range(2, index_size):
+        if not ragged_tensor.is_ragged(flattened_params):
+          flattened_index_tuples = array_ops.expand_dims(
+              flattened_index_tuples, axis=1)
+          flattened_index_tuples = array_ops.concat(
+              [flattened_index_tuples, indices[..., dim:]], axis=1)
+          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
+
+        flattened_index_tuples = array_ops.gather(
+            flattened_params.row_starts(), flattened_index_tuples)
+        flattened_index_tuples += indices[..., dim]
+        flattened_params = flattened_params.values
+
+      # Gather using the flattened index tuples and params.
+      return gather(flattened_params, flattened_index_tuples)
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
index 001a400596597bb0efb9b847184abd54e757f1d5..d01cf67139b397977c30817fa515f5e30050b25b 100644
--- a/tensorflow/python/ops/ragged/ragged_getitem.py
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -18,12 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 
@@ -150,6 +151,27 @@ def _ragged_getitem(rt_input, key_list):
   else:
     starts = rt_input.row_splits[:-1]
     limits = rt_input.row_splits[1:]
+    if context.executing_eagerly():
+      # In python, __getitem__ should throw IndexError for out of bound
+      # indices. This will allow iteration run correctly as python will
+      # translate IndexError into StopIteration for next()/__next__().
+      # Below is an example:
+      #    import tensorflow as tf
+      #    r = tf.ragged.constant([[1., 2.], [3., 4., 5.], [6.]])
+      #    for elem in r:
+      #      print(elem)
+      # In non eager mode, the exception is thrown when session runs
+      # so we don't know if out of bound happens before.
+      # In eager mode, however, it is possible to find out when to
+      # throw out of bound IndexError.
+      # In the following row_key >= len(starts) is checked. In case of
+      # TypeError which happens when row_key is not an integer, the exception
+      # will simply be ignored as it will be processed later anyway.
+      try:
+        if int(row_key) >= len(starts):
+          raise IndexError("Row key {} out of bounds".format(row_key))
+      except (TypeError, ValueError):
+        pass
     row = rt_input.values[starts[row_key]:limits[row_key]]
     return row.__getitem__(inner_keys)
 
@@ -344,7 +366,7 @@ def _build_ragged_tensor_from_value_ranges(starts, limits, step, values):
 
   # Use `ragged_gather` or `array_ops.gather` to collect the values.
   if isinstance(values, ragged_tensor.RaggedTensor):
-    gathered_values = ragged_array_ops.gather(
+    gathered_values = ragged_gather_ops.gather(
         params=values, indices=value_indices.values)
   else:
     gathered_values = array_ops.gather(
diff --git a/tensorflow/python/ops/ragged/ragged_rank_op_test.py b/tensorflow/python/ops/ragged/ragged_rank_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54eee3bc0425852e82858684509838e5812dffde
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_rank_op_test.py
@@ -0,0 +1,89 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.rank op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl.testing import parameterized
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRankOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Rank 0
+      dict(
+          test_input=1,
+          expected_rank=0,
+      ),
+      # Rank 1
+      dict(
+          test_input=[1],
+          expected_rank=1,
+      ),
+      dict(
+          test_input=[1, 2, 3, 4],
+          expected_rank=1,
+      ),
+      # Rank 2
+      dict(
+          test_input=[[1], [2], [3]],
+          expected_rank=2,
+      ),
+      # Rank 3
+      dict(
+          test_input=[[[1], [2, 3]], [[4], [5, 6, 7]]],
+          expected_rank=3,
+      ),
+      # Rank 3, ragged_rank=2
+      dict(
+          test_input=[[[1], [2, 3], [10, 20]],
+                      [[4], [5, 6, 7]]],
+          expected_rank=3,
+          ragged_rank=2,
+      ),
+      # Rank 4, ragged_rank=3 with dimensions: {2, (1, 2), (2), (1, 2)}
+      dict(
+          test_input=[[[[1], [2]]],
+                      [[[3, 4], [5, 6]], [[7, 8], [9, 10]]]],
+          expected_rank=4,
+      ),
+      # Rank 4, ragged_rank=2 with dimensions: {2, (1, 2), (1, 2), 2}
+      dict(
+          test_input=[
+              [[[1, 2]]],
+              [[[5, 6], [7, 8]],
+               [[9, 10], [11, 12]]]],
+          expected_rank=4,
+          ragged_rank=2,
+      ),
+
+  ])
+  def testRaggedRank(self, test_input, expected_rank, ragged_rank=None):
+    test_input = ragged_factory_ops.constant(
+        test_input, ragged_rank=ragged_rank)
+    self.assertAllEqual(ragged_array_ops.rank(
+        test_input), expected_rank)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_size_op_test.py b/tensorflow/python/ops/ragged/ragged_size_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ffed11b13c0bc80dbfc45e1af79a808af3da7d1
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_size_op_test.py
@@ -0,0 +1,48 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.size."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSizeOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      {'size': 1, 'test_input': 1},
+      {'size': 0, 'test_input': []},
+      {'size': 0, 'test_input': [], 'ragged_rank': 1},
+      {'size': 3, 'test_input': [1, 1, 1]},
+      {'size': 3, 'test_input': [[1, 1], [1]]},
+      {'size': 5, 'test_input': [[[1, 1, 1], [1]], [[1]]]},
+      {'size': 6, 'test_input': [[[1, 1], [1, 1]], [[1, 1]]], 'ragged_rank': 1},
+  ])
+  def testRaggedSize(self, test_input, size, ragged_rank=None):
+    input_rt = ragged_factory_ops.constant(test_input, ragged_rank=ragged_rank)
+    self.assertAllEqual(ragged_array_ops.size(input_rt), size)
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
index f9c825168e64a4c9f8f0df572e396ca01dc8de51..e52ad4de20cd8697c7772123627f32d2b980b720 100644
--- a/tensorflow/python/ops/ragged/ragged_stack_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged_array_ops.stack."""
+"""Tests for ragged_concat_ops.stack."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,7 +22,7 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
@@ -33,6 +33,52 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
                         parameterized.TestCase):
 
   @parameterized.parameters(
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],),   # shape=(3, None)
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),   # shape=(3, None)
+          axis=1,
+          expected=[
+              [[b'a00', b'a01']],
+              [[]],
+              [[b'a20', b'a21', b'a22']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),   # shape=(3, None)
+          axis=2,
+          expected=[
+              [[b'a00'], [b'a01']], [],
+              [[b'a20'], [b'a21'], [b'a22']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=-3',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],),   # shape=(3, None)
+          axis=-3,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=-2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),   # shape=(3, None)
+          axis=-2,
+          expected=[
+              [[b'a00', b'a01']],
+              [[]],
+              [[b'a20', b'a21', b'a22']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=-1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),  # shape=(3, None)
+          axis=-1,
+          expected=[
+              [[b'a00'], [b'a01']], [],
+              [[b'a20'], [b'a21'], [b'a22']]]),
       dict(
           descr='Two rank-2 inputs (ragged_rank=1), axis=0',
           rt_inputs=(
@@ -284,7 +330,7 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
         if rrank != 0 else constant_op.constant(rt_input)
         for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
     ]
-    stacked = ragged_array_ops.stack(rt_inputs, axis)
+    stacked = ragged_concat_ops.stack(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
@@ -314,7 +360,7 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
           message='axis=3 out of bounds: expected -3<=axis<3'),
   )
   def testError(self, rt_inputs, axis, error, message):
-    self.assertRaisesRegexp(error, message, ragged_array_ops.stack, rt_inputs,
+    self.assertRaisesRegexp(error, message, ragged_concat_ops.stack, rt_inputs,
                             axis)
 
   def testSingleTensorInput(self):
@@ -325,7 +371,7 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
     equivalent to expand_dims(axis=0).  This test exercises that path.
     """
     rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
-    stacked = ragged_array_ops.stack(rt_inputs, 0)
+    stacked = ragged_concat_ops.stack(rt_inputs, 0)
     self.assertRaggedEqual(stacked, [[[1, 2], [3, 4]]])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
index fd334e6cc713d3cc3e94a84e9f7f7bdc813e0a7b..8fb3c1f44ca2ddf3e83fff93dcd4eae3492adfa7 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.client import session
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -45,7 +46,7 @@ _eval_using_default_session = ops._eval_using_default_session
 
 
 @tf_export("RaggedTensor")
-class RaggedTensor(object):
+class RaggedTensor(composite_tensor.CompositeTensor):
   """Represents a ragged tensor.
 
   A `RaggedTensor` is a tensor with one or more *ragged dimensions*, which are
@@ -1437,6 +1438,53 @@ class RaggedTensor(object):
       values = values.values
     return values
 
+  #=============================================================================
+  # Composite Tensor
+  #=============================================================================
+
+  def _to_components(self):
+    return (self.flat_values,) + self.nested_row_splits
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls.from_nested_row_splits(components[0], components[1:])
+
+  def _shape_invariant_to_components(self, shape=None):
+    ragged_rank = self.ragged_rank
+    flat_values = self.flat_values
+
+    if shape is None:
+      # Default shape invariant
+      value_shape = flat_values.shape[1:]
+      values_shape = tensor_shape.TensorShape([None]).concatenate(value_shape)
+      return ((values_shape, self._row_splits.shape) +
+              tuple(tensor_shape.TensorShape([None])
+                    for i in range(1, ragged_rank)))
+    else:
+      # Explicitly specified shape invariant
+      if shape.ndims is not None and shape.ndims <= ragged_rank:
+        raise ValueError("Shape invariant %s does not have sufficient rank "
+                         "for a RaggedTensor with %d ragged dimensions." %
+                         (shape, self.ragged_rank))
+      if any(tensor_shape.dimension_value(shape[dim]) is not None
+             for dim in range(1, self.ragged_rank + 1)):
+        raise ValueError("Shape invariant dimension size must be None for "
+                         "ragged dimenions.")
+      nrows = tensor_shape.dimension_value(shape[0])
+      value_shape = shape[self.ragged_rank + 1:]
+      values_shape = tensor_shape.TensorShape([None]).concatenate(value_shape)
+      if nrows is None:
+        outer_splits_shape = tensor_shape.TensorShape([None])
+      else:
+        outer_splits_shape = tensor_shape.TensorShape([nrows + 1])
+      return ((values_shape, outer_splits_shape) +
+              tuple(tensor_shape.TensorShape([None])
+                    for i in range(1, ragged_rank)))
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 def is_ragged(value):
   """Returns true if `value` is a ragged tensor or ragged tensor value."""
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
index 89691b015d76dbd35d0a9f5db2f2a0ab431147b1..62b7a6b1bc7890e4776bc101ffaceb70401532ac 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -828,14 +828,14 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
   # pylint: disable=invalid-slice-index
   @parameterized.parameters(
       # Tests for out-of-bound errors
-      (SLICE_BUILDER[5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[-6],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 2],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[3, 0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[-6], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
 
       # Indexing into an inner ragged dimension
       (SLICE_BUILDER[:, 3], ValueError,
@@ -953,14 +953,15 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
        'Cannot index into an inner ragged dimension.'),
 
       # Test for out-of-bounds errors.
-      (SLICE_BUILDER[1, 0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[1, 0], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
       (SLICE_BUILDER[0, 0, 3],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+       (IndexError, ValueError,
+        errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
                                                    message):
@@ -982,10 +983,10 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
-      (SLICE_BUILDER[0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[-1],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[-1], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
                                                    message):
@@ -1207,5 +1208,17 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
       res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
       self.assertAllEqual(res2, [15, 7])
 
+  # Test case for GitHub issue 24679.
+  def testEagerForLoop(self):
+    if not context.executing_eagerly():
+      return
+
+    values = [[1., 2.], [3., 4., 5.], [6.]]
+    r = ragged_factory_ops.constant(values)
+    i = 0
+    for elem in r:
+      self.assertAllEqual(elem, values[i])
+      i += 1
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_where_op.py b/tensorflow/python/ops/ragged/ragged_where_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d60ee49ee8adb2e4b117f9009bd602ab36f84046
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_where_op.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""where operation for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def where(condition, x=None, y=None, name=None):
+  """Return the elements, either from `x` or `y`, depending on the `condition`.
+
+  : If both `x` and `y` are `None`:
+    Returns the coordinates of true elements of `condition`. The coordinates
+    are returned in a 2-D tensor with shape
+    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
+    coordinates of the `i`th true value (in row-major order).
+
+  : If both `x` and `y` are non-`None`:
+    Returns a tensor formed by selecting values from `x` where condition is
+    true, and from `y` when condition is false.  In particular:
+
+    : If `condition`, `x`, and `y` all have the same shape:
+
+      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
+      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+
+    : Otherwise:
+
+      * `condition` must be a vector.
+      * `x` and `y` must have the same number of dimensions.
+      * The outermost dimensions of `condition`, `x`, and `y` must all have the
+        same size.
+      * `result[i] = x[i]` if `condition[i]` is true.
+      * `result[i] = y[i]` if `condition[i]` is false.
+
+  Args:
+    condition: A potentially ragged tensor of type `bool`
+    x: A potentially ragged tensor (optional).
+    y: A potentially ragged tensor (optional).  Must be specified if `x` is
+      specified.  Must have the same rank and type as `x`.
+    name: A name of the operation (optional)
+
+  Returns:
+    : If both `x` and `y` are `None`:
+      A `Tensor` with shape `(num_true, dim_size(condition))`.
+    : Otherwise:
+      A potentially ragged tensor with the same type, rank, and outermost
+      dimension size as `x` and `y`.
+      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+
+  Raises:
+    ValueError: When exactly one of `x` or `y` is non-`None`; or when
+      `condition`, `x`, and `y` have incompatible shapes.
+
+  #### Examples:
+    ```python
+    >>> # Coordinates where condition is true.
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> ragged.where(condition)
+    [[0, 0], [0, 2], [1, 1]]
+
+    >>> # Elementwise selection between x and y, based on condition.
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'b', 'C'], ['d', 'E']]
+
+    >>> # Row selection between x and y, based on condition.
+    >>> condition = [True, False]
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'B', 'C'], ['d', 'e']]
+    ```
+  """
+  if (x is None) != (y is None):
+    raise ValueError('x and y must be either both None or both non-None')
+  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
+    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        condition, name='condition')
+    if x is None:
+      return _coordinate_where(condition)
+    else:
+      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
+      return _elementwise_where(condition, x, y)
+
+
+def _elementwise_where(condition, x, y):
+  """Ragged version of tf.where(condition, x, y)."""
+  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
+  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
+  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
+
+  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
+    return array_ops.where(condition, x, y)
+
+  elif condition_is_ragged and x_is_ragged and y_is_ragged:
+    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
+                                                 y)
+  elif not condition_is_ragged:
+    # Concatenate x and y, and then use `gather` to assemble the selected rows.
+    condition.shape.assert_has_rank(1)
+    x_nrows = _nrows(x)
+    x_and_y = ragged_concat_ops.concat([x, y], axis=0)
+    indices = array_ops.where(condition, math_ops.range(x_nrows),
+                              x_nrows + math_ops.range(_nrows(y)))
+    return ragged_gather_ops.gather(x_and_y, indices)
+
+  else:
+    raise ValueError('Input shapes do not match.')
+
+
+def _coordinate_where(condition):
+  """Ragged version of tf.where(condition)."""
+  if not isinstance(condition, ragged_tensor.RaggedTensor):
+    return array_ops.where(condition)
+
+  # The coordinate for each `true` value in condition.values.
+  selected_coords = _coordinate_where(condition.values)
+
+  # Convert the first index in each coordinate to a row index and column index.
+  first_index = selected_coords[:, 0]
+  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
+  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
+  selected_cols = first_index - selected_row_starts
+
+  # Assemble the row & column index with the indices for inner dimensions.
+  return array_ops.concat([
+      array_ops.expand_dims(selected_rows, 1),
+      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
+  ],
+                          axis=1)
+
+
+def _nrows(rt_input, out_type=dtypes.int64, name=None):
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    return rt_input.nrows(out_type=out_type, name=name)
+  else:
+    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+      return array_ops.shape(rt_input, out_type=out_type)[0]
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
index 3dd95658265de90a71f59ab4ae7c38ad80579cec..e76a04072a5ae0f593a9897105962305a38c39bf 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -17,13 +17,11 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from absl.testing import parameterized
-
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.platform import googletest
 
 
@@ -182,7 +180,7 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
               [[[[], [b'A']]], [[[b'b']]]])),
   ])   # pyformat: disable
   def testRaggedWhere(self, condition, expected, x=None, y=None):
-    result = ragged_array_ops.where(condition, x, y)
+    result = ragged_where_op.where(condition, x, y)
     self.assertRaggedEqual(result, expected)
 
   @parameterized.parameters([
@@ -201,7 +199,7 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
   ])
   def testRaggedWhereErrors(self, condition, error, message, x=None, y=None):
     with self.assertRaisesRegexp(error, message):
-      ragged_array_ops.where(condition, x, y)
+      ragged_where_op.where(condition, x, y)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 6104cfa7ffe74499c465400bce1212fd36fad0a2..c1f11e89a075e06d03af3999d7640b20747f8eeb 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import functools
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import variable_pb2
@@ -36,6 +37,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -56,8 +58,118 @@ def get_resource_handle_data(graph_op):
       compat.as_bytes(handle_data))
 
 
-def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
-  """Creates a variable handle with information to do shape inference."""
+def get_eager_safe_handle_data(handle):
+  """Get the data handle from the Tensor `handle`."""
+  assert isinstance(handle, ops.Tensor)
+
+  if isinstance(handle, ops.EagerTensor):
+    return handle._handle_data  # pylint: disable=protected-access
+  else:
+    return get_resource_handle_data(handle)
+
+
+def _set_handle_shapes_and_types(tensor, handle_data, graph_mode):
+  """Sets the shape inference result HandleData on tensor.
+
+  Args:
+    tensor: A `Tensor` or `EagerTensor`.
+    handle_data: A `CppShapeInferenceResult.HandleData`.
+    graph_mode: A python bool.
+  """
+  tensor._handle_data = handle_data  # pylint: disable=protected-access
+  if not graph_mode:
+    return
+
+  # Not an EagerTensor, so a graph tensor.
+  shapes, types = zip(*[(pair.shape, pair.dtype)
+                        for pair in handle_data.shape_and_type])
+  ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
+  shapes = [[d.size for d in s.dim]
+            if not s.unknown_rank else None for s in shapes]
+  pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+      tensor._op._graph._c_graph,  # pylint: disable=protected-access
+      tensor._as_tf_output(),  # pylint: disable=protected-access
+      shapes, ranks, types)
+
+
+def _combine_handle_data(handle, initial_value):
+  """Concats HandleData from tensors `handle` and `initial_value`.
+
+  Args:
+    handle: A `Tensor` of dtype `resource`.
+    initial_value: A `Tensor`.
+
+  Returns:
+    A `CppShapeInferenceResult.HandleData`.  If `initial_value` has dtype
+    `variant`, the `HandleData` contains the concatenation of the shape_and_type
+    from both `handle` and `initial_value`.
+
+  Raises:
+    RuntimeError: If handle, which was returned by VarHandleOp, either has
+      no handle data, or its len(handle_data.shape_and_type) != 1.
+  """
+  assert handle.dtype == dtypes.resource
+
+  variable_handle_data = get_eager_safe_handle_data(handle)
+
+  if initial_value.dtype != dtypes.variant:
+    return variable_handle_data
+
+  extra_handle_data = get_eager_safe_handle_data(initial_value)
+  if extra_handle_data is not None and extra_handle_data.is_set:
+    if (variable_handle_data is None
+        or not variable_handle_data.is_set
+        or len(variable_handle_data.shape_and_type) != 1):
+      raise RuntimeError(
+          "Expected VarHandleOp to return a length==1 shape_and_type, "
+          "but saw: '%s'" % (variable_handle_data,))
+    variable_handle_data.shape_and_type.extend(
+        extra_handle_data.shape_and_type)
+  return variable_handle_data
+
+
+def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode):
+  """Creates a variable handle with information to do shape inference.
+
+  The shape and dtype are read from `initial_value` and stored in the returned
+  resource tensor's handle data.
+
+  If `initial_value.dtype == tf.variant`, we additionally extract the handle
+  data (if any) from `initial_value` and append it to the `handle_data`.
+  In this case, the returned tensor's handle data is in the form
+
+  ```
+  is_set: true
+  shape_and_type {
+    shape {
+      // initial_value.shape
+    }
+    dtype: DT_VARIANT
+  }
+  shape_and_type {
+    // handle_data(initial_value).shape_and_type[0]
+  }
+  shape_and_type {
+    // handle_data(initial_value).shape_and_type[1]
+  }
+  ...
+  ```
+
+  Ops that read from this tensor, such as `ReadVariableOp` and
+  `AssignVariableOp`, know that `handle_data(handle).shape_and_type[1:]`
+  correspond to the handle data of the variant(s) stored in the Variable.
+
+  Args:
+    initial_value: A `Tensor`.
+    shared_name: A string.
+    name: A string.
+    graph_mode: A python bool.
+
+  Returns:
+    The handle, a `Tensor` of type `resource`.
+  """
+  shape = initial_value.get_shape()
+  dtype = initial_value.dtype.base_dtype
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
   if container is None:
     container = ""
@@ -65,35 +177,38 @@ def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
                                                    shared_name=shared_name,
                                                    name=name,
                                                    container=container)
+
   if graph_mode:
-    handle._handle_data = get_resource_handle_data(handle)  # pylint: disable=protected-access
+    full_handle_data = _combine_handle_data(handle, initial_value)
+    _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+    return handle
+  else:
+    # We do not want two distinct ResourceVariable objects for the same
+    # underlying resource in the runtime.
+    # When in eager mode, explicitly ensure so here. When in graph mode, it's
+    # ensured by always generating different variable names.
+    exists = gen_resource_variable_ops.var_is_initialized_op(handle)
+    if exists:
+      raise ValueError("variable object with name '%s' already created. Use "
+                       "get_variable() if reuse is desired." %
+                       shared_name)
+    with context.graph_mode(), ops.Graph().as_default() as graph:
+      h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
+                                                  shared_name=shared_name,
+                                                  name=name,
+                                                  container=container)
+
+      # Tensor._handle_data contains information for the shape-inference code to
+      # know the shape and dtype of the variable pointed to by a handle. Since
+      # shape inference doesn't run in eager mode we copy this data here for
+      # when the handle is captured by an eager mode function.
+      # pylint: disable=protected-access
+      full_handle_data = _combine_handle_data(h, initial_value)
+      _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+      # pylint: enable=protected-access
+    # Clean up op->graph->op reference cycles.
+    ops.dismantle_graph(graph)
     return handle
-
-  # We do not want two distinct ResourceVariable objects for the same
-  # underlying resource in the runtime.
-  # When in eager mode, explicitly ensure so here. When in graph mode, it's
-  # ensured by always generating different variable names.
-  exists = gen_resource_variable_ops.var_is_initialized_op(handle)
-  if exists:
-    raise ValueError("variable object with name '%s' already created. Use "
-                     "get_variable() if reuse is desired." %
-                     shared_name)
-  with context.graph_mode(), ops.Graph().as_default() as graph:
-    h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
-                                                shared_name=shared_name,
-                                                name=name,
-                                                container=container)
-
-    # Tensor._handle_data contains information for the shape-inference code to
-    # know the shape and dtype of the variable pointed to by a handle. Since
-    # shape inference doesn't run in eager mode we copy this data here for when
-    # the handle is captured by an eager mode function.
-    # pylint: disable=protected-access
-    handle._handle_data = get_resource_handle_data(h)
-    # pylint: enable=protected-access
-  # Clean up op->graph->op reference cycles.
-  ops.dismantle_graph(graph)
-  return handle
 
 
 @contextlib.contextmanager
@@ -160,8 +275,19 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
                                                       name=name)
 
 
-# TODO(apassos) make this be variables.Variable
-class ResourceVariable(variables.RefVariable):
+def _maybe_set_handle_data(dtype, handle, tensor):
+  if dtype == dtypes.variant:
+    # For DT_VARIANT types, the handle's shape_and_type[1:] stores the
+    # variant's handle data.  Extract it.
+    handle_data = get_eager_safe_handle_data(handle)
+    if handle_data.is_set and len(handle_data.shape_and_type) > 1:
+      tensor._handle_data = (  # pylint: disable=protected-access
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
+              is_set=True,
+              shape_and_type=handle_data.shape_and_type[1:]))
+
+
+class ResourceVariable(variables.VariableV1):
   """Variable based on resource handles.
 
   See the [Variables How To](https://tensorflow.org/guide/variables)
@@ -218,19 +344,19 @@ class ResourceVariable(variables.RefVariable):
                initial_value=None,
                trainable=True,
                collections=None,
-               validate_shape=True,
+               validate_shape=True,  # pylint: disable=unused-argument
                caching_device=None,
                name=None,
                dtype=None,
                variable_def=None,
                import_scope=None,
-               constraint=None):
+               constraint=None,
+               distribute_strategy=None):
     """Creates a variable.
 
     Args:
       initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
-        which is the initial value for the Variable. The initial value must have
-        a shape specified unless `validate_shape` is set to False. Can also be a
+        which is the initial value for the Variable. Can also be a
         callable with no argument that returns the initial value when called.
         (Note that initializer functions from init_ops.py must first be bound
          to a shape before being used here.)
@@ -263,6 +389,8 @@ class ResourceVariable(variables.RefVariable):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      distribute_strategy: The tf.distribute.Strategy this variable is being
+        created inside of.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
@@ -274,6 +402,7 @@ class ResourceVariable(variables.RefVariable):
     collections.
     @end_compatibility
     """
+    self._distribute_strategy = distribute_strategy
     if variable_def:
       if initial_value is not None:
         raise ValueError("variable_def and initial_value are mutually "
@@ -287,18 +416,24 @@ class ResourceVariable(variables.RefVariable):
           initial_value=initial_value,
           trainable=trainable,
           collections=collections,
-          validate_shape=validate_shape,
           caching_device=caching_device,
           name=name,
           dtype=dtype,
           constraint=constraint)
 
-  # pylint: disable=unused-argument
+  def __repr__(self):
+    if context.executing_eagerly() and not self._in_graph_mode:
+      return "<tf.Variable '%s' shape=%s dtype=%s, numpy=%s>" % (
+          self.name, self.get_shape(), self.dtype.name,
+          ops.numpy_text(self.read_value(), is_repr=True))
+    else:
+      return "<tf.Variable '%s' shape=%s dtype=%s>" % (
+          self.name, self.get_shape(), self.dtype.name)
+
   def _init_from_args(self,
                       initial_value=None,
                       trainable=True,
                       collections=None,
-                      validate_shape=True,
                       caching_device=None,
                       name=None,
                       dtype=None,
@@ -390,24 +525,27 @@ class ResourceVariable(variables.RefVariable):
         handle_name = ops._name_from_scope_name(name)
         if self._in_graph_mode:
           shared_name = handle_name
+          unique_id = shared_name
         else:
           # When in eager mode use a uid for the shared_name, to prevent
           # accidental sharing.
-          shared_name = "%s_%d" % (handle_name, ops.uid())
+          unique_id = "%s_%d" % (handle_name, ops.uid())
+          shared_name = context.shared_name()
         # Use attr_scope and device(None) to simulate the behavior of
         # colocate_with when the variable we want to colocate with doesn't
         # yet exist.
+        device_context_manager = (
+            ops.device if self._in_graph_mode else ops.NullContextmanager)
         attr = attr_value_pb2.AttrValue(
             list=attr_value_pb2.AttrValue.ListValue(
                 s=[compat.as_bytes("loc:@%s" % handle_name)]))
         with ops.get_default_graph()._attr_scope({"_class": attr}):
-          with ops.name_scope("Initializer"), ops.device(None):
+          with ops.name_scope("Initializer"), device_context_manager(None):
             initial_value = ops.convert_to_tensor(
                 initial_value() if init_from_fn else initial_value,
                 name="initial_value", dtype=dtype)
           self._handle = eager_safe_variable_handle(
-              shape=initial_value.get_shape(),
-              dtype=initial_value.dtype.base_dtype,
+              initial_value=initial_value,
               shared_name=shared_name,
               name=name,
               graph_mode=self._in_graph_mode)
@@ -421,7 +559,7 @@ class ResourceVariable(variables.RefVariable):
               "variable inside a loop or conditional, use a lambda as the "
               "initializer." % name)
         # pylint: enable=protected-access
-        self._unique_id = shared_name
+        self._unique_id = unique_id
         self._initial_value = initial_value if self._in_graph_mode else None
         self._handle_name = handle_name + ":0"
         self._dtype = initial_value.dtype.base_dtype
@@ -433,12 +571,15 @@ class ResourceVariable(variables.RefVariable):
                 gen_resource_variable_ops.var_is_initialized_op(self._handle))
           if initial_value is not None:
             with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+              # pylint: disable=protected-access
               self._initializer_op = (
                   gen_resource_variable_ops.assign_variable_op(
                       self._handle,
-                      self._try_guard_against_uninitialized_dependencies(
+                      variables._try_guard_against_uninitialized_dependencies(
+                          name,
                           initial_value),
                       name=n))
+              # pylint: enable=protected-access
           with ops.name_scope("Read"), ops.colocate_with(self._handle):
             # Manually assign reads to the handle's device to avoid log
             # messages.
@@ -486,7 +627,6 @@ class ResourceVariable(variables.RefVariable):
       # all in graph mode.
       self._handle_deleter = EagerResourceDeleter(
           handle=self._handle, handle_device=self._handle.device)
-    self._cached_shape_as_list = None
 
   def _init_from_proto(self, variable_def, import_scope=None):
     """Initializes from `VariableDef` proto."""
@@ -544,7 +684,6 @@ class ResourceVariable(variables.RefVariable):
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
     self._constraint = None
-    self._cached_shape_as_list = None
 
   @contextlib.contextmanager
   def _assign_dependencies(self):
@@ -579,7 +718,8 @@ class ResourceVariable(variables.RefVariable):
         trainable=self._trainable,
         constraint=self._constraint,
         dtype=self._dtype,
-        name=self._shared_name + "_copy")
+        name=self._shared_name + "_copy",
+        distribute_strategy=self._distribute_strategy)
     memo[self._unique_id] = copied_variable
     return copied_variable
 
@@ -609,12 +749,9 @@ class ResourceVariable(variables.RefVariable):
     return self._shape
 
   def _shape_as_list(self):
-    if self._cached_shape_as_list:
-      return self._cached_shape_as_list
     if self.shape.ndims is None:
       return None
-    self._cached_shape_as_list = [dim.value for dim in self.shape.dims]
-    return self._cached_shape_as_list
+    return [dim.value for dim in self.shape.dims]
 
   def _shape_tuple(self):
     shape = self._shape_as_list()
@@ -674,6 +811,10 @@ class ResourceVariable(variables.RefVariable):
     """The op for this variable."""
     return self._handle.op
 
+  @property
+  def trainable(self):
+    return self._trainable
+
   def eval(self, session=None):
     """Evaluates and returns the value of this variable."""
     if context.executing_eagerly():
@@ -710,22 +851,13 @@ class ResourceVariable(variables.RefVariable):
     return gen_state_ops.resource_count_up_to(self.handle, limit=limit,
                                               T=self.dtype)
 
-  def _set_save_slice_info(self, save_slice_info):
-    """Sets the slice info for this `ResourceVariable`.
-
-    Args:
-      save_slice_info: A `Variable.SaveSliceInfo` object.
-    """
-    self._save_slice_info = save_slice_info
-
-  def _get_save_slice_info(self):
-    return self._save_slice_info
-
   def _read_variable_op(self):
     if self.trainable:
       tape.variable_accessed(self)
     result = gen_resource_variable_ops.read_variable_op(self._handle,
                                                         self._dtype)
+    _maybe_set_handle_data(self._dtype, self._handle, result)
+
     if not context.executing_eagerly():
       # Note that if a control flow context is active the input of the read op
       # might not actually be the handle. This line bypasses it.
@@ -757,6 +889,17 @@ class ResourceVariable(variables.RefVariable):
         tape.variable_accessed(self)
       value = gen_resource_variable_ops.resource_gather(
           self._handle, indices, dtype=self._dtype, name=name)
+
+      if self._dtype == dtypes.variant:
+        # For DT_VARIANT types, the handle's shape_and_type[1:] stores the
+        # variant's handle data.  Extract it.
+        handle_data = get_eager_safe_handle_data(self._handle)
+        if handle_data.is_set and len(handle_data.shape_and_type) > 1:
+          value._handle_data = (  # pylint: disable=protected-access
+              cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
+                  is_set=True,
+                  shape_and_type=handle_data.shape_and_type[1:]))
+
     return array_ops.identity(value)
 
   def to_proto(self, export_scope=None):
@@ -809,10 +952,6 @@ class ResourceVariable(variables.RefVariable):
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
 
-  def _ref(self):
-    """Unsupported."""
-    raise NotImplementedError("ResourceVariable does not implement _ref()")
-
   def set_shape(self, shape):
     """Unsupported."""
     raise NotImplementedError("ResourceVariable does not implement set_shape()")
@@ -920,7 +1059,15 @@ class ResourceVariable(variables.RefVariable):
     return assign_op
 
   def __reduce__(self):
-    return (ResourceVariable, (self.numpy(),))
+    # The implementation mirrors that of __deepcopy__.
+    return functools.partial(
+        ResourceVariable,
+        initial_value=self.numpy(),
+        trainable=self.trainable,
+        name=self._shared_name,
+        dtype=self.dtype,
+        constraint=self.constraint,
+        distribute_strategy=self._distribute_strategy), ()
 
   def scatter_sub(self, sparse_delta, use_locking=False, name=None):
     """Subtracts `IndexedSlices` from this variable.
@@ -985,6 +1132,55 @@ class ResourceVariable(variables.RefVariable):
         self.handle, sparse_delta.indices,
         ops.convert_to_tensor(sparse_delta.values, self.dtype), name=name))
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return self._lazy_read(state_ops.batch_scatter_update(
+        self, sparse_delta.indices, sparse_delta.values,
+        use_locking=use_locking, name=name))
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
@@ -1169,8 +1365,10 @@ class ResourceVariable(variables.RefVariable):
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
-    if dtype is not None and dtype != self.dtype:
-      return NotImplemented
+    if dtype is not None and not dtype.is_compatible_with(self.dtype):
+      raise ValueError(
+          "Incompatible type conversion requested to type {!r} for variable "
+          "of type {!r}".format(dtype.name, self.dtype.name))
     if as_ref:
       return self.read_value().op.inputs[0]
     else:
@@ -1222,6 +1420,12 @@ def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
 
 
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
+ops.register_dense_tensor_like_type(ResourceVariable)
+
+
 class _UnreadVariable(ResourceVariable):
   """Represents a future for a read of a variable.
 
@@ -1270,19 +1474,18 @@ class _UnreadVariable(ResourceVariable):
 
   def _read_variable_op(self):
     with ops.control_dependencies([self._parent_op]):
-      return gen_resource_variable_ops.read_variable_op(self._handle,
-                                                        self._dtype)
+      result = gen_resource_variable_ops.read_variable_op(self._handle,
+                                                          self._dtype)
+      _maybe_set_handle_data(self._dtype, self._handle, result)
+      return result
 
-  def set_shape(self, shape):
-    self._shape = shape
-    self._cached_shape_as_list = None
 
   @property
   def op(self):
     """The op for this variable."""
     return self._parent_op
 
-ops.register_tensor_conversion_function(_UnreadVariable, _dense_var_to_tensor)
+
 ops.register_dense_tensor_like_type(_UnreadVariable)
 
 
@@ -1360,15 +1563,12 @@ class _MixedPrecisionVariable(ResourceVariable):
     with ops.colocate_with(self._handle):
       res = gen_resource_variable_ops.read_variable_op(self._handle,
                                                        self._dtype)
+      _maybe_set_handle_data(self._dtype, self._handle, res)
       if self._read_dtype != self._dtype:
         return math_ops.cast(res, self._read_dtype)
       else:
         return res
 
-  def set_shape(self, shape):
-    self._shape = shape
-    self._cached_shape_as_list = None
-
   @property
   def op(self):
     """The op for this variable."""
@@ -1381,29 +1581,15 @@ class _MixedPrecisionVariable(ResourceVariable):
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
-    dtype = dtype or self.read_dtype
-    if dtype != self.read_dtype or as_ref:
+    if (dtype is not None and
+        not dtype.is_compatible_with(self.read_dtype) or as_ref):
       return NotImplemented
-    else:
-      res = self.value()
-    return res
+    return self.value()
 
   def _should_act_as_resource_variable(self):
     """To pass resource_variable_ops.is_resource_variable check."""
     pass
 
-# Register a conversion function which reads the value of the variable,
-# allowing instances of the class to be used as tensors.
-
-# Note: registering for Variable after ResourceVariable because inheritance will
-# otherwise lead to the wrong behavior.
-ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
-ops.register_tensor_conversion_function(
-    variables.Variable, variables.Variable._TensorConversionFunction)  # pylint: disable=protected-access
-
-# pylint: disable=protected-access
-ops.register_dense_tensor_like_type(ResourceVariable)
-
 
 @ops.RegisterGradient("ReadVariableOp")
 def _ReadGrad(_, grad):
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index a4ec90d36c7f1a0b9c6ba6dba6bce74e452de68a..40c3771f4e3bd961bd9728855be319fd3df817c5 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -63,6 +63,15 @@ _WEIGHTS_VARIABLE_NAME = "kernel"
 ASSERT_LIKE_RNNCELL_ERROR_REGEXP = "is not an RNNCell"
 
 
+def _hasattr(obj, attr_name):
+  try:
+    getattr(obj, attr_name)
+  except AttributeError:
+    return False
+  else:
+    return True
+
+
 def assert_like_rnncell(cell_name, cell):
   """Raises a TypeError if cell is not like an RNNCell.
 
@@ -79,9 +88,9 @@ def assert_like_rnncell(cell_name, cell):
     TypeError: A human-friendly exception.
   """
   conditions = [
-      hasattr(cell, "output_size"),
-      hasattr(cell, "state_size"),
-      hasattr(cell, "get_initial_state") or hasattr(cell, "zero_state"),
+      _hasattr(cell, "output_size"),
+      _hasattr(cell, "state_size"),
+      _hasattr(cell, "get_initial_state") or _hasattr(cell, "zero_state"),
       callable(cell),
   ]
   errors = [
@@ -316,7 +325,7 @@ class RNNCell(base_layer.Layer):
     # zeros, especially when eager execution is enabled.
     state_size = self.state_size
     is_eager = context.executing_eagerly()
-    if is_eager and hasattr(self, "_last_zero_state"):
+    if is_eager and _hasattr(self, "_last_zero_state"):
       (last_state_size, last_batch_size, last_dtype,
        last_output) = getattr(self, "_last_zero_state")
       if (last_batch_size == batch_size and
@@ -1174,7 +1183,7 @@ class DropoutWrapper(RNNCell):
 
     # Set cell, variational_recurrent, seed before running the code below
     self._cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
+    if isinstance(cell, checkpointable.Checkpointable):
       self._track_checkpointable(self._cell, name="cell")
     self._variational_recurrent = variational_recurrent
     self._seed = seed
@@ -1415,7 +1424,7 @@ class ResidualWrapper(RNNCell):
     """
     super(ResidualWrapper, self).__init__()
     self._cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
+    if isinstance(cell, checkpointable.Checkpointable):
       self._track_checkpointable(self._cell, name="cell")
     self._residual_fn = residual_fn
 
@@ -1473,7 +1482,7 @@ class DeviceWrapper(RNNCell):
     """
     super(DeviceWrapper, self).__init__()
     self._cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
+    if isinstance(cell, checkpointable.Checkpointable):
       self._track_checkpointable(self._cell, name="cell")
     self._device = device
 
@@ -1542,7 +1551,7 @@ class MultiRNNCell(RNNCell):
     for cell_number, cell in enumerate(self._cells):
       # Add Checkpointable dependencies on these cells so their variables get
       # saved with this object when using object-based saving.
-      if isinstance(cell, checkpointable.CheckpointableBase):
+      if isinstance(cell, checkpointable.Checkpointable):
         # TODO(allenl): Track down non-Checkpointable callers.
         self._track_checkpointable(cell, name="cell-%d" % (cell_number,))
     self._state_is_tuple = state_is_tuple
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index 2ca9c0c647d14b792b2575c8f977d9dbe39efb4b..bef0a8ea4eda3bc3a7d79b275fccf7fbfb1fc3af 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -278,7 +278,7 @@ def _SparseSoftmaxGrad(op, grad):
       indices, sp_output.values * sp_grad.values, shape)
 
   # [..., B, 1], dense.
-  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keep_dims=True)
+  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keepdims=True)
   # sparse [..., B, C] + dense [..., B, 1] with broadcast; outputs sparse.
   sp_sum = sparse_ops.sparse_dense_cwise_add(sp_grad, sum_reduced)
 
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 097b485a115fb8153f77d0ad24c63b872fb2e8ca..a149d9873016e52164d072ee4cabd98167bfa3dd 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -806,8 +806,8 @@ def sparse_split(keyword_required=KeywordRequired(),
   Graphically the output tensors are:
 
       output_tensor[0] =
-      [    a ]
-      [b c   ]
+      [    a   ]
+      [b c     ]
 
       output_tensor[1] =
       [ d e  ]
@@ -1774,7 +1774,9 @@ def sparse_reset_shape(sp_input, new_shape=None):
     output_shape_tensor = math_ops.cast(output_shape_tensor, dtypes.int64)
     # For cases when shape is known during graph construction, this catches the
     # error before the sparse_tensor.SparseTensor catches it.
-    output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
+    if output_shape_tensor.get_shape().rank is not None:
+      output_shape_tensor.get_shape().dims[0].merge_with(
+          in_shape.get_shape().dims[0])
 
     output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor)
     # For cases where all shapes are known during graph construction
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 94aaebed951a96a4aade8d05d36b3366e59708a5..0224e7ebb8aa6d35d20ab43a303887d325e00441 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -119,6 +119,7 @@ class LBetaTest(test.TestCase):
           special_math_ops.lbeta(x).get_shape())
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_xla('This test never passed for XLA')
   def test_length_1_last_dimension_results_in_one(self):
     # If there is only one coefficient, the formula still works, and we get one
     # as the answer, always.
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 71aaceee272f6e0acd8b8e860fb501eaed4bd61b..be21263f4cbdbdd4a38b0e849e1fec15ba033712 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -432,19 +432,19 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
   `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
   ```
-  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
   ```
 
   For example, say we want to add 4 scattered elements to a rank-1 tensor to
-  8 elements. In Python, that update would look like this:
+  8 elements. In Python, that addition would look like this:
 
   ```python
-      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-      indices = tf.constant([[4], [3], [1] ,[7]])
-      updates = tf.constant([9, 10, 11, 12])
-      add = tf.scatter_nd_add(ref, indices, updates)
-      with tf.Session() as sess:
-        print sess.run(add)
+  ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+  indices = tf.constant([[4], [3], [1], [7]])
+  updates = tf.constant([9, 10, 11, 12])
+  add = tf.scatter_nd_add(ref, indices, updates)
+  with tf.Session() as sess:
+    print sess.run(add)
   ```
 
   The resulting update to ref would look like this:
@@ -464,9 +464,8 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
     updates: A `Tensor`. Must have the same type as `ref`.
       A tensor of updated values to add to ref.
     use_locking: An optional `bool`. Defaults to `False`.
-      An optional bool. Defaults to True. If True, the assignment will
-      be protected by a lock; otherwise the behavior is undefined,
-      but may exhibit less contention.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
     name: A name for the operation (optional).
 
   Returns:
@@ -550,19 +549,19 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
   `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
   ```
-  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
   ```
 
   For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-  to 8 elements. In Python, that update would look like this:
+  with 8 elements. In Python, that update would look like this:
 
   ```python
-      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-      indices = tf.constant([[4], [3], [1] ,[7]])
-      updates = tf.constant([9, 10, 11, 12])
-      op = tf.scatter_nd_sub(ref, indices, updates)
-      with tf.Session() as sess:
-        print sess.run(op)
+  ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+  indices = tf.constant([[4], [3], [1] ,[7]])
+  updates = tf.constant([9, 10, 11, 12])
+  op = tf.scatter_nd_sub(ref, indices, updates)
+  with tf.Session() as sess:
+    print sess.run(op)
   ```
 
   The resulting update to ref would look like this:
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..155ad969f67073f151fbdc295570e85af5dc22b1
--- /dev/null
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -0,0 +1,241 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for generating random numbers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_stateful_random_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import \
+tracking
+from tensorflow.python.util.tf_export import tf_export
+
+# A seed for random ops (stateful and stateless) will always be 1024
+# bits, all of which will be sent to the C++ code. The actual C++
+# implementation of some algorithms may only use a lower part of the bits.
+
+MAX_INT64 = 2**63 - 1
+MIN_INT64 = -(2**63)
+UINT64_SPAN = 2**64
+# 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained in
+# b/111604096 and cl/171681867), so I use signed int here. I choose int64
+# instead of int32 here because `VarHandleOp` doesn't support int32 on GPU.
+SEED_TYPE = "int64"
+SEED_MIN = MIN_INT64
+SEED_MAX = MAX_INT64
+SEED_UINT_SPAN = UINT64_SPAN
+SEED_TYPE_BITS = 64
+SEED_BIT_MASK = 0xFFFFFFFFFFFFFFFF
+SEED_SIZE = 16  # in units of SEED_TYPE
+
+
+STATE_TYPE = SEED_TYPE
+RNG_ALG_PHILOX = 1
+DEFAULT_ALGORITHM = RNG_ALG_PHILOX
+
+
+def non_deterministic_seed():
+  """Makes a non-deterministic seed.
+
+  The implementation will be changed soon from pure Python to an op.
+
+  Returns:
+    a 1-D tensor.
+  """
+  return np.random.randint(
+      low=SEED_MIN, high=SEED_MAX + 1, size=SEED_SIZE,
+      dtype=SEED_TYPE)
+
+
+def _uint_to_int(n):
+  if n > SEED_MAX:
+    n = n - SEED_UINT_SPAN
+  return n
+
+
+PHILOX_STATE_SIZE = 3
+
+
+def _make_philox_state(seed):
+  """Makes a RNG state for Philox algorithm.
+
+  Args:
+    seed: an integer or 1-D tensor.
+
+  Returns:
+    a 1-D tensor.
+  """
+  int_types = (int,) if sys.version_info >= (3, 0) else (int, long)
+  if isinstance(seed, int_types):
+    # chop the Python integer (infinite precision) into chunks of SEED_TYPE
+    ls = []
+    for _ in range(PHILOX_STATE_SIZE):
+      ls.append(seed & SEED_BIT_MASK)
+      seed >>= SEED_TYPE_BITS
+    seed = ls
+  # to avoid overflow error from np.asarray
+  seed = list(map(_uint_to_int, seed))
+  seed = np.asarray(seed, dtype=STATE_TYPE)
+  if len(seed.shape) != 1:
+    raise ValueError(
+        "seed should only have one dimension; got shape: %s" % seed.shape)
+  seed = seed[0:PHILOX_STATE_SIZE]
+  # Padding with zeros on the right if too short
+  seed_size = seed.shape[0]
+  if seed_size < PHILOX_STATE_SIZE:
+    seed = np.pad(
+        seed, [(0, PHILOX_STATE_SIZE - seed_size)],
+        mode="constant",
+        constant_values=0)
+  assert seed.shape == (PHILOX_STATE_SIZE,), "Wrong seed.shape: %s" % seed.shape
+  return seed
+
+
+def _make_state_from_seed(seed, algorithm):
+  if algorithm == RNG_ALG_PHILOX:
+    return _make_philox_state(seed)
+  else:
+    raise ValueError("Unsupported algorithm id: %s" % algorithm)
+
+
+def create_rng_state(seed, algorithm=None):
+  """Creates a RNG state.
+
+  Args:
+    seed: an integer or 1-D tensor.
+    algorithm: (optional) an integer representing the RNG algorithm. If None, an
+      algorithm will be auto-selected.
+
+  Returns:
+    a 1-D tensor "rng_state" with:
+    * rng_state[0] is a value that identifies the RNG algorithm;
+    * rng_state[1:] holds the RNG state itself (size dependent on the
+        algorithm).
+  """
+  if algorithm is None:
+    # TODO(wangpeng): more sophisticated algorithm selection
+    algorithm = DEFAULT_ALGORITHM
+  state = _make_state_from_seed(seed, algorithm)
+  return np.concatenate((np.array([algorithm], dtype=STATE_TYPE), state),
+                        axis=None)
+
+
+def _shape_tensor(shape):
+  """Convert to an int32 or int64 tensor, defaulting to int64 if empty."""
+  if isinstance(shape, (tuple, list)) and not shape:
+    dtype = dtypes.int64
+  else:
+    dtype = None
+  return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
+
+
+@tf_export("random.experimental.Generator")
+class Generator(tracking.AutoCheckpointable):
+  """Random-number generator.
+
+  It uses Variable to manage its internal state.
+  """
+
+  def __init__(self, copy_from=None, seed=None, algorithm=None):
+    if copy_from is None:
+      if seed is None:
+        seed = non_deterministic_seed()
+      state = create_rng_state(seed, algorithm)
+      self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+    else:
+      assert seed is None
+      state = copy_from.state
+      self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+
+  def reset(self, seed):
+    algorithm = int(self.algorithm)
+    state = create_rng_state(seed, algorithm)
+    self._state_var.assign(state)
+
+  @property
+  def state(self):
+    return self._state_var
+
+  @property
+  def algorithm(self):
+    return self._state_var[0]
+
+  # The following functions return a tensor and as a side effect update
+  # self._state_var.
+  def standard_normal(self, shape, dtype=dtypes.float32):
+    return gen_stateful_random_ops.stateful_standard_normal(
+        self.state.handle, shape, dtype)
+
+  def normal(self, shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
+             name=None):
+    with ops.name_scope(name, "stateful_normal", [shape, mean, stddev]) as name:
+      shape = _shape_tensor(shape)
+      mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
+      stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
+      rnd = self.standard_normal(shape, dtype)
+      return math_ops.add(rnd * stddev, mean, name=name)
+
+  # TODO(wangpeng): implement other distributions (`uniform`,
+  #   `truncated_normal`, etc.)
+  # TODO(wangpeng): implement `make_seeds`
+  # TODO(wangpeng): implement `make_generators`
+
+
+# It's not safe to create TF ops before `init_google` is called, so this is
+# initialized to None and get a value the first time `get_global_generator` is
+# called.
+global_generator = None
+
+
+@tf_export("random.experimental.get_global_generator")
+def get_global_generator():
+  global global_generator
+  if global_generator is None:
+    global_generator = Generator()
+  return global_generator
+
+
+@tf_export("random.experimental.set_global_generator")
+def set_global_generator(generator):
+  global global_generator
+  global_generator = generator
+
+
+# This function creates a new Generator object (and the Variable object within),
+# which does not work well with tf.function because (1) tf.function puts
+# restrictions on Variable creation thus reset_global_generator can't be freely
+# used inside tf.function; (2) redirecting a global variable to
+# a new object is problematic with tf.function because the old object may be
+# captured by a 'tf.function'ed function and still be used by it.
+# A 'tf.function'ed function only keeps weak references to variables,
+# so deleting a variable and then calling that function again may raise an
+# error, as demonstrated by
+# random_test.py/RandomTest.testResetGlobalGeneratorBadWithDefun .
+# The function 'set_global_generator' below also has this problem.
+@tf_export("random.experimental.reset_global_generator")
+def reset_global_generator(seed, algorithm=None):
+  global global_generator
+  if algorithm is None:
+    algorithm = int(global_generator.algorithm)  # preserve the old algorithm
+  global_generator = Generator(seed=seed, algorithm=algorithm)
diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..92419a0770e302acce205cf9c2c37023da72434c
--- /dev/null
+++ b/tensorflow/python/ops/stateful_random_ops_test.py
@@ -0,0 +1,156 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateful_random_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_random_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import stateful_random_ops as \
+random
+from tensorflow.python.platform import test
+
+
+class StatefulRandomOpsTest(test.TestCase):
+
+  def testCreateRNGStateIntSeed(self):
+    """Tests `create_rng_state` when `seed` is int."""
+    # using leading 'F' to test overflow tolerance
+    state = random.create_rng_state(0xFFFF222233334444FFAA666677778888,
+                                    random.RNG_ALG_PHILOX)
+    self.assertAllEqual(
+        list(map(random._uint_to_int,
+                 [random.RNG_ALG_PHILOX, 0xFFAA666677778888,
+                  0xFFFF222233334444] + [0] * (random.PHILOX_STATE_SIZE - 2))),
+        state)
+
+  @test_util.run_v2_only
+  @test_util.also_run_as_tf_function
+  def testEagerAndDefun(self):
+    """A simple test to make sure the op works in eager and defunned mode."""
+    random.get_global_generator().normal((3,))
+
+  @test_util.run_v2_only
+  def testOpSeedSelectionAfterSetSeed(self):
+    """Tests that op-seed selection is reset after reseting global generator.
+
+    Fixing GitHub issue 9171:
+    https://github.com/tensorflow/tensorflow/issues/9171
+    """
+    shape = (3,)
+    random.get_global_generator().reset(1)
+    a = random.get_global_generator().normal(shape)
+    random.get_global_generator().reset(1)
+    b = random.get_global_generator().normal(shape)
+    self.assertAllEqual(a, b)
+
+    # Now do the above again using accelerated ('defun'ed) computation
+    @def_function.function
+    def f():
+      return random.get_global_generator().normal(shape)
+
+    random.get_global_generator().reset(1)
+    c = f()
+    random.get_global_generator().reset(1)
+    d = f()
+    self.assertAllEqual(c, d)
+    self.assertAllEqual(a, c)
+
+  @test_util.run_v2_only
+  def testOpSeedSelectionNotSensitive(self):
+    """Test that op-seed selection is not sensitive to trivial changes.
+
+    Test that op-seed selection is not sensitive to trivial computation
+    (i.e. graph) changes.
+
+    Fixing b/32087099
+    """
+    def f(include_print):
+      shape = constant_op.constant([5])
+      if include_print:
+        shape = logging_ops.Print(shape, [shape])
+      return random.get_global_generator().normal(shape)
+
+    def compare(fst_includes_print, snd_includes_print):
+      random.get_global_generator().reset(50)
+      fst = f(fst_includes_print)
+      random.get_global_generator().reset(50)
+      snd = f(snd_includes_print)
+      self.assertAllEqual(fst, snd)
+      # Now do the above again using accelerated (defunned) 'f'.
+      # Running 'f' with two different Boolean arguments should cause
+      # two different graphs to be generated, hence demonstrating the
+      # insensitivity to graph changes.
+      f_acc = def_function.function(f)
+      random.get_global_generator().reset(50)
+      fst = f_acc(fst_includes_print)
+      random.get_global_generator().reset(50)
+      snd = f_acc(snd_includes_print)
+      self.assertAllEqual(fst, snd)
+
+    compare(False, False)
+    compare(True, True)
+    compare(True, False)
+
+  @test_util.run_v2_only
+  def testSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py .
+    """
+    seed1, seed2 = 50, 60
+    # note how the two seeds for the old op correspond to the seed for the new
+    # op
+    random.get_global_generator().reset([0, seed2, seed1])
+    shape = constant_op.constant([2, 3])
+    dtype = dtypes.float32
+    # create a graph for the old op in order to call it many times
+    @def_function.function
+    def old():
+      return gen_random_ops.random_standard_normal(
+          shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    def new():
+      return random.get_global_generator().standard_normal(shape, dtype=dtype)
+
+    for _ in range(100):
+      self.assertAllEqual(old(), new())
+
+  @test_util.run_v2_only
+  def testResetGlobalGeneratorBadWithDefun(self):
+    """Demonstrates that reset_global_generator don't work properly with defun.
+    """
+    shape = (3,)
+
+    @def_function.function
+    def f():
+      return random.get_global_generator().normal(shape)
+
+    random.reset_global_generator(50)
+    with self.assertRaisesWithPredicateMatch(
+        errors_impl.NotFoundError, "Resource .+ does not exist"):
+      a = f()
+      random.reset_global_generator(50)
+      b = f()
+      self.assertAllEqual(a, b)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 7c2d3be338766a4e25a817f824e06c665059bc01..e02175d6feaf1dd74c560bec9e7815cbe20eddc5 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -232,7 +232,7 @@ def _skip_common_stack_elements(stacktrace, base_case):
   return stacktrace[-1:]
 
 
-class Template(checkpointable.CheckpointableBase):
+class Template(checkpointable.Checkpointable):
   """Wrap a function to aid in variable sharing.
 
   Templates are functions that create variables the first time they are called
@@ -387,8 +387,11 @@ class Template(checkpointable.CheckpointableBase):
     """Returns the variable scope name created by this Template."""
     if self._variable_scope:
       name = self._variable_scope.name
-      # To prevent partial matches on the scope_name, we add '/' at the end.
-      return name if name[-1] == "/" else name + "/"
+      if not name or name[-1] == "/":
+        return name
+      else:
+        # To prevent partial matches on the scope_name, we add '/' at the end.
+        return name + "/"
 
   @property
   def variables(self):
@@ -646,29 +649,6 @@ class EagerTemplate(Template):
         with self._template_store.as_default():
           return self._call_func(args, kwargs)
 
-  @property
-  def name(self):
-    """Returns the name given to this Template."""
-    return self._name
-
-  @property
-  def func(self):
-    """Returns the func given to this Template."""
-    return self._func
-
-  @property
-  def variable_scope(self):
-    """Returns the variable scope object created by this Template."""
-    return self._variable_scope
-
-  @property
-  def variable_scope_name(self):
-    """Returns the variable scope name created by this Template."""
-    if self._variable_scope:
-      name = self._variable_scope.name
-      # To prevent partial matches on the scope_name, we add '/' at the end.
-      return name if name[-1] == "/" else name + "/"
-
   @property
   def variables(self):
     """Returns the list of variables created by the Template."""
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 85333ee6b561c2c593eed3b12caff419eb7c1c84..41a481430fe37abdca84b5121a3e516ebe45a418 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -114,6 +114,7 @@ class _GraphTensorArray(object):
 
     if clear_after_read is None:
       clear_after_read = True
+    self._dynamic_size = None
     dynamic_size = dynamic_size or False
 
     self._dtype = dtype
@@ -221,7 +222,9 @@ class _GraphTensorArray(object):
     """See TensorArray."""
     flow = array_ops.identity(self._flow)
     ta = TensorArray(
-        dtype=self._dtype, handle=self._handle, flow=flow,
+        dtype=self._dtype,
+        handle=self._handle,
+        flow=flow,
         infer_shape=self._infer_shape,
         colocate_with_first_write_call=self._colocate_with_first_write_call)
     ta._element_shape = self._element_shape
@@ -278,7 +281,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -349,7 +354,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -378,7 +385,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -448,7 +457,7 @@ class _GraphTensorArrayV2(object):
     del tensor_array_name
     del colocate_with_first_write_call
 
-    del dynamic_size  # TODO(b/117943489): Unused for now.
+    self._dynamic_size = dynamic_size
 
     if (flow is not None and
         (not isinstance(flow, ops.Tensor) or flow.dtype != dtypes.variant)):
@@ -525,10 +534,7 @@ class _GraphTensorArrayV2(object):
   def identity(self):
     """See TensorArray."""
     flow = array_ops.identity(self._flow)
-    ta = TensorArray(
-        dtype=self._dtype, flow=flow, infer_shape=self._infer_shape)
-    ta._element_shape = self._element_shape
-    return ta
+    return build_ta_with_new_flow(self, flow)
 
   def grad(self, source, flow=None, name=None):
     """Not supported."""
@@ -553,11 +559,12 @@ class _GraphTensorArrayV2(object):
       if self._infer_shape:
         self._merge_element_shape(value.shape)
       flow_out = list_ops.tensor_list_set_item(
-          input_handle=self._flow, index=index, item=value, name=name)
-      ta = TensorArray(dtype=self._dtype, handle=None, flow=flow_out)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      return ta
+          input_handle=self._flow,
+          index=index,
+          item=value,
+          resize_if_index_out_of_bounds=self._dynamic_size,
+          name=name)
+      return build_ta_with_new_flow(self, flow_out)
 
   def stack(self, name=None):
     """See TensorArray."""
@@ -581,10 +588,16 @@ class _GraphTensorArrayV2(object):
 
   def concat(self, name=None):
     """See TensorArray."""
-    value = list_ops.tensor_list_concat(
-        input_handle=self._flow, element_dtype=self._dtype, name=name)
     if self._element_shape and self._element_shape[0].dims is not None:
-      value.set_shape([None] + self._element_shape[0].dims[1:])
+      element_shape = [None] + self._element_shape[0].dims[1:]
+    else:
+      element_shape = None
+
+    value = list_ops.tensor_list_concat(
+        input_handle=self._flow,
+        element_dtype=self._dtype,
+        element_shape=element_shape,
+        name=name)
     return value
 
   @tf_should_use.should_use_result
@@ -596,15 +609,7 @@ class _GraphTensorArrayV2(object):
         self._merge_element_shape(value.shape[1:])
       flow_out = list_ops.tensor_list_from_tensor(
           tensor=value, element_shape=value.shape[1:])
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
   def scatter(self, indices, value, name=None):
@@ -614,17 +619,10 @@ class _GraphTensorArrayV2(object):
       value = ops.convert_to_tensor(value, name="value")
       if self._infer_shape and not context.executing_eagerly():
         self._merge_element_shape(value.shape[1:])
+      element_shape = self._element_shape[0] if self._element_shape else None
       flow_out = list_ops.tensor_list_scatter(
-          tensor=value, indices=indices, element_shape=-1)
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+          tensor=value, indices=indices, element_shape=element_shape)
+      return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
   def split(self, value, lengths, name=None):
@@ -644,15 +642,7 @@ class _GraphTensorArrayV2(object):
           lengths=lengths_64,
           element_shape=self._element_shape[0] if self._element_shape else None,
           name=name)
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   def size(self, name=None):
     """See TensorArray."""
@@ -858,7 +848,8 @@ class _EagerTensorArray(object):
     if self._tensor_array:
       for ix in range(len(self._tensor_array)):
         self._maybe_zero(ix)
-    return array_ops.stack(self._tensor_array, name=name)
+    return ops.convert_to_tensor(
+        self._tensor_array, name=name, dtype=self._dtype)
 
   def gather(self, indices, name=None):
     """See TensorArray."""
@@ -1008,7 +999,7 @@ class TensorArray(object):
     if context.executing_eagerly():
       implementation = _EagerTensorArray
     else:
-      if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+      if control_flow_util.EnableControlFlowV2(ops.get_default_graph()):
         implementation = _GraphTensorArrayV2
       else:
         implementation = _GraphTensorArray
@@ -1042,6 +1033,10 @@ class TensorArray(object):
     """The reference to the TensorArray."""
     return self._implementation.handle
 
+  @property
+  def _dynamic_size(self):
+    return self._implementation._dynamic_size
+
   @property
   def _infer_shape(self):
     return self._implementation._infer_shape
@@ -1227,8 +1222,10 @@ class TensorArray(object):
 
 
 def build_ta_with_new_flow(old_ta, flow):
+  """Builds a TensorArray with a new `flow` tensor."""
   ta = TensorArray(
       dtype=old_ta.dtype,
+      dynamic_size=old_ta._dynamic_size,
       handle=old_ta.handle,
       flow=flow,
       infer_shape=old_ta._infer_shape,
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index ccce9e2f93bac26a69d8cadab9ece4cc2482c4e1..35c00778ae5c99cb5688c9ff1fa97b26c72dc855 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -842,8 +842,11 @@ class _VariableStore(object):
         if isinstance(var, resource_variable_ops.ResourceVariable):
           raise ValueError(err_msg)
         tb = var.op.traceback[::-1]
-        # Throw away internal tf entries and only take a few lines.
-        tb = [x for x in tb if "tensorflow/python" not in x[0]][:3]
+        # Throw away internal tf entries and only take a few lines. In some
+        # cases the traceback can be longer (e.g. if someone uses factory
+        # functions to create variables) so we take more than needed in the
+        # default case.
+        tb = [x for x in tb if "tensorflow/python" not in x[0]][:5]
         raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(
             traceback.format_list(tb))))
       found_var = self._vars[name]
@@ -2480,12 +2483,13 @@ def default_variable_creator(next_creator=None, **kwargs):
     use_resource = _DEFAULT_USE_RESOURCE
   use_resource = use_resource or context.executing_eagerly()
   if use_resource:
+    distribute_strategy = kwargs.get("distribute_strategy", None)
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
         caching_device=caching_device, name=name, dtype=dtype,
         constraint=constraint, variable_def=variable_def,
-        import_scope=import_scope)
+        import_scope=import_scope, distribute_strategy=distribute_strategy)
   else:
     return variables.RefVariable(
         initial_value=initial_value, trainable=trainable,
@@ -2507,6 +2511,7 @@ def default_variable_creator_v2(next_creator=None, **kwargs):
   dtype = kwargs.get("dtype", None)
   import_scope = kwargs.get("import_scope", None)
   constraint = kwargs.get("constraint", None)
+  distribute_strategy = kwargs.get("distribute_strategy", None)
 
   # Set trainable value based on synchronization value.
   synchronization = kwargs.get("synchronization", VariableSynchronization.AUTO)
@@ -2517,7 +2522,7 @@ def default_variable_creator_v2(next_creator=None, **kwargs):
       initial_value=initial_value, trainable=trainable,
       validate_shape=validate_shape, caching_device=caching_device,
       name=name, dtype=dtype, constraint=constraint, variable_def=variable_def,
-      import_scope=import_scope)
+      import_scope=import_scope, distribute_strategy=distribute_strategy)
 
 
 variables.default_variable_creator = default_variable_creator
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index d01b95666b3241b7c9e9a4caf3b6d6c375ff19fe..236a1cd537faf792b79ec334130b81b7a41ee372 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -59,21 +59,6 @@ def _make_getter(captured_getter, captured_previous):
   return getter
 
 
-def _has_cycle(op, path):
-  """Detect cycles in the dependencies of `initial_value`."""
-  if op.name in path:
-    return True
-  path.add(op.name)
-  for op_input in op.inputs:
-    if _has_cycle(op_input.op, path):
-      return True
-  for op_control_input in op.control_inputs:
-    if _has_cycle(op_control_input, path):
-      return True
-  path.remove(op.name)
-  return False
-
-
 @tf_export("VariableSynchronization")
 class VariableSynchronization(enum.Enum):
   """Indicates when a distributed variable will be synced.
@@ -219,7 +204,7 @@ class VariableMetaclass(type):
 
 @tf_export("Variable", v1=[])
 class Variable(six.with_metaclass(VariableMetaclass,
-                                  checkpointable.CheckpointableBase)):
+                                  checkpointable.Checkpointable)):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
 
   A variable maintains state in the graph across calls to `run()`. You add a
@@ -319,8 +304,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing adding `use_resource=True` when constructing the variable will
+  Here, adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
+
   ```
   v = tf.Variable(True, use_resource=True)
   tf.cond(v, lambda: v.assign(False), my_false_fn)
@@ -1031,6 +1017,10 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """Alias of `Variable.shape`."""
     return self.shape
 
+  def _gather_saveables_for_checkpoint(self):
+    """For implementing `Checkpointable`. This object is saveable on its own."""
+    return {checkpointable.VARIABLE_VALUE_KEY: self}
+
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
 
@@ -1049,6 +1039,17 @@ class Variable(six.with_metaclass(VariableMetaclass,
     return RefVariable(variable_def=variable_def,
                        import_scope=import_scope)
 
+  def _set_save_slice_info(self, save_slice_info):
+    """Sets the slice info for this `Variable`.
+
+    Args:
+      save_slice_info: A `Variable.SaveSliceInfo` object.
+    """
+    self._save_slice_info = save_slice_info
+
+  def _get_save_slice_info(self):
+    return self._save_slice_info
+
   class SaveSliceInfo(object):
     """Information on how to save this Variable as a slice.
 
@@ -1134,6 +1135,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
         return None
 
 
+Variable._OverloadAllOperators()  # pylint: disable=protected-access
+
+
 @tf_export(v1=["Variable"])
 class VariableV1(Variable):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
@@ -1235,7 +1239,7 @@ class VariableV1(Variable):
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing adding `use_resource=True` when constructing the variable will
+  Here, adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
   ```
   v = tf.Variable(True, use_resource=True)
@@ -1572,7 +1576,8 @@ class RefVariable(VariableV1):
         # using their initialized_value() method.
         self._initializer_op = state_ops.assign(
             self._variable,
-            self._try_guard_against_uninitialized_dependencies(
+            _try_guard_against_uninitialized_dependencies(
+                name,
                 self._initial_value),
             validate_shape=validate_shape).op
 
@@ -2151,134 +2156,6 @@ class RefVariable(VariableV1):
     else:
       return v.value()
 
-  def _gather_saveables_for_checkpoint(self):
-    """For implementing `Checkpointable`. This object is saveable on its own."""
-    return {checkpointable.VARIABLE_VALUE_KEY: self}
-
-  def _try_guard_against_uninitialized_dependencies(self, initial_value):
-    """Attempt to guard against dependencies on uninitialized variables.
-
-    Replace references to variables in `initial_value` with references to the
-    variable's initialized values. The initialized values are essentially
-    conditional TensorFlow graphs that return a variable's value if it is
-    initialized or its `initial_value` if it hasn't been initialized. This
-    replacement is done on a best effort basis:
-
-    - If the `initial_value` graph contains cycles, we don't do any
-      replacements for that graph.
-    - If the variables that `initial_value` depends on are not present in the
-      `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
-
-    In these cases, it is up to the caller to ensure that the `initial_value`
-    graph uses initialized variables or that they guard access to variables
-    using their `initialized_value` method.
-
-    Args:
-      initial_value: `Tensor`. The initial value.
-    Returns:
-      A `Tensor` suitable to initialize a variable.
-    Raises:
-      TypeError: If `initial_value` is not a `Tensor`.
-    """
-    if not isinstance(initial_value, ops.Tensor):
-      raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
-
-    # Don't modify initial_value if it contains any cyclic dependencies.
-    if _has_cycle(initial_value.op, path=set()):
-      return initial_value
-
-    return self._safe_initial_value_from_tensor(initial_value, op_cache={})
-
-  def _safe_initial_value_from_tensor(self, tensor, op_cache):
-    """Replace dependencies on variables with their initialized values.
-
-    Args:
-      tensor: A `Tensor`. The tensor to replace.
-      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
-        the results so as to avoid creating redundant operations.
-    Returns:
-      A `Tensor` compatible with `tensor`. Any inputs that lead to variable
-      values will be replaced with a corresponding graph that uses the
-      variable's initialized values. This is done on a best-effort basis. If no
-      modifications need to be made then `tensor` will be returned unchanged.
-    """
-    op = tensor.op
-    new_op = op_cache.get(op.name)
-    if new_op is None:
-      new_op = self._safe_initial_value_from_op(op, op_cache)
-      op_cache[op.name] = new_op
-    return new_op.outputs[tensor.value_index]
-
-  def _safe_initial_value_from_op(self, op, op_cache):
-    """Replace dependencies on variables with their initialized values.
-
-    Args:
-      op: An `Operation`. The operation to replace.
-      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
-        the results so as to avoid creating redundant operations.
-    Returns:
-      An `Operation` compatible with `op`. Any inputs that lead to variable
-      values will be replaced with a corresponding graph that uses the
-      variable's initialized values. This is done on a best-effort basis. If no
-      modifications need to be made then `op` will be returned unchanged.
-    """
-    op_type = op.node_def.op
-    if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
-                   "ReadVariableOp"):
-      return op
-
-    # Attempt to find the initialized_value of any variable reference / handles.
-    # TODO(b/70206927): Fix handling of ResourceVariables.
-    if op_type in ("Variable", "VariableV2", "VarHandleOp"):
-      initialized_value = self._find_initialized_value_for_variable(op)
-      return op if initialized_value is None else initialized_value.op
-
-    # Recursively build initializer expressions for inputs.
-    modified = False
-    new_op_inputs = []
-    for op_input in op.inputs:
-      new_op_input = self._safe_initial_value_from_tensor(op_input, op_cache)
-      new_op_inputs.append(new_op_input)
-      modified = modified or (new_op_input != op_input)
-
-    # If at least one input was modified, replace the op.
-    if modified:
-      new_op_type = op_type
-      if new_op_type == "RefSwitch":
-        new_op_type = "Switch"
-      new_op_name = op.node_def.name + "_" + self.name
-      new_op_name = new_op_name.replace(":", "_")
-      return self.graph.create_op(
-          new_op_type, new_op_inputs,
-          op._output_types,  # pylint: disable=protected-access
-          name=new_op_name, attrs=op.node_def.attr)
-
-    return op
-
-  def _find_initialized_value_for_variable(self, variable_op):
-    """Find the initialized value for a variable op.
-
-    To do so, lookup the variable op in the variables collection.
-
-    Args:
-      variable_op: A variable `Operation`.
-    Returns:
-      A `Tensor` representing the initialized value for the variable or `None`
-      if the initialized value could not be found.
-    """
-    try:
-      var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
-      for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
-                              ops.GraphKeys.LOCAL_VARIABLES):
-        for var in self.graph.get_collection(collection_name):
-          if var.name in var_names:
-            return var.initialized_value()
-    except AttributeError:
-      # Return None when an incomplete user-defined variable type was put in
-      # the collection.
-      return None
-    return None
-
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Variable class higher priority than an ndarray, or a
@@ -2318,6 +2195,11 @@ class RefVariable(VariableV1):
     """The `Graph` of this variable."""
     return self._variable.graph
 
+  @property
+  def _distribute_strategy(self):
+    """The `tf.distribute.Strategy` that this variable was created under."""
+    return None   # Ref variables are never created inside a strategy.
+
   @property
   def shape(self):
     """The `TensorShape` of this variable.
@@ -2414,16 +2296,150 @@ class RefVariable(VariableV1):
         " if you want a new python Tensor object.", 1)
     return self ** other
 
-  def _set_save_slice_info(self, save_slice_info):
-    """Sets the slice info for this `Variable`.
 
-    Args:
-      save_slice_info: A `Variable.SaveSliceInfo` object.
-    """
-    self._save_slice_info = save_slice_info
+def _try_guard_against_uninitialized_dependencies(name, initial_value):
+  """Attempt to guard against dependencies on uninitialized variables.
 
-  def _get_save_slice_info(self):
-    return self._save_slice_info
+  Replace references to variables in `initial_value` with references to the
+  variable's initialized values. The initialized values are essentially
+  conditional TensorFlow graphs that return a variable's value if it is
+  initialized or its `initial_value` if it hasn't been initialized. This
+  replacement is done on a best effort basis:
+
+  - If the `initial_value` graph contains cycles, we don't do any
+    replacements for that graph.
+  - If the variables that `initial_value` depends on are not present in the
+    `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
+
+  In these cases, it is up to the caller to ensure that the `initial_value`
+  graph uses initialized variables or that they guard access to variables
+  using their `initialized_value` method.
+
+  Args:
+    name: Variable name.
+    initial_value: `Tensor`. The initial value.
+  Returns:
+    A `Tensor` suitable to initialize a variable.
+  Raises:
+    TypeError: If `initial_value` is not a `Tensor`.
+  """
+  if not isinstance(initial_value, ops.Tensor):
+    raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
+
+  # Don't modify initial_value if it contains any cyclic dependencies.
+  if _has_cycle(initial_value.op, path=set()):
+    return initial_value
+  return _safe_initial_value_from_tensor(name, initial_value, op_cache={})
+
+
+def _has_cycle(op, path):
+  """Detect cycles in the dependencies of `initial_value`."""
+  if op.name in path:
+    return True
+  path.add(op.name)
+  for op_input in op.inputs:
+    if _has_cycle(op_input.op, path):
+      return True
+  for op_control_input in op.control_inputs:
+    if _has_cycle(op_control_input, path):
+      return True
+  path.remove(op.name)
+  return False
+
+
+def _safe_initial_value_from_tensor(name, tensor, op_cache):
+  """Replace dependencies on variables with their initialized values.
+
+  Args:
+    name: Variable name.
+    tensor: A `Tensor`. The tensor to replace.
+    op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+      the results so as to avoid creating redundant operations.
+  Returns:
+    A `Tensor` compatible with `tensor`. Any inputs that lead to variable
+    values will be replaced with a corresponding graph that uses the
+    variable's initialized values. This is done on a best-effort basis. If no
+    modifications need to be made then `tensor` will be returned unchanged.
+  """
+  op = tensor.op
+  new_op = op_cache.get(op.name)
+  if new_op is None:
+    new_op = _safe_initial_value_from_op(name, op, op_cache)
+    op_cache[op.name] = new_op
+  return new_op.outputs[tensor.value_index]
+
+
+def _safe_initial_value_from_op(name, op, op_cache):
+  """Replace dependencies on variables with their initialized values.
+
+  Args:
+    name: Variable name.
+    op: An `Operation`. The operation to replace.
+    op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+      the results so as to avoid creating redundant operations.
+  Returns:
+    An `Operation` compatible with `op`. Any inputs that lead to variable
+    values will be replaced with a corresponding graph that uses the
+    variable's initialized values. This is done on a best-effort basis. If no
+    modifications need to be made then `op` will be returned unchanged.
+  """
+  op_type = op.node_def.op
+  if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
+                 "ReadVariableOp"):
+    return op
+
+  # Attempt to find the initialized_value of any variable reference / handles.
+  # TODO(b/70206927): Fix handling of ResourceVariables.
+  if op_type in ("Variable", "VariableV2", "VarHandleOp"):
+    initialized_value = _find_initialized_value_for_variable(op)
+    return op if initialized_value is None else initialized_value.op
+
+  # Recursively build initializer expressions for inputs.
+  modified = False
+  new_op_inputs = []
+  for op_input in op.inputs:
+    new_op_input = _safe_initial_value_from_tensor(name, op_input, op_cache)
+    new_op_inputs.append(new_op_input)
+    modified = modified or (new_op_input != op_input)
+
+  # If at least one input was modified, replace the op.
+  if modified:
+    new_op_type = op_type
+    if new_op_type == "RefSwitch":
+      new_op_type = "Switch"
+    new_op_name = op.node_def.name + "_" + name
+    new_op_name = new_op_name.replace(":", "_")
+    return op.graph.create_op(
+        new_op_type, new_op_inputs,
+        op._output_types,  # pylint: disable=protected-access
+        name=new_op_name, attrs=op.node_def.attr)
+
+  return op
+
+
+def _find_initialized_value_for_variable(variable_op):
+  """Find the initialized value for a variable op.
+
+  To do so, lookup the variable op in the variables collection.
+
+  Args:
+    variable_op: A variable `Operation`.
+  Returns:
+    A `Tensor` representing the initialized value for the variable or `None`
+    if the initialized value could not be found.
+  """
+  try:
+    var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
+    for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
+                            ops.GraphKeys.LOCAL_VARIABLES):
+      for var in variable_op.graph.get_collection(collection_name):
+        if var.name in var_names:
+          return var.initialized_value()
+  except AttributeError:
+    # Return None when an incomplete user-defined variable type was put in
+    # the collection.
+    return None
+  return None
 
 
 class PartitionedVariable(object):
@@ -2576,6 +2592,12 @@ class PartitionedVariable(object):
   def shape(self):
     return self.get_shape()
 
+  @property
+  def _distribute_strategy(self):
+    """The `tf.distribute.Strategy` that this variable was created under."""
+    # NOTE(yuefengz): Today, no partitioned variables in a distribute strategy.
+    return None
+
   def get_shape(self):
     return self._shape
 
@@ -2638,6 +2660,15 @@ class PartitionedVariable(object):
       return assign_list
     return [assign.op for assign in assign_list]
 
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+ops.register_tensor_conversion_function(
+    RefVariable,
+    RefVariable._TensorConversionFunction)  # pylint: disable=protected-access
+ops.register_dense_tensor_like_type(RefVariable)
+
+
 @tf_export(v1=["global_variables"])
 def global_variables(scope=None):
   """Returns global variables.
@@ -2961,12 +2992,7 @@ def report_uninitialized_variables(var_list=None,
         # uninitialized variables.
         return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
-# pylint: disable=protected-access
-Variable._OverloadAllOperators()
 
 ops.register_tensor_conversion_function(
-    PartitionedVariable, PartitionedVariable._TensorConversionFunction)
-# pylint: enable=protected-access
-
-
-ops.register_dense_tensor_like_type(Variable)
+    PartitionedVariable,
+    PartitionedVariable._TensorConversionFunction)  # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 3e5a8fcdfac5c7134112ff14f0a59664d2deb207..0e427d3c6ab10517524429957e70fcfa332991af 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -57,6 +58,7 @@ def while_loop(cond,
                body,
                loop_vars,
                shape_invariants=None,
+               parallel_iterations=10,
                maximum_iterations=None,
                name=None,
                return_same_structure=True):
@@ -115,12 +117,15 @@ def while_loop(cond,
             loop_counter < maximum_iterations,
             cond(*_pack_sequence_as(orig_loop_vars, args)))
 
+    # NOTE(skyewm): we set collections to the outer graph's collections for
+    # compatibility with TPUEstimator.
     cond_graph = func_graph_module.func_graph_from_py_func(
         cond_name,
         wrapped_cond,
         loop_vars, {},
         signature=_build_signature(loop_vars, shape_invariants),
-        func_graph=util.WhileCondFuncGraph(cond_name),
+        func_graph=util.WhileCondFuncGraph(
+            cond_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
 
     # Add external_captures of cond to the list of loop vars.
@@ -171,7 +176,8 @@ def while_loop(cond,
         wrapped_body,
         loop_vars, {},
         signature=_build_signature(loop_vars, shape_invariants),
-        func_graph=util.WhileBodyFuncGraph(body_name),
+        func_graph=util.WhileBodyFuncGraph(
+            body_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
     # Add external captures of body to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
@@ -211,6 +217,7 @@ def while_loop(cond,
         util.create_new_tf_function(cond_graph),
         util.create_new_tf_function(body_graph),
         output_shapes=[t.shape for t in body_graph.outputs],
+        parallel_iterations=parallel_iterations,
         name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
@@ -242,33 +249,25 @@ def while_loop(cond,
 @ops.RegisterGradient("While")
 def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of a While op produced by while_loop."""
-  cond_graph = _get_graph(op, "cond")
-  body_graph = _get_graph(op, "body")
+  # Note that op is not always the same as while_op because the gradient tape,
+  # for eager mode compatibility, forgets information about the proper op. Since
+  # the loop cannot run in eager mode, however, we can safely introspect into
+  # the graph here.
+  while_op = op.outputs[0].op
+  cond_graph = _get_graph(while_op, "cond")
+  body_graph = _get_graph(while_op, "body")
   orig_num_params = len(body_graph.outputs)
 
   maximum_iterations = op.get_attr(
       "_maximum_iterations") if _is_in_xla_context() else None
+  parallel_iterations = op.get_attr("parallel_iterations")
   assert not _is_in_xla_context() or maximum_iterations is not None
+  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
 
-  # Set the incoming gradient of non-trainable inputs to None. It is possible
-  # that we receive non-None gradients for non-trainable types in nested while
-  # loops because we accumulate outputs of the inner while as variant tensors
-  # which are trainable and hence receive zeros_like tensors in the gradient
-  # pass. The non-trainable tensors then receive the popped zeros tensor from
-  # this zeros variant. The gradient for the loop vars corresponding to these
-  # tensors is None or zeros (this happens only if the loop var is accumulated
-  # as well) in _grad_fn so we reset these.
-  # TODO(b/118712257): Remove the IsTrainable filter once we can handle None
-  # output grads in _grad_fn.
-  grads = [
-      None if not _is_trainable(output) else grad
-      for grad, output in zip(grads, body_graph.outputs)
-  ]
+  grads = [_preprocess_grad(grad, body_out, while_out)
+           for grad, body_out, while_out
+           in zip(grads, body_graph.outputs, while_op.outputs)]
 
-  # Ensure that all non-resource trainable outputs have incoming gradients.
-  assert all(g is not None or o.dtype == dtypes.resource or not _is_trainable(o)
-             for o, g in zip(body_graph.outputs, grads)
-            ), "All trainable loop vars must receive incoming gradients."
   # We compute the gradient for the sub-graph between trainable ys and xs
   # with non-None incoming gradients. We later pad the None's to the list of
   # outputs.
@@ -291,16 +290,17 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
     new_inputs = body_grad_graph.empty_tensor_lists
     new_outputs = body_graph.outputs[orig_num_params:]
 
-    op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
-    op._set_func_attr("body", util.create_new_tf_function(body_graph))
-    op._set_type_list_attr("T", body_graph.output_types)
-    op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
-    op._add_while_inputs(new_inputs)
-    op._add_outputs([t.dtype for t in new_outputs],
-                    [t.shape for t in new_outputs])
+    while_op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
+    while_op._set_func_attr("body", util.create_new_tf_function(body_graph))
+    while_op._set_type_list_attr("T", body_graph.output_types)
+    while_op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
+    while_op._add_while_inputs(new_inputs)
+    while_op._add_outputs([t.dtype for t in new_outputs],
+                          [t.shape for t in new_outputs])
     _copy_handle_data(new_outputs, op.outputs[orig_num_params:])
 
-  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph, op)
+  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph,
+                                           while_op)
   loop_vars = args + captured_inputs
 
   def grad_cond(counter, max_iters, *unused_args):
@@ -318,7 +318,8 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       util.create_new_tf_function(cond_grad_graph),
       util.create_new_tf_function(body_grad_graph),
       output_shapes=[t.shape for t in body_grad_graph.outputs],
-      name="%s_grad" % op.name)
+      parallel_iterations=parallel_iterations,
+      name="%s_grad" % while_op.name)
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
   util.maybe_set_lowering_attr(outputs[0].op)
@@ -341,6 +342,47 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   return none_padded_outputs
 
 
+def _preprocess_grad(grad, body_graph_output, while_op_output):
+  """Returns the initial gradient to be used for a given output tensor.
+
+  Args:
+    grad: the original gradient Tensor passed to the gradient function.
+    body_graph_output: the corresponding Tensor in the body graph.
+    while_op_output: the corresponding Tensor output of the While op.
+
+  Returns:
+    A Tensor or None.
+  """
+  # Set the incoming gradient of non-trainable inputs to None. It is possible
+  # that we receive non-None gradients for non-trainable types in nested while
+  # loops because we accumulate outputs of the inner while as variant tensors
+  # which are trainable and hence receive zeros_like tensors in the gradient
+  # pass. The non-trainable tensors then receive the popped zeros tensor from
+  # this zeros variant. The gradient for the loop vars corresponding to these
+  # tensors is None or zeros (this happens only if the loop var is accumulated
+  # as well) in _grad_fn so we reset these.
+  # TODO(b/118712257): Remove once we can handle None output grads in _grad_fn.
+  if not _is_trainable(body_graph_output):
+    return None
+
+  # GradientTape initializes resource and variant grads as None instead of
+  # zeros. Set to zeros so _GradientsHelper computes the gradients instead of
+  # returning None.
+  if (while_op_output.dtype in (dtypes.resource, dtypes.variant)
+      and grad is None):
+    return _zeros_like(while_op_output)
+
+  return grad
+
+
+def _zeros_like(op_output):
+  """Like array_ops.zeros_like() but also accepts resource var handles."""
+  if op_output.dtype == dtypes.resource:
+    return array_ops.zeros(
+        gen_resource_variable_ops.variable_shape(op_output))
+  return array_ops.zeros_like(op_output)
+
+
 def _is_trainable(tensor):
   """Returns whether the given tensor is trainable."""
   if not gradients_impl.IsTrainable(tensor):
@@ -374,28 +416,30 @@ def _validate_and_convert_to_tensor(maximum_iterations):
   Raises:
     ValueError: If `maximum_iterations` is invalid.
   """
-  if _is_in_xla_context():
-    if maximum_iterations is None:
-      raise ValueError("maximum_iterations is None. It is required and must "
-                       "be statically known (e.g. a constant value or known "
-                       "shape dimension) when building while_loop in XLA "
-                       "context.")
-    if isinstance(maximum_iterations, ops.Tensor):
-      # Get the constant value from the `maximum_iterations` tensor to avoid
-      # capturing a Const tensor from outside this graph.
-      maximum_iterations = tensor_util.constant_value(maximum_iterations)
-      if maximum_iterations is None:
-        raise ValueError("maximum_iterations must be statically known (e.g. a "
-                         "constant value or known shape dimension) when "
-                         "building while_loop in XLA context.")
-
-  if maximum_iterations is not None:
-    # EmptyTensorList expects `max_num_elements` to be of type int32.
-    maximum_iterations = ops.convert_to_tensor(
-        maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
-    if maximum_iterations.shape.ndims != 0:
-      raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
-                       maximum_iterations.shape)
+  if maximum_iterations is None:
+    return None
+
+  if _is_in_xla_context() and isinstance(maximum_iterations, ops.Tensor):
+    # Get the constant value from the `maximum_iterations` tensor to avoid
+    # capturing a Const tensor from outside this graph.
+    value = tensor_util.constant_value(maximum_iterations)
+    if value is None:
+      # XLA requires maximum_iterations to be statically known (e.g. a
+      # constant value or known shape dimension) when intermediate values
+      # from the forward pass are needed in the gradients pass. However,
+      # maximum_iterations may not be required if the gradient isn't built
+      # or no intermediates are required, thus we return the tensor as is.
+      return maximum_iterations
+
+    maximum_iterations = value
+
+  # EmptyTensorList expects `max_num_elements` to be of type int32.
+  maximum_iterations = ops.convert_to_tensor(
+      maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
+  if maximum_iterations.shape.ndims != 0:
+    raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                     maximum_iterations.shape)
+
   return maximum_iterations
 
 
@@ -715,31 +759,9 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     if captured_tensor is not None:
       return captured_tensor
 
+    # Resource tensors are not accumulated and handled specially.
     if tensor.dtype == dtypes.resource:
-      # Resource-type tensors are not accumulated.
-      # If a resource tensor exists in the loop body it must either be a loop
-      # input or an output of a nested While op inside the loop body which
-      # had captured the external resource.
-      if tensor in self._forward_graph.inputs:
-        index = self._forward_graph.inputs.index(tensor)
-      elif tensor.op.type == "While":
-        # Captured resources occur at the same index in the lists of inputs and
-        # outputs of a while op. So we lookup the input of `tensor.op` at the
-        # same index as the index of `tensor` in the `tensor.op.outputs`.
-        index = self._forward_graph.inputs.index(
-            tensor.op.inputs[tensor.value_index])
-      else:
-        raise ValueError(
-            "Taking gradient of a while loop which creates"
-            " a resource in its body is not supported: %s" % str(tensor))
-      # This must be a loop invariant.
-      assert self._forward_graph.inputs[index] == self._forward_graph.outputs[
-          index], "Resource tensors must be loop invariants %s." % str(
-              self._forward_graph._while.inputs[index])
-      tensor_in_outer_graph = self._forward_graph._while.inputs[index]
-      self._indirect_captures[tensor] = self.capture(
-          tensor_in_outer_graph, whitelisted=True)
-      return self._indirect_captures[tensor]
+      return self._resource_capture_helper(tensor)
 
     # Create or find an existing accumulator output for `tensor` in the forward
     # graph, and fetch from this accumulator in the gradient graph to get the
@@ -780,6 +802,41 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     self.popped_tensor_lists[captured_accumulator] = new_tensor_list
     return captured_tensor
 
+  def _resource_capture_helper(self, tensor):
+    """Returns the captured resource tensor.
+
+    Resource-type tensors are not accumulated. If a resource tensor exists in
+    the loop body it must either be a loop input or an output of a nested While
+    op inside the loop body which had captured the external resource.
+
+    Args:
+      tensor: the external resource Tensor to be captured.
+
+    Returns:
+      Tensor in this graph.
+    """
+    assert tensor.dtype == dtypes.resource
+    if tensor in self._forward_graph.inputs:
+      index = self._forward_graph.inputs.index(tensor)
+    elif tensor.op.type == "While":
+      # Captured resources occur at the same index in the lists of inputs and
+      # outputs of a while op. So we lookup the input of `tensor.op` at the
+      # same index as the index of `tensor` in the `tensor.op.outputs`.
+      index = self._forward_graph.inputs.index(
+          tensor.op.inputs[tensor.value_index])
+    else:
+      raise ValueError(
+          "Taking gradient of a while loop which creates "
+          "a resource in its body is not supported: %s" % tensor)
+    # This must be a loop invariant.
+    assert self._forward_graph.inputs[index] == self._forward_graph.outputs[
+        index], ("Resource tensors must be loop invariants %s." %
+                 self._forward_graph._while.inputs[index])
+    tensor_in_outer_graph = self._forward_graph._while.inputs[index]
+    self._indirect_captures[tensor] = self.capture(
+        tensor_in_outer_graph, whitelisted=True)
+    return self._indirect_captures[tensor]
+
 
 def _check_shapes_compat(output_tensors, shape_invariants, input_tensors):
   for (t, shape, input_t) in zip(output_tensors, shape_invariants,
@@ -813,7 +870,7 @@ def _copy_handle_data(src_tensors, tgt_tensors):
 
 
 def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
-  if control_flow_util.IsInXLAContext(op):
+  if maximum_iterations is not None and control_flow_util.IsInXLAContext(op):
     # Store the maximum_iterations to use in the gradient pass.
     op._set_attr(  # pylint: disable=protected-access
         "_maximum_iterations",
@@ -844,19 +901,8 @@ def _pack_sequence_as(structure_with_tas, loop_vars):
   """Like `nest.pack_sequence_as` but also replaces flows with TensorArrays."""
 
   def flow_to_tensor_array(flow, ta):  # pylint: disable=missing-docstring
-    if isinstance(ta, tensor_array_ops.TensorArray):
-      # pylint: disable=protected-access
-      new_ta = tensor_array_ops.TensorArray(
-          dtype=ta.dtype,
-          handle=ta.handle,
-          flow=flow,
-          infer_shape=ta._infer_shape,
-          colocate_with_first_write_call=ta._colocate_with_first_write_call)
-      new_ta._colocate_with = ta._colocate_with
-      new_ta._element_shape = ta._element_shape
-      # pylint: enable=protected-access
-      return new_ta
-    return flow
+    return (tensor_array_ops.build_ta_with_new_flow(ta, flow) if isinstance(  # pylint: disable=g-long-ternary
+        ta, tensor_array_ops.TensorArray) else flow)
 
   flattened_loop_vars = [
       flow_to_tensor_array(*z)
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 7b917235c0a73421552b7aebaa3192de969e5f3a..303b70ff57e4eba5d1338e4ea30fbe5a0c8b652e 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -18,109 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import errno as _errno
 import sys as _sys
 
+from absl.app import run as _run
+
 from tensorflow.python.platform import flags
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _usage(shorthelp):
-  """Writes __main__'s docstring to stdout with some help text.
-
-  Args:
-    shorthelp: bool, if True, prints only flags from the main module,
-        rather than all flags.
-  """
-  doc = _sys.modules['__main__'].__doc__
-  if not doc:
-    doc = '\nUSAGE: %s [flags]\n' % _sys.argv[0]
-    doc = flags.text_wrap(doc, indent='       ', firstline_indent='')
-  else:
-    # Replace all '%s' with sys.argv[0], and all '%%' with '%'.
-    num_specifiers = doc.count('%') - 2 * doc.count('%%')
-    try:
-      doc %= (_sys.argv[0],) * num_specifiers
-    except (OverflowError, TypeError, ValueError):
-      # Just display the docstring as-is.
-      pass
-  if shorthelp:
-    flag_str = flags.FLAGS.main_module_help()
-  else:
-    flag_str = str(flags.FLAGS)
-  try:
-    _sys.stdout.write(doc)
-    if flag_str:
-      _sys.stdout.write('\nflags:\n')
-      _sys.stdout.write(flag_str)
-    _sys.stdout.write('\n')
-  except IOError as e:
-    # We avoid printing a huge backtrace if we get EPIPE, because
-    # "foo.par --help | less" is a frequent use case.
-    if e.errno != _errno.EPIPE:
-      raise
-
-
-class _HelpFlag(flags.BooleanFlag):
-  """Special boolean flag that displays usage and raises SystemExit."""
-  NAME = 'help'
-  SHORT_NAME = 'h'
-
-  def __init__(self):
-    super(_HelpFlag, self).__init__(
-        self.NAME, False, 'show this help', short_name=self.SHORT_NAME)
-
-  def parse(self, arg):
-    if arg:
-      _usage(shorthelp=True)
-      print()
-      print('Try --helpfull to get a list of all flags.')
-      _sys.exit(1)
-
-
-class _HelpshortFlag(_HelpFlag):
-  """--helpshort is an alias for --help."""
-  NAME = 'helpshort'
-  SHORT_NAME = None
-
-
-class _HelpfullFlag(flags.BooleanFlag):
-  """Display help for flags in main module and all dependent modules."""
-
-  def __init__(self):
-    super(_HelpfullFlag, self).__init__('helpfull', False, 'show full help')
-
-  def parse(self, arg):
-    if arg:
-      _usage(shorthelp=False)
-      _sys.exit(1)
-
-
-_define_help_flags_called = False
-
-
-def _define_help_flags():
-  global _define_help_flags_called
-  if not _define_help_flags_called:
-    flags.DEFINE_flag(_HelpFlag())
-    flags.DEFINE_flag(_HelpfullFlag())
-    flags.DEFINE_flag(_HelpshortFlag())
-    _define_help_flags_called = True
+def _parse_flags_tolerate_undef(argv):
+  """Parse args, returning any unknown flags (ABSL defaults to crashing)."""
+  return flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
 
 
 @tf_export(v1=['app.run'])
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
-  # Define help flags.
-  _define_help_flags()
-
-  # Parse known flags.
-  argv = flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
-
   main = main or _sys.modules['__main__'].main
 
-  # Call the main function, passing through any arguments
-  # to the final program.
-  _sys.exit(main(argv))
-
+  _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index d0159e9e9816ba730c843d2b46936b142d47ff79..dd2c615e9e0ca193b68c4242cb64163bc9266762 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(v1=['gfile.GFile', 'gfile.Open'], v2=['io.gfile.GFile'])
+@tf_export('io.gfile.GFile', v1=['gfile.GFile', 'gfile.Open', 'io.gfile.GFile'])
 class GFile(_FileIO):
   """File I/O wrappers without thread locking.
 
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index fe4b0d0d3767346f4300450f01d56a62e625cca4..802721e34b04d87fc095f6d6900dd2d99b14faef 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Imports unittest as a replacement for testing.pybase.googletest."""
+"""Imports absltest as a replacement for testing.pybase.googletest."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -26,7 +26,7 @@ import tempfile
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from unittest import *
+from absl.testing.absltest import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.framework import errors
@@ -41,7 +41,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 Benchmark = benchmark.TensorFlowBenchmark  # pylint: disable=invalid-name
 
-unittest_main = main
+absltest_main = main
 
 # We keep a global variable in this module to make sure we create the temporary
 # directory only once per test binary invocation.
@@ -51,7 +51,7 @@ _googletest_temp_dir = ''
 # pylint: disable=invalid-name
 # pylint: disable=undefined-variable
 def g_main(argv):
-  """Delegate to unittest.main after redefining testLoader."""
+  """Delegate to absltest.main after redefining testLoader."""
   if 'TEST_SHARD_STATUS_FILE' in os.environ:
     try:
       f = None
@@ -67,7 +67,7 @@ def g_main(argv):
 
   if ('TEST_TOTAL_SHARDS' not in os.environ or
       'TEST_SHARD_INDEX' not in os.environ):
-    return unittest_main(argv=argv)
+    return absltest_main(argv=argv)
 
   total_shards = int(os.environ['TEST_TOTAL_SHARDS'])
   shard_index = int(os.environ['TEST_SHARD_INDEX'])
@@ -87,7 +87,7 @@ def g_main(argv):
   # Override getTestCaseNames
   base_loader.getTestCaseNames = getShardedTestCaseNames
 
-  unittest_main(argv=argv, testLoader=base_loader)
+  absltest_main(argv=argv, testLoader=base_loader)
 
 
 # Redefine main to allow running benchmarks
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 733d471ca29729ba07fca45bb20d5db04ae4cef9..bae20aca506f28910ab1d62238bddb25509dc053 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -22,6 +22,7 @@ limitations under the License.
 %rename("%s") TFE_ContextListDevices;
 %rename("%s") TFE_ContextAddFunction;
 %rename("%s") TFE_ContextAddFunctionDef;
+%rename("%s") TFE_ContextHasFunction;
 %rename("%s") TFE_ContextEnableRunMetadata;
 %rename("%s") TFE_ContextDisableRunMetadata;
 %rename("%s") TFE_ContextExportRunMetadata;
@@ -32,6 +33,10 @@ limitations under the License.
 %rename("%s") TFE_ContextSetServerDef;
 %rename("%s") TFE_ContextAsyncWait;
 %rename("%s") TFE_ContextAsyncClearError;
+%rename("%s") TFE_NewProfiler;
+%rename("%s") TFE_DeleteProfiler;
+%rename("%s") TFE_ProfilerSerializeToString;
+%rename("%s") TFE_StartProfilerServer;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_SetEagerTensorProfiler;
@@ -74,6 +79,7 @@ limitations under the License.
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
 #include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
 %}
 
 %typemap(in) (const void* proto) {
@@ -136,6 +142,13 @@ limitations under the License.
   $1 = const_cast<char*>(TFE_GetPythonString($input));
 }
 
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* name {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
 %typemap(in) (TFE_Context*) {
   $1 = (TFE_Context*)PyCapsule_GetPointer($input, nullptr);
 
@@ -172,6 +185,25 @@ limitations under the License.
       }
       if (EagerTensor_CheckExact(elem)) {
         (*$1)[i] = EagerTensor_Handle(elem);
+      } else if (tensorflow::swig::IsTensor(elem)) {
+        // If it isnt an EagerTensor, but is still a Tensor, it must be a graph
+        // tensor.
+        SWIG_exception_fail(
+            SWIG_TypeError,
+            tensorflow::strings::StrCat(
+                "An op outside of the function building code is being passed\n"
+                "a \"Graph\" tensor. It is possible to have Graph tensors\n"
+                "leak out of the function building context by including a\n"
+                "tf.init_scope in your function building code.\n"
+                "For example, the following function will fail:\n",
+                "  @tf.function\n",
+                "  def has_init_scope():\n",
+                "    my_constant = tf.constant(1.)\n",
+                "    with tf.init_scope():\n",
+                "      added = my_constant * 2\n",
+                "The graph tensor has name: ",
+                TFE_GetPythonString(PyObject_GetAttrString(elem, "name")))
+                .c_str());
       } else {
         SWIG_exception_fail(
             SWIG_TypeError,
@@ -230,6 +262,7 @@ limitations under the License.
 
 %include "tensorflow/python/eager/pywrap_tfe.h"
 %include "tensorflow/c/c_api_experimental.h"
+%include "tensorflow/c/eager/c_api_experimental.h"
 
 // Clear all typemaps.
 %typemap(out) TF_DataType;
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 40d7e2f25ee4bd3b28301bf164255c67911d62d5..5d08a40d67d075652a95424d03feaacb893ca41b 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -11,7 +11,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
@@ -71,7 +71,7 @@ py_library(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
-        "//tensorflow/python:training",
+        "//tensorflow/python:saver",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
@@ -98,17 +98,16 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "loader_test",
     size = "small",
     srcs = ["loader_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":builder",
         ":loader",
         ":signature_def_utils",
         ":utils",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
@@ -118,7 +117,6 @@ py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -155,15 +153,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saved_model_test",
     size = "small",
     srcs = ["saved_model_test.py"],
-    data = ["//tensorflow/cc/saved_model:saved_model_half_plus_two"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":builder",
         ":constants",
         ":loader",
@@ -186,6 +180,8 @@ py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    data = ["//tensorflow/cc/saved_model:saved_model_half_plus_two"],
+    tags = ["no_windows"],
 )
 
 py_library(
@@ -205,13 +201,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "utils_test",
     size = "small",
     srcs = ["utils_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -237,13 +231,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "signature_def_utils_test",
     size = "small",
     srcs = ["signature_def_utils_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":signature_constants",
         ":signature_def_utils",
         ":utils",
@@ -254,12 +246,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "simple_save_test",
     size = "small",
     srcs = ["simple_save_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":loader",
         ":signature_constants",
         ":simple_save",
@@ -280,6 +271,7 @@ py_library(
         ":builder",
         ":constants",
         ":function_serialization",
+        ":revived_types",
         ":saved_object_graph_py",
         ":signature_constants",
         ":signature_def_utils",
@@ -305,18 +297,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "save_test",
     srcs = ["save_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":loader",
         ":save",
         ":signature_constants",
         ":tag_constants",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -330,6 +321,7 @@ py_library(
         ":constants",
         ":function_deserialization",
         ":loader",
+        ":revived_types",
         ":saved_object_graph_py",
         ":utils",
         "//tensorflow/python:function",
@@ -339,13 +331,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "load_test",
     srcs = ["load_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":load",
         ":save",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:lib",
@@ -356,6 +348,27 @@ py_test(
     ],
 )
 
+py_library(
+    name = "revived_types",
+    srcs = [
+        "revived_types.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saved_object_graph_py",
+    ],
+)
+
+tf_py_test(
+    name = "revived_types_test",
+    srcs = ["revived_types_test.py"],
+    additional_deps = [
+        ":revived_types",
+        ":saved_object_graph_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_library(
     name = "function_serialization",
     srcs = [
@@ -408,10 +421,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "nested_structure_coder_test",
     srcs = ["nested_structure_coder_test.py"],
-    deps = [
+    additional_deps = [
         ":nested_structure_coder",
         ":struct_py",
         "//tensorflow/python:framework",
diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py
index 7845aab089929805418898c718749a57a12ceb19..3ad289e275730760241371284f2c9a3df35e9080 100644
--- a/tensorflow/python/saved_model/function_deserialization.py
+++ b/tensorflow/python/saved_model/function_deserialization.py
@@ -12,18 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tools for deserializing PolymorphicFunctions."""
+"""Tools for deserializing `Function`s."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import re
+
+from tensorflow.core.framework import function_pb2
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as function_lib
+from tensorflow.python.framework import function_def_to_graph as function_def_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 
 
@@ -33,76 +40,266 @@ def _is_tensor(t):
 
 def _inputs_compatible(args, stored_inputs):
   """Checks whether function arguments are compatible with parameters."""
-  # TODO(vbardiovsky): The compatibility check should be about the signature,
-  # not the flattened version of it.
   if len(args) != len(stored_inputs):
     return False
-  for a, b in zip(args, stored_inputs):
-    if _is_tensor(a):
-      if not isinstance(b, tensor_spec.TensorSpec):
-        return False
-      if a.dtype != b.dtype or not b.shape.is_compatible_with(a.shape):
-        return False
-    else:
-      if a != b:
-        return False
+
+  for arg, stored_input in zip(args, stored_inputs):
+    if not function_lib.is_same_structure(arg, stored_input):
+      return False
+
+    flattened_arg = nest.flatten(arg)
+    flattened_stored_input = nest.flatten(stored_input)
+
+    for a, b in zip(flattened_arg, flattened_stored_input):
+      if _is_tensor(a):
+        if not isinstance(b, tensor_spec.TensorSpec):
+          return False
+        if a.dtype != b.dtype or not b.shape.is_compatible_with(a.shape):
+          return False
+      else:
+        if a != b:
+          return False
   return True
 
 
-def recreate_polymorphic_function(
-    saved_polymorphic_function, functions):
-  """Creates a PolymorphicFunction from a SavedPolymorphicFunction.
+def _deserialize_function_spec(function_spec_proto, coder):
+  """Deserialize a FunctionSpec object from its proto representation."""
+  fullargspec = coder.decode_proto(function_spec_proto.fullargspec)
+  is_method = function_spec_proto.is_method
+  args_to_prepend = coder.decode_proto(function_spec_proto.args_to_prepend)
+  kwargs_to_include = coder.decode_proto(function_spec_proto.kwargs_to_include)
+  input_signature = coder.decode_proto(function_spec_proto.input_signature)
+  return function_lib.FunctionSpec(fullargspec, is_method, args_to_prepend,
+                                   kwargs_to_include, input_signature)
+
+
+# TODO(allenl): The fact that we can't derive ConcreteFunction calling
+# conventions from the serialized input spec right now is unfortunate. Merging
+# these would be good, maybe by adding TensorSpec names to cache keys so renamed
+# keyword arguments would yield different ConcreteFunctions.
+def setup_bare_concrete_function(saved_bare_concrete_function,
+                                 concrete_functions):
+  """Makes a restored bare concrete function callable."""
+  # Bare concrete functions accept only flat lists of Tensors with unique
+  # names.
+  concrete_function = concrete_functions[
+      saved_bare_concrete_function.concrete_function_name]
+  # pylint: disable=protected-access
+  concrete_function._arg_keywords = (
+      saved_bare_concrete_function.argument_keywords)
+  concrete_function._num_positional_args = (
+      saved_bare_concrete_function.allowed_positional_arguments)
+  # pylint: enable=protected-access
+  concrete_function.add_to_graph()
+  return concrete_function
+
+
+class RestoredFunction(def_function.Function):
+  """Wrapper class for a function that has been restored from saved state.
+
+  See `def_function.Function`.
+  """
+
+  def __init__(self, python_function, name, function_spec, concrete_functions):
+    # TODO(mdan): We may enable autograph once exceptions are supported.
+    super(RestoredFunction, self).__init__(
+        python_function, name, autograph=False)
+    self._concrete_functions = concrete_functions
+    # TODO(vbardiovsky): This does not propagate to stateful and stateless
+    # functions of the RestoredFunction, which will have seen only defunned
+    # restored_function_body(*args, **kwargs). Therefore get_concrete_function()
+    # called on RestoredFunction will not work properly.
+    self._function_spec = function_spec
+
+  def _list_all_concrete_functions_for_serialization(self):
+    return self._concrete_functions
+
+  def get_concrete_function(self, *args, **kwargs):
+    raise NotImplementedError()
+
+
+def recreate_function(saved_function, concrete_functions):
+  """Creates a `Function` from a `SavedFunction`.
 
   Args:
-    saved_polymorphic_function: SavedPolymorphicFunction proto.
-    functions: map from function name to Function.
+    saved_function: `SavedFunction` proto.
+    concrete_functions: map from function name to `ConcreteFunction`.
 
   Returns:
-    A PolymorphicFunction.
+    A `Function`.
   """
-  # TODO(andresp): Construct a PolymorphicFunction with the cache populated
-  # instead of creating a new PolymorphicFunction backed by a Python layer to
+  # TODO(andresp): Construct a `Function` with the cache populated
+  # instead of creating a new `Function` backed by a Python layer to
   # glue things together. Current approach is nesting functions deeper for each
   # serialization cycle.
 
   coder = nested_structure_coder.StructureCoder()
-  function_spec_tuple = coder.decode_proto(
-      saved_polymorphic_function.function_spec_tuple)
-  function_spec = function_lib.FunctionSpec.from_tuple(function_spec_tuple)
+  function_spec = _deserialize_function_spec(saved_function.function_spec,
+                                             coder)
 
-  # TODO(mdan): We may enable autograph once exceptions are supported.
-  @def_function.function(autograph=False)
-  def restored_function(*args, **kwargs):
+  def restored_function_body(*args, **kwargs):
     """Calls a restored function."""
     # TODO(allenl): Functions saved with input_signatures should revive with
     # input_signatures.
-    for monomorphic_function in saved_polymorphic_function.monomorphic_function:
-      function_obj = functions[monomorphic_function.concrete_function]
-      canonicalized_original_inputs = coder.decode_proto(
-          monomorphic_function.canonicalized_input)
-
-      try:
-        can_args, can_kwargs = function_spec.canonicalize_function_inputs(
-            *args, **kwargs)
-        if can_kwargs:
-          # TODO(vbardiovsky): Enable this along with the structured input and
-          # structured output.
-          raise ValueError(
-              "Received keywords arguments that could not be bound: %s" %
-              kwargs)
-      except ValueError:
-        continue
-
-      canonicalized_inputs = nest.flatten(can_args)
+    try:
+      canonicalized_inputs = function_spec.canonicalize_function_inputs(
+          *args, **kwargs)
+    except ValueError as e:
+      raise ValueError(
+          "Cannot canonicalize input args %r and kwargs %r. Error: %r." %
+          (args, kwargs, e))
+
+    debug_considered_signatures = []
+    for concrete_function_name in saved_function.concrete_functions:
+      function_obj = concrete_functions[concrete_function_name]
+      canonicalized_original_inputs = (
+          function_obj.graph.structured_input_signature)
+      debug_considered_signatures.append(canonicalized_original_inputs)
 
       if _inputs_compatible(canonicalized_inputs,
                             canonicalized_original_inputs):
-        filtered_inputs = [t for t in canonicalized_inputs if _is_tensor(t)]
-        flattened_outputs = function_obj._call_flat(filtered_inputs)  # pylint: disable=protected-access
-        # TODO(vbardiovsky): Rebuild output structure.
-        single_output, = flattened_outputs
-        return single_output
+        flattened_inputs = nest.flatten(canonicalized_inputs)
+        filtered_inputs = [t for t in flattened_inputs if _is_tensor(t)]
+        return function_obj._call_flat(filtered_inputs)  # pylint: disable=protected-access
 
     raise AssertionError(
-        "Could not find matching function to call for arguments: %s" % (args,))
-  return restored_function
+        "Could not find matching function to call for canonicalized inputs %r. "
+        "Only existing signatures are %r."
+        % (canonicalized_inputs, debug_considered_signatures))
+
+  concrete_function_objects = []
+  for concrete_function_name in saved_function.concrete_functions:
+    concrete_function_objects.append(concrete_functions[concrete_function_name])
+
+  return RestoredFunction(restored_function_body,
+                          restored_function_body.__name__,
+                          function_spec,
+                          concrete_function_objects)
+
+
+def load_function_def_library(library):
+  """Load a set of functions as concrete functions without captured inputs.
+
+  Functions names are manipulated during load such that they do not overlap
+  with previously created ones.
+
+  Args:
+    library: FunctionDefLibrary proto message.
+
+  Returns:
+    Map of original function names in the library to instances of
+    `ConcreteFunction` without captured inputs.
+
+  Raises:
+    ValueError: if functions dependencies have a cycle.
+  """
+  functions = {}
+
+  for fdef in _sort_function_defs(library):
+    copy = _fix_fdef(fdef, functions)
+
+    func_graph = function_def_lib.function_def_to_graph(copy)
+    for dep in _list_function_deps(fdef):
+      functions[dep].add_to_graph(func_graph)
+    func = function_lib.ConcreteFunction(func_graph)
+    func.add_to_graph()
+
+    functions[fdef.signature.name] = func
+
+    # Also register the gradients in the current root context.
+    with ops.init_scope():
+      func._register_gradient()  # pylint: disable=protected-access
+
+  return functions
+
+
+def _sort_function_defs(library):
+  """Return a topologic sort of FunctionDefs in a library."""
+  edges = collections.defaultdict(list)
+  in_count = collections.defaultdict(lambda: 0)
+
+  for fdef in library.function:
+    for dep in _list_function_deps(fdef):
+      edges[dep].append(fdef.signature.name)
+      in_count[fdef.signature.name] += 1
+
+  ready = [
+      fdef.signature.name
+      for fdef in library.function
+      if in_count[fdef.signature.name] == 0
+  ]
+  output = []
+  while ready:
+    node = ready.pop()
+    output.append(node)
+    for dest in edges[node]:
+      in_count[dest] -= 1
+      if not in_count[dest]:
+        ready.append(dest)
+
+  if len(output) != len(library.function):
+    failed_to_resolve = sorted(set(in_count.keys()) - set(output))
+    raise ValueError("There is a cyclic-dependency between functions. ",
+                     "Could not resolve %r." % (failed_to_resolve,))
+
+  reverse = {fdef.signature.name: fdef for fdef in library.function}
+  return [reverse[x] for x in output]
+
+
+def _fix_fdef(orig_fdef, functions):
+  """Fixes a FunctionDef proto to be loaded in current context.
+
+  In particular, when loading a function library into an eager context, one
+  must rename the functions to avoid conflicts with existent functions.
+
+  Args:
+    orig_fdef: FunctionDef proto to fix. It is not modified.
+    functions: map from function name to a ConcreteFunction instance.
+
+  Returns:
+    A fixed copy of the original FunctionDef.
+  """
+  fdef = function_pb2.FunctionDef()
+  fdef.CopyFrom(orig_fdef)
+  for node_def in fdef.node_def:
+    if "_gradient_op_type" in node_def.attr:
+      if node_def.op in ["StatefulPartitionedCall", "PartitionedCall"]:
+        # TODO(andresp): This code assumes that the gradient registered for this
+        # function call is the default gradient for the function and not a
+        # custom one.
+        fname = node_def.attr["f"].func.name
+        node_def.attr["_gradient_op_type"].s = compat.as_bytes(
+            functions[fname]._gradient_name)  # pylint: disable=protected-access
+      else:
+        logging.warning("Importing a function (%s) with ops with custom "
+                        "gradients. Will likely fail if a gradient is "
+                        "requested.", fdef.signature.name)
+    for _, attr_value in node_def.attr.items():
+      if attr_value.func.name:
+        attr_value.func.name = functions[attr_value.func.name].name
+
+  fdef.signature.name = _clean_function_name(fdef.signature.name)
+  return fdef
+
+
+def _list_function_deps(fdef):
+  # TODO(andresp): Recurse into list attributes and into NameAttrList attrs both
+  # when listing deps and when fixing them. `function_def_to_graph` also
+  # requires fixes.
+  deps = set()
+  for node_def in fdef.node_def:
+    for _, attr_value in node_def.attr.items():
+      if attr_value.WhichOneof("value") == "func":
+        deps.add(attr_value.func.name)
+  return deps
+
+
+def _clean_function_name(name):
+  """Vanity function to keep the function names comprehensible."""
+  # Note: each time a function is wrapped into `function_lib.ConcreteFunction`
+  # its name becomes "__inference_<orig>_xyz".
+  match = re.search(r"^__inference_(.*)_\d+$", name)
+  if match:
+    return match.group(1)
+  else:
+    return name
diff --git a/tensorflow/python/saved_model/function_serialization.py b/tensorflow/python/saved_model/function_serialization.py
index 27e8e476cb0927ed2b9333056d3f585dc743d743..8c65007baed1a78b61d8df358a178bca46f35cc4 100644
--- a/tensorflow/python/saved_model/function_serialization.py
+++ b/tensorflow/python/saved_model/function_serialization.py
@@ -12,98 +12,77 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tools for serializing PolymorphicFunctions."""
+"""Tools for serializing `Function`s."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function as defun_lib
+from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import nested_structure_coder
 from tensorflow.python.saved_model import saved_object_graph_pb2
 
 
-def _serialize_polymorphic_function(polymorphic_function, node_ids):
-  """Build a SavedPolymorphicProto."""
-  coder = nested_structure_coder.StructureCoder()
-  proto = saved_object_graph_pb2.SavedPolymorphicFunction()
-
-  proto.function_spec_tuple.CopyFrom(
-      coder.encode_structure(polymorphic_function.function_spec.as_tuple()))  # pylint: disable=protected-access
-  for signature, concrete_function in list_all_concrete_functions(
-      polymorphic_function):
-    bound_inputs = []
-    try:
-      for capture in concrete_function.captured_inputs:
-        bound_inputs.append(node_ids[capture])
-    except KeyError:
-      # TODO(andresp): Would it better to throw an exception?
-      logging.warning(
-          "Concrete function %s not added to object based saved model as it "
-          "captures tensor %s which is unsupported or not reachable from root.",
-          concrete_function.name, capture)
-      continue
-    function_proto = proto.monomorphic_function.add()
-    function_proto.concrete_function = concrete_function.name
-    function_proto.canonicalized_input.CopyFrom(
-        coder.encode_structure(signature))
-    function_proto.bound_inputs.extend(bound_inputs)
+def _serialize_function_spec(function_spec, coder):
+  """Serialize a FunctionSpec object into its proto representation."""
+  proto = saved_object_graph_pb2.FunctionSpec()
+  proto.fullargspec.CopyFrom(coder.encode_structure(function_spec.fullargspec))
+  proto.is_method = function_spec.is_method
+  proto.args_to_prepend.CopyFrom(
+      coder.encode_structure(function_spec.args_to_prepend))
+  proto.kwargs_to_include.CopyFrom(
+      coder.encode_structure(function_spec.kwargs_to_include))
+  proto.input_signature.CopyFrom(
+      coder.encode_structure(function_spec.input_signature))
   return proto
 
 
-def list_all_concrete_functions(polymorphic_function):
-  """Given a polymorphic function, returns all of its concrete functions.
-
-  Args:
-    polymorphic_function: Instance of `PolymorphicFunction`.
+def serialize_concrete_function(concrete_function, node_ids, coder):
+  """Build a SavedConcreteFunction."""
+  bound_inputs = []
+  try:
+    for capture in concrete_function.captured_inputs:
+      bound_inputs.append(node_ids[capture])
+  except KeyError:
+    # TODO(allenl): This warning shadows a real issue in test_table in
+    # save_test.py, where we don't handle captured constants. Fix that and
+    # then make this an exception.
+    logging.warning(
+        "Concrete function %s not added to object based saved model as it "
+        "captures tensor %s which is unsupported or not reachable from root.",
+        concrete_function.name, capture)
+    return None
+  concrete_function_proto = saved_object_graph_pb2.SavedConcreteFunction()
+  structured_outputs = func_graph_module.convert_structure_to_signature(
+      concrete_function.structured_outputs)
+  concrete_function_proto.canonicalized_input_signature.CopyFrom(
+      coder.encode_structure(concrete_function.structured_input_signature))
+  concrete_function_proto.output_signature.CopyFrom(
+      coder.encode_structure(structured_outputs))
+  concrete_function_proto.bound_inputs.extend(bound_inputs)
+  return concrete_function_proto
 
-  Returns:
-    A list of tuples in the form (signature, concrete_function), where concrete
-    function is an instance of `Function`.
-  """
-  input_signature = polymorphic_function._input_signature  # pylint: disable=protected-access
-  if input_signature is not None:
-    polymorphic_function.get_concrete_function()
-  concrete_functions = []
-  for signature in polymorphic_function._cached_input_signatures:  # pylint: disable=protected-access
-    if any(isinstance(arg, defun_lib.UnknownArgument) for arg in signature):
-      continue
-    concrete_function = polymorphic_function.get_concrete_function(*signature)
-    concrete_functions.append((signature, concrete_function))
-  return concrete_functions
 
+def serialize_bare_concrete_function(concrete_function):
+  """Build a SavedBareConcreteFunction."""
+  # pylint: disable=protected-access
+  return saved_object_graph_pb2.SavedBareConcreteFunction(
+      concrete_function_name=concrete_function.name,
+      allowed_positional_arguments=concrete_function._num_positional_args,
+      argument_keywords=concrete_function._arg_keywords)
+  # pylint: enable=protected-access
 
-def list_all_polymorphic_functions(checkpointable_object):
-  """Given a checkpointable object, returns all of its polymorphic functions."""
-  polymorphic_functions = dict()
-  for attribute_name in dir(checkpointable_object):
-    try:
-      attribute_value = getattr(checkpointable_object, attribute_name, None)
-    except:  # pylint: disable=bare-except
-      # We really don't want to throw an exception just because some object's
-      # attribute accessor is broken.
-      attribute_value = None
-    # TODO(allenl): Consider de-duplicating functions which are referenced
-    # from multiple attributes.
-    if isinstance(attribute_value, def_function.PolymorphicFunction):
-      polymorphic_functions[attribute_name] = attribute_value
-  return polymorphic_functions
 
+def serialize_function(function):
+  """Build a SavedFunction proto."""
+  coder = nested_structure_coder.StructureCoder()
+  proto = saved_object_graph_pb2.SavedFunction()
 
-def add_polymorphic_functions_to_object_graph_proto(checkpointable_objects,
-                                                    saved_object_graph,
-                                                    node_ids):
-  """Finds PolymorphicFunctions attached to objects and saves them."""
-  existing_objects = list(zip(checkpointable_objects, saved_object_graph.nodes))
-  for obj, obj_proto in existing_objects:
-    for name, polymorphic_function in list_all_polymorphic_functions(
-        obj).items():
-      function_node_id = len(saved_object_graph.nodes)
-      function_node = saved_object_graph.nodes.add()
-      function_node.function.CopyFrom(
-          _serialize_polymorphic_function(polymorphic_function, node_ids))
-      reference = obj_proto.children.add()
-      reference.node_id = function_node_id
-      reference.local_name = name
+  function_spec_proto = _serialize_function_spec(function.function_spec, coder)
+  proto.function_spec.CopyFrom(function_spec_proto)
+  all_concrete_functions = \
+      function._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+  for concrete_function in all_concrete_functions:
+    proto.concrete_functions.append(concrete_function.name)
+  return proto
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index 0f3dd36d4ceb707c378ebbdb0a46f6cf8500dd93..ad7a046ba444429c52d476cf736b80a685a63043 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -18,10 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
-from tensorflow.python.eager import function
-from tensorflow.python.framework import function_def_to_graph as function_def_lib
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -29,6 +28,8 @@ from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import function_deserialization
 from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.saved_model import revived_types
 from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training.checkpointable import tracking
@@ -44,42 +45,53 @@ class _Loader(object):
     self._asset_file_def = meta_graph.asset_file_def
     self._proto = object_graph_proto
     self._export_dir = export_dir
-    self._load_func_graphs(meta_graph.graph_def.library)
+    self._concrete_functions = (
+        function_deserialization.load_function_def_library(
+            meta_graph.graph_def.library))
     self._load_all()
-    self._bind_function_captures()
+    self._setup_functions()
     self._restore_checkpoint()
 
-  def _load_func_graphs(self, function_library):
-    # TODO(allenl): Do we need to do name mapping here? Not quite sure what
-    # happens when loaded names collide with existing names.
-    # TODO(andresp): Look into restoring nested and gradient functions in the
-    # right order.
-    self._functions = {}
-    for fdef in function_library.function:
-      graph = function_def_lib.function_def_to_graph(fdef)
-      self._functions[fdef.signature.name] = function.Function(graph)
-
-  def _bind_function_captures(self):
-    """Setup captured tensors in restored concrete functions."""
-    seen_functions = set()
-    for object_proto in self._proto.nodes:
-      if object_proto.WhichOneof("kind") == "function":
-        for monomorphic_function in object_proto.function.monomorphic_function:
-          name = monomorphic_function.concrete_function
-          bound_inputs = [
-              self._get_tensor_from_node(node_id)
-              for node_id in monomorphic_function.bound_inputs]
-          if name in seen_functions:
-            if self._functions[name]._captured_inputs != bound_inputs:  # pylint: disable=protected-access
-              raise NotImplementedError(
-                  "Function %s is used more than once with different "
-                  "captured inputs." % name)
-          else:
-            seen_functions.add(name)
-            # TODO(andresp): This is only injecting the captured inputs into the
-            # concrete function, note that we did not modify the FuncGraph
-            # itself.
-            self._functions[name]._captured_inputs = bound_inputs  # pylint: disable=protected-access
+  def _setup_concrete_function(self, proto, concrete_function, coder):
+    """Setup captured tensors and outputs for a single concrete function."""
+    bound_inputs = [
+        self._get_tensor_from_node(node_id)
+        for node_id in proto.bound_inputs]
+    bound_variables = [
+        self._nodes[node_id]
+        for node_id in proto.bound_inputs
+        if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
+    ]
+    # TODO(andresp): This is only injecting the captured inputs into the
+    # concrete function, note that we did not modify the FuncGraph
+    # itself.
+    concrete_function._captured_inputs = bound_inputs  # pylint: disable=protected-access
+    concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
+    # By setting the structured_outputs directly, we can rely on this
+    # function_lib.ConcreteFunction object to perform the output repacking
+    # logic. The only limitation of that logic is that it only works
+    # with output that is convertible to Tensors and the conversion
+    # always happens. For example tf.TensorShape([2, 3]) will be
+    # converted to Tensor representing [2, 3].
+    original_outputs = coder.decode_proto(proto.output_signature)
+    # The original_outputs here had Tensors converted to TensorSpecs, so
+    # the restored function's structured_outputs field will not be
+    # exactly the same. Fortunately the repacking logic cares only about
+    # the structure.
+    # TODO(vbardiovsky): Should we just replicate the structures, with
+    # Nones instead of real objects?
+    concrete_function._func_graph.structured_outputs = original_outputs  # pylint: disable=protected-access
+    concrete_function._func_graph.structured_input_signature = (  # pylint: disable=protected-access
+        coder.decode_proto(proto.canonicalized_input_signature))
+
+  def _setup_functions(self):
+    """Setup captures and output structure in restored functions."""
+    coder = nested_structure_coder.StructureCoder()
+    for name, concrete_function_proto in self._proto.concrete_functions.items():
+      self._setup_concrete_function(
+          concrete_function_proto,
+          self._concrete_functions[name],
+          coder)
 
   def _get_tensor_from_node(self, node_id):
     obj = self._nodes[node_id]
@@ -90,11 +102,23 @@ class _Loader(object):
     raise ValueError("Can't convert node %s to tensor" % (type(obj)))
 
   def _load_all(self):
-    self._nodes = [self._recreate(proto) for proto in self._proto.nodes]
+    """Load all saved objects and wire their properties."""
+    self._nodes = []
+    node_setters = []
+    for proto in self._proto.nodes:
+      node, setter = self._recreate(proto)
+      self._nodes.append(node)
+      node_setters.append(setter)
     # After creating the objects, construct the edges between the objects.
-    for obj, object_proto in zip(self._nodes, self._proto.nodes):
+    for obj, object_proto, setter in zip(self._nodes, self._proto.nodes,
+                                         node_setters):
       for reference in object_proto.children:
-        setattr(obj, reference.local_name, self._nodes[reference.node_id])
+        setter(obj, reference.local_name, self._nodes[reference.node_id])
+        # Note: if an object has an attribute `__call__` add a class method
+        # that allows `obj()` syntax to work. This is done per-instance to
+        # allow `callable` to be used to find out if an object is callable.
+        if reference.local_name == "__call__":
+          setattr(type(obj), "__call__", _call_attribute)
 
   def _restore_checkpoint(self):
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
@@ -110,6 +134,9 @@ class _Loader(object):
         "user_object": lambda: self._recreate_user_object(proto.user_object),
         "asset": lambda: self._recreate_asset(proto.asset),
         "function": lambda: self._recreate_function(proto.function),
+        "bare_concrete_function": functools.partial(
+            self._recreate_bare_concrete_function,
+            proto.bare_concrete_function),
         "variable": lambda: self._recreate_variable(proto.variable),
     }
     kind = proto.WhichOneof("kind")
@@ -118,23 +145,41 @@ class _Loader(object):
     return factory[kind]()
 
   def _recreate_user_object(self, proto):
-    del proto
-    return tracking.Checkpointable()
+    """Instantiates a SavedUserObject."""
+    looked_up = revived_types.deserialize(proto)
+    if looked_up is None:
+      # Note: each user object has its own class. This allows to make each one
+      # individually callable by adding a `__call__` method to the classes of
+      # the objects instances that have a `__call__` property.
+
+      class _UserObject(tracking.AutoCheckpointable):
+        pass
+
+      return _UserObject(), setattr
+    return looked_up
 
   def _recreate_asset(self, proto):
     filename = os.path.join(
         saved_model_utils.get_assets_dir(self._export_dir),
         self._asset_file_def[proto.asset_file_def_index].filename)
-    return tracking.TrackableAsset(filename)
+    return tracking.TrackableAsset(filename), setattr
 
   def _recreate_function(self, proto):
-    return function_deserialization.recreate_polymorphic_function(
-        proto, self._functions)
+    return function_deserialization.recreate_function(
+        proto, self._concrete_functions), setattr
+
+  def _recreate_bare_concrete_function(self, proto):
+    return function_deserialization.setup_bare_concrete_function(
+        proto, self._concrete_functions), setattr
 
   def _recreate_variable(self, proto):
     # TODO(andresp): Can we use the checkpointed value as initializer?
     dummy_value = init_ops.Zeros(dtype=proto.dtype)(shape=proto.shape)
-    return variables.Variable(dummy_value)
+    return variables.Variable(dummy_value, trainable=proto.trainable), setattr
+
+
+def _call_attribute(instance, *args, **kwargs):
+  return instance.__call__(*args, **kwargs)
 
 
 def _load_saved_object_graph_proto(filename):
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index 0f7fba0c66ef87ae4e9869318b63886c5b646404..ef703f9d29657fa3b2be0a96efa75b73989f57eb 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -18,53 +18,74 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import tempfile
 
+from absl.testing import parameterized
+
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import load
 from tensorflow.python.saved_model import save
 from tensorflow.python.training.checkpointable import tracking
 
 
-class LoadTest(test.TestCase):
-
-  def cycle(self, obj):
-    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
-    save.save(obj, path, signatures={})
-    return load.load(path)
-
-  def test_structure_import(self):
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
-    root.dep_one = tracking.Checkpointable()
-    root.dep_two = tracking.Checkpointable()
-    root.dep_two.dep = tracking.Checkpointable()
+@parameterized.named_parameters(
+    dict(testcase_name="ReloadOnce", cycles=1),
+    dict(testcase_name="ReloadTwice", cycles=2),
+    dict(testcase_name="ReloadThrice", cycles=3))
+class LoadTest(test.TestCase, parameterized.TestCase):
+
+  def cycle(self, obj, cycles=1, signatures=None):
+    to_save = obj
+    # TODO(vbardiovsky): It would be nice if exported protos reached a fixed
+    # point w.r.t. saving/restoring, ideally after 2nd saving.
+    for _ in range(cycles):
+      path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+      save.save(to_save, path, signatures)
+      loaded = load.load(path)
+      to_save = loaded
+    return loaded
+
+  def test_structure_import(self, cycles):
+    root = tracking.AutoCheckpointable()
+    root.dep_one = tracking.AutoCheckpointable()
+    root.dep_two = tracking.AutoCheckpointable()
+    root.dep_two.dep = tracking.AutoCheckpointable()
     root.dep_three = root.dep_two.dep
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertIs(imported.dep_three, imported.dep_two.dep)
     self.assertIsNot(imported.dep_one, imported.dep_two)
-    self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
 
-  def test_variables(self):
-    root = tracking.Checkpointable()
-    root.v1 = variables.Variable(1.)
-    root.v2 = variables.Variable(2.)
+  def test_variables(self, cycles):
+    root = tracking.AutoCheckpointable()
+    root.v1 = variables.Variable(1., trainable=True)
+    root.v2 = variables.Variable(2., trainable=False)
+    imported = self.cycle(root, cycles)
+    self.assertEqual(imported.v1.numpy(), 1.0)
+    self.assertTrue(imported.v1.trainable)
+    self.assertEqual(imported.v2.numpy(), 2.0)
+    self.assertFalse(imported.v2.trainable)
+
+  def test_capture_variables(self, cycles):
+    root = tracking.AutoCheckpointable()
+    root.weights = variables.Variable(2.)
     root.f = def_function.function(
-        lambda x: root.v2 * x,
+        lambda x: root.weights * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
-    imported = self.cycle(root)
-    self.assertEquals(imported.v1.numpy(), 1.0)
-    self.assertEquals(imported.v2.numpy(), 2.0)
+    imported = self.cycle(root, cycles)
     self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
+    imported.weights.assign(4.0)
+    self.assertEqual(8., imported.f(constant_op.constant(2.)).numpy())
 
   def _make_asset(self, contents):
     filename = tempfile.mktemp(prefix=self.get_temp_dir())
@@ -72,14 +93,11 @@ class LoadTest(test.TestCase):
       f.write(contents)
     return filename
 
-  def test_assets_import(self):
+  def test_assets(self, cycles):
     file1 = self._make_asset("contents 1")
     file2 = self._make_asset("contents 2")
 
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root = tracking.AutoCheckpointable()
     root.asset1 = tracking.TrackableAsset(file1)
     root.asset2 = tracking.TrackableAsset(file2)
 
@@ -93,68 +111,91 @@ class LoadTest(test.TestCase):
 
     imported = load.load(load_dir)
     with open(imported.asset1.asset_path.numpy(), "r") as f:
-      self.assertEquals("contents 1", f.read())
+      self.assertEqual("contents 1", f.read())
     with open(imported.asset2.asset_path.numpy(), "r") as f:
-      self.assertEquals("contents 2", f.read())
+      self.assertEqual("contents 2", f.read())
 
-  def test_capture_assets(self):
-    root = tracking.Checkpointable()
+  def test_capture_assets(self, cycles):
+    root = tracking.AutoCheckpointable()
     root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
     root.f = def_function.function(
         lambda: root.vocab.asset_path,
         input_signature=[])
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     origin_output = root.f().numpy()
     imported_output = imported.f().numpy()
     self.assertNotEqual(origin_output, imported_output)
     with open(imported_output, "r") as f:
-      self.assertEquals("contents", f.read())
+      self.assertEqual("contents", f.read())
 
-  def test_assets_dedup(self):
+  def test_dedup_assets(self, cycles):
     vocab = self._make_asset("contents")
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
-
+    root = tracking.AutoCheckpointable()
     root.asset1 = tracking.TrackableAsset(vocab)
     root.asset2 = tracking.TrackableAsset(vocab)
-
-    imported = self.cycle(root)
-
+    imported = self.cycle(root, cycles)
     self.assertEqual(imported.asset1.asset_path.numpy(),
                      imported.asset2.asset_path.numpy())
 
-  def test_implicit_input_signature(self):
+  def test_implicit_input_signature(self, cycles):
     @def_function.function
     def func(x):
       return 2 * x
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = func
 
     # Add two traces.
     root.f(constant_op.constant(1.))
     root.f(constant_op.constant(1))
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
     self.assertEqual(14, imported.f(constant_op.constant(7)).numpy())
 
-  def test_explicit_input_signature(self):
+  def test_explicit_input_signature(self, cycles):
     @def_function.function(
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
     def func(x):
       return 2 * x
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = func
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
 
-  def test_function_with_default_bool_input(self):
+  def test_explicit_save_signature(self, cycles):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoCheckpointable()
+    root.f = func
+
+    imported = self.cycle(
+        root, cycles, {
+            "f":
+                root.f.get_concrete_function(
+                    tensor_spec.TensorSpec(None, dtypes.float32))
+        })
+    self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
+
+  def test_nested_functions(self, cycles):
+    f = def_function.function(
+        lambda x: x*2.0,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    g = def_function.function(
+        lambda x: f(x) + 1.0,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root = tracking.AutoCheckpointable()
+    root.g = g
+    imported = self.cycle(root, cycles)
+    imported.g(constant_op.constant([1.0]))
+
+  def test_function_with_default_bool_input(self, cycles):
 
     def func(x, training=False):
       if training:
@@ -162,19 +203,121 @@ class LoadTest(test.TestCase):
       else:
         return 7
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(func)
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
     self.assertEqual(7, root.f(constant_op.constant(1)).numpy())
     self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
     self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
 
-  def test_positional_arguments(self):
+  def test_function_with_default_none_input(self, cycles):
+
+    def func(x, dtype=None):
+      if dtype:
+        return array_ops.zeros(shape=x.shape, dtype=dtype)
+      else:
+        return array_ops.zeros(shape=x.shape, dtype=dtypes.float32)
+
+    root = tracking.AutoCheckpointable()
+    root.f = def_function.function(func)
+
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1, 2, 3])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1.0, 2.0, 3.0])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([0, 0, 0],
+                        root.f(
+                            constant_op.constant([1.0, 2.0, 3.0]),
+                            dtype=dtypes.int32).numpy())
+
+    concrete_functions = root.f._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+    self.assertEqual(4, len(concrete_functions))
+
+    imported = self.cycle(root, cycles)
+
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1, 2, 3]),
+                                   None).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1.0, 2.0,
+                                                         3.0])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([0, 0, 0],
+                        imported.f(
+                            constant_op.constant([1.0, 2.0, 3.0]),
+                            dtype=dtypes.int32).numpy())
+
+  def test_structured_inputs(self, cycles):
+
+    def func(x, training=True):
+      # x is a nested structure, we care about one particular tensor.
+      _, (a, b) = x
+      if training:
+        return 2 * a["a"] + b
+      else:
+        return 7
+
+    root = tracking.AutoCheckpointable()
+    root.f = def_function.function(func)
+
+    x = constant_op.constant(10)
+    y = constant_op.constant(11)
+
+    input1 = [6, ({"a": x}, y)]
+    input2 = [7, ({"a": x}, y)]  # Not compatible with input1 signature.
+    input3 = [6, ({"a": y}, x)]  # Compatible with input1 signature.
+
+    # Note: by only calling f(input1) before serialization, only inputs with
+    # matching signature will be valid on the loaded model.
+    self.assertEqual(31, root.f(input1).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    with self.assertRaisesRegexp(AssertionError,
+                                 "Could not find matching function to call.*"):
+      imported.f(input2)
+
+    self.assertEqual(31, imported.f(input1).numpy())
+    self.assertEqual(32, imported.f(input3).numpy())
+
+  def test_structured_output(self, cycles):
+
+    # Use fields with non-alphabetical order
+    named_tuple_type = collections.namedtuple("NamedTupleHello", ["b", "a"])
+
+    def func(input1, input2):
+      named_tuple = named_tuple_type(a=input1 + input2, b=input1 * input2)
+      return [named_tuple, input2, {"x": 0.5}]
+
+    root = tracking.AutoCheckpointable()
+    root.f = def_function.function(func)
+
+    result = root.f(constant_op.constant(2), constant_op.constant(3))
+
+    self.assertEqual(5, result[0].a.numpy())
+    self.assertEqual(6, result[0].b.numpy())
+    self.assertEqual(["b", "a"], list(result[0]._asdict().keys()))
+    self.assertEqual(3, result[1].numpy())
+    self.assertEqual(0.5, result[2]["x"].numpy())
+
+    imported = self.cycle(root, cycles)
+
+    result = imported.f(constant_op.constant(2), constant_op.constant(5))
+    self.assertEqual(7, result[0].a.numpy())
+    self.assertEqual(10, result[0].b.numpy())
+    self.assertEqual(["b", "a"], list(result[0]._asdict().keys()))
+    self.assertEqual(5, result[1].numpy())
+    self.assertEqual(0.5, result[2]["x"].numpy())
+
+  def test_positional_arguments(self, cycles):
     def func(x, training=False, abc=7.1, defg=7.7):
       del abc
       if training:
@@ -184,7 +327,7 @@ class LoadTest(test.TestCase):
       else:
         return 7
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(func)
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
@@ -192,14 +335,36 @@ class LoadTest(test.TestCase):
     self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
     self.assertEqual(6, root.f(constant_op.constant(1), defg=7.0).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
     self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
     self.assertEqual(6, imported.f(constant_op.constant(1), defg=7.0).numpy())
 
-  def test_member_function(self):
-    class CheckpointableWithMember(tracking.Checkpointable):
+  def test_additional_kwargs(self, cycles):
+    def func(x, training=False, **options):
+      del options
+      if training:
+        return 2 * x
+      else:
+        return 7
+
+    root = tracking.AutoCheckpointable()
+    root.f = def_function.function(func)
+
+    x = constant_op.constant(10)
+    self.assertEqual(7, root.f(x, learning_rate=0.5, epochs=3).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    with self.assertRaisesRegexp(AssertionError,
+                                 "Could not find matching function to call.*"):
+      imported.f(x, learning_rate=0.5, epochs=4)
+
+    self.assertEqual(7, imported.f(x, learning_rate=0.5, epochs=3).numpy())
+
+  def test_member_function(self, cycles):
+    class CheckpointableWithMember(tracking.AutoCheckpointable):
 
       def __init__(self):
         super(CheckpointableWithMember, self).__init__()
@@ -218,11 +383,356 @@ class LoadTest(test.TestCase):
     self.assertEqual(27, root.f(constant_op.constant(1)).numpy())
     self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
     self.assertEqual(27, imported.f(constant_op.constant(2)).numpy())
 
+  def test_side_effect_listing(self, cycles):
+    class M(tracking.AutoCheckpointable):
+
+      def __init__(self):
+        super(M, self).__init__()
+        self.var = None
+
+      @def_function.function(
+          input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+      def f(self, x):
+        if self.var is None:
+          self.var = variables.Variable(2.)
+        return x * self.var
+
+    m = M()
+    self.cycle(m)
+    self.assertEqual(4.0, m.f(constant_op.constant(2.0)).numpy())
+
+  def test_basic_backprop(self, cycles):
+    weight = variables.Variable(1., trainable=True)
+    bias = variables.Variable(0., trainable=True)
+    g = def_function.function(
+        lambda x: x*weight + bias,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root = tracking.AutoCheckpointable()
+    root.weight = weight
+    root.bias = bias
+    root.g = g
+    imported = self.cycle(root, cycles)
+    with backprop.GradientTape() as t:
+      x = constant_op.constant([3.5])
+      loss = imported.g(x)
+      grad = t.gradient(loss, [imported.weight, imported.bias])
+      self.assertAllClose(grad, [3.5, 1.0])
+
+  def test_nested_backprop(self, cycles):
+    weight = variables.Variable(1., trainable=True)
+    bias = variables.Variable(0., trainable=True)
+
+    # Note: this function gets called from other function defs via a
+    # "PartitionedCall" op node.
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32),
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def mul(x, y):
+      return x * y
+
+    # Note: this function gets called from other function defs via a
+    # "StatefulPartitionedCall" op node.
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def f(x):
+      return mul(weight.read_value(), x)
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def g(x):
+      return f(x) + bias,
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def h(x):
+      return g(x) + bias,
+
+    root = tracking.AutoCheckpointable()
+    root.weight = weight
+    root.bias = bias
+    root.g = h
+
+    imported = self.cycle(root, cycles)
+    with backprop.GradientTape() as t:
+      x = constant_op.constant([3.5])
+      loss = imported.g(x)
+    grad = t.gradient(loss, [imported.weight, imported.bias])
+    self.assertAllClose(grad, [3.5, 2.0])
+
+  def test_callable(self, cycles):
+    class M1(tracking.AutoCheckpointable):
+
+      @def_function.function(
+          input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+      def __call__(self, x):
+        return x
+
+    root = tracking.AutoCheckpointable()
+    root.m1 = M1()
+    root.m2 = tracking.AutoCheckpointable()
+    root.m2.__call__ = def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
+            lambda x: x*3.0)
+    imported = self.cycle(root, cycles)
+    x = constant_op.constant(1.0)
+
+    self.assertTrue(callable(imported.m1))
+    self.assertAllEqual(root.m1(x), imported.m1(x))
+
+    # Note: `root.m2` was not callable since `__call__` attribute was set
+    # into the instance and not on the class. But after a serialization cycle
+    # that starts to work.
+    self.assertTrue(callable(imported.m2))
+    self.assertAllEqual(root.m2.__call__(x), imported.m2(x))
+
+    # Verify that user objects without `__call__` attribute are not callable.
+    self.assertFalse(callable(imported))
+
+  def test_chain_callable(self, cycles):
+    func = def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
+            lambda x: x*3.0)
+    root = tracking.AutoCheckpointable()
+    root.__call__ = tracking.AutoCheckpointable()
+    root.__call__.__call__ = tracking.AutoCheckpointable()
+    root.__call__.__call__.__call__ = func
+
+    imported = self.cycle(root, cycles)
+    self.assertTrue(callable(imported))
+    x = constant_op.constant(1.0)
+    self.assertAllEqual(imported(x).numpy(), 3.0)
+
+  def test_soft_matching(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoCheckpointable()
+    root.f = func
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+    self.assertAllEqual([2, 4], root.f(constant_op.constant([1, 2])).numpy())
+
+    concrete_functions = root.f._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+    self.assertEqual(1, len(concrete_functions))
+
+    imported = self.cycle(root, cycles)
+
+    with self.assertRaisesRegexp(ValueError, "Cannot canonicalize"):
+      # We cannot call the function with a constant of shape ().
+      self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
+
+    # TODO(vbardiovsky): When classes are revived with input_signatures, we
+    # should also check that the calls below are not generating any more
+    # concrete functions.
+    self.assertAllEqual([2, 4, 6, 8],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(constant_op.constant([1, 2, 3])).numpy())
+
+  def test_concrete_function(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function()
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+    self.assertAllEqual([2, 4], root.f(constant_op.constant([1, 2])).numpy())
+
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+
+    self.assertAllEqual([2, 4, 6, 8],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(constant_op.constant([1, 2, 3])).numpy())
+
+  def test_concrete_function_arg_names(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function()
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(x=constant_op.constant([1, 2, 3])).numpy())
+
+  def test_concrete_function_no_signature(self, cycles):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function(constant_op.constant([1]))
+    self.assertAllEqual([4], root.f(constant_op.constant([2])).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertAllEqual([6],
+                        imported.f(constant_op.constant([3])).numpy())
+
+  def test_concrete_function_backprop(self, cycles):
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.float32)])
+    def func(x):
+      return x ** 2.
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function()
+
+    def _compute_gradient(function):
+      with backprop.GradientTape() as tape:
+        inp = constant_op.constant(1.)
+        tape.watch(inp)
+        output = function(inp)
+      return tape.gradient(output, inp)
+
+    self.assertEqual(2., _compute_gradient(root.f).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(2., _compute_gradient(imported.f).numpy())
+
+  def test_revived_concrete_function_kwargs(self, cycles):
+
+    @def_function.function
+    def func(x, y):
+      return x * (y + 1.)
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function(
+        tensor_spec.TensorSpec([], dtypes.float32),
+        tensor_spec.TensorSpec([], dtypes.float32))
+    self.assertEqual(8., root.f(y=constant_op.constant(3.),
+                                x=constant_op.constant(2.)).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(8., imported.f(y=constant_op.constant(3.),
+                                    x=constant_op.constant(2.)).numpy())
+
+  def test_revived_concrete_function_tensorspec_kwargs(self, cycles):
+
+    @def_function.function
+    def func(*args):
+      x, y = args
+      return x * (y + 1.)
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function(
+        tensor_spec.TensorSpec([], dtypes.float32, name="x"),
+        tensor_spec.TensorSpec([], dtypes.float32, name="y"))
+    self.assertEqual(8., root.f(y=constant_op.constant(3.),
+                                x=constant_op.constant(2.)).numpy())
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(8., imported.f(y=constant_op.constant(3.),
+                                    x=constant_op.constant(2.)).numpy())
+
+  def test_concrete_function_variable_argument(self, cycles):
+    # TODO(allenl): Fix variables in input signatures.
+    self.skipTest("Need to fix encoding of variables in inputs signatures")
+    capture = variables.Variable(0)
+
+    @def_function.function
+    def func(v):
+      v.assign_add(1)
+      capture.assign_sub(1)
+
+    vsave = variables.Variable(1)
+    root = tracking.AutoCheckpointable()
+    root.f = func.get_concrete_function(vsave)
+    root.capture = capture
+    self.assertEqual(1, vsave.numpy())
+    root.f(vsave)
+    self.assertEqual(2, vsave.numpy())
+    self.assertEqual(-1, capture.numpy())
+    imported = self.cycle(root, cycles)
+
+    vload = variables.Variable(1)
+    imported.f(vload)
+    self.assertEqual(2, vload.numpy())
+    imported.f(v=vload)
+    self.assertEqual(3, vload.numpy())
+    self.assertEqual(-3, imported.capture.numpy())
+    self.assertEqual(-1, capture.numpy())
+
+  def test_function_and_component(self, cycles):
+
+    @def_function.function
+    def func(v):
+      return v + 1
+
+    root = tracking.AutoCheckpointable()
+    root.func = func
+    root.concrete_func = func.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.int32))
+    one = constant_op.constant(1)
+    self.assertEqual(2, root.func(one).numpy())
+    self.assertEqual(2, root.concrete_func(one).numpy())
+    imported = self.cycle(root, cycles)
+    self.assertEqual(2, imported.func(one).numpy())
+    self.assertEqual(2, imported.concrete_func(one).numpy())
+
+  def test_dict(self, cycles):
+    root = tracking.AutoCheckpointable()
+    root.variables = dict(a=variables.Variable(1.))
+    root.variables["b"] = variables.Variable(2.)
+    root.variables["c"] = 1
+    root.funcs = dict(
+        a=def_function.function(lambda: constant_op.constant(100.)))
+    root.funcs["conc"] = root.funcs["a"].get_concrete_function()
+    imported = self.cycle(root, cycles)
+    self.assertEqual(1., imported.variables["a"].numpy())
+    self.assertEqual(2., imported.variables["b"].numpy())
+    self.assertEqual(set(["a", "b"]), set(imported.variables.keys()))
+    self.assertEqual(100., imported.funcs["a"]().numpy())
+    self.assertEqual(100., imported.funcs["conc"]().numpy())
+
+  def test_list(self, cycles):
+    root = tracking.AutoCheckpointable()
+    root.variables = [variables.Variable(1.)]
+    root.variables.append(1)
+    root.variables.append(variables.Variable(3.))
+    imported = self.cycle(root, cycles)
+    self.assertEqual(1., imported.variables[0].numpy())
+    self.assertEqual(3., imported.variables[2].numpy())
+    self.assertIs(None, imported.variables[1])
+    self.assertEqual(3, len(imported.variables))
+
+  def test_functions_list(self, cycles):
+    root = tracking.AutoCheckpointable()
+    v1 = variables.Variable(1.)
+    root.losses = [def_function.function(lambda: math_ops.reduce_sum(v1 ** 2))]
+    root.variables = [v1]
+
+    @def_function.function
+    def _v2_loss():
+      if len(root.variables) == 1:
+        v2 = variables.Variable(2.)
+        root.variables.append(v2)
+      return math_ops.reduce_sum(root.variables[1] ** 2)
+
+    root.losses.append(_v2_loss)
+    self.assertAllClose([1., 4.], [loss() for loss in root.losses])
+    imported = self.cycle(root, cycles)
+    self.assertAllClose([1., 4.], [loss() for loss in imported.losses])
+    imported.variables[0].assign(3.)
+    imported.variables[1].assign(4.)
+    self.assertAllClose([9., 16.], [loss() for loss in imported.losses])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_test.py b/tensorflow/python/saved_model/model_utils/export_test.py
index 776bfff886aeba5d6fc08e14329be39ade8d6061..ef512150a259514fcc4c801eaa06a99441f1f7a2 100644
--- a/tensorflow/python/saved_model/model_utils/export_test.py
+++ b/tensorflow/python/saved_model/model_utils/export_test.py
@@ -22,7 +22,6 @@ import os
 import tempfile
 import time
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -52,110 +51,110 @@ ops.register_tensor_conversion_function(LabeledTensorMock,
 
 class ExportTest(test_util.TensorFlowTestCase):
 
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_without_receiver_alternatives(self):
-    with context.graph_mode():
-      receiver_tensor = array_ops.placeholder(dtypes.string)
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(["2"])
-      output_3 = constant_op.constant(["3"])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.RegressionOutput(value=output_1),
-          "head-2": export_output.ClassificationOutput(classes=output_2),
-          "head-3": export_output.PredictOutput(outputs={
-              "some_output_3": output_3
-          }),
-      }
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs)
-
-      expected_signature_defs = {
-          "serving_default":
-              signature_def_utils.regression_signature_def(receiver_tensor,
-                                                           output_1),
-          "head-2":
-              signature_def_utils.classification_signature_def(receiver_tensor,
-                                                               output_2, None),
-          "head-3":
-              signature_def_utils.predict_signature_def({
-                  "input": receiver_tensor
-              }, {"some_output_3": output_3})
-      }
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs)
 
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(receiver_tensor,
+                                                         output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(receiver_tensor,
+                                                             output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def({
+                "input": receiver_tensor
+            }, {"some_output_3": output_3})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_with_dict_alternatives(self):
-    with context.graph_mode():
-      receiver_tensor = array_ops.placeholder(dtypes.string)
-      receiver_tensors_alternative_1 = {
-          "foo": array_ops.placeholder(dtypes.int64),
-          "bar": array_ops.sparse_placeholder(dtypes.float32)}
-      receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(["2"])
-      output_3 = constant_op.constant(["3"])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.RegressionOutput(value=output_1),
-          "head-2": export_output.ClassificationOutput(classes=output_2),
-          "head-3": export_output.PredictOutput(outputs={
-              "some_output_3": output_3
-          }),
-      }
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs, receiver_tensors_alternatives)
-
-      expected_signature_defs = {
-          "serving_default":
-              signature_def_utils.regression_signature_def(
-                  receiver_tensor,
-                  output_1),
-          "head-2":
-              signature_def_utils.classification_signature_def(
-                  receiver_tensor,
-                  output_2, None),
-          "head-3":
-              signature_def_utils.predict_signature_def(
-                  {"input": receiver_tensor},
-                  {"some_output_3": output_3}),
-          "other:head-3":
-              signature_def_utils.predict_signature_def(
-                  receiver_tensors_alternative_1,
-                  {"some_output_3": output_3})
-
-          # Note that the alternatives 'other:serving_default' and
-          # 'other:head-2' are invalid, because regession and classification
-          # signatures must take a single string input.  Here we verify that
-          # these invalid signatures are not included in the export_utils.
-      }
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    receiver_tensors_alternative_1 = {
+        "foo": array_ops.placeholder(dtypes.int64),
+        "bar": array_ops.sparse_placeholder(dtypes.float32)}
+    receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
 
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, receiver_tensors_alternatives)
+
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(
+                receiver_tensor,
+                output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(
+                receiver_tensor,
+                output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensor},
+                {"some_output_3": output_3}),
+        "other:head-3":
+            signature_def_utils.predict_signature_def(
+                receiver_tensors_alternative_1,
+                {"some_output_3": output_3})
+
+        # Note that the alternatives 'other:serving_default' and
+        # 'other:head-2' are invalid, because regession and classification
+        # signatures must take a single string input.  Here we verify that
+        # these invalid signatures are not included in the export_utils.
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_with_single_alternatives(self):
-    with context.graph_mode():
-      receiver_tensor = array_ops.placeholder(dtypes.string)
-      receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
-      receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
-          dtypes.float32)
-      # Note we are passing single Tensors as values of
-      # receiver_tensors_alternatives, where normally that is a dict.
-      # In this case a dict will be created using the default receiver tensor
-      # name "input".
-      receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
-                                       "other2": receiver_tensors_alternative_2}
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(["2"])
-      output_3 = constant_op.constant(["3"])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.RegressionOutput(value=output_1),
-          "head-2": export_output.ClassificationOutput(classes=output_2),
-          "head-3": export_output.PredictOutput(outputs={
-              "some_output_3": output_3
-          }),
-      }
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
+    receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
+        dtypes.float32)
+    # Note we are passing single Tensors as values of
+    # receiver_tensors_alternatives, where normally that is a dict.
+    # In this case a dict will be created using the default receiver tensor
+    # name "input".
+    receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
+                                     "other2": receiver_tensors_alternative_2}
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
 
     signature_defs = export_utils.build_all_signature_defs(
         receiver_tensor, export_outputs, receiver_tensors_alternatives)
@@ -222,35 +221,35 @@ class ExportTest(test_util.TensorFlowTestCase):
     self.assertTrue(int(time_1) < int(time_2))
     self.assertTrue(int(time_2) < int(time_3))
 
+  @test_util.deprecated_graph_mode_only
   def test_build_all_signature_defs_serving_only(self):
-    with context.graph_mode():
-      receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
-      output_1 = constant_op.constant([1.])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              export_output.PredictOutput(outputs=output_1),
-          "train": export_output.TrainOutput(loss=output_1),
-      }
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs)
-
-      expected_signature_defs = {
-          "serving_default": signature_def_utils.predict_signature_def(
-              receiver_tensor, {"output": output_1})
-      }
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
-
-      signature_defs = export_utils.build_all_signature_defs(
-          receiver_tensor, export_outputs, serving_only=False)
-
-      expected_signature_defs.update({
-          "train": signature_def_utils.supervised_train_signature_def(
-              receiver_tensor, loss={"loss": output_1})
-      })
-
-      self.assertDictEqual(expected_signature_defs, signature_defs)
+    receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
+    output_1 = constant_op.constant([1.])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(outputs=output_1),
+        "train": export_output.TrainOutput(loss=output_1),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs)
+
+    expected_signature_defs = {
+        "serving_default": signature_def_utils.predict_signature_def(
+            receiver_tensor, {"output": output_1})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, serving_only=False)
+
+    expected_signature_defs.update({
+        "train": signature_def_utils.supervised_train_signature_def(
+            receiver_tensor, loss={"loss": output_1})
+    })
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/saved_model/nested_structure_coder.py b/tensorflow/python/saved_model/nested_structure_coder.py
index 410ebda5c1eda0388e4dfff8efec14bc5f482b85..5cf9a5b155bc27d236b67496159a4ca540bb0c1b 100644
--- a/tensorflow/python/saved_model/nested_structure_coder.py
+++ b/tensorflow/python/saved_model/nested_structure_coder.py
@@ -14,14 +14,14 @@
 # ==============================================================================
 """Module that encodes (decodes) nested structures into (from) protos.
 
-The intended use is to serialize everything needed to restore a
-PolymorphicFunction that was saved into a SavedModel. This may include concrete
-function inputs and outputs, signatures, function specs, etc.
+The intended use is to serialize everything needed to restore a `Function` that
+was saved into a SavedModel. This may include concrete function inputs and
+outputs, signatures, function specs, etc.
 
 Example use:
 coder = nested_structure_coder.StructureCoder()
 # Encode into proto.
-signature_proto = coder.encode_structure(polymorphic_function.input_signature)
+signature_proto = coder.encode_structure(function.input_signature)
 # Decode into a Python object.
 restored_signature = coder.decode_proto(signature_proto)
 """
@@ -38,6 +38,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.saved_model import struct_pb2
+from tensorflow.python.util import compat
 
 
 class NotEncodableError(Exception):
@@ -83,7 +84,6 @@ class StructureCoder(object):
     """
     return self._map_structure(nested_structure, self._get_encoders())
 
-
   def can_encode(self, nested_structure):
     """Determines whether a nested structure can be encoded into a proto.
 
@@ -305,7 +305,7 @@ class _StringCodec(object):
 
   def do_decode(self, value, decode_fn):
     del decode_fn
-    return value.string_value
+    return compat.as_str(value.string_value)
 
 
 StructureCoder.register_codec(_StringCodec())
@@ -361,7 +361,10 @@ class _TensorShapeCodec(object):
   """Codec for `TensorShape`."""
 
   def can_encode(self, pyobj):
-    return isinstance(pyobj, tensor_shape.TensorShape)
+    return isinstance(pyobj, (tensor_shape.TensorShape,
+                              # TODO(b/121255889): Should not need these.
+                              tensor_shape.TensorShapeV1,
+                              tensor_shape.TensorShapeV2))
 
   def do_encode(self, tensor_shape_value, encode_fn):
     del encode_fn
diff --git a/tensorflow/python/saved_model/revived_types.py b/tensorflow/python/saved_model/revived_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f82039cfd60b8c5d414f7dc3ca2e0b4e85022b1
--- /dev/null
+++ b/tensorflow/python/saved_model/revived_types.py
@@ -0,0 +1,185 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handles types registrations for tf.saved_model.load."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import versions_pb2
+from tensorflow.python.saved_model import saved_object_graph_pb2
+
+
+class VersionedTypeRegistration(object):
+  """Holds information about one version of a revived type."""
+
+  def __init__(self, object_factory, version, min_producer_version,
+               min_consumer_version, bad_consumers=None, setter=setattr,
+               getter=getattr, attribute_extractor=dir):
+    """Identify a revived type version.
+
+    Args:
+      object_factory: A callable which takes a SavedUserObject proto and returns
+        a checkpointable object. Dependencies are added later via `setter`.
+      version: An integer, the producer version of this wrapper type. When
+        making incompatible changes to a wrapper, add a new
+        `VersionedTypeRegistration` with an incremented `version`. The most
+        recent version will be saved, and all registrations with a matching
+        identifier will be searched for the highest compatible version to use
+        when loading.
+      min_producer_version: The minimum producer version number required to use
+        this `VersionedTypeRegistration` when loading a proto.
+      min_consumer_version: `VersionedTypeRegistration`s with a version number
+        less than `min_consumer_version` will not be used to load a proto saved
+        with this object. `min_consumer_version` should be set to the lowest
+        version number which can successfully load protos saved by this
+        object. If no matching registration is available on load, the object
+        will be revived with a generic checkpointable type.
+
+        `min_consumer_version` and `bad_consumers` are a blunt tool, and using
+        them will generally break forward compatibility: previous versions of
+        TensorFlow will revive newly saved objects as opaque checkpointable
+        objects rather than wrapped objects. When updating wrappers, prefer
+        saving new information but preserving compatibility with previous
+        wrapper versions. They are, however, useful for ensuring that
+        previously-released buggy wrapper versions degrade gracefully rather
+        than throwing exceptions when presented with newly-saved SavedModels.
+      bad_consumers: A list of consumer versions which are incompatible (in
+        addition to any version less than `min_consumer_version`).
+      setter: A callable with the same signature as `setattr` to use when adding
+        dependencies to generated objects.
+      getter: A callable with the same signature as `getattr` to use when
+        retrieving items from objects of this type. Used along with
+        `attribute_extractor` to find functions, which are not Checkpointable
+        objects and so not regular dependencies.
+      attribute_extractor: A callable equivalent of the builtin `dir`, used for
+        listing items in this container (if any).
+    """
+    self.setter = setter
+    self.getter = getter
+    self.attribute_extractor = attribute_extractor
+    self.identifier = None  # Set after registration
+    self._object_factory = object_factory
+    self.version = version
+    self._min_consumer_version = min_consumer_version
+    self._min_producer_version = min_producer_version
+    if bad_consumers is None:
+      bad_consumers = []
+    self._bad_consumers = bad_consumers
+
+  def to_proto(self):
+    """Create a SavedUserObject proto."""
+    # For now wrappers just use dependencies to save their state, so the
+    # SavedUserObject doesn't depend on the object being saved.
+    # TODO(allenl): Add a wrapper which uses its own proto.
+    return saved_object_graph_pb2.SavedUserObject(
+        identifier=self.identifier,
+        version=versions_pb2.VersionDef(
+            producer=self.version,
+            min_consumer=self._min_consumer_version,
+            bad_consumers=self._bad_consumers))
+
+  def from_proto(self, proto):
+    """Recreate a checkpointable object from a SavedUserObject proto."""
+    return self._object_factory(proto)
+
+  def should_load(self, proto):
+    """Checks if this object should load the SavedUserObject `proto`."""
+    if proto.identifier != self.identifier:
+      return False
+    if self.version < proto.version.min_consumer:
+      return False
+    if proto.version.producer < self._min_producer_version:
+      return False
+    for bad_version in proto.version.bad_consumers:
+      if self.version == bad_version:
+        return False
+    return True
+
+
+# string identifier -> (predicate, [VersionedTypeRegistration])
+_REVIVED_TYPE_REGISTRY = {}
+_TYPE_IDENTIFIERS = []
+
+
+def register_revived_type(identifier, predicate, versions):
+  """Register a type for revived objects.
+
+  Args:
+    identifier: A unique string identifying this class of objects.
+    predicate: A Boolean predicate for this registration. Takes a
+      checkpointable object as an argument. If True, `type_registration` may be
+      used to save and restore the object.
+    versions: A list of `VersionedTypeRegistration` objects.
+  """
+  # Keep registrations in order of version. We always use the highest matching
+  # version (respecting the min consumer version and bad consumers).
+  versions.sort(key=lambda reg: reg.version, reverse=True)
+  if not versions:
+    raise AssertionError("Need at least one version of a registered type.")
+  version_numbers = set()
+  for registration in versions:
+    # Copy over the identifier for use in generating protos
+    registration.identifier = identifier
+    if registration.version in version_numbers:
+      raise AssertionError(
+          "Got multiple registrations with version {} for type {}".format(
+              registration.version, identifier))
+    version_numbers.add(registration.version)
+  if identifier in _REVIVED_TYPE_REGISTRY:
+    raise AssertionError(
+        "Duplicate registrations for type {}".format(identifier))
+
+  _REVIVED_TYPE_REGISTRY[identifier] = (predicate, versions)
+  _TYPE_IDENTIFIERS.append(identifier)
+
+
+def get_attribute_extractors(obj):
+  """Get a `dir` and `getattr` equivalent for use with `obj`."""
+  for identifier in _TYPE_IDENTIFIERS:
+    predicate, versions = _REVIVED_TYPE_REGISTRY[identifier]
+    if predicate(obj):
+      return versions[0].attribute_extractor, versions[0].getter
+  return dir, getattr
+
+
+def serialize(obj):
+  """Create a SavedUserObject from a checkpointable object."""
+  for identifier in _TYPE_IDENTIFIERS:
+    predicate, versions = _REVIVED_TYPE_REGISTRY[identifier]
+    if predicate(obj):
+      # Always uses the most recent version to serialize.
+      return versions[0].to_proto()
+  return None
+
+
+def deserialize(proto):
+  """Create a checkpointable object from a SavedUserObject proto.
+
+  Args:
+    proto: A SavedUserObject to deserialize.
+
+  Returns:
+    A tuple of (checkpointable, assignment_fn) where assignment_fn has the same
+    signature as setattr and should be used to add dependencies to
+    `checkpointable` when they are available.
+  """
+  _, type_registrations = _REVIVED_TYPE_REGISTRY.get(
+      proto.identifier, (None, None))
+  if type_registrations is not None:
+    for type_registration in type_registrations:
+      if type_registration.should_load(proto):
+        return (type_registration.from_proto(proto), type_registration.setter)
+  return None
diff --git a/tensorflow/python/saved_model/revived_types_test.py b/tensorflow/python/saved_model/revived_types_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ede5922b801e1d6606d3d86059a03eee60433ad8
--- /dev/null
+++ b/tensorflow/python/saved_model/revived_types_test.py
@@ -0,0 +1,110 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for revived type matching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import versions_pb2
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.training.checkpointable import tracking
+
+
+class CustomTestClass(tracking.AutoCheckpointable):
+
+  def __init__(self, version):
+    self.version = version
+
+
+revived_types.register_revived_type(
+    "test_type",
+    lambda obj: isinstance(obj, CustomTestClass),
+    versions=[
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(1),
+            version=1, min_producer_version=1,
+            min_consumer_version=1),
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(2),
+            version=2, min_producer_version=2, min_consumer_version=1),
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(3),
+            version=3, min_producer_version=3, min_consumer_version=2),
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(4),
+            version=4, min_producer_version=4, min_consumer_version=2,
+            bad_consumers=[3]),
+    ]
+)
+
+
+class RegistrationMatchingTest(test.TestCase):
+
+  def test_save_typecheck(self):
+    self.assertIs(revived_types.serialize(tracking.AutoCheckpointable()), None)
+
+  def test_load_identifier_not_found(self):
+    nothing_matches = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="_unregistered_type",
+            version=versions_pb2.VersionDef(
+                producer=1,
+                min_consumer=1,
+                bad_consumers=[])))
+    self.assertIs(nothing_matches, None)
+
+  def test_most_recent_version_saved(self):
+    serialized = revived_types.serialize(CustomTestClass(None))
+    self.assertEqual([3], serialized.version.bad_consumers)
+    deserialized, _ = revived_types.deserialize(serialized)
+    self.assertIsInstance(deserialized, CustomTestClass)
+    self.assertEqual(4, deserialized.version)
+
+  def test_min_consumer_version(self):
+    nothing_matches = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="test_type",
+            version=versions_pb2.VersionDef(
+                producer=5,
+                min_consumer=5,
+                bad_consumers=[])))
+    self.assertIs(nothing_matches, None)
+
+  def test_bad_versions(self):
+    deserialized, _ = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="test_type",
+            version=versions_pb2.VersionDef(
+                producer=5,
+                min_consumer=1,
+                bad_consumers=[4, 3])))
+    self.assertEqual(2, deserialized.version)
+
+  def test_min_producer_version(self):
+    deserialized, _ = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="test_type",
+            version=versions_pb2.VersionDef(
+                producer=3,
+                min_consumer=0,
+                bad_consumers=[])))
+    self.assertEqual(3, deserialized.version)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index e69343208c34ba22c33e355ff7ac092cb50bab75..20af464edface1813cd82504023dd51cefb9cf61 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -22,6 +22,7 @@ import collections
 import functools
 import os
 
+from tensorflow.core.framework import versions_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.eager import context
@@ -38,6 +39,8 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.saved_model import builder_impl
 from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import function_serialization
+from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.saved_model import revived_types
 from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
@@ -53,65 +56,151 @@ from tensorflow.python.util.tf_export import tf_export
 DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
 
 
-def _find_function_to_export(root):
-  """Iterate over `root`'s attributes, finding traced functions."""
-  exported_function = None
-  previous_attribute_name = None
-  for attribute_name in dir(root):
-    attribute_value = getattr(root, attribute_name, None)
-    if isinstance(attribute_value, def_function.PolymorphicFunction):
-      if exported_function is not None:
-        raise ValueError(
-            ("Exporting an object with no "
-             "tf.saved_model.save(..., signatures=...) "
-             "argument specified, and with more than one "
-             "@tf.function-decorated method attached to it: {}. The signature "
-             "keys for these functions are ambiguous. Specify signature "
-             "functions explicitly.").format(
-                 [previous_attribute_name, attribute_name]))
-      exported_function = attribute_value
-      previous_attribute_name = attribute_name
-  if exported_function is None:
-    exported_function = getattr(root, DEFAULT_SIGNATURE_ATTR, None)
-  if exported_function is None:
-    raise ValueError(
-        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
-         "argument specified, and with no @tf.function-decorated methods "
-         "attached to it. In the future this will be a supported use-case for "
-         "Python re-import, but at the moment saving a SavedModel without "
-         "signatures does not make sense, as the only consumers will expect "
-         "signatures. Either decorate a method or specify a signature function "
-         "explicitly."))
-  return exported_function
+class _SaveableView(object):
+  """Provides a stable view over a checkpointable root.
+
+  This class helps creating a single stable view over an object to save. The
+  saving code should access properties and functions via this class and not via
+  the original object as there are cases where an object construct their
+  checkpointable attributes and functions dynamically per call and will yield
+  different objects if invoked more than once.
+  """
+
+  def __init__(self, root):
+    checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
+    self.nodes = checkpointable_objects
+    self.node_ids = node_ids
+    self.slot_variables = slot_variables
+    self.functions = util.ObjectIdentityDictionary()
+    self.concrete_functions = []
+
+    # Also add `Function`s as nodes.
+    nodes_without_functions = list(self.nodes)
+    seen_function_names = set()
+    for obj in nodes_without_functions:
+      self.functions[obj] = self._list_functions(obj)
+      for function in self.functions[obj].values():
+        if function not in self.node_ids:
+          self.node_ids[function] = len(self.nodes)
+          self.nodes.append(function)
+          # Avoids recursing into functions to see if other functions are
+          # assigned to attributes. This is sometimes true for concrete
+          # functions but not helpful.
+          self.functions[function] = {}
+        if isinstance(function, def_function.Function):
+          # Force listing the concrete functions for the side effects:
+          #  - populate the cache for functions that have an input_signature
+          #  and have not been called.
+          #  - force side effects of creation of concrete functions, e.g. create
+          #  variables on first run.
+          concrete_functions = (
+              function._list_all_concrete_functions_for_serialization())  # pylint: disable=protected-access
+        else:
+          concrete_functions = [function]
+        for concrete_function in concrete_functions:
+          if concrete_function.name not in seen_function_names:
+            seen_function_names.add(concrete_function.name)
+            self.concrete_functions.append(concrete_function)
+
+  @property
+  def root(self):
+    return self.nodes[0]
+
+  def fill_object_graph_proto(self, proto):
+    """Populate the nodes, children and slot_variables of a SavedObjectGraph."""
+    for node_id, node in enumerate(self.nodes):
+      assert self.node_ids[node] == node_id
+      object_proto = proto.nodes.add()
+      object_proto.slot_variables.extend(self.slot_variables.get(node, ()))
+      if isinstance(node, (def_function.Function, defun.ConcreteFunction)):
+        continue
+      for child in node._checkpoint_dependencies:  # pylint: disable=protected-access
+        child_proto = object_proto.children.add()
+        child_proto.node_id = self.node_ids[child.ref]
+        child_proto.local_name = child.name
+      for local_name, ref_function in self.functions[node].items():
+        child_proto = object_proto.children.add()
+        child_proto.node_id = self.node_ids[ref_function]
+        child_proto.local_name = local_name
+
+  def _list_functions(self, checkpointable_object):
+    """Return a dict of `Function`s of a checkpointable."""
+    functions = dict()
+    attribute_extractor, attribute_getter = (
+        revived_types.get_attribute_extractors(checkpointable_object))
+    for attribute_name in attribute_extractor(checkpointable_object):
+      try:
+        attribute_value = attribute_getter(
+            checkpointable_object, attribute_name, None)
+      except Exception:  # pylint: disable=broad-except
+        # We really don't want to throw an exception just because some object's
+        # attribute accessor is broken.
+        attribute_value = None
+      if isinstance(attribute_value, (def_function.Function,
+                                      defun.ConcreteFunction)):
+        functions[attribute_name] = attribute_value
+    return functions
+
+
+def _get_signature(function):
+  if (isinstance(function, (defun.Function, def_function.Function)) and
+      function._input_signature is not None):  # pylint: disable=protected-access
+    function = function.get_concrete_function()
+  if not isinstance(function, defun.ConcreteFunction):
+    return None
+  return function
+
+
+def _valid_signature(concrete_function):
+  """Returns whether concrete function can be converted to a signature."""
+  if not concrete_function.outputs:
+    # Functions without outputs don't make sense as signatures. We just don't
+    # have any way to run an Operation with no outputs as a SignatureDef in the
+    # 1.x style.
+    return False
+  try:
+    _normalize_outputs(concrete_function.structured_outputs, "unused", "unused")
+  except ValueError:
+    return False
+  return True
+
+
+def _find_function_to_export(saveable_view):
+  """Function to export, None if no suitable function was found."""
+  # If the user did not specify signatures, check the root object for a function
+  # that can be made into a signature.
+  functions = saveable_view.functions[saveable_view.root]
+  signature = functions.get(DEFAULT_SIGNATURE_ATTR, None)
+  if signature is not None:
+    return signature
+
+  # TODO(andresp): Discuss removing this behaviour. It can lead to WTFs when a
+  # user decides to annotate more functions with tf.function and suddenly
+  # serving that model way later in the process stops working.
+  if len(functions) == 1:
+    single_function = list(functions.values())[0]
+    signature = _get_signature(single_function)
+    if signature and  _valid_signature(signature):
+      return signature
+  return None
 
 
 def _canonicalize_signatures(signatures):
   """Converts `signatures` into a dictionary of concrete functions."""
+  if signatures is None:
+    return {}
   if not isinstance(signatures, collections.Mapping):
     signatures = {
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
   concrete_signatures = {}
-  for serving_key, signature_function in signatures.items():
-    if isinstance(signature_function, (defun.PolymorphicFunction,
-                                       def_function.PolymorphicFunction)):
-      input_signature = signature_function._input_signature  # pylint: disable=protected-access
-      if input_signature is None:
-        raise ValueError(
-            ("Unable to use the function {} as a signature directly. Functions "
-             "used to generate serving signatures must either have an "
-             "`input_signature=` specified when constructed, or must be "
-             "converted to concrete functions using "
-             "`f.get_concrete_function(...)`.").format(signature_function))
-      signature_function = signature_function.get_concrete_function()
-    elif not isinstance(signature_function, defun.Function):
+  for signature_key, function in signatures.items():
+    signature_function = _get_signature(function)
+    if signature_function is None:
       raise ValueError(
           ("Expected a TensorFlow function to generate a signature for, but "
-           "got {}. Python functions may be decorated with "
-           "`@tf.function(input_signature=...)` and passed as signatures "
-           "directly, or created without a signature using `@tf.function` "
-           "and then converted to a concrete TensorFlow function using "
-           "`f.get_concrete_function(...)`.").format(signature_function))
-    concrete_signatures[serving_key] = signature_function
+           "got {}. Only `tf.functions` with an input signature or "
+           "concrete functions can be used as a signature.").format(function))
+    concrete_signatures[signature_key] = signature_function
   return concrete_signatures
 
 
@@ -156,7 +245,7 @@ def _normalize_outputs(outputs, function_name, signature_key):
 
 
 def _tensor_dict_to_tensorinfo(tensor_dict):
-  return {key: utils_impl.build_tensor_info(value)
+  return {key: utils_impl.build_tensor_info_internal(value)
           for key, value in tensor_dict.items()}
 
 
@@ -430,13 +519,13 @@ def _map_resources(accessible_objects):
   return object_map, resource_map, asset_info
 
 
-def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
+def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
                          object_saver):
   """Generates a MetaGraph which calls `signature_functions`.
 
   Args:
     meta_graph_def: The MetaGraphDef proto to fill.
-    obj: The checkpointable object being exported.
+    saveable_view: The _SaveableView being exported.
     signature_functions: A dictionary mapping signature keys to concrete
       functions containing signatures to add to the MetaGraph.
     object_saver: A CheckpointableSaver to add to the MetaGraph.
@@ -444,10 +533,9 @@ def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
   Returns:
     An _AssetInfo, which contains information to help creating the SavedModel.
   """
-  signatures = {}
   # List objects from the eager context to make sure Optimizers give us the
   # right Graph-dependent variables.
-  accessible_objects = util.list_objects(obj)
+  accessible_objects = saveable_view.nodes
   resource_initializer_functions = _trace_resource_initializers(
       accessible_objects)
   exported_graph = ops.Graph()
@@ -482,19 +570,9 @@ def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
   # the exported graph (thus the `to_graph` argument).
   saver = object_saver.freeze(object_map=object_map, to_graph=exported_graph)
 
-  # We must instantiate and list all concrete functions of polymorphic functions
-  # while in eager mode so they end up added to the graph and can later be used
-  # by the object based saved model.
-  concrete_functions = []
-  for accessible_object in accessible_objects:
-    for function in function_serialization.list_all_polymorphic_functions(
-        accessible_object).values():
-      concrete_functions.extend(
-          function_serialization.list_all_concrete_functions(function))
-
   with exported_graph.as_default():
     signatures = _generate_signatures(signature_functions, resource_map)
-    for _, concrete_function in concrete_functions:
+    for concrete_function in saveable_view.concrete_functions:
       concrete_function.add_to_graph()
     saver_def = saver.to_proto()
     meta_graph_def.saver_def.CopyFrom(saver_def)
@@ -512,30 +590,31 @@ def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
   return asset_info
 
 
-def _write_object_graph(root, export_dir, asset_file_def_index):
+def _write_object_graph(saveable_view, export_dir, asset_file_def_index):
   """Save a SavedObjectGraph proto for `root`."""
   # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
   # checkpoint. It will eventually go into the SavedModel.
   proto = saved_object_graph_pb2.SavedObjectGraph()
-
-  checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
-  util.fill_object_graph_proto(checkpointable_objects, node_ids, slot_variables,
-                               proto)
+  saveable_view.fill_object_graph_proto(proto)
 
   node_ids = util.ObjectIdentityDictionary()
-  for i in range(len(checkpointable_objects)):
-    obj = checkpointable_objects[i]
+  for i, obj in enumerate(saveable_view.nodes):
     node_ids[obj] = i
     if resource_variable_ops.is_resource_variable(obj):
       node_ids[obj.handle] = i
     elif isinstance(obj, tracking.TrackableAsset):
       node_ids[obj.asset_path.handle] = i
 
-  for obj, obj_proto in zip(checkpointable_objects, proto.nodes):
-    _write_object_proto(obj, obj_proto, asset_file_def_index)
+  coder = nested_structure_coder.StructureCoder()
+  for concrete_function in saveable_view.concrete_functions:
+    serialized = function_serialization.serialize_concrete_function(
+        concrete_function, node_ids, coder)
+    if serialized is not None:
+      proto.concrete_functions[concrete_function.name].CopyFrom(
+          serialized)
 
-  function_serialization.add_polymorphic_functions_to_object_graph_proto(
-      checkpointable_objects, proto, node_ids)
+  for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
+    _write_object_proto(obj, obj_proto, asset_file_def_index)
 
   extra_asset_dir = os.path.join(
       compat.as_bytes(export_dir),
@@ -553,10 +632,24 @@ def _write_object_proto(obj, proto, asset_file_def_index):
     proto.asset.asset_file_def_index = asset_file_def_index[obj]
   elif resource_variable_ops.is_resource_variable(obj):
     proto.variable.SetInParent()
+    proto.variable.trainable = obj.trainable
     proto.variable.dtype = obj.dtype.as_datatype_enum
     proto.variable.shape.CopyFrom(obj.shape.as_proto())
+  elif isinstance(obj, def_function.Function):
+    proto.function.CopyFrom(
+        function_serialization.serialize_function(obj))
+  elif isinstance(obj, defun.ConcreteFunction):
+    proto.bare_concrete_function.CopyFrom(
+        function_serialization.serialize_bare_concrete_function(obj))
   else:
-    proto.user_object.SetInParent()
+    registered_type_proto = revived_types.serialize(obj)
+    if registered_type_proto is None:
+      # Fallback for types with no matching registration
+      registered_type_proto = saved_object_graph_pb2.SavedUserObject(
+          identifier="_generic_user_object",
+          version=versions_pb2.VersionDef(
+              producer=1, min_consumer=1, bad_consumers=[]))
+    proto.user_object.CopyFrom(registered_type_proto)
 
 
 @tf_export("saved_model.save", v1=["saved_model.experimental.save"])
@@ -726,23 +819,30 @@ def save(obj, export_dir, signatures=None):
             "tf.enable_eager_execution() must run first when calling it from "
             "TensorFlow 1.x.")
   # pylint: enable=line-too-long
-  if not isinstance(obj, base.CheckpointableBase):
+  if not isinstance(obj, base.Checkpointable):
     raise ValueError(
         "Expected a Checkpointable object for export, got {}.".format(obj))
+
+  # Use _SaveableView to provide a stable listing of properties and functions.
+  # Note we run this twice since, while constructing the view the first time
+  # there can be side effects of creating variables.
+  _ = _SaveableView(obj)
+  saveable_view = _SaveableView(obj)
+
   if signatures is None:
-    # Note that we run this before saving the checkpoint, since looping over
-    # attributes may have the side effect of creating variables in some cases.
-    signatures = _find_function_to_export(obj)
+    signatures = _find_function_to_export(saveable_view)
 
   signatures = _canonicalize_signatures(signatures)
+
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
   # making a SavedModel proto and writing it directly.
   saved_model = saved_model_pb2.SavedModel()
   meta_graph_def = saved_model.meta_graphs.add()
+  # TODO(andresp): Should this be using saveable_view?
   object_saver = util.CheckpointableSaver(obj)
   asset_info = _fill_meta_graph_def(
-      meta_graph_def, obj, signatures, object_saver)
+      meta_graph_def, saveable_view, signatures, object_saver)
   saved_model.saved_model_schema_version = (
       constants.SAVED_MODEL_SCHEMA_VERSION)
   # So far we've just been generating protocol buffers with no I/O. Now we write
@@ -756,4 +856,4 @@ def save(obj, export_dir, signatures=None):
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
   file_io.write_string_to_file(path, saved_model.SerializeToString())
-  _write_object_graph(obj, export_dir, asset_info.asset_index)
+  _write_object_graph(saveable_view, export_dir, asset_info.asset_index)
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index cffc1ec2021bf34de7fc8362d1e8b226c43b294c..fa79e45bf180f0ff5383cdeb6a75ab48402fd0a5 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -24,6 +24,7 @@ import sys
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -31,6 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
@@ -39,7 +41,6 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import adam
 from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
 
@@ -48,7 +49,7 @@ class _ModelWithOptimizer(util.Checkpoint):
 
   def __init__(self):
     self.dense = core.Dense(1)
-    self.optimizer = adam.AdamOptimizer(0.01)
+    self.optimizer = adam.Adam(0.01)
 
   @def_function.function(
       input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
@@ -85,7 +86,7 @@ def _import_and_infer(
 class SaveTest(test.TestCase):
 
   def test_method_save_signature(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -97,7 +98,7 @@ class SaveTest(test.TestCase):
         _import_and_infer(save_dir, {"x": 1.}))
 
   def test_method_save_concrete(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(
         lambda z: {"out": 2. * z})
     root.f(constant_op.constant(1.))
@@ -113,16 +114,16 @@ class SaveTest(test.TestCase):
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(lambda x: 2. * x)
     root.f(constant_op.constant(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(
-        ValueError, "must be converted to concrete functions"):
+        ValueError, "Expected a TensorFlow function"):
       save.save(root, save_dir, root.f)
 
   def test_nested_inputs(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(
         lambda x: 2. * x[0],
         input_signature=([tensor_spec.TensorSpec(None, dtypes.float32),
@@ -135,7 +136,7 @@ class SaveTest(test.TestCase):
       root.f.get_concrete_function()
 
   def test_nested_outputs(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x)))
     root.f(constant_op.constant(1.))
     to_save = root.f.get_concrete_function(constant_op.constant(1.))
@@ -156,7 +157,7 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, to_save)
 
   def test_variable(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.v1 = variables.Variable(3.)
     root.v2 = variables.Variable(2.)
     root.f = def_function.function(
@@ -181,11 +182,6 @@ class SaveTest(test.TestCase):
         second_loss,
         _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
 
-  def test_trivial_save_exception(self):
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    with self.assertRaisesRegexp(ValueError, "signature"):
-      save.save(tracking.Checkpointable(), save_dir)
-
   def test_single_method_default_signature(self):
     model = _ModelWithOptimizer()
     x = constant_op.constant([[3., 4.]])
@@ -198,7 +194,7 @@ class SaveTest(test.TestCase):
                                     {"x": [[3., 4.]], "y": [2.]}))
 
   def test_single_function_default_signature(self):
-    model = tracking.Checkpointable()
+    model = tracking.AutoCheckpointable()
     model.f = def_function.function(lambda: 3., input_signature=())
     model.f()
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -206,28 +202,11 @@ class SaveTest(test.TestCase):
     self.assertAllClose({"output_0": 3.},
                         _import_and_infer(save_dir, {}))
 
-  def test_ambiguous_signatures(self):
-    model = _ModelWithOptimizer()
-    x = constant_op.constant([[3., 4.]])
-    y = constant_op.constant([2.])
-    model.call(x, y)
-    model.second_function = def_function.function(lambda: 1.)
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    with self.assertRaisesRegexp(ValueError, "call.*second_function"):
-      save.save(model, save_dir)
-
-  def test_no_signature(self):
-
-    class Model(util.Checkpoint):
-
-      def call(self, inputs):
-        return inputs * 2.
-
+  def test_single_function_no_signature(self):
+    model = tracking.AutoCheckpointable()
+    model.f = def_function.function(lambda: 3.)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    model = Model()
-    with self.assertRaisesRegexp(
-        ValueError, "no @tf.function-decorated methods"):
-      save.save(model, save_dir)
+    save.save(model, save_dir)
 
   def test_find_default_save_function(self):
 
@@ -343,7 +322,7 @@ class AssetTests(test.TestCase):
         _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
 
   def test_unused_asset(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -371,10 +350,32 @@ class AssetTests(test.TestCase):
         save.save(root, export_dir)
 
 
+class _ModelWithOptimizerUsingDefun(util.Checkpoint):
+
+  def __init__(self):
+    self.dense = core.Dense(1)
+    self.optimizer = adam.Adam(0.01)
+
+  # Using defun due to control flow v2 cycles, b/121159261. def_function uses
+  # conds to gate variable initialization and so triggers cond reference cycles,
+  # but the thing being wrapped here does not use cond itself.
+  @function.defun(
+      input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
+                       tensor_spec.TensorSpec([None], dtypes.float32)),
+  )
+  def call(self, x, y):
+    with backprop.GradientTape() as tape:
+      loss = math_ops.reduce_mean((self.dense(x) - y) ** 2.)
+    trainable_variables = self.dense.trainable_variables
+    gradients = tape.gradient(loss, trainable_variables)
+    self.optimizer.apply_gradients(zip(gradients, trainable_variables))
+    return {"loss": loss}
+
+
 class MemoryTests(test.TestCase):
 
   def setUp(self):
-    self._model = _ModelWithOptimizer()
+    self._model = _ModelWithOptimizerUsingDefun()
 
   @test_util.assert_no_garbage_created
   def test_no_reference_cycles(self):
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/python/saved_model/saved_object_graph.proto
index f46927d6e8734efdff028acb36983200b2a5bd1a..1e8743f681f6ffe139fe8fb910419ac90822fca2 100644
--- a/tensorflow/python/saved_model/saved_object_graph.proto
+++ b/tensorflow/python/saved_model/saved_object_graph.proto
@@ -3,6 +3,7 @@ syntax = "proto3";
 import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
+import "tensorflow/core/framework/versions.proto";
 import "tensorflow/python/saved_model/struct.proto";
 
 option cc_enable_arenas = true;
@@ -26,6 +27,10 @@ message SavedObjectGraph {
   // The position of the object in this list indicates its id.
   // Nodes[0] is considered the root node.
   repeated SavedObject nodes = 1;
+
+  // Information about captures and output structures in concrete functions.
+  // Referenced from SavedBareConcreteFunction and SavedFunction.
+  map<string, SavedConcreteFunction> concrete_functions = 2;
 }
 
 message SavedObject {
@@ -51,8 +56,9 @@ message SavedObject {
   oneof kind {
     SavedUserObject user_object = 4;
     SavedAsset asset = 5;
-    SavedPolymorphicFunction function = 6;
+    SavedFunction function = 6;
     SavedVariable variable = 7;
+    SavedBareConcreteFunction bare_concrete_function = 8;
   }
 }
 
@@ -62,7 +68,12 @@ message SavedObject {
 //
 // This object cannot be evaluated as a tensor, and therefore cannot be bound
 // to an input of a function.
-message SavedUserObject {}
+message SavedUserObject {
+  // Corresponds to a registration of the type to use in the loading program.
+  string identifier = 1;
+  // Version information from the producer of this SavedUserObject.
+  VersionDef version = 2;
+}
 
 // A SavedAsset represents a file in a SavedModel.
 //
@@ -78,17 +89,14 @@ message SavedAsset {
 }
 
 // A function with multiple signatures, possibly with non-Tensor arguments.
-message SavedPolymorphicFunction {
-  repeated SavedMonomorphicFunction monomorphic_function = 1;
-  // Tuple representing a `FunctionSpec`.
-  // TODO(vbardiovsky): Make this a proto.
-  StructuredValue function_spec_tuple = 2;
+message SavedFunction {
+  repeated string concrete_functions = 1;
+  FunctionSpec function_spec = 2;
 }
 
-message SavedMonomorphicFunction {
-  // A reference to a TensorFlow function in the MetaGraph's FunctionDefLibrary
-  string concrete_function = 1;
-
+// Stores low-level information about a concrete function. Referenced in either
+// a SavedFunction or a SavedBareConcreteFunction.
+message SavedConcreteFunction {
   // Bound inputs to the function. The SavedObjects identified by the node ids
   // given here are appended as extra inputs to the caller-supplied inputs.
   // The only types of SavedObjects valid here are SavedVariable, SavedResource
@@ -96,7 +104,21 @@ message SavedMonomorphicFunction {
   repeated int32 bound_inputs = 2;
   // Input in canonicalized form that was received to create this concrete
   // function.
-  StructuredValue canonicalized_input = 3;
+  StructuredValue canonicalized_input_signature = 3;
+  // Output that was the return value of this function after replacing all
+  // Tensors with TensorSpecs. This can be an arbitrary nested function and will
+  // be used to reconstruct the full structure from pure tensors.
+  StructuredValue output_signature = 4;
+}
+
+message SavedBareConcreteFunction {
+  // Identifies a SavedConcreteFunction.
+  string concrete_function_name = 1;
+
+  // A sequence of unique strings, one per Tensor argument.
+  repeated string argument_keywords = 2;
+  // The prefix of `argument_keywords` which may be identified by position.
+  int64 allowed_positional_arguments = 3;
 }
 
 // Represents a Variable that is initialized by loading the contents from the
@@ -104,6 +126,24 @@ message SavedMonomorphicFunction {
 message SavedVariable {
   DataType dtype = 1;
   TensorShapeProto shape = 2;
+  bool trainable = 3;
+
+  // TODO(andresp): Add save_slice_info_def?
+}
 
-  // TODO(andresp): Add "trainable" and save_slice_info_def.
+// Represents `FunctionSpec` used in `Function`. This represents a
+// function that has been wrapped as a TensorFlow `Function`.
+message FunctionSpec {
+  // Full arg spec from inspect.getfullargspec().
+  StructuredValue fullargspec = 1;
+  // Whether this represents a class method.
+  bool is_method = 2;
+  // Which arguments to always prepend, in case the original function is based
+  // on a functools.partial.
+  StructuredValue args_to_prepend = 3;
+  // Which kwargs to always include, in case the original function is based on a
+  // functools.partial.
+  StructuredValue kwargs_to_include = 4;
+  // The input signature, if specified.
+  StructuredValue input_signature = 5;
 }
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 5caabe59fec1a0819629bd9ff16ad5be19f0890a..a82007fd545ca9e088411bcd5234477b8801e995 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -22,6 +22,7 @@ import os
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -53,7 +54,17 @@ def build_tensor_info(tensor):
 
   Returns:
     A TensorInfo protocol buffer constructed based on the supplied argument.
+
+  Raises:
+    RuntimeError: If eager execution is enabled.
   """
+  if context.executing_eagerly():
+    raise RuntimeError("build_tensor_info is not supported in Eager mode.")
+  return build_tensor_info_internal(tensor)
+
+
+def build_tensor_info_internal(tensor):
+  """Utility function to build TensorInfo proto from a Tensor."""
   tensor_info = meta_graph_pb2.TensorInfo(
       dtype=dtypes.as_dtype(tensor.dtype).as_datatype_enum,
       tensor_shape=tensor.get_shape().as_proto())
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index 2afe8abfd646f26f0562d7cc56b82c5781a586ef..1e12de91b8652328632010d716f75f551aaab2db 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -81,6 +82,12 @@ class UtilsTest(test.TestCase):
     self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
     self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
 
+  def testBuildTensorInfoEager(self):
+    x = constant_op.constant(1, name="x")
+    with context.eager_mode(), self.assertRaisesRegexp(
+        RuntimeError, "build_tensor_info is not supported in Eager mode"):
+      utils.build_tensor_info(x)
+
   @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoDense(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 0c13016712f316e113723c4c0c250ef636a3fcf0..a01feb3dde041de2ca33f5f4d9fea6a1b6869d41 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor summaries for exporting information about a model.
+"""Operations for writing summary data, for use in analysis and visualization.
 
-See the [Summary](https://tensorflow.org/api_guides/python/summary) guide.
+See the [Summaries and
+TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 321b11ffb73487405428340df94010ed8ddbfcd4..3675c235cfba1063bf2e338fd223dce6c540bec6 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -24,7 +24,7 @@ from tensorflow.python.lib.io import tf_record
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('train.summary_iterator')
+@tf_export(v1=['train.summary_iterator'])
 def summary_iterator(path):
   # pylint: disable=line-too-long
   """An iterator for reading `Event` protocol buffers from an event file.
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 78217b503ffac90811c6ae8316bc0c0b907e7bf7..a66be4f833713d106deda15fef56f48ef4a321d3 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -279,7 +279,7 @@ class SummaryToEventTransformer(object):
     self.event_writer.add_event(event)
 
 
-@tf_export("summary.FileWriter")
+@tf_export(v1=["summary.FileWriter"])
 class FileWriter(SummaryToEventTransformer):
   """Writes `Summary` protocol buffers to event files.
 
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index 645fa28a37fb125b6b1224961251bc8879d5fe6d..c62a7ce1a3f6eb6cd223f70dabd478b2dba24394 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -25,7 +25,7 @@ from tensorflow.python.summary.writer.writer import FileWriter
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('summary.FileWriterCache')
+@tf_export(v1=['summary.FileWriterCache'])
 class FileWriterCache(object):
   """Cache for file writers.
 
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 901d6bc335f3a10439e2f02d0db2b237a89fece0..f1a911eb489970cb6a594258e5fcf69e70f91fcd 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -38,7 +38,20 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow/contrib/saved_model:reader"],
+)
+
+py_test(
+    name = "saved_model_utils_test",
+    size = "small",
+    srcs = ["saved_model_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":saved_model_utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/saved_model",
+    ],
 )
 
 py_library(
@@ -250,7 +263,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":saved_model_utils",
-        "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python",
         "//tensorflow/python/debug:local_cli_wrapper",
     ],
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 8a3d6b31fd2fd977082f4b1baacd50b1a5719cff..7637cbd584ae3b4e43fd74bbd5bd362733d800cf 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -4,6 +4,7 @@
 TENSORFLOW_API_INIT_FILES = [
     # BEGIN GENERATED FILES
     "__init__.py",
+    "audio/__init__.py",
     "autograph/__init__.py",
     "autograph/experimental/__init__.py",
     "bitwise/__init__.py",
@@ -22,7 +23,6 @@ TENSORFLOW_API_INIT_FILES = [
     "image/__init__.py",
     "io/__init__.py",
     "queue/__init__.py",
-    "initializers/__init__.py",
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
@@ -77,6 +77,7 @@ KERAS_API_INIT_FILES = [
     "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
+    "keras/layers/experimental/__init__.py",
     "keras/losses/__init__.py",
     "keras/metrics/__init__.py",
     "keras/models/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 2d5898f31c942f1c8671dff658ee39bb8f979a62..5db84b1d0be1b9ebbefb7d810598f0448901035b 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -5,6 +5,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     # BEGIN GENERATED FILES
     "__init__.py",
     "app/__init__.py",
+    "audio/__init__.py",
     "autograph/__init__.py",
     "autograph/experimental/__init__.py",
     "bitwise/__init__.py",
@@ -99,6 +100,7 @@ KERAS_API_INIT_FILES_V1 = [
     "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
+    "keras/layers/experimental/__init__.py",
     "keras/losses/__init__.py",
     "keras/metrics/__init__.py",
     "keras/models/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index b567eead3d0c8c3023322f95402662408152ce45..28bf0e9d015e6f4b28e8cfbf0dbb5a3ccec66f11 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -61,6 +61,7 @@ _TENSORFLOW_DOC_SOURCES = {
     'signal': DocSource(docstring_module_name='ops.signal.signal'),
     'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
     'strings': DocSource(docstring_module_name='ops.string_ops'),
+    'summary': DocSource(docstring_module_name='summary.summary'),
     'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
     'test': DocSource(docstring_module_name='platform.test'),
     'train': DocSource(docstring_module_name='training.training'),
diff --git a/tensorflow/python/tools/api/generator/output_init_files_test.py b/tensorflow/python/tools/api/generator/output_init_files_test.py
index ab154af9101e32ecacda276004b0e2c39ced0b83..7013f007e583b7d35dcb6f8bfdbea2fefdbb3101 100644
--- a/tensorflow/python/tools/api/generator/output_init_files_test.py
+++ b/tensorflow/python/tools/api/generator/output_init_files_test.py
@@ -45,7 +45,7 @@ def _get_modules(package, attr_name, constants_attr_name):
       API constant names.
 
   Returns:
-    Set of TensorFow API modules.
+    Set of TensorFlow API modules.
   """
   modules = set()
   # TODO(annarev): split up the logic in create_python_api.py so that
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index afc4e517cdd0a34171038cc0ae2d74ce30ecb6a9..cdef42e2bf8df4834677bb809194183332c6f279 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -30,9 +30,8 @@ import sys
 import warnings
 
 import numpy as np
-
 from six import integer_types
-from tensorflow.contrib.saved_model.python.saved_model import reader
+
 from tensorflow.core.example import example_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
@@ -56,7 +55,7 @@ def _show_tag_sets(saved_model_dir):
   Args:
     saved_model_dir: Directory containing the SavedModel to inspect.
   """
-  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
   print('The given SavedModel contains the following tag-sets:')
   for tag_set in sorted(tag_sets):
     print(', '.join(sorted(tag_set)))
@@ -190,7 +189,7 @@ def _show_all(saved_model_dir):
   Args:
     saved_model_dir: Directory containing the SavedModel to inspect.
   """
-  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
   for tag_set in sorted(tag_sets):
     print("\nMetaGraphDef with tag-set: '%s' "
           "contains the following SignatureDefs:" % ', '.join(tag_set))
@@ -654,7 +653,7 @@ def scan(args):
     scan_meta_graph_def(
         saved_model_utils.get_meta_graph_def(args.dir, args.tag_set))
   else:
-    saved_model = reader.read_saved_model(args.dir)
+    saved_model = saved_model_utils.read_saved_model(args.dir)
     for meta_graph_def in saved_model.meta_graphs:
       scan_meta_graph_def(meta_graph_def)
 
diff --git a/tensorflow/python/tools/saved_model_utils.py b/tensorflow/python/tools/saved_model_utils.py
index c27d7a2658a096d1f5ce515dbc1f86423eb113de..17c4b8cb8319363a4a2d422a563ae1227d673366 100644
--- a/tensorflow/python/tools/saved_model_utils.py
+++ b/tensorflow/python/tools/saved_model_utils.py
@@ -18,7 +18,78 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.saved_model.python.saved_model import reader
+import os
+
+from google.protobuf import message
+from google.protobuf import text_format
+from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.util import compat
+
+
+def read_saved_model(saved_model_dir):
+  """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel file.
+
+  Returns:
+    A `SavedModel` protocol buffer.
+
+  Raises:
+    IOError: If the file does not exist, or cannot be successfully parsed.
+  """
+  # Build the path to the SavedModel in pbtxt format.
+  path_to_pbtxt = os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
+  # Build the path to the SavedModel in pb format.
+  path_to_pb = os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+
+  # Ensure that the SavedModel exists at either path.
+  if not file_io.file_exists(path_to_pbtxt) and not file_io.file_exists(
+      path_to_pb):
+    raise IOError("SavedModel file does not exist at: %s" % saved_model_dir)
+
+  # Parse the SavedModel protocol buffer.
+  saved_model = saved_model_pb2.SavedModel()
+  if file_io.file_exists(path_to_pb):
+    try:
+      file_content = file_io.FileIO(path_to_pb, "rb").read()
+      saved_model.ParseFromString(file_content)
+      return saved_model
+    except message.DecodeError as e:
+      raise IOError("Cannot parse file %s: %s." % (path_to_pb, str(e)))
+  elif file_io.file_exists(path_to_pbtxt):
+    try:
+      file_content = file_io.FileIO(path_to_pbtxt, "rb").read()
+      text_format.Merge(file_content.decode("utf-8"), saved_model)
+      return saved_model
+    except text_format.ParseError as e:
+      raise IOError("Cannot parse file %s: %s." % (path_to_pbtxt, str(e)))
+  else:
+    raise IOError("SavedModel file does not exist at: %s/{%s|%s}" %
+                  (saved_model_dir, constants.SAVED_MODEL_FILENAME_PBTXT,
+                   constants.SAVED_MODEL_FILENAME_PB))
+
+
+def get_saved_model_tag_sets(saved_model_dir):
+  """Retrieves all the tag-sets available in the SavedModel.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel.
+
+  Returns:
+    String representation of all tag-sets in the SavedModel.
+  """
+  saved_model = read_saved_model(saved_model_dir)
+  all_tags = []
+  for meta_graph_def in saved_model.meta_graphs:
+    all_tags.append(list(meta_graph_def.meta_info_def.tags))
+  return all_tags
 
 
 def get_meta_graph_def(saved_model_dir, tag_set):
@@ -39,7 +110,7 @@ def get_meta_graph_def(saved_model_dir, tag_set):
   Returns:
     A MetaGraphDef corresponding to the tag-set.
   """
-  saved_model = reader.read_saved_model(saved_model_dir)
+  saved_model = read_saved_model(saved_model_dir)
   set_of_tags = set(tag_set.split(','))
   for meta_graph_def in saved_model.meta_graphs:
     if set(meta_graph_def.meta_info_def.tags) == set_of_tags:
diff --git a/tensorflow/python/tools/saved_model_utils_test.py b/tensorflow/python/tools/saved_model_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5512dea1f74c8a27045c0036fb0d6df9681169bf
--- /dev/null
+++ b/tensorflow/python/tools/saved_model_utils_test.py
@@ -0,0 +1,116 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SavedModel utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.tools import saved_model_utils
+
+
+def tearDownModule():
+  file_io.delete_recursively(test.get_temp_dir())
+
+
+class SavedModelUtilTest(test.TestCase):
+
+  def _init_and_validate_variable(self, sess, variable_name, variable_value):
+    v = variables.Variable(variable_value, name=variable_name)
+    sess.run(variables.global_variables_initializer())
+    self.assertEqual(variable_value, v.eval())
+
+  @test_util.deprecated_graph_mode_only
+  def testReadSavedModelValid(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "valid_saved_model")
+    builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+    builder.save()
+
+    actual_saved_model_pb = saved_model_utils.read_saved_model(saved_model_dir)
+    self.assertEqual(len(actual_saved_model_pb.meta_graphs), 1)
+    self.assertEqual(
+        len(actual_saved_model_pb.meta_graphs[0].meta_info_def.tags), 1)
+    self.assertEqual(actual_saved_model_pb.meta_graphs[0].meta_info_def.tags[0],
+                     tag_constants.TRAINING)
+
+  def testReadSavedModelInvalid(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "invalid_saved_model")
+    with self.assertRaisesRegexp(
+        IOError, "SavedModel file does not exist at: %s" % saved_model_dir):
+      saved_model_utils.read_saved_model(saved_model_dir)
+
+  @test_util.deprecated_graph_mode_only
+  def testGetSavedModelTagSets(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "test_tags")
+    builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
+
+    # Graph with a single variable. SavedModel invoked to:
+    # - add with weights.
+    # - a single tag (from predefined constants).
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+
+    # Graph that updates the single variable. SavedModel invoked to:
+    # - simply add the model (weights are not updated).
+    # - a single tag (from predefined constants).
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 43)
+      builder.add_meta_graph([tag_constants.SERVING])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags for serving on TPU.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple custom tags.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
+      builder.add_meta_graph(["foo", "bar"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    actual_tags = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
+    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["serve", "tpu"],
+                     ["foo", "bar"]]
+    self.assertEqual(expected_tags, actual_tags)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 0c701f47122caf7ae561ddfa84b98925226930e0..81e03c735851ae67e3b332c956f7c91446da01d6 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Adam for TensorFlow."""
 from __future__ import absolute_import
 from __future__ import division
@@ -37,9 +36,14 @@ class AdamOptimizer(optimizer.Optimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
-  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
-               use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+  def __init__(self,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name="Adam"):
+    r"""Construct a new Adam optimizer.
 
     Initialization:
 
@@ -75,23 +79,20 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Args:
       learning_rate: A Tensor or a floating point value.  The learning rate.
-      beta1: A float value or a constant float tensor.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value or a constant float tensor.
-        The exponential decay rate for the 2nd moment estimates.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
-    `epsilon` can each be a callable that takes no arguments and returns the
-    actual value to use. This can be useful for changing these values across
-    different invocations of optimizer functions.
-    @end_compatibility
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a
+        callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
     """
     super(AdamOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
@@ -105,9 +106,6 @@ class AdamOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Created in SparseApply if needed.
-    self._updated_lr = None
-
   def _get_beta_accumulators(self):
     with ops.init_scope():
       if context.executing_eagerly():
@@ -123,12 +121,10 @@ class AdamOptimizer(optimizer.Optimizer):
     # workers (these need to go on the same PS, otherwise some updates are
     # silently ignored).
     first_var = min(var_list, key=lambda x: x.name)
-    self._create_non_slot_variable(initial_value=self._beta1,
-                                   name="beta1_power",
-                                   colocate_with=first_var)
-    self._create_non_slot_variable(initial_value=self._beta2,
-                                   name="beta2_power",
-                                   colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)
 
     # Create slots for the first and second moments.
     for v in var_list:
@@ -151,28 +147,34 @@ class AdamOptimizer(optimizer.Optimizer):
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
-        var, m, v,
+        var,
+        m,
+        v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
         math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
+        grad,
+        use_locking=self._use_locking).op
 
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
+        var.handle,
+        m.handle,
+        v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
         math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
+        grad,
+        use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
     beta1_power, beta2_power = self._get_beta_accumulators()
@@ -186,8 +188,7 @@ class AdamOptimizer(optimizer.Optimizer):
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
     m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
@@ -197,26 +198,29 @@ class AdamOptimizer(optimizer.Optimizer):
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
+        grad.values,
+        var,
+        grad.indices,
         lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking))
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
       return x.value()
 
   def _resource_apply_sparse(self, grad, var, indices):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add)
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
 
   def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
@@ -227,5 +231,5 @@ class AdamOptimizer(optimizer.Optimizer):
             beta1_power * self._beta1_t, use_locking=self._use_locking)
         update_beta2 = beta2_power.assign(
             beta2_power * self._beta2_t, use_locking=self._use_locking)
-    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
-                                  name=name_scope)
+    return control_flow_ops.group(
+        *update_ops + [update_beta1, update_beta2], name=name_scope)
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index a7ad1f70e5e86d2fcd86b76c54314238edd400e1..21fa6b3b5d3f8c306f0116f4d21940164c28b104 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -621,7 +621,8 @@ class CheckpointManager(object):
                >= self._last_preserved_timestamp)):
         self._last_preserved_timestamp = timestamp
         continue
-      remove_checkpoint(filename)
+      _delete_file_if_exists(filename + ".index")
+      _delete_file_if_exists(filename + ".data-?????-of-?????")
 
   def _record_state(self):
     """Saves the `CheckpointManager`'s state in `directory`."""
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 74b46179e75423b530191cce5a52034879712eaa..5e18f4b722b402a892125903ac82bf5991c385cd 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -180,8 +180,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
       (in default graph).
 
   Raises:
-    tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
-    ValueError: If missing variables in current graph.
+    ValueError: If missing variables in current graph, or if missing
+      checkpoints or tensors in checkpoints.
   """
   if distribution_strategy_context.get_cross_replica_context():
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index 3201c755afd5f4927a28f8b5de65c564144423aa..a39462732f591cb49bb4ee07a45a9efe732f589e 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -11,7 +11,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 py_library(
     name = "base",
@@ -31,11 +32,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "base_test",
     srcs = ["base_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":base",
         "//tensorflow/python:client_testlib",
     ],
@@ -51,11 +51,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tracking_test",
     srcs = ["tracking_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         "//tensorflow/python:client_testlib",
@@ -75,14 +74,14 @@ py_library(
     deps = [
         ":base",
         ":layer_utils",
+        "//tensorflow/python/saved_model:revived_types",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "data_structures_test",
     srcs = ["data_structures_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":data_structures",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
@@ -128,15 +127,15 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "util_test",
     srcs = ["util_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/74395663
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         ":util",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -159,20 +158,42 @@ py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
+    ],
+    tags = ["notsan"],  # b/74395663
+)
+
+tf_xla_py_test(
+    name = "util_xla_test",
+    srcs = ["util_xla_test.py"],
+    tags = [
+        "no_pip",
+        "nomac",
+        "notsan",  # b/74395663
+    ],
+    deps = [
+        ":tracking",
+        ":util",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "//tensorflow/python/keras/optimizer_v2",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "util_with_v1_optimizers_test",
     srcs = ["util_with_v1_optimizers_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/74395663
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         ":util",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -189,13 +210,13 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/distribute:mirrored_strategy",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
     ],
+    tags = ["notsan"],  # b/74395663
 )
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index c752f9ca7e005158f79f302c61a3580cdcaf8413..8257693055d0508c223eab8aeb6ff2e291515d4d 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -142,8 +142,10 @@ class PythonStringStateSaveable(PythonStateSaveable):
       state_callback: A function taking no arguments which returns a
         string. This function is run every time a checkpoint is written.
       restore_callback: A function taking a Python string, used to restore
-        state. Optional; defaults to doing nothing.
+        state. Optional; defaults to doing nothing, in which case it is ignored
+        by status assertions such as assert_consumed().
     """
+    self._has_trivial_state_callback = (restore_callback is None)
     def _state_callback_wrapper():
       with ops.init_scope():
         return state_callback()
@@ -156,14 +158,21 @@ class PythonStringStateSaveable(PythonStateSaveable):
     super(PythonStringStateSaveable, self).__init__(
         self._save_string, [spec], name)
 
+  @property
+  def optional_restore(self):
+    """For values with no restore, relaxes assert_consumed()."""
+    return self._has_trivial_state_callback
+
   def feed_dict_additions(self):
     """When running a graph, indicates fresh state to feed."""
     return {self._save_string: self._state_callback()}
 
   def freeze(self):
     """Create a frozen `SaveableObject` which saves the current state."""
+    def _constant_state():
+      return constant_op.constant(self._state_callback(), dtype=dtypes.string)
     return NoRestoreSaveable(
-        tensor=self._state_callback,
+        tensor=_constant_state,
         dtype=dtypes.string,
         name=self.name)
 
@@ -349,8 +358,9 @@ class _CheckpointPosition(object):
           # added or deleted. Stores unused attributes so an exception can be
           # raised if the user decides to check that everything in the
           # checkpoint was loaded.
-          self._checkpoint.unused_attributes.setdefault(
-              self.checkpointable, []).append(serialized_tensor.name)
+          if not serialized_tensor.optional_restore:
+            self._checkpoint.unused_attributes.setdefault(
+                self.checkpointable, []).append(serialized_tensor.name)
           continue
         if callable(saveable_factory):
           saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
@@ -450,16 +460,16 @@ def no_automatic_dependency_tracking(method):
       target=method, decorator_func=_method_wrapper)
 
 
-class CheckpointableBase(object):
+class Checkpointable(object):
   """Base class for `Checkpointable` objects without automatic dependencies.
 
   This class has no __setattr__ override for performance reasons. Dependencies
   must be added explicitly. Unless attribute assignment is performance-critical,
-  use `Checkpointable` instead. Use `CheckpointableBase` for `isinstance`
+  use `AutoCheckpointable` instead. Use `Checkpointable` for `isinstance`
   checks.
   """
 
-  # CheckpointableBase does not do automatic dependency tracking, but uses the
+  # Checkpointable does not do automatic dependency tracking, but uses the
   # no_automatic_dependency_tracking decorator so it can avoid adding
   # dependencies if a subclass is Checkpointable / inherits from Model (both of
   # which have __setattr__ overrides).
@@ -613,7 +623,7 @@ class CheckpointableBase(object):
     # assign again. It will add this variable to our dependencies, and if there
     # is a non-trivial restoration queued, it will handle that. This also
     # handles slot variables.
-    if not overwrite or isinstance(new_variable, CheckpointableBase):
+    if not overwrite or isinstance(new_variable, Checkpointable):
       return self._track_checkpointable(new_variable, name=name,
                                         overwrite=overwrite)
     else:
@@ -685,7 +695,7 @@ class CheckpointableBase(object):
       ValueError: If another object is already tracked by this name.
     """
     self._maybe_initialize_checkpointable()
-    if not isinstance(checkpointable, CheckpointableBase):
+    if not isinstance(checkpointable, Checkpointable):
       raise TypeError(
           ("Checkpointable._track_checkpointable() passed type %s, not a "
            "Checkpointable.") % (type(checkpointable),))
@@ -732,7 +742,7 @@ class CheckpointableBase(object):
       name: The name of the dependency within this object (`self`), used to
         match `checkpointable` with values saved in a checkpoint.
       checkpointable: The Checkpointable object to restore (inheriting from
-        `CheckpointableBase`).
+        `Checkpointable`).
     """
     self._maybe_initialize_checkpointable()
     checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
diff --git a/tensorflow/python/training/checkpointable/base_test.py b/tensorflow/python/training/checkpointable/base_test.py
index fd935ac559ed7cd607145e7b2433a00c1f8431ea..750799f03036bfddc188796210c7c3c29aa0e986 100644
--- a/tensorflow/python/training/checkpointable/base_test.py
+++ b/tensorflow/python/training/checkpointable/base_test.py
@@ -16,6 +16,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import os
+
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
@@ -26,13 +29,13 @@ from tensorflow.python.training.checkpointable import util
 class InterfaceTests(test.TestCase):
 
   def testOverwrite(self):
-    root = base.CheckpointableBase()
-    leaf = base.CheckpointableBase()
+    root = base.Checkpointable()
+    leaf = base.Checkpointable()
     root._track_checkpointable(leaf, name="leaf")
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(leaf, current_dependency)
     self.assertEqual("leaf", current_name)
-    duplicate_name_dep = base.CheckpointableBase()
+    duplicate_name_dep = base.Checkpointable()
     with self.assertRaises(ValueError):
       root._track_checkpointable(duplicate_name_dep, name="leaf")
     root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
@@ -41,7 +44,7 @@ class InterfaceTests(test.TestCase):
     self.assertEqual("leaf", current_name)
 
   def testAddVariableOverwrite(self):
-    root = base.CheckpointableBase()
+    root = base.Checkpointable()
     a = root._add_variable_with_custom_getter(
         name="v", shape=[], getter=variable_scope.get_variable)
     self.assertEqual([root, a], util.list_objects(root))
@@ -57,5 +60,30 @@ class InterfaceTests(test.TestCase):
             name="v", shape=[], overwrite=False,
             getter=variable_scope.get_variable)
 
+  def testAssertConsumedWithUnusedPythonState(self):
+    has_config = base.Checkpointable()
+    has_config.get_config = lambda: {}
+    saved = util.Checkpoint(obj=has_config)
+    save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    restored = util.Checkpoint(obj=base.Checkpointable())
+    restored.restore(save_path).assert_consumed()
+
+  def testAssertConsumedFailsWithUsedPythonState(self):
+    has_config = base.Checkpointable()
+    attributes = {
+        "foo_attr": functools.partial(
+            base.PythonStringStateSaveable,
+            state_callback=lambda: "",
+            restore_callback=lambda x: None)}
+    has_config._gather_saveables_for_checkpoint = lambda: attributes
+    saved = util.Checkpoint(obj=has_config)
+    save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    restored = util.Checkpoint(obj=base.Checkpointable())
+    status = restored.restore(save_path)
+    with self.assertRaisesRegexp(AssertionError, "foo_attr"):
+      status.assert_consumed()
+
+
 if __name__ == "__main__":
+  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index 817552f32696e34d123d1da5057388c1bd96139c..c86846f8e23d7e2de8e062e2c16e38a8bfcab423 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -19,10 +19,13 @@ from __future__ import print_function
 
 import collections
 import copy
+import operator
+import sys
 
 import six
 
 from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import revived_types
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import layer_utils
 
@@ -56,7 +59,7 @@ def _wrap_or_unwrap(value):
   """Wraps basic data structures, unwraps NoDependency objects."""
   if isinstance(value, NoDependency):
     return value.value
-  if isinstance(value, base.CheckpointableBase):
+  if isinstance(value, base.Checkpointable):
     return value  # Skip conversion for already checkpointable objects.
   elif isinstance(value, dict):
     return _DictWrapper(value)
@@ -97,7 +100,7 @@ def sticky_attribute_assignment(checkpointable, name, value):
   value = _wrap_or_unwrap(value)
   if not add_dependency:
     return value
-  if isinstance(value, base.CheckpointableBase):
+  if isinstance(value, base.Checkpointable):
     checkpointable._track_checkpointable(  # pylint: disable=protected-access
         value, name=name,
         # Allow the user to switch the Checkpointable which is tracked by this
@@ -107,7 +110,7 @@ def sticky_attribute_assignment(checkpointable, name, value):
   return value
 
 
-class CheckpointableDataStructure(base.CheckpointableBase):
+class CheckpointableDataStructure(base.Checkpointable):
   """Base class for data structures which contain checkpointable objects."""
 
   def __init__(self):
@@ -120,11 +123,11 @@ class CheckpointableDataStructure(base.CheckpointableBase):
         checkpointable=self, value=value, name=name)
     if isinstance(value, variables.Variable):
       self._extra_variables.append(value)
-    if not isinstance(value, base.CheckpointableBase):
+    if not isinstance(value, base.Checkpointable):
       raise ValueError(
           ("Only checkpointable objects (such as Layers or Optimizers) may be "
            "stored in a List object. Got %s, which does not inherit from "
-           "CheckpointableBase.") % (value,))
+           "Checkpointable.") % (value,))
     if hasattr(value, "_use_resource_variables"):
       # In subclassed models, legacy layers (tf.layers) must always use
       # resource variables.
@@ -258,9 +261,12 @@ class List(CheckpointableDataStructure, collections.Sequence):
       self._storage[index] = self._track_value(
           element, name=self._name_element(index))
 
-  def __copy__(self):
+  def copy(self):
     return type(self)(copy.copy(self._storage))
 
+  def __copy__(self):
+    return self.copy()
+
   def __deepcopy__(self, memo):
     return type(self)(copy.deepcopy(self._storage, memo))
 
@@ -283,18 +289,33 @@ class List(CheckpointableDataStructure, collections.Sequence):
   def extend(self, values):
     """Add a sequence of checkpointable values."""
     for value in values:
-      self._storage.append(self._track_value(
-          value, name=self._name_element(len(self._storage))))
+      self.append(value)
 
   def __iadd__(self, values):
     self.extend(values)
     return self
 
   def __add__(self, other):
-    if isinstance(other, List):
-      return self.__class__(self._storage + other._storage)  # pylint: disable=protected-access
-    else:
-      return self.__class__(self._storage + other)
+    return self.__class__(self._storage + getattr(other, "_storage", other))
+
+  def __imul__(self, y):
+    if y <= 0:
+      raise ValueError(
+          "List only supports append, multiplying in place by %d removes "
+          "elements." % y)
+
+    n = len(self._storage)
+    for _ in range(y - 1):
+      for i in range(n):
+        self.append(self._storage[i])
+
+    return self
+
+  def __mul__(self, n):
+    return self.__class__(self._storage * n)
+
+  def __rmul__(self, n):
+    return self * n
 
   def __radd__(self, other):
     return self + other
@@ -302,13 +323,20 @@ class List(CheckpointableDataStructure, collections.Sequence):
   def __getitem__(self, key):
     return self._storage[key]
 
+  def __getslice__(self, i, j):
+    return self._storage[slice(i, j)]
+
   def __len__(self):
     return len(self._storage)
 
   def __repr__(self):
     return "List(%s)" % (repr(self._storage),)
 
+  def __sizeof__(self):
+    return super(List, self).__sizeof__() + sys.getsizeof(self._storage)
 
+
+# TODO(tomhennigan) Update to collections.UserList?
 class _ListWrapper(List, collections.MutableSequence,
                    # Shadowed, but there for isinstance checks.
                    list):
@@ -381,12 +409,12 @@ class _ListWrapper(List, collections.MutableSequence,
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
            "checkpointable TensorFlow objects). A list element was replaced "
-           "(__setitem__), deleted, or inserted. In order to support "
-           "restoration on object creation, tracking is exclusively for "
-           "append-only data structures.\n\nIf you don't need this list "
-           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
-           "object; it will be automatically un-wrapped and subsequently "
-           "ignored." % (self,)))
+           "(__setitem__, __setslice__), deleted (__delitem__, __delslice__), "
+           "or moved (sort). In order to support restoration on object "
+           "creation, tracking is exclusively for append-only data structures."
+           "\n\nIf you don't need this list checkpointed, wrap it in a "
+           "tf.contrib.checkpoint.NoDependency object; it will be "
+           "automatically un-wrapped and subsequently ignored." % (self,)))
     if self._external_modification:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
@@ -404,8 +432,34 @@ class _ListWrapper(List, collections.MutableSequence,
     del self._storage[key]
 
   def __setitem__(self, key, value):
-    self._non_append_mutation = True
-    self._storage[key] = value
+    self._check_external_modification()
+
+    if isinstance(key, slice):
+      # Note: this is quite inefficient, but the list API supports a broad range
+      # of slice setters (e.g. truncate, extend, replace) and immitating this
+      # for a range of Python versions is non-trivial.
+      storage_copy = list(self._storage)
+      self._storage[key] = value
+
+      len_before = len(storage_copy)
+      len_now = len(self._storage)
+      for i in range(max(len_before, len_now)):
+        value_now = self._storage[i] if i < len_now else None
+        value_before = storage_copy[i] if i < len_before else None
+
+        if isinstance(value_before, base.Checkpointable):
+          self._non_append_mutation = True
+
+        if value_now is not None and value_now != value_before:
+          self._storage[i] = self._track_value(self._storage[i],
+                                               self._name_element(i))
+
+    else:
+      if isinstance(self._storage[key], base.Checkpointable):
+        self._non_append_mutation = True
+      self._storage[key] = self._track_value(value, self._name_element(key))
+
+    self._update_snapshot()
 
   def append(self, value):
     """Add a new checkpointable value."""
@@ -446,6 +500,17 @@ class _ListWrapper(List, collections.MutableSequence,
     self._non_append_mutation = True
     self._storage.insert(index, obj)
 
+  def sort(self):
+    self._non_append_mutation = True
+    self._storage.sort()
+
+  def __setslice__(self, i, j, y):
+    self.__setitem__(slice(i, j), y)
+
+  def __delslice__(self, i, j):
+    self._non_append_mutation = True
+    del self._storage[slice(i, j)]
+
   def _track_value(self, value, name):
     """Allows storage of non-checkpointable objects."""
     try:
@@ -685,14 +750,14 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     else:
       value = _wrap_or_unwrap(value)
       existing_dependency = None
-      if not no_dep and isinstance(value, base.CheckpointableBase):
+      if not no_dep and isinstance(value, base.Checkpointable):
         # Non-string keys are OK as long as we have no reason to add a
         # dependency on the value (either because the value is not
         # checkpointable, or because it was wrapped in a NoDependency object).
         self._non_string_key = True
     current_value = self._storage.setdefault(key, value)
     if current_value is not value:
-      if ((not no_dep and isinstance(value, base.CheckpointableBase))
+      if ((not no_dep and isinstance(value, base.Checkpointable))
           # We don't want to just check that the existing object is
           # checkpointable, since it may have been wrapped in a NoDependency
           # object.
@@ -708,7 +773,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
   def __delitem__(self, key):
     self._check_external_modification()
     existing_value = self[key]
-    if isinstance(existing_value, base.CheckpointableBase):
+    if isinstance(existing_value, base.Checkpointable):
       # Deleting tracked checkpointable values means restoring is problematic,
       # so we'll throw an exception on save.
       self._non_append_mutation = True
@@ -727,3 +792,44 @@ class _DictWrapper(Mapping, collections.MutableMapping):
   def update(self, *args, **kwargs):
     for key, value in dict(*args, **kwargs).items():
       self[key] = value
+
+revived_types.register_revived_type(
+    "checkpointable_dict_wrapper",
+    lambda obj: isinstance(obj, _DictWrapper),
+    versions=[revived_types.VersionedTypeRegistration(
+        # Standard dependencies are enough to reconstruct the checkpointable
+        # items in dictionaries, so we don't need to save any extra information.
+        object_factory=lambda proto: _DictWrapper({}),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=operator.setitem,
+        getter=_DictWrapper.get,
+        attribute_extractor=lambda obj: obj.keys())])
+
+
+def _set_list_item(list_object, index_string, value):
+  item_index = int(index_string)
+  if len(list_object) <= item_index:
+    list_object.extend([None] * (1 + item_index - len(list_object)))
+  list_object[item_index] = value
+
+
+def _list_getter(obj, item, default=None):
+  index = int(item)
+  if index < len(obj):
+    return obj[index]
+  return default
+
+
+revived_types.register_revived_type(
+    "checkpointable_list_wrapper",
+    lambda obj: isinstance(obj, _ListWrapper),
+    versions=[revived_types.VersionedTypeRegistration(
+        object_factory=lambda proto: _ListWrapper([]),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=_set_list_item,
+        getter=_list_getter,
+        attribute_extractor=lambda obj: [str(i) for i in range(len(obj))])])
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index bcec6e01001eec6c164cf4bb17db3d4ed55b0935..72045876b61b5331f8c576e6a611839a101973e2 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -207,11 +207,88 @@ class ListTests(test.TestCase):
     self.assertEqual([v], l.trainable_weights)
     self.assertEqual([v2], l.non_trainable_weights)
 
+  def testCopy(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    v3 = resource_variable_ops.ResourceVariable(1.)
+
+    l1 = data_structures.List([v1, v2])
+    l2 = l1.copy()
+    l2.append(v3)
+    self.assertEqual(list(l1), [v1, v2])
+    self.assertEqual(list(l2), [v1, v2, v3])
+
+  def testSlicing(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    v3 = resource_variable_ops.ResourceVariable(1.)
+    v4 = resource_variable_ops.ResourceVariable(1.)
+
+    l = data_structures.List([v1, v2, v3, v4])
+    self.assertEqual(l[1:], [v2, v3, v4])
+    self.assertEqual(l[1:-1], [v2, v3])
+    self.assertEqual(l[:-1], [v1, v2, v3])
+
+  def testHash(self):
+    has_sequences = set([data_structures.List(),
+                         data_structures.List()])
+    self.assertEqual(2, len(has_sequences))
+    self.assertNotIn(data_structures.List(), has_sequences)
+
+  def testIMul_zero(self):
+    l = data_structures.List([])
+    with self.assertRaisesRegexp(ValueError, "List only supports append"):
+      l *= 0
+
+  def testIMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v])
+    l *= 2
+    self.assertEqual(list(l), [v] * 2)
+
+  def testMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v, v, v])
+    self.assertEqual(list(l * 2), [v, v, v] * 2)
+
+  def testRMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v, v, v])
+    self.assertEqual(list(2 * l), [v, v, v] * 2)
+
+
+class ListWrapperTest(test.TestCase):
+
+  IGNORED = ("__new__", "__init__", "__subclasshook__", "__getattribute__")
+
+  def test_overrides_all_list_methods(self):
+    not_overridden = []
+
+    for name in dir(list):
+      if name in ListWrapperTest.IGNORED:
+        continue
+
+      list_method = getattr(list, name)
+
+      if not callable(list_method):
+        continue
+
+      object_method = getattr(object, name, None)
+      if object_method is not None and object_method == list_method:
+        # Skip methods that aren't overridden from object.
+        continue
+
+      if list_method == getattr(data_structures._ListWrapper, name):
+        not_overridden.append(name)
+
+    if not_overridden:
+      self.fail("_ListWrapper does not override %s" % (not_overridden))
+
   def testListWrapperBasic(self):
     # _ListWrapper, unlike List, compares like the built-in list type (since it
     # is used to automatically replace lists).
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
+    b = tracking.AutoCheckpointable()
     self.assertEqual([a, a],
                      [a, a])
     self.assertEqual(data_structures._ListWrapper([a, a]),
@@ -244,6 +321,10 @@ class ListTests(test.TestCase):
     self.assertEqual([a, a], [a] + data_structures._ListWrapper([a]))
     self.assertIsInstance(data_structures._ListWrapper([a]), list)
 
+  def testAcceptsNonCheckpointableContent(self):
+    l = data_structures._ListWrapper([1, 2, 3])
+    self.assertEqual(l, [1, 2, 3])
+
   def testWrapperChangesList(self):
     l = []
     l_wrapper = data_structures._ListWrapper(l)
@@ -263,13 +344,61 @@ class ListTests(test.TestCase):
     l.append(layer)
     self.assertEqual([layer], l_wrapper.layers)
 
-  def testHashing(self):
-    has_sequences = set([data_structures.List(),
-                         data_structures.List()])
-    self.assertEqual(2, len(has_sequences))
-    self.assertNotIn(data_structures.List(), has_sequences)
+  def testNotHashable(self):
     with self.assertRaises(TypeError):
-      has_sequences.add(data_structures._ListWrapper([]))
+      hash(data_structures._ListWrapper())
+
+  def testDelItem(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    del l[0]
+    self.assertEqual(l, [2, 3, 4])
+    self.assertUnableToSave(l, "Unable to save .*__delitem__")
+
+  def testDelSlice(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    del l[2:3]
+    self.assertEqual(l, [1, 2, 4])
+    self.assertUnableToSave(l, "Unable to save .*__delslice__")
+
+  def testSetSlice_canSaveForNonCheckpointableItems(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[:] = 2, 8, 9, 0
+    self.assertEqual(l, [2, 8, 9, 0])
+    l._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    self.assertEqual(len(l._checkpoint_dependencies), 0)  # pylint: disable=protected-access
+
+  def testSetSlice_cannotSaveIfCheckpointableModified(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures._ListWrapper([1, 2, v1, v2])
+    l[:] = 2, 8, 9, v2
+    self.assertEqual(l, [2, 8, 9, v2])
+    self.assertUnableToSave(l, "Unable to save .*__setslice__")
+
+  def testSetSlice_truncate(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[:] = []
+    self.assertEqual(l, [])
+
+  def testSetSlice_extend(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[2:] = 1, 2, 3, 4
+    self.assertEqual(l, [1, 2, 1, 2, 3, 4])
+
+  def testSort(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l.sort()
+    self.assertEqual(l, [1, 2, 3, 4])
+    # Regardless of being a no-op for the input list, we still refuse to save.
+    # This is intentional since otherwise we would end up with a hard to debug
+    # case for users (e.g. sometimes sort on a ListWrapper is checkpointable and
+    # other times it is not).
+    self.assertUnableToSave(l, "Unable to save .*sort")
+
+  def assertUnableToSave(self, l, msg):
+    l._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    with self.assertRaisesRegexp(ValueError, msg):
+      return l._checkpoint_dependencies  # pylint: disable=protected-access
 
 
 class HasMapping(training.Model):
@@ -337,7 +466,7 @@ class MappingTests(test.TestCase):
 
   def testLayerCollectionWithExternalMutation(self):
     d = {}
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.wrapper = d
     self.assertEqual([], root.wrapper.layers)
     self.assertEqual([], root.wrapper.trainable_weights)
@@ -355,7 +484,7 @@ class MappingTests(test.TestCase):
     self.assertEqual(2, len(has_mappings))
     self.assertNotIn(data_structures.Mapping(), has_mappings)
     # In contrast to Mapping, dict wrappers are not hashable
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.d = {}
     self.assertEqual({}, a.d)
     self.assertFalse({} != a.d)  # pylint: disable=g-explicit-bool-comparison
@@ -364,7 +493,7 @@ class MappingTests(test.TestCase):
       set([a.d])
 
   def testDictWrapperBadKeys(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.d = {}
     a.d[1] = data_structures.List()
     model = training.Model()
@@ -374,7 +503,7 @@ class MappingTests(test.TestCase):
       model.save_weights(save_path)
 
   def testDictWrapperNoDependency(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.d = data_structures.NoDependency({})
     a.d[1] = [3]
     self.assertEqual([a], util.list_objects(a))
@@ -385,7 +514,7 @@ class MappingTests(test.TestCase):
     model.load_weights(save_path)
 
   def testNonStringKeyNotCheckpointableValue(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = data_structures.NoDependency([3])
@@ -399,15 +528,15 @@ class MappingTests(test.TestCase):
   def testNonAppendNotCheckpointable(self):
     # Non-append mutations (deleting or overwriting values) are OK when the
     # values aren't tracked.
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = 3
     a.d[1] = 2
     self.assertEqual(2, a.d[1])
     del a.d[1]
-    a.d[2] = data_structures.NoDependency(tracking.Checkpointable())
-    second = tracking.Checkpointable()
+    a.d[2] = data_structures.NoDependency(tracking.AutoCheckpointable())
+    second = tracking.AutoCheckpointable()
     a.d[2] = data_structures.NoDependency(second)
     self.assertIs(second, a.d[2])
     self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
@@ -469,7 +598,7 @@ class MappingTests(test.TestCase):
     self.assertEqual({1: 3}, new_dict)
 
   def testListShallowCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.copy(root.a)
@@ -486,7 +615,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testListDeepCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.deepcopy(root.a)
@@ -503,7 +632,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.deepcopy(root.a))
 
   def testDictShallowCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.copy(root.a)
@@ -520,7 +649,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testDictDeepCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.deepcopy(root.a)
@@ -537,8 +666,8 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.deepcopy(root.a))
 
   def testShallowCopyCheckpointable(self):
-    original = tracking.Checkpointable()
-    original_sub = tracking.Checkpointable()
+    original = tracking.AutoCheckpointable()
+    original_sub = tracking.AutoCheckpointable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     shallow_copied = copy.copy(original)
@@ -551,15 +680,15 @@ class MappingTests(test.TestCase):
     self.assertIn(shallow_copied.b["a"], shallow_deps)
 
   def testDeepCopyCheckpointable(self):
-    original = tracking.Checkpointable()
-    original_sub = tracking.Checkpointable()
+    original = tracking.AutoCheckpointable()
+    original_sub = tracking.AutoCheckpointable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     deep_copied = copy.deepcopy(original)
     self.assertIsNot(original, deep_copied)
     self.assertIsNot(original_sub, deep_copied.b["a"])
     self.assertEqual([[1.]], deep_copied.a)
-    self.assertIsInstance(deep_copied.b["a"], tracking.Checkpointable)
+    self.assertIsInstance(deep_copied.b["a"], tracking.AutoCheckpointable)
     deps = util.list_objects(deep_copied)
     self.assertIn(deep_copied.a, deps)
     self.assertIn(deep_copied.b, deps)
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/checkpointable/tracking.py
index 4e96aee0c51d441c4a32ce68943e27dbf592349c..04fd5547e1002c559b43c241dc25919588167e8b 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/checkpointable/tracking.py
@@ -41,7 +41,7 @@ class NotCheckpointable(object):
   pass
 
 
-class Checkpointable(base.CheckpointableBase):
+class AutoCheckpointable(base.Checkpointable):
   """Manages dependencies on other objects.
 
   `Checkpointable` objects may have dependencies: other `Checkpointable` objects
@@ -74,7 +74,7 @@ class Checkpointable(base.CheckpointableBase):
     if getattr(self, "_setattr_tracking", True):
       value = data_structures.sticky_attribute_assignment(
           checkpointable=self, value=value, name=name)
-    super(Checkpointable, self).__setattr__(name, value)
+    super(AutoCheckpointable, self).__setattr__(name, value)
 
   def _no_dependency(self, value):
     """Override to allow CheckpointableBase to disable dependency tracking."""
@@ -124,7 +124,7 @@ def resource_tracker_scope(resource_tracker):
     _RESOURCE_TRACKER_STACK = old
 
 
-class TrackableResource(base.CheckpointableBase):
+class TrackableResource(base.Checkpointable):
   """Base class for all resources that need to be tracked."""
 
   def __init__(self):
@@ -151,7 +151,7 @@ class TrackableResource(base.CheckpointableBase):
     return self._resource_handle
 
 
-class TrackableAsset(base.CheckpointableBase):
+class TrackableAsset(base.Checkpointable):
   """Base class for asset files which need to be tracked."""
 
   def __init__(self, path):
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/checkpointable/tracking_test.py
index 17c5461bc25e5e409cc04d0182603e8406dc7d47..eb70919b9c99d7c00326e0d2233ad204a10288a1 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/checkpointable/tracking_test.py
@@ -35,10 +35,10 @@ from tensorflow.python.util import nest
 class InterfaceTests(test.TestCase):
 
   def testMultipleAssignment(self):
-    root = tracking.Checkpointable()
-    root.leaf = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
+    root.leaf = tracking.AutoCheckpointable()
     root.leaf = root.leaf
-    duplicate_name_dep = tracking.Checkpointable()
+    duplicate_name_dep = tracking.AutoCheckpointable()
     with self.assertRaisesRegexp(ValueError, "already declared"):
       root._track_checkpointable(duplicate_name_dep, name="leaf")
     # No error; we're overriding __setattr__, so we can't really stop people
@@ -50,10 +50,10 @@ class InterfaceTests(test.TestCase):
     self.assertIs(duplicate_name_dep, dep_object)
 
   def testNoDependency(self):
-    root = tracking.Checkpointable()
-    hasdep = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
+    hasdep = tracking.AutoCheckpointable()
     root.hasdep = hasdep
-    nodep = tracking.Checkpointable()
+    nodep = tracking.AutoCheckpointable()
     root.nodep = data_structures.NoDependency(nodep)
     self.assertEqual(1, len(root._checkpoint_dependencies))
     self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep)
@@ -66,16 +66,16 @@ class InterfaceTests(test.TestCase):
       def __init__(self):
         super(NoDependencyModel, self).__init__()
         self.a = []
-        self.b = tracking.Checkpointable()
+        self.b = tracking.AutoCheckpointable()
 
     nodeps = NoDependencyModel()
     self.assertEqual([nodeps], util.list_objects(nodeps))
 
   def testListBasic(self):
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
+    b = tracking.AutoCheckpointable()
     a.l = [b]
-    c = tracking.Checkpointable()
+    c = tracking.AutoCheckpointable()
     a.l.append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
@@ -87,10 +87,10 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testMutationDirtiesList(self):
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
+    b = tracking.AutoCheckpointable()
     a.l = [b]
-    c = tracking.Checkpointable()
+    c = tracking.AutoCheckpointable()
     a.l.insert(0, c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "A list element was replaced"):
@@ -98,11 +98,11 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testOutOfBandEditDirtiesList(self):
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
+    b = tracking.AutoCheckpointable()
     held_reference = [b]
     a.l = held_reference
-    c = tracking.Checkpointable()
+    c = tracking.AutoCheckpointable()
     held_reference.append(c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "The wrapped list was modified"):
@@ -110,25 +110,25 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testNestedLists(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.l = []
-    b = tracking.Checkpointable()
+    b = tracking.AutoCheckpointable()
     a.l.append([b])
-    c = tracking.Checkpointable()
+    c = tracking.AutoCheckpointable()
     a.l[0].append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     a.l[0].append(1)
-    d = tracking.Checkpointable()
+    d = tracking.AutoCheckpointable()
     a.l[0].append(d)
     a_deps = util.list_objects(a)
     self.assertIn(d, a_deps)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     self.assertNotIn(1, a_deps)
-    e = tracking.Checkpointable()
-    f = tracking.Checkpointable()
+    e = tracking.AutoCheckpointable()
+    f = tracking.AutoCheckpointable()
     a.l1 = [[], [e]]
     a.l1[0].append(f)
     a_deps = util.list_objects(a)
@@ -183,7 +183,7 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testAssertions(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoCheckpointable()
     a.l = {"k": [numpy.zeros([2, 2])]}
     self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
                         nest.flatten(a.l))
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 7f70d973d0a8933757f043f6e6c208e213df60a7..129ad55f961794b387a38ecc28e539fca5826477 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -176,25 +176,13 @@ class _CheckpointRestoreCoordinator(object):
         raise AssertionError(
             ("Saveable keys changed when validating. Got back %s, was "
              "expecting %s") % (tensor_saveables.keys(), validated_names))
-      for saveable in validated_saveables:
-        if saveable.device:
-          device = saveable_object_util.set_cpu0(saveable.device)
-        else:
-          device = None
-        with ops.device(device):
-          tensors = []
-          for spec in saveable.specs:
-            tensors.append(
-                io_ops.restore_v2(
-                    self.save_path_tensor,
-                    [spec.name],
-                    [spec.slice_spec],
-                    [spec.dtype])[0])
-          restore_op = saveable.restore(tensors, restored_shapes=None)
-        if not context.executing_eagerly():
+      new_restore_ops = functional_saver.restore_from_saveable_objects(
+          self.save_path_tensor, validated_saveables)
+      if not context.executing_eagerly():
+        restore_ops.extend(new_restore_ops)
+        for saveable, restore_op in zip(validated_saveables, new_restore_ops):
           assert saveable.name not in self.restore_ops_by_name
           self.restore_ops_by_name[saveable.name] = restore_op
-          restore_ops.append(restore_op)
     return restore_ops
 
 
@@ -678,7 +666,13 @@ def _add_attributes_to_object_graph(
         if cached_attributes is not None:
           cached_attributes[name] = saveables
 
+      optional_restore = None
       for saveable in saveables:
+        if optional_restore is None:
+          optional_restore = saveable.optional_restore
+        else:
+          optional_restore = optional_restore and saveable.optional_restore
+
         if hasattr(saveable, "full_name"):
           attribute.full_name = saveable.full_name
         if isinstance(saveable, base.PythonStateSaveable):
@@ -700,6 +694,9 @@ def _add_attributes_to_object_graph(
                     % (checkpointable, new_feed_key))
             feed_additions.update(saveable_feed_dict)
         named_saveable_objects.append(saveable)
+      if optional_restore is None:
+        optional_restore = False
+      attribute.optional_restore = optional_restore
 
   return named_saveable_objects, feed_additions
 
@@ -1032,7 +1029,7 @@ class CheckpointLoadStatus(_LoadStatus):
       raise AssertionError(
           ("Unused attributes in these objects (the attributes exist in the "
            "checkpoint but not in the objects): %s") % (
-               self._checkpoint.unused_attributes.items(),))
+               list(self._checkpoint.unused_attributes.items()),))
     return self
 
   def assert_existing_objects_matched(self):
@@ -1391,8 +1388,8 @@ class CheckpointableSaver(object):
             name=base.OBJECT_GRAPH_PROTO_KEY))
     return named_saveable_objects, graph_proto, feed_additions
 
-  def freeze(self, object_map=None, to_graph=None):
-    """Creates a `tf.train.Saver` with the current object graph frozen."""
+  def gather_objects(self, object_map=None, to_graph=None):
+    """Creates SaveableObjects with the current object graph frozen."""
     checkpointable_objects, path_to_root = (
         _breadth_first_checkpointable_traversal(self._root_checkpointable))
     if to_graph:
@@ -1412,7 +1409,12 @@ class CheckpointableSaver(object):
           base.NoRestoreSaveable(
               tensor=object_graph_tensor,
               name=base.OBJECT_GRAPH_PROTO_KEY))
-      return functional_saver.Saver(named_saveable_objects)
+    return named_saveable_objects
+
+  def freeze(self, object_map=None, to_graph=None):
+    named_saveable_objects = self.gather_objects(
+        object_map=object_map, to_graph=to_graph)
+    return functional_saver.Saver(named_saveable_objects)
 
   def _save_cached_when_graph_building(
       self,
@@ -1653,7 +1655,7 @@ def frozen_saver(root_checkpointable):
 
 
 @tf_export("train.Checkpoint")
-class Checkpoint(tracking.Checkpointable):
+class Checkpoint(tracking.AutoCheckpointable):
   """Groups checkpointable objects, saving and restoring them.
 
   `Checkpoint`'s constructor accepts keyword arguments whose values are types
@@ -1755,8 +1757,7 @@ class Checkpoint(tracking.Checkpointable):
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, (base.CheckpointableBase,
-                            def_function.PolymorphicFunction)):
+      if not isinstance(v, (base.Checkpointable, def_function.Function)):
         raise ValueError(
             ("`Checkpoint` was expecting a checkpointable object (an object "
              "derived from `CheckpointableBase`), got %s. If you believe this "
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index a5f4fec672ba95179a9afe8ed5cfac2311c3d265..cef1075e93ca8a2aecfb5af9362308dabb82e47c 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -51,7 +51,7 @@ from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
-class NonLayerCheckpointable(tracking.Checkpointable):
+class NonLayerCheckpointable(tracking.AutoCheckpointable):
 
   def __init__(self):
     super(NonLayerCheckpointable, self).__init__()
@@ -139,7 +139,7 @@ class InterfaceTests(test.TestCase):
 
   def testInitNotCalled(self):
 
-    class NoInit(tracking.Checkpointable):
+    class NoInit(tracking.AutoCheckpointable):
 
       def __init__(self):
         pass
@@ -148,7 +148,7 @@ class InterfaceTests(test.TestCase):
     checkpointable_utils.add_variable(NoInit(), "var", shape=[])
 
   def testShapeDtype(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     v1 = checkpointable_utils.add_variable(
         root, name="v1", initializer=3., dtype=dtypes.float64)
     self.assertEqual(dtypes.float64, v1.dtype)
@@ -180,7 +180,7 @@ class InterfaceTests(test.TestCase):
   def testNotCheckpointable(self):
 
     class CallsFunctionalStuff(
-        tracking.NotCheckpointable, tracking.Checkpointable):
+        tracking.NotCheckpointable, tracking.AutoCheckpointable):
       pass
 
     test_dir = self.get_temp_dir()
@@ -190,7 +190,7 @@ class InterfaceTests(test.TestCase):
       checkpoint.save(prefix)
 
     class CallsFunctionalStuffOtherMRO(
-        tracking.Checkpointable, tracking.NotCheckpointable):
+        tracking.AutoCheckpointable, tracking.NotCheckpointable):
       pass
 
     checkpoint_reversed = checkpointable_utils.Checkpoint(
@@ -220,7 +220,7 @@ class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(base.CheckpointableBase):
+class _OwnsMirroredVariables(base.Checkpointable):
   """A Checkpointable object which returns a more complex SaveableObject."""
 
   def __init__(self):
@@ -437,8 +437,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     on_create_m_bias_slot = on_create_optimizer.get_slot(
         on_create_model._named_dense.variables[1], "m")
     status.assert_existing_objects_matched()
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
+    if not context.executing_eagerly():
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
     # Optimizer slot variables are created when the original variable is
     # restored.
     self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
@@ -652,7 +653,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     checkpointable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
     (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
@@ -673,8 +674,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNumberedPath(self):
-    root = tracking.Checkpointable()
-    leaf = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
+    leaf = tracking.AutoCheckpointable()
     root.leaf = leaf
     checkpointable_utils.add_variable(leaf, name="v", shape=[])
     (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
@@ -683,8 +684,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testLocalNameValidation(self):
-    root = tracking.Checkpointable()
-    leaf = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
+    leaf = tracking.AutoCheckpointable()
     # Dots are escaped, which avoids conflicts with reserved names.
     root._track_checkpointable(leaf, name=".ATTRIBUTES")
     checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
@@ -725,13 +726,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testLateDependencyTracking(self):
 
-    class Dependency(tracking.Checkpointable):
+    class Dependency(tracking.AutoCheckpointable):
 
       def build(self):
         self.var = checkpointable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class LateDependencies(tracking.Checkpointable):
+    class LateDependencies(tracking.AutoCheckpointable):
 
       def add_dep(self):
         self.dep = Dependency()
@@ -758,13 +759,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testDepAfterVar(self):
 
-    class Dependency(tracking.Checkpointable):
+    class Dependency(tracking.AutoCheckpointable):
 
       def build(self):
         self.var = checkpointable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class DepAfterVar(tracking.Checkpointable):
+    class DepAfterVar(tracking.AutoCheckpointable):
 
       def add_dep(self):
         dep = Dependency()
@@ -791,7 +792,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.var = checkpointable_utils.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.Adam(0.1)
@@ -814,7 +815,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
         14.))
     slots_path = checkpointable_utils.CheckpointableSaver(root).save(
         os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.Checkpointable()
+    new_root = tracking.AutoCheckpointable()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
     slot_status = checkpointable_utils.CheckpointableSaver(
@@ -830,8 +831,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.assertEqual(12., self.evaluate(new_root.var))
     new_root.optimizer = adam.Adam(0.1)
     slot_status.assert_existing_objects_matched()
-    with self.assertRaisesRegexp(AssertionError, "Unresolved object"):
-      slot_status.assert_consumed()
+    if not context.executing_eagerly():
+      with self.assertRaisesRegexp(AssertionError, "Unresolved object"):
+        slot_status.assert_consumed()
     self.assertEqual(12., self.evaluate(new_root.var))
     if context.executing_eagerly():
       # Slot variables are only created with restoring initializers when
@@ -859,8 +861,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testOverlappingRestores(self):
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.Checkpointable()
-    save_root.dep = tracking.Checkpointable()
+    save_root = tracking.AutoCheckpointable()
+    save_root.dep = tracking.AutoCheckpointable()
     save_root.dep.var = checkpointable_utils.add_variable(
         save_root.dep, name="var", initializer=0.)
     self.evaluate(state_ops.assign(save_root.dep.var, 12.))
@@ -869,13 +871,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.evaluate(state_ops.assign(save_root.dep.var, 13.))
     second_path = saver.save(os.path.join(checkpoint_directory, "second"))
 
-    first_root = tracking.Checkpointable()
-    second_root = tracking.Checkpointable()
+    first_root = tracking.AutoCheckpointable()
+    second_root = tracking.AutoCheckpointable()
     first_status = checkpointable_utils.CheckpointableSaver(
         first_root).restore(first_path)
     second_status = checkpointable_utils.CheckpointableSaver(
         second_root).restore(second_path)
-    load_dep = tracking.Checkpointable()
+    load_dep = tracking.AutoCheckpointable()
     load_dep.var = checkpointable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
@@ -889,13 +891,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
     # Try again with the order of the restore() reversed. The last restore
     # determines the final value.
-    first_root = tracking.Checkpointable()
-    second_root = tracking.Checkpointable()
+    first_root = tracking.AutoCheckpointable()
+    second_root = tracking.AutoCheckpointable()
     second_status = checkpointable_utils.CheckpointableSaver(
         second_root).restore(second_path)
     first_status = checkpointable_utils.CheckpointableSaver(
         first_root).restore(first_path)
-    load_dep = tracking.Checkpointable()
+    load_dep = tracking.AutoCheckpointable()
     load_dep.var = checkpointable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
@@ -911,23 +913,23 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testAmbiguousLoad(self):
     # Not OK to split one checkpoint object into two
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.Checkpointable()
-    save_root.dep_one = tracking.Checkpointable()
-    save_root.dep_two = tracking.Checkpointable()
-    dep_three = tracking.Checkpointable()
+    save_root = tracking.AutoCheckpointable()
+    save_root.dep_one = tracking.AutoCheckpointable()
+    save_root.dep_two = tracking.AutoCheckpointable()
+    dep_three = tracking.AutoCheckpointable()
     save_root.dep_one.dep_three = dep_three
     save_root.dep_two.dep_three = dep_three
     checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
     self.evaluate(checkpointable_utils.gather_initializers(save_root))
     save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
         os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.Checkpointable()
+    load_root = tracking.AutoCheckpointable()
     status = checkpointable_utils.CheckpointableSaver(load_root).restore(
         save_path)
-    load_root.dep_one = tracking.Checkpointable()
-    load_root.dep_two = tracking.Checkpointable()
-    load_root.dep_one.dep_three = tracking.Checkpointable()
-    load_root.dep_two.dep_three = tracking.Checkpointable()
+    load_root.dep_one = tracking.AutoCheckpointable()
+    load_root.dep_two = tracking.AutoCheckpointable()
+    load_root.dep_one.dep_three = tracking.AutoCheckpointable()
+    load_root.dep_two.dep_three = tracking.AutoCheckpointable()
     checkpointable_utils.add_variable(
         load_root.dep_one.dep_three, name="var", initializer=0.)
     with self.assertRaises(AssertionError):
@@ -939,9 +941,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testObjectsCombined(self):
     # Currently fine to load two checkpoint objects into one Python object
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.Checkpointable()
-    save_root.dep_one = tracking.Checkpointable()
-    save_root.dep_two = tracking.Checkpointable()
+    save_root = tracking.AutoCheckpointable()
+    save_root.dep_one = tracking.AutoCheckpointable()
+    save_root.dep_two = tracking.AutoCheckpointable()
     checkpointable_utils.add_variable(
         save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
     checkpointable_utils.add_variable(
@@ -949,8 +951,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.evaluate(checkpointable_utils.gather_initializers(save_root))
     save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
         os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.Checkpointable()
-    load_root.dep_one = tracking.Checkpointable()
+    load_root = tracking.AutoCheckpointable()
+    load_root.dep_one = tracking.AutoCheckpointable()
     load_root.dep_two = load_root.dep_one
     v1 = checkpointable_utils.add_variable(
         load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
@@ -966,8 +968,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testDependencyLoop(self):
     # Note: this test creates garbage during eager execution because it
     # purposefully creates a reference cycle.
-    first = tracking.Checkpointable()
-    second = tracking.Checkpointable()
+    first = tracking.AutoCheckpointable()
+    second = tracking.AutoCheckpointable()
     first.second = second
     second.first = first
     first.v = checkpointable_utils.add_variable(
@@ -980,10 +982,10 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
         os.path.join(checkpoint_directory, "ckpt"))
 
     # Test deferred loading
-    first_load = tracking.Checkpointable()
+    first_load = tracking.AutoCheckpointable()
     status = checkpointable_utils.CheckpointableSaver(
         first_load).restore(save_path)
-    second_load = tracking.Checkpointable()
+    second_load = tracking.AutoCheckpointable()
     first_load.second = second_load
     second_load.first = first_load
     with self.assertRaises(AssertionError):
@@ -1012,7 +1014,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testRestoreOnAssign(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    first = tracking.Checkpointable()
+    first = tracking.AutoCheckpointable()
     first.var1 = variables_lib.Variable(0., name="outside_var")
     first.var2 = variables_lib.Variable(0., name="blah")
     self.evaluate(first.var1.assign(4.))
@@ -1020,7 +1022,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     save_path = checkpointable_utils.CheckpointableSaver(first).save(
         checkpoint_prefix)
 
-    second = tracking.Checkpointable()
+    second = tracking.AutoCheckpointable()
     second.var2 = variables_lib.Variable(0., name="blah")
     status = checkpointable_utils.CheckpointableSaver(
         second).restore(save_path)
@@ -1040,7 +1042,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = tracking.AutoCheckpointable()
         obj.var = variables_lib.Variable(0., name="v")
         obj.opt = adam.Adam(0.1)
         variables = [obj.var]
@@ -1057,7 +1059,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     # No checkpoints are deleted by default
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.Checkpointable()
+    obj = tracking.AutoCheckpointable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
     self.evaluate(checkpointable_utils.gather_initializers(obj))
     saver = checkpointable_utils.Checkpoint(obj=obj)
@@ -1077,7 +1079,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testCheckpointStateChangingVarList(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.Checkpointable()
+    obj = tracking.AutoCheckpointable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
     self.evaluate(checkpointable_utils.gather_initializers(obj))
     checkpoint = checkpointable_utils.Checkpoint(obj=obj)
@@ -1130,7 +1132,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = tracking.AutoCheckpointable()
         obj.var = variables_lib.Variable(0., name="v")
         obj.opt = adam.Adam(0.1)
         variables = [obj.var]
@@ -1242,7 +1244,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     # Make sure initialization doesn't clobber later restores
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.Adam(0.001, beta1=1.0)
+      optimizer = adam.Adam(0.001, beta_1=1.0)
       root = checkpointable_utils.Checkpoint(
           optimizer=optimizer, model=model)
       opt_root = checkpointable_utils.Checkpoint(
@@ -1284,7 +1286,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     load_status.assert_existing_objects_matched().run_restore_ops()
 
 
-class _ManualScope(tracking.Checkpointable):
+class _ManualScope(tracking.AutoCheckpointable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
diff --git a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py b/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
index 00d5747f7838ae48d022675fd878b59d659db38a..bd80fa60f0b27f16da01002ab5088495f0a43edb 100644
--- a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
+++ b/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
@@ -23,6 +23,7 @@ import os
 import six
 
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import mirrored_strategy
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
@@ -46,7 +47,7 @@ from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
 
 
-class NonLayerCheckpointable(tracking.Checkpointable):
+class NonLayerCheckpointable(tracking.AutoCheckpointable):
 
   def __init__(self):
     super(NonLayerCheckpointable, self).__init__()
@@ -280,6 +281,70 @@ class CheckpointingTests(test.TestCase):
       self.assertEqual((training_continuation + 1) * num_training_steps,
                        root.optimizer_step.numpy())
 
+  def testEagerDistributionStrategy(self):
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    def _train_fn(optimizer, model):
+      input_value = constant_op.constant([[3.]])
+      optimizer.minimize(
+          functools.partial(model, input_value),
+          global_step=root.optimizer_step)
+
+    for training_continuation in range(3):
+      strategy = mirrored_strategy.MirroredStrategy()
+      with strategy.scope():
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            optimizer_step=training_util.get_or_create_global_step())
+        root.restore(checkpoint_management.latest_checkpoint(
+            checkpoint_directory))
+
+        for _ in range(num_training_steps):
+          strategy.extended.call_for_each_replica(
+              functools.partial(_train_fn, optimizer, model))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer_step.numpy())
+
+  def testGraphDistributionStrategy(self):
+    self.skipTest("b/121381184")
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    def _train_fn(optimizer, model):
+      input_value = constant_op.constant([[3.]])
+      return optimizer.minimize(
+          functools.partial(model, input_value),
+          global_step=root.optimizer_step)
+
+    for training_continuation in range(3):
+      with ops.Graph().as_default():
+        strategy = mirrored_strategy.MirroredStrategy()
+        with strategy.scope():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              optimizer_step=training_util.get_or_create_global_step())
+          status = root.restore(checkpoint_management.latest_checkpoint(
+              checkpoint_directory))
+          train_op = strategy.extended.call_for_each_replica(
+              functools.partial(_train_fn, optimizer, model))
+          with self.session() as session:
+            if training_continuation > 0:
+              status.assert_consumed()
+            status.initialize_or_restore()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer_step.numpy())
+
   def testUsageGraph(self):
     """Expected usage when graph building."""
     with context.graph_mode():
@@ -396,7 +461,7 @@ class CheckpointingTests(test.TestCase):
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     checkpointable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
     (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
@@ -438,7 +503,7 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.Checkpointable()
+    root = tracking.AutoCheckpointable()
     root.var = checkpointable_utils.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
@@ -461,7 +526,7 @@ class CheckpointingTests(test.TestCase):
                                    14.))
     slots_path = checkpointable_utils.CheckpointableSaver(root).save(
         os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.Checkpointable()
+    new_root = tracking.AutoCheckpointable()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
     slot_status = checkpointable_utils.CheckpointableSaver(
@@ -507,7 +572,7 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = tracking.AutoCheckpointable()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
@@ -525,7 +590,7 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = tracking.AutoCheckpointable()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
@@ -674,7 +739,7 @@ class CheckpointingTests(test.TestCase):
       self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
 
 
-class _ManualScope(tracking.Checkpointable):
+class _ManualScope(tracking.AutoCheckpointable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
diff --git a/tensorflow/python/training/checkpointable/util_xla_test.py b/tensorflow/python/training/checkpointable/util_xla_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e96a7514a24be19b857eab7032846e7578cc55c
--- /dev/null
+++ b/tensorflow/python/training/checkpointable/util_xla_test.py
@@ -0,0 +1,84 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+
+
+class NonLayerCheckpointable(tracking.AutoCheckpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class Subclassed(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class CheckpointingTests(xla_test.XLATestCase):
+
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    for training_continuation in range(3):
+      with self.test_scope():
+        model = Subclassed()
+        optimizer = adam.Adam(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model)
+        manager = checkpoint_management.CheckpointManager(
+            root, checkpoint_directory, max_to_keep=2)
+        root.restore(manager.latest_checkpoint)
+        for _ in range(num_training_steps):
+          input_value = constant_op.constant([[3.]])
+          with backprop.GradientTape() as tape:
+            loss = model(input_value)
+          variables = model.trainable_variables
+          gradients = tape.gradient(loss, variables)
+          optimizer.apply_gradients(zip(gradients, variables))
+        manager.save()
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer.iterations.numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/training/evaluation_test.py b/tensorflow/python/training/evaluation_test.py
index 3de4ceda759d927aaf743a0aa0159c50b0dbefb7..690c97e3db196ddeb5a212e3b254cf6c01907789 100644
--- a/tensorflow/python/training/evaluation_test.py
+++ b/tensorflow/python/training/evaluation_test.py
@@ -26,10 +26,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
@@ -117,16 +117,18 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metrics.accuracy(
-        predictions=predictions, labels=labels)
+    accuracy = metrics_module.Accuracy()
+    update_op = accuracy.update_state(labels, predictions)
 
     checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
 
     final_ops_values = evaluation._evaluate_once(
         checkpoint_path=checkpoint_path,
         eval_ops=update_op,
-        final_ops={'accuracy': accuracy},
-        hooks=[evaluation._StopAfterNEvalsHook(1),])
+        final_ops={'accuracy': (accuracy.result(), update_op)},
+        hooks=[
+            evaluation._StopAfterNEvalsHook(1),
+        ])
     self.assertTrue(final_ops_values['accuracy'] > .99)
 
   def testEvaluateWithFiniteInputs(self):
@@ -148,17 +150,21 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metrics.accuracy(
-        predictions=predictions, labels=labels)
+    accuracy = metrics_module.Accuracy()
+    update_op = accuracy.update_state(labels, predictions)
 
     checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
 
     final_ops_values = evaluation._evaluate_once(
         checkpoint_path=checkpoint_path,
         eval_ops=update_op,
-        final_ops={'accuracy': accuracy,
-                   'eval_steps': evaluation._get_or_create_eval_step()},
-        hooks=[evaluation._StopAfterNEvalsHook(None),])
+        final_ops={
+            'accuracy': (accuracy.result(), update_op),
+            'eval_steps': evaluation._get_or_create_eval_step()
+        },
+        hooks=[
+            evaluation._StopAfterNEvalsHook(None),
+        ])
     self.assertTrue(final_ops_values['accuracy'] > .99)
     # Runs evaluation for 4 iterations. First 2 evaluate full batch of 6 inputs
     # each; the 3rd iter evaluates the remaining 4 inputs, and the last one
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 072dbc1730e4a07b7deffd35b8f2acc0e58d0278..41a42bd2695eaecaee271409afa03653ca6b4014 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -41,6 +41,7 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training.checkpointable import util as checkpointable_util
 from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
@@ -136,6 +137,16 @@ class Scaffold(object):
         string tensor containing a serialized `Summary` proto.
       saver: Optional `tf.train.Saver` object to use to save and restore
         variables.
+
+        May also be a `tf.train.Checkpoint` object, in which case object-based
+        checkpoints are saved. This will also load some object-based checkpoints
+        saved from elsewhere, but that loading may be fragile since it uses
+        fixed keys rather than performing a full graph-based match. For example
+        if a variable has two paths from the `Checkpoint` object because two
+        `Model` objects share the `Layer` object that owns it, removing one
+        `Model` may change the keys and break checkpoint loading through this
+        API, whereas a graph-based match would match the variable through the
+        other `Model`.
       copy_from_scaffold: Optional scaffold object to copy fields from. Its
         fields will be overwritten by the provided fields in this function.
     """
@@ -216,7 +227,13 @@ class Scaffold(object):
     if self._saver is None:
       self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
     # pylint: enable=g-long-lambda
-    self._saver.build()
+    if isinstance(self._saver, checkpointable_util.Checkpoint):
+      self._saver = training_saver.Saver(
+          var_list=checkpointable_util.CheckpointableSaver(
+              self._saver).gather_objects(),
+          sharded=True)
+    else:
+      self._saver.build()
 
     ops.get_default_graph().finalize()
     logging.info('Graph was finalized.')
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 72670f0ca39f67b151abcb1813ede7ee36c6544b..6efcab28c5249fe943f6d4a1b0b6b7866271571f 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -98,12 +98,12 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
       def merge_fn(strategy, v, value):
         value = strategy.extended.reduce_to(
             ds_reduce_util.ReduceOp.MEAN, value, v)
-        return strategy.update(v, update_fn, value)
+        return strategy.extended.update(v, update_fn, args=(value,))
 
       return replica_context.merge_call(merge_fn, args=(variable, value))
     else:
       strategy = distribution_strategy_context.get_cross_replica_context()
-      return strategy.update(variable, update_fn, value)
+      return strategy.extended.update(variable, update_fn, args=(value,))
 
 
 def weighted_moving_average(value,
@@ -505,13 +505,13 @@ class ExponentialMovingAverage(object):
     ```
     Args:
       moving_avg_variables: a list of variables that require to use of the
-        moving variable name to be restored. If None, it will default to
+        moving average variable name to be restored. If None, it will default to
         variables.moving_average_variables() + variables.trainable_variables()
 
     Returns:
-      A map from restore_names to variables. The restore_name can be the
-      moving_average version of the variable name if it exist, or the original
-      variable name.
+      A map from restore_names to variables. The restore_name is either the
+      original or the moving average version of the variable name, depending
+      on whether the variable name is in the `moving_avg_variables`.
     """
     name_map = {}
     if moving_avg_variables is None:
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index eaa563e84aa76f6c27ed497c4e7c5db51cdb3fda..3742ebb807f4c245aef956144f7888d7b0560375 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -218,7 +218,7 @@ class Optimizer(
     # Optimizers inherit from CheckpointableBase rather than Checkpointable
     # since they do most of their dependency management themselves (slot
     # variables are special-cased, and non-slot variables are keyed to graphs).
-    checkpointable.CheckpointableBase):
+    checkpointable.Checkpointable):
   """Base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -521,8 +521,7 @@ class Optimizer(
   @staticmethod
   def _scale_loss(loss_value):
     if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= (1. / num_replicas)
     return loss_value
@@ -554,14 +553,15 @@ class Optimizer(
     # by most optimizers.  It relies on the subclass implementing the following
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
-    # Handle DistributionStrategy case.
-    if distribute_ctx.get_cross_replica_context():
-      raise RuntimeError("Use `_distributed_apply()` instead of "
-                         "`apply_gradients()` in a cross-replica context.")
-    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
+    # TODO(isaprykin): Get rid of `has_strategy()` check by
     # always calling _distributed_apply(), using the default distribution
     # as needed.
-    if distribute_ctx.has_distribution_strategy():
+    if distribute_ctx.has_strategy():
+      # Handle DistributionStrategy case.
+      if distribute_ctx.in_cross_replica_context():
+        raise RuntimeError("Use `_distributed_apply()` instead of "
+                           "`apply_gradients()` in a cross-replica context.")
+
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
       return distribute_ctx.get_replica_context().merge_call(
           self._distributed_apply, args=(grads_and_vars, global_step, name))
@@ -815,8 +815,8 @@ class Optimizer(
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      distribution_strategy = distribute_ctx.get_distribution_strategy()
-      with distribution_strategy.colocate_vars_with(colocate_with):
+      distribution_strategy = distribute_ctx.get_strategy()
+      with distribution_strategy.extended.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
               name=name, shape=None)
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 369b6cbb50e5c621737c095a24eeb473f3870534..6eca0e6cb5f32a34b178c14c9fe86d00fdd0fdfe 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -27,7 +27,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.ProximalGradientDescentOptimizer")
+@tf_export(v1=["train.ProximalGradientDescentOptimizer"])
 class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index d1b51adaa4f89aaa0394bca3f6fd82ab9823258b..dec23c50e8c069d4f2dd18c49ecdabb447f4872b 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -2775,7 +2775,7 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(2.0, self.evaluate(var_dict2["variable2:0"]))
 
 
-class _OwnsAVariableSimple(checkpointable_base.CheckpointableBase):
+class _OwnsAVariableSimple(checkpointable_base.Checkpointable):
   """A Checkpointable object which can be saved using a tf.train.Saver."""
 
   def __init__(self):
@@ -2808,7 +2808,7 @@ class _MirroringSaveable(
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(checkpointable_base.CheckpointableBase):
+class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
   """A Checkpointable object which returns a more complex SaveableObject."""
 
   def __init__(self):
@@ -2831,7 +2831,7 @@ class _OwnsMirroredVariables(checkpointable_base.CheckpointableBase):
     return self.non_dep_variable.name
 
 
-class NonLayerCheckpointable(checkpointable_tracking.Checkpointable):
+class NonLayerCheckpointable(checkpointable_tracking.AutoCheckpointable):
 
   def __init__(self):
     super(NonLayerCheckpointable, self).__init__()
diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py
index 74991b240cda8791246565f8c3296e2ecd0668eb..4ff2742c2f1b8b68528914c5c23414b1f87c957b 100644
--- a/tensorflow/python/training/saving/functional_saver.py
+++ b/tensorflow/python/training/saving/functional_saver.py
@@ -55,22 +55,24 @@ class Saver(object):
     filename_tensor = array_ops.placeholder(
         shape=[], dtype=dtypes.string, name="saver_filename")
     # TODO(allenl): Add save and restore function names to the proto directly.
-    save_tensor = self.save(filename_tensor)
-    restore_op = self.restore(filename_tensor).op
+    signature = (tensor_spec.TensorSpec(shape=(), dtype=dtypes.string),)
+    # Autograph is off because of reference cycles which must be collected when
+    # a function is created and destroyed (as in tf.saved_model.save). It's also
+    # not necessary, so having it off may be slightly faster.
+    #
+    # TODO(b/121302372): We should be able to decorate save() and restore()
+    # unconditionally.
+    save_tensor = def_function.function(
+        self.save, input_signature=signature, autograph=False)(filename_tensor)
+    restore_op = def_function.function(
+        self.restore, input_signature=signature, autograph=False)(
+            filename_tensor).op
     return saver_pb2.SaverDef(
         filename_tensor_name=filename_tensor.name,
         save_tensor_name=save_tensor.name,
         restore_op_name=restore_op.name,
         version=saver_pb2.SaverDef.V2)
 
-  @def_function.function(
-      input_signature=(tensor_spec.TensorSpec(shape=(), dtype=dtypes.string),),
-      # Autograph is off because of reference cycles which must be collected
-      # when a function is created and destroyed (as in
-      # tf.saved_model.save). It's also not necessary, so having it off may be
-      # slightly faster.
-      autograph=False,
-  )
   def save(self, file_prefix):
     """Save the saveable objects to a checkpoint with `file_prefix`.
 
@@ -89,13 +91,11 @@ class Saver(object):
         tensor_names.append(spec.name)
         tensors.append(spec.tensor)
         tensor_slices.append(spec.slice_spec)
-    io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)
-    return file_prefix
+    with ops.device("cpu:0"):
+      with ops.control_dependencies([io_ops.save_v2(
+          file_prefix, tensor_names, tensor_slices, tensors)]):
+        return array_ops.identity(file_prefix)
 
-  @def_function.function(
-      input_signature=(tensor_spec.TensorSpec(shape=(), dtype=dtypes.string),),
-      autograph=False,
-  )
   def restore(self, file_prefix):
     """Restore the saveable objects from a checkpoint with `file_prefix`.
 
@@ -107,22 +107,32 @@ class Saver(object):
       A scalar string Tensor containing `file_prefix` with control dependencies
       on the restore ops.
     """
-    restore_specs = []
-    tensor_structure = []
-    for saveable in self._saveable_objects:
-      saveable_tensor_structure = []
-      tensor_structure.append(saveable_tensor_structure)
-      for spec in saveable.specs:
-        saveable_tensor_structure.append(spec.name)
-        restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
-    tensor_names, tensor_slices, tensor_dtypes = zip(*restore_specs)
+    restore_ops = restore_from_saveable_objects(
+        file_prefix, self._saveable_objects)
     with ops.device("cpu:0"):
-      restored_tensors = io_ops.restore_v2(
-          file_prefix, tensor_names, tensor_slices, tensor_dtypes)
-    structured_restored_tensors = nest.pack_sequence_as(
-        tensor_structure, restored_tensors)
-    for saveable, restored_tensors in zip(self._saveable_objects,
-                                          structured_restored_tensors):
-      saveable.restore(restored_tensors,
-                       restored_shapes=None)
-    return file_prefix
+      with ops.control_dependencies(restore_ops):
+        return array_ops.identity(file_prefix)
+
+
+def restore_from_saveable_objects(file_prefix, saveable_objects):
+  """Reads from a checkpoint and returns restore ops for `saveable_objects`s."""
+  restore_specs = []
+  tensor_structure = []
+  for saveable in saveable_objects:
+    saveable_tensor_structure = []
+    tensor_structure.append(saveable_tensor_structure)
+    for spec in saveable.specs:
+      saveable_tensor_structure.append(spec.name)
+      restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
+  tensor_names, tensor_slices, tensor_dtypes = zip(*restore_specs)
+  with ops.device("cpu:0"):
+    restored_tensors = io_ops.restore_v2(
+        file_prefix, tensor_names, tensor_slices, tensor_dtypes)
+  structured_restored_tensors = nest.pack_sequence_as(
+      tensor_structure, restored_tensors)
+  restore_ops = []
+  for saveable, restored_tensors in zip(saveable_objects,
+                                        structured_restored_tensors):
+    restore_ops.append(saveable.restore(restored_tensors,
+                                        restored_shapes=None))
+  return restore_ops
diff --git a/tensorflow/python/training/saving/saveable_object.py b/tensorflow/python/training/saving/saveable_object.py
index 4b19294b6545de8105443a46a112a416f6bf481c..981d4580fcb3390380e58e90a2edefa2cae5f066 100644
--- a/tensorflow/python/training/saving/saveable_object.py
+++ b/tensorflow/python/training/saving/saveable_object.py
@@ -66,6 +66,11 @@ class SaveableObject(object):
     self.name = name
     self._device = None
 
+  @property
+  def optional_restore(self):
+    """A hint to restore assertions that this object is optional."""
+    return False  # Default to required
+
   @property
   def device(self):
     """The device for SaveSpec Tensors."""
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
index fa88d2c6ebd2f29c2d2de7583a918dcbc6b28b51..b8cc66249bffd7c5c21280969a5d27b8c3b89da7 100644
--- a/tensorflow/python/training/saving/saveable_object_util.py
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -165,7 +165,7 @@ def saveable_objects_for_op(op, name):
         yield ResourceVariableSaveable(
             variable, variable._save_slice_info.spec, name)
     # pylint: enable=protected-access
-  elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
+  elif isinstance(op, checkpointable.Checkpointable) and not isinstance(
       op, variables.Variable):
     # pylint: disable=protected-access
     for attr, factory in op._gather_saveables_for_checkpoint().items():
@@ -250,7 +250,7 @@ def op_list_to_dict(op_list, convert_variable_to_tensor=True):
         names_to_saveables[name].append(var)
       else:
         names_to_saveables[name] = [var]
-    elif (isinstance(var, checkpointable.CheckpointableBase)
+    elif (isinstance(var, checkpointable.Checkpointable)
           and not isinstance(var, variables.Variable)):
       checkpointable_saveables = [
           (factory() if callable(factory) else factory)
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 0f68fcfe8bb4cb81e54ba27d35bfb0b2e3888a1b..104247e60ece2477506e94c152bf9b4f26a806cd 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -21,6 +21,7 @@ import time
 import numpy as np
 
 from tensorflow.python.client import session
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
@@ -181,8 +182,16 @@ class SessionManager(object):
         set.
     """
     self._target = master
-    sess = session.Session(self._target, graph=self._graph, config=config)
 
+    # This is required to so that we initialize the TPU device before
+    # restoring from checkpoint since we'll be placing variables on the device
+    # and TPUInitialize wipes out the memory of the device.
+    strategy = distribution_strategy_context.get_strategy()
+    if strategy and hasattr(strategy.extended,
+                            "_experimental_initialize_system"):
+      strategy.extended._experimental_initialize_system()  # pylint: disable=protected-access
+
+    sess = session.Session(self._target, graph=self._graph, config=config)
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
                        "checkpoint_filename_with_path.")
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index bc1137e200dc0bfbc49c518dff63121ae3cd4f9e..0868cfdea8896e00b4348919b43d948ae30ee956 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -121,9 +121,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
-      with distribution_strategy.colocate_vars_with(primary):
+      distribution_strategy = distribution_strategy_context.get_strategy()
+      with distribution_strategy.extended.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
       return _create_slot_var(primary, val, "", validate_shape, None, None)
@@ -159,9 +158,8 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
-      with distribution_strategy.colocate_vars_with(primary):
+      distribution_strategy = distribution_strategy_context.get_strategy()
+      with distribution_strategy.extended.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
     else:
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index cd4590db7f6550f8790ad683c9aaecf145ad12da..21e9a99e7ceeebc6b021bb899fd77faa5e19ed48 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -260,9 +260,8 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     # local_anchor op will be placed on this worker task by default.
     local_anchor = control_flow_ops.no_op()
     # Colocating local_step variable prevents it being placed on the PS.
-    distribution_strategy = (
-        distribution_strategy_context.get_distribution_strategy())
-    with distribution_strategy.colocate_vars_with(local_anchor):
+    distribution_strategy = distribution_strategy_context.get_strategy()
+    with distribution_strategy.extended.colocate_vars_with(local_anchor):
       self._local_step = variable_scope.variable(
           initial_value=0,
           trainable=False,
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 9aaf0c2de9756718645e77de416c653182994019..f8e8d4c28a50629f108abeb0700d82fba311666c 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -100,7 +100,13 @@ def _validate_deprecation_args(date, instructions):
 def _call_location(outer=False):
   """Returns call location given level up from current call."""
   stack = tf_stack.extract_stack()
-  frame = stack[-4 if outer else -3]
+  length = len(stack)
+  if length == 0:  # should never happen as we're in a function
+    return 'UNKNOWN'
+  index = length-4 if outer else length-3
+  if index < 0:
+    index = 0
+  frame = stack[index]
   return '{filename}:{lineno}'.format(filename=frame[0], lineno=frame[1])
 
 
diff --git a/tensorflow/python/util/function_utils.py b/tensorflow/python/util/function_utils.py
index a56dfbff8e383134f3ad475736b7679dcceb055f..84e45bec6fc58f18a6ce6f0e8576e2cdb135ed8d 100644
--- a/tensorflow/python/util/function_utils.py
+++ b/tensorflow/python/util/function_utils.py
@@ -22,6 +22,7 @@ import functools
 
 import six
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -113,3 +114,16 @@ def get_func_code(func):
       return None
   else:
     raise ValueError('Argument must be callable')
+
+
+_rewriter_config_optimizer_disabled = None
+
+
+def get_disabled_rewriter_config():
+  global _rewriter_config_optimizer_disabled
+  if _rewriter_config_optimizer_disabled is None:
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
+    rewriter_config.disable_meta_optimizer = True
+    _rewriter_config_optimizer_disabled = config.SerializeToString()
+  return _rewriter_config_optimizer_disabled
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 70e5ebb3b68b0973cf46d147bf2a11837a82b1b9..436cf82ea512be908bc9369297a3e6b23837969d 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -41,10 +41,38 @@ import six as _six
 from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
 
 
-def _get_attrs_values(obj):
-  """Returns the list of values from an attrs instance."""
+_SHALLOW_TREE_HAS_INVALID_KEYS = (
+    "The shallow_tree's keys are not a subset of the input_tree's keys. The "
+    "shallow_tree has the following keys that are not in the input_tree: {}.")
+
+_STRUCTURES_HAVE_MISMATCHING_TYPES = (
+    "The two structures don't have the same sequence type. Input structure has "
+    "type {shallow_type}, while shallow structure has type {input_type}.")
+
+_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE = (
+    "The input_tree has fewer elements than the input_tree. Input structure "
+    "has length {input_size}, while shallow structure has length "
+    "{shallow_size}.")
+
+_IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ = (
+    "If shallow structure is a sequence, input must also be a sequence. "
+    "Input has type: {}.")
+
+
+def _get_attrs_items(obj):
+  """Returns a list of (name, value) pairs from an attrs instance.
+
+  The list will be sorted by name.
+
+  Args:
+    obj: an object.
+
+  Returns:
+    A list of (attr_name, attr_value) pairs, sorted by attr_name.
+  """
   attrs = getattr(obj.__class__, "__attrs_attrs__")
-  return [getattr(obj, a.name) for a in attrs]
+  attr_names = sorted([a.name for a in attrs])
+  return [(attr_name, getattr(obj, attr_name)) for attr_name in attr_names]
 
 
 def _sorted(dict_):
@@ -74,6 +102,7 @@ def _is_namedtuple(instance, strict=False):
 # See the swig file (util.i) for documentation.
 _is_mapping = _pywrap_tensorflow.IsMapping
 _is_attrs = _pywrap_tensorflow.IsAttrs
+_is_composite_tensor = _pywrap_tensorflow.IsCompositeTensor
 
 
 def _sequence_like(instance, args):
@@ -97,33 +126,63 @@ def _sequence_like(instance, args):
     return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
   elif _is_namedtuple(instance) or _is_attrs(instance):
     return type(instance)(*args)
+  elif _is_composite_tensor(instance):
+    return instance._from_components(args)  # pylint: disable=protected-access
   else:
     # Not a namedtuple
     return type(instance)(args)
 
 
 def _yield_value(iterable):
-  """Yields the next value from the given iterable."""
-  if _is_mapping(iterable):
+  for _, v in _yield_sorted_items(iterable):
+    yield v
+
+
+def _yield_sorted_items(iterable):
+  """Yield (key, value) pairs for `iterable` in a deterministic order.
+
+  For Sequences, the key will be an int, the array index of a value.
+  For Mappings, the key will be the dictionary key.
+  For objects (e.g. namedtuples), the key will be the attribute name.
+
+  In all cases, the keys will be iterated in sorted order.
+
+  Args:
+    iterable: an iterable.
+
+  Yields:
+    The iterable's (key, value) pairs, in order of sorted keys.
+  """
+  if isinstance(iterable, _collections.Mapping):
     # Iterate through dictionaries in a deterministic order by sorting the
     # keys. Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     for key in _sorted(iterable):
-      yield iterable[key]
+      yield key, iterable[key]
   elif _is_attrs(iterable):
-    for value in _get_attrs_values(iterable):
-      yield value
+    for item in _get_attrs_items(iterable):
+      yield item
+  elif _is_namedtuple(iterable):
+    for field in iterable._fields:
+      yield field, getattr(iterable, field)
+  elif _is_composite_tensor(iterable):
+    for item in enumerate(iterable._to_components()):  # pylint: disable=protected-access
+      yield item
   else:
-    for value in iterable:
-      yield value
+    for item in enumerate(iterable):
+      yield item
 
 
 # See the swig file (util.i) for documentation.
 is_sequence = _pywrap_tensorflow.IsSequence
 
 
+# See the swig file (util.i) for documentation.
+is_sequence_or_composite = _pywrap_tensorflow.IsSequenceOrComposite
+
+
 # See the swig file (util.i) for documentation.
 flatten = _pywrap_tensorflow.Flatten
 
@@ -144,12 +203,13 @@ class _DotString(object):
 _DOT = _DotString()
 
 
-def assert_same_structure(nest1, nest2, check_types=True):
+def assert_same_structure(nest1, nest2, check_types=True,
+                          expand_composites=False):
   """Asserts that two structures are nested in the same way.
 
   Note that namedtuples with identical name and fields are always considered
   to have the same shallow structure (even with `check_types=True`).
-  For intance, this code will print `True`:
+  For instance, this code will print `True`:
 
   ```python
   def nt(a, b):
@@ -168,6 +228,8 @@ def assert_same_structure(nest1, nest2, check_types=True):
         considered the same if they are both list subtypes (which allows "list"
         and "_ListWrapper" from checkpointable dependency tracking to compare
         equal).
+    expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+        and `tf.RaggedTensor` are expanded into their component tensors.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -176,7 +238,8 @@ def assert_same_structure(nest1, nest2, check_types=True):
       their substructures. Only possible if `check_types` is `True`.
   """
   try:
-    _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types)
+    _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types,
+                                           expand_composites)
   except (ValueError, TypeError) as e:
     str1 = str(map_structure(lambda _: _DOT, nest1))
     str2 = str(map_structure(lambda _: _DOT, nest2))
@@ -242,13 +305,14 @@ def flatten_dict_items(dictionary):
   return flat_dictionary
 
 
-def _packed_nest_with_indices(structure, flat, index):
+def _packed_nest_with_indices(structure, flat, index, is_seq):
   """Helper function for pack_sequence_as.
 
   Args:
     structure: Substructure (list / tuple / dict) to mimic.
     flat: Flattened values to output substructure for.
     index: Index at which to start reading from flat.
+    is_seq: Function used to test if a value should be treated as a sequence.
 
   Returns:
     The tuple (new_index, child), where:
@@ -263,8 +327,8 @@ def _packed_nest_with_indices(structure, flat, index):
   """
   packed = []
   for s in _yield_value(structure):
-    if is_sequence(s):
-      new_index, child = _packed_nest_with_indices(s, flat, index)
+    if is_seq(s):
+      new_index, child = _packed_nest_with_indices(s, flat, index, is_seq)
       packed.append(_sequence_like(s, child))
       index = new_index
     else:
@@ -273,7 +337,7 @@ def _packed_nest_with_indices(structure, flat, index):
   return index, packed
 
 
-def pack_sequence_as(structure, flat_sequence):
+def pack_sequence_as(structure, flat_sequence, expand_composites=False):
   """Returns a given flattened sequence packed into a given structure.
 
   If `structure` is a scalar, `flat_sequence` must be a single-element list;
@@ -293,6 +357,8 @@ def pack_sequence_as(structure, flat_sequence):
         tuples, and dicts. Note: numpy arrays and strings are considered
         scalars.
     flat_sequence: flat sequence to pack.
+    expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+        and `tf.RaggedTensor` are expanded into their component tensors.
 
   Returns:
     packed: `flat_sequence` converted to have the same recursive structure as
@@ -303,17 +369,19 @@ def pack_sequence_as(structure, flat_sequence):
       element counts.
     TypeError: `structure` is or contains a dict with non-sortable keys.
   """
-  if not is_sequence(flat_sequence):
+  is_seq = is_sequence_or_composite if expand_composites else is_sequence
+  if not is_seq(flat_sequence):
     raise TypeError("flat_sequence must be a sequence")
 
-  if not is_sequence(structure):
+  if not is_seq(structure):
     if len(flat_sequence) != 1:
       raise ValueError("Structure is a scalar but len(flat_sequence) == %d > 1"
                        % len(flat_sequence))
     return flat_sequence[0]
 
   try:
-    final_index, packed = _packed_nest_with_indices(structure, flat_sequence, 0)
+    final_index, packed = _packed_nest_with_indices(structure, flat_sequence,
+                                                    0, is_seq)
     if final_index < len(flat_sequence):
       raise IndexError
   except IndexError:
@@ -326,7 +394,7 @@ def pack_sequence_as(structure, flat_sequence):
   return _sequence_like(structure, packed)
 
 
-def map_structure(func, *structure, **check_types_dict):
+def map_structure(func, *structure, **kwargs):
   """Applies `func` to each entry in `structure` and returns a new structure.
 
   Applies `func(x[0], x[1], ...)` where x[i] is an entry in
@@ -337,12 +405,18 @@ def map_structure(func, *structure, **check_types_dict):
     func: A callable that accepts as many arguments as there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered as scalars.
-    **check_types_dict: only valid keyword argument is `check_types`. If set to
-      `True` (default) the types of iterables within the structures have to be
-      same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
-      exception). To allow this set this argument to `False`.
-      Note that namedtuples with identical name and fields are always
-      considered to have the same shallow structure.
+    **kwargs: Valid keyword args are:
+
+      * `check_types`: If set to `True` (default) the types of
+        iterables within the structures have to be same (e.g.
+        `map_structure(func, [1], (1,))` raises a `TypeError`
+        exception). To allow this set this argument to `False`.
+        Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
+      * `expand_composites`: If set to `True`, then composite tensors such
+        as `tf.SparseTensor` and `tf.RaggedTensor` are expanded into their
+        component tensors.  If `False` (the default), then composite tensors
+        are not expanded.
 
   Returns:
     A new structure with the same arity as `structure`, whose values correspond
@@ -364,21 +438,25 @@ def map_structure(func, *structure, **check_types_dict):
   if not structure:
     raise ValueError("Must provide at least one structure")
 
-  if check_types_dict:
-    if "check_types" not in check_types_dict or len(check_types_dict) > 1:
-      raise ValueError("Only valid keyword argument is check_types")
-    check_types = check_types_dict["check_types"]
-  else:
-    check_types = True
+  check_types = True
+  expand_composites = False
+  if kwargs:
+    check_types = kwargs.pop("check_types", check_types)
+    expand_composites = kwargs.pop("expand_composites", expand_composites)
+    if kwargs:
+      raise ValueError("Only valid keyword arguments are check_types "
+                       "and expand_composites")
 
   for other in structure[1:]:
-    assert_same_structure(structure[0], other, check_types=check_types)
+    assert_same_structure(structure[0], other, check_types=check_types,
+                          expand_composites=expand_composites)
 
-  flat_structure = [flatten(s) for s in structure]
+  flat_structure = [flatten(s, expand_composites) for s in structure]
   entries = zip(*flat_structure)
 
   return pack_sequence_as(
-      structure[0], [func(*x) for x in entries])
+      structure[0], [func(*x) for x in entries],
+      expand_composites=expand_composites)
 
 
 def map_structure_with_paths(func, *structure, **kwargs):
@@ -413,8 +491,14 @@ def map_structure_with_paths(func, *structure, **kwargs):
       the type of sequence in any of their substructures.
     ValueError: If no structures are provided.
   """
-  return _map_structure_with_tuple_or_string_paths(
-      use_string_paths=True, func=func, structure=structure, kwargs=kwargs)
+  def wrapper_func(tuple_path, *inputs, **kwargs):
+    string_path = "/".join(str(s) for s in tuple_path)
+    return func(string_path, *inputs, **kwargs)
+
+  return map_structure_with_tuple_paths_up_to(structure[0],
+                                              wrapper_func,
+                                              *structure,
+                                              **kwargs)
 
 
 def map_structure_with_tuple_paths(func, *structure, **kwargs):
@@ -450,52 +534,43 @@ def map_structure_with_tuple_paths(func, *structure, **kwargs):
       the type of sequence in any of their substructures.
     ValueError: If no structures are provided.
   """
-  return _map_structure_with_tuple_or_string_paths(
-      use_string_paths=False, func=func, structure=structure, kwargs=kwargs)
+  return map_structure_with_tuple_paths_up_to(structure[0],
+                                              func,
+                                              *structure,
+                                              **kwargs)
 
 
-def _map_structure_with_tuple_or_string_paths(
-    use_string_paths, func, structure, kwargs):
-  """Implements `map_structure` with either tuple or string paths."""
-
-  if not callable(func):
-    raise TypeError("func must be callable, got: %s" % func)
-  if not structure:
-    raise ValueError("Must provide at least one structure")
-
-  check_types = kwargs.pop("check_types", True)
-  for other in structure[1:]:
-    assert_same_structure(structure[0], other, check_types=check_types)
-
-  if use_string_paths:
-    flatten_func = flatten_with_joined_string_paths
-  else:
-    flatten_func = flatten_with_tuple_paths
-
-  # First set paths_and_values to:
-  # [[(p11, v11), ... (p1n, v1n)], ... [(pm1, vm1), ... (pmn, vmn)]]
-  paths_and_values = [flatten_func(s) for s in structure]
-
-  # Now zip(*paths_and_values) would be:
-  # [((p11, v11), ... (pm1, vm1)), ... ((p1n, v1n), ... (pmn, vmn))]
-  # so grouped_by_path is set to:
-  # [[(p11, ... pm1), (v11, ... vm1)], ... [(p1n, ... pmn), (v1n, ... vmn)]]
-  # Note that p1i, ... pmi must all be equal since the structures are the same.
-  grouped_by_path = [zip(*p_v) for p_v in zip(*paths_and_values)]
-
-  return pack_sequence_as(structure[0], [
-      func(paths[0], *values, **kwargs) for paths, values in grouped_by_path])
+def _yield_flat_up_to(shallow_tree, input_tree, path=()):
+  """Yields (path, value) pairs of input_tree flattened up to shallow_tree.
 
+  Args:
+    shallow_tree: Nested structure. Traverse no further than its leaf nodes.
+    input_tree: Nested structure. Return the paths and values from this tree.
+      Must have the same upper structure as shallow_tree.
+    path: Tuple. Optional argument, only used when recursing. The path from the
+      root of the original shallow_tree, down to the root of the shallow_tree
+      arg of this recursive call.
 
-def _yield_flat_up_to(shallow_tree, input_tree):
-  """Yields elements `input_tree` partially flattened up to `shallow_tree`."""
-  if is_sequence(shallow_tree):
-    for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
-                                            _yield_value(input_tree)):
-      for input_leaf in _yield_flat_up_to(shallow_branch, input_branch):
-        yield input_leaf
+  Yields:
+    Pairs of (path, value), where path the tuple path of a leaf node in
+    shallow_tree, and value is the value of the corresponding node in
+    input_tree.
+  """
+  if (isinstance(shallow_tree, _six.string_types) or
+      not any([isinstance(shallow_tree, _collections.Sequence),
+               isinstance(shallow_tree, _collections.Mapping),
+               _is_namedtuple(shallow_tree),
+               _is_attrs(shallow_tree)])):
+    yield (path, input_tree)
   else:
-    yield input_tree
+    input_tree = dict(_yield_sorted_items(input_tree))
+    for shallow_key, shallow_subtree in _yield_sorted_items(shallow_tree):
+      subpath = path + (shallow_key,)
+      input_subtree = input_tree[shallow_key]
+      for leaf_path, leaf_value in _yield_flat_up_to(shallow_subtree,
+                                                     input_subtree,
+                                                     path=subpath):
+        yield (leaf_path, leaf_value)
 
 
 def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
@@ -509,8 +584,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
 
   The following code will raise an exception:
   ```python
-    shallow_tree = ["a", "b"]
-    input_tree = ["c", ["d", "e"], "f"]
+    shallow_tree = {"a": "A", "b": "B"}
+    input_tree = {"a": 1, "c": 2}
     assert_shallow_structure(shallow_tree, input_tree)
   ```
 
@@ -549,40 +624,34 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
       input_is_namedtuple = _is_namedtuple(input_tree, False)
       if shallow_is_namedtuple and input_is_namedtuple:
         if not _same_namedtuples(shallow_tree, input_tree):
-          raise TypeError(
-              "The two namedtuples don't have the same sequence type. Input "
-              "structure has type %s, while shallow structure has type %s."
-              % (type(input_tree), type(shallow_tree)))
+          raise TypeError(_STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+              input_type=type(input_tree),
+              shallow_type=type(shallow_tree)))
+
       elif not (isinstance(shallow_tree, _collections.Mapping)
                 and isinstance(input_tree, _collections.Mapping)):
-        raise TypeError(
-            "The two structures don't have the same sequence type. Input "
-            "structure has type %s, while shallow structure has type %s."
-            % (type(input_tree), type(shallow_tree)))
+        raise TypeError(_STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+            input_type=type(input_tree),
+            shallow_type=type(shallow_tree)))
 
-    if len(input_tree) != len(shallow_tree):
-      raise ValueError(
-          "The two structures don't have the same sequence length. Input "
-          "structure has length %s, while shallow structure has length %s."
-          % (len(input_tree), len(shallow_tree)))
-
-    if check_types and isinstance(shallow_tree, (dict, _collections.Mapping)):
-      if set(input_tree) != set(shallow_tree):
-        raise ValueError(
-            "The two structures don't have the same keys. Input "
-            "structure has keys %s, while shallow structure has keys %s." %
-            (list(_six.iterkeys(input_tree)),
-             list(_six.iterkeys(shallow_tree))))
+    if len(input_tree) < len(shallow_tree):
+      raise ValueError(_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+          input_size=len(input_tree),
+          shallow_size=len(shallow_tree)))
 
-      input_tree = list(sorted(_six.iteritems(input_tree)))
-      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
+    if isinstance(shallow_tree, _collections.Mapping):
+      absent_keys = set(shallow_tree) - set(input_tree)
+      if absent_keys:
+        raise ValueError(_SHALLOW_TREE_HAS_INVALID_KEYS
+                         .format(sorted(absent_keys)))
 
-    for shallow_branch, input_branch in zip(shallow_tree, input_tree):
+    for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
+                                            _yield_value(input_tree)):
       assert_shallow_structure(shallow_branch, input_branch,
                                check_types=check_types)
 
 
-def flatten_up_to(shallow_tree, input_tree):
+def flatten_up_to(shallow_tree, input_tree, check_types=True):
   """Flattens `input_tree` up to `shallow_tree`.
 
   Any further depth in structure in `input_tree` is retained as elements in the
@@ -639,6 +708,8 @@ def flatten_up_to(shallow_tree, input_tree):
     shallow_tree: a possibly pruned structure of input_tree.
     input_tree: an arbitrarily nested structure or a scalar object.
       Note, numpy arrays are considered scalars.
+    check_types: bool. If True, check that each node in shallow_tree has the
+      same type as the corresponding node in input_tree.
 
   Returns:
     A Python list, the partially flattened version of `input_tree` according to
@@ -651,11 +722,12 @@ def flatten_up_to(shallow_tree, input_tree):
     ValueError: If the sequence lengths of `shallow_tree` are different from
       `input_tree`.
   """
-  assert_shallow_structure(shallow_tree, input_tree)
-  return list(_yield_flat_up_to(shallow_tree, input_tree))
+  assert_shallow_structure(shallow_tree, input_tree, check_types)
+  # Discard paths returned by _yield_flat_up_to.
+  return list(v for _, v in _yield_flat_up_to(shallow_tree, input_tree))
 
 
-def map_structure_up_to(shallow_tree, func, *inputs):
+def map_structure_up_to(shallow_tree, func, *inputs, **kwargs):
   """Applies a function or op to a number of partially flattened inputs.
 
   The `inputs` are flattened up to `shallow_tree` before being mapped.
@@ -704,6 +776,11 @@ def map_structure_up_to(shallow_tree, func, *inputs):
         shallow_tree. The function `func` is applied to corresponding
         partially flattened elements of each input, so the function must support
         arity of `len(inputs)`.
+    **kwargs: kwargs to feed to func(). Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
 
   Raises:
     TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
@@ -716,16 +793,93 @@ def map_structure_up_to(shallow_tree, func, *inputs):
     result of repeatedly applying `func`, with same structure as
     `shallow_tree`.
   """
+  return map_structure_with_tuple_paths_up_to(
+      shallow_tree,
+      lambda _, *values: func(*values),  # Discards the path arg.
+      *inputs,
+      **kwargs)
+
+
+def map_structure_with_tuple_paths_up_to(shallow_tree, func, *inputs, **kwargs):
+  """Applies a function or op to a number of partially flattened inputs.
+
+  Like map_structure_up_to(), except that the 'func' argument takes a path
+  tuple as its first argument, followed by the corresponding values from
+  *inputs.
+
+  Example:
+
+  lowercase = {'a': 'a', 'b': ('b0', 'b1')}
+  uppercase = {'a': 'A', 'b': ('B0', 'B1')}
+
+  def print_path_and_values(path, *values):
+    print("path: {}, values: {}".format(path, values))
+
+  shallow_tree = {'a': None}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase)
+  >>> path: ('a',), values: ('a', 'A')
+  >>> path: ('b', 0), values: ('b0', 'B0')
+  >>> path: ('b', 1), values: ('b1', 'B1')
+
+  shallow_tree = {'b': None}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase,
+                                       check_types=False)
+  >>> path: ('b', 1), values: (('bo', 'b1'), ('B0', 'B1'))
+
+  shallow_tree = {'a': None, 'b': {1: None}}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase,
+                                       check_types=False)
+  >>> path: ('a',), values: ('a', 'A')
+  >>> path: ('b', 1), values: ('b1', B1')
+
+  Args:
+    shallow_tree: a shallow tree, common to all the inputs.
+    func: callable that takes args (path, inputs_0_value, ... , inputs_N_value),
+      where path is a tuple path to a leaf node in shallow_tree, and
+      inputs_i_value is the corresponding value from inputs[i].
+    *inputs: nested structures that are all structurally compatible with
+        shallow_tree.
+    **kwargs: kwargs to feed to func(). Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
+
+  Raises:
+    TypeError: If `shallow_tree` is a sequence but one of `*inputs` is not.
+    TypeError: If the sequence types of `shallow_tree` are different from
+      `input_tree`.
+    ValueError: If the sequence lengths of `shallow_tree` are different from
+      `input_tree`.
+
+  Returns:
+    Result of repeatedly applying `func`. Has same structure as `shallow_tree`.
+  """
   if not inputs:
     raise ValueError("Cannot map over no sequences")
+
+  check_types = kwargs.pop("check_types", True)
+
   for input_tree in inputs:
-    assert_shallow_structure(shallow_tree, input_tree)
+    assert_shallow_structure(shallow_tree, input_tree, check_types=check_types)
 
   # Flatten each input separately, apply the function to corresponding elements,
   # then repack based on the structure of the first input.
-  all_flattened_up_to = [flatten_up_to(shallow_tree, input_tree)
-                         for input_tree in inputs]
-  results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
+  flat_value_lists = [flatten_up_to(shallow_tree, input_tree, check_types)
+                      for input_tree in inputs]
+  flat_path_list = [path for path, _
+                    in _yield_flat_up_to(shallow_tree, inputs[0])]
+  results = [func(*args, **kwargs) for args in zip(flat_path_list,
+                                                   *flat_value_lists)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
 
 
@@ -824,27 +978,8 @@ def yield_flat_paths(nest):
     Tuples containing index or key values which form the path to a specific
       leaf value in the nested structure.
   """
-
-  # The _maybe_add_final_path_element function is used below in order to avoid
-  # adding trailing slashes when the sub-element recursed into is a leaf.
-  if isinstance(nest, (dict, _collections.Mapping)):
-    for key in _sorted(nest):
-      value = nest[key]
-      for sub_path in yield_flat_paths(value):
-        yield (key,) + sub_path
-  elif _is_namedtuple(nest):
-    for key in nest._fields:
-      value = getattr(nest, key)
-      for sub_path in yield_flat_paths(value):
-        yield (key,) + sub_path
-  elif isinstance(nest, _six.string_types):
-    yield ()
-  elif isinstance(nest, _collections.Sequence):
-    for idx, value in enumerate(nest):
-      for sub_path in yield_flat_paths(value):
-        yield (idx,) + sub_path
-  else:
-    yield ()
+  for k, _ in _yield_flat_up_to(nest, nest):
+    yield k
 
 
 def flatten_with_joined_string_paths(structure, separator="/"):
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 83fa5dd66084e7d6710505bc638cdc7ae4f9bbe3..71034ffcb6bf3cf7e2b47cf9615500774dc4957f 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -510,30 +510,28 @@ class NestTest(parameterized.TestCase, test.TestCase):
   def testAssertShallowStructure(self):
     inp_ab = ["a", "b"]
     inp_abc = ["a", "b", "c"]
-    expected_message = (
-        "The two structures don't have the same sequence length. Input "
-        "structure has length 2, while shallow structure has length 3.")
-    with self.assertRaisesRegexp(ValueError, expected_message):
-      nest.assert_shallow_structure(inp_abc, inp_ab)
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+            shallow_size=len(inp_abc),
+            input_size=len(inp_ab))):
+      nest.assert_shallow_structure(shallow_tree=inp_abc, input_tree=inp_ab)
 
     inp_ab1 = [(1, 1), (2, 2)]
     inp_ab2 = [[1, 1], [2, 2]]
-    expected_message = (
-        "The two structures don't have the same sequence type. Input structure "
-        "has type <(type|class) 'tuple'>, while shallow structure has type "
-        "<(type|class) 'list'>.")
-    with self.assertRaisesRegexp(TypeError, expected_message):
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+            shallow_type=type(inp_ab2[0]),
+            input_type=type(inp_ab1[0]))):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
     nest.assert_shallow_structure(inp_ab2, inp_ab1, check_types=False)
 
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
-    expected_message = (
-        r"The two structures don't have the same keys. Input "
-        r"structure has keys \['c'\], while shallow structure has "
-        r"keys \['d'\].")
-
-    with self.assertRaisesRegexp(ValueError, expected_message):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["d"])):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
     inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
@@ -719,7 +717,9 @@ class NestTest(parameterized.TestCase, test.TestCase):
     # Non-equal dicts.
     inp_val = dict(a=2, b=3)
     inp_ops = dict(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
-    with self.assertRaisesRegexp(ValueError, "same keys"):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["b"])):
       nest.map_structure_up_to(
           inp_val,
           lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
@@ -736,7 +736,9 @@ class NestTest(parameterized.TestCase, test.TestCase):
     # Non-equal dict/mapping.
     inp_val = dict(a=2, b=3)
     inp_ops = _CustomMapping(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
-    with self.assertRaisesRegexp(ValueError, "same keys"):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["b"])):
       nest.map_structure_up_to(
           inp_val,
           lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
@@ -849,12 +851,12 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(expected, result)
 
   @parameterized.named_parameters(
-      ("tuples", (1, 2), (3, 4, 5), ValueError),
+      ("tuples", (1, 2, 3), (4, 5), ValueError),
       ("dicts", {"a": 1}, {"b": 2}, ValueError),
       ("mixed", (1, 2), [3, 4], TypeError),
       ("nested",
-       {"a": [2, 3], "b": [1, 3]},
-       {"b": [5, 6, 7], "a": [8, 9]},
+       {"a": [2, 3, 4], "b": [1, 3]},
+       {"b": [5, 6], "a": [8, 9]},
        ValueError
       ))
   def testMapWithPathsIncompatibleStructures(self, s1, s2, error_type):
@@ -884,13 +886,14 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(expected, result)
 
   @parameterized.named_parameters([
-      dict(testcase_name="Tuples", s1=(1, 2), s2=(3, 4, 5),
+      dict(testcase_name="Tuples", s1=(1, 2, 3), s2=(4, 5),
            error_type=ValueError),
       dict(testcase_name="Dicts", s1={"a": 1}, s2={"b": 2},
            error_type=ValueError),
       dict(testcase_name="Mixed", s1=(1, 2), s2=[3, 4], error_type=TypeError),
       dict(testcase_name="Nested",
-           s1={"a": [2, 3], "b": [1, 3]}, s2={"b": [5, 6, 7], "a": [8, 9]},
+           s1={"a": [2, 3, 4], "b": [1, 3]},
+           s2={"b": [5, 6], "a": [8, 9]},
            error_type=ValueError)
   ])
   def testMapWithTuplePathsIncompatibleStructures(self, s1, s2, error_type):
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index 90c9c4b5b38dad824f4132513cc71a82fafcbf92..ce1a1a5c30172c9286423b48c88b04242992546f 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -251,7 +251,7 @@ def get_v2_constants(module):
 class api_export(object):  # pylint: disable=invalid-name
   """Provides ways to export symbols to the TensorFlow API."""
 
-  def __init__(self, *args, **kwargs):
+  def __init__(self, *args, **kwargs):  # pylint: disable=g-doc-args
     """Export under the names *args (first one is considered canonical).
 
     Args:
@@ -269,6 +269,10 @@ class api_export(object):  # pylint: disable=invalid-name
     """
     self._names = args
     self._names_v1 = kwargs.get('v1', args)
+    if 'v2' in kwargs:
+      raise ValueError('You passed a "v2" argument to tf_export. This is not '
+                       'what you want. Pass v2 names directly as positional '
+                       'arguments instead.')
     self._api_name = kwargs.get('api_name', TENSORFLOW_API_NAME)
     self._overrides = kwargs.get('overrides', [])
     self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index e69eec73a0ef8b37f042d9a0f5bf63569b6f5b39..bda0cba82fa31528337cd35d26f5daa577a43d55 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -455,6 +455,14 @@ class SparseTensorValueIterator : public ValueIterator {
   Safe_PyObjectPtr tensor_;
 };
 
+// Returns nullptr (to raise an exception) when next() is called.  Caller
+// should have already called PyErr_SetString.
+class ErrorValueIterator : public ValueIterator {
+ public:
+  ErrorValueIterator() {}
+  Safe_PyObjectPtr next() override { return nullptr; }
+};
+
 class AttrsValueIterator : public ValueIterator {
  public:
   explicit AttrsValueIterator(PyObject* nested) : nested_(nested) {
@@ -497,6 +505,35 @@ bool IsSparseTensorValueType(PyObject* o) {
              o, reinterpret_cast<PyTypeObject*>(sparse_tensor_value_type)) == 1;
 }
 
+// Returns 1 if `o` is an instance of CompositeTensor.
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+bool IsCompositeTensorHelper(PyObject* o) {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    PyObject* composite_tensor_type = GetRegisteredType("CompositeTensor");
+    if (TF_PREDICT_FALSE(composite_tensor_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "CompositeTensor type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"CompositeTensor\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    int is_instance = PyObject_IsInstance(to_check, composite_tensor_type);
+
+    // Don't cache a failed is_instance check.
+    if (is_instance == -1) return -1;
+
+    return static_cast<int>(is_instance != 0);
+  });
+  return check_cache->CachedLookup(o);
+}
+
+int IsSequenceOrCompositeHelper(PyObject* o) {
+  return IsSequence(o) || IsCompositeTensor(o);
+}
+
 int IsSequenceForDataHelper(PyObject* o) {
   return IsSequenceHelper(o) == 1 && !PyList_Check(o) &&
          !IsSparseTensorValueType(o);
@@ -529,6 +566,18 @@ ValueIteratorPtr GetValueIteratorForData(PyObject* nested) {
   }
 }
 
+// Similar to GetValueIterator above, but expands CompositeTensors.
+ValueIteratorPtr GetValueIteratorForComposite(PyObject* nested) {
+  if (IsCompositeTensor(nested)) {
+    static char expand_method_name[] = "_to_components";
+    nested = PyObject_CallMethod(nested, expand_method_name, nullptr);
+    if (PyErr_Occurred() || nested == nullptr) {
+      return absl::make_unique<ErrorValueIterator>();
+    }
+  }
+  return GetValueIterator(nested);
+}
+
 bool FlattenHelper(
     PyObject* nested, PyObject* list,
     const std::function<int(PyObject*)>& is_sequence_helper,
@@ -596,7 +645,8 @@ void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
 bool AssertSameStructureHelper(
     PyObject* o1, PyObject* o2, bool check_types, string* error_msg,
     bool* is_type_error,
-    const std::function<int(PyObject*)>& is_sequence_helper) {
+    const std::function<int(PyObject*)>& is_sequence_helper,
+    const std::function<ValueIteratorPtr(PyObject*)>& value_iterator_getter) {
   DCHECK(error_msg);
   DCHECK(is_type_error);
   const bool is_seq1 = is_sequence_helper(o1);
@@ -702,8 +752,8 @@ bool AssertSameStructureHelper(
     }
   }
 
-  ValueIteratorPtr iter1 = GetValueIterator(o1);
-  ValueIteratorPtr iter2 = GetValueIterator(o2);
+  ValueIteratorPtr iter1 = value_iterator_getter(o1);
+  ValueIteratorPtr iter2 = value_iterator_getter(o2);
 
   if (!iter1->valid() || !iter2->valid()) return false;
 
@@ -714,9 +764,9 @@ bool AssertSameStructureHelper(
       if (Py_EnterRecursiveCall(" in assert_same_structure")) {
         return false;
       }
-      bool no_internal_errors =
-          AssertSameStructureHelper(v1.get(), v2.get(), check_types, error_msg,
-                                    is_type_error, is_sequence_helper);
+      bool no_internal_errors = AssertSameStructureHelper(
+          v1.get(), v2.get(), check_types, error_msg, is_type_error,
+          is_sequence_helper, value_iterator_getter);
       Py_LeaveRecursiveCall();
       if (!no_internal_errors) return false;
       if (!error_msg->empty()) return true;
@@ -742,9 +792,13 @@ bool IsAttrs(PyObject* o) { return IsAttrsHelper(o) == 1; }
 bool IsTensor(PyObject* o) { return IsTensorHelper(o) == 1; }
 bool IsIndexedSlices(PyObject* o) { return IsIndexedSlicesHelper(o) == 1; }
 
-PyObject* Flatten(PyObject* nested) {
+PyObject* Flatten(PyObject* nested, bool expand_composites) {
   PyObject* list = PyList_New(0);
-  if (FlattenHelper(nested, list, IsSequenceHelper, GetValueIterator)) {
+  const std::function<int(PyObject*)>& is_sequence_helper =
+      expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
+  const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
+      expand_composites ? GetValueIteratorForComposite : GetValueIterator;
+  if (FlattenHelper(nested, list, is_sequence_helper, get_value_iterator)) {
     return list;
   } else {
     Py_DECREF(list);
@@ -752,6 +806,12 @@ PyObject* Flatten(PyObject* nested) {
   }
 }
 
+bool IsSequenceOrComposite(PyObject* o) {
+  return IsSequenceOrCompositeHelper(o) == 1;
+}
+
+bool IsCompositeTensor(PyObject* o) { return IsCompositeTensorHelper(o) == 1; }
+
 bool IsSequenceForData(PyObject* o) { return IsSequenceForDataHelper(o) == 1; }
 
 PyObject* FlattenForData(PyObject* nested) {
@@ -850,11 +910,16 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
   }
 }
 
-PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types,
+                              bool expand_composites) {
+  const std::function<int(PyObject*)>& is_sequence_helper =
+      expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
+  const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
+      expand_composites ? GetValueIteratorForComposite : GetValueIterator;
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceHelper);
+                            is_sequence_helper, get_value_iterator);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
@@ -878,7 +943,7 @@ PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2,
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceForDataHelper);
+                            IsSequenceForDataHelper, GetValueIterator);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index f37cd527d819fad36bcac7b914e416bf788c8cb3..4a5db93401c328c056d80f678dd47d66306d53b3 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -33,6 +33,30 @@ namespace swig {
 //   dict.
 bool IsSequence(PyObject* o);
 
+// Implements the same interface as nest.is_sequence_or_composite
+// Returns a true if its input is a collections.Sequence (except strings)
+// or a CompositeTensor.
+//
+// Args:
+//   seq: an input sequence.
+//
+// Returns:
+//   True if the sequence is a not a string and is a collections.Sequence or a
+//   dict or a CompositeTensor.
+bool IsSequenceOrComposite(PyObject* o);
+
+// Implements the same interface as nest.is_sequence_or_composite
+// Returns a true if its input is a collections.Sequence (except strings)
+// or a CompositeTensor.
+//
+// Args:
+//   seq: an input sequence.
+//
+// Returns:
+//   True if the sequence is a not a string and is a collections.Sequence or a
+//   dict or a CompositeTensor.
+bool IsCompositeTensor(PyObject* o);
+
 // Implements the same interface as tensorflow.util.nest._is_namedtuple
 // Returns Py_True iff `instance` should be considered a `namedtuple`.
 //
@@ -118,7 +142,8 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2);
 //
 // Returns:
 //  Py_None on success, nullptr on error.
-PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types,
+                              bool expand_composites);
 
 // Implements the same interface as tensorflow.util.nest.flatten
 //
@@ -139,6 +164,9 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 // Args:
 //   nest: an arbitrarily nested structure or a scalar object. Note, numpy
 //       arrays are considered scalars.
+//   expand_composites: If true, then composite tensors (such as
+//       `tf.SparseTensor` and `tf.RaggedTensor` are flattened into their
+//       component tensors.
 //
 // Returns:
 //   A Python list, the flattened version of the input.
@@ -146,7 +174,7 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 //
 // Raises:
 //   TypeError: The nest is or contains a dict with non-sortable keys.
-PyObject* Flatten(PyObject* nested);
+PyObject* Flatten(PyObject* nested, bool expand_composites = false);
 
 // The tensorflow.python.data package has its own nest utility that follows very
 // slightly different semantics for its functions than the tensorflow.python
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 4d34d61eee65ea48ad4fbb2894699695110fc76c..6e2a3d8ccfc48bd9234e0c42229fb37dd9fa1ce4 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -35,7 +35,7 @@ limitations under the License.
 %noexception tensorflow::swig::IsTensor;
 
 %feature("docstring") tensorflow::swig::IsSequence
-"""Returns a true if its input is a collections.Sequence (except strings).
+"""Returns true if its input is a collections.Sequence (except strings).
 
 Args:
   seq: an input sequence.
@@ -47,6 +47,31 @@ Returns:
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
+%feature("docstring") tensorflow::swig::IsSequenceOrComposite
+"""Returns true if its input is a sequence or a `CompositeTensor`.
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a not a string and is a collections.Sequence or a
+  dict or a CompositeTensor.
+"""
+%unignore tensorflow::swig::IsSequenceOrComposite;
+%noexception tensorflow::swig::IsSequenceOrComposite;
+
+%feature("docstring") tensorflow::swig::IsCompositeTensor
+"""Returns true if its input is a `CompositeTensor`.
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a CompositeTensor.
+"""
+%unignore tensorflow::swig::IsCompositeTensor;
+%noexception tensorflow::swig::IsCompositeTensor;
+
 %unignore tensorflow::swig::IsNamedtuple;
 %noexception tensorflow::swig::IsNamedtuple;
 
@@ -103,6 +128,8 @@ running.
 Args:
   nest: an arbitrarily nested structure or a scalar object. Note, numpy
       arrays are considered scalars.
+  expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+      and `tf.RaggedTensor` are expanded into their component tensors.
 
 Returns:
   A Python list, the flattened version of the input.
@@ -112,6 +139,7 @@ Raises:
 """
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
+%feature("kwargs") tensorflow::swig::Flatten;
 
 %feature("docstring") tensorflow::swig::IsSequenceForData
 """Returns a true if `seq` is a Sequence or dict (except strings/lists).
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 00c23b8d1788d56cee0e549ccd835fa174037760..980c92aaca1496fd23a5eb7a92289adf67e72014 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -1,123 +1,661 @@
-licenses(["restricted"])
+# GPU executor library for data-parallel kernel launches and cross-platform
+# HPC-library APIs.
+#
+# Throughout this file, all targets are built with the standard crosstool and
+# do not link against restricted binary blobs.
 
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "launch_dim",
+    hdrs = [
+        "gpu_launch_dim.h",
+        "launch_dim.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "device_description",
+    srcs = ["device_description.cc"],
+    hdrs = ["device_description.h"],
+    deps = [
+        ":launch_dim",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "event",
+    srcs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "dnn.h",
+        "event.cc",
+        "fft.h",
+        "kernel_cache_config.h",
+        "launch_dim.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "shared_memory_config.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "trace_listener.h",
+    ],
+    hdrs = [
+        "device_memory.h",
+        "event.h",
+        "kernel.h",
+        "kernel_spec.h",
+        "platform.h",
+        "stream.h",
+        "stream_executor_internal.h",
+    ],
+    deps = [
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "kernel",
+    srcs = [
+        "dnn.h",
+        "fft.h",
+        "kernel.cc",
+        "plugin.h",
+        "rng.h",
+        "stream.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "event.h",
+        "kernel.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "multi_platform_manager.h",
+        "platform.h",
+        "plugin_registry.h",
+        "shared_memory_config.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "timer.h",
+        "trace_listener.h",
+    ],
+    deps = [
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":kernel_cache_config",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "kernel_spec",
+    srcs = ["kernel_spec.cc"],
+    hdrs = ["kernel_spec.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "kernel_cache_config",
+    hdrs = ["kernel_cache_config.h"],
+)
+
+cc_library(
+    name = "module_spec",
+    hdrs = ["module_spec.h"],
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "shared_memory_config",
+    hdrs = ["shared_memory_config.h"],
+)
+
+cc_library(
+    name = "stream_header",
+    hdrs = [
+        "blas.h",
+        "device_memory.h",
+        "dnn.h",
+        "event.h",
+        "fft.h",
+        "gpu_launch_dim.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "launch_dim.h",
+        "stream.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+# It implements :stream_header
+cc_library(
+    name = "stream",
+    srcs = [
+        "stream.cc",
+    ],
+    hdrs = ["stream.h"],
+    deps = [
+        ":blas",
+        ":device_memory",
+        ":dnn",
+        ":event",
+        ":fft",
+        ":host_or_device_scalar",
+        ":kernel",
+        ":launch_dim",
+        ":platform",
+        ":rng",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        ":stream_executor_pimpl",
+        ":temporary_memory_manager",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "timer",
+    srcs = [
+        "device_description.h",
+        "kernel_cache_config.h",
+        "timer.cc",
+    ],
+    hdrs = [
+        "blas.h",
+        "kernel.h",
+        "stream.h",
+        "stream_executor.h",
+        "timer.h",
+    ],
+    deps = [
+        ":host_or_device_scalar",
+        ":platform",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        ":stream_executor_pimpl_header",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "platform",
+    srcs = ["platform.cc"],
+    hdrs = ["platform.h"],
+    deps = [
+        ":plugin",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "rng",
+    srcs = ["rng.cc"],
+    hdrs = ["rng.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "temporary_device_memory",
+    srcs = [
+        "event.h",
+        "temporary_device_memory.cc",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = ["temporary_device_memory.h"],
+    deps = [
+        ":device_memory",
+        ":stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "temporary_memory_manager",
+    srcs = ["temporary_memory_manager.cc"],
+    hdrs = ["temporary_memory_manager.h"],
+    deps = [
+        ":device_memory",
+        ":stream_executor_pimpl_header",
+        ":stream_header",
+        ":temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
 
-STREAM_EXECUTOR_HEADERS = glob([
-    "*.h",
-    "cuda/*.h",
-    "host/*.h",
-    "lib/*.h",
-    "lib/gtl/*.h",
-    "platform/**/*.h",
-])
+cc_library(
+    name = "fft",
+    hdrs = ["fft.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "blas",
+    srcs = ["blas.cc"],
+    hdrs = ["blas.h"],
+    deps = [
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "device_memory",
+    hdrs = ["device_memory.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "host_or_device_scalar",
+    hdrs = ["host_or_device_scalar.h"],
+    deps = [
+        ":device_memory",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "device_options",
+    hdrs = ["device_options.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "executor_cache",
+    srcs = [
+        "device_description.h",
+        "device_memory.h",
+        "device_options.h",
+        "event.h",
+        "executor_cache.cc",
+        "launch_dim.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = [
+        "blas.h",
+        "dnn.h",
+        "executor_cache.h",
+        "fft.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "platform.h",
+        "shared_memory_config.h",
+        "stream.h",
+        "stream_executor_internal.h",
+        "trace_listener.h",
+    ],
+    deps = [
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "multi_platform_manager",
+    srcs = ["multi_platform_manager.cc"],
+    hdrs = ["multi_platform_manager.h"],
+    deps = [
+        ":platform",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_library(
+    name = "plugin",
+    srcs = ["plugin.cc"],
+    hdrs = ["plugin.h"],
+)
+
+cc_library(
+    name = "plugin_registry",
+    srcs = ["plugin_registry.cc"],
+    hdrs = ["plugin_registry.h"],
+    deps = [
+        ":blas",
+        ":dnn",
+        ":fft",
+        ":multi_platform_manager",
+        ":platform",
+        ":plugin",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "scratch_allocator",
+    srcs = ["scratch_allocator.cc"],
+    hdrs = ["scratch_allocator.h"],
+    deps = [
+        ":device_memory",
+        ":stream_header",
+        ":temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
 
 tf_proto_library(
     name = "dnn_proto",
     srcs = ["dnn.proto"],
     cc_api_version = 2,
     default_header = True,
-    protodeps = tf_additional_all_protos(),
+    provide_cc_alias = True,
 )
 
 tf_proto_library(
     name = "logging_proto",
     srcs = ["logging.proto"],
     cc_api_version = 2,
-    default_header = True,
-    protodeps = tf_additional_all_protos(),
+    protodeps = [":dnn_proto"],
+    provide_cc_alias = True,
+    visibility = [":friends"],
 )
 
 cc_library(
-    name = "stream_executor_impl",
-    srcs = glob(
-        [
-            "*.cc",
-            "host/*.cc",
-            "cuda/cuda_platform_id.cc",
-            "lib/*.cc",
-            "platform/default/*.cc",
-        ],
-        exclude = [
-            "**/*_test.cc",
-        ],
-    ),
-    hdrs = STREAM_EXECUTOR_HEADERS,
-    linkopts = select({
-        "//tensorflow:freebsd": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": ["-ldl"],
-    }),
-    visibility = ["//visibility:public"],
+    name = "dnn",
+    srcs = ["dnn.cc"],
+    hdrs = ["dnn.h"],
     deps = [
-        ":dnn_proto_cc_impl",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":stream_executor_headers",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ptr_util",
-        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/synchronization",
-        "@local_config_cuda//cuda:cuda_headers",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
     ],
-    alwayslink = 1,
 )
 
 cc_library(
-    name = "stream_executor",
-    hdrs = STREAM_EXECUTOR_HEADERS,
+    name = "stream_executor_internal",
+    srcs = [
+        "dnn.h",
+        "stream_executor_internal.cc",
+    ],
+    hdrs = [
+        "shared_memory_config.h",
+        "stream_executor_internal.h",
+    ],
+    deps = [
+        ":device_description",
+        ":device_memory",
+        ":device_options",
+        ":dnn_proto_cc",
+        ":kernel",
+        ":kernel_cache_config",
+        ":kernel_spec",
+        ":launch_dim",
+        ":plugin_registry",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "stream_executor_pimpl_header",
+    hdrs = [
+        "device_description.h",
+        "dnn.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "shared_memory_config.h",
+        "stream_executor_pimpl.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         ":dnn_proto_cc",
+        ":platform",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ptr_util",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
-        "@local_config_cuda//cuda:cuda_headers",
-    ] + if_static([":stream_executor_impl"]),
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
 )
 
-cc_header_only_library(
-    name = "stream_executor_headers_lib",
-    visibility = ["//visibility:public"],
+# It implements :stream_executor_pimpl_header
+cc_library(
+    name = "stream_executor_pimpl",
+    srcs = ["stream_executor_pimpl.cc"],
+    hdrs = ["stream_executor_pimpl.h"],
     deps = [
-        ":stream_executor",
+        ":blas",
+        ":executor_cache",
+        ":fft",
+        ":kernel",
+        ":platform",
+        ":rng",
+        ":stream_executor_headers",
+        ":stream_header",
+        ":timer",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
     ],
 )
 
+# The stream_executor_headers target does not prescribe an implementation.
+#
+# TODO(b/25131218) this is OBSOLETE/DEPRECATED -- get rid of this target altogether
 cc_library(
-    name = "cuda_platform",
-    srcs = if_cuda_is_configured(
-        glob(
-            [
-                "cuda/*.cc",
-            ],
-            exclude = [
-                "cuda/*_test.cc",
-                "cuda/cuda_platform_id.cc",
-            ],
-        ),
-    ),
-    copts = select({
-        "//tensorflow:windows": ["/DNOGDI"],
-        "//conditions:default": [],
-    }),
-    linkopts = select({
-        "//tensorflow:freebsd": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": ["-ldl"],
-    }),
+    name = "stream_executor_headers",
+    hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_memory.h",
+        "device_options.h",
+        "dnn.h",
+        "event.h",
+        "executor_cache.h",
+        "fft.h",
+        "gpu_launch_dim.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "module_spec.h",
+        "multi_platform_manager.h",
+        "platform.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "shared_memory_config.h",
+        "stream.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "timer.h",
+        "trace_listener.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
-        ":stream_executor",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
         "//tensorflow/core:lib",
-        "//tensorflow/core/kernels:ops_util",
-        "@local_config_cuda//cuda:cuda_headers",
-    ] + if_cuda_is_configured([
-        "//tensorflow/core:cuda",
-        "@local_config_cuda//cuda:cuda_driver",
-        "@local_config_cuda//cuda:cudnn",
-    ]),
-    alwayslink = 1,
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "stream_executor",
+    hdrs = ["stream_executor.h"],
+    deps = [":stream_executor_headers"] + if_static([":stream_executor_impl"]),
+)
+
+cc_library(
+    name = "stream_executor_impl",
+    deps = [
+        ":device_description",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":dnn_proto_cc_impl",
+        ":event",
+        ":kernel",
+        ":launch_dim",
+        ":multi_platform_manager",
+        ":platform",
+        ":stream",
+        ":stream_executor_headers",
+        ":stream_executor_pimpl",
+        ":timer",
+    ],
+)
+
+tf_cc_test(
+    name = "stream_test",
+    size = "small",
+    srcs = ["stream_test.cc"],
+    deps = [
+        ":stream_executor",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/stream_executor/host:host_platform",
+    ],
+)
+
+alias(
+    name = "cuda_platform",
+    actual = "//tensorflow/stream_executor/cuda:all_runtime",
+)
+
+alias(
+    name = "rocm_platform",
+    actual = "//tensorflow/stream_executor/rocm:all_runtime",
 )
diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..717c13d113a05c5150768692ef6526cc2ce27817
--- /dev/null
+++ b/tensorflow/stream_executor/build_defs.bzl
@@ -0,0 +1,12 @@
+def stream_executor_friends():
+    return ["//tensorflow/..."]
+
+def tf_additional_cuda_platform_deps():
+  return []
+
+# Use dynamic loading, therefore should be empty.
+def tf_additional_cuda_driver_deps():
+  return []
+
+def tf_additional_cudnn_plugin_deps():
+  return []
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9dc3be45e7a8428769159d08718db85193a10519
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -0,0 +1,378 @@
+# Description:
+#   CUDA-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+    "tf_additional_cuda_driver_deps",
+    "tf_additional_cuda_platform_deps",
+    "tf_additional_cudnn_plugin_deps",
+)
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "cuda_platform_id",
+    srcs = ["cuda_platform_id.cc"],
+    hdrs = ["cuda_platform_id.h"],
+    deps = ["//tensorflow/stream_executor:platform"],
+)
+
+cc_library(
+    name = "cuda_platform",
+    srcs = if_cuda_is_configured(["cuda_platform.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_platform.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        "//tensorflow/stream_executor",  # buildcleaner: keep
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ] + tf_additional_cuda_platform_deps()),
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "cuda_diagnostics",
+    srcs = if_cuda_is_configured(["cuda_diagnostics.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_diagnostics.h"]),
+    deps = if_cuda_is_configured([
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cuda_driver",
+    srcs = if_cuda_is_configured(["cuda_driver.cc"]),
+    hdrs = if_cuda_is_configured([
+        "cuda_driver.h",
+        "cuda_driver_wrapper.h",
+    ]),
+    deps = if_cuda_is_configured([
+        ":cuda_diagnostics",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/gpu:gpu_driver_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + tf_additional_cuda_driver_deps()),
+)
+
+# The activation library is tightly coupled to the executor library.
+# TODO(leary) split up cuda_gpu_executor.cc so that this can stand alone.
+cc_library(
+    name = "cuda_activation_header",
+    hdrs = ["cuda_activation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "cuda_activation",
+    srcs = [],
+    hdrs = if_cuda_is_configured(["cuda_activation.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cuda_gpu_executor_header",
+    textual_hdrs = if_cuda_is_configured(["cuda_gpu_executor.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_kernel",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cublas_plugin",
+    srcs = if_cuda_is_configured(["cuda_blas.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_blas.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        ":cuda_helpers",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:host_or_device_scalar",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:cublas"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cufft_plugin",
+    srcs = if_cuda_is_configured(["cuda_fft.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_fft.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_activation_header",
+        ":cuda_gpu_executor_header",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_helpers",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:fft",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:cufft"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cudnn_plugin",
+    srcs = if_cuda_is_configured(["cuda_dnn.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_dnn.h"]),
+    copts = [
+        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
+        # setting of template depth 256
+        "-ftemplate-depth-512",
+    ],
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_diagnostics",
+        ":cuda_driver",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        ":cudnn_version",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:logger",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:logging_proto_cc",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + tf_additional_cudnn_plugin_deps() + if_static(["@local_config_cuda//cuda:cudnn"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "curand_plugin",
+    srcs = if_cuda_is_configured(["cuda_rng.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_rng.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_helpers",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_rng_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:curand"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cuda_kernel",
+    srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_kernel.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+# TODO(leary) we likely need to canonicalize/eliminate this.
+cc_library(
+    name = "cuda_helpers",
+    textual_hdrs = if_cuda_is_configured(["cuda_helpers.h"]),
+    deps = if_cuda_is_configured([
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+    ]),
+)
+
+cc_library(
+    name = "cuda_event",
+    srcs = if_cuda_is_configured(["cuda_event.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_event.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        ":cuda_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_event",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "cuda_stream",
+    srcs = [],
+    hdrs = if_cuda_is_configured(["cuda_stream.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cuda_timer",
+    srcs = [],
+    hdrs = if_cuda_is_configured(["cuda_timer.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        ":cuda_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_timer",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "cuda_gpu_executor",
+    srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_gpu_executor.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_diagnostics",
+        ":cuda_driver",
+        ":cuda_event",
+        ":cuda_kernel",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cudnn_version",
+    srcs = ["cudnn_version.cc"],
+    hdrs = ["cudnn_version.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "cudnn_version_test",
+    srcs = ["cudnn_version_test.cc"],
+    deps = [
+        ":cudnn_version",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "all_runtime",
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cublas_plugin",
+        ":cuda_driver",
+        ":cuda_platform",
+        ":cudnn_plugin",
+        ":cufft_plugin",
+        ":curand_plugin",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index ef9807820fda493a9ab926ae0509beaafeebdf2e..2b80ae094d17bc8ad957044545ff46daf4aeb103 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -17,13 +17,13 @@ limitations under the License.
 // It reaches into the CUDA implementation to activate an underlying CUDA
 // context.
 //
-// Having this file separate from cuda_gpu_executor.h means that dependent
+// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
 // code does not also have to depend on cuda.h.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
 
 namespace stream_executor {
 
@@ -31,29 +31,7 @@ class StreamExecutor;
 
 namespace cuda {
 
-class CUDAExecutor;
-class ScopedActivateContext;
-
-// Activates a CUDA context within an enclosing scope.
-class ScopedActivateExecutorContext {
- public:
-  // Form that takes a CUDA executor implementation.
-  explicit ScopedActivateExecutorContext(CUDAExecutor* cuda_exec);
-
-  // Form that takes a pImpl executor and extracts a CUDA implementation --
-  // fatal failure if it is not CUDA inside.
-  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
-
-  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
-
-  ~ScopedActivateExecutorContext();
-
- private:
-  // The cuda.h-using datatype that we wrap.
-  ScopedActivateContext* driver_scoped_activate_context_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
-};
+using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 957f6c98da564500f81d7185ce6a151003549ee5..5bbb98664e80287410264fc1a288d4e0fc5e480e 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -58,16 +58,12 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -75,7 +71,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
@@ -261,8 +257,8 @@ namespace wrap {
   struct WrapperShim__##__name {                                    \
     static const char *kName;                                       \
     template <typename... Args>                                     \
-    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
+    cublasStatus_t operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
       return ::__name(args...);                                     \
     }                                                               \
   } __name;                                                         \
@@ -294,8 +290,8 @@ namespace wrap {
       return f;                                                           \
     }                                                                     \
     template <typename... Args>                                           \
-    cublasStatus_t operator()(CUDAExecutor* parent, Args... args) {       \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
+    cublasStatus_t operator()(GpuExecutor* parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                          \
     }                                                                     \
   } __name;                                                               \
@@ -399,7 +395,7 @@ class ScopedCublasPointerMode {
   //
   // Parameters:
   //  handle: The cublas library handle to act upon in setting the pointer mode.
-  explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle)
+  explicit ScopedCublasPointerMode(GpuExecutor *parent, cublasHandle_t handle)
       : parent_(parent), handle_(handle), ok_(false) {}
 
   // Attempts the switch to the requested scoped pointer mode, new_mode.
@@ -437,7 +433,7 @@ class ScopedCublasPointerMode {
   }
 
  private:
-  CUDAExecutor *parent_;   // Executor establishing this pointer mode for.
+  GpuExecutor *parent_;   // Executor establishing this pointer mode for.
   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
   cublasPointerMode_t old_mode_;  // Prior cuBLAS pointer mode, to be restored.
   bool ok_;                       // Whether the change was successful.
@@ -460,7 +456,7 @@ class ScopedCublasMathMode {
   //
   // Parameters:
   //  handle: The cublas library handle to act upon in setting the math mode.
-  explicit ScopedCublasMathMode(CUDAExecutor *parent, cublasHandle_t handle)
+  explicit ScopedCublasMathMode(GpuExecutor *parent, cublasHandle_t handle)
       : parent_(parent), handle_(handle), ok_(false) {}
 
   // Attempts the switch to the requested scoped math mode, new_mode.
@@ -497,7 +493,7 @@ class ScopedCublasMathMode {
   }
 
  private:
-  CUDAExecutor *parent_;   // Executor establishing this math mode for.
+  GpuExecutor *parent_;   // Executor establishing this math mode for.
   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
   cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
   bool ok_;                // Whether the change was successful.
@@ -514,7 +510,7 @@ bool CUDABlas::Init() {
   return true;
 }
 
-CUDABlas::CUDABlas(cuda::CUDAExecutor *parent)
+CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
     : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
 
 CUDABlas::~CUDABlas() {
@@ -525,10 +521,10 @@ CUDABlas::~CUDABlas() {
 
 bool CUDABlas::SetStream(Stream *stream) {
   CHECK(stream != nullptr);
-  CHECK(AsCUDAStreamValue(stream) != nullptr);
+  CHECK(AsGpuStreamValue(stream) != nullptr);
   CHECK(blas_ != nullptr);
   cublasStatus_t ret =
-      wrap::cublasSetStream(parent_, blas_, AsCUDAStreamValue(stream));
+      wrap::cublasSetStream(parent_, blas_, AsGpuStreamValue(stream));
   if (ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
     return false;
@@ -706,7 +702,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(wrap::cublasSasum, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
@@ -714,7 +710,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(wrap::cublasDasum, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
@@ -722,7 +718,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
       wrap::cublasScasum, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
@@ -730,7 +726,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
       wrap::cublasDzasum, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
@@ -738,7 +734,7 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSaxpy, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
@@ -746,7 +742,7 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDaxpy, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
@@ -755,8 +751,8 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(wrap::cublasCaxpy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
@@ -765,8 +761,8 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(wrap::cublasZaxpy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -774,7 +770,7 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasScopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -782,7 +778,7 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDcopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -790,8 +786,8 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(wrap::cublasCcopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -799,8 +795,8 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(wrap::cublasZcopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
@@ -809,7 +805,7 @@ bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *result) {
   return DoBlasInternal(
       wrap::cublasSdot, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
@@ -818,7 +814,7 @@ bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *result) {
   return DoBlasInternal(
       wrap::cublasDdot, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
@@ -827,8 +823,8 @@ bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *result) {
   return DoBlasInternal(
       wrap::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
@@ -837,8 +833,8 @@ bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *result) {
   return DoBlasInternal(
       wrap::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
@@ -847,8 +843,8 @@ bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *result) {
   return DoBlasInternal(
       wrap::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
@@ -857,8 +853,8 @@ bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *result) {
   return DoBlasInternal(
       wrap::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -866,7 +862,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(wrap::cublasSnrm2, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -874,7 +870,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(wrap::cublasDnrm2, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -882,7 +878,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
       wrap::cublasScnrm2, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -890,7 +886,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
       wrap::cublasDznrm2, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -898,7 +894,7 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *y, int incy, float c, float s) {
   return DoBlasInternal(
       wrap::cublasSrot, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
+      GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -907,7 +903,7 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          double s) {
   return DoBlasInternal(
       wrap::cublasDrot, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
+      GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -916,8 +912,8 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          float c, float s) {
   return DoBlasInternal(wrap::cublasCsrot, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -926,17 +922,17 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          double c, double s) {
   return DoBlasInternal(wrap::cublasZdrot, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
                           DeviceMemory<float> *b, DeviceMemory<float> *c,
                           DeviceMemory<float> *s) {
   return DoBlasInternal(wrap::cublasSrotg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(a),
-                        CUDAMemoryMutable(b), CUDAMemoryMutable(c),
-                        CUDAMemoryMutable(s));
+                        false /* = pointer_mode_host */, GpuMemoryMutable(a),
+                        GpuMemoryMutable(b), GpuMemoryMutable(c),
+                        GpuMemoryMutable(s));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
@@ -944,8 +940,8 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
                           DeviceMemory<double> *s) {
   return DoBlasInternal(wrap::cublasDrotg, stream,
                         false /* = pointer_mode_host */,
-                        CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b),
-                        CUDAMemoryMutable(c), CUDAMemoryMutable(s));
+                        GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
+                        GpuMemoryMutable(c), GpuMemoryMutable(s));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
@@ -954,8 +950,8 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
                           DeviceMemory<std::complex<float>> *s) {
   return DoBlasInternal(
       wrap::cublasCrotg, stream, false /* = pointer_mode_host */,
-      CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
-      CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
+      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
+      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
@@ -964,8 +960,8 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
                           DeviceMemory<std::complex<double>> *s) {
   return DoBlasInternal(
       wrap::cublasZrotg, stream, false /* = pointer_mode_host */,
-      CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
-      CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
+      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
+      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
 }
 
 bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
@@ -974,8 +970,8 @@ bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &param) {
   return DoBlasInternal(wrap::cublasSrotm, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
-                        CUDAMemory(param));
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy,
+                        GpuMemory(param));
 }
 
 bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
@@ -984,8 +980,8 @@ bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &param) {
   return DoBlasInternal(wrap::cublasDrotm, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
-                        CUDAMemory(param));
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy,
+                        GpuMemory(param));
 }
 
 bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
@@ -993,9 +989,9 @@ bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
                            const DeviceMemory<float> &y1,
                            DeviceMemory<float> *param) {
   return DoBlasInternal(wrap::cublasSrotmg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
-                        CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
-                        CUDAMemory(y1), CUDAMemoryMutable(param));
+                        false /* = pointer_mode_host */, GpuMemoryMutable(d1),
+                        GpuMemoryMutable(d2), GpuMemoryMutable(x1),
+                        GpuMemory(y1), GpuMemoryMutable(param));
 }
 
 bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
@@ -1003,37 +999,37 @@ bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
                            const DeviceMemory<double> &y1,
                            DeviceMemory<double> *param) {
   return DoBlasInternal(wrap::cublasDrotmg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
-                        CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
-                        CUDAMemory(y1), CUDAMemoryMutable(param));
+                        false /* = pointer_mode_host */, GpuMemoryMutable(d1),
+                        GpuMemoryMutable(d2), GpuMemoryMutable(x1),
+                        GpuMemory(y1), GpuMemoryMutable(param));
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<float> *x, int incx) {
   return DoBlasInternal(wrap::cublasSscal, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemoryMutable(x), incx);
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<double> *x, int incx) {
   return DoBlasInternal(wrap::cublasDscal, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemoryMutable(x), incx);
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<std::complex<float>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<std::complex<double>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
@@ -1041,7 +1037,7 @@ bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasCscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
@@ -1049,7 +1045,7 @@ bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasZscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1057,7 +1053,7 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1065,7 +1061,7 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1073,8 +1069,8 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(wrap::cublasCswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1082,8 +1078,8 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(wrap::cublasZswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1091,7 +1087,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(wrap::cublasIsamax, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1099,7 +1095,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(wrap::cublasIdamax, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1107,7 +1103,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIcamax, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1115,7 +1111,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIzamax, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1123,7 +1119,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIsamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1131,7 +1127,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIdamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1139,7 +1135,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIcamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1147,7 +1143,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIzamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1157,8 +1153,8 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasSgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
-      CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, GpuMemory(a), lda,
+      GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1168,8 +1164,8 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasDgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
-      CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, GpuMemory(a), lda,
+      GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1181,9 +1177,9 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasCgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1195,9 +1191,9 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1206,8 +1202,8 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           float beta, DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasSgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1216,8 +1212,8 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           double beta, DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasDgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1228,9 +1224,9 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasCgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1241,9 +1237,9 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
@@ -1252,7 +1248,7 @@ bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
                          DeviceMemory<float> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
@@ -1261,7 +1257,7 @@ bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
                          DeviceMemory<double> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
@@ -1271,8 +1267,8 @@ bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCgerc, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
@@ -1282,8 +1278,8 @@ bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZgerc, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
@@ -1293,8 +1289,8 @@ bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCgeru, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
@@ -1304,8 +1300,8 @@ bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZgeru, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1316,9 +1312,9 @@ bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasChbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1329,9 +1325,9 @@ bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZhbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1342,9 +1338,9 @@ bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasChemv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1355,9 +1351,9 @@ bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZhemv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1366,8 +1362,8 @@ bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCher, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, &alpha, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1376,8 +1372,8 @@ bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZher, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, &alpha, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1387,9 +1383,9 @@ bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCher2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1399,9 +1395,9 @@ bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZher2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1412,9 +1408,9 @@ bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasChpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1425,9 +1421,9 @@ bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZhpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1436,8 +1432,8 @@ bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<float>> *ap) {
   return DoBlasInternal(
       wrap::cublasChpr, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1446,8 +1442,8 @@ bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<double>> *ap) {
   return DoBlasInternal(
       wrap::cublasZhpr, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1457,9 +1453,9 @@ bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *ap) {
   return DoBlasInternal(
       wrap::cublasChpr2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1469,9 +1465,9 @@ bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *ap) {
   return DoBlasInternal(
       wrap::cublasZhpr2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1480,8 +1476,8 @@ bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           float beta, DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasSsbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1490,8 +1486,8 @@ bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           double beta, DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasDsbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1500,8 +1496,8 @@ bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSspmv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1510,8 +1506,8 @@ bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDspmv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1519,8 +1515,8 @@ bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<float> *ap) {
   return DoBlasInternal(wrap::cublasSspr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1528,8 +1524,8 @@ bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<double> *ap) {
   return DoBlasInternal(wrap::cublasDspr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1538,8 +1534,8 @@ bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *ap) {
   return DoBlasInternal(wrap::cublasSspr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1548,8 +1544,8 @@ bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *ap) {
   return DoBlasInternal(wrap::cublasDspr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1558,8 +1554,8 @@ bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSsymv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1568,8 +1564,8 @@ bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDsymv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1577,8 +1573,8 @@ bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<float> *a, int lda) {
   return DoBlasInternal(wrap::cublasSsyr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1586,8 +1582,8 @@ bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<double> *a, int lda) {
   return DoBlasInternal(wrap::cublasDsyr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1596,8 +1592,8 @@ bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *a, int lda) {
   return DoBlasInternal(wrap::cublasSsyr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1606,8 +1602,8 @@ bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *a, int lda) {
   return DoBlasInternal(wrap::cublasDsyr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1617,8 +1613,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStbmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1628,8 +1624,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtbmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1640,8 +1636,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasCtbmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1652,8 +1648,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasZtbmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1663,8 +1659,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStbsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1674,8 +1670,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtbsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1686,8 +1682,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasCtbsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1698,8 +1694,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasZtbsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1709,7 +1705,7 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasStpmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1719,7 +1715,7 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDtpmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1729,8 +1725,8 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtpmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1740,8 +1736,8 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtpmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1751,7 +1747,7 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasStpsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1761,7 +1757,7 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDtpsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1771,8 +1767,8 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtpsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1782,8 +1778,8 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtpsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1793,8 +1789,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1804,8 +1800,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1815,8 +1811,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1826,8 +1822,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1837,8 +1833,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1848,8 +1844,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1859,8 +1855,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1870,8 +1866,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasGemm(
@@ -1925,9 +1921,9 @@ bool CUDABlas::DoBlasGemm(
   return DoBlasInternalImpl(
       wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
       true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
-      CUDABlasTranspose(transb), m, n, k, &alpha, CUDAMemory(a),
-      SE_CUDA_DATA_HALF, lda, CUDAMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
-      CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
+      CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
+      SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
+      GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
 
 #else
   LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
@@ -1972,7 +1968,7 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasSgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -1983,7 +1979,7 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasDgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -1996,9 +1992,9 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasCgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -2011,9 +2007,9 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasZgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasGemvWithProfiling(
@@ -2120,10 +2116,10 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return false;
     }
   }
@@ -2133,9 +2129,9 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
       DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2152,10 +2148,10 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return false;
     }
   }
@@ -2165,9 +2161,9 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
                            ldb, beta, c, ldc);
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2242,13 +2238,13 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
                  "output_profile_result was given, but we were unable to "
-                 "create a CUDATimer.";
+                 "create a GpuTimer.";
       return false;
     }
   }
@@ -2274,19 +2270,19 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
   bool result = DoBlasInternalFailureOK(
       wrap::cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      alpha.is_pointer() ? CUDAMemory(alpha.pointer()) : &alpha.value(),
-      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb,
-      beta.is_pointer() ? CUDAMemory(beta.pointer()) : &beta.value(),
-      CUDAMemoryMutable(c), CUDADataType<OutT>::type, ldc,
+      alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
+      GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
+      beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
+      GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
       CUDAComputationType(computation_type),
       static_cast<cublasGemmAlgo_t>(algorithm));
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
-                 "CUDATimer.";
+                 "GpuTimer.";
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2474,7 +2470,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
   }
 
-  typedef typename HalfAsFloat<typename CUDAComplexT<T>::type>::type CUDA_T;
+  typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;
 
   const size_t size = batch_count * sizeof(CUDA_T *);
 
@@ -2539,11 +2535,11 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     cudaDataType_t compute_type =
         (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
     const void **a_void_ptrs = reinterpret_cast<const void **>(
-        const_cast<const CUDA_T **>(CUDAMemory(a)));
+        const_cast<const CUDA_T **>(GpuMemory(a)));
     const void **b_void_ptrs = reinterpret_cast<const void **>(
-        const_cast<const CUDA_T **>(CUDAMemory(b)));
+        const_cast<const CUDA_T **>(GpuMemory(b)));
     void **c_void_ptrs =
-        reinterpret_cast<void **>(const_cast<CUDA_T **>(CUDAMemory(c)));
+        reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
     bool ok;
     ok = DoBlasInternalImpl(
         wrap::cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
@@ -2563,9 +2559,9 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     bool ok = DoBlasInternal(
         cublas_func, stream, true /* = pointer_mode_host */,
         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-        CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
-        const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-        const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+        GpuComplex(&alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
+        const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&beta),
+        const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
     if (ok) {
       return port::Status::OK();
     }
@@ -2697,8 +2693,8 @@ bool CUDABlas::DoBlasGemmStridedBatched(
           wrap::cublasGemmStridedBatchedEx, stream,
           true /* = pointer_mode_host */, true /* = err_on_failure */,
           use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
-          m, n, k, &alpha, CUDAMemory(a), CUDA_R_16F, lda, stride_a,
-          CUDAMemory(b), CUDA_R_16F, ldb, stride_b, &beta, CUDAMemoryMutable(c),
+          m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
+          GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
           CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
       if (ok) {
         return true;
@@ -2712,11 +2708,11 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
   for (int batch = 0; batch < batch_count; ++batch) {
     const auto *a_matrix =
-        reinterpret_cast<const __half *>(CUDAMemory(a) + batch * stride_a);
+        reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
     const auto *b_matrix =
-        reinterpret_cast<const __half *>(CUDAMemory(b) + batch * stride_b);
+        reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
     auto *c_matrix =
-        reinterpret_cast<__half *>(CUDAMemoryMutable(c) + batch * stride_c);
+        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
     bool ok = DoBlasInternalImpl(
         wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
         true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
@@ -2740,8 +2736,8 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, stride_a, CUDAMemory(b), ldb, stride_b, &beta,
-      CUDAMemoryMutable(c), ldc, stride_c, batch_count);
+      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
+      GpuMemoryMutable(c), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2753,8 +2749,8 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, stride_a, CUDAMemory(b), ldb, stride_b, &beta,
-      CUDAMemoryMutable(c), ldc, stride_c, batch_count);
+      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
+      GpuMemoryMutable(c), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2767,9 +2763,9 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, stride_a,
-      CUDAComplex(CUDAMemory(b)), ldb, stride_b, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc, stride_c, batch_count);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
+      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2782,9 +2778,9 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, stride_a,
-      CUDAComplex(CUDAMemory(b)), ldb, stride_b, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc, stride_c, batch_count);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
+      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
@@ -2796,9 +2792,9 @@ bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasChemm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
@@ -2810,9 +2806,9 @@ bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasZhemm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
@@ -2824,8 +2820,8 @@ bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCherk, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
@@ -2837,8 +2833,8 @@ bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZherk, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
@@ -2851,9 +2847,9 @@ bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCher2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, &beta,
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, &beta,
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
@@ -2866,9 +2862,9 @@ bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZher2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, &beta,
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, &beta,
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2878,8 +2874,8 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<float> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasSsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
-      lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, GpuMemory(a),
+      lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2889,8 +2885,8 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<double> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasDsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
-      lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, GpuMemory(a),
+      lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2902,9 +2898,9 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasCsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2916,9 +2912,9 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasZsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2928,7 +2924,7 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasSsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2938,7 +2934,7 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2950,8 +2946,8 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasCsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2963,8 +2959,8 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasZsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2975,7 +2971,7 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasSsyr2k, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2986,7 +2982,7 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDsyr2k, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2999,9 +2995,9 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCsyr2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -3014,9 +3010,9 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZsyr2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3027,8 +3023,8 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasStrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
-      CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
+      CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a), lda,
+      GpuMemoryMutable(b), ldb, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3039,8 +3035,8 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasDtrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
-      CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
+      CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a), lda,
+      GpuMemoryMutable(b), ldb, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3052,9 +3048,9 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasCtrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
-      CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb,
+      GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3066,9 +3062,9 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasZtrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
-      CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb,
+      GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3079,8 +3075,8 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(wrap::cublasStrsm, stream,
                         true /* = pointer_mode_host */, CUDABlasSide(side),
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-                        CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
-                        lda, CUDAMemoryMutable(b), ldb);
+                        CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a),
+                        lda, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3091,8 +3087,8 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(wrap::cublasDtrsm, stream,
                         true /* = pointer_mode_host */, CUDABlasSide(side),
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-                        CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
-                        lda, CUDAMemoryMutable(b), ldb);
+                        CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a),
+                        lda, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3104,8 +3100,8 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasCtrsm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3117,19 +3113,19 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasZtrsm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cublas() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
-          cuda::kCudaPlatformId, cuda::kCuBlasPlugin, "cuBLAS",
+          cuda::kCudaPlatformId, gpu::kCuBlasPlugin, "cuBLAS",
           [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            gpu::GpuExecutor *cuda_executor =
+                dynamic_cast<gpu::GpuExecutor *>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR)
                   << "Attempting to initialize an instance of the cuBLAS "
@@ -3137,7 +3133,7 @@ void initialize_cublas() {
               return nullptr;
             }
 
-            cuda::CUDABlas *blas = new cuda::CUDABlas(cuda_executor);
+            gpu::CUDABlas *blas = new gpu::CUDABlas(cuda_executor);
             if (!blas->Init()) {
               // Note: Init() will log a more specific error.
               delete blas;
@@ -3152,7 +3148,7 @@ void initialize_cublas() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kBlas, cuda::kCuBlasPlugin);
+      cuda::kCudaPlatformId, PluginKind::kBlas, gpu::kCuBlasPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 0fb05089d7530aa298a332e4e6c714eddd7799e9..63d03056d911fe807617f0987e751825248ae607 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -33,26 +33,26 @@ namespace stream_executor {
 
 class Stream;
 
-namespace cuda {
+namespace gpu {
 
 // Opaque and unique identifier for the cuBLAS plugin.
 extern const PluginId kCuBlasPlugin;
 
-class CUDAExecutor;
+class GpuExecutor;
 
 // BLAS plugin for CUDA platform via cuBLAS library.
 //
 // This satisfies the platform-agnostic BlasSupport interface.
 //
 // Note that the cuBLAS handle that this encapsulates is implicitly tied to the
-// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// context (and, as a result, the device) that the parent GpuExecutor is tied
 // to. This simply happens as an artifact of creating the cuBLAS handle when a
 // CUDA context is active.
 //
 // Thread-safe post-initialization.
 class CUDABlas : public blas::BlasSupport {
  public:
-  explicit CUDABlas(CUDAExecutor *parent);
+  explicit CUDABlas(GpuExecutor *parent);
 
   // Allocates a cuBLAS handle.
   bool Init();
@@ -145,9 +145,9 @@ class CUDABlas : public blas::BlasSupport {
   // mutex that guards the cuBLAS handle for this device.
   mutex mu_;
 
-  // CUDAExecutor which instantiated this CUDABlas.
+  // GpuExecutor which instantiated this CUDABlas.
   // Immutable post-initialization.
-  CUDAExecutor *parent_;
+  GpuExecutor *parent_;
 
   // cuBLAS library handle on the device.
   cublasHandle_t blas_ GUARDED_BY(mu_);
@@ -155,7 +155,7 @@ class CUDABlas : public blas::BlasSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 6af71b6c9d194182e79decd3f1beeb96d8141974..e58ebee80da613a63e00d7627abf4e8f8c99bc5b 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -52,13 +52,6 @@ limitations under the License.
 namespace stream_executor {
 namespace cuda {
 
-#ifdef __APPLE__
-static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
-#elif !defined(PLATFORM_WINDOWS)
-static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
-#endif
-
-
 string DriverVersionToString(DriverVersion version) {
   return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
 }
@@ -112,6 +105,18 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
   return result;
 }
 
+}  // namespace cuda
+}  // namespace stream_executor
+
+namespace stream_executor {
+namespace gpu {
+
+#ifdef __APPLE__
+static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
+#elif !defined(PLATFORM_WINDOWS)
+static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
+#endif
+
 // -- class Diagnostician
 
 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@@ -190,11 +195,11 @@ void Diagnostician::LogDiagnosticInformation() {
   }
   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
   LOG(INFO) << "libcuda reported version is: "
-            << DriverVersionStatusToString(dso_version);
+            << cuda::DriverVersionStatusToString(dso_version);
 
   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
   LOG(INFO) << "kernel reported version is: "
-	  << DriverVersionStatusToString(kernel_version);
+            << cuda::DriverVersionStatusToString(kernel_version);
 #endif
 
   // OS X kernel driver does not report version accurately
@@ -232,7 +237,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
     }
     const size_t length = suffix_pos - start;
     const string version = path.substr(start, length);
-    result = StringToDriverVersion(version);
+    result = cuda::StringToDriverVersion(version);
   }
 #else
 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
@@ -260,7 +265,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       // TODO(b/22689637): Eliminate the explicit namespace if possible.
       auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
       auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
-      *result = StringToDriverVersion(stripped_dso_version);
+      *result = cuda::StringToDriverVersion(stripped_dso_version);
       return 1;
     }
     return 0;
@@ -292,7 +297,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
   // TODO(b/22689637): Eliminate the explicit namespace if possible.
   auto stripped_kernel_version =
       port::StripSuffixString(kernel_version, ".ld64");
-  return StringToDriverVersion(stripped_kernel_version);
+  return cuda::StringToDriverVersion(stripped_kernel_version);
 }
 
 void Diagnostician::WarnOnDsoKernelMismatch(
@@ -301,12 +306,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
   if (kernel_version.ok() && dso_version.ok() &&
       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
     LOG(INFO) << "kernel version seems to match DSO: "
-              << DriverVersionToString(kernel_version.ValueOrDie());
+              << cuda::DriverVersionToString(kernel_version.ValueOrDie());
   } else {
     LOG(ERROR) << "kernel version "
-               << DriverVersionStatusToString(kernel_version)
+               << cuda::DriverVersionStatusToString(kernel_version)
                << " does not match DSO version "
-               << DriverVersionStatusToString(dso_version)
+               << cuda::DriverVersionStatusToString(dso_version)
                << " -- cannot find working devices in this configuration";
   }
 }
@@ -336,9 +341,9 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
     // see
     // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
     if (version == NULL) {
-      return StringToDriverVersion("");
+      return cuda::StringToDriverVersion("");
     }
-    return StringToDriverVersion(version);
+    return cuda::StringToDriverVersion(version);
   }
   CFRelease(kext_infos);
   auto status = port::Status(
@@ -387,6 +392,5 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 #endif
 }
 
-
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index f2db2eb20a18c671e055b910809dfde940a5e3f8..0837e136fd428570cb0d4ebddc85bedf66375f1a 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -16,17 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
-#include <tuple>
-
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
 
 namespace stream_executor {
 namespace cuda {
 
 // e.g. DriverVersion{346, 3, 4}
-using DriverVersion = std::tuple<int, int, int>;
+using DriverVersion = gpu::DriverVersion;
 
 // Converts a parsed driver version to string form.
 string DriverVersionToString(DriverVersion version);
@@ -35,61 +31,9 @@ string DriverVersionToString(DriverVersion version);
 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
 
 // Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
-port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
-
-class Diagnostician {
- public:
-  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
-  // not initializing).
-  //
-  // Note: if we're running on a machine that has no GPUs, we don't want to
-  // produce very much log spew beyond saying, "looks like there's no CUDA
-  // kernel
-  // module running".
-  //
-  // Note: we use non-Google-File:: API here because we may be called before
-  // InitGoogle has completed.
-  static void LogDiagnosticInformation();
-
-  // Given the driver version file contents, finds the kernel module version and
-  // returns it as a string.
-  //
-  // This is solely used for more informative log messages when the user is
-  // running on a machine that happens to have a libcuda/kernel driver mismatch.
-  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
-      const string &driver_version_file_contents);
-
-  // Extracts the kernel driver version from the current host.
-  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
-
-  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
-  // driver-interfacing DSO version number. Returns it as a string.
-  static port::StatusOr<DriverVersion> FindDsoVersion();
-
-  // Logs information about the kernel driver version and userspace driver
-  // library version.
-  static void LogDriverVersionInformation();
-
- private:
-
-  // Given the DSO version number and the driver version file contents, extracts
-  // the driver version and compares, warning the user in the case of
-  // incompatibility.
-  //
-  // This is solely used for more informative log messages when the user is
-  // running on a machine that happens to have a libcuda/kernel driver mismatch.
-  static void WarnOnDsoKernelMismatch(
-      port::StatusOr<DriverVersion> dso_version,
-      port::StatusOr<DriverVersion> kernel_version);
-
-  // Logs information about the dev nodes present on this machine: their
-  // existence, permissions, accessibility from this uid/gid.
-  static void LogDevNodeDiagnosticInformation();
-
-  static string GetDevNodePath(int dev_node_ordinal);
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
-};
+using Diagnostician = gpu::Diagnostician;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 249bad0c109a9191fa0f653637e255bac89fc970..06739e8655087fff229d6be0a599ec368c27adc0 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -38,6 +39,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/logging.pb.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
@@ -48,8 +51,14 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 // clang-format on
 
+#pragma clang diagnostic push
+
+// Make sure that Eigen::half forward declaration in dnn.h matches the
+// declaration in Eigen.
+#pragma clang diagnostic warning "-Wmismatched-tags"
+
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
 
@@ -73,17 +82,6 @@ static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher");
     }                                                                    \
   } while (false)
 
-// Returns whether status is 'ok', and potentially logs the error.
-bool IsStatusOk(const port::Status& status, bool report_error) {
-  if (status.ok()) {
-    return true;
-  }
-  if (report_error) {
-    LOG(ERROR) << status.error_message();
-  }
-  return false;
-}
-
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>
@@ -139,7 +137,7 @@ class CudnnHandle {
  public:
   // Takes ownership of the executor context and the lock to access cuDNN
   // using handle.
-  CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
+  CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
               cudnnHandle_t handle)
       : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
 
@@ -148,13 +146,163 @@ class CudnnHandle {
   cudnnHandle_t handle() const { return handle_; }
 
  private:
-  cuda::ScopedActivateExecutorContext context_;
+  gpu::ScopedActivateExecutorContext context_;
   mutex_lock lock_;
   cudnnHandle_t handle_;  // Not owned.
 };
 
 }  // namespace
 
+#ifdef PLATFORM_GOOGLE
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)   \
+  struct WrapperShim__##__name {             \
+    template <typename... Args>              \
+    cudnnStatus_t operator()(Args... args) { \
+      return ::__name(args...);              \
+    }                                        \
+  } __name;
+
+#else
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                                \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCudnnDsoHandle();            \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cudnn DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cudnnStatus_t operator()(Args... args) {                              \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+#endif
+
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7000_UNDER(__macro)               \
+  __macro(cudnnActivationForward)                             \
+  __macro(cudnnAddTensor)                                     \
+  __macro(cudnnBatchNormalizationBackward)                    \
+  __macro(cudnnBatchNormalizationForwardInference)            \
+  __macro(cudnnBatchNormalizationForwardTraining)             \
+  __macro(cudnnConvolutionBackwardBias)                       \
+  __macro(cudnnConvolutionBackwardData)                       \
+  __macro(cudnnConvolutionBackwardFilter)                     \
+  __macro(cudnnConvolutionBiasActivationForward)              \
+  __macro(cudnnConvolutionForward)                            \
+  __macro(cudnnCreate)                                        \
+  __macro(cudnnCreateActivationDescriptor)                    \
+  __macro(cudnnCreateConvolutionDescriptor)                   \
+  __macro(cudnnCreateDropoutDescriptor)                       \
+  __macro(cudnnCreateFilterDescriptor)                        \
+  __macro(cudnnCreateLRNDescriptor)                           \
+  __macro(cudnnCreatePersistentRNNPlan)                       \
+  __macro(cudnnCreatePoolingDescriptor)                       \
+  __macro(cudnnCreateRNNDescriptor)                           \
+  __macro(cudnnCreateTensorDescriptor)                        \
+  __macro(cudnnDestroy)                                       \
+  __macro(cudnnDestroyActivationDescriptor)                   \
+  __macro(cudnnDestroyConvolutionDescriptor)                  \
+  __macro(cudnnDestroyDropoutDescriptor)                      \
+  __macro(cudnnDestroyFilterDescriptor)                       \
+  __macro(cudnnDestroyLRNDescriptor)                          \
+  __macro(cudnnDestroyPersistentRNNPlan)                      \
+  __macro(cudnnDestroyPoolingDescriptor)                      \
+  __macro(cudnnDestroyRNNDescriptor)                          \
+  __macro(cudnnDestroyTensorDescriptor)                       \
+  __macro(cudnnDropoutGetStatesSize)                          \
+  __macro(cudnnGetActivationDescriptor)                       \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)       \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
+  __macro(cudnnGetConvolutionForwardAlgorithm)                \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize)            \
+  __macro(cudnnGetConvolutionNdDescriptor)                    \
+  __macro(cudnnGetConvolutionNdForwardOutputDim)              \
+  __macro(cudnnGetFilterNdDescriptor)                         \
+  __macro(cudnnGetProperty)                                   \
+  __macro(cudnnGetRNNLinLayerBiasParams)                      \
+  __macro(cudnnGetRNNLinLayerMatrixParams)                    \
+  __macro(cudnnGetRNNParamsSize)                              \
+  __macro(cudnnGetRNNTrainingReserveSize)                     \
+  __macro(cudnnGetRNNWorkspaceSize)                           \
+  __macro(cudnnLRNCrossChannelBackward)                       \
+  __macro(cudnnLRNCrossChannelForward)                        \
+  __macro(cudnnPoolingBackward)                               \
+  __macro(cudnnPoolingForward)                                \
+  __macro(cudnnRNNBackwardData)                               \
+  __macro(cudnnRNNBackwardWeights)                            \
+  __macro(cudnnRNNForwardInference)                           \
+  __macro(cudnnRNNForwardTraining)                            \
+  __macro(cudnnSetActivationDescriptor)                       \
+  __macro(cudnnSetConvolutionNdDescriptor)                    \
+  __macro(cudnnSetDropoutDescriptor)                          \
+  __macro(cudnnSetFilterNdDescriptor)                         \
+  __macro(cudnnSetLRNDescriptor)                              \
+  __macro(cudnnSetPersistentRNNPlan)                          \
+  __macro(cudnnSetPoolingNdDescriptor)                        \
+  __macro(cudnnSetRNNDescriptor)                              \
+  __macro(cudnnSetRNNDescriptor_v6)                           \
+  __macro(cudnnSetStream)                                     \
+  __macro(cudnnSetTensor4dDescriptor)                         \
+  __macro(cudnnSetTensorNdDescriptor)                         \
+  __macro(cudnnTransformTensor)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7000_UNDER(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7000_UNDER
+
+#if CUDNN_VERSION >= 7000
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7000(__macro)                    \
+  __macro(cudnnSetRNNMatrixMathType)                         \
+  __macro(cudnnSetConvolutionMathType)                       \
+  __macro(cudnnSetConvolutionGroupCount)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7000(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7000
+#endif
+
+#if CUDNN_VERSION >= 7201
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7210(__macro)                     \
+  __macro(cudnnCreateRNNDataDescriptor)                       \
+  __macro(cudnnDestroyRNNDataDescriptor)                      \
+  __macro(cudnnRNNBackwardDataEx)                             \
+  __macro(cudnnRNNBackwardWeightsEx)                          \
+  __macro(cudnnRNNForwardInferenceEx)                         \
+  __macro(cudnnRNNForwardTrainingEx)                          \
+  __macro(cudnnSetRNNDataDescriptor)                          \
+  __macro(cudnnSetRNNPaddingMode)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7210(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7210
+#endif
+
 // Wraps a cuDNN handle and provides access to it through CudnnHandle
 // instances, which also locks a mutex, acquires the CUDA context, and sets
 // the stream that cuDNN should use to enqueue any work.
@@ -186,10 +334,10 @@ class CudnnAccess {
   // The legacy default stream synchronizes with all other streams and it is
   // therefore a bad idea (performance wise) to call any cuDNN APIs that
   // enqueue work in the stream.
-  CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
+  CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
     mutex_lock lock(mutex_);
-    cuda::ScopedActivateExecutorContext context(executor);
-    CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
+    gpu::ScopedActivateExecutorContext context(executor);
+    CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
     auto status = cudnnSetStream(handle_, cu_stream);
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
     return CudnnHandle(std::move(context), std::move(lock), handle_);
@@ -300,7 +448,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
 
 }  // namespace
 
-CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
+CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {}
 
 port::Status CudnnSupport::Init() {
   ScopedActivateExecutorContext context(parent_);
@@ -333,14 +481,14 @@ port::Status CudnnSupport::Init() {
   CHECK_EQ(cudnn_handle, nullptr);
   LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
   if (status == CUDNN_STATUS_NOT_INITIALIZED) {
-    auto result = cuda::Diagnostician::FindKernelDriverVersion();
+    auto result = gpu::Diagnostician::FindKernelDriverVersion();
     if (!result.ok()) {
       LOG(ERROR) << "Error retrieving driver version: "
-                 << DriverVersionStatusToString(result);
+                 << cuda::DriverVersionStatusToString(result);
     } else {
       const auto& version = result.ValueOrDie();
       LOG(ERROR) << "Possibly insufficient driver version: "
-                 << DriverVersionToString(version);
+                 << cuda::DriverVersionToString(version);
     }
   }
 
@@ -635,6 +783,19 @@ bool BatchnormSpatialPersistentEnabled() {
   return is_enabled;
 }
 
+// A helper function to decide whether to enable deterministic functionality.
+// TODO(pr/24355): Support all cuDNN functionality (currently only convolution).
+bool RequireDeterminism() {
+  static bool is_enabled = [] {
+    bool is_enabled = false;
+    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
+                                               /*default_val=*/false,
+                                               &is_enabled));
+    return is_enabled;
+  }();
+  return is_enabled;
+}
+
 // Turns a ConvolutionDescriptor structure into a cudnn convolution handle
 // within a scope.
 class CudnnConvolutionDescriptor {
@@ -836,9 +997,11 @@ cudnnDataType_t ToCudnnDataType(
     dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
   switch (data_type) {
     case dnn::DataType::kFloat:
+      return CUDNN_DATA_FLOAT;
     case dnn::DataType::kDouble:
+      return CUDNN_DATA_DOUBLE;
     case dnn::DataType::kHalf:
-      return static_cast<cudnnDataType_t>(data_type);
+      return CUDNN_DATA_HALF;
     case dnn::DataType::kInt8:
       return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
                                                             : CUDNN_DATA_INT8;
@@ -849,6 +1012,15 @@ cudnnDataType_t ToCudnnDataType(
   }
 }
 
+cudnnDataType_t ToCudnnDataType(dnn::DataType data_type,
+                                dnn::FilterLayout filter_layout) {
+  if (data_type == dnn::DataType::kInt8 &&
+      filter_layout == dnn::FilterLayout::kOutputInputYX4) {
+    return CUDNN_DATA_INT8x4;
+  }
+  return ToCudnnDataType(data_type);
+}
+
 template <typename T>
 cudnnDataType_t GetCudnnDataType(
     dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
@@ -979,7 +1151,7 @@ class CudnnRnnParamsDescriptor {
 }  // namespace
 
 class CudnnRnnDescriptor : public dnn::RnnDescriptor {
-  CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
+  CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc,
                      PersistentRnnPlan rnn_plan, int num_layers,
                      int hidden_size, int input_size, int batch_size,
                      cudnnRNNInputMode_t input_mode,
@@ -1019,7 +1191,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
         CudnnDropoutDescriptor dropout_desc,
         CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
 
-    cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
+    gpu::RnnDescriptor rnn_desc = CreateRnnDescriptor();
     cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
 
     // TODO: allow the user to choose an algorithm.
@@ -1110,7 +1282,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
   }
 
  private:
-  cuda::RnnDescriptor rnn_desc_;
+  gpu::RnnDescriptor rnn_desc_;
   PersistentRnnPlan rnn_plan_;
   int num_layers_;
   int hidden_size_;
@@ -1188,14 +1360,21 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
     for (int region = 0; region < region_count_per_layer; region++) {
       for (int type = 0; type < 2; type++) {
         void* offset = nullptr;
-        RETURN_IF_CUDNN_ERROR((type == 0 ? cudnnGetRNNLinLayerMatrixParams
-                                         : cudnnGetRNNLinLayerBiasParams)(
-            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
-            /*layer=*/layer, /*xDesc=*/input_desc.get(),
-            /*wDesc=*/filter_desc.get(),
-            /*w=*/nullptr, /*linLayerID=*/region,
-            /*linLayerMatDesc=*/region_desc_handle.get(),
-            /*linLayerMat or linLayerBias=*/&offset));
+        RETURN_IF_CUDNN_ERROR(
+            type == 0 ? cudnnGetRNNLinLayerMatrixParams(
+                            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+                            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+                            /*wDesc=*/filter_desc.get(),
+                            /*w=*/nullptr, /*linLayerID=*/region,
+                            /*linLayerMatDesc=*/region_desc_handle.get(),
+                            /*linLayerMat or linLayerBias=*/&offset)
+                      : cudnnGetRNNLinLayerBiasParams(
+                            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+                            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+                            /*wDesc=*/filter_desc.get(),
+                            /*w=*/nullptr, /*linLayerID=*/region,
+                            /*linLayerMatDesc=*/region_desc_handle.get(),
+                            /*linLayerMat or linLayerBias=*/&offset));
         int dims[] = {1, 1, 1};
         cudnnDataType_t data_type;
         cudnnTensorFormat_t tensor_format;
@@ -1222,15 +1401,14 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
 
 class CudnnRnnSequenceTensorDescriptor
     : public dnn::RnnSequenceTensorDescriptor {
-  CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int max_seq_length,
+  CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length,
                                    int batch_size, int data_size,
                                    cudnnDataType_t data_type,
 #if CUDNN_VERSION >= 7201
                                    RNNDataDescriptor data_handle,
 #endif
                                    TensorDescriptor handle)
-      : parent_(parent),
-        max_seq_length_(max_seq_length),
+      : max_seq_length_(max_seq_length),
         batch_size_(batch_size),
         data_size_(data_size),
         data_type_(data_type),
@@ -1246,7 +1424,7 @@ class CudnnRnnSequenceTensorDescriptor
       default;
 
   static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
-      CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
+      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
       cudnnDataType_t data_type) {
     CHECK_GT(max_seq_length, 0);
     int dims[] = {batch_size, data_size, 1};
@@ -1265,7 +1443,7 @@ class CudnnRnnSequenceTensorDescriptor
   }
 
   static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
-      CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
+      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
       const absl::Span<const int>& seq_lengths, cudnnDataType_t data_type) {
 #if CUDNN_VERSION >= 7201
     CHECK_GT(max_seq_length, 0);
@@ -1317,7 +1495,6 @@ class CudnnRnnSequenceTensorDescriptor
   }
 
  private:
-  CUDAExecutor* parent_;
   int max_seq_length_;
   int batch_size_;
   int data_size_;
@@ -1332,11 +1509,10 @@ class CudnnRnnSequenceTensorDescriptor
 
 class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
  public:
-  CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
+  CudnnRnnStateTensorDescriptor(GpuExecutor* parent, int num_layers,
                                 int batch_size, int data_size,
                                 cudnnDataType_t data_type)
-      : parent_(parent),
-        handle_(CreateTensorDescriptor()),
+      : handle_(CreateTensorDescriptor()),
         num_layers_(num_layers),
         batch_size_(batch_size),
         data_size_(data_size),
@@ -1356,7 +1532,6 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
   int data_size() const { return data_size_; }
 
  private:
-  CUDAExecutor* parent_;
   TensorDescriptor handle_;
   int num_layers_;
   int batch_size_;
@@ -1520,14 +1695,14 @@ port::Status CudnnSupport::DoRnnForwardImpl(
     }
   }
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));
+    timer.reset(new GpuTimer(parent_));
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -1545,7 +1720,8 @@ port::Status CudnnSupport::DoRnnForwardImpl(
           /*y=*/output_data->opaque(),
           /*hyDesc=*/output_h_desc.handle(), /*hy=*/output_h_data->opaque(),
           /*cyDesc=*/output_c_desc.handle(), /*cy=*/output_c_data->opaque(),
-          NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+          nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr,
           /*workspace=*/workspace.opaque(),
           /*workSpaceSizeInBytes=*/workspace.size()));
 #else
@@ -1581,7 +1757,8 @@ port::Status CudnnSupport::DoRnnForwardImpl(
           /*y=*/output_data->opaque(),
           /*hyDesc=*/output_h_desc.handle(), /*hy=*/output_h_data->opaque(),
           /*cyDesc=*/output_c_desc.handle(), /*cy=*/output_c_data->opaque(),
-          NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+          nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr,
           /*workspace=*/workspace.opaque(),
           /*workSpaceSizeInBytes=*/workspace.size(),
           /*reserveSpace=*/reserve_space.opaque(),
@@ -1610,7 +1787,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
   }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -1661,14 +1838,14 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
                       CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
                                          workspace_allocator));
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));
+    timer.reset(new GpuTimer(parent_));
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -1679,7 +1856,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
         /*yDesc=*/output_desc.data_handle(), /*y=*/output_data.opaque(),
         /*dyDesc=*/output_desc.data_handle(),
-        /*dy=*/output_backprop_data.opaque(), NULL, NULL,
+        /*dy=*/output_backprop_data.opaque(), nullptr, nullptr,
         /*dhyDesc=*/output_h_desc.handle(),
         /*dhy=*/output_h_backprop_data.opaque(),
         /*dcyDesc=*/output_c_desc.handle(),
@@ -1692,7 +1869,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
         /*dhxDesc=*/input_h_desc.handle(),
         /*dhx=*/input_h_backprop_data->opaque(),
         /*dcxDesc=*/input_c_desc.handle(),
-        /*dcx=*/input_c_backprop_data->opaque(), NULL, NULL,
+        /*dcx=*/input_c_backprop_data->opaque(), nullptr, nullptr,
         /*workspace=*/workspace.opaque(),
         /*workSpaceSizeInBytes=*/workspace.size(),
         /*reserveSpace=*/reserve_space_data->opaque(),
@@ -1767,7 +1944,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
   }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -2542,6 +2719,23 @@ cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
   }
 }
 
+dnn::DataType GetConvAccumulatorType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+    case dnn::DataType::kDouble:
+      return data_type;
+    case dnn::DataType::kHalf:
+      return CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+                 ? dnn::DataType::kFloat
+                 : dnn::DataType::kHalf;
+    case dnn::DataType::kInt8:
+    case dnn::DataType::kInt32:
+      return dnn::DataType::kInt32;
+    default:
+      LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
+  }
+}
+
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
@@ -2569,23 +2763,137 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 }
 #endif
 
+dnn::ConvolutionProto GenerateConvProto(
+    dnn::ConvolutionKind kind, dnn::DataType element_type,
+    const dnn::BatchDescriptor& input_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const dnn::BatchDescriptor& output_descriptor, dnn::AlgorithmDesc algorithm,
+    const dnn::ConvolutionDescriptor& convolution_descriptor, double conv_scale,
+    double side_value_scale, dnn::DataType acc_type,
+    dnn::ActivationMode activation) {
+  dnn::ConvolutionProto conv_config;
+  conv_config.set_kind(kind);
+  *conv_config.mutable_input() = input_descriptor.ToProto(element_type);
+  *conv_config.mutable_filter() = filter_descriptor.ToProto(element_type);
+  *conv_config.mutable_output() = output_descriptor.ToProto(element_type);
+  *conv_config.mutable_algorithm() = algorithm.ToProto();
+  *conv_config.mutable_conv_desc() = convolution_descriptor.ToProto();
+  conv_config.mutable_conv_desc()->set_compute_mode(acc_type);
+  conv_config.set_conv_scale(conv_scale);
+  conv_config.set_side_value_scale(side_value_scale);
+  conv_config.set_activation(activation);
+  return conv_config;
+}
+
+void LogCudaProto(const dnn::ConvolutionProto& conv, float profile_time_ms,
+                  StreamExecutor* stream_executor) {
+  {
+    // For rolling-out, temporarily cap the number of logs per process.
+    // TODO(timshen): remove it.
+    static int count_down = 200;
+    if (count_down == 0) {
+      return;
+    }
+    count_down--;
+  }
+
+  ConvLogEntry conv_log;
+  *conv_log.mutable_convolution() = conv;
+  conv_log.set_profile_time_ms(profile_time_ms);
+
+  auto info = conv_log.mutable_cuda_info();
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  info->mutable_compute_capability()->set_major(cc_major);
+  info->mutable_compute_capability()->set_minor(cc_minor);
+
+  if (auto* dnn = stream_executor->AsDnn()) {
+    port::StatusOr<dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      info->mutable_cudnn_version()->set_major(version.major_version());
+      info->mutable_cudnn_version()->set_minor(version.minor_version());
+      info->mutable_cudnn_version()->set_patch(version.patch());
+    }
+  }
+  tensorflow::Logger::Singleton()->LogProto(conv_log);
+}
+
 }  // namespace
 
-template <class T>
-port::Status CudnnSupport::DoConvolveImpl(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<T>& input_data,
+port::Status CudnnSupport::DoPrepareForConvolution(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
     const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
+    ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory) {
+  CudnnTensorDescriptor input_nd(
+      input_descriptor,
+      ToCudnnDataType(element_type, input_descriptor.layout()));
+  CudnnFilterDescriptor filter_nd(
+      filter_descriptor,
+      ToCudnnDataType(element_type, filter_descriptor.layout()));
+  CudnnTensorDescriptor output_nd(
+      output_descriptor,
+      ToCudnnDataType(element_type, output_descriptor.layout()));
+  CudnnConvolutionDescriptor conv(
+      convolution_descriptor,
+      ToCudnnDataType(GetConvAccumulatorType(element_type)));
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionForwardAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionBackwardDataAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionBackwardFilterAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
+      break;
+    }
+    default:
+      return port::InternalError(
+          absl::StrCat("Unexpected convolution kind ", static_cast<int>(kind)));
+  }
+
+  return port::Status::OK();
+}
+
+port::Status CudnnSupport::DoConvolve(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
     dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  cudnnDataType_t cudnn_type = ToCudnnDataType(element_type);
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
   CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter_nd(filter_descriptor, cudnn_type);
+  auto accumulator_type = GetConvAccumulatorType(element_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
                                   ToCudnnDataType(accumulator_type));
 
@@ -2603,28 +2911,22 @@ port::Status CudnnSupport::DoConvolveImpl(
 
   const bool is_profiling = output_profile_result != nullptr;
 
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionForwardAlgorithm(
-                          stream, cudnn, algorithm_config, input_nd, filter,
-                          conv, output_nd, scratch_allocator, &scratch));
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new GpuTimer(parent_));  // NOLINT
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
 
-  // Report an error if we might be hitting a cuDNN bug that accesses illegal
-  // memory. See nvbugs/2138754, b/80018418.
-  if (CUDNN_VERSION < 7300) {
-    SE_RETURN_IF_ERROR([&] {
-      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
+  auto get_fwd_bugs = [&]() -> port::Status {
+    // Report an error if we might be hitting a cuDNN bug that accesses illegal
+    // memory. See nvbugs/2138754, b/80018418.
+    if (CUDNN_VERSION < 7300) {
+      if (algorithm_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
         return port::Status::OK();
       }
       if (input_descriptor.ndims() < 3) {
@@ -2646,33 +2948,178 @@ port::Status CudnnSupport::DoConvolveImpl(
       SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
                                      output_descriptor.feature_map_count()));
       return port::Status::OK();
-    }());
-  }
+    }
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
+    return port::Status::OK();
+  };
 
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
+  auto get_bwd_data_bugs = [&]() -> port::Status {
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
 
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
-      cudnn.handle(),
-      /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
-      /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvForwardAlgo(algo_desc), /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/beta,
-      /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
+    // Cudnn 7.1.4 has a bug if the workspace of the following convolution is
+    // not zero-initialized, nvbugs/2254619.
+    if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
+        algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
+        cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() &&
+        input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
+        output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
+        (convolution_descriptor.vertical_filter_stride() > 1 ||
+         convolution_descriptor.horizontal_filter_stride() > 1)) {
+      stream->ThenMemZero(&scratch_memory, scratch_memory.size());
+    }
+    return port::Status::OK();
+  };
+
+  auto get_bwd_filter_bugs = [&]() -> port::Status {
+    // Report an error if we might be hitting a cuDNN bug that produces
+    // incorrect results. See nvbugs/2072856
+    if (CUDNN_VERSION < 7300) {
+      SE_RETURN_IF_ERROR([&] {
+        if (algorithm_desc.algo_id() !=
+            CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
+          return port::Status::OK();
+        }
+        if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
+          return port::Status::OK();
+        }
+        int convolution_size = output_descriptor.height() > 1
+                                   ? filter_descriptor.input_filter_height()
+                                   : filter_descriptor.input_filter_width();
+        if (convolution_size <= 32) {
+          return port::Status::OK();
+        }
+        cudnnConvolutionMode_t convolution_mode;
+        cudnnDataType_t compute_type;
+        RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
+            conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
+            &convolution_mode, &compute_type));
+        if (convolution_mode != CUDNN_CONVOLUTION) {
+          return port::Status::OK();
+        }
+        return port::Status(
+            port::error::FAILED_PRECONDITION,
+            "This configuration potentially produces incorrect results.");
+      }());
+    }
+
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
+
+    // Zero out the result buffer for strided conv backward filter for NHWC
+    // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is
+    // not zeroed.
+    //
+    // This wrong result caused by the bug is very flaky. It needs to be run for
+    // up to 20 times to produce a mismatch.
+    //
+    // See nvbugs/2379553.
+    if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
+        algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
+        cudnn_type == CUDNN_DATA_HALF &&
+        input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
+        output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        (convolution_descriptor.vertical_filter_stride() > 1 ||
+         convolution_descriptor.horizontal_filter_stride() > 1)) {
+      stream->ThenMemZero(&filter_data, filter_data.size());
+    }
+    return port::Status::OK();
+  };
+
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      SE_RETURN_IF_ERROR(get_fwd_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
+          cudnn.handle(),
+          /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(), /*filterDesc=*/filter_nd.handle(),
+          /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvForwardAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(), /*beta=*/beta,
+          /*yDesc=*/output_nd.handle(), /*y=*/output_data.opaque()));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      SE_RETURN_IF_ERROR(get_bwd_data_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardData(
+          cudnn.handle(),
+          /*alpha=*/alpha,
+          /*wDesc=*/filter_nd.handle(),
+          /*w=*/filter_data.opaque(),
+          /*dyDesc=*/output_nd.handle(),
+          /*dy=*/output_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(),
+          /*beta=*/beta,
+          /*dxDesc=*/input_nd.handle(),
+          /*dx=*/input_data.opaque()));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      SE_RETURN_IF_ERROR(get_bwd_filter_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
+          cudnn.handle(),
+          /*alpha=*/alpha,
+          /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(),
+          /*diffDesc=*/output_nd.handle(),
+          /*diffData=*/output_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(),
+          /*beta=*/beta,
+          /*gradDesc=*/filter_nd.handle(),
+          /*dw=*/filter_data.opaque()));
+      break;
+    }
+    default:
+      return port::InternalError(
+          absl::StrCat("Unexpected convolution kind ", static_cast<int>(kind)));
+  }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_algorithm(algorithm_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
+    output_profile_result->set_scratch_size(scratch_memory.size());
+
+    LogCudaProto(
+        GenerateConvProto(kind, element_type, input_descriptor,
+                          filter_descriptor, output_descriptor, algorithm_desc,
+                          convolution_descriptor, dalpha, dbeta,
+                          accumulator_type, dnn::ActivationMode::kNone),
+        output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
@@ -2724,13 +3171,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
           stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
           output_nd, scratch_allocator, &scratch));
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new GpuTimer(parent_));  // NOLINT
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -2783,21 +3230,42 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
     output_profile_result->set_scratch_size(scratch.size());
+
+    LogCudaProto(
+        GenerateConvProto(
+            dnn::ConvolutionKind::FORWARD, dnn::ToDataType<ElementType>::value,
+            conv_input_descriptor, filter_descriptor, output_descriptor,
+            algo_desc, convolution_descriptor, conv_input_scale,
+            side_input_scale, accumulator_type, activation_mode),
+        output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
 }
 
+inline bool TensorOpMathAvailable(int cc_major) {
+  return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
+}
+
 bool CudnnSupport::GetConvolveAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
+  out_algorithms->clear();
+
+  if (RequireDeterminism()) {
+    out_algorithms->push_back({CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
+                               tensor_op_math_available});
+    return true;
+  }
+
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
     // clang-format off
     CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
@@ -2815,13 +3283,13 @@ bool CudnnSupport::GetConvolveAlgorithms(
     algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
   }
 
-  out_algorithms->clear();
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
-    if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) {
+    if (tensor_op_math_available) {
       out_algorithms->push_back({i, /*use_tensor_ops=*/true});
     }
   }
+
   return true;
 }
 
@@ -2850,6 +3318,15 @@ bool CudnnSupport::GetRnnAlgorithms(
 bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
+  out_algorithms->clear();
+
+  if (RequireDeterminism()) {
+    out_algorithms->push_back(
+        {CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, tensor_op_math_available});
+    return true;
+  }
+
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
       // clang-format off
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
@@ -2863,19 +3340,28 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
   }
 
-  out_algorithms->clear();
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
-    if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) {
+    if (tensor_op_math_available) {
       out_algorithms->push_back({i, /*use_tensor_ops=*/true});
     }
   }
+
   return true;
 }
 
 bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
+  out_algorithms->clear();
+
+  if (RequireDeterminism()) {
+    out_algorithms->push_back(
+        {CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, tensor_op_math_available});
+    return true;
+  }
+
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
       // clang-format off
       CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
@@ -2894,13 +3380,13 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
   }
 
-  out_algorithms->clear();
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
-    if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) {
+    if (tensor_op_math_available) {
       out_algorithms->push_back({i, /*use_tensor_ops=*/true});
     }
   }
+
   return true;
 }
 
@@ -3069,64 +3555,6 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
   return port::Status::OK();
 }
 
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<float>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, dnn::DataType::kFloat, scratch_allocator,
-                     algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<double>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, dnn::DataType::kDouble, scratch_allocator,
-                     algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<Eigen::half>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, acc_type, scratch_allocator, algorithm_config,
-                     output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
 bool CudnnSupport::DoFusedConvolve(
     Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
     const DeviceMemory<double>& conv_input_data, double conv_input_scale,
@@ -3141,13 +3569,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
-                          conv_input_scale, filter_descriptor, filter_data,
-                          convolution_descriptor, side_input_data,
-                          side_input_scale, bias_descriptor, biases,
-                          activation_mode, output_descriptor, output_data,
-                          dnn::DataType::kDouble, scratch_allocator,
-                          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kDouble), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3165,13 +3593,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
-                          conv_input_scale, filter_descriptor, filter_data,
-                          convolution_descriptor, side_input_data,
-                          side_input_scale, bias_descriptor, biases,
-                          activation_mode, output_descriptor, output_data,
-                          dnn::DataType::kFloat, scratch_allocator,
-                          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kFloat), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3189,17 +3617,14 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
   return IsStatusOk(
       DoFusedConvolveImpl(
           stream, conv_input_descriptor, conv_input_data, conv_input_scale,
           filter_descriptor, filter_data, convolution_descriptor,
           side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, acc_type,
-          scratch_allocator, algorithm_config, output_profile_result),
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kHalf), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3225,13 +3650,13 @@ bool CudnnSupport::DoFusedConvolve(
     return false;
   }
   return IsStatusOk(
-      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
-                          conv_input_scale, filter_descriptor, filter_data,
-                          convolution_descriptor, side_input_data,
-                          side_input_scale, bias_descriptor, biases,
-                          activation_mode, output_descriptor, output_data,
-                          dnn::DataType::kInt32, scratch_allocator,
-                          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kInt8), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3257,368 +3682,6 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-template <class T>
-port::Status CudnnSupport::DoConvolveBackwardDataImpl(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<T> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  // Alpha is the scaling factor for input.
-  float falpha = 1.0;
-  double dalpha = 1.0;
-  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
-                                                : static_cast<void*>(&falpha);
-  // Beta is the scaling factor for output.
-  float fbeta = 0.0;
-  double dbeta = 0.0;
-  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
-                                               : static_cast<void*>(&fbeta);
-
-  auto cudnn = cudnn_->GetHandle(parent_, stream);
-
-  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-
-  const bool is_profiling = output_profile_result != nullptr;
-
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionBackwardDataAlgorithm(
-                          stream, cudnn, algorithm_config, in_back_nd, filter,
-                          conv, out_back_nd, scratch_allocator, &scratch));
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to start timer");
-    }
-  }
-
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
-
-  // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
-  // zero-initialized, nvbugs/2254619.
-  if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
-      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF && algo_desc.tensor_ops_enabled() &&
-      input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
-      output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
-      (convolution_descriptor.vertical_filter_stride() > 1 ||
-       convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(&scratch, scratch.size());
-  }
-
-  RETURN_IF_CUDNN_ERROR(
-      cudnnConvolutionBackwardData(cudnn.handle(),
-                                   /*alpha=*/alpha,
-                                   /*wDesc=*/filter.handle(),
-                                   /*w=*/filter_data.opaque(),
-                                   /*dyDesc=*/out_back_nd.handle(),
-                                   /*dy=*/backward_output_data.opaque(),
-                                   /*convDesc=*/conv.handle(),
-                                   /*algo=*/ToConvBackwardDataAlgo(algo_desc),
-                                   /*workSpace=*/scratch.opaque(),
-                                   /*workSpaceSizeInBytes=*/scratch.size(),
-                                   /*beta=*/beta,
-                                   /*dxDesc=*/in_back_nd.handle(),
-                                   /*dx=*/backward_input_data->opaque()));
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to stop timer");
-    }
-    output_profile_result->set_algorithm(algo_desc);
-    output_profile_result->set_elapsed_time_in_ms(
-        timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
-  }
-
-  return port::Status::OK();
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<double>* backward_input_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, dnn::DataType::kDouble, scratch_allocator,
-          algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<float>* backward_input_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, dnn::DataType::kFloat, scratch_allocator,
-          algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<Eigen::half>* backward_input_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, acc_type, scratch_allocator, algorithm_config,
-          output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-template <class T>
-port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<T>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<T> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  // Alpha is the scaling factor for input.
-  float falpha = 1.0;
-  double dalpha = 1.0;
-  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
-                                                : static_cast<void*>(&falpha);
-  // Beta is the scaling factor for output.
-  float fbeta = 0.0;
-  double dbeta = 0.0;
-  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
-                                               : static_cast<void*>(&fbeta);
-
-  auto cudnn = cudnn_->GetHandle(parent_, stream);
-
-  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-
-  const bool is_profiling = output_profile_result != nullptr;
-
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionBackwardFilterAlgorithm(
-                          stream, cudnn, algorithm_config, input_nd, filter,
-                          conv, out_back_nd, scratch_allocator, &scratch));
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to start timer");
-    }
-  }
-
-  // Report an error if we might be hitting a cuDNN bug that produces incorrect
-  // results. See nvbugs/2072856
-  if (CUDNN_VERSION < 7300) {
-    SE_RETURN_IF_ERROR([&] {
-      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
-        return port::Status::OK();
-      }
-      if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
-        return port::Status::OK();
-      }
-      int convolution_size = output_descriptor.height() > 1
-                                 ? filter_descriptor.input_filter_height()
-                                 : filter_descriptor.input_filter_width();
-      if (convolution_size <= 32) {
-        return port::Status::OK();
-      }
-      cudnnConvolutionMode_t convolution_mode;
-      cudnnDataType_t compute_type;
-      RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
-          conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
-          &convolution_mode, &compute_type));
-      if (convolution_mode != CUDNN_CONVOLUTION) {
-        return port::Status::OK();
-      }
-      return port::Status(
-          port::error::FAILED_PRECONDITION,
-          "This configuration potentially produces incorrect results.");
-    }());
-  }
-
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
-
-  // Zero out the result buffer for strided conv backward filter for NHWC
-  // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is not
-  // zeroed.
-  //
-  // This wrong result caused by the bug is very flaky. It needs to be run for
-  // up to 20 times to produce a mismatch.
-  //
-  // See nvbugs/2379553.
-  if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
-      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF &&
-      input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
-      output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      (convolution_descriptor.vertical_filter_stride() > 1 ||
-       convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(backward_filter_data, backward_filter_data->size());
-  }
-
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
-      cudnn.handle(),
-      /*alpha=*/alpha,
-      /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(),
-      /*diffDesc=*/out_back_nd.handle(),
-      /*diffData=*/backward_output_data.opaque(),
-      /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvBackwardFilterAlgo(algo_desc),
-      /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(),
-      /*beta=*/beta,
-      /*gradDesc=*/filter.handle(),
-      /*dw=*/backward_filter_data->opaque()));
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to stop timer");
-    }
-    output_profile_result->set_algorithm(algo_desc);
-    output_profile_result->set_elapsed_time_in_ms(
-        timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
-  }
-
-  return port::Status::OK();
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<double>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<double>* backward_filter_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, dnn::DataType::kDouble,
-
-          scratch_allocator, algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<float>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<float>* backward_filter_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(DoConvolveBackwardFilterImpl(
-                        stream, input_descriptor, input_data, output_descriptor,
-                        backward_output_data, convolution_descriptor,
-                        filter_descriptor, backward_filter_data,
-
-                        dnn::DataType::kFloat, scratch_allocator,
-                        algorithm_config, output_profile_result),
-                    /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<Eigen::half>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<Eigen::half>* backward_filter_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, acc_type, scratch_allocator, algorithm_config,
-          output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
 template <class T>
 port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -3952,6 +4015,31 @@ bool CudnnSupport::DoPoolForward(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
+bool CudnnSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<int8>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<int8>* output_data, ScratchAllocator* workspace_allocator) {
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_INT8);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_INT8);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+        input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
+}
+
 bool CudnnSupport::DoPoolBackward(
     Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
     const dnn::BatchDescriptor& input_dimensions,
@@ -4247,22 +4335,22 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cudnn() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
-          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
+          cuda::kCudaPlatformId, gpu::kCuDnnPlugin, "cuDNN",
           [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
-            cuda::CUDAExecutor* cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor*>(parent);
+            gpu::GpuExecutor* cuda_executor =
+                dynamic_cast<gpu::GpuExecutor*>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
                          << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
-            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
+            gpu::CudnnSupport* dnn = new gpu::CudnnSupport(cuda_executor);
             if (!dnn->Init().ok()) {
               // Note: Init() will log a more specific error.
               delete dnn;
@@ -4277,10 +4365,12 @@ void initialize_cudnn() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
+      cuda::kCudaPlatformId, PluginKind::kDnn, gpu::kCuDnnPlugin);
 }
 
 }  // namespace stream_executor
 
+#pragma clang diagnostic pop
+
 REGISTER_MODULE_INITIALIZER(register_cudnn,
                             { stream_executor::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 044ed545145bdc521c32225a0e95f9dd63eace69..80c7c8a06c6f8600fc63cc051047cbdd5e4bb27c 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -28,9 +28,9 @@ limitations under the License.
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-class CUDAExecutor;
+class GpuExecutor;
 class CudnnRnnDescriptor;
 class CudnnRnnSequenceTensorDescriptor;
 class CudnnRnnStateTensorDescriptor;
@@ -42,7 +42,7 @@ extern const PluginId kCuDnnPlugin;
 // functions, see dnn.h.
 class CudnnSupport : public dnn::DnnSupport {
  public:
-  explicit CudnnSupport(CUDAExecutor* parent);
+  explicit CudnnSupport(GpuExecutor* parent);
 
   port::Status Init() override;
   port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@@ -258,38 +258,16 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<float>* scale_backprop,
       DeviceMemory<float>* offset_backprop) override;
 
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<float>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<float>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<float>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
-                  dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<double>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<double>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<double>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
-                  dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<Eigen::half>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<Eigen::half>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<Eigen::half>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
-                  dnn::ProfileResult* output_profile_result) override;
+  port::Status DoConvolve(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
+      dnn::ProfileResult* output_profile_result) override;
 
   bool DoFusedConvolve(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
@@ -390,78 +368,6 @@ class CudnnSupport : public dnn::DnnSupport {
     return false;
   }
 
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<double>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<float>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<double>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<double>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<float>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<Eigen::half>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
   bool DoConvolveBackwardBias(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<double>& input_data,
@@ -540,6 +446,14 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* output_data,
                      ScratchAllocator* workspace_allocator) override;
 
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<int8>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<int8>* output_data,
+                     ScratchAllocator* workspace_allocator) override;
+
   bool DoPoolBackward(Stream* stream,
                       const dnn::PoolingDescriptor& pooling_dimensions,
                       const dnn::BatchDescriptor& input_dimensions,
@@ -638,7 +552,7 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemoryBase* output_data) override;
 
  private:
-  CUDAExecutor* parent_;  // Parent executor object. Not owned.
+  GpuExecutor* parent_;  // Parent executor object. Not owned.
 
   // Provides access to the cuDNN handle.
   std::unique_ptr<class CudnnAccess> cudnn_;
@@ -668,19 +582,6 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
       DeviceMemory<U>* offset_backprop);
 
-  template <class T>
-  port::Status DoConvolveImpl(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<T>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
   template <typename ElementType, typename BiasType, typename ScaleType>
   port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
@@ -698,32 +599,6 @@ class CudnnSupport : public dnn::DnnSupport {
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
-  template <class T>
-  port::Status DoConvolveBackwardDataImpl(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
-  template <class T>
-  port::Status DoConvolveBackwardFilterImpl(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<T>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
   template <class T>
   port::Status DoConvolveBackwardBiasImpl(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -776,10 +651,23 @@ class CudnnSupport : public dnn::DnnSupport {
       ScratchAllocator* workspace_allocator,
       dnn::ProfileResult* output_profile_result);
 
+ private:
+  port::Status DoPrepareForConvolution(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b34d1f722eaf60b21f2289a4b87b5653bfd43bb9..34ba7c5b1573a2127027ad1f5fba98037ca16ea8 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
@@ -44,21 +45,20 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
 
 // Debugging: on each push and pop of a cuda context, verify the current context
 // matches the expected one.
-constexpr bool kVerifyCudaContext = false;
+constexpr bool kVerifyGpuContext = false;
 
 namespace stream_executor {
-namespace cuda {
-
+namespace gpu {
 namespace {
 
 // Manages the singleton map of contexts that we've created, mapping
-// from the CUcontext to the CudaContext* that we pass around internally.
-// This also manages assignment of unique ids to CudaContexts, to allow
+// from the CUcontext to the GpuContext* that we pass around internally.
+// This also manages assignment of unique ids to GpuContexts, to allow
 // for fast comparison of a context against the current context.
 //
 // CUDA-runtime-created contexts are avoided, if triple angle
 // brace launches are required, by using the scoped activations in
-// cuda_activation.h.
+// gpu/gpu_activation.h.
 class CreatedContexts {
  public:
   // Returns whether context is a member of the live set.
@@ -68,14 +68,14 @@ class CreatedContexts {
   }
 
   // Adds context to the live set, or returns it if it's already present.
-  static CudaContext* Add(CUcontext context) {
+  static GpuContext* Add(CUcontext context) {
     CHECK(context != nullptr);
     mutex_lock lock(mu_);
     auto insert_result = Live()->insert(std::make_pair(context, nullptr));
     auto it = insert_result.first;
     if (insert_result.second) {
       // context was not present in the map.  Add it.
-      it->second = MakeUnique<CudaContext>(context, next_id_++);
+      it->second = MakeUnique<GpuContext>(context, next_id_++);
     }
     return it->second.get();
   }
@@ -91,9 +91,9 @@ class CreatedContexts {
 
  private:
   // Returns the live map singleton.
-  static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
+  static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
     static auto singleton =
-        new std::map<CUcontext, std::unique_ptr<CudaContext>>;
+        new std::map<CUcontext, std::unique_ptr<GpuContext>>;
     return singleton;
   }
 
@@ -108,11 +108,11 @@ class CreatedContexts {
 // Formats CUresult to output prettified values into a log stream.
 string ToString(CUresult result) {
   const char *error_name;
-  if (cuGetErrorName(result, &error_name)) {
+  if (tensorflow::wrap::cuGetErrorName(result, &error_name)) {
     return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
   }
   const char *error_string;
-  if (cuGetErrorString(result, &error_string)) {
+  if (tensorflow::wrap::cuGetErrorString(result, &error_string)) {
     return error_name;
   }
   return absl::StrCat(error_name, ": ", error_string);
@@ -122,7 +122,7 @@ string ToString(CUresult result) {
 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
 // context behind our backs).
 CUcontext CurrentContext() {
-  CUcontext current = CUDADriver::CurrentContextOrDie();
+  CUcontext current = cuda::CurrentContextOrDie();
   if (current != nullptr && !CreatedContexts::Has(current)) {
     LOG(FATAL) << "current context was not created by the StreamExecutor "
                   "cuda_driver API: "
@@ -167,7 +167,7 @@ namespace {
 
 // Call cuCtxtSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
-  auto res = cuCtxSynchronize();
+  auto res = tensorflow::wrap::cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
     LOG(FATAL) << "Synchronize found "
                << ToString(res) << " :: " << port::CurrentStackTrace();
@@ -176,7 +176,7 @@ void SynchronizeOrDie() {
 
 struct ThreadLocalData {
   int64 id;
-  CudaContext* context;  // Only valid if id == a known good context.
+  GpuContext* context;  // Only valid if id == a known good context.
   int depth;
 };
 
@@ -184,13 +184,13 @@ SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
 
 }  // namespace
 
-ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
+ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
   if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
 
   auto* tls = &tls_data.get();
   tls->depth++;
   if (tls->id == cuda_context->id()) {
-    if (kVerifyCudaContext) {
+    if (kVerifyGpuContext) {
       CHECK_EQ(CurrentContext(), cuda_context->context());
     }
     DCHECK_EQ(CurrentContext(), cuda_context->context());
@@ -203,7 +203,8 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
   to_restore_ = (tls->depth == 1 ? nullptr : tls->context);
 
   // Set the context and update thread local.
-  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
+  CHECK_EQ(CUDA_SUCCESS,
+           tensorflow::wrap::cuCtxSetCurrent(cuda_context->context()));
   tls->id = cuda_context->id();
   tls->context = cuda_context;
 }
@@ -213,8 +214,8 @@ ScopedActivateContext::~ScopedActivateContext() {
 
   auto* tls = &tls_data.get();
 
-  if (kVerifyCudaContext) {
-    // Note that if kVerifyCudaContext is used, and contexts are deleted, it's
+  if (kVerifyGpuContext) {
+    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
     // possible this could fail in the CurrentContext() call.
     CHECK_EQ(CurrentContext(),
              tls->context == nullptr ? nullptr : tls->context->context());
@@ -228,7 +229,8 @@ ScopedActivateContext::~ScopedActivateContext() {
   }
 
   // Set context and update thread local.
-  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
+  CHECK_EQ(CUDA_SUCCESS,
+           tensorflow::wrap::cuCtxSetCurrent(to_restore_->context()));
   tls->id = to_restore_->id();
   tls->context = to_restore_;
 }
@@ -239,7 +241,7 @@ namespace {
 // logging purposes. Returns "?" if the device could not be successfully
 // queried.
 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
-  auto value = CUDADriver::GetPointerDevice(pointer);
+  auto value = GpuDriver::GetPointerDevice(pointer);
   if (value.ok()) {
     return absl::StrCat(value.ValueOrDie());
   }
@@ -251,7 +253,7 @@ string CUDAPointerToDeviceString(CUdeviceptr pointer) {
 // logging purposes. Returns "?" if the memory space could not be successfully
 // queried.
 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
-  auto value = CUDADriver::GetPointerMemorySpace(pointer);
+  auto value = GpuDriver::GetPointerMemorySpace(pointer);
   if (value.ok()) {
     return MemorySpaceString(value.ValueOrDie());
   }
@@ -264,20 +266,20 @@ string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
 // primarily for logging purposes. Returns "error" if an error is encountered
 // in the process of querying.
 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
-  auto from_context = CUDADriver::GetPointerContext(from);
+  auto from_context = GpuDriver::GetPointerContext(from);
   if (!from_context.ok()) {
     LOG(ERROR) << "could not retrieve source pointer's context: "
                << from_context.status();
     return "error";
   }
-  auto to_context = CUDADriver::GetPointerContext(to);
+  auto to_context = GpuDriver::GetPointerContext(to);
   if (!to_context.ok()) {
     LOG(ERROR) << "could not retrieve destination pointer's context: "
                << to_context.status();
     return "error";
   }
-  return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
-                                         to_context.ValueOrDie())
+  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
+                                        to_context.ValueOrDie())
              ? "true"
              : "false";
 }
@@ -290,7 +292,7 @@ static port::Status InternalInit() {
   if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
     LOG(ERROR) << "injecting CUDA init error; initialization will fail";
   } else {
-    res = cuInit(0 /* = flags */);
+    res = tensorflow::wrap::cuInit(0 /* = flags */);
   }
 
   if (res == CUDA_SUCCESS) {
@@ -305,9 +307,9 @@ static port::Status InternalInit() {
 
 }  // namespace
 
-/* static */ port::Status CUDADriver::Init() {
+/* static */ port::Status GpuDriver::Init() {
   // Cached return value from calling InternalInit(), as cuInit need only be
-  // called once, but CUDADriver::Init may be called many times.
+  // called once, but GpuDriver::Init may be called many times.
   static port::Status init_retval;
   static bool set = false;
   static mutex *init_mu = new mutex;
@@ -321,9 +323,9 @@ static port::Status InternalInit() {
   return init_retval;
 }
 
-/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
-                                                CUdevice *device) {
-  CUresult res = cuDeviceGet(device, device_ordinal);
+/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
+                                               CUdevice* device) {
+  CUresult res = tensorflow::wrap::cuDeviceGet(device, device_ordinal);
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
   }
@@ -333,11 +335,12 @@ static port::Status InternalInit() {
       absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
 }
 
-/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
-                                            string *device_name) {
+/* static */ bool GpuDriver::GetDeviceName(CUdevice device,
+                                           string* device_name) {
   static const size_t kCharLimit = 64;
   absl::InlinedVector<char, 4> chars(kCharLimit);
-  CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  CUresult res =
+      tensorflow::wrap::cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
                << ToString(res);
@@ -372,9 +375,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return true;
 }
 
-/* static */ port::Status CUDADriver::CreateContext(
-    CUdevice device, const DeviceOptions &device_options,
-    CudaContext **context) {
+/* static */ port::Status GpuDriver::CreateContext(
+    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
+    GpuContext** context) {
   *context = nullptr;
 
   int flags = 0;
@@ -388,9 +391,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 
   unsigned int former_primary_context_flags;
   int former_primary_context_is_active;
-  CHECK_EQ(CUDA_SUCCESS,
-           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
-                                      &former_primary_context_is_active));
+  CHECK_EQ(CUDA_SUCCESS, tensorflow::wrap::cuDevicePrimaryCtxGetState(
+                             device, &former_primary_context_flags,
+                             &former_primary_context_is_active));
   if (former_primary_context_flags != flags) {
     if (former_primary_context_is_active) {
       LOG(ERROR)
@@ -398,15 +401,16 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
           << former_primary_context_flags << ") than the desired flag set ("
           << flags << ").";
     } else {
-      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+      CHECK_EQ(CUDA_SUCCESS,
+               tensorflow::wrap::cuDevicePrimaryCtxSetFlags(device, flags));
     }
   }
 
-  former_context = CUDADriver::CurrentContextOrDie();
-  res = cuDevicePrimaryCtxRetain(&new_context, device);
+  former_context = cuda::CurrentContextOrDie();
+  res = tensorflow::wrap::cuDevicePrimaryCtxRetain(&new_context, device);
   if (former_context != nullptr) {
     CUdevice former_device;
-    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
+    if (tensorflow::wrap::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
       if (former_device == device) {
         if (former_context == new_context) {
           VLOG(2) << "The primary context " << former_context << " for device "
@@ -425,13 +429,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
                  << former_context;
     }
   }
-  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
+  CHECK_EQ(CUDA_SUCCESS, tensorflow::wrap::cuCtxSetCurrent(former_context));
 
   if (res == CUDA_SUCCESS) {
     *context = CreatedContexts::Add(new_context);
     CHECK(*context != nullptr)
         << "success in this call must entail non-null result";
-    VLOG(2) << "created or reused context " << context << " for this thread";
+    VLOG(2) << "created or reused context " << new_context
+            << " for this thread";
     return port::Status::OK();
   }
 
@@ -448,17 +453,17 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return port::Status(port::error::INTERNAL, message);
 }
 
-/* static */ void CUDADriver::DestroyContext(CudaContext* context) {
+/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
   if (context == nullptr) {
     return;
   }
   CUcontext former_context = CurrentContext();
-  CUresult res = cuCtxSetCurrent(context->context());
+  CUresult res = tensorflow::wrap::cuCtxSetCurrent(context->context());
   CUdevice device;
-  cuCtxGetDevice(&device);
-  cuCtxSetCurrent(former_context);
+  tensorflow::wrap::cuCtxGetDevice(&device);
+  tensorflow::wrap::cuCtxSetCurrent(former_context);
 
-  res = cuDevicePrimaryCtxRelease(device);
+  res = tensorflow::wrap::cuDevicePrimaryCtxRelease(device);
 
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
@@ -467,10 +472,11 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   CreatedContexts::Remove(context->context());
 }
 
-/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
-                                               CUfunction func,
-                                               int *attribute_value) {
-  CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
+/* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
+                                              CUfunction func,
+                                              int* attribute_value) {
+  CUresult res =
+      tensorflow::wrap::cuFuncGetAttribute(attribute_value, attribute, func);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
                << ", attribute: " << attribute;
@@ -479,9 +485,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return true;
 }
 
-/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
-                                                 CUfunc_cache cache_config) {
-  CUresult res = cuFuncSetCacheConfig(function, cache_config);
+/* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
+                                                CUfunc_cache cache_config) {
+  CUresult res = tensorflow::wrap::cuFuncSetCacheConfig(function, cache_config);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
                << ", config: " << cache_config << ", result: " << ToString(res);
@@ -492,13 +498,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 }
 
 /* static */ port::StatusOr<CUsharedconfig>
-CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
+GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   CUsharedconfig shared_mem_config;
   ScopedActivateContext activation(context);
-  CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
+  CUresult result =
+      tensorflow::wrap::cuCtxGetSharedMemConfig(&shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
-    cuCtxGetDevice(&device);
+    tensorflow::wrap::cuCtxGetDevice(&device);
     LOG(ERROR) << "failed to get CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", result: " << ToString(result);
@@ -509,13 +516,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return shared_mem_config;
 }
 
-/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
-    CudaContext* context, CUsharedconfig shared_mem_config) {
+/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
+    GpuContext* context, CUsharedconfig shared_mem_config) {
   ScopedActivateContext activation(context);
-  CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
+  CUresult result =
+      tensorflow::wrap::cuCtxSetSharedMemConfig(shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
-    cuCtxGetDevice(&device);
+    tensorflow::wrap::cuCtxGetDevice(&device);
     LOG(ERROR) << "failed to set CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", config: " << shared_mem_config
@@ -527,20 +535,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::LaunchKernel(
-    CudaContext* context, CUfunction function, unsigned int grid_dim_x,
+/* static */ bool GpuDriver::LaunchKernel(
+    GpuContext* context, CUfunction function, unsigned int grid_dim_x,
     unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
     unsigned int block_dim_y, unsigned int block_dim_z,
-    unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
-    void **extra) {
+    unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
+    void** extra) {
   ScopedActivateContext activation(context);
   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
           << " bdz: " << block_dim_z;
-  CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
-                                block_dim_x, block_dim_y, block_dim_z,
-                                shared_mem_bytes, stream, kernel_params, extra);
+  CUresult res = tensorflow::wrap::cuLaunchKernel(
+      function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
+      block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to launch CUDA kernel: " << function
                << "; result: " << ToString(res);
@@ -550,11 +558,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
-                                                const char *cubin_bytes,
-                                                CUmodule *module) {
+/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
+                                               const char* cubin_bytes,
+                                               CUmodule* module) {
   ScopedActivateContext activation(context);
-  CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
+  CUresult result =
+      tensorflow::wrap::cuModuleLoadFatBinary(module, cubin_bytes);
   if (result != CUDA_SUCCESS) {
     return port::Status(port::error::INTERNAL,
                         "failed to load in-memory CUBIN: " + ToString(result));
@@ -563,9 +572,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::LoadPtx(CudaContext* context,
-                                      const char *ptx_contents,
-                                      CUmodule *module) {
+/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
+                                     const char* ptx_contents,
+                                     CUmodule* module) {
   port::Notification notification;
   bool ret = true;
   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
@@ -597,8 +606,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
 
-      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options),
-                               options, option_values);
+      res = tensorflow::wrap::cuModuleLoadDataEx(
+          module, ptx_data, TF_ARRAYSIZE(options), options, option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
@@ -633,11 +642,18 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return ret;
 }
 
-/* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
-                                                     CUdeviceptr location,
-                                                     uint8 value, size_t size) {
+/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
+                                       const char* hsaco_contents,
+                                       CUmodule* module) {
+  LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
+  return false;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
+                                                    CUdeviceptr location,
+                                                    uint8 value, size_t size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD8(location, value, size);
+  CUresult res = tensorflow::wrap::cuMemsetD8(location, value, size);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -645,12 +661,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
-                                                      CUdeviceptr location,
-                                                      uint32 value,
-                                                      size_t uint32_count) {
+/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
+                                                     CUdeviceptr location,
+                                                     uint32 value,
+                                                     size_t uint32_count) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD32(location, value, uint32_count);
+  CUresult res = tensorflow::wrap::cuMemsetD32(location, value, uint32_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -658,13 +674,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
-                                                      CUdeviceptr location,
-                                                      uint8 value,
-                                                      size_t uint32_count,
-                                                      CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
+                                                     CUdeviceptr location,
+                                                     uint8 value,
+                                                     size_t uint32_count,
+                                                     CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
+  CUresult res =
+      tensorflow::wrap::cuMemsetD8Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -673,13 +690,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
-                                                       CUdeviceptr location,
-                                                       uint32 value,
-                                                       size_t uint32_count,
-                                                       CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count,
+                                                      CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
+  CUresult res =
+      tensorflow::wrap::cuMemsetD32Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -688,12 +706,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
-                                                CUstream stream,
-                                                StreamCallback callback,
-                                                void *data) {
+/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
+                                               CUstream stream,
+                                               StreamCallback callback,
+                                               void* data) {
   // Note: flags param is required to be zero according to CUDA 6.0.
-  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
+  CUresult res = tensorflow::wrap::cuStreamAddCallback(stream, callback, data,
+                                                       0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "unable to add host callback: " << ToString(res);
     return false;
@@ -701,13 +720,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
-                                                CUmodule module,
-                                                const char *kernel_name,
-                                                CUfunction *function) {
+/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
+                                               CUmodule module,
+                                               const char* kernel_name,
+                                               CUfunction* function) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && kernel_name != nullptr);
-  CUresult res = cuModuleGetFunction(function, module, kernel_name);
+  CUresult res =
+      tensorflow::wrap::cuModuleGetFunction(function, module, kernel_name);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
                << "\" from module: " << ToString(res);
@@ -717,15 +737,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
-                                              CUmodule module,
-                                              const char *symbol_name,
-                                              CUdeviceptr *dptr,
-                                              size_t *bytes) {
+/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
+                                             CUmodule module,
+                                             const char* symbol_name,
+                                             CUdeviceptr* dptr, size_t* bytes) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && symbol_name != nullptr &&
         (dptr != nullptr || bytes != nullptr));
-  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
+  CUresult res =
+      tensorflow::wrap::cuModuleGetGlobal(dptr, bytes, module, symbol_name);
   if (res != CUDA_SUCCESS) {
     // symbol may not be found in the current module, but it may reside in
     // another module.
@@ -737,21 +757,21 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ void CUDADriver::UnloadModule(CudaContext *context,
-                                           CUmodule module) {
+/* static */ void GpuDriver::UnloadModule(GpuContext* context,
+                                          CUmodule module) {
   ScopedActivateContext activated{context};
-  CUresult res = cuModuleUnload(module);
+  CUresult res = tensorflow::wrap::cuModuleUnload(module);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to unload module " << module
                << "; leaking: " << ToString(res);
   }
 }
 
-/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
-    CudaContext* context) {
+/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
+    GpuContext* context) {
   ScopedActivateContext activated{context};
   CUdevice device = -1;
-  CUresult result = cuCtxGetDevice(&device);
+  CUresult result = tensorflow::wrap::cuCtxGetDevice(&device);
   if (result == CUDA_SUCCESS) {
     return device;
   }
@@ -761,47 +781,47 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       absl::StrCat("failed to get device for context: ", ToString(result)));
 }
 
-/* static */ bool CUDADriver::CreateStream(CudaContext *context,
-                                           CUstream *out) {
+/* static */ bool GpuDriver::CreateStream(GpuContext* context,
+                                          CUstream* stream) {
   // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
   // up synchronization with respect to memsets and any other things that have
   // to occur on the default stream?
   ScopedActivateContext activated{context};
-  CUresult res = cuStreamCreate(out, 0);
+  CUresult res = tensorflow::wrap::cuStreamCreate(stream, 0);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "could not allocate CUDA stream for context " << context
-               << ": " << ToString(res);
+    LOG(ERROR) << "could not allocate CUDA stream for context "
+               << context->context() << ": " << ToString(res);
     return false;
   }
 
-  VLOG(2) << "successfully created stream " << *out << " for context "
-          << context << " on thread";
+  VLOG(2) << "successfully created stream " << *stream << " for context "
+          << context->context() << " on thread";
   return true;
 }
 
-/* static */ void CUDADriver::DestroyStream(CudaContext* context,
-                                            CUstream *stream) {
+/* static */ void GpuDriver::DestroyStream(GpuContext* context,
+                                           CUstream* stream) {
   if (*stream == nullptr) {
     return;
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = cuStreamDestroy(*stream);
+  CUresult res = tensorflow::wrap::cuStreamDestroy(*stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "failed to destroy CUDA stream for context " << context
-               << ": " << ToString(res);
+    LOG(ERROR) << "failed to destroy CUDA stream for context "
+               << context->context() << ": " << ToString(res);
   } else {
     VLOG(2) << "successfully destroyed stream " << *stream << " for context "
-            << context;
+            << context->context();
     *stream = nullptr;
   }
 }
 
-/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
-                                              uint64 bytes) {
+/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
+                                             uint64 bytes) {
   ScopedActivateContext activated{context};
   CUdeviceptr result = 0;
-  CUresult res = cuMemAlloc(&result, bytes);
+  CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -809,61 +829,63 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return nullptr;
   }
   void *ptr = reinterpret_cast<void *>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
-          << bytes << " bytes";
+  VLOG(2) << "allocated " << ptr << " for context " << context->context()
+          << " of " << bytes << " bytes";
   return ptr;
 }
 
-/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
-                                               void *location) {
+/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
+                                              void* location) {
   ScopedActivateContext activation(context);
   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  CUresult res = cuMemFree(pointer);
+  CUresult res = tensorflow::wrap::cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
   } else {
-    VLOG(2) << "deallocated " << location << " for context " << context;
+    VLOG(2) << "deallocated " << location << " for context "
+            << context->context();
   }
 }
 
-/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
-                                                     uint64 bytes) {
+/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
+                                                    uint64 bytes) {
   ScopedActivateContext activation(context);
   CUdeviceptr result = 0;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+  CUresult res =
+      tensorflow::wrap::cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes unified memory; result: " << ToString(res);
     return nullptr;
   }
   void *ptr = reinterpret_cast<void *>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
-          << bytes << " bytes in unified memory";
+  VLOG(2) << "allocated " << ptr << " for context " << context->context()
+          << " of " << bytes << " bytes in unified memory";
   return ptr;
 }
 
-/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
-                                                      void *location) {
+/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
+                                                     void* location) {
   ScopedActivateContext activation(context);
   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  CUresult res = cuMemFree(pointer);
+  CUresult res = tensorflow::wrap::cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free unified memory at " << location
                << "; result: " << ToString(res);
   } else {
     VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << context;
+            << context->context();
   }
 }
 
-/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
-                                            uint64 bytes) {
+/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
   ScopedActivateContext activation(context);
   void *host_mem = nullptr;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
+  CUresult res = tensorflow::wrap::cuMemHostAlloc(&host_mem, bytes,
+                                                  CU_MEMHOSTALLOC_PORTABLE);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes on host: " << ToString(res);
@@ -871,22 +893,22 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return host_mem;
 }
 
-/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
-                                             void *location) {
+/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
+                                            void* location) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemFreeHost(location);
+  CUresult res = tensorflow::wrap::cuMemFreeHost(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
                << ToString(res);
   }
 }
 
-/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
-                                           uint64 bytes) {
+/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
+                                          uint64 bytes) {
   ScopedActivateContext activation(context);
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res =
-      cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+  CUresult res = tensorflow::wrap::cuMemHostRegister(
+      location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error registering host memory at " << location << ": "
                << ToString(res);
@@ -895,10 +917,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
-                                             void *location) {
+/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
+                                            void* location) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemHostUnregister(location);
+  CUresult res = tensorflow::wrap::cuMemHostUnregister(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
                << ToString(res);
@@ -907,15 +929,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
-                                                   CUevent *event) {
+/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
+                                                  CUevent* event) {
   if (*event == nullptr) {
     return port::Status(port::error::INVALID_ARGUMENT,
                         "input event cannot be null");
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = cuEventDestroy(*event);
+  CUresult res = tensorflow::wrap::cuEventDestroy(*event);
   *event = nullptr;
 
   switch (res) {
@@ -935,11 +957,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
-                                                  CUevent event,
-                                                  CUstream stream) {
+/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
+                                                 CUevent event,
+                                                 CUstream stream) {
   ScopedActivateContext activated{context};
-  CUresult res = cuEventRecord(event, stream);
+  CUresult res = tensorflow::wrap::cuEventRecord(event, stream);
   switch (res) {
     case CUDA_SUCCESS:
       return port::Status::OK();
@@ -957,10 +979,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
-    CudaContext *context, CUevent event) {
+/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
+                                                            CUevent event) {
   ScopedActivateContext activated{context};
-  CUresult res = cuEventQuery(event);
+  CUresult res = tensorflow::wrap::cuEventQuery(event);
   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
     return port::Status(
         port::error::INTERNAL,
@@ -970,18 +992,18 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return res;
 }
 
-/* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
-                                                  float *elapsed_milliseconds,
-                                                  CUevent start, CUevent stop) {
+/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
+                                                 float* elapsed_milliseconds,
+                                                 CUevent start, CUevent stop) {
   ScopedActivateContext activated{context};
   // The stop event must have completed in order for cuEventElapsedTime to
   // work.
-  CUresult res = cuEventSynchronize(stop);
+  CUresult res = tensorflow::wrap::cuEventSynchronize(stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
     return false;
   }
-  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
+  res = tensorflow::wrap::cuEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
@@ -991,11 +1013,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
-                                                CUstream stream,
-                                                CUevent event) {
+/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
+                                               CUstream stream, CUevent event) {
   ScopedActivateContext activation(context);
-  CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
+  CUresult res =
+      tensorflow::wrap::cuStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
     return false;
@@ -1004,9 +1026,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
+/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
   ScopedActivateContext activation(context);
-  CUresult res = cuCtxSynchronize();
+  CUresult res = tensorflow::wrap::cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -1016,11 +1038,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
-                                                        CUstream stream) {
+/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
+                                                       CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  CUresult res = cuStreamSynchronize(stream);
+  CUresult res = tensorflow::wrap::cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
     port::Status status = port::InternalError(
         absl::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
@@ -1032,11 +1054,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
-                                           CUstream stream) {
+/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
+                                          CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  CUresult res = cuStreamQuery(stream);
+  CUresult res = tensorflow::wrap::cuStreamQuery(stream);
   if (res == CUDA_SUCCESS) {
     return true;
   }
@@ -1047,12 +1069,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return false;
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
-                                                           void *host_dst,
-                                                           CUdeviceptr gpu_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
+                                                          void* host_dst,
+                                                          CUdeviceptr gpu_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
+  CUresult res = tensorflow::wrap::cuMemcpyDtoH(host_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(
         port::Printf("failed to synchronous memcpy from device to host: %s; "
@@ -1065,12 +1087,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
-                                                           CUdeviceptr gpu_dst,
-                                                           const void *host_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
+                                                          CUdeviceptr gpu_dst,
+                                                          const void* host_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
+  CUresult res = tensorflow::wrap::cuMemcpyHtoD(gpu_dst, host_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
@@ -1082,12 +1104,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
-                                                           CUdeviceptr gpu_dst,
-                                                           CUdeviceptr gpu_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
+                                                          CUdeviceptr gpu_dst,
+                                                          CUdeviceptr gpu_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
+  CUresult res = tensorflow::wrap::cuMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
@@ -1099,13 +1121,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
-                                                    void *host_dst,
-                                                    CUdeviceptr gpu_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
+                                                   void* host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  CUresult res =
+      tensorflow::wrap::cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
@@ -1120,13 +1143,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
-                                                    CUdeviceptr gpu_dst,
-                                                    const void *host_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void* host_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
+  CUresult res =
+      tensorflow::wrap::cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
@@ -1140,13 +1164,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
-                                                    CUdeviceptr gpu_dst,
-                                                    CUdeviceptr gpu_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  CUresult result =
+      tensorflow::wrap::cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to device: %s"
@@ -1167,9 +1192,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
-                                                  CUevent *result,
-                                                  EventFlags flags) {
+/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
+                                                 CUevent* result,
+                                                 EventFlags flags) {
   int cuflags;
   switch (flags) {
     case EventFlags::kDefault:
@@ -1183,7 +1208,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = cuEventCreate(result, cuflags);
+  CUresult res = tensorflow::wrap::cuEventCreate(result, cuflags);
 
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
@@ -1197,9 +1222,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ int CUDADriver::GetDeviceCount() {
+/* static */ int GpuDriver::GetDeviceCount() {
   int device_count = 0;
-  CUresult res = cuDeviceGetCount(&device_count);
+  CUresult res = tensorflow::wrap::cuDeviceGetCount(&device_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
     return 0;
@@ -1211,11 +1236,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return device_count;
 }
 
-/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
+/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
     CUdeviceptr pointer) {
-  CudaContext* context = nullptr;
-  CUresult result =
-      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
+  GpuContext* context = nullptr;
+  CUresult result = tensorflow::wrap::cuPointerGetAttribute(
+      &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
   if (result == CUDA_SUCCESS) {
     CHECK(context != nullptr) << "success should entail non-null context";
     return context;
@@ -1227,11 +1252,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    ToString(result)));
 }
 
-/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
+/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
     CUdeviceptr pointer) {
   unsigned int value;
-  CUresult result =
-      cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+  CUresult result = tensorflow::wrap::cuPointerGetAttribute(
+      &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
   if (result == CUDA_SUCCESS) {
     switch (value) {
       case CU_MEMORYTYPE_DEVICE:
@@ -1251,10 +1276,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    ToString(result)));
 }
 
-/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
-                                                             CUdeviceptr *base,
-                                                             size_t *size) {
-  CUresult result = cuMemGetAddressRange(base, size, dptr);
+/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
+                                                            CUdeviceptr* base,
+                                                            size_t* size) {
+  CUresult result = tensorflow::wrap::cuMemGetAddressRange(base, size, dptr);
   if (result == CUDA_SUCCESS) {
     return port::Status::OK();
   } else if (result == CUDA_ERROR_NOT_FOUND) {
@@ -1273,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    reinterpret_cast<void *>(dptr), ToString(result).c_str()));
 }
 
-/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
+/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
     CUdeviceptr pointer) {
   auto result = GetPointerContext(pointer);
   if (!result.ok()) {
@@ -1283,20 +1308,40 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return DeviceFromContext(result.ValueOrDie());
 }
 
-/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
-                                                           int *cc_minor,
-                                                           CUdevice device) {
+/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
+                                                          int* cc_minor,
+                                                          CUdevice device) {
   *cc_major = 0;
   *cc_minor = 0;
-  CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device);
-  if (result == CUDA_SUCCESS) {
-    return port::Status::OK();
+
+  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+      cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+  if (res != CUDA_SUCCESS) {
+    return port::Status(
+        port::error::INTERNAL,
+        port::Printf(
+            "failed to get compute capability major for device: %s; %d",
+            ToString(res).c_str(), device));
   }
 
-  return port::Status(
+  res = tensorflow::wrap::cuDeviceGetAttribute(
+      cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
+  if (res != CUDA_SUCCESS) {
+    return port::Status(
+        port::error::INTERNAL,
+        port::Printf(
+            "failed to get compute capability minor for device: %s; %d",
+            ToString(res).c_str(), device));
+  }
+
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
+                                                      CUdevice device) {
+  return port::Status{
       port::error::INTERNAL,
-      port::Printf("failed to get compute capability for device: %s; %d",
-                   ToString(result).c_str(), device));
+      "Feature not supported on CUDA platform (GetGpuISAVersion)"};
 }
 
 // Helper function that turns the integer output of cuDeviceGetAttribute to type
@@ -1305,7 +1350,8 @@ template <typename T>
 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                                             CUdevice_attribute attribute) {
   int value = -1;
-  CUresult result = cuDeviceGetAttribute(&value, attribute, device);
+  CUresult result =
+      tensorflow::wrap::cuDeviceGetAttribute(&value, attribute, device);
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::NOT_FOUND,
@@ -1316,68 +1362,68 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return converted;
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
+/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
     CUdevice device) {
   return GetSimpleAttribute<int>(device,
                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
+/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
 }
 
-/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
-                                            CUdevice device) {
+/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
+                                           CUdevice device) {
   int value;
-  CUresult res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
     return false;
   }
   *x = value;
 
-  res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+  res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
     return false;
   }
   *y = value;
 
-  res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+  res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
     return false;
@@ -1386,8 +1432,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
-  CUresult res = cuDriverGetVersion(driver_version);
+/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
+  CUresult res = tensorflow::wrap::cuDriverGetVersion(driver_version);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query driver version: " << ToString(res);
     return false;
@@ -1396,9 +1442,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties,
-                                                  int device_ordinal) {
-  CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
+/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
+                                                 int device_ordinal) {
+  CUresult res = tensorflow::wrap::cuDeviceGetProperties(device_properties,
+                                                         device_ordinal);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query device properties: " << ToString(res);
     return false;
@@ -1407,10 +1454,11 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
+/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
     CUdevice_attribute attribute, CUdevice device) {
   int val;
-  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
+  CUresult res =
+      tensorflow::wrap::cuDeviceGetAttribute(&val, attribute, device);
   if (res != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
@@ -1420,10 +1468,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return val;
 }
 
-/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
+/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
   int value = -1;
-  CUresult res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
+  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
     return false;
@@ -1433,13 +1481,13 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
-                                                  int64 *free_out,
-                                                  int64 *total_out) {
+/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
+                                                 int64* free_out,
+                                                 int64* total_out) {
   ScopedActivateContext activation(context);
   size_t free = 0;
   size_t total = 0;
-  CUresult res = cuMemGetInfo(&free, &total);
+  CUresult res = tensorflow::wrap::cuMemGetInfo(&free, &total);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
     return false;
@@ -1450,10 +1498,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
-                                                   uint64 *result) {
+/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
+                                                  uint64* result) {
   size_t value = -1;
-  CUresult res = cuDeviceTotalMem(&value, device);
+  CUresult res = tensorflow::wrap::cuDeviceTotalMem(&value, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
     return false;
@@ -1463,12 +1511,13 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
+/* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
   string pci_bus_id;
   static const int kBufferSize = 64;
   absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
-  CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  CUresult res = tensorflow::wrap::cuDeviceGetPCIBusId(chars.begin(),
+                                                       kBufferSize - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
     return pci_bus_id;
@@ -1477,8 +1526,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return pci_bus_id;
 }
 
-/* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
-                                                  CudaContext* to) {
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
+                                                 GpuContext* to) {
   if (from == to) {
     return true;  // A context can always access its own memory.
   }
@@ -1496,7 +1545,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                << to_device.status();
     return false;
   }
-  CUresult res = cuDeviceCanAccessPeer(
+  CUresult res = tensorflow::wrap::cuDeviceCanAccessPeer(
       &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1506,14 +1555,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return can_access_peer;
 }
 
-/* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
-                                                       CudaContext* to) {
+/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
+                                                      GpuContext* to) {
   if (from == to) {
     return port::Status::OK();  // A context can always access its own memory.
   }
 
   ScopedActivateContext activated{from};
-  CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
+  CUresult result =
+      tensorflow::wrap::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
   if (result != CUDA_SUCCESS &&
       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
     return port::Status(
@@ -1525,14 +1575,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return port::Status::OK();
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
-    CudaContext* context, CUfunction kernel, int threads_per_block,
+/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
+    GpuContext* context, CUfunction kernel, int threads_per_block,
     size_t dynamic_shared_memory_bytes) {
   ScopedActivateContext activation(context);
 
   int max_blocks;
-  CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
+  CUresult result =
+      tensorflow::wrap::cuOccupancyMaxActiveBlocksPerMultiprocessor(
+          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
@@ -1543,11 +1594,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return max_blocks;
 }
 
-/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
+}  // namespace gpu
+
+namespace cuda {
+
+CUcontext CurrentContextOrDie() {
   CUcontext current = nullptr;
-  CUresult result = cuCtxGetCurrent(&current);
+  CUresult result = tensorflow::wrap::cuCtxGetCurrent(&current);
   if (result != CUDA_SUCCESS) {
-    LOG(FATAL) << "failed to query current context: " << ToString(result);
+    LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
   }
   return current;
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 447422739d25c2613c594f5f905658cd1fc27f02..5bbe6f6e627e8b4e217345b0e014e95c08df2fb0 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -18,505 +18,45 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
 
-#include <stddef.h>
-#include "tensorflow/stream_executor/platform/port.h"
-
-#include "tensorflow/stream_executor/device_options.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 
 namespace stream_executor {
-namespace cuda {
-
-// Identifies the memory space where an allocation resides. See
-// CUDADriver::GetPointerMemorySpace().
-enum class MemorySpace { kHost, kDevice };
-
-// Returns a casual string, such as "host" for the provided memory space.
-string MemorySpaceString(MemorySpace memory_space);
-
-class CudaContext;
-
-// CUDADriver contains wrappers for calls to the userspace library driver. It's
-// useful to isolate these calls and put basic wrappers around them to separate
-// userspace library driver behaviors from the rest of the program.
-//
-// At the moment it's simply used as a namespace.
-//
-// The calls log any specific errors internally and return whether the operation
-// was successful to the caller.
-//
-// The order of parameters is generally kept symmetric with the underlying CUDA
-// driver API.
-//
-// Links on functions are to specific documentation under
-// http://docs.nvidia.com/cuda/cuda-driver-api/
-//
-// Thread safety: these functions should not be used from signal handlers.
-class CUDADriver {
- public:
-  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
-  // the case of failure. Safe to call multiple times; will be fast on all calls
-  // after the first.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
-  static port::Status Init();
-
-  // Returns the device associated with the given context.
-  // device is an outparam owned by the caller, must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
-  static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);
-
-  // Creates a new CUDA stream associated with the given context via
-  // cuStreamCreate.
-  // stream is an outparam owned by the caller, must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
-  static bool CreateStream(CudaContext* context, CUstream *stream);
-
-  // Destroys a CUDA stream associated with the given context.
-  // stream is owned by the caller, must not be null, and *stream is set to null
-  // if the stream is successfully destroyed.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
-  static void DestroyStream(CudaContext* context, CUstream *stream);
-
-  // CUDA events can explicitly disable event TSC retrieval for some presumed
-  // performance improvement if timing is unnecessary.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  enum class EventFlags { kDefault, kDisableTiming };
-
-  // Creates a new event associated with the given context.
-  // result is an outparam owned by the caller and must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  static port::Status CreateEvent(CudaContext* context, CUevent *result,
-                                  EventFlags flags);
-
-  // Destroys *event and turns it into a nullptr. event may not be null, but
-  // *event may be, via cuEventDestroy
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
-  static port::Status DestroyEvent(CudaContext* context, CUevent *event);
-
-  // Allocates a GPU memory space of size bytes associated with the given
-  // context via cuMemAlloc.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
-  static void *DeviceAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a GPU memory space of size bytes associated with the given
-  // context via cuMemFree.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
-  static void DeviceDeallocate(CudaContext* context, void *location);
-
-  // Allocates a unified memory space of size bytes associated with the given
-  // context via cuMemAllocManaged.
-  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
-  static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a unified memory space of size bytes associated with the given
-  // context via cuMemFree.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
-  static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
-
-  // Allocates page-locked and CUDA-registered memory on the host via
-  // cuMemAllocHost.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
-  static void *HostAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
-  static void HostDeallocate(CudaContext* context, void *location);
-
-  // Registers a memory region at location of size bytes via cuMemHostRegister.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
-  static bool HostRegister(CudaContext* context, void *location, uint64 bytes);
-
-  // Unregisters a memory region that was previously registered at location via
-  // cuMemHostUnregister.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
-  //
-  // TODO(leary) verify an error will be returned if the location wasn't
-  // previously registered.
-  static bool HostUnregister(CudaContext* context, void *location);
-
-  // Given a device ordinal, returns a device handle into the device outparam,
-  // which must not be null.
-  //
-  // N.B. these device handles do not have a corresponding destroy function in
-  // the CUDA driver API.
-  static port::Status GetDevice(int device_ordinal, CUdevice *device);
-
-  // Given a device handle, returns the name reported by the driver for the
-  // device.
-  static bool GetDeviceName(CUdevice device, string *name_out);
-
-  // Given a device to create a context for, returns a context handle into the
-  // context outparam, which must not be null.
-  //
-  // N.B. CUDA contexts are weird. They are implicitly associated with the
-  // calling thread. Current documentation on contexts and their influence on
-  // userspace processes is given here:
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
-  static port::Status CreateContext(CUdevice device,
-                                    const DeviceOptions& device_options,
-                                    CudaContext** context);
-
-  // Destroys the provided context via cuCtxDestroy.
-  // Don't do this while clients could still be using the context, per the docs
-  // bad things will happen.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
-  static void DestroyContext(CudaContext* context);
-
-  // Queries the runtime for the specified attribute of the specified function.
-  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
-  // in terms of integer-sized values, so there's no potential for overrun (as
-  // of CUDA 5.5).
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
-  static bool FuncGetAttribute(CUfunction_attribute attribute,
-                               CUfunction function, int *attribute_value);
-
-  // Sets the preferred cache configuration for the specified function.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
-  static bool FuncSetCacheConfig(CUfunction function,
-                                 CUfunc_cache cache_config);
-
-  // Gets the preferred shared memory bank configuration for the specified
-  // CONTEXT (not function!), either default or four- or eight-byte bank size.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
-  static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
-      CudaContext* context);
-
-  // Sets the preferred shared memory bank configuration for the specified
-  // CONTEXT (not function!), either default or four- or eight-byte bank size.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
-  static port::Status ContextSetSharedMemConfig(
-      CudaContext* context, CUsharedconfig shared_mem_config);
-
-  // Launches a CUDA kernel via cuLaunchKernel.
-  // TODO(leary) describe the structure of kernel_params and extra in a readable
-  // way.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
-  static bool LaunchKernel(CudaContext* context, CUfunction function,
-                           unsigned int grid_dim_x, unsigned int grid_dim_y,
-                           unsigned int grid_dim_z, unsigned int block_dim_x,
-                           unsigned int block_dim_y, unsigned int block_dim_z,
-                           unsigned int shared_mem_bytes, CUstream stream,
-                           void **kernel_params, void **extra);
-
-  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
-  // handle in "module". Any error logs that are produced are logged internally.
-  static bool LoadPtx(CudaContext* context, const char *ptx_contents,
-                      CUmodule *module);
-
-  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
-  // the resulting handle in "module".
-  static port::Status LoadCubin(CudaContext* context, const char *cubin_bytes,
-                                CUmodule *module);
-
-  // Retrieves a named kernel from a loaded module, and places the resulting
-  // handle into function (outparam) on success. Neither kernel_name nor
-  // function may be null. No ownership is taken of kernel_name.
-  static bool GetModuleFunction(CudaContext* context, CUmodule module,
-                                const char *kernel_name, CUfunction *function);
-
-  // Retrieves a named global/constant symbol from a loaded module, and returns
-  // a device pointer and size of the symbol on success. symbol_name may not be
-  // null. At least one of dptr or bytes should not be null. No ownership is
-  // taken of symbol_name.
-  static bool GetModuleSymbol(CudaContext* context, CUmodule module,
-                              const char *symbol_name, CUdeviceptr *dptr,
-                              size_t *bytes);
-
-  // Unloads module from the current context via cuModuleUnload.
-  // TODO(leary) the documentation doesn't say what kind of disasters happen
-  // if you try to unload a module while its CUfunctions are in use.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
-  static void UnloadModule(CudaContext* context, CUmodule module);
-
-  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
-  static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
-                                     uint8 value, size_t size);
-
-  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
-  static bool SynchronousMemsetUint32(CudaContext* context,
-                                      CUdeviceptr location, uint32 value,
-                                      size_t uint32_count);
-
-  // Performs an asynchronous memset of the device memory segment via
-  // cuMemsetD8Async.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
-  static bool AsynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
-                                      uint8 value, size_t uint32_count,
-                                      CUstream stream);
-
-  // Performs an asynchronous memset of the device memory segment via
-  // cuMemsetD32Async.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
-  static bool AsynchronousMemsetUint32(CudaContext* context,
-                                       CUdeviceptr location, uint32 value,
-                                       size_t uint32_count, CUstream stream);
-
-  // -- Synchronous memcopies.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
-
-  static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
-                                           CUdeviceptr gpu_src, uint64 size);
-  static port::Status SynchronousMemcpyH2D(CudaContext* context,
-                                           CUdeviceptr gpu_dst,
-                                           const void* host_src, uint64 size);
-  static port::Status SynchronousMemcpyD2D(CudaContext* context,
-                                           CUdeviceptr gpu_dst,
-                                           CUdeviceptr gpu_src, uint64 size);
-
-  // -- Asynchronous memcopies.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
-
-  static bool AsynchronousMemcpyD2H(CudaContext* context, void *host_dst,
-                                    CUdeviceptr gpu_src, uint64 size,
-                                    CUstream stream);
-  static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                    const void *host_src, uint64 size,
-                                    CUstream stream);
-  static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                    CUdeviceptr gpu_src, uint64 size,
-                                    CUstream stream);
-
-  // The CUDA stream callback type signature.
-  // The data passed to AddStreamCallback is subsequently passed to this
-  // callback when it fires.
-  //
-  // Some notable things:
-  // * Callbacks must not make any CUDA API calls.
-  // * Callbacks from independent streams execute in an undefined order and may
-  //   be serialized.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
-  typedef void (*StreamCallback)(CUstream stream, CUresult status, void *data);
-
-  // Enqueues a callback operation into stream.
-  // See StreamCallback above and the NVIDIA documentation for additional
-  // details.
-  static bool AddStreamCallback(CudaContext* context, CUstream stream,
-                                StreamCallback callback, void *data);
-
-  // Causes stream to wait for event to trigger before proceeding via
-  // cuStreamWaitEvent.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
-  static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
-                                CUevent event);
-
-  // Blocks the calling thread until the operations enqueued onto stream have
-  // been completed, via cuStreamSynchronize.
-  //
-  // TODO(leary) if a pathological thread enqueues operations onto the stream
-  // while another thread blocks like this, can you wind up waiting an unbounded
-  // amount of time?
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
-  static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
-
-  // Blocks the calling thread until the operations associated with the context
-  // have been completed, via cuCtxSynchronize.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
-  static bool SynchronizeContext(CudaContext* context);
-
-  // Returns true if all stream tasks have completed at time of the call. Note
-  // the potential for races around this call (if another thread adds work to
-  // the stream immediately after this returns).
-  static bool IsStreamIdle(CudaContext* context, CUstream stream);
-
-  // Returns whether code in the from context can access memory in the to
-  // context via cuDeviceCanAccessPeer.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
-  static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);
-
-  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
-  static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);
-
-  // Returns the elapsed milliseconds between start and stop via
-  // cuEventElapsedTime.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
-  static bool GetEventElapsedTime(CudaContext* context,
-                                  float *elapsed_milliseconds, CUevent start,
-                                  CUevent stop);
-
-  // Records that an event occurred when execution reaches the current point in
-  // thestream via cuEventRecord.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
-  static port::Status RecordEvent(CudaContext* context, CUevent event,
-                                  CUstream stream);
-
-  // Polls (without blocking) to determine the status of an event - pending or
-  // complete (or an error status).
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
-  static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
-                                             CUevent event);
-
-  // -- Pointer-specific calls.
-
-  // Returns the context in which pointer was allocated or registered.
-  static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);
-
-  // Returns the device associated with the context from GetPointerContext().
-  static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
-
-  // Returns the memory space addressed by pointer.
-  static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
-
-  // Returns the base address and size of the device pointer dptr.
-  static port::Status GetPointerAddressRange(CUdeviceptr dptr,
-                                             CUdeviceptr *base, size_t *size);
-
-  // -- Device-specific calls.
-
-  // Returns the compute capability for the device; i.e (3, 5).
-  // This is currently done via the deprecated device API.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
-  static port::Status GetComputeCapability(int *cc_major, int *cc_minor,
-                                           CUdevice device);
-
-  // Returns the number of multiprocessors on the device (note that the device
-  // may be multi-GPU-per-board).
-  static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
-
-  // Returns the limit on number of threads that can be resident in a single
-  // multiprocessor.
-  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
-
-  // Returns the limit on number of threads which may be resident for a single
-  // block (cooperative thread array).
-  static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
-
-  // Returns the amount of shared memory available on a single GPU core (i.e.
-  // SM on NVIDIA devices).
-  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
-
-  // Returns the amount of shared memory available for a single block
-  // (cooperative thread array).
-  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
-
-  // Returns the maximum supported number of registers per block.
-  static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
-
-  // Returns the number of threads per warp.
-  static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
-
-  // Queries the grid limits for device with cuDeviceGetAttribute calls.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static bool GetGridLimits(int *x, int *y, int *z, CUdevice device);
-
-  // Returns a grab-bag of device properties in a caller-owned device_properties
-  // structure for device_ordinal via cuDeviceGetProperties.
-  //
-  // This call is deprecated in the NVIDIA driver API; its replacement is
-  // GetDeviceAttribute
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
-  static bool GetDeviceProperties(CUdevprop *device_properties,
-                                  int device_ordinal);
-
-  // Gets a specific integer-valued property about the given device.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
-                                                CUdevice device);
-
-  // Returns whether ECC is enabled for the given CUdevice via
-  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static bool IsEccEnabled(CUdevice device, bool *result);
-
-  // Returns the total amount of memory available for allocation by the CUDA
-  // context, in bytes, via cuDeviceTotalMem.
-  static bool GetDeviceTotalMemory(CUdevice device, uint64 *result);
-
-  // Returns the free amount of memory and total amount of memory, as reported
-  // by cuMemGetInfo.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
-  static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
-                                  int64* total);
-
-  // Returns a PCI bus id string for the device.
-  // [domain]:[bus]:[device].[function]
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
-  static string GetPCIBusID(CUdevice device);
-
-  // -- Context- and device-independent calls.
-
-  // Returns the number of visible CUDA device via cuDeviceGetCount.
-  // This should correspond to the set of device ordinals available.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
-  static int GetDeviceCount();
-
-  // Returns the driver version number via cuDriverGetVersion.
-  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
-  // instead, the CUDA toolkit release number that this driver is compatible
-  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
-  // compatible driver).
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
-  static bool GetDriverVersion(int *driver_version);
-
-  // -- Other calls
-
-  // Returns the maximum number of blocks (per multiprocessor) occupied by the
-  // specified kernel/CUfunction when launched with the specified parameters.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
-  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
-      CudaContext* context, CUfunction kernel, int threads_per_block,
-      size_t dynamic_shared_memory_bytes);
-
-  // Returns the current context set in CUDA. This is done by calling the cuda
-  // driver (e.g., this value is not our cached view of the current context).
-  static CUcontext CurrentContextOrDie();
-
-  // Seam for injecting an error at CUDA initialization time for testing
-  // purposes.
-  static bool driver_inject_init_error_;
-};
-
-// Ensures a context is activated within a scope.
-class ScopedActivateContext {
- public:
-  // Activates the context via cuCtxSetCurrent, if it is not the currently
-  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
-  // mechanism is said by NVIDIA to be relatively slow and deprecated.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
-  explicit ScopedActivateContext(CudaContext* context);
-
-  // Checks that the context has remained activated for the duration of the
-  // scope.
-  ~ScopedActivateContext();
-
- private:
-  CudaContext* to_restore_ = nullptr;
-};
-
-// CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
+namespace gpu {
+// CUDAContext wraps a cuda CUcontext handle, and includes a unique id. The
 // unique id is positive, and ids are not repeated within the process.
-class CudaContext {
+class GpuContext {
  public:
-  CudaContext(CUcontext context, int64 id) : context_(context), id_(id) { }
+  GpuContext(CUcontext context, int64 id) : context_(context), id_(id) {}
 
   CUcontext context() const { return context_; }
   int64 id() const { return id_; }
 
   // Disallow copying and moving.
-  CudaContext(CudaContext&&) = delete;
-  CudaContext(const CudaContext&) = delete;
-  CudaContext& operator=(CudaContext&&) = delete;
-  CudaContext& operator=(const CudaContext&) = delete;
+  GpuContext(GpuContext&&) = delete;
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(GpuContext&&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
 
  private:
   CUcontext const context_;
   const int64 id_;
 };
 
-inline CUcontext CurrentContextOrDie() {
-  return CUDADriver::CurrentContextOrDie();
-}
+}  // namespace gpu
+
+namespace cuda {
+
+using MemorySpace = gpu::MemorySpace;
+
+using CUDADriver = gpu::GpuDriver;
+
+using ScopedActivateContext = gpu::ScopedActivateContext;
+
+using CudaContext = gpu::GpuContext;
+
+// Returns the current context set in CUDA. This is done by calling the cuda
+// driver (e.g., this value is not our cached view of the current context).
+CUcontext CurrentContextOrDie();
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..657eea81716e6cbf5f158ab29bf6bd9149d46403
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
@@ -0,0 +1,144 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps cuda driver calls with dso loader so that we don't need to
+// have explicit linking to libcuda. All TF cuda driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
+
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                       \
+  template <typename... Args>                                              \
+  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) { \
+    return ::cudaSymbolName(args...);                                      \
+  }
+
+// This macro wraps a global identifier, given by cudaSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                        \
+  template <typename... Args>                                               \
+  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) {  \
+    using FuncPtrT = std::add_pointer<decltype(::cudaSymbolName)>::type;    \
+    static FuncPtrT loaded = []() -> FuncPtrT {                             \
+      static const char *kName = TO_STR(cudaSymbolName);                    \
+      void *f;                                                              \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
+          stream_executor::internal::CachedDsoLoader::GetLibcudaDsoHandle() \
+              .ValueOrDie(),                                                \
+          kName, &f);                                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in libcuda DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                                 \
+    }();                                                                    \
+    return loaded(args...);                                                 \
+  }
+#endif
+
+// clang-format off
+#define LIBCUDA_ROUTINE_EACH(__macro)                   \
+  __macro(cuCtxEnablePeerAccess)                        \
+  __macro(cuCtxGetCurrent)                              \
+  __macro(cuCtxGetDevice)                               \
+  __macro(cuCtxGetSharedMemConfig)                      \
+  __macro(cuCtxSetCurrent)                              \
+  __macro(cuCtxSetSharedMemConfig)                      \
+  __macro(cuCtxSynchronize)                             \
+  __macro(cuDeviceCanAccessPeer)                        \
+  __macro(cuDeviceGet)                                  \
+  __macro(cuDeviceGetAttribute)                         \
+  __macro(cuDeviceGetCount)                             \
+  __macro(cuDeviceGetName)                              \
+  __macro(cuDeviceGetPCIBusId)                          \
+  __macro(cuDeviceGetProperties)                        \
+  __macro(cuDevicePrimaryCtxGetState)                   \
+  __macro(cuDevicePrimaryCtxRelease)                    \
+  __macro(cuDevicePrimaryCtxRetain)                     \
+  __macro(cuDevicePrimaryCtxSetFlags)                   \
+  __macro(cuDeviceTotalMem)                             \
+  __macro(cuDriverGetVersion)                           \
+  __macro(cuEventCreate)                                \
+  __macro(cuEventDestroy)                               \
+  __macro(cuEventElapsedTime)                           \
+  __macro(cuEventQuery)                                 \
+  __macro(cuEventRecord)                                \
+  __macro(cuEventSynchronize)                           \
+  __macro(cuFuncGetAttribute)                           \
+  __macro(cuFuncSetCacheConfig)                         \
+  __macro(cuGetErrorName)                               \
+  __macro(cuGetErrorString)                             \
+  __macro(cuInit)                                       \
+  __macro(cuLaunchKernel)                               \
+  __macro(cuMemAlloc)                                   \
+  __macro(cuMemAllocManaged)                            \
+  __macro(cuMemFree)                                    \
+  __macro(cuMemFreeHost)                                \
+  __macro(cuMemGetAddressRange)                         \
+  __macro(cuMemGetInfo)                                 \
+  __macro(cuMemHostAlloc)                               \
+  __macro(cuMemHostRegister)                            \
+  __macro(cuMemHostUnregister)                          \
+  __macro(cuMemcpyDtoD)                                 \
+  __macro(cuMemcpyDtoDAsync)                            \
+  __macro(cuMemcpyDtoH)                                 \
+  __macro(cuMemcpyDtoHAsync)                            \
+  __macro(cuMemcpyHtoD)                                 \
+  __macro(cuMemcpyHtoDAsync)                            \
+  __macro(cuMemsetD32)                                  \
+  __macro(cuMemsetD32Async)                             \
+  __macro(cuMemsetD8)                                   \
+  __macro(cuMemsetD8Async)                              \
+  __macro(cuModuleGetFunction)                          \
+  __macro(cuModuleGetGlobal)                            \
+  __macro(cuModuleLoadDataEx)                           \
+  __macro(cuModuleLoadFatBinary)                        \
+  __macro(cuModuleUnload)                               \
+  __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor)  \
+  __macro(cuOccupancyMaxPotentialBlockSize)             \
+  __macro(cuPointerGetAttribute)                        \
+  __macro(cuStreamAddCallback)                          \
+  __macro(cuStreamCreate)                               \
+  __macro(cuStreamDestroy)                              \
+  __macro(cuStreamQuery)                                \
+  __macro(cuStreamSynchronize)                          \
+  __macro(cuStreamWaitEvent)
+
+// clang-format on
+
+LIBCUDA_ROUTINE_EACH(STREAM_EXECUTOR_LIBCUDA_WRAP)
+#undef LIBCUDA_ROUTINE_EACH
+#undef STREAM_EXECUTOR_LIBCUDA_WRAP
+#undef TO_STR
+#undef TO_STR_
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc
index 96dcf173566087db475e3b237591d19f06128d92..fd9d4741e01082ee46c9f1ba77a089ee2cc8fad5 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.cc
+++ b/tensorflow/stream_executor/cuda/cuda_event.cc
@@ -20,30 +20,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/statusor.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-CUDAEvent::CUDAEvent(CUDAExecutor* parent)
-    : parent_(parent), cuda_event_(nullptr) {}
-
-CUDAEvent::~CUDAEvent() {}
-
-port::Status CUDAEvent::Init() {
-  return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_,
-                                 CUDADriver::EventFlags::kDisableTiming);
-}
-
-port::Status CUDAEvent::Destroy() {
-  return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_);
-}
-
-port::Status CUDAEvent::Record(CUDAStream* stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_,
-                                 stream->cuda_stream());
-}
-
-Event::Status CUDAEvent::PollForStatus() {
+Event::Status GpuEvent::PollForStatus() {
   port::StatusOr<CUresult> status =
-      CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_);
+      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
   if (!status.ok()) {
     LOG(ERROR) << "Error polling for event status: "
                << status.status().error_message();
@@ -62,9 +43,5 @@ Event::Status CUDAEvent::PollForStatus() {
   }
 }
 
-const CUevent& CUDAEvent::cuda_event() {
-  return cuda_event_;
-}
-
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index f62344672ed624f1ed60b5452d33b6f8273f2b47..e3596e0261acc1f6225c610db33dbbcdc38fd7e4 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -16,45 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
-#include "tensorflow/stream_executor/event.h"
-#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// CUDAEvent wraps a CUevent in the platform-independent EventInterface
-// interface.
-class CUDAEvent : public internal::EventInterface {
- public:
-  explicit CUDAEvent(CUDAExecutor* parent);
-
-  ~CUDAEvent() override;
-
-  // Populates the CUDA-platform-specific elements of this object.
-  port::Status Init();
-
-  // Deallocates any platform-specific elements of this object. This is broken
-  // out (not part of the destructor) to allow for error reporting.
-  port::Status Destroy();
-
-  // Inserts the event at the current position into the specified stream.
-  port::Status Record(CUDAStream* stream);
-
-  // Polls the CUDA platform for the event's current status.
-  Event::Status PollForStatus();
-
-  // The underlying CUDA event element.
-  const CUevent& cuda_event();
-
- private:
-  // The Executor used to which this object and CUevent are bound.
-  CUDAExecutor* parent_;
-
-  // The underlying CUDA event element.
-  CUevent cuda_event_;
-};
+using CUDAEvent = gpu::GpuEvent;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index acac7d6368885537b1f5727779388d550680e90d..054b43b5b7a39702ce22891028f547b39c778fc1 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -23,21 +23,17 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
 
@@ -49,13 +45,13 @@ namespace wrap {
 // manner on first use. This dynamic loading technique is used to avoid DSO
 // dependencies on vendor libraries which may or may not be available in the
 // deployed binary environment.
-#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                       \
-  struct WrapperShim__##__name {                                 \
-    template <typename... Args>                                  \
-    cufftResult operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};           \
-      return ::__name(args...);                                  \
-    }                                                            \
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                      \
+  struct WrapperShim__##__name {                                \
+    template <typename... Args>                                 \
+    cufftResult operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};           \
+      return ::__name(args...);                                 \
+    }                                                           \
   } __name;
 
 #else
@@ -81,8 +77,8 @@ namespace wrap {
       return f;                                                           \
     }                                                                     \
     template <typename... Args>                                           \
-    cufftResult operator()(CUDAExecutor *parent, Args... args) {          \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
+    cufftResult operator()(GpuExecutor *parent, Args... args) {           \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                          \
     }                                                                     \
   } __name;                                                               \
@@ -149,8 +145,8 @@ cufftType CUDAFftType(fft::Type type) {
 }
 
 // Associates the given stream with the given cuFFT plan.
-bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
-  auto ret = wrap::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
+bool SetStream(GpuExecutor *parent, cufftHandle plan, Stream *stream) {
+  auto ret = wrap::cufftSetStream(parent, plan, AsGpuStreamValue(stream));
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
     return false;
@@ -161,7 +157,7 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
 }  // namespace
 
 port::Status CUDAFftPlan::Initialize(
-    CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
     uint64 *input_embed, uint64 input_stride, uint64 input_distance,
     uint64 *output_embed, uint64 output_stride, uint64 output_distance,
     fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
@@ -321,7 +317,7 @@ port::Status CUDAFftPlan::Initialize(
   return port::Status::OK();
 }
 
-port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
+port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
                                      int rank, uint64 *elem_count,
                                      fft::Type type,
                                      ScratchAllocator *scratch_allocator) {
@@ -553,8 +549,8 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
   }
 
   auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
-                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
-                       CUDAComplex(CUDAMemoryMutable(output)));
+                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                       GpuComplex(GpuMemoryMutable(output)));
 
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine: " << ret;
@@ -580,8 +576,8 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
   }
 
   auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
-                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
-                       CUDAComplex(CUDAMemoryMutable(output)),
+                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                       GpuComplex(GpuMemoryMutable(output)),
                        cuda_fft_plan->GetFftDirection());
 
   if (ret != CUFFT_SUCCESS) {
@@ -618,22 +614,22 @@ STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
 
 #undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cufft() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
-          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
+          cuda::kCudaPlatformId, gpu::kCuFftPlugin, "cuFFT",
           [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            gpu::GpuExecutor *cuda_executor =
+                dynamic_cast<gpu::GpuExecutor *>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
                          << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
-            return new cuda::CUDAFft(cuda_executor);
+            return new gpu::CUDAFft(cuda_executor);
           });
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuFFT factory: "
@@ -641,7 +637,7 @@ void initialize_cufft() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
+      cuda::kCudaPlatformId, PluginKind::kFft, gpu::kCuFftPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 8171e61418a3185455e50ee76315eb2493c36c01..0f3baeab6fa8b26b18c22854e8c95aadbb02f1ba 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -30,9 +30,9 @@ namespace stream_executor {
 
 class Stream;
 
-namespace cuda {
+namespace gpu {
 
-class CUDAExecutor;
+class GpuExecutor;
 
 // Opaque and unique indentifier for the cuFFT plugin.
 extern const PluginId kCuFftPlugin;
@@ -64,17 +64,17 @@ class CUDAFftPlan : public fft::Plan {
   }
 
   // Initialize function for batched plan
-  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
-                          uint64 *elem_count, uint64 *input_embed,
+  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
+                          uint64* elem_count, uint64* input_embed,
                           uint64 input_stride, uint64 input_distance,
-                          uint64 *output_embed, uint64 output_stride,
+                          uint64* output_embed, uint64 output_stride,
                           uint64 output_distance, fft::Type type,
-                          int batch_count, ScratchAllocator *scratch_allocator);
+                          int batch_count, ScratchAllocator* scratch_allocator);
 
   // Initialize function for 1d,2d, and 3d plan
-  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
-                          uint64 *elem_count, fft::Type type,
-                          ScratchAllocator *scratch_allocator);
+  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
+                          uint64* elem_count, fft::Type type,
+                          ScratchAllocator* scratch_allocator);
 
   port::Status UpdateScratchAllocator(Stream *stream,
                                       ScratchAllocator *scratch_allocator);
@@ -83,7 +83,7 @@ class CUDAFftPlan : public fft::Plan {
   bool IsInitialized() const { return is_initialized_; }
 
  private:
-  CUDAExecutor *parent_;
+  GpuExecutor* parent_;
   cufftHandle plan_;
   fft::Type fft_type_;
   DeviceMemory<uint8> scratch_;
@@ -96,7 +96,7 @@ class CUDAFftPlan : public fft::Plan {
 // This satisfies the platform-agnostic FftSupport interface.
 //
 // Note that the cuFFT handle that this encapsulates is implicitly tied to the
-// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// context (and, as a result, the device) that the parent GpuExecutor is tied
 // to. This simply happens as an artifact of creating the cuFFT handle when a
 // CUDA context is active.
 //
@@ -104,13 +104,13 @@ class CUDAFftPlan : public fft::Plan {
 // context of parent_, so all context is explicit.
 class CUDAFft : public fft::FftSupport {
  public:
-  explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {}
+  explicit CUDAFft(GpuExecutor* parent) : parent_(parent) {}
   ~CUDAFft() override {}
 
   TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
 
  private:
-  CUDAExecutor *parent_;
+  GpuExecutor* parent_;
 
   // Two helper functions that execute dynload::cufftExec?2?.
 
@@ -131,7 +131,7 @@ class CUDAFft : public fft::FftSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4874d096ad54fa352fd6e9ad3b7b87c1fff59f73..420f2591b82f48f0ff2bb713aca3e000083b4774 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #include "tensorflow/stream_executor/cuda/cuda_event.h"
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
@@ -53,6 +54,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/timer.h"
 
+// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
+// always unwanted. This happens on Windows that defines such a macro.
+#undef ERROR
+
 #ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
 #error \
     "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
@@ -67,7 +72,7 @@ extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 // Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
 // It has been observed that loading both PTX and cubins into the driver library
@@ -79,17 +84,16 @@ namespace cuda {
 // variable with extern linkage and populate it from another translation unit.
 std::function<string(const string &)> g_cubinate;
 
-static CUDAEvent *AsCUDAEvent(Event *event) {
+static GpuEvent* AsGpuEvent(Event* event) {
   DCHECK(event != nullptr);
-  return static_cast<CUDAEvent *>(event->implementation());
+  return static_cast<GpuEvent*>(event->implementation());
 }
 
-
 // Given a platform-independent timer datatype, returns the internal CUDA
 // platform implementation pointer.
-static CUDATimer *AsCUDATimer(Timer *timer) {
+static GpuTimer* AsGpuTimer(Timer* timer) {
   DCHECK(timer != nullptr);
-  return static_cast<CUDATimer *>(timer->implementation());
+  return static_cast<GpuTimer*>(timer->implementation());
 }
 
 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
@@ -107,48 +111,49 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
   return AsCudaDevicePtr(*gpu_mem);
 }
 
-CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
+GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
   CHECK(cuda_exec != nullptr);
-  return cuda_exec->cuda_context();
+  return cuda_exec->gpu_context();
 }
 
-CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
-  return static_cast<CUDAExecutor *>(stream_exec->implementation());
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
+  return static_cast<GpuExecutor*>(stream_exec->implementation());
 }
 
-CUDAExecutor::~CUDAExecutor() {
-  CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
-  CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
+GpuExecutor::~GpuExecutor() {
+  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
+  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
   if (context_ != nullptr) {
-    CUDADriver::DestroyContext(context_);
+    GpuDriver::DestroyContext(context_);
   }
 }
 
-port::Status CUDAExecutor::Init(int device_ordinal,
-                                DeviceOptions device_options) {
+port::Status GpuExecutor::Init(int device_ordinal,
+                               DeviceOptions device_options) {
   device_ordinal_ = device_ordinal;
 
-  auto status = CUDADriver::Init();
+  auto status = GpuDriver::Init();
   if (!status.ok()) {
     return status;
   }
 
-  status = CUDADriver::GetDevice(device_ordinal_, &device_);
+  status = GpuDriver::GetDevice(device_ordinal_, &device_);
   if (!status.ok()) {
     return status;
   }
 
-  status = CUDADriver::CreateContext(device_, device_options, &context_);
+  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
+                                    &context_);
   if (!status.ok()) {
     return status;
   }
 
-  return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
+  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
 }
 
-bool CUDAExecutor::FindOnDiskForComputeCapability(
+bool GpuExecutor::FindOnDiskForComputeCapability(
     absl::string_view filename, absl::string_view canonical_suffix,
-    string *found_filename) const {
+    string* found_filename) const {
   if (cc_major_ == 0 && cc_minor_ == 0) {
     return false;
   }
@@ -172,6 +177,13 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
   return false;
 }
 
+bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
+                                          absl::string_view canonical_suffix,
+                                          string* found_filename) const {
+  LOG(ERROR)
+      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
+  return false;
+}
 // Returns the path to the running executable.
 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
 // Arg: strip_exe: if true, remove the name of the executable itself from the
@@ -206,12 +218,12 @@ static string GetBinaryDir(bool strip_exe) {
   return exe_path;
 }
 
-bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
   uint64_t module_refcount;
   std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
 
   if (*module == nullptr) {
-    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+    auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
     if (!load_status.ok()) {
       LOG(ERROR) << "failed to load CUBIN: " << load_status;
       return false;
@@ -228,12 +240,12 @@ bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
   return true;
 }
 
-bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
   uint64_t module_refcount;
   std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
 
   if (*module == nullptr) {
-    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+    if (!GpuDriver::LoadPtx(context_, ptx, module)) {
       return false;
     }
     VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
@@ -248,9 +260,14 @@ bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
   return true;
 }
 
-bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
-                             KernelBase *kernel) {
-  CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
+bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
+  LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
+  return false;
+}
+
+bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
+                            KernelBase* kernel) {
+  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
   CUmodule module;
   const string *kernelname;
 
@@ -290,8 +307,8 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
     return false;
   }
   VLOG(2) << "getting function " << *kernelname << " from module " << module;
-  if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
-                                     cuda_kernel->cuda_function_ptr())) {
+  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
+                                    cuda_kernel->gpu_function_ptr())) {
     return false;
   }
 
@@ -308,7 +325,7 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   return true;
 }
 
-bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
   auto module_it = gpu_binary_to_module_.find(gpu_binary);
   if (gpu_binary_to_module_.end() == module_it) {
     VLOG(3) << "No loaded CUDA module for " << gpu_binary;
@@ -319,13 +336,13 @@ bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
   VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
   if (--refcount == 0) {
     VLOG(3) << "Unloading CUDA module " << module;
-    CUDADriver::UnloadModule(context_, module);
+    GpuDriver::UnloadModule(context_, module);
     gpu_binary_to_module_.erase(module_it);
   }
   return true;
 }
 
-void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
+void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
 
   mutex_lock lock{in_memory_modules_mu_};
@@ -341,9 +358,9 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   kernel_to_gpu_binary_.erase(gpu_binary_it);
 }
 
-bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
-                              ModuleHandle *module_handle) {
-  // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
+bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
+                             ModuleHandle* module_handle) {
+  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
   // ModuleHandle::id().
   CUmodule cu_module;
   if (spec.has_cuda_cubin_in_memory()) {
@@ -377,25 +394,23 @@ bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
   return false;
 }
 
-bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
   const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
   mutex_lock lock{in_memory_modules_mu_};
   return UnloadGpuBinary(gpu_binary);
 }
 
-bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
-                                     KernelMetadata *kernel_metadata) {
+bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
+                                    KernelMetadata* kernel_metadata) {
   int value;
-  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
-                                    *cuda_kernel->cuda_function_ptr(),
-                                    &value)) {
+  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
+                                   *cuda_kernel->gpu_function_ptr(), &value)) {
     return false;
   }
   kernel_metadata->set_registers_per_thread(value);
 
-  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                                    *cuda_kernel->cuda_function_ptr(),
-                                    &value)) {
+  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                                   *cuda_kernel->gpu_function_ptr(), &value)) {
     return false;
   }
   kernel_metadata->set_shared_memory_bytes(value);
@@ -403,13 +418,13 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
   return true;
 }
 
-bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
-                          const BlockDim &block_dims, const KernelBase &kernel,
-                          const KernelArgsArrayBase &args) {
+bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims, const KernelBase& kernel,
+                         const KernelArgsArrayBase& args) {
   CHECK_EQ(kernel.Arity(), args.number_of_arguments());
-  CUstream custream = AsCUDAStreamValue(stream);
-  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
-  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+  CUstream custream = AsGpuStreamValue(stream);
+  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
 
   // Only perform/print the occupancy check once.  Even just checking to see
   // whether we've done an occupancy check on this kernel before isn't free
@@ -426,16 +441,16 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 
   if (cuda_kernel->GetPreferredCacheConfig() !=
       KernelCacheConfig::kNoPreference) {
-    CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
+    GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
   }
 
   void **kernel_params = const_cast<void **>(args.argument_addresses().data());
 
-  if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
-                                block_dims.z, thread_dims.x, thread_dims.y,
-                                thread_dims.z, args.number_of_shared_bytes(),
-                                custream, kernel_params,
-                                nullptr /* = extra */)) {
+  if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
+                               block_dims.z, thread_dims.x, thread_dims.y,
+                               thread_dims.z, args.number_of_shared_bytes(),
+                               custream, kernel_params,
+                               nullptr /* = extra */)) {
     LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
                << args.number_of_arguments()
                << " args; thread dim: " << thread_dims.ToString()
@@ -449,9 +464,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 // This is a non-essential operation; if there's a failure, proceed without
 // logging an error. It's nearly certain that in case of failures, we'd never
 // get here in the first place; these are very low-impact routines.
-void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
-                                     const ThreadDim &thread_dims,
-                                     const BlockDim &block_dims) {
+void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
+                                    const ThreadDim& thread_dims,
+                                    const BlockDim& block_dims) {
   VLOG(2) << "Computing kernel occupancy for kernel "
           << kernel.demangled_name();
   VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
@@ -470,8 +485,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   const DeviceDescription &device_description =
       kernel.parent()->GetDeviceDescription();
 
-  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
-  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
 
   int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                          smem_per_block, thread_dims, cufunc);
@@ -491,13 +506,14 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
 // Compute and return maximum blocks per core (occupancy) based on the
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
-int CUDAExecutor::CalculateOccupancy(
-    const DeviceDescription &device_description, uint64 registers_per_thread,
-    uint64 shared_memory_per_block, const ThreadDim &thread_dims,
-    CUfunction func) {
+int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
+                                    uint64 registers_per_thread,
+                                    uint64 shared_memory_per_block,
+                                    const ThreadDim& thread_dims,
+                                    CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
-  CUresult err = cuOccupancyMaxPotentialBlockSize(
+  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
       &suggested_blocks, &suggested_threads, func, nullptr,
       shared_memory_per_block, 0);
   CHECK_EQ(err, CUDA_SUCCESS);
@@ -506,15 +522,15 @@ int CUDAExecutor::CalculateOccupancy(
 
 // Compute and return the suggested thread count to achieve ideal occupancy.
 // If the provided thread dimensions match this number, zero is returned.
-int CUDAExecutor::CompareOccupancy(int *initial_blocks,
-                                   const DeviceDescription &device_description,
-                                   uint64 registers_per_thread,
-                                   uint64 shared_memory_per_block,
-                                   const ThreadDim &thread_dims,
-                                   CUfunction func) {
+int GpuExecutor::CompareOccupancy(int* initial_blocks,
+                                  const DeviceDescription& device_description,
+                                  uint64 registers_per_thread,
+                                  uint64 shared_memory_per_block,
+                                  const ThreadDim& thread_dims,
+                                  CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
-  CUresult err = cuOccupancyMaxPotentialBlockSize(
+  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
       &suggested_blocks, &suggested_threads, func, nullptr,
       shared_memory_per_block, 0);
   CHECK_EQ(err, CUDA_SUCCESS);
@@ -526,88 +542,87 @@ int CUDAExecutor::CompareOccupancy(int *initial_blocks,
   }
 }
 
-void *CUDAExecutor::Allocate(uint64 size) {
-  return CUDADriver::DeviceAllocate(context_, size);
+void* GpuExecutor::Allocate(uint64 size) {
+  return GpuDriver::DeviceAllocate(context_, size);
 }
 
-void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem,
-                                      uint64 offset_bytes, uint64 size_bytes) {
+void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                                     uint64 size_bytes) {
   // offset and size are in bytes, so char* works as the pointer type.
   return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
 }
 
-void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) {
+void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
   // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
   if (!mem->is_sub_buffer()) {
-    CUDADriver::DeviceDeallocate(context_, mem->opaque());
+    GpuDriver::DeviceDeallocate(context_, mem->opaque());
   }
 }
 
-bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) {
+bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
   if (location == nullptr || size == 0) {
     LOG(WARNING) << "attempting to register null or zero-sized memory: "
                  << location << "; size " << size;
   }
   VLOG(2) << "registering " << location << " size " << size;
-  return CUDADriver::HostRegister(context_, location, size);
+  return GpuDriver::HostRegister(context_, location, size);
 }
 
-bool CUDAExecutor::HostMemoryUnregister(void *location) {
+bool GpuExecutor::HostMemoryUnregister(void* location) {
   VLOG(2) << "unregistering " << location;
-  return CUDADriver::HostUnregister(context_, location);
+  return GpuDriver::HostUnregister(context_, location);
 }
 
-bool CUDAExecutor::SynchronizeAllActivity() {
-  return CUDADriver::SynchronizeContext(context_);
+bool GpuExecutor::SynchronizeAllActivity() {
+  return GpuDriver::SynchronizeContext(context_);
 }
 
-bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) {
+bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
-    return CUDADriver::SynchronousMemsetUint32(
+    return GpuDriver::SynchronousMemsetUint32(
         context_, AsCudaDevicePtr(location), 0x0, size / 4);
   }
-  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
-                                            0x0, size);
+  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                           0x0, size);
 }
 
-bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
-                                     uint64 size) {
+bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
+                                    uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
     // cudaMemset reinterprets "value" as a uint8.
     uint8 byte_value = static_cast<uint8>(value);
     uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                      (byte_value << 8) | byte_value;
-    return CUDADriver::SynchronousMemsetUint32(
+    return GpuDriver::SynchronousMemsetUint32(
         context_, AsCudaDevicePtr(location), pattern, size / 4);
   }
-  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
-                                            value, size);
+  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                           value, size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                             const void *host_src,
-                                             uint64 size) {
-  return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
-                                          host_src, size);
+port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                            const void* host_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
+                                         host_src, size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
-                                             const DeviceMemoryBase &gpu_src,
-                                             uint64 size) {
-  return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
-                                          AsCudaDevicePtr(gpu_src), size);
+port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
+                                            const DeviceMemoryBase& gpu_src,
+                                            uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
+                                         AsCudaDevicePtr(gpu_src), size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
-    DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
-  return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
-                                          AsCudaDevicePtr(gpu_src), size);
+port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
+    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                         AsCudaDevicePtr(gpu_src), size);
 }
 
-bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
-                           uint64 size) {
+bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
+                          uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
     return Memset32(stream, location, 0x0, size);
@@ -616,88 +631,87 @@ bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
   }
 }
 
-bool CUDAExecutor::Memset(Stream *stream, DeviceMemoryBase *location,
-                           uint8 pattern, uint64 size) {
+bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
+                         uint8 pattern, uint64 size) {
   VLOG(2) << "enqueueing memset8 operation onto stream " << stream
           << " at location " << location << " with size " << size
           << " and pattern " << std::hex << pattern;
-  return CUDADriver::AsynchronousMemsetUint8(
-      context_, AsCudaDevicePtr(location), pattern, size,
-      AsCUDAStreamValue(stream));
+  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                            pattern, size,
+                                            AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
-                            uint32 pattern, uint64 size) {
+bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
+                           uint32 pattern, uint64 size) {
   VLOG(2) << "enqueueing memset32 operation onto stream " << stream
           << " at location " << location << " with size " << size
           << " and pattern " << std::hex << pattern;
   CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
         size % 4 == 0);
-  return CUDADriver::AsynchronousMemsetUint32(
+  return GpuDriver::AsynchronousMemsetUint32(
       context_, AsCudaDevicePtr(location), pattern, size / 4,
-      AsCUDAStreamValue(stream));
+      AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst,
-                          const DeviceMemoryBase &gpu_src, uint64 size) {
-  return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst,
-                                           AsCudaDevicePtr(gpu_src), size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
+                         const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
+                                          AsCudaDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
-                          const void *host_src, uint64 size) {
-  return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
-                                           host_src, size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
+                         const void* host_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
+                                          host_src, size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
-                                        DeviceMemoryBase *gpu_dst,
-                                        const DeviceMemoryBase &gpu_src,
-                                        uint64 size) {
-  return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
-                                           AsCudaDevicePtr(gpu_src), size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
+                                       DeviceMemoryBase* gpu_dst,
+                                       const DeviceMemoryBase& gpu_src,
+                                       uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                          AsCudaDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::HostCallback(Stream *stream,
-                                std::function<port::Status()> callback) {
+bool GpuExecutor::HostCallback(Stream* stream,
+                               std::function<port::Status()> callback) {
   auto callback_ptr = new std::function<void()>([callback]() {
     port::Status s = callback();
     if (!s.ok()) {
       LOG(WARNING) << "Host callback failed: " << s;
     }
   });
-  return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
-                                       InternalHostCallback, callback_ptr);
+  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
+                                      InternalHostCallback, callback_ptr);
 }
 
-/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
-                                                     CUresult status,
-                                                     void *data) {
+/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
+                                                    CUresult status,
+                                                    void* data) {
   std::function<void()> *callback =
       reinterpret_cast<std::function<void()> *>(data);
   (*callback)();
   delete callback;
 }
 
-port::Status CUDAExecutor::AllocateEvent(Event *event) {
-  return AsCUDAEvent(event)->Init();
+port::Status GpuExecutor::AllocateEvent(Event* event) {
+  return AsGpuEvent(event)->Init();
 }
 
-port::Status CUDAExecutor::DeallocateEvent(Event *event) {
-  return AsCUDAEvent(event)->Destroy();
+port::Status GpuExecutor::DeallocateEvent(Event* event) {
+  return AsGpuEvent(event)->Destroy();
 }
 
-port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
-  return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
+port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
+  return AsGpuEvent(event)->Record(AsGpuStream(stream));
 }
 
-port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
-  if (CUDADriver::WaitStreamOnEvent(context_,
-                                    AsCUDAStream(stream)->cuda_stream(),
-                                    AsCUDAEvent(event)->cuda_event())) {
+port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
+  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
+                                   AsGpuEvent(event)->gpu_event())) {
     return port::Status::OK();
   } else {
     return port::Status(
@@ -707,61 +721,61 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
   }
 }
 
-Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
-  return AsCUDAEvent(event)->PollForStatus();
+Event::Status GpuExecutor::PollForEventStatus(Event* event) {
+  return AsGpuEvent(event)->PollForStatus();
 }
 
-bool CUDAExecutor::AllocateStream(Stream *stream) {
-  return AsCUDAStream(stream)->Init();
+bool GpuExecutor::AllocateStream(Stream* stream) {
+  return AsGpuStream(stream)->Init();
 }
 
-void CUDAExecutor::DeallocateStream(Stream *stream) {
-  CUDAStream *cuda_stream = AsCUDAStream(stream);
+void GpuExecutor::DeallocateStream(Stream* stream) {
+  GpuStream* cuda_stream = AsGpuStream(stream);
   if (!cuda_stream->IsIdle()) {
     LOG(ERROR) << "Deallocating stream with pending work";
   }
   cuda_stream->Destroy();
 }
 
-bool CUDAExecutor::AllocateTimer(Timer *timer) {
-  return AsCUDATimer(timer)->Init();
+bool GpuExecutor::AllocateTimer(Timer* timer) {
+  return AsGpuTimer(timer)->Init();
 }
 
-void CUDAExecutor::DeallocateTimer(Timer *timer) {
-  AsCUDATimer(timer)->Destroy();
+void GpuExecutor::DeallocateTimer(Timer* timer) {
+  AsGpuTimer(timer)->Destroy();
 }
 
-bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
-  CUevent other_completed_event = *AsCUDAStream(other)->completed_event();
-  bool ok = CUDADriver::RecordEvent(context_, other_completed_event,
-                                    AsCUDAStreamValue(other))
-      .ok();
+bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
+  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
+  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
+                                   AsGpuStreamValue(other))
+                .ok();
   if (!ok) {
     LOG(ERROR) << "failed to record completion event; "
                   "therefore, failed to create inter-stream dependency";
     return false;
   }
 
-  return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
-                                       other_completed_event);
+  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
+                                      other_completed_event);
 }
 
-bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
-  return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
+bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
 }
 
-bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
-  return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
+bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
 }
 
-port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
-  return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
+port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
+  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
 }
 
-blas::BlasSupport *CUDAExecutor::CreateBlas() {
+blas::BlasSupport* GpuExecutor::CreateBlas() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::BlasFactory> status =
-      registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                         plugin_config_.blas());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve BLAS factory: "
@@ -772,10 +786,10 @@ blas::BlasSupport *CUDAExecutor::CreateBlas() {
   return status.ValueOrDie()(this);
 }
 
-dnn::DnnSupport *CUDAExecutor::CreateDnn() {
+dnn::DnnSupport* GpuExecutor::CreateDnn() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::DnnFactory> status =
-      registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.dnn());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve DNN factory: "
@@ -786,10 +800,10 @@ dnn::DnnSupport *CUDAExecutor::CreateDnn() {
   return status.ValueOrDie()(this);
 }
 
-fft::FftSupport *CUDAExecutor::CreateFft() {
+fft::FftSupport* GpuExecutor::CreateFft() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::FftFactory> status =
-      registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.fft());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve FFT factory: "
@@ -800,10 +814,10 @@ fft::FftSupport *CUDAExecutor::CreateFft() {
   return status.ValueOrDie()(this);
 }
 
-rng::RngSupport *CUDAExecutor::CreateRng() {
+rng::RngSupport* GpuExecutor::CreateRng() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::RngFactory> status =
-      registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.rng());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve RNG factory: "
@@ -815,23 +829,21 @@ rng::RngSupport *CUDAExecutor::CreateRng() {
 }
 
 // TODO(rspringer): Remove in b/18544742.
-bool CUDAExecutor::SupportsDnn() const {
-  return true;
-}
+bool GpuExecutor::SupportsDnn() const { return true; }
 
-bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) {
-  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
-  return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_);
+bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
 }
 
-port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) {
-  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
-  return CUDADriver::EnablePeerAccess(context_, cuda_other->context_);
+port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
 }
 
-SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
+SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
   port::StatusOr<CUsharedconfig> cuda_config =
-      CUDADriver::ContextGetSharedMemConfig(context_);
+      GpuDriver::ContextGetSharedMemConfig(context_);
   if (!cuda_config.ok()) {
     // Don't log; the failed call will log necessary output.
     return SharedMemoryConfig::kDefault;
@@ -850,7 +862,7 @@ SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
   }
 }
 
-port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
+port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
     SharedMemoryConfig config) {
   CUsharedconfig cuda_config;
   switch (config) {
@@ -867,21 +879,21 @@ port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
       LOG(FATAL) << "Invalid shared memory configuration specified: "
                  << static_cast<int>(config);
   }
-  return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
+  return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
 }
 
-bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
-  return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
+bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
+  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
 }
 
-bool CUDAExecutor::GetSymbol(const string &symbol_name,
-                             ModuleHandle module_handle, void **mem,
-                             size_t *bytes) {
+bool GpuExecutor::GetSymbol(const string& symbol_name,
+                            ModuleHandle module_handle, void** mem,
+                            size_t* bytes) {
   auto lookup_in_module = [&](CUmodule module) {
     CHECK(module != nullptr);
-    return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
-                                       reinterpret_cast<CUdeviceptr *>(mem),
-                                       bytes);
+    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+                                      reinterpret_cast<CUdeviceptr*>(mem),
+                                      bytes);
   };
 
   {  // give limited scope to mutex_lock
@@ -903,13 +915,13 @@ bool CUDAExecutor::GetSymbol(const string &symbol_name,
   return false;
 }
 
-bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
+bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
   // we use BlockDims to express the dimensions of blocks within a grid
   // (as opposed to ThreadDim which expresses the dimensions of threads
   // within a block).
   int x, y, z;
-  if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
     return false;
   }
 
@@ -919,35 +931,35 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
   return true;
 }
 
-bool CUDAExecutor::SupportsBlas() const { return true; }
+bool GpuExecutor::SupportsBlas() const { return true; }
 
-bool CUDAExecutor::SupportsFft() const { return true; }
+bool GpuExecutor::SupportsFft() const { return true; }
 
-bool CUDAExecutor::SupportsRng() const { return true; }
+bool GpuExecutor::SupportsRng() const { return true; }
 
 std::unique_ptr<internal::EventInterface>
-CUDAExecutor::CreateEventImplementation() {
-  return std::unique_ptr<internal::EventInterface>(new CUDAEvent(this));
+GpuExecutor::CreateEventImplementation() {
+  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
 }
 
 std::unique_ptr<internal::KernelInterface>
-CUDAExecutor::CreateKernelImplementation() {
-  return std::unique_ptr<internal::KernelInterface>(new CUDAKernel());
+GpuExecutor::CreateKernelImplementation() {
+  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
 }
 
 std::unique_ptr<internal::StreamInterface>
-CUDAExecutor::GetStreamImplementation() {
-  return std::unique_ptr<internal::StreamInterface>(new CUDAStream(this));
+GpuExecutor::GetStreamImplementation() {
+  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
 }
 
 std::unique_ptr<internal::TimerInterface>
-CUDAExecutor::GetTimerImplementation() {
-  return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
+GpuExecutor::GetTimerImplementation() {
+  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
 }
 
-void *CUDAExecutor::GpuContextHack() { return context_; }
+void* GpuExecutor::GpuContextHack() { return context_; }
 
-CudaContext* CUDAExecutor::cuda_context() { return context_; }
+GpuContext* GpuExecutor::gpu_context() { return context_; }
 
 // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
 // of SysFS. Returns -1 if it cannot.
@@ -1014,21 +1026,21 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
 #endif
 }
 
-
-DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
+DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
 
   {
     int driver_version = 0;
-    (void)CUDADriver::GetDriverVersion(&driver_version);
+    (void)GpuDriver::GetDriverVersion(&driver_version);
     string augmented_driver_version = port::Printf(
         "%d (%s)", driver_version,
-        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
+        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
+            .c_str());
     builder.set_driver_version(augmented_driver_version);
   }
 
   {
-    string pci_bus_id = CUDADriver::GetPCIBusID(device_);
+    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
 
     // Lower the hex characters to match sysfs.
     pci_bus_id = port::Lowercase(pci_bus_id);
@@ -1039,35 +1051,45 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_numa_node(numa_node);
   }
 
-  CUdevprop prop;
-  if (CUDADriver::GetDeviceProperties(&prop, device_ordinal_)) {
-    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
+  {
+    builder.set_threads_per_block_limit(
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                                      device_)
+            .ValueOrDie());
 
     ThreadDim thread_dim_limit;
-    thread_dim_limit.x = prop.maxThreadsDim[0];
-    thread_dim_limit.y = prop.maxThreadsDim[1];
-    thread_dim_limit.z = prop.maxThreadsDim[2];
+    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
+                             .ValueOrDie();
+    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
+                             .ValueOrDie();
+    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
+                             .ValueOrDie();
     builder.set_thread_dim_limit(thread_dim_limit);
 
-    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
-    builder.set_clock_rate_ghz(clock_rate_ghz);
+    int clock_rate =
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
+            .ValueOrDie();
+    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
   }
 
   {
     bool ecc_enabled = false;
-    (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
+    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
     builder.set_ecc_enabled(ecc_enabled);
   }
 
   {
     uint64 device_memory_size = -1;
-    (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
     builder.set_device_memory_size(device_memory_size);
   }
 
-  port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
+  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
       CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
-  port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
+  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
       CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
   if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
     // Times 2 because HBM is DDR memory; it gets two data bits per each data
@@ -1085,7 +1107,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
   {
     string device_name;
-    (void)CUDADriver::GetDeviceName(device_, &device_name);
+    (void)GpuDriver::GetDeviceName(device_, &device_name);
     builder.set_name(device_name);
   }
 
@@ -1099,19 +1121,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   builder.set_device_vendor("NVIDIA Corporation");
   builder.set_cuda_compute_capability(cc_major_, cc_minor_);
   builder.set_shared_memory_per_core(
-      CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
   builder.set_shared_memory_per_block(
-      CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
   builder.set_core_count(
-      CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
+      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
   builder.set_threads_per_core_limit(
-      CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
   builder.set_registers_per_block_limit(
-      CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
   builder.set_threads_per_warp(
-      CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
+      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
   builder.set_registers_per_core_limit(
-      CUDADriver::GetDeviceAttribute(
+      GpuDriver::GetDeviceAttribute(
           CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
           .ValueOrDie());
 
@@ -1119,11 +1141,11 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   return built.release();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cuda_gpu_executor() {
-  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
-    return new cuda::CUDAExecutor{config};
+  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
+    return new gpu::GpuExecutor{config};
   };
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index ae8e4abf92024626bf3d2bd3d334244708f55737..9d02c7516cfd9aa1e86a7e534e41d54f8d8e5de3 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -22,289 +22,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 
-#include <set>
-#include <unordered_map>
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
-#include "tensorflow/stream_executor/event.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// CUDA-platform implementation of the platform-agnostic
-// StreamExecutorInferface.
-class CUDAExecutor : public internal::StreamExecutorInterface {
- public:
-  // sub_platform indicates the subplatform used in this executor; it must
-  // be a CUDA type.
-  explicit CUDAExecutor(const PluginConfig &plugin_config)
-      : device_(0),
-        context_(nullptr),
-        device_ordinal_(0),
-        cc_major_(0),
-        cc_minor_(0),
-        plugin_config_(plugin_config) {}
-
-  // See the corresponding StreamExecutor methods for method comments on the
-  // following overrides.
-
-  ~CUDAExecutor() override;
-
-  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
-
-  bool GetKernel(const MultiKernelLoaderSpec &spec,
-                 KernelBase *kernel) override;
-  void UnloadKernel(const KernelBase *kernel) override;
-  bool LoadModule(const MultiModuleLoaderSpec &spec,
-                  ModuleHandle *module_handle) override;
-  bool UnloadModule(ModuleHandle module_handle) override;
-
-  bool Launch(Stream *stream, const ThreadDim &thread_dims,
-              const BlockDim &block_dims, const KernelBase &k,
-              const KernelArgsArrayBase &args) override;
-
-  int CalculateOccupancy(const DeviceDescription &device_description,
-                         uint64 registers_per_thread,
-                         uint64 shared_memory_per_block,
-                         const ThreadDim &thread_dims, CUfunction func);
-
-  int CompareOccupancy(int *initial_blocks,
-                       const DeviceDescription &device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim &thread_dims, CUfunction func);
-
-  void *Allocate(uint64 size) override;
-
-  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
-                          uint64 size_bytes) override;
-
-  void Deallocate(DeviceMemoryBase *mem) override;
-
-  void *UnifiedMemoryAllocate(uint64 size) override {
-    return CUDADriver::UnifiedMemoryAllocate(context_, size);
-  }
-
-  void UnifiedMemoryDeallocate(void *location) override {
-    return CUDADriver::UnifiedMemoryDeallocate(context_, location);
-  }
-
-  // CUDA allocation/registration functions are necessary because the driver
-  // internally sets up buffers for DMA operations (and page locks them).
-  // There's no external interface for us to otherwise control these DMA
-  // settings.
-  void *HostMemoryAllocate(uint64 size) override {
-    return CUDADriver::HostAllocate(context_, size);
-  }
-
-  void HostMemoryDeallocate(void *location) override {
-    return CUDADriver::HostDeallocate(context_, location);
-  }
-
-  bool HostMemoryRegister(void *location, uint64 size) override;
-
-  bool HostMemoryUnregister(void *location) override;
-
-  bool SynchronizeAllActivity() override;
-
-  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
-
-  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
-                         uint64 size) override;
-
-  port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                 const void *host_src, uint64 size) override;
-
-  port::Status SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &gpu_src,
-                                 uint64 size) override;
-
-  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
-                                               const DeviceMemoryBase &gpu_src,
-                                               uint64 size) override;
-
-  bool MemZero(Stream *stream, DeviceMemoryBase *location,
-               uint64 size) override;
-  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
-              uint64 size) override;
-  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
-                uint64 size) override;
-
-  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
-              uint64 size) override;
-
-  bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
-              uint64 size) override;
-
-  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
-                            const DeviceMemoryBase &gpu_src,
-                            uint64 size) override;
-
-  bool HostCallback(Stream *stream,
-                    std::function<port::Status()> callback) override;
-
-  bool AllocateStream(Stream *stream) override;
-
-  void DeallocateStream(Stream *stream) override;
-
-  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
-
-  bool AllocateTimer(Timer *timer) override;
-
-  void DeallocateTimer(Timer *timer) override;
-
-  bool StartTimer(Stream *stream, Timer *timer) override;
-
-  bool StopTimer(Stream *stream, Timer *timer) override;
-
-  port::Status AllocateEvent(Event *event) override;
-
-  port::Status DeallocateEvent(Event *event) override;
-
-  port::Status RecordEvent(Stream *stream, Event *event) override;
-
-  port::Status WaitForEvent(Stream *stream, Event *event) override;
-
-  Event::Status PollForEventStatus(Event *event) override;
-
-  port::Status BlockHostUntilDone(Stream *stream) override;
-
-  int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
-
-  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
-
-  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
-
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
-
-  bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
-
-  // Search for the symbol and returns a device pointer and size.
-  // Returns false if symbol does not exist.
-  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
-                 void **mem, size_t *bytes) override;
-
-  DeviceDescription *PopulateDeviceDescription() const override;
-
-  // Populates the block_dim_limit by querying the device driver API. If an
-  // error occurs at any point while asking the driver for block dim limits, it
-  // will be only partially populated as a result, and an error will be logged.
-  bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
-
-  bool SupportsBlas() const override;
-
-  blas::BlasSupport *CreateBlas() override;
-
-  bool SupportsFft() const override;
-
-  fft::FftSupport *CreateFft() override;
-
-  bool SupportsRng() const override;
-
-  rng::RngSupport *CreateRng() override;
-
-  bool SupportsDnn() const override;
-
-  dnn::DnnSupport *CreateDnn() override;
-
-  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
-      override;
-
-  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
-      override;
-
-  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
-
-  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
-
-  void *GpuContextHack() override;
-
-  CudaContext* cuda_context();
-
- private:
-  // Attempts to find a more specific version of the file indicated by
-  // filename by looking for compute-capability-specific suffixed versions; i.e.
-  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
-  // we're on a compute capability 3.0 machine.
-  bool FindOnDiskForComputeCapability(absl::string_view filename,
-                                      absl::string_view canonical_suffix,
-                                      string *found_filename) const;
-
-  // Host callback landing routine invoked by CUDA.
-  // data: User-provided callback provided to HostCallback() above, captured
-  //       as a std::function<void()>. Allocated/initialized inside
-  //       HostCallback() and owned and deleted by this call.
-  static void InternalHostCallback(CUstream stream, CUresult status,
-                                   void *data);
-
-  // Collects metadata for the specified kernel.
-  bool GetKernelMetadata(CUDAKernel *cuda_kernel,
-                         KernelMetadata *kernel_metadata);
-
-  // Prints to VLOG(2) information about the kernel's occupancy and how it might
-  // be improved.
-  void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
-                         const BlockDim &block_dims);
-
-  bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
-  bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  bool UnloadGpuBinary(const void *gpu_binary)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  // Guards the in-memory-module mapping.
-  mutex in_memory_modules_mu_;
-
-  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
-  std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
-      GUARDED_BY(in_memory_modules_mu_);
-  // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
-  std::unordered_map<const void *, std::pair<CUmodule, uint64>>
-      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
-
-  // Guards the launched kernel set.
-  mutex launched_kernels_mu_;
-
-  // Keeps track of the set of launched kernels. Currently used to suppress the
-  // occupancy check on subsequent launches.
-  std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
-
-  // Handle for the CUDA device being operated on. Immutable
-  // post-initialization.
-  CUdevice device_;
-
-  // Handle for session with the library/driver. Immutable post-initialization.
-  CudaContext* context_;
-
-  // The device ordinal value that this executor was initialized with; recorded
-  // for use in getting device metadata. Immutable post-initialization.
-  int device_ordinal_;
-
-  // The major verion of the compute capability for device_.
-  int cc_major_;
-
-  // The minor verion of the compute capability for device_.
-  int cc_minor_;
-
-  // The plugin configuration associated with this instance.
-  PluginConfig plugin_config_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
-};
+using CUDAExecutor = gpu::GpuExecutor;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index dc0dc694cdc6001341514c02cef38178b25338aa..af6dcf3549748ef74674b5362c86dc284c6712c8 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -17,88 +17,9 @@ limitations under the License.
 //
 // These are typically placed here for use by multiple source components (for
 // example, BLAS and executor components).
-
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
 
-#include <stddef.h>
-#include <complex>
-
-#include "cuda/include/cuComplex.h"
-
-namespace stream_executor {
-
-template <typename ElemT>
-class DeviceMemory;
-
-namespace cuda {
-
-// Converts a const DeviceMemory reference to its underlying typed pointer in
-// CUDA
-// device memory.
-template <typename T>
-const T *CUDAMemory(const DeviceMemory<T> &mem) {
-  return static_cast<const T *>(mem.opaque());
-}
-
-// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
-// pointer in CUDA device memory.
-template <typename T>
-T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
-  return static_cast<T *>(mem->opaque());
-}
-
-static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
-              "std::complex<float> and cuComplex should have the same size");
-static_assert(offsetof(cuComplex, x) == 0,
-              "The real part of cuComplex should appear first.");
-static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
-              "std::complex<double> and cuDoubleComplex should have the same "
-              "size");
-static_assert(offsetof(cuDoubleComplex, x) == 0,
-              "The real part of cuDoubleComplex should appear first.");
-
-// Type traits to get CUDA complex types from std::complex<>.
-
-template <typename T>
-struct CUDAComplexT {
-  typedef T type;
-};
-
-template <>
-struct CUDAComplexT<std::complex<float>> {
-  typedef cuComplex type;
-};
-
-template <>
-struct CUDAComplexT<std::complex<double>> {
-  typedef cuDoubleComplex type;
-};
-
-// Converts pointers of std::complex<> to pointers of
-// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
-
-template <typename T>
-inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
-  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
-}
-
-template <typename T>
-inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
-  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
-}
-
-// Converts values of std::complex<float/double> to values of
-// cuComplex/cuDoubleComplex.
-inline cuComplex CUDAComplexValue(std::complex<float> val) {
-  return {val.real(), val.imag()};
-}
-
-inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
-  return {val.real(), val.imag()};
-}
-
-}  // namespace cuda
-}  // namespace stream_executor
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.cc b/tensorflow/stream_executor/cuda/cuda_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b4e9a178fbcab63adb0a14bc806ac3ee3a60416
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.cc
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
+
+namespace stream_executor {
+namespace gpu {
+
+CUfunc_cache GpuKernel::GetGpuCacheConfig() const {
+  switch (preferred_cache_config_) {
+    case KernelCacheConfig::kNoPreference:
+      return CU_FUNC_CACHE_PREFER_NONE;
+    case KernelCacheConfig::kPreferShared:
+      return CU_FUNC_CACHE_PREFER_SHARED;
+    case KernelCacheConfig::kPreferL1:
+      return CU_FUNC_CACHE_PREFER_L1;
+    case KernelCacheConfig::kPreferEqual:
+      return CU_FUNC_CACHE_PREFER_EQUAL;
+    default:
+      LOG(FATAL) << "Unknown KernelCacheConfig"
+                 << static_cast<int32>(preferred_cache_config_);
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index ec1dc51e57f5a928d54cb86b1cbcc217100df6d4..a8a18d200d93168660d70746db442aeaed146290 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -22,104 +22,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
 
-#include "tensorflow/stream_executor/kernel_cache_config.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "cuda/include/cuda.h"
-
-#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
-#error \
-    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
-#endif
-
-#ifdef __CUDA_RUNTIME_H__
-#error \
-    "CUDA runtime being included into CUDA GPU executor; should be driver only."
-#endif
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// Wraps a CUfunction to implement the platform-independent KernelInterface.
-class CUDAKernel : public internal::KernelInterface {
- public:
-  CUDAKernel() : cuda_function_(nullptr), arity_(0),
-                 preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
-
-  // Note that the function is unloaded when the module is unloaded, and the
-  // module that the function is contained in is owned by the CUDAExecutor.
-  ~CUDAKernel() override {}
-
-  // As arity cannot be reflected upon using the CUDA API, the arity is
-  // explicitly set during the CUDAExecutor::GetKernel initialization process.
-  void set_arity(unsigned arity) { arity_ = arity; }
-  unsigned Arity() const override { return arity_; }
-
-  // Returns the CUfunction value for passing to the CUDA API.
-  CUfunction AsCUDAFunctionValue() const {
-    DCHECK(cuda_function_ != nullptr);
-    return const_cast<CUfunction>(cuda_function_);
-  }
-
-  // Returns the slot that the CUfunction is stored within for this object,
-  // for the CUDA API which wants to load into a CUfunction*.
-  CUfunction *cuda_function_ptr() { return &cuda_function_; }
-
-  // CUDA supports setting the preferred cache configuration of a CUfunction
-  // (more-or-less equivalent to a CUDAKernel). We support this via the below
-  // functions; users can set a preference, and that is applied when the kernel
-  // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
-  // load the kernel & set the preference when the user calls the setter below;
-  // either approach is valid.
-  // Sets the current kernel cache configuration preference.
-  void SetPreferredCacheConfig(KernelCacheConfig config) override {
-    preferred_cache_config_ = config;
-  }
-
-  // Returns the current kernel cache configuration preference.
-  KernelCacheConfig GetPreferredCacheConfig() const override {
-    return preferred_cache_config_;
-  }
-
-  // Returns the current kernel cache configuration preference as a
-  // CUfunc_cache.
-  CUfunc_cache GetCUDACacheConfig() const {
-    switch (preferred_cache_config_) {
-      case KernelCacheConfig::kNoPreference:
-        return CU_FUNC_CACHE_PREFER_NONE;
-      case KernelCacheConfig::kPreferShared:
-        return CU_FUNC_CACHE_PREFER_SHARED;
-      case KernelCacheConfig::kPreferL1:
-        return CU_FUNC_CACHE_PREFER_L1;
-      case KernelCacheConfig::kPreferEqual:
-        return CU_FUNC_CACHE_PREFER_EQUAL;
-      default:
-        LOG(FATAL) << "Unknown KernelCacheConfig"
-                   << static_cast<int32>(preferred_cache_config_);
-    }
-  }
-
- private:
-  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
-  unsigned arity_;            // Number of formal parameters the kernel takes.
-
-  // Preferred (but not required) cache configuration for this kernel.
-  KernelCacheConfig preferred_cache_config_;
-};
-
-// Given a platform-independent kernel datatype, returns the (const) internal
-// CUDA platform implementation pointer.
-inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
-  return static_cast<const CUDAKernel *>(kernel->implementation());
-}
-
-// Given a platform-independent kernel datatype, returns the (non-const)
-// internal CUDA platform implementation pointer.
-inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
-  return static_cast<CUDAKernel *>(kernel->implementation());
-}
+using CUDAKernel = gpu::GpuKernel;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index b342e71bdd94f6112d500d86f6ed4051821d2d54..54aba01278d17505a33d190fba85eb543dd624e1 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 namespace {
 
 // Synchronize with spinlocks.
@@ -129,16 +129,16 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
       port::Printf("Executor for bus %d not found.", bus_ordinal));
 }
 
-Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
+Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }
 
 int CudaPlatform::VisibleDeviceCount() const {
   // Throw away the result - it logs internally, and this [containing] function
   // isn't in the path of user control. It's safe to call this > 1x.
-  if (!cuda::CUDADriver::Init().ok()) {
+  if (!gpu::GpuDriver::Init().ok()) {
     return -1;
   }
 
-  return CUDADriver::GetDeviceCount();
+  return GpuDriver::GetDeviceCount();
 }
 
 const string& CudaPlatform::Name() const { return name_; }
@@ -169,7 +169,7 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
   auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<CUDAExecutor>(config.plugin_config));
+      this, MakeUnique<GpuExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status(
@@ -191,13 +191,13 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
   LOG(FATAL) << "not yet implemented: unregister CUDA trace listener";
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 static void InitializeCudaPlatform() {
   // Disabling leak checking, MultiPlatformManager does not destroy its
   // registered platforms.
 
-  std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
+  std::unique_ptr<gpu::CudaPlatform> platform(new gpu::CudaPlatform);
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index fc0e15d5a6a9142f064085d34fcfaedfb25f433a..1ab09d690f308724fc817c48d86401db60b956e4 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -32,7 +32,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/trace_listener.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 // Opaque and unique identifier for the CUDA platform plugin.
 // This is needed so that plugins can refer to/identify this platform without
@@ -102,6 +102,12 @@ class CudaPlatform : public Platform {
   SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
 };
 
+}  // namespace gpu
+
+namespace cuda {
+
+using CudaPlatform = gpu::CudaPlatform;
+
 }  // namespace cuda
 }  // namespace stream_executor
 
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 7f920719321637360fdf5c098e83dfaa49164e6c..395b30b4916b68594ac35f96ec08f361c877a2de 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -21,17 +21,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/rng.h"
+// clang-format off
 #include "cuda/include/curand.h"
+// clang-format on
 
 // Formats curandStatus_t to output prettified values into a log stream.
 std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
@@ -60,33 +58,33 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
 }
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
 
 namespace wrap {
 
 #ifdef PLATFORM_GOOGLE
-#define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    template <typename... Args>                                     \
-    curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                        \
+  struct WrapperShim__##__name {                                   \
+    template <typename... Args>                                    \
+    curandStatus_t operator()(GpuExecutor* parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                    \
+    }                                                              \
   } __name;
 
 #else
 #define STREAM_EXECUTOR_CURAND_WRAP(__name)                               \
   struct DynLoadShim__##__name {                                          \
-    static const char *kName;                                             \
+    static const char* kName;                                             \
     using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
-    static void *GetDsoHandle() {                                         \
+    static void* GetDsoHandle() {                                         \
       auto s = internal::CachedDsoLoader::GetCurandDsoHandle();           \
       return s.ValueOrDie();                                              \
     }                                                                     \
     static FuncPtrT LoadOrDie() {                                         \
-      void *f;                                                            \
+      void* f;                                                            \
       auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
                                                           kName, &f);     \
       CHECK(s.ok()) << "could not find " << kName                         \
@@ -98,12 +96,12 @@ namespace wrap {
       return f;                                                           \
     }                                                                     \
     template <typename... Args>                                           \
-    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {       \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
+    curandStatus_t operator()(GpuExecutor* parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                          \
     }                                                                     \
   } __name;                                                               \
-  const char *DynLoadShim__##__name::kName = #__name;
+  const char* DynLoadShim__##__name::kName = #__name;
 #endif
 
 STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
@@ -118,38 +116,15 @@ STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
 
 }  // namespace wrap
 
-template <typename T>
-string TypeString();
-
-template <>
-string TypeString<float>() {
-  return "float";
-}
-
-template <>
-string TypeString<double>() {
-  return "double";
-}
-
-template <>
-string TypeString<std::complex<float>>() {
-  return "std::complex<float>";
-}
-
-template <>
-string TypeString<std::complex<double>>() {
-  return "std::complex<double>";
-}
-
-CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {}
+GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
 
-CUDARng::~CUDARng() {
+GpuRng::~GpuRng() {
   if (rng_ != nullptr) {
     wrap::curandDestroyGenerator(parent_, rng_);
   }
 }
 
-bool CUDARng::Init() {
+bool GpuRng::Init() {
   mutex_lock lock(mu_);
   CHECK(rng_ == nullptr);
 
@@ -164,9 +139,9 @@ bool CUDARng::Init() {
   return true;
 }
 
-bool CUDARng::SetStream(Stream *stream) {
+bool GpuRng::SetStream(Stream* stream) {
   curandStatus_t ret =
-      wrap::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream));
+      wrap::curandSetStream(parent_, rng_, AsGpuStreamValue(stream));
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for random generation: " << ret;
     return false;
@@ -184,8 +159,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
 }
 
 template <typename T>
-bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
-                                            DeviceMemory<T> *v) {
+bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
   mutex_lock lock(mu_);
   static_assert(ComplexIsConsecutiveFloats(),
                 "std::complex values are not stored as consecutive values");
@@ -205,11 +179,11 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
   if (std::is_same<T, float>::value ||
       std::is_same<T, std::complex<float>>::value) {
     ret = wrap::curandGenerateUniform(
-        parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)),
+        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
         element_count);
   } else {
     ret = wrap::curandGenerateUniformDouble(
-        parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)),
+        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
         element_count);
   }
   if (ret != CURAND_STATUS_SUCCESS) {
@@ -222,29 +196,29 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
   return true;
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream,
-                                    DeviceMemory<std::complex<float>> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<float>>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream,
-                                    DeviceMemory<std::complex<double>> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<double>>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
 template <typename ElemT, typename FuncT>
-bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
-                                             ElemT stddev,
-                                             DeviceMemory<ElemT> *v,
-                                             FuncT func) {
+bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
+                                            ElemT stddev,
+                                            DeviceMemory<ElemT>* v,
+                                            FuncT func) {
   mutex_lock lock(mu_);
 
   if (!SetStream(stream)) {
@@ -253,7 +227,7 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
 
   uint64 element_count = v->ElementCount();
   curandStatus_t ret =
-      func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev);
+      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
 
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
@@ -264,19 +238,19 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
   return true;
 }
 
-bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
-                                     DeviceMemory<float> *v) {
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                                    DeviceMemory<float>* v) {
   return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                         wrap::curandGenerateNormal);
 }
 
-bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
-                                     DeviceMemory<double> *v) {
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                                    DeviceMemory<double>* v) {
   return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                         wrap::curandGenerateNormalDouble);
 }
 
-bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
+bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
   mutex_lock lock(mu_);
   CHECK(rng_ != nullptr);
 
@@ -305,15 +279,15 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
   return true;
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_curand() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
-          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
-          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+          cuda::kCudaPlatformId, gpu::kGpuRandPlugin, "cuRAND",
+          [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
+            gpu::GpuExecutor* cuda_executor =
+                dynamic_cast<gpu::GpuExecutor*>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR)
                   << "Attempting to initialize an instance of the cuRAND "
@@ -321,7 +295,7 @@ void initialize_curand() {
               return nullptr;
             }
 
-            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
+            gpu::GpuRng* rng = new gpu::GpuRng(cuda_executor);
             if (!rng->Init()) {
               // Note: Init() will log a more specific error.
               delete rng;
@@ -336,7 +310,7 @@ void initialize_curand() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
+      cuda::kCudaPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h
index 57ef398aaa88da7de769c49820325c6c9feb4d70..d7f6b0e8e034967ed2919332aafca9c7a8081eba 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.h
+++ b/tensorflow/stream_executor/cuda/cuda_rng.h
@@ -16,85 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/plugin_registry.h"
-#include "tensorflow/stream_executor/rng.h"
-
-typedef struct curandGenerator_st *curandGenerator_t;
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
 
 namespace stream_executor {
 
-class Stream;
-template <typename ElemT>
-class DeviceMemory;
-
 namespace cuda {
 
-// Opaque and unique identifier for the cuRAND plugin.
-extern const PluginId kCuRandPlugin;
-
-class CUDAExecutor;
-
-// CUDA-platform implementation of the random number generation support
-// interface.
-//
-// Thread-safe post-initialization.
-class CUDARng : public rng::RngSupport {
- public:
-  explicit CUDARng(CUDAExecutor *parent);
-
-  // Retrieves a curand library generator handle. This is necessary for
-  // enqueuing random number generation work onto the device.
-  // TODO(leary) provide a way for users to select the RNG algorithm.
-  bool Init();
-
-  // Releases a curand library generator handle, if one was acquired.
-  ~CUDARng() override;
-
-  // See rng::RngSupport for details on the following overrides.
-  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
-  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
-  bool DoPopulateRandUniform(Stream *stream,
-                             DeviceMemory<std::complex<float>> *v) override;
-  bool DoPopulateRandUniform(Stream *stream,
-                             DeviceMemory<std::complex<double>> *v) override;
-  bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
-                              DeviceMemory<float> *v) override;
-  bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
-                              DeviceMemory<double> *v) override;
-
-  bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;
-
- private:
-  // Actually performs the work of generating random numbers - the public
-  // methods are thin wrappers to this interface.
-  template <typename T>
-  bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
-  template <typename ElemT, typename FuncT>
-  bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
-                                      DeviceMemory<ElemT> *v, FuncT func);
-
-  // Sets the stream for the internal curand generator.
-  //
-  // This is a stateful operation, as the handle can only have one stream set at
-  // a given time, so it is usually performed right before enqueuing work to do
-  // with random number generation.
-  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // mutex that guards the cuRAND handle for this device.
-  mutex mu_;
-
-  // CUDAExecutor which instantiated this CUDARng.
-  // Immutable post-initialization.
-  CUDAExecutor *parent_;
-
-  // cuRANDalibrary handle on the device.
-  curandGenerator_t rng_ GUARDED_BY(mu_);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
-};
+using CUDARng = gpu::GpuRng;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
index bb8bda4755344d859668425f89614cc87d7e2d3e..4460351368894a009eaa4d7186e809ddf3fa3aed 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.h
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -13,79 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Defines the CUDAStream type - the CUDA-specific implementation of the generic
+// Defines the GpuStream type - the CUDA-specific implementation of the generic
 // StreamExecutor Stream interface.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 
 namespace stream_executor {
 namespace cuda {
 
-class CUDAExecutor;
+using CUDAStream = gpu::GpuStream;
 
-// Wraps a CUstream in order to satisfy the platform-independent
-// StreamInterface.
-//
-// Thread-safe post-initialization.
-class CUDAStream : public internal::StreamInterface {
- public:
-  explicit CUDAStream(CUDAExecutor *parent)
-      : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}
-
-  // Note: teardown is handled by a parent's call to DeallocateStream.
-  ~CUDAStream() override {}
-
-  void *GpuStreamHack() override { return cuda_stream_; }
-  void **GpuStreamMemberHack() override {
-    return reinterpret_cast<void **>(&cuda_stream_);
-  }
-
-  // Explicitly initialize the CUDA resources associated with this stream, used
-  // by StreamExecutor::AllocateStream().
-  bool Init();
-
-  // Explicitly destroy the CUDA resources associated with this stream, used by
-  // StreamExecutor::DeallocateStream().
-  void Destroy();
-
-  // Returns true if no work is pending or executing on the stream.
-  bool IsIdle() const;
-
-  // Retrieves an event which indicates that all work enqueued into the stream
-  // has completed. Ownership of the event is not transferred to the caller, the
-  // event is owned by this stream.
-  CUevent* completed_event() { return &completed_event_; }
-
-  // Returns the CUstream value for passing to the CUDA API.
-  //
-  // Precond: this CUDAStream has been allocated (otherwise passing a nullptr
-  // into the NVIDIA library causes difficult-to-understand faults).
-  CUstream cuda_stream() const {
-    DCHECK(cuda_stream_ != nullptr);
-    return const_cast<CUstream>(cuda_stream_);
-  }
-
-  CUDAExecutor *parent() const { return parent_; }
-
- private:
-  CUDAExecutor *parent_;  // Executor that spawned this stream.
-  CUstream cuda_stream_;  // Wrapped CUDA stream handle.
-
-  // Event that indicates this stream has completed.
-  CUevent completed_event_ = nullptr;
-};
-
-// Helper functions to simplify extremely common flows.
-// Converts a Stream to the underlying CUDAStream implementation.
-CUDAStream *AsCUDAStream(Stream *stream);
-
-// Extracts a CUstream from a CUDAStream-backed Stream object.
-CUstream AsCUDAStreamValue(Stream *stream);
+inline CUDAStream* AsCUDAStream(Stream* stream) {
+  return gpu::AsGpuStream(stream);
+}
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index e040cf86fad1f40a708ad4ca28693e31908393f0..01b722e888687c0e199d7fe8ace92aec407f3a4b 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -13,76 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Defines the CUDATimer type - the CUDA-specific implementation of the generic
+// Defines the GpuTimer type - the CUDA-specific implementation of the generic
 // StreamExecutor Timer interface.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
 
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
 
 namespace stream_executor {
 namespace cuda {
 
-class CUDAExecutor;
-class CUDAStream;
-
-// Wraps a pair of CUevents in order to satisfy the platform-independent
-// TimerInferface -- both a start and a stop event are present which may be
-// recorded in a stream.
-class CUDATimer : public internal::TimerInterface {
- public:
-  explicit CUDATimer(CUDAExecutor *parent)
-      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
-
-  // Note: teardown needs to be explicitly handled in this API by a call to
-  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
-  // TODO(csigg): Change to RAII.
-  ~CUDATimer() override {}
-
-  // Allocates the platform-specific pieces of the timer, called as part of
-  // StreamExecutor::AllocateTimer().
-  bool Init();
-
-  // Deallocates the platform-specific pieces of the timer, called as part of
-  // StreamExecutor::DeallocateTimer().
-  void Destroy();
-
-  // Records the "timer start" event at the current point in the stream.
-  bool Start(CUDAStream *stream);
-
-  // Records the "timer stop" event at the current point in the stream.
-  bool Stop(CUDAStream *stream);
-
-  // Returns the elapsed time, in milliseconds, between the start and stop
-  // events.
-  float GetElapsedMilliseconds() const;
-
-  // See Timer::Microseconds().
-  // TODO(leary) make this into an error code interface...
-  uint64 Microseconds() const override {
-    return GetElapsedMilliseconds() * 1e3;
-  }
-
-  // See Timer::Nanoseconds().
-  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
-
- private:
-  CUDAExecutor *parent_;
-  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
-                         // executing in a stream.
-  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
-                         // executing in a stream.
-};
-
-struct TimerDeleter {
-  void operator()(CUDATimer *t) {
-    t->Destroy();
-    delete t;
-  }
-};
+using CUDATimer = gpu::GpuTimer;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
index e8fcc0361850a561928d09f29f78fb57071c24b2..9ef8bc95e5644ed060d88335de4f9d1abd5f719d 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version) {
@@ -36,5 +36,5 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
            loaded_version.minor_version >= source_version.minor_version));
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 6464e7f8e8755b5b46b90a4b35d50509eb0cfde7..4607a9bff87bf29a00a9f5e0f112f9389fa12972 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 struct CudnnVersion {
   CudnnVersion() = default;
@@ -44,7 +44,7 @@ struct CudnnVersion {
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version);
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 7d4c6399d040e9bcddff5d98d202ab00fdeffa58..cfe114662d4515c68ffdab46918db09f631e9343 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 namespace {
 
 TEST(CuDNNVersion, ToString) {
@@ -68,5 +68,5 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
 }
 
 }  // namespace
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 0b991b7ba8cdad7f342adc6c8ff25b88d91e2bd2..2595d216b4f97b36fe82e6d020c4f7afde4d4274 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
       clock_rate_ghz_(-1.0),
       cuda_compute_capability_major_(-1),
       cuda_compute_capability_minor_(-1),
+      rocm_amdgpu_isa_version_(-1),
       numa_node_(-1),
       core_count_(-1),
       ecc_enabled_(false) {}
@@ -112,6 +113,15 @@ bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
   return cuda_compute_capability_major_ != 0;
 }
 
+bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
+  bool status = false;
+  if (rocm_amdgpu_isa_version_ > 0) {
+    *version = rocm_amdgpu_isa_version_;
+    status = true;
+  }
+  return status;
+}
+
 bool ThreadDimOk(const DeviceDescription &device_description,
                  const ThreadDim &thread_dim) {
   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index 8ddf18629d554112631c3d9c09dbb7afd8505c76..cccc209e1c8bedde5bdba8f454bbe95d0f9f3458 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -133,6 +133,11 @@ class DeviceDescription {
   // zero, and the return value will be false.
   bool cuda_compute_capability(int *major, int *minor) const;
 
+  // Returns the AMDGPU ISA version if we're running on the ROCm platform.
+  // If the information is not available, the version is not modified,
+  // and the return value will be false.
+  bool rocm_amdgpu_isa_version(int *version) const;
+
   // Returns the maximum amount of shared memory present on a single core
   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
   // devices). Note that some devices, such as NVIDIA's have a configurable
@@ -195,6 +200,9 @@ class DeviceDescription {
   int cuda_compute_capability_major_;
   int cuda_compute_capability_minor_;
 
+  // ROCM AMDGPU ISA version, 0 if not available.
+  int rocm_amdgpu_isa_version_;
+
   int numa_node_;
   int core_count_;
   bool ecc_enabled_;
@@ -280,6 +288,10 @@ class DeviceDescriptionBuilder {
     device_description_->cuda_compute_capability_minor_ = minor;
   }
 
+  void set_rocm_amdgpu_isa_version(int version) {
+    device_description_->rocm_amdgpu_isa_version_ = version;
+  }
+
   void set_numa_node(int value) { device_description_->numa_node_ = value; }
   void set_core_count(int value) { device_description_->core_count_ = value; }
   void set_ecc_enabled(bool value) {
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index faa662211ebb366b8e20cdc3e33ca651c64cf73a..fcc3db928b1daaca33bef2e518aa6a4c1d8e5373 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -368,6 +368,16 @@ BatchDescriptor BatchDescriptor::DepthConcatenateOutputDescriptor(
   return output;
 }
 
+TensorDescriptorProto BatchDescriptor::ToProto(DataType data_type) const {
+  CHECK_EQ(0.0, value_max_);
+  CHECK_EQ(0.0, value_min_);
+  CHECK(quantized_activation_mode_ == QuantizedActivationMode::k8Bit);
+
+  TensorDescriptorProto ret = tensor_;
+  ret.set_data_type(data_type);
+  return ret;
+}
+
 // -- FilterDescriptor
 
 FilterDescriptor::FilterDescriptor(int ndims) {
@@ -434,6 +444,12 @@ int64 FilterDescriptor::ComputeWeightCount() const {
   return ret;
 }
 
+TensorDescriptorProto FilterDescriptor::ToProto(DataType data_type) const {
+  TensorDescriptorProto ret = tensor_;
+  ret.set_data_type(data_type);
+  return ret;
+}
+
 // -- ConvolutionDescriptor
 
 ConvolutionDescriptor::ConvolutionDescriptor(int ndims) {
@@ -565,5 +581,15 @@ string NormalizeDescriptor::ToShortString() const {
                       "_size:", segment_size_);
 }
 
+bool DnnSupport::IsStatusOk(const port::Status& status, bool report_error) {
+  if (status.ok()) {
+    return true;
+  }
+  if (report_error) {
+    LOG(ERROR) << status.error_message();
+  }
+  return false;
+}
+
 }  // namespace dnn
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 33ca0ff65ae457af2e397138d2a7c51f7c25634a..1d9a2be9517232792c990c0f284ceafd2d6aa61b 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -248,6 +248,12 @@ class BatchDescriptor {
   string ToString() const;
   string ToShortString() const;
 
+  // Pre-condition:
+  //   value_max_ == 0
+  //   value_min_ == 0
+  //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
+  TensorDescriptorProto ToProto(DataType data_type) const;
+
   // Accessors.
   int64 count() const { return tensor_.dimensions(0); }
   int64 feature_map_count() const { return tensor_.dimensions(1); }
@@ -420,6 +426,7 @@ class FilterDescriptor {
 
   string ToString() const;
   string ToShortString() const;
+  TensorDescriptorProto ToProto(DataType data_type) const;
 
   // Returns the number of weights required as parameters for a convolution
   // using this filter descriptor.
@@ -509,6 +516,7 @@ class ConvolutionDescriptor {
 
   string ToString() const;
   string ToShortString() const;
+  ConvolutionDescriptorProto ToProto() const { return proto_; }
 
   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
     SetDim(padding(), DimIndex::Y, value);
@@ -730,6 +738,7 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
+  AlgorithmDesc() : AlgorithmDesc(0, false) {}
   AlgorithmDesc(Index a, bool use_tensor_ops) {
     proto_.set_algo_id(a);
     proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
@@ -745,6 +754,8 @@ class AlgorithmDesc {
   }
   uint64 hash() const;
 
+  AlgorithmProto ToProto() const { return proto_; }
+
  private:
   AlgorithmProto proto_;
 };
@@ -931,11 +942,7 @@ class VersionInfo {
 //   burden.
 // * Poor error handling: the API should return Status objects.
 //
-// Things worth trying:
-// * Move functions that are not actually common back to the backends. Then,
-//   callers may use dynamic_cast to access specific backends. This may not be
-//   that hard, as many of the callers are Stream::ThenXxx functions.
-// * Change all the returned bools to Status.
+// PrepareForConvolution is an example for how new APIs should be written.
 class DnnSupport {
  public:
   DnnSupport() {}
@@ -1176,6 +1183,26 @@ class DnnSupport {
     return false;
   }
 
+  template <typename ElementType>
+  port::Status PrepareForConvolution(
+      ConvolutionKind kind, Stream* stream,
+      const BatchDescriptor& batch_descriptor,
+      DeviceMemory<ElementType> input_data,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<ElementType> filter_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<ElementType> output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) {
+    return DoPrepareForConvolution(
+        kind, ToDataType<ElementType>::value, stream, batch_descriptor,
+        input_data, filter_descriptor, filter_data, output_descriptor,
+        output_data, convolution_descriptor, algorithm_config,
+        scratch_allocator, algorithm_desc, scratch_memory);
+  }
+
   // Enqueues a single-precision convolution operation onto the stream.
   //
   // Arguments (all borrowed):
@@ -1189,10 +1216,10 @@ class DnnSupport {
   //  output_descriptor: dimensions of the output layer.
   //  output_data: un-owned device memory region in which to place the
   //    convolution result.
-  //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
-  //    space in order to speed up the convolution operation.
-  //  algorithm_config: specifies which algorithm should be used for the
+  //  algorithm_desc: specifies which algorithm should be used for the
   //    operation.
+  //  scratch: un-owned device memory for scratch space in order to speed up
+  //    the convolution operation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
   //
@@ -1210,43 +1237,34 @@ class DnnSupport {
   //   that if the inverse of the filter is applied to the output in VALID mode
   //   the result is the same size as the input - this requires even more
   //   padding of the input.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+  virtual port::Status DoConvolve(
+      ConvolutionKind kind, DataType element_type, Stream* stream,
+      const BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
+      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
-  // Enqueues a double-precision convolution operation onto the stream.
-  // See DoConvolve above for argument details.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-      const DeviceMemory<double>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) = 0;
-
-  // Enqueues a half-precision convolution operation onto the stream.
-  // See DoConvolve above for argument details.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half>* output_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+  template <typename ElementType>
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+                  const DeviceMemory<ElementType>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<ElementType>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<ElementType>* output_data,
+                  const dnn::AlgorithmDesc& algorithm_desc,
+                  DeviceMemory<uint8>* scratch_memory,
+                  ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
+                   stream, input_descriptor, input_data, filter_descriptor,
+                   filter_data, output_descriptor, *output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the forward convolution pass.
   // cc_major and cc_minor are the compute capabilities of the device.
@@ -1319,17 +1337,27 @@ class DnnSupport {
   //    backprop of the input.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<float>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+  template <typename ElementType>
+  bool DoConvolveBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<ElementType>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      const DeviceMemory<ElementType>& backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<ElementType>* backward_input_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::BACKWARD_DATA,
+                   ToDataType<ElementType>::value, stream, input_descriptor,
+                   *backward_input_data, filter_descriptor, filter_data,
+                   output_descriptor, backward_output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the backward convolution pass for
   // data.
@@ -1337,30 +1365,6 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<double>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
   // Enqueues a single-precision backward convolution (for filter) operation
   // onto the stream.
   //
@@ -1380,17 +1384,27 @@ class DnnSupport {
   //    backprop of the filter.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  virtual bool DoConvolveBackwardFilter(
+  template <typename ElementType>
+  bool DoConvolveBackwardFilter(
       Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
+      const DeviceMemory<ElementType>& input_data,
       const BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
+      const DeviceMemory<ElementType>& backward_output_data,
       const ConvolutionDescriptor& convolution_descriptor,
       const FilterDescriptor& filter_descriptor,
-      DeviceMemory<float>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+      DeviceMemory<ElementType>* backward_filter_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::BACKWARD_FILTER,
+                   ToDataType<ElementType>::value, stream, input_descriptor,
+                   input_data, filter_descriptor, *backward_filter_data,
+                   output_descriptor, backward_output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the backward convolution pass for
   // filters.
@@ -1398,30 +1412,6 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
-  virtual bool DoConvolveBackwardFilter(
-      Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<double>& input_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const FilterDescriptor& filter_descriptor,
-      DeviceMemory<double>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
-  virtual bool DoConvolveBackwardFilter(
-      Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const FilterDescriptor& filter_descriptor,
-      DeviceMemory<Eigen::half>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
   // Enqueues a single-precision backward convolution (for bias) operation onto
   // the stream.
   //
@@ -1607,6 +1597,17 @@ class DnnSupport {
     return false;
   }
 
+  virtual bool DoPoolForward(Stream* stream,
+                             const dnn::PoolingDescriptor& pooling_dimensions,
+                             const dnn::BatchDescriptor& input_dimensions,
+                             const DeviceMemory<int8>& input_data,
+                             const dnn::BatchDescriptor& output_dimensions,
+                             DeviceMemory<int8>* output_data,
+                             ScratchAllocator* workspace_allocator) {
+    LOG(FATAL) << "DoPoolForward not implemented for int8.";
+    return false;
+  }
+
   // Performs differentiation of the pooling operation.
   virtual bool DoPoolBackward(Stream* stream,
                               const dnn::PoolingDescriptor& pooling_dimensions,
@@ -2029,22 +2030,6 @@ class DnnSupport {
       QuantizedActivationMode mode,
       DeviceMemory<float>* gpu_unquantized_dst) = 0;
 
-  // Enqueues an asynchronous copy of the contents of buffer_src to
-  // gpu_unquantized_dst.
-  virtual bool DoCopyHostBuffer2Device(
-      Stream* stream, HostBuffer* buffer_src,
-      DeviceMemory<float>* gpu_unquantized_dst) {
-    return false;
-  }
-
-  // Enqueues an asynchronous copy of the contents of gpu_unquantized_src to
-  // buffer_dst.
-  virtual bool DoCopyDevice2HostBuffer(
-      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
-      HostBuffer* buffer_dst) {
-    return false;
-  }
-
   // Create an RNN descriptor based on model shapes and configurations.
   // The caller retains the ownership of the descriptor.
   //
@@ -2349,7 +2334,25 @@ class DnnSupport {
     return false;
   }
 
+ protected:
+  // Returns whether status is 'ok', and potentially logs the error.
+  static bool IsStatusOk(const port::Status& status, bool report_error);
+
  private:
+  virtual port::Status DoPrepareForConvolution(
+      ConvolutionKind kind, DataType element_type, Stream* stream,
+      const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data,
+      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
+      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return port::Status::OK();
+  }
+
   SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
 };
 
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
index 56b079c3f5b962636e7c75b46449adca8e13a43e..11fb5d0f6a02a32fd3c958133136b078ac848ac3 100644
--- a/tensorflow/stream_executor/dnn.proto
+++ b/tensorflow/stream_executor/dnn.proto
@@ -66,6 +66,13 @@ enum ConvolutionMode {
   CONVOLUTION = 1;
 }
 
+enum ConvolutionKind {
+  INVALID = 0;
+  FORWARD = 1;
+  BACKWARD_FILTER = 2;
+  BACKWARD_DATA = 3;
+}
+
 // Generic tensor representation.
 message TensorDescriptorProto {
   repeated int64 dimensions = 1;
@@ -101,3 +108,22 @@ message ConvolutionDescriptorProto {
   int32 group_count = 5;
   ConvolutionMode convolution_mode = 6;
 }
+
+// A convolution. Currently it's only used for logging. In the future, we may
+// want to use it in the API as well.
+message ConvolutionProto {
+  ConvolutionKind kind = 1;
+  TensorDescriptorProto input = 2;
+  TensorDescriptorProto filter = 3;
+  TensorDescriptorProto output = 4;
+  AlgorithmProto algorithm = 5;
+  ConvolutionDescriptorProto conv_desc = 6;
+
+  // result = conv_scale * conv(...) + side_value_scale * side_value.
+  // side_value is an arbitrary buffer if activation is not none. Otherwise, it
+  // has to be the result buffer (using its old values).
+  double conv_scale = 7;
+  double side_value_scale = 8;
+
+  ActivationMode activation = 9;
+}
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..26a9c6426de4a69ae68a8047b175952c760f5d27
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/BUILD
@@ -0,0 +1,209 @@
+# Description:
+#   GPU-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "gpu_activation_header",
+    hdrs = ["gpu_activation.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "gpu_activation",
+    srcs = ["gpu_activation.cc"],
+    hdrs = ["gpu_activation.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_activation_header",
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_diagnostics_header",
+    hdrs = ["gpu_diagnostics.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_driver_header",
+    hdrs = ["gpu_driver.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_types_header",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+cc_library(
+    name = "gpu_event_header",
+    hdrs = ["gpu_event.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_stream_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_event",
+    srcs = ["gpu_event.cc"],
+    hdrs = ["gpu_event.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        ":gpu_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_executor_header",
+    hdrs = ["gpu_executor.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_kernel_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "gpu_helpers_header",
+    hdrs = ["gpu_helpers.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [":gpu_types_header"],
+)
+
+cc_library(
+    name = "gpu_kernel_header",
+    hdrs = ["gpu_kernel.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_rng_header",
+    hdrs = ["gpu_rng.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_types_header",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_stream_header",
+    hdrs = ["gpu_stream.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_stream",
+    srcs = ["gpu_stream.cc"],
+    hdrs = ["gpu_stream.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_timer_header",
+    hdrs = ["gpu_timer.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_internal",
+    ],
+)
+
+cc_library(
+    name = "gpu_timer",
+    srcs = ["gpu_timer.cc"],
+    hdrs = ["gpu_timer.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        ":gpu_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_types_header",
+    hdrs = ["gpu_types.h"],
+    visibility = ["//tensorflow/stream_executor:__subpackages__"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ] + if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+    ]) + if_rocm_is_configured([
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/gpu/gpu_activation.cc
similarity index 62%
rename from tensorflow/stream_executor/cuda/cuda_activation.cc
rename to tensorflow/stream_executor/gpu/gpu_activation.cc
index 02371c3c3ab403e9b3303fbbafdef18c30196f4f..6f74eef2dbc106c14f04736418f3e42adb68f0b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/gpu/gpu_activation.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,36 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
-CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
+GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    CUDAExecutor *cuda_exec):
-      driver_scoped_activate_context_(
-          new ScopedActivateContext{ExtractCudaContext(cuda_exec)}) { }
+    GpuExecutor* gpu_exec)
+    : driver_scoped_activate_context_(
+          new ScopedActivateContext{ExtractGpuContext(gpu_exec)}) {}
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    StreamExecutor *stream_exec)
-    : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec)) {}
+    StreamExecutor* stream_exec)
+    : ScopedActivateExecutorContext(ExtractGpuExecutor(stream_exec)) {}
 
 ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
-  delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
+  delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
 }
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    ScopedActivateExecutorContext &&other)
+    ScopedActivateExecutorContext&& other)
     : driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
   other.driver_scoped_activate_context_ = nullptr;
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_activation.h b/tensorflow/stream_executor/gpu/gpu_activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..3409304d7796bfac92295b2eecc10e2f9487c018
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_activation.h
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file contains APIs that assume a StreamExecutor is backed by CUDA.
+// It reaches into the CUDA implementation to activate an underlying CUDA
+// context.
+//
+// Having this file separate from gpu/gpu_executor.h means that dependent
+// code does not also have to depend on cuda.h.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+class StreamExecutor;
+
+namespace gpu {
+
+class GpuExecutor;
+class ScopedActivateContext;
+
+// Activates a CUDA context within an enclosing scope.
+class ScopedActivateExecutorContext {
+ public:
+  // Form that takes a CUDA executor implementation.
+  explicit ScopedActivateExecutorContext(GpuExecutor* gpu_exec);
+
+  // Form that takes a pImpl executor and extracts a CUDA implementation --
+  // fatal failure if it is not CUDA inside.
+  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
+
+  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
+
+  ~ScopedActivateExecutorContext();
+
+ private:
+  // The cuda.h-using datatype that we wrap.
+  ScopedActivateContext* driver_scoped_activate_context_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_diagnostics.h b/tensorflow/stream_executor/gpu/gpu_diagnostics.h
new file mode 100644
index 0000000000000000000000000000000000000000..71642109b57fd9b4e0a0a3dbc4efee7991bb6f03
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_diagnostics.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
+
+#include <tuple>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// e.g. DriverVersion{346, 3, 4}
+using DriverVersion = std::tuple<int, int, int>;
+
+// FIXME: These functions are in stream_executor::cuda namespaces for now
+// Will move to stream_executor::gpu namespace in the near future
+//
+//// Converts a parsed driver version to string form.
+// string DriverVersionToString(DriverVersion version);
+//
+//// Converts a parsed driver version or status value to natural string form.
+// string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+//
+//// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
+// port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
+
+class Diagnostician {
+ public:
+  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
+  // not initializing).
+  //
+  // Note: if we're running on a machine that has no GPUs, we don't want to
+  // produce very much log spew beyond saying, "looks like there's no CUDA
+  // kernel
+  // module running".
+  //
+  // Note: we use non-Google-File:: API here because we may be called before
+  // InitGoogle has completed.
+  static void LogDiagnosticInformation();
+
+  // Given the driver version file contents, finds the kernel module version and
+  // returns it as a string.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
+      const string& driver_version_file_contents);
+
+  // Extracts the kernel driver version from the current host.
+  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
+
+  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+  // driver-interfacing DSO version number. Returns it as a string.
+  static port::StatusOr<DriverVersion> FindDsoVersion();
+
+  // Logs information about the kernel driver version and userspace driver
+  // library version.
+  static void LogDriverVersionInformation();
+
+ private:
+  // Given the DSO version number and the driver version file contents, extracts
+  // the driver version and compares, warning the user in the case of
+  // incompatibility.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static void WarnOnDsoKernelMismatch(
+      port::StatusOr<DriverVersion> dso_version,
+      port::StatusOr<DriverVersion> kernel_version);
+
+  // Logs information about the dev nodes present on this machine: their
+  // existence, permissions, accessibility from this uid/gid.
+  static void LogDevNodeDiagnosticInformation();
+
+  static string GetDevNodePath(int dev_node_ordinal);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5ef48db4704015c51fb1d0e203e541b6b79afc5
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -0,0 +1,525 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// CUDA userspace driver library wrapper functionality.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
+
+#include <stddef.h>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/device_options.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Identifies the memory space where an allocation resides. See
+// GpuDriver::GetPointerMemorySpace().
+enum class MemorySpace { kHost, kDevice };
+
+// Returns a casual string, such as "host" for the provided memory space.
+string MemorySpaceString(MemorySpace memory_space);
+
+class GpuContext;
+
+// GpuDriver contains wrappers for calls to the userspace library driver. It's
+// useful to isolate these calls and put basic wrappers around them to separate
+// userspace library driver behaviors from the rest of the program.
+//
+// At the moment it's simply used as a namespace.
+//
+// The calls log any specific errors internally and return whether the operation
+// was successful to the caller.
+//
+// The order of parameters is generally kept symmetric with the underlying CUDA
+// driver API.
+//
+// Links on functions are to specific documentation under
+// http://docs.nvidia.com/cuda/cuda-driver-api/
+//
+// Thread safety: these functions should not be used from signal handlers.
+class GpuDriver {
+ public:
+  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
+  // the case of failure. Safe to call multiple times; will be fast on all calls
+  // after the first.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
+  static port::Status Init();
+
+  // Returns the device associated with the given context.
+  // device is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
+  static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);
+
+  // Creates a new CUDA stream associated with the given context via
+  // cuStreamCreate.
+  // stream is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
+  static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);
+
+  // Destroys a CUDA stream associated with the given context.
+  // stream is owned by the caller, must not be null, and *stream is set to null
+  // if the stream is successfully destroyed.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
+  static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);
+
+  // CUDA events can explicitly disable event TSC retrieval for some presumed
+  // performance improvement if timing is unnecessary.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  enum class EventFlags { kDefault, kDisableTiming };
+
+  // Creates a new event associated with the given context.
+  // result is an outparam owned by the caller and must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
+                                  EventFlags flags);
+
+  // Destroys *event and turns it into a nullptr. event may not be null, but
+  // *event may be, via cuEventDestroy
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
+  static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);
+
+  // Allocates a GPU memory space of size bytes associated with the given
+  // context via cuMemAlloc.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
+  static void* DeviceAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a GPU memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  static void DeviceDeallocate(GpuContext* context, void* location);
+
+  // Allocates a unified memory space of size bytes associated with the given
+  // context via cuMemAllocManaged.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+  // (supported on CUDA only)
+  static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a unified memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  // (supported on CUDA only)
+  static void UnifiedMemoryDeallocate(GpuContext* context, void* location);
+
+  // Allocates page-locked and CUDA-registered memory on the host via
+  // cuMemAllocHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
+  static void* HostAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
+  static void HostDeallocate(GpuContext* context, void* location);
+
+  // Registers a memory region at location of size bytes via cuMemHostRegister.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
+  static bool HostRegister(GpuContext* context, void* location, uint64 bytes);
+
+  // Unregisters a memory region that was previously registered at location via
+  // cuMemHostUnregister.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
+  //
+  // TODO(leary) verify an error will be returned if the location wasn't
+  // previously registered.
+  static bool HostUnregister(GpuContext* context, void* location);
+
+  // Given a device ordinal, returns a device handle into the device outparam,
+  // which must not be null.
+  //
+  // N.B. these device handles do not have a corresponding destroy function in
+  // the CUDA driver API.
+  static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);
+
+  // Given a device handle, returns the name reported by the driver for the
+  // device.
+  static bool GetDeviceName(GpuDeviceHandle device, string* device_name);
+
+  // Given a device to create a context for, returns a context handle into the
+  // context outparam, which must not be null.
+  //
+  // N.B. CUDA contexts are weird. They are implicitly associated with the
+  // calling thread. Current documentation on contexts and their influence on
+  // userspace processes is given here:
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
+  static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
+                                    const DeviceOptions& device_options,
+                                    GpuContext** context);
+
+  // Destroys the provided context via cuCtxDestroy.
+  // Don't do this while clients could still be using the context, per the docs
+  // bad things will happen.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
+  static void DestroyContext(GpuContext* context);
+
+  // Queries the runtime for the specified attribute of the specified function.
+  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
+  // in terms of integer-sized values, so there's no potential for overrun (as
+  // of CUDA 5.5).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
+  static bool FuncGetAttribute(GpuFunctionAttribute attribute,
+                               GpuFunctionHandle function,
+                               int* attribute_value);
+
+  // Sets the preferred cache configuration for the specified function.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
+  static bool FuncSetCacheConfig(GpuFunctionHandle function,
+                                 GpuFuncCachePreference cache_config);
+
+  // Gets the preferred shared memory bank configuration for the specified
+  // CONTEXT (not function!), either default or four- or eight-byte bank size.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
+  static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
+      GpuContext* context);
+
+  // Sets the preferred shared memory bank configuration for the specified
+  // CONTEXT (not function!), either default or four- or eight-byte bank size.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
+  static port::Status ContextSetSharedMemConfig(
+      GpuContext* context, GpuSharedMemConfig shared_mem_config);
+
+  // Launches a CUDA kernel via cuLaunchKernel.
+  // TODO(leary) describe the structure of kernel_params and extra in a readable
+  // way.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
+  static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
+                           unsigned int grid_dim_x, unsigned int grid_dim_y,
+                           unsigned int grid_dim_z, unsigned int block_dim_x,
+                           unsigned int block_dim_y, unsigned int block_dim_z,
+                           unsigned int shared_mem_bytes,
+                           GpuStreamHandle stream, void** kernel_params,
+                           void** extra);
+
+  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
+  // handle in "module". Any error logs that are produced are logged internally.
+  // (supported on CUDA only)
+  static bool LoadPtx(GpuContext* context, const char* ptx_contents,
+                      GpuModuleHandle* module);
+
+  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
+  // the resulting handle in "module".
+  // (supported on CUDA only)
+  static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
+                                GpuModuleHandle* module);
+
+  // Loads HSACO with the ROCM runtime and stores the resulting handle in
+  // "module". Any error logs that are produced are logged internally.
+  // (supported on ROCm only)
+  static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
+                        GpuModuleHandle* module);
+
+  // Retrieves a named kernel from a loaded module, and places the resulting
+  // handle into function (outparam) on success. Neither kernel_name nor
+  // function may be null. No ownership is taken of kernel_name.
+  static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
+                                const char* kernel_name,
+                                GpuFunctionHandle* function);
+
+  // Retrieves a named global/constant symbol from a loaded module, and returns
+  // a device pointer and size of the symbol on success. symbol_name may not be
+  // null. At least one of dptr or bytes should not be null. No ownership is
+  // taken of symbol_name.
+  static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
+                              const char* symbol_name, GpuDevicePtr* dptr,
+                              size_t* bytes);
+
+  // Unloads module from the current context via cuModuleUnload.
+  // TODO(leary) the documentation doesn't say what kind of disasters happen
+  // if you try to unload a module while its GpuFunctionHandles are in use.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
+  static void UnloadModule(GpuContext* context, GpuModuleHandle module);
+
+  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
+  static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
+                                     uint8 value, size_t size);
+
+  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
+  static bool SynchronousMemsetUint32(GpuContext* context,
+                                      GpuDevicePtr location, uint32 value,
+                                      size_t uint32_count);
+
+  // Performs an asynchronous memset of the device memory segment via
+  // cuMemsetD8Async.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
+  static bool AsynchronousMemsetUint8(GpuContext* context,
+                                      GpuDevicePtr location, uint8 value,
+                                      size_t uint32_count,
+                                      GpuStreamHandle stream);
+
+  // Performs an asynchronous memset of the device memory segment via
+  // cuMemsetD32Async.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
+  static bool AsynchronousMemsetUint32(GpuContext* context,
+                                       GpuDevicePtr location, uint32 value,
+                                       size_t uint32_count,
+                                       GpuStreamHandle stream);
+
+  // -- Synchronous memcopies.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
+
+  static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
+                                           GpuDevicePtr gpu_src, uint64 size);
+  static port::Status SynchronousMemcpyH2D(GpuContext* context,
+                                           GpuDevicePtr gpu_dst,
+                                           const void* host_src, uint64 size);
+  static port::Status SynchronousMemcpyD2D(GpuContext* context,
+                                           GpuDevicePtr gpu_dst,
+                                           GpuDevicePtr gpu_src, uint64 size);
+
+  // -- Asynchronous memcopies.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
+
+  static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
+                                    GpuDevicePtr gpu_src, uint64 size,
+                                    GpuStreamHandle stream);
+  static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
+                                    const void* host_src, uint64 size,
+                                    GpuStreamHandle stream);
+  static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
+                                    GpuDevicePtr gpu_src, uint64 size,
+                                    GpuStreamHandle stream);
+
+  // The CUDA stream callback type signature.
+  // The data passed to AddStreamCallback is subsequently passed to this
+  // callback when it fires.
+  //
+  // Some notable things:
+  // * Callbacks must not make any CUDA API calls.
+  // * Callbacks from independent streams execute in an undefined order and may
+  //   be serialized.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
+  typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
+                                 void* data);
+
+  // Enqueues a callback operation into stream.
+  // See StreamCallback above and the NVIDIA documentation for additional
+  // details.
+  static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
+                                StreamCallback callback, void* data);
+
+  // Causes stream to wait for event to trigger before proceeding via
+  // cuStreamWaitEvent.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
+  static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
+                                GpuEventHandle event);
+
+  // Blocks the calling thread until the operations enqueued onto stream have
+  // been completed, via cuStreamSynchronize.
+  //
+  // TODO(leary) if a pathological thread enqueues operations onto the stream
+  // while another thread blocks like this, can you wind up waiting an unbounded
+  // amount of time?
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
+  static port::Status SynchronizeStream(GpuContext* context,
+                                        GpuStreamHandle stream);
+
+  // Blocks the calling thread until the operations associated with the context
+  // have been completed, via cuCtxSynchronize.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
+  static bool SynchronizeContext(GpuContext* context);
+
+  // Returns true if all stream tasks have completed at time of the call. Note
+  // the potential for races around this call (if another thread adds work to
+  // the stream immediately after this returns).
+  static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);
+
+  // Returns whether code in the from context can access memory in the to
+  // context via cuDeviceCanAccessPeer.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
+
+  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
+  static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
+
+  // Returns the elapsed milliseconds between start and stop via
+  // cuEventElapsedTime.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
+  static bool GetEventElapsedTime(GpuContext* context,
+                                  float* elapsed_milliseconds,
+                                  GpuEventHandle start, GpuEventHandle stop);
+
+  // Records that an event occurred when execution reaches the current point in
+  // thestream via cuEventRecord.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
+  static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
+                                  GpuStreamHandle stream);
+
+  // Polls (without blocking) to determine the status of an event - pending or
+  // complete (or an error status).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
+  static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
+                                              GpuEventHandle event);
+
+  // -- Pointer-specific calls.
+
+  // Returns the context in which pointer was allocated or registered.
+  static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);
+
+  // Returns the device associated with the context from GetPointerContext().
+  static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);
+
+  // Returns the memory space addressed by pointer.
+  static port::StatusOr<MemorySpace> GetPointerMemorySpace(
+      GpuDevicePtr pointer);
+
+  // Returns the base address and size of the device pointer dptr.
+  static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
+                                             GpuDevicePtr* base, size_t* size);
+
+  // -- Device-specific calls.
+
+  // Returns the compute capability for the device; i.e (3, 5).
+  // This is currently done via the deprecated device API.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
+  // (supported on CUDA only)
+  static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
+                                           GpuDeviceHandle device);
+
+  // Returns Gpu ISA version for the device; i.e 803, 900.
+  // (supported on ROCm only)
+  static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);
+
+  // Returns the number of multiprocessors on the device (note that the device
+  // may be multi-GPU-per-board).
+  static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);
+
+  // Returns the limit on number of threads that can be resident in a single
+  // multiprocessor.
+  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
+      GpuDeviceHandle device);
+
+  // Returns the limit on number of threads which may be resident for a single
+  // block (cooperative thread array).
+  static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);
+
+  // Returns the amount of shared memory available on a single GPU core (i.e.
+  // SM on NVIDIA devices).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
+      GpuDeviceHandle device);
+
+  // Returns the amount of shared memory available for a single block
+  // (cooperative thread array).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
+      GpuDeviceHandle device);
+
+  // Returns the maximum supported number of registers per block.
+  static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);
+
+  // Returns the number of threads per warp.
+  static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);
+
+  // Queries the grid limits for device with cuDeviceGetAttribute calls.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);
+
+  // Returns a grab-bag of device properties in a caller-owned device_properties
+  // structure for device_ordinal via cuDeviceGetProperties.
+  //
+  // This call is deprecated in the NVIDIA driver API; its replacement is
+  // GetDeviceAttribute
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
+  static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
+                                  int device_ordinal);
+
+  // Gets a specific integer-valued property about the given device.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
+                                                GpuDeviceHandle device);
+
+  // Returns whether ECC is enabled for the given GpuDeviceHandle via
+  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool IsEccEnabled(GpuDeviceHandle device, bool* result);
+
+  // Returns the total amount of memory available for allocation by the CUDA
+  // context, in bytes, via cuDeviceTotalMem.
+  static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);
+
+  // Returns the free amount of memory and total amount of memory, as reported
+  // by cuMemGetInfo.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
+  static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
+                                  int64* total);
+
+  // Returns a PCI bus id string for the device.
+  // [domain]:[bus]:[device].[function]
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
+  static string GetPCIBusID(GpuDeviceHandle device);
+
+  // -- Context- and device-independent calls.
+
+  // Returns the number of visible CUDA device via cuDeviceGetCount.
+  // This should correspond to the set of device ordinals available.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
+  static int GetDeviceCount();
+
+  // Returns the driver version number via cuDriverGetVersion.
+  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
+  // instead, the CUDA toolkit release number that this driver is compatible
+  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
+  // compatible driver).
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
+  static bool GetDriverVersion(int* driver_version);
+
+  // -- Other calls
+
+  // Returns the maximum number of blocks (per multiprocessor) occupied by the
+  // specified kernel/GpuFunctionHandle when launched with the specified
+  // parameters.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
+  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
+      GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
+      size_t dynamic_shared_memory_bytes);
+
+  // Seam for injecting an error at CUDA initialization time for testing
+  // purposes.
+  static bool driver_inject_init_error_;
+};
+
+// Ensures a context is activated within a scope.
+class ScopedActivateContext {
+ public:
+  // Activates the context via cuCtxSetCurrent, if it is not the currently
+  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
+  // mechanism is said by NVIDIA to be relatively slow and deprecated.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
+  explicit ScopedActivateContext(GpuContext* context);
+
+  // Checks that the context has remained activated for the duration of the
+  // scope.
+  ~ScopedActivateContext();
+
+ private:
+  GpuContext* to_restore_ = nullptr;
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_event.cc b/tensorflow/stream_executor/gpu/gpu_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a523958550d10c13624b729076a3fd271e68243a
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_event.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+GpuEvent::GpuEvent(GpuExecutor* parent)
+    : parent_(parent), gpu_event_(nullptr) {}
+
+GpuEvent::~GpuEvent() {}
+
+port::Status GpuEvent::Init() {
+  return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
+                                GpuDriver::EventFlags::kDisableTiming);
+}
+
+port::Status GpuEvent::Destroy() {
+  return GpuDriver::DestroyEvent(parent_->gpu_context(), &gpu_event_);
+}
+
+port::Status GpuEvent::Record(GpuStream* stream) {
+  return GpuDriver::RecordEvent(parent_->gpu_context(), gpu_event_,
+                                stream->gpu_stream());
+}
+
+GpuEventHandle GpuEvent::gpu_event() { return gpu_event_; }
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_event.h b/tensorflow/stream_executor/gpu/gpu_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..61f39d42fe7344b3b092b8fbcc5615da99564300
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_event.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
+
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// GpuEvent wraps a GpuEventHandle in the platform-independent EventInterface
+// interface.
+class GpuEvent : public internal::EventInterface {
+ public:
+  explicit GpuEvent(GpuExecutor* parent);
+
+  ~GpuEvent() override;
+
+  // Populates the CUDA-platform-specific elements of this object.
+  port::Status Init();
+
+  // Deallocates any platform-specific elements of this object. This is broken
+  // out (not part of the destructor) to allow for error reporting.
+  port::Status Destroy();
+
+  // Inserts the event at the current position into the specified stream.
+  port::Status Record(GpuStream* stream);
+
+  // Polls the CUDA platform for the event's current status.
+  Event::Status PollForStatus();
+
+  // The underlying CUDA event element.
+  GpuEventHandle gpu_event();
+
+ private:
+  // The Executor used to which this object and GpuEventHandle are bound.
+  GpuExecutor* parent_;
+
+  // The underlying CUDA event element.
+  GpuEventHandle gpu_event_;
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_executor.h b/tensorflow/stream_executor/gpu/gpu_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f969a98d2f42b5be0f6d29e8e19c006540e3b8b
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_executor.h
@@ -0,0 +1,347 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
+
+#include <set>
+#include <unordered_map>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// CUDA-platform implementation of the platform-agnostic
+// StreamExecutorInferface.
+class GpuExecutor : public internal::StreamExecutorInterface {
+ public:
+  // sub_platform indicates the subplatform used in this executor; it must
+  // be a CUDA type.
+  explicit GpuExecutor(const PluginConfig& plugin_config)
+      : device_(0),
+        context_(nullptr),
+        device_ordinal_(0),
+        cc_major_(0),
+        cc_minor_(0),
+        version_(0),
+        plugin_config_(plugin_config) {}
+
+  // See the corresponding StreamExecutor methods for method comments on the
+  // following overrides.
+
+  ~GpuExecutor() override;
+
+  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
+
+  bool GetKernel(const MultiKernelLoaderSpec& spec,
+                 KernelBase* kernel) override;
+  // (supported on CUDA only)
+  void UnloadKernel(const KernelBase* kernel) override;
+  bool LoadModule(const MultiModuleLoaderSpec& spec,
+                  ModuleHandle* module_handle) override;
+  bool UnloadModule(ModuleHandle module_handle) override;
+
+  bool Launch(Stream* stream, const ThreadDim& thread_dims,
+              const BlockDim& block_dims, const KernelBase& k,
+              const KernelArgsArrayBase& args) override;
+
+  // (supported on CUDA only)
+  int CalculateOccupancy(const DeviceDescription& device_description,
+                         uint64 registers_per_thread,
+                         uint64 shared_memory_per_block,
+                         const ThreadDim& thread_dims, GpuFunctionHandle func);
+
+  // (supported on CUDA only)
+  int CompareOccupancy(int* initial_blocks,
+                       const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, GpuFunctionHandle func);
+
+  void* Allocate(uint64 size) override;
+
+  void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                          uint64 size_bytes) override;
+
+  void Deallocate(DeviceMemoryBase* mem) override;
+
+  void* UnifiedMemoryAllocate(uint64 size) override {
+    return GpuDriver::UnifiedMemoryAllocate(context_, size);
+  }
+
+  void UnifiedMemoryDeallocate(void* location) override {
+    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
+  }
+
+  // CUDA allocation/registration functions are necessary because the driver
+  // internally sets up buffers for DMA operations (and page locks them).
+  // There's no external interface for us to otherwise control these DMA
+  // settings.
+  void* HostMemoryAllocate(uint64 size) override {
+    return GpuDriver::HostAllocate(context_, size);
+  }
+
+  void HostMemoryDeallocate(void* location) override {
+    return GpuDriver::HostDeallocate(context_, location);
+  }
+
+  bool HostMemoryRegister(void* location, uint64 size) override;
+
+  bool HostMemoryUnregister(void* location) override;
+
+  bool SynchronizeAllActivity() override;
+
+  bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
+
+  bool SynchronousMemSet(DeviceMemoryBase* location, int value,
+                         uint64 size) override;
+
+  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                 const void* host_src, uint64 size) override;
+
+  port::Status SynchronousMemcpy(void* host_dst,
+                                 const DeviceMemoryBase& gpu_src,
+                                 uint64 size) override;
+
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
+                                               const DeviceMemoryBase& gpu_src,
+                                               uint64 size) override;
+
+  bool MemZero(Stream* stream, DeviceMemoryBase* location,
+               uint64 size) override;
+  bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
+              uint64 size) override;
+  bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
+                uint64 size) override;
+
+  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
+              uint64 size) override;
+
+  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
+              uint64 size) override;
+
+  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
+                            const DeviceMemoryBase& gpu_src,
+                            uint64 size) override;
+
+  bool HostCallback(Stream* stream,
+                    std::function<port::Status()> callback) override;
+
+  bool AllocateStream(Stream* stream) override;
+
+  void DeallocateStream(Stream* stream) override;
+
+  bool CreateStreamDependency(Stream* dependent, Stream* other) override;
+
+  bool AllocateTimer(Timer* timer) override;
+
+  void DeallocateTimer(Timer* timer) override;
+
+  bool StartTimer(Stream* stream, Timer* timer) override;
+
+  bool StopTimer(Stream* stream, Timer* timer) override;
+
+  port::Status AllocateEvent(Event* event) override;
+
+  port::Status DeallocateEvent(Event* event) override;
+
+  port::Status RecordEvent(Stream* stream, Event* event) override;
+
+  port::Status WaitForEvent(Stream* stream, Event* event) override;
+
+  Event::Status PollForEventStatus(Event* event) override;
+
+  port::Status BlockHostUntilDone(Stream* stream) override;
+
+  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
+
+  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
+
+  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
+
+  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
+
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
+
+  bool DeviceMemoryUsage(int64* free, int64* total) const override;
+
+  // Search for the symbol and returns a device pointer and size.
+  // Returns false if symbol does not exist.
+  bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
+                 void** mem, size_t* bytes) override;
+
+  DeviceDescription* PopulateDeviceDescription() const override;
+
+  // Populates the block_dim_limit by querying the device driver API. If an
+  // error occurs at any point while asking the driver for block dim limits, it
+  // will be only partially populated as a result, and an error will be logged.
+  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
+
+  bool SupportsBlas() const override;
+
+  blas::BlasSupport* CreateBlas() override;
+
+  bool SupportsFft() const override;
+
+  fft::FftSupport* CreateFft() override;
+
+  bool SupportsRng() const override;
+
+  rng::RngSupport* CreateRng() override;
+
+  bool SupportsDnn() const override;
+
+  dnn::DnnSupport* CreateDnn() override;
+
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override;
+
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override;
+
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
+
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
+
+  void* GpuContextHack() override;
+
+  GpuContext* gpu_context();
+
+ private:
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for compute-capability-specific suffixed versions; i.e.
+  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
+  // we're on a compute capability 3.0 machine.
+  // (supported on CUDA only)
+  bool FindOnDiskForComputeCapability(absl::string_view filename,
+                                      absl::string_view canonical_suffix,
+                                      string* found_filename) const;
+
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for AMDGPU ISA-specific suffixed versions.
+  // (supported on ROCm only)
+
+  bool FindOnDiskForISAVersion(absl::string_view filename,
+                               absl::string_view canonical_suffix,
+                               string* found_filename) const;
+
+  // Host callback landing routine invoked by CUDA.
+  // data: User-provided callback provided to HostCallback() above, captured
+  //       as a std::function<void()>. Allocated/initialized inside
+  //       HostCallback() and owned and deleted by this call.
+  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
+                                   void* data);
+
+  // Collects metadata for the specified kernel.
+  bool GetKernelMetadata(GpuKernel* cuda_kernel,
+                         KernelMetadata* kernel_metadata);
+
+  // Prints to VLOG(2) information about the kernel's occupancy and how it might
+  // be improved.
+  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims);
+
+  // (supported on CUDA only)
+  bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
+  // (supported on CUDA only)
+  bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // (supported on ROCm only)
+  bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  bool UnloadGpuBinary(const void* gpu_binary)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Guards the on-disk-module mapping.
+  mutex disk_modules_mu_;
+
+  // Mapping from filename to GPUModuleHandle, if it was already retrieved.
+  // Multiple GPUFunctionHandle are usually obtained from a single
+  // GPUModuleHandle so we attempt to hit in this mapping first, before
+  // retrieving it.
+  std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
+
+  // Guards the in-memory-module mapping.
+  mutex in_memory_modules_mu_;
+
+  std::map<const char*, GpuModuleHandle> in_memory_modules_
+      GUARDED_BY(in_memory_modules_mu_);
+
+  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
+  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
+      GUARDED_BY(in_memory_modules_mu_);
+  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
+  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
+      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
+
+  // Guards the launched kernel set.
+  mutex launched_kernels_mu_;
+
+  // Keeps track of the set of launched kernels. Currently used to suppress the
+  // occupancy check on subsequent launches.
+  std::set<GpuFunctionHandle> launched_kernels_
+      GUARDED_BY(launched_kernels_mu_);
+
+  // Handle for the CUDA device being operated on. Immutable
+  // post-initialization.
+  GpuDeviceHandle device_;
+
+  // Handle for session with the library/driver. Immutable post-initialization.
+  GpuContext* context_;
+
+  // The device ordinal value that this executor was initialized with; recorded
+  // for use in getting device metadata. Immutable post-initialization.
+  int device_ordinal_;
+
+  // The major verion of the compute capability for device_.
+  int cc_major_;
+
+  // The minor verion of the compute capability for device_.
+  int cc_minor_;
+
+  // GPU ISA version for device_.
+  int version_;
+
+  // The plugin configuration associated with this instance.
+  PluginConfig plugin_config_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_helpers.h b/tensorflow/stream_executor/gpu/gpu_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..117a71718f269d8ffd724d55ae269fea95dac366
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_helpers.h
@@ -0,0 +1,107 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Common helper functions used for dealing with CUDA API datatypes.
+//
+// These are typically placed here for use by multiple source components (for
+// example, BLAS and executor components).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
+
+#include <stddef.h>
+#include <complex>
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+
+template <typename ElemT>
+class DeviceMemory;
+
+namespace gpu {
+
+// Converts a const DeviceMemory reference to its underlying typed pointer in
+// CUDA
+// device memory.
+template <typename T>
+const T* GpuMemory(const DeviceMemory<T>& mem) {
+  return static_cast<const T*>(mem.opaque());
+}
+
+// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
+// pointer in CUDA device memory.
+template <typename T>
+T* GpuMemoryMutable(DeviceMemory<T>* mem) {
+  return static_cast<T*>(mem->opaque());
+}
+
+static_assert(
+    sizeof(std::complex<float>) == sizeof(GpuComplexType),
+    "std::complex<float> and GpuComplexType should have the same size");
+static_assert(offsetof(GpuComplexType, x) == 0,
+              "The real part of GpuComplexType should appear first.");
+static_assert(
+    sizeof(std::complex<double>) == sizeof(GpuDoubleComplexType),
+    "std::complex<double> and GpuDoubleComplexType should have the same "
+    "size");
+static_assert(offsetof(GpuDoubleComplexType, x) == 0,
+              "The real part of GpuDoubleComplexType should appear first.");
+
+// Type traits to get CUDA complex types from std::complex<>.
+
+template <typename T>
+struct GpuComplexT {
+  typedef T type;
+};
+
+template <>
+struct GpuComplexT<std::complex<float>> {
+  typedef GpuComplexType type;
+};
+
+template <>
+struct GpuComplexT<std::complex<double>> {
+  typedef GpuDoubleComplexType type;
+};
+
+// Converts pointers of std::complex<> to pointers of
+// GpuComplexType/GpuDoubleComplexType. No type conversion for non-complex
+// types.
+
+template <typename T>
+inline const typename GpuComplexT<T>::type* GpuComplex(const T* p) {
+  return reinterpret_cast<const typename GpuComplexT<T>::type*>(p);
+}
+
+template <typename T>
+inline typename GpuComplexT<T>::type* GpuComplex(T* p) {
+  return reinterpret_cast<typename GpuComplexT<T>::type*>(p);
+}
+
+// Converts values of std::complex<float/double> to values of
+// GpuComplexType/GpuDoubleComplexType.
+inline GpuComplexType GpuComplexValue(std::complex<float> val) {
+  return {val.real(), val.imag()};
+}
+
+inline GpuDoubleComplexType GpuComplexValue(std::complex<double> val) {
+  return {val.real(), val.imag()};
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_kernel.h b/tensorflow/stream_executor/gpu/gpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b89b20097323c02fc9cf7492d54657789956ca7
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_kernel.h
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Wraps a GpuFunctionHandle to implement the platform-independent
+// KernelInterface.
+class GpuKernel : public internal::KernelInterface {
+ public:
+  GpuKernel()
+      : gpu_function_(nullptr),
+        arity_(0),
+        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
+
+  // Note that the function is unloaded when the module is unloaded, and the
+  // module that the function is contained in is owned by the GpuExecutor.
+  ~GpuKernel() override {}
+
+  // As arity cannot be reflected upon using the CUDA API, the arity is
+  // explicitly set during the GpuExecutor::GetKernel initialization process.
+  void set_arity(unsigned arity) { arity_ = arity; }
+  unsigned Arity() const override { return arity_; }
+
+  // Returns the GpuFunctionHandle value for passing to the CUDA API.
+  GpuFunctionHandle AsGpuFunctionHandle() const {
+    DCHECK(gpu_function_ != nullptr);
+    return const_cast<GpuFunctionHandle>(gpu_function_);
+  }
+
+  // Returns the slot that the GpuFunctionHandle is stored within for this
+  // object, for the CUDA API which wants to load into a GpuFunctionHandle*.
+  GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }
+
+  // CUDA supports setting the preferred cache configuration of a
+  // GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support this
+  // via the below functions; users can set a preference, and that is applied
+  // when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The alternative
+  // would be to load the kernel & set the preference when the user calls the
+  // setter below; either approach is valid. Sets the current kernel cache
+  // configuration preference.
+  void SetPreferredCacheConfig(KernelCacheConfig config) override {
+    preferred_cache_config_ = config;
+  }
+
+  // Returns the current kernel cache configuration preference.
+  KernelCacheConfig GetPreferredCacheConfig() const override {
+    return preferred_cache_config_;
+  }
+
+  // Returns the current kernel cache configuration preference as a
+  // CUfunc_cache.
+  GpuFuncCachePreference GetGpuCacheConfig() const;
+
+ private:
+  GpuFunctionHandle gpu_function_;  // Wrapped CUDA kernel handle.
+  unsigned arity_;  // Number of formal parameters the kernel takes.
+
+  // Preferred (but not required) cache configuration for this kernel.
+  KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
+  return static_cast<const GpuKernel*>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
+  return static_cast<GpuKernel*>(kernel->implementation());
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_rng.h b/tensorflow/stream_executor/gpu/gpu_rng.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4bf1e1963044a9a54fb92b6a324d3fadd5e6c0b
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_rng.h
@@ -0,0 +1,125 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
+
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rng.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace gpu {
+
+// Opaque and unique identifier for the GPU RNG plugin.
+extern const PluginId kGpuRandPlugin;
+
+class GpuExecutor;
+
+// GPU-platform implementation of the random number generation support
+// interface.
+//
+// Thread-safe post-initialization.
+class GpuRng : public rng::RngSupport {
+ public:
+  explicit GpuRng(GpuExecutor* parent);
+
+  // Retrieves a gpu rng library generator handle. This is necessary for
+  // enqueuing random number generation work onto the device.
+  // TODO(leary) provide a way for users to select the RNG algorithm.
+  bool Init();
+
+  // Releases a gpu rng library generator handle, if one was acquired.
+  ~GpuRng() override;
+
+  // See rng::RngSupport for details on the following overrides.
+  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) override;
+  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) override;
+  bool DoPopulateRandUniform(Stream* stream,
+                             DeviceMemory<std::complex<float>>* v) override;
+  bool DoPopulateRandUniform(Stream* stream,
+                             DeviceMemory<std::complex<double>>* v) override;
+  bool DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                              DeviceMemory<float>* v) override;
+  bool DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                              DeviceMemory<double>* v) override;
+
+  bool SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) override;
+
+ private:
+  // Actually performs the work of generating random numbers - the public
+  // methods are thin wrappers to this interface.
+  template <typename T>
+  bool DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v);
+  template <typename ElemT, typename FuncT>
+  bool DoPopulateRandGaussianInternal(Stream* stream, ElemT mean, ElemT stddev,
+                                      DeviceMemory<ElemT>* v, FuncT func);
+
+  // Sets the stream for the internal gpu rng generator.
+  //
+  // This is a stateful operation, as the handle can only have one stream set at
+  // a given time, so it is usually performed right before enqueuing work to do
+  // with random number generation.
+  bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // mutex that guards the gpu rng library handle for this device.
+  mutex mu_;
+
+  // GpuExecutor which instantiated this GpuRng.
+  // Immutable post-initialization.
+  GpuExecutor* parent_;
+
+  // gpu rng library handle on the device.
+  GpuRngHandle rng_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(GpuRng);
+};
+
+template <typename T>
+string TypeString();
+
+template <>
+string TypeString<float>() {
+  return "float";
+}
+
+template <>
+string TypeString<double>() {
+  return "double";
+}
+
+template <>
+string TypeString<std::complex<float>>() {
+  return "std::complex<float>";
+}
+
+template <>
+string TypeString<std::complex<double>>() {
+  return "std::complex<double>";
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc
similarity index 51%
rename from tensorflow/stream_executor/cuda/cuda_stream.cc
rename to tensorflow/stream_executor/gpu/gpu_stream.cc
index b5aa7694f7e1d8d47f3252d3ba679292155119b5..f43500370fc6a7a3e919d2c7af0a92e98100284b 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.cc
+++ b/tensorflow/stream_executor/gpu/gpu_stream.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,49 +13,49 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/stream.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-bool CUDAStream::Init() {
-  if (!CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_)) {
+bool GpuStream::Init() {
+  if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
     return false;
   }
-  return CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
-                                 CUDADriver::EventFlags::kDisableTiming)
+  return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
+                                GpuDriver::EventFlags::kDisableTiming)
       .ok();
 }
 
-void CUDAStream::Destroy() {
+void GpuStream::Destroy() {
   if (completed_event_ != nullptr) {
     port::Status status =
-        CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
+        GpuDriver::DestroyEvent(parent_->gpu_context(), &completed_event_);
     if (!status.ok()) {
       LOG(ERROR) << status.error_message();
     }
   }
 
-  CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
+  GpuDriver::DestroyStream(parent_->gpu_context(), &gpu_stream_);
 }
 
-bool CUDAStream::IsIdle() const {
-  return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
+bool GpuStream::IsIdle() const {
+  return GpuDriver::IsStreamIdle(parent_->gpu_context(), gpu_stream_);
 }
 
-CUDAStream *AsCUDAStream(Stream *stream) {
+GpuStream* AsGpuStream(Stream* stream) {
   DCHECK(stream != nullptr);
-  return static_cast<CUDAStream *>(stream->implementation());
+  return static_cast<GpuStream*>(stream->implementation());
 }
 
-CUstream AsCUDAStreamValue(Stream *stream) {
+GpuStreamHandle AsGpuStreamValue(Stream* stream) {
   DCHECK(stream != nullptr);
-  return AsCUDAStream(stream)->cuda_stream();
+  return AsGpuStream(stream)->gpu_stream();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..c38f6c132a571bb42b31c9649440fd0ff2aaa777
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_stream.h
@@ -0,0 +1,96 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the GpuStream type - the CUDA-specific implementation of the generic
+// StreamExecutor Stream interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+
+// Wraps a GpuStreamHandle in order to satisfy the platform-independent
+// StreamInterface.
+//
+// Thread-safe post-initialization.
+class GpuStream : public internal::StreamInterface {
+ public:
+  explicit GpuStream(GpuExecutor* parent)
+      : parent_(parent), gpu_stream_(nullptr), completed_event_(nullptr) {}
+
+  // Note: teardown is handled by a parent's call to DeallocateStream.
+  ~GpuStream() override {}
+
+  void* GpuStreamHack() override { return gpu_stream_; }
+  void** GpuStreamMemberHack() override {
+    return reinterpret_cast<void**>(&gpu_stream_);
+  }
+
+  // Explicitly initialize the CUDA resources associated with this stream, used
+  // by StreamExecutor::AllocateStream().
+  bool Init();
+
+  // Explicitly destroy the CUDA resources associated with this stream, used by
+  // StreamExecutor::DeallocateStream().
+  void Destroy();
+
+  // Returns true if no work is pending or executing on the stream.
+  bool IsIdle() const;
+
+  // Retrieves an event which indicates that all work enqueued into the stream
+  // has completed. Ownership of the event is not transferred to the caller, the
+  // event is owned by this stream.
+  GpuEventHandle* completed_event() { return &completed_event_; }
+
+  // Returns the GpuStreamHandle value for passing to the CUDA API.
+  //
+  // Precond: this GpuStream has been allocated (otherwise passing a nullptr
+  // into the NVIDIA library causes difficult-to-understand faults).
+  GpuStreamHandle gpu_stream() const {
+    DCHECK(gpu_stream_ != nullptr);
+    return const_cast<GpuStreamHandle>(gpu_stream_);
+  }
+
+  // TODO(timshen): Migrate away and remove this function.
+  GpuStreamHandle cuda_stream() const { return gpu_stream(); }
+
+  GpuExecutor* parent() const { return parent_; }
+
+ private:
+  GpuExecutor* parent_;         // Executor that spawned this stream.
+  GpuStreamHandle gpu_stream_;  // Wrapped CUDA stream handle.
+
+  // Event that indicates this stream has completed.
+  GpuEventHandle completed_event_ = nullptr;
+};
+
+// Helper functions to simplify extremely common flows.
+// Converts a Stream to the underlying GpuStream implementation.
+GpuStream* AsGpuStream(Stream* stream);
+
+// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
+GpuStreamHandle AsGpuStreamValue(Stream* stream);
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/gpu/gpu_timer.cc
similarity index 51%
rename from tensorflow/stream_executor/cuda/cuda_timer.cc
rename to tensorflow/stream_executor/gpu/gpu_timer.cc
index 991a12a23d632bd9fb4c97a340e244f6ffb4c7d3..cc4b50d9baa0af70410baad582d210e90bdb7b03 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/gpu/gpu_timer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,31 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_timer.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-bool CUDATimer::Init() {
+bool GpuTimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
-  CudaContext* context = parent_->cuda_context();
-  port::Status status = CUDADriver::CreateEvent(
-      context, &start_event_, CUDADriver::EventFlags::kDefault);
+  GpuContext* context = parent_->gpu_context();
+  port::Status status = GpuDriver::CreateEvent(context, &start_event_,
+                                               GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return false;
   }
 
-  status = CUDADriver::CreateEvent(context, &stop_event_,
-                                   CUDADriver::EventFlags::kDefault);
+  status = GpuDriver::CreateEvent(context, &stop_event_,
+                                  GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
-    status = CUDADriver::DestroyEvent(context, &start_event_);
+    status = GpuDriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
       LOG(ERROR) << status;
     }
@@ -48,47 +48,46 @@ bool CUDATimer::Init() {
   return true;
 }
 
-void CUDATimer::Destroy() {
-  CudaContext* context = parent_->cuda_context();
-  port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+void GpuTimer::Destroy() {
+  GpuContext* context = parent_->gpu_context();
+  port::Status status = GpuDriver::DestroyEvent(context, &start_event_);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
 
-  status = CUDADriver::DestroyEvent(context, &stop_event_);
+  status = GpuDriver::DestroyEvent(context, &stop_event_);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
 }
 
-float CUDATimer::GetElapsedMilliseconds() const {
+float GpuTimer::GetElapsedMilliseconds() const {
   CHECK(start_event_ != nullptr && stop_event_ != nullptr);
   // TODO(leary) provide a way to query timer resolution?
   // CUDA docs say a resolution of about 0.5us
   float elapsed_milliseconds = NAN;
-  (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
-                                        &elapsed_milliseconds, start_event_,
-                                        stop_event_);
+  (void)GpuDriver::GetEventElapsedTime(
+      parent_->gpu_context(), &elapsed_milliseconds, start_event_, stop_event_);
   return elapsed_milliseconds;
 }
 
-bool CUDATimer::Start(CUDAStream* stream) {
-  port::Status status = CUDADriver::RecordEvent(
-      parent_->cuda_context(), start_event_, stream->cuda_stream());
+bool GpuTimer::Start(GpuStream* stream) {
+  port::Status status = GpuDriver::RecordEvent(
+      parent_->gpu_context(), start_event_, stream->gpu_stream());
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
   return status.ok();
 }
 
-bool CUDATimer::Stop(CUDAStream* stream) {
-  port::Status status = CUDADriver::RecordEvent(
-      parent_->cuda_context(), stop_event_, stream->cuda_stream());
+bool GpuTimer::Stop(GpuStream* stream) {
+  port::Status status = GpuDriver::RecordEvent(
+      parent_->gpu_context(), stop_event_, stream->gpu_stream());
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
   return status.ok();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_timer.h b/tensorflow/stream_executor/gpu/gpu_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..886f0c2d57729270b9a87635ddffd1a4be4acfdb
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_timer.h
@@ -0,0 +1,90 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the GpuTimer type - the CUDA-specific implementation of the generic
+// StreamExecutor Timer interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+class GpuStream;
+
+// Wraps a pair of GpuEventHandles in order to satisfy the platform-independent
+// TimerInferface -- both a start and a stop event are present which may be
+// recorded in a stream.
+class GpuTimer : public internal::TimerInterface {
+ public:
+  explicit GpuTimer(GpuExecutor* parent)
+      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
+
+  // Note: teardown needs to be explicitly handled in this API by a call to
+  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  // TODO(csigg): Change to RAII.
+  ~GpuTimer() override {}
+
+  // Allocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::AllocateTimer().
+  bool Init();
+
+  // Deallocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::DeallocateTimer().
+  void Destroy();
+
+  // Records the "timer start" event at the current point in the stream.
+  bool Start(GpuStream* stream);
+
+  // Records the "timer stop" event at the current point in the stream.
+  bool Stop(GpuStream* stream);
+
+  // Returns the elapsed time, in milliseconds, between the start and stop
+  // events.
+  float GetElapsedMilliseconds() const;
+
+  // See Timer::Microseconds().
+  // TODO(leary) make this into an error code interface...
+  uint64 Microseconds() const override {
+    return GetElapsedMilliseconds() * 1e3;
+  }
+
+  // See Timer::Nanoseconds().
+  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
+
+ private:
+  GpuExecutor* parent_;
+  GpuEventHandle start_event_;  // Event recorded to indicate the "start"
+                                // timestamp executing in a stream.
+  GpuEventHandle stop_event_;   // Event recorded to indicate the "stop"
+                                // timestamp executing in a stream.
+};
+
+struct GpuTimerDeleter {
+  void operator()(GpuTimer* t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_types.h b/tensorflow/stream_executor/gpu/gpu_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..c69177d0760eb225a78bf7531070d007a93d377a
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_types.h
@@ -0,0 +1,84 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// GPU (ROCm / CUDA) specific type handle resolution
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
+
+#if TENSORFLOW_USE_ROCM
+
+#include "rocm/include/hip/hip_complex.h"
+#include "rocm/include/hip/hip_runtime.h"
+#include "rocm/include/hiprand/hiprand.h"
+
+#else  // CUDA
+
+#include "cuda/include/cuComplex.h"
+#include "cuda/include/cuda.h"
+
+// cannot include curand.h here
+//   because it triggers the #error in cuda/cuda_gpu_executor.cc
+//     (because curand.h includes cuda_runtime.h)
+// so explicitly adding the lone typedef we need from that file
+typedef struct curandGenerator_st* curandGenerator_t;
+
+#endif
+
+namespace stream_executor {
+namespace gpu {
+
+#if TENSORFLOW_USE_ROCM
+
+using GpuStreamHandle = hipStream_t;
+using GpuEventHandle = hipEvent_t;
+using GpuFunctionHandle = hipFunction_t;
+using GpuFunctionAttribute = hipDeviceAttribute_t;  // not a typo!
+using GpuDeviceHandle = hipDevice_t;
+using GpuDevicePtr = hipDeviceptr_t;
+using GpuDeviceAttribute = hipDeviceAttribute_t;
+using GpuDeviceProperty = hipDeviceProp_t;
+using GpuModuleHandle = hipModule_t;
+using GpuStatus = hipError_t;
+using GpuFuncCachePreference = hipFuncCache_t;
+using GpuSharedMemConfig = hipSharedMemConfig;
+using GpuComplexType = hipComplex;
+using GpuDoubleComplexType = hipDoubleComplex;
+using GpuRngHandle = hiprandGenerator_t;
+
+#else  // CUDA
+
+using GpuStreamHandle = CUstream;
+using GpuEventHandle = CUevent;
+using GpuFunctionHandle = CUfunction;
+using GpuFunctionAttribute = CUfunction_attribute;
+using GpuDeviceHandle = CUdevice;
+using GpuDevicePtr = CUdeviceptr;
+using GpuDeviceAttribute = CUdevice_attribute;
+using GpuDeviceProperty = CUdevprop;
+using GpuModuleHandle = CUmodule;
+using GpuStatus = CUresult;
+using GpuFuncCachePreference = CUfunc_cache;
+using GpuSharedMemConfig = CUsharedconfig;
+using GpuComplexType = cuComplex;
+using GpuDoubleComplexType = cuDoubleComplex;
+using GpuRngHandle = curandGenerator_t;
+
+#endif
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
diff --git a/tensorflow/stream_executor/host/BUILD b/tensorflow/stream_executor/host/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..127452aee9f7a0d528ec2a7b80bc488406b99030
--- /dev/null
+++ b/tensorflow/stream_executor/host/BUILD
@@ -0,0 +1,110 @@
+# Description:
+#   Host-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(default_visibility = [":friends"])
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "host_platform_id",
+    srcs = [
+        "host_platform_id.cc",
+    ],
+    hdrs = [
+        "host_platform_id.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:platform",
+    ],
+)
+
+cc_library(
+    name = "host_platform",
+    srcs = [
+        "host_platform.cc",
+    ],
+    hdrs = [
+        "host_platform.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":host_gpu_executor",
+        ":host_platform_id",
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "host_stream",
+    srcs = [
+        "host_stream.cc",
+    ],
+    hdrs = [
+        "host_stream.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "host_timer",
+    srcs = [
+        "host_timer.cc",
+    ],
+    hdrs = [
+        "host_timer.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+# TODO(22689637): Rename this target.
+cc_library(
+    name = "host_gpu_executor",
+    srcs = [
+        "host_gpu_executor.cc",
+    ],
+    hdrs = [
+        "host_gpu_executor.h",
+    ],
+    deps = [
+        ":host_platform_id",
+        ":host_stream",
+        ":host_timer",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor:stream",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/lib",
+    ],
+    alwayslink = True,
+)
diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h
deleted file mode 100644
index 20299da5172f20b9b73c31b6491806dc57b1d2f0..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/host_buffer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
-#define TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
-
-#include "tensorflow/stream_executor/dnn.h"
-
-namespace stream_executor {
-
-// A HostBuffer is a block of memory in host memory containing the data for a
-// dnn::BatchDescriptor using a device-dependent memory layout.
-// Derived classes provide methods to construct a HostBuffer for a specific
-// device, and to copy data in and out of the buffer.
-class HostBuffer {
- public:
-  const dnn::BatchDescriptor& descriptor() const { return descriptor_; }
-
-  // Returns a string describing the HostBuffer.
-  virtual string AsString() const = 0;
-
- protected:
-  // Construct a HostBuffer from the supplied dnn::BatchDescriptor.
-  explicit HostBuffer(const dnn::BatchDescriptor& descriptor)
-      : descriptor_(descriptor) {}
-  virtual ~HostBuffer() {}
-
- private:
-  const dnn::BatchDescriptor descriptor_;
-};
-
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
diff --git a/tensorflow/stream_executor/lib/BUILD b/tensorflow/stream_executor/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..133ff2b161b9db227a6a4921865f56bfc4b9bece
--- /dev/null
+++ b/tensorflow/stream_executor/lib/BUILD
@@ -0,0 +1,62 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(default_visibility = [":friends"])
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "lib",
+    srcs = glob(
+        [
+            "**/*.cc",
+        ],
+        exclude = [
+            "**/*test*",
+        ],
+    ),
+    hdrs = glob(["**/*.h"]),
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "statusor_test",
+    size = "small",
+    srcs = ["statusor_test.cc"],
+    deps = [
+        ":lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "utility_headers",
+    hdrs = [
+        "ptr_util.h",
+    ],
+    deps = [
+        "//tensorflow/core:ptr_util",
+    ],
+)
diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h
index 688b0214694478e9be1b1d14e58fda94367f547b..cd0b9dad19bf1d0e4e07bc153d94664fda12bd98 100644
--- a/tensorflow/stream_executor/lib/initialize.h
+++ b/tensorflow/stream_executor/lib/initialize.h
@@ -16,55 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
-
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/initialize.h"
-#else
-
-#undef REGISTER_MODULE_INITIALIZER
-#undef DECLARE_MODULE_INITIALIZER
-#undef REGISTER_MODULE_INITIALIZER_SEQUENCE
-
-namespace stream_executor {
-namespace port {
-
-class Initializer {
- public:
-  typedef void (*InitializerFunc)();
-  explicit Initializer(InitializerFunc func) { func(); }
-
-  struct Dependency {
-    Dependency(const char *n, Initializer *i) : name(n), initializer(i) {}
-    const char *const name;
-    Initializer *const initializer;
-  };
-
-  struct DependencyRegisterer {
-    DependencyRegisterer(const char *type, const char *name,
-                         Initializer *initializer,
-                         const Dependency &dependency);
-  };
-};
-
-}  // namespace port
-}  // namespace stream_executor
-
-#define REGISTER_INITIALIZER(type, name, body)                             \
-  static void google_init_##type##_##name() { body; }                      \
-  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
-      google_init_##type##_##name)
-
-#define REGISTER_MODULE_INITIALIZER(name, body) \
-  REGISTER_INITIALIZER(module, name, body)
-
-#define DECLARE_INITIALIZER(type, name) \
-  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
-
-#define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
-
-#define REGISTER_MODULE_INITIALIZER_SEQUENCE(name1, name2)
-
-#endif  // !defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/initialize.h"
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/logging.proto b/tensorflow/stream_executor/logging.proto
index 2c75500cda452f787cb174238058f026a31e4242..68021d2b3157ceeaabd0d0a2065bc946913f64c4 100644
--- a/tensorflow/stream_executor/logging.proto
+++ b/tensorflow/stream_executor/logging.proto
@@ -2,6 +2,8 @@ syntax = "proto3";
 
 package stream_executor;
 
+import "tensorflow/stream_executor/dnn.proto";
+
 message CudnnVersion {
   int32 major = 1;
   int32 minor = 2;
@@ -17,3 +19,11 @@ message CudaInfo {
   CudnnVersion cudnn_version = 1;
   ComputeCapability compute_capability = 2;
 }
+
+message ConvLogEntry {
+  CudaInfo cuda_info = 1;
+  dnn.ConvolutionProto convolution = 2;
+
+  // Profiled time in ms. 0.0 if the convolution is not profiled.
+  float profile_time_ms = 3;
+}
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index c0205abbee305edc23e24d79c53f9ed3b84049b5..9c99581438653a55223a5ebee6173d2a5fefb3ab 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -28,6 +28,8 @@ string PlatformKindString(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
       return "CUDA";
+    case PlatformKind::kROCm:
+      return "ROCm";
     case PlatformKind::kOpenCL:
       return "OpenCL";
     case PlatformKind::kHost:
@@ -52,6 +54,7 @@ PlatformKind PlatformKindFromString(string kind) {
 bool PlatformIsRunnable(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
+    case PlatformKind::kROCm:
     case PlatformKind::kOpenCL:
     case PlatformKind::kHost:
       return true;
@@ -63,6 +66,7 @@ bool PlatformIsRunnable(PlatformKind kind) {
 bool PlatformIsRunnableOnDevice(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
+    case PlatformKind::kROCm:
     case PlatformKind::kOpenCL:
       return true;
     default:
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index 5cb7047b6f39483f237b5bb249906d9ce8a06b9e..2c2cd77ad21aaeb700a7cffe598112237204b418 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -40,6 +40,7 @@ class StreamExecutor;
 enum class PlatformKind {
   kInvalid,
   kCuda,
+  kROCm,
   kOpenCL,
   kHost,
   kMock,
diff --git a/tensorflow/stream_executor/platform/BUILD b/tensorflow/stream_executor/platform/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..702b2cdfe0dd41997f99daf1bcdcbf8a6994edd8
--- /dev/null
+++ b/tensorflow/stream_executor/platform/BUILD
@@ -0,0 +1,47 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_platform_hdrs")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+cc_library(
+    name = "platform",
+    textual_hdrs = [
+        "logging.h",
+        "mutex.h",
+        "platform.h",
+        "port.h",
+        "thread_annotations.h",
+        "initialize.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/platform/default:platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "dso_loader",
+    hdrs = ["dso_loader.h"],
+    deps = [
+        ":platform",
+        "//tensorflow/stream_executor/platform/default:dso_loader",
+    ],
+)
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
diff --git a/tensorflow/stream_executor/platform/default/BUILD b/tensorflow/stream_executor/platform/default/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f1ae7d86ff78a50da51ef730098cee2fc9e30aad
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow/stream_executor:__subpackages__"])
+
+cc_library(
+    name = "platform",
+    textual_hdrs = [
+        "initialize.h",
+        "mutex.h",
+    ],
+    deps = ["//tensorflow/core:lib"],
+)
+
+cc_library(
+    name = "dso_loader",
+    srcs = ["dso_loader.cc"],
+    hdrs = ["dso_loader.h"],
+    deps = [
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
similarity index 96%
rename from tensorflow/stream_executor/dso_loader.cc
rename to tensorflow/stream_executor/platform/default/dso_loader.cc
index 6dda5d63155d8f9cf8d068b3feae51b1fba88a51..668eeee3f31ff257092674de98c7d20c39c46a73 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -16,8 +16,6 @@ limitations under the License.
 // TODO(jhen): Replace hardcoded, platform specific path strings in GetXXXPath()
 // with a function in e.g. cuda.h.
 
-#include "tensorflow/stream_executor/dso_loader.h"
-
 #include <limits.h>
 #include <stdlib.h>
 #include <initializer_list>
@@ -30,6 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
@@ -89,10 +88,13 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 #if defined(__APPLE__)
   // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
   // libcuda.1.dylib.
-  return status.ok() ? status : GetDsoHandle(
-     FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", ""),
-                 GetCudaDriverLibraryPath()),
-     dso_handle);
+  return status.ok()
+             ? status
+             : GetDsoHandle(
+                   FindDsoPath(
+                       port::Env::Default()->FormatLibraryFileName("cuda", ""),
+                       GetCudaDriverLibraryPath()),
+                   dso_handle);
 #else
   return status;
 #endif
@@ -144,7 +146,7 @@ static mutex& GetRpathMutex() {
               << ". LD_LIBRARY_PATH: "
               << (ld_library_path != nullptr ? ld_library_path : "")
 #endif
-    ;
+        ;
     return port::Status(port::error::FAILED_PRECONDITION,
                         absl::StrCat("could not dlopen DSO: ", path,
                                      "; dlerror: ", s.error_message()));
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h
similarity index 100%
rename from tensorflow/stream_executor/dso_loader.h
rename to tensorflow/stream_executor/platform/default/dso_loader.h
index f063b68d6058f7b1faecfd83d3d21b899cf027a3..806f65b24cdc209dd14a727de6a724bcd1705075 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@@ -19,8 +19,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
 #include <vector>
+#include "tensorflow/stream_executor/platform/port.h"
 
 #include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/lib/status.h"
diff --git a/tensorflow/stream_executor/platform/default/initialize.h b/tensorflow/stream_executor/platform/default/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d27c85336e1ca64ebcc6969f2179399529e8b37
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/initialize.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
+
+#undef REGISTER_MODULE_INITIALIZER
+#undef DECLARE_MODULE_INITIALIZER
+#undef REGISTER_MODULE_INITIALIZER_SEQUENCE
+
+namespace stream_executor {
+namespace port {
+
+class Initializer {
+ public:
+  typedef void (*InitializerFunc)();
+  explicit Initializer(InitializerFunc func) { func(); }
+
+  struct Dependency {
+    Dependency(const char *n, Initializer *i) : name(n), initializer(i) {}
+    const char *const name;
+    Initializer *const initializer;
+  };
+
+  struct DependencyRegisterer {
+    DependencyRegisterer(const char *type, const char *name,
+                         Initializer *initializer,
+                         const Dependency &dependency);
+  };
+};
+
+}  // namespace port
+}  // namespace stream_executor
+
+#define REGISTER_INITIALIZER(type, name, body)                             \
+  static void google_init_##type##_##name() { body; }                      \
+  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
+      google_init_##type##_##name)
+
+#define REGISTER_MODULE_INITIALIZER(name, body) \
+  REGISTER_INITIALIZER(module, name, body)
+
+#define DECLARE_INITIALIZER(type, name) \
+  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
+
+#define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
+
+#define REGISTER_MODULE_INITIALIZER_SEQUENCE(name1, name2)
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index c9f5a7c609e5bbe59ea456e30d575b991aa37b65..2f8f0636ba7bd037f356525047f2dd7c0eda789d 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -16,7 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace stream_executor {
 
diff --git a/tensorflow/stream_executor/platform/dso_loader.h b/tensorflow/stream_executor/platform/dso_loader.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dd56684b1917b07ba6e421479b14ac22af5d335
--- /dev/null
+++ b/tensorflow/stream_executor/platform/dso_loader.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
+
+#include "tensorflow/stream_executor/platform/platform.h"
+
+// Include appropriate platform-dependent implementations
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/google/dso_loader.h"
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
+    defined(PLATFORM_GOOGLE_ANDROID)
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
+#else
+#error Define the appropriate PLATFORM_<foo> macro for this platform
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/platform/initialize.h b/tensorflow/stream_executor/platform/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb13132afff7c9f6d4c57176eef8d7180bb45a93
--- /dev/null
+++ b/tensorflow/stream_executor/platform/initialize.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
+
+#include "tensorflow/stream_executor/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/google/initialize.h"
+#else
+#include "tensorflow/stream_executor/platform/default/initialize.h"
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/platform/mutex.h b/tensorflow/stream_executor/platform/mutex.h
index 28828951de521752e8debfc1b6cfd2de73a09828..fa6c8c017c30b66baf07e1ee19f4326d7c01b9c3 100644
--- a/tensorflow/stream_executor/platform/mutex.h
+++ b/tensorflow/stream_executor/platform/mutex.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
 
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/stream_executor/platform/google/mutex.h"
diff --git a/tensorflow/stream_executor/platform/platform.h b/tensorflow/stream_executor/platform/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf0e120d39f8bfa8e1a62ae3749beac076335c6
--- /dev/null
+++ b/tensorflow/stream_executor/platform/platform.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
+
+#if !defined(PLATFORM_POSIX) && !defined(PLATFORM_GOOGLE) && \
+    !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)
+
+// Choose which platform we are on.
+#if defined(ANDROID) || defined(__ANDROID__)
+#define PLATFORM_POSIX_ANDROID
+
+#elif defined(__APPLE__)
+#define PLATFORM_POSIX
+
+#else
+// If no platform specified, use:
+#define PLATFORM_POSIX
+
+#endif
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dd08b36308c58e0f4cc941a6c33c8e2a147559b5
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/BUILD
@@ -0,0 +1,267 @@
+# Description:
+#   ROCm-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+)
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "rocm_diagnostics",
+    srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "rocm_driver",
+    srcs = if_rocm_is_configured(["rocm_driver.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/gpu:gpu_driver_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
+
+cc_library(
+    name = "rocm_event",
+    srcs = if_rocm_is_configured(["rocm_event.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_event_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "rocm_gpu_executor",
+    srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        ":rocm_driver",
+        ":rocm_event",
+        ":rocm_kernel",
+        ":rocm_platform_id",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_event",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
+        "//tensorflow/stream_executor/gpu:gpu_timer",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocm_kernel",
+    srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
+    hdrs = [],
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocm_platform",
+    srcs = if_rocm_is_configured(["rocm_platform.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_platform.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//tensorflow/stream_executor",  # buildcleaner: keep
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "rocm_platform_id",
+    srcs = ["rocm_platform_id.cc"],
+    hdrs = ["rocm_platform_id.h"],
+    deps = ["//tensorflow/stream_executor:platform"],
+)
+
+# FIXME: enable in future PRs
+#cc_library(
+#    name = "rocblas_plugin",
+#    srcs = ["rocm_blas.cc"],
+#    hdrs = ["rocm_blas.h"],
+#    visibility = ["//visibility:public"],
+#    deps = [
+#        ":rocm_gpu_executor",
+#        ":rocm_platform_id",
+#        "//third_party/eigen3",
+#        "//tensorflow/core:lib_internal",
+#        "//tensorflow/stream_executor",
+#        "//tensorflow/stream_executor:event",
+#        "//tensorflow/stream_executor:host_or_device_scalar",
+#        "//tensorflow/stream_executor:plugin_registry",
+#        "//tensorflow/stream_executor:scratch_allocator",
+#        "//tensorflow/stream_executor:timer",
+#        "//tenosrflow/stream_executor/gpu:gpu_activation_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_timer_header",
+#        "//tensorflow/stream_executor/lib",
+#        "//tensorflow/stream_executor/platform",
+#        "//tensorflow/stream_executor/platform:dso_loader",
+#        "@com_google_absl//absl/strings",
+#        "@local_config_rocm//rocm:rocm_headers",
+#    ] + if_static(["@local_config_rocm//rocm:rocblas"]),
+#    alwayslink = True,
+#)
+
+# FIXME: enable in future PRs
+#cc_library(
+#    name = "rocfft_plugin",
+#    srcs = ["rocm_fft.cc"],
+#    hdrs = [],
+#    visibility = ["//visibility:public"],
+#    deps = [
+#        ":rocm_platform_id",
+#        "//tensorflow/stream_executor:event",
+#        "//tensorflow/stream_executor:fft",
+#        "//tensorflow/stream_executor:plugin_registry",
+#        "//tensorflow/stream_executor:scratch_allocator",
+#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
+#        "//tensorflow/stream_executor/lib",
+#        "//tensorflow/stream_executor/platform",
+#        "//tensorflow/stream_executor/platform:dso_loader",
+#        "@local_config_rocm//rocm:rocm_headers",
+#    ] + if_static(["@local_config_rocm//rocm:rocfft"]),
+#    alwayslink = True,
+#)
+
+# FIXME: enable in future PRs
+#cc_library(
+#    name = "miopen_plugin",
+#    srcs = ["rocm_dnn.cc"],
+#    hdrs = [],
+#    copts = [
+#        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
+#        # setting of template depth 256
+#        "-ftemplate-depth-512",
+#    ],
+#    visibility = ["//visibility:public"],
+#    deps = [
+#        ":rocm_diagnostics",
+#        ":rocm_driver",
+#        ":rocm_gpu_executor",
+#        ":rocm_platform_id",
+#        "//third_party/eigen3",
+#        "//tensorflow/core:lib",
+#        "//tensorflow/core:lib_internal",
+#        "//tensorflow/core:logger",
+#        "//tensorflow/stream_executor:dnn",
+#        "//tensorflow/stream_executor:event",
+#        "//tensorflow/stream_executor:logging_proto_cc",
+#        "//tensorflow/stream_executor:plugin_registry",
+#        "//tensorflow/stream_executor:scratch_allocator",
+#        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+#        "//tensorflow/stream_executor:temporary_device_memory",
+#        "//tenosrflow/stream_executor/gpu:gpu_activation_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_timer_header",
+#        "//tensorflow/stream_executor/lib",
+#        "//tensorflow/stream_executor/platform",
+#        "//tensorflow/stream_executor/platform:dso_loader",
+#        "@com_google_absl//absl/strings",
+#        "@local_config_rocm//rocm:rocm_headers",
+#    ] + tf_additional_miopen_plugin_deps() + if_static(["@local_config_rocm//rocm:miopen"]),
+#    alwayslink = True,
+#)
+
+# FIXME: enable in future PRs
+#cc_library(
+#    name = "rocrand_plugin",
+#    srcs = ["rocm_rng.cc"],
+#    hdrs = [],
+#    deps = [
+#        ":rocm_gpu_executor",
+#        ":rocm_platform_id",
+#        "@local_config_rocm//rocm:rocm_headers",
+#        "//tensorflow/stream_executor:event",
+#        "//tensorflow/stream_executor:plugin_registry",
+#        "//tensorflow/stream_executor:rng",
+#        "//tenosrflow/stream_executor/gpu:gpu_activation_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
+#        "//tensorflow/stream_executor/lib",
+#        "//tensorflow/stream_executor/platform",
+#        "//tensorflow/stream_executor/platform:dso_loader",
+#    ] + if_static(["@local_config_rocm//rocm:curand"]),
+#    alwayslink = True,
+#)
+
+cc_library(
+    name = "all_runtime",
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        # FIXME: enable in future PRs
+        #":miopen_plugin",
+        #":rocfft_plugin",
+        #":rocblas_plugin",
+        #":rocrand_plugin",
+        ":rocm_driver",
+        ":rocm_platform",
+    ]),
+    alwayslink = 1,
+)
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6da7f9e3da9143466454466d4c402772ac69870
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
@@ -0,0 +1,234 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <dirent.h>
+
+#include <limits.h>
+#include <link.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+
+namespace stream_executor {
+namespace gpu {
+
+string DriverVersionToString(DriverVersion version) {
+  return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
+                         std::get<2>(version));
+}
+
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
+  if (!version.ok()) {
+    return version.status().ToString();
+  }
+
+  return DriverVersionToString(version.ValueOrDie());
+}
+
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
+  std::vector<string> pieces = port::Split(value, '.');
+  if (pieces.size() != 2 && pieces.size() != 3) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
+                                        "for driver version; got \"%s\"",
+                                        value.c_str())};
+  }
+
+  int major;
+  int minor;
+  int patch = 0;
+  if (!port::safe_strto32(pieces[0], &major)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse major version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[0].c_str(), value.c_str())};
+  }
+  if (!port::safe_strto32(pieces[1], &minor)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse minor version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[1].c_str(), value.c_str())};
+  }
+  if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse patch version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[2].c_str(), value.c_str())};
+  }
+
+  DriverVersion result{major, minor, patch};
+  VLOG(2) << "version string \"" << value << "\" made value "
+          << DriverVersionToString(result);
+  return result;
+}
+
+// -- class Diagnostician
+
+string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
+  return absl::StrCat("/dev/kfd", dev_node_ordinal);
+}
+
+void Diagnostician::LogDiagnosticInformation() {
+  LOG(INFO) << "retrieving ROCM diagnostic information for host: "
+            << port::Hostname();
+
+  LogDriverVersionInformation();
+}
+
+/* static */ void Diagnostician::LogDriverVersionInformation() {
+  LOG(INFO) << "hostname: " << port::Hostname();
+  if (VLOG_IS_ON(1)) {
+    const char* value = getenv("LD_LIBRARY_PATH");
+    string library_path = value == nullptr ? "" : value;
+    VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
+
+    std::vector<string> pieces = port::Split(library_path, ':');
+    for (const auto& piece : pieces) {
+      if (piece.empty()) {
+        continue;
+      }
+      DIR* dir = opendir(piece.c_str());
+      if (dir == nullptr) {
+        VLOG(1) << "could not open \"" << piece << "\"";
+        continue;
+      }
+      while (dirent* entity = readdir(dir)) {
+        VLOG(1) << piece << " :: " << entity->d_name;
+      }
+      closedir(dir);
+    }
+  }
+  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
+  LOG(INFO) << "librocm reported version is: "
+            << DriverVersionStatusToString(dso_version);
+
+  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
+  LOG(INFO) << "kernel reported version is: "
+            << DriverVersionStatusToString(kernel_version);
+
+  if (kernel_version.ok() && dso_version.ok()) {
+    WarnOnDsoKernelMismatch(dso_version, kernel_version);
+  }
+}
+
+// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+// driver-interfacing DSO version number. Returns it as a string.
+port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
+  port::StatusOr<DriverVersion> result{port::Status{
+      port::error::NOT_FOUND,
+      "was unable to find librocm.so DSO loaded into this program"}};
+
+  // Callback used when iterating through DSOs. Looks for the driver-interfacing
+  // DSO and yields its version number into the callback data, when found.
+  auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
+                         void* data) -> int {
+    if (strstr(info->dlpi_name, "librocm.so.1")) {
+      VLOG(1) << "found DLL info with name: " << info->dlpi_name;
+      char resolved_path[PATH_MAX] = {0};
+      if (realpath(info->dlpi_name, resolved_path) == nullptr) {
+        return 0;
+      }
+      VLOG(1) << "found DLL info with resolved path: " << resolved_path;
+      const char* slash = rindex(resolved_path, '/');
+      if (slash == nullptr) {
+        return 0;
+      }
+      const char* so_suffix = ".so.";
+      const char* dot = strstr(slash, so_suffix);
+      if (dot == nullptr) {
+        return 0;
+      }
+      string dso_version = dot + strlen(so_suffix);
+      // TODO(b/22689637): Eliminate the explicit namespace if possible.
+      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
+      auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
+      *result = StringToDriverVersion(stripped_dso_version);
+      return 1;
+    }
+    return 0;
+  };
+
+  dl_iterate_phdr(iterate_phdr, &result);
+
+  return result;
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
+    const string& driver_version_file_contents) {
+  static const char* kDriverFilePrelude = "Kernel Module  ";
+  size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
+  if (offset == string::npos) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        absl::StrCat("could not find kernel module information in "
+                     "driver version file contents: \"",
+                     driver_version_file_contents, "\"")};
+  }
+
+  string version_and_rest = driver_version_file_contents.substr(
+      offset + strlen(kDriverFilePrelude), string::npos);
+  size_t space_index = version_and_rest.find(" ");
+  auto kernel_version = version_and_rest.substr(0, space_index);
+  // TODO(b/22689637): Eliminate the explicit namespace if possible.
+  auto stripped_kernel_version =
+      port::StripSuffixString(kernel_version, ".ld64");
+  return StringToDriverVersion(stripped_kernel_version);
+}
+
+void Diagnostician::WarnOnDsoKernelMismatch(
+    port::StatusOr<DriverVersion> dso_version,
+    port::StatusOr<DriverVersion> kernel_version) {
+  if (kernel_version.ok() && dso_version.ok() &&
+      dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
+    LOG(INFO) << "kernel version seems to match DSO: "
+              << DriverVersionToString(kernel_version.ValueOrDie());
+  } else {
+    LOG(ERROR) << "kernel version "
+               << DriverVersionStatusToString(kernel_version)
+               << " does not match DSO version "
+               << DriverVersionStatusToString(dso_version)
+               << " -- cannot find working devices in this configuration";
+  }
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
+  auto status = port::Status{port::error::UNIMPLEMENTED,
+                             "kernel reported driver version not implemented"};
+  return status;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..39d52d2830429fd6fcb93d251b5b86700e785fdd
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -0,0 +1,1365 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <map>
+#include <set>
+#include <utility>
+
+#include "absl/base/casts.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/notification.h"
+#include "tensorflow/stream_executor/lib/stacktrace.h"
+#include "tensorflow/stream_executor/lib/static_threadlocal.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
+bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
+bool FLAGS_gpuexec_rocm_device_0_only = false;
+
+// Debugging: on each push and pop of a rocm context, verify the current device
+// matches the expected one.
+constexpr bool kVerifyGpuContext = false;
+
+namespace stream_executor {
+namespace gpu {
+
+// GpuContext wraps the device_ordinal.
+// Only reason we need this wrapper class is to make the GpuDriver* API
+class GpuContext {
+ public:
+  GpuContext(const int v) : device_ordinal_(v) {}
+
+  int device_ordinal() const { return device_ordinal_; }
+
+  // Disallow copying and moving.
+  GpuContext(GpuContext&&) = delete;
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(GpuContext&&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
+
+ private:
+  const int device_ordinal_;
+};
+
+namespace {
+
+// Formats hipError_t to output prettified values into a log stream.
+// Error summaries taken from:
+//
+// TODO(leary) switch to cuGetErrorName when updated rocm.h is available.
+string ToString(hipError_t result) {
+#define OSTREAM_ROCM_ERROR(__name) \
+  case hipError##__name:           \
+    return "HIP_ERROR_" #__name;
+
+  switch (result) {
+    OSTREAM_ROCM_ERROR(InvalidValue)
+    OSTREAM_ROCM_ERROR(OutOfMemory)
+    OSTREAM_ROCM_ERROR(NotInitialized)
+    OSTREAM_ROCM_ERROR(Deinitialized)
+    OSTREAM_ROCM_ERROR(NoDevice)
+    OSTREAM_ROCM_ERROR(InvalidDevice)
+    OSTREAM_ROCM_ERROR(InvalidImage)
+    OSTREAM_ROCM_ERROR(InvalidContext)
+    OSTREAM_ROCM_ERROR(InvalidHandle)
+    OSTREAM_ROCM_ERROR(NotFound)
+    OSTREAM_ROCM_ERROR(NotReady)
+    OSTREAM_ROCM_ERROR(NoBinaryForGpu)
+
+    // Encountered an uncorrectable ECC error during execution.
+    OSTREAM_ROCM_ERROR(ECCNotCorrectable)
+
+    // Load/store on an invalid address. Must reboot all context.
+    case 700:
+      return "ROCM_ERROR_ILLEGAL_ADDRESS";
+    // Passed too many / wrong arguments, too many threads for register count.
+    case 701:
+      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+      OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
+      OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
+      OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
+    default:
+      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
+  }
+}
+
+// ROCM driver routines may require a large amount of stack (particularly
+// hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
+// stack-limited threads (such as those spawned by a default-argument
+// thread::ThreadPool on some platforms), we run certain routines in this pool
+// and wait for completion.
+static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
+static port::ThreadPool* InitializeDriverExecutor() {
+  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
+                              "rocm_driver", 1);
+}
+
+port::ThreadPool* GetDriverExecutor() {
+  mutex_lock lock(driver_executor_threadpool_mu);
+  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
+  return thread_pool;
+}
+
+}  // namespace
+
+string MemorySpaceString(MemorySpace memory_space) {
+  switch (memory_space) {
+    case MemorySpace::kHost:
+      return "host";
+    case MemorySpace::kDevice:
+      return "device";
+    default:
+      LOG(FATAL) << "impossible memory space";
+  }
+}
+
+// Returns the current device set in HIP. This is done by calling the
+// HIP driver (e.g., this value is not our cached view of the current device).
+static int CurrentDeviceOrDie() {
+  int current = -1;
+  hipError_t result = hipGetDevice(&current);
+  if (result != hipSuccess) {
+    LOG(FATAL) << "failed to query current device: " << ToString(result);
+  }
+  return current;
+}
+
+namespace {
+
+// Call hipDeviceSynchronize and crash if it doesn't succeed.
+void SynchronizeOrDie() {
+  auto res = hipDeviceSynchronize();
+  if (res != hipSuccess) {
+    LOG(FATAL) << "Synchronize found " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+  }
+}
+
+struct ThreadLocalData {
+  int current_device_ordinal;
+  int depth;
+};
+
+SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
+
+}  // namespace
+
+ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
+  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
+    SynchronizeOrDie();
+  }
+
+  auto* tls = &tls_data.get();
+  if (tls->depth == 0) {
+    tls->current_device_ordinal = CurrentDeviceOrDie();
+  }
+
+  if (kVerifyGpuContext) {
+    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
+  }
+
+  tls->depth++;
+
+  to_restore_ = context;
+
+  if (context->device_ordinal() == tls->current_device_ordinal) {
+    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
+    return;
+  }
+
+  VLOG(3) << "ScopedActivateContext switching device from "
+          << tls->current_device_ordinal << " to " << context->device_ordinal();
+
+  // Set the device and update thread local.
+  CHECK_EQ(hipSuccess, hipSetDevice(context->device_ordinal()));
+  tls->current_device_ordinal = context->device_ordinal();
+}
+
+ScopedActivateContext::~ScopedActivateContext() {
+  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
+    SynchronizeOrDie();
+  }
+
+  auto* tls = &tls_data.get();
+
+  if (kVerifyGpuContext) {
+    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
+  }
+
+  tls->depth--;
+  DCHECK_GE(tls->depth, 0);
+
+  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
+    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
+    return;
+  }
+
+  VLOG(3) << "ScopedActivateContext switching device from "
+          << tls->current_device_ordinal << " to "
+          << to_restore_->device_ordinal();
+
+  // Set context and update thread local.
+  CHECK_EQ(hipSuccess, hipSetDevice(to_restore_->device_ordinal()));
+  tls->current_device_ordinal = to_restore_->device_ordinal();
+}
+
+namespace {
+
+// Returns a stringified device number associated with pointer, primarily for
+// logging purposes. Returns "?" if the device could not be successfully
+// queried.
+string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
+  auto value = GpuDriver::GetPointerDevice(pointer);
+  if (value.ok()) {
+    return absl::StrCat(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified memory space associated with pointer, primarily for
+// logging purposes. Returns "?" if the memory space could not be successfully
+// queried.
+string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
+  auto value = GpuDriver::GetPointerMemorySpace(pointer);
+  if (value.ok()) {
+    return MemorySpaceString(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified representation of whether or not peer access is
+// permitted between the "from" and "to" pointers' associated contexts,
+// primarily for logging purposes. Returns "error" if an error is encountered
+// in the process of querying.
+string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
+  hipPointerAttribute_t from_pointerAttributes;
+  hipError_t result = hipPointerGetAttributes(&from_pointerAttributes, from);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "could not retrieve source pointer's device: "
+               << ToString(result);
+    return "error";
+  }
+
+  hipPointerAttribute_t to_pointerAttributes;
+  result = hipPointerGetAttributes(&to_pointerAttributes, to);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "could not retrieve destination pointer's device: "
+               << ToString(result);
+    return "error";
+  }
+
+  GpuContext fromCtx(from_pointerAttributes.device);
+  GpuContext toCtx(to_pointerAttributes.device);
+
+  return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
+}
+
+// Actually performs the work of ROCM initialization. Wrapped up in one-time
+// execution guard.
+static port::Status InternalInit() {
+  hipError_t res = hipErrorNoDevice;
+  if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
+    LOG(ERROR) << "injecting ROCM init error; initialization will fail";
+  } else {
+    res = hipInit(0 /* = flags */);
+  }
+
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  }
+
+  LOG(ERROR) << "failed call to hipInit: " << ToString(res);
+  Diagnostician::LogDiagnosticInformation();
+  return port::Status{port::error::ABORTED,
+                      absl::StrCat("failed call to hipInit: ", ToString(res))};
+}
+
+}  // namespace
+
+/* static */ port::Status GpuDriver::Init() {
+  // Cached return value from calling InternalInit(), as hipInit need only be
+  // called once, but GpuDriver::Init may be called many times.
+  static port::Status init_retval;
+  static bool set = false;
+  static mutex* init_mu = new mutex;
+
+  mutex_lock lock(*init_mu);
+  if (!set) {
+    init_retval = InternalInit();
+    set = true;
+  }
+
+  return init_retval;
+}
+
+/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
+                                               hipDevice_t* device) {
+  hipError_t res = hipDeviceGet(device, device_ordinal);
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
+}
+
+/* static */ bool GpuDriver::GetDeviceName(hipDevice_t device,
+                                           string* device_name) {
+  static const size_t kCharLimit = 64;
+  absl::InlinedVector<char, 4> chars(kCharLimit);
+  hipError_t res = hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get device name for " << device << ": "
+               << ToString(res);
+    return false;
+  }
+  chars[kCharLimit - 1] = '\0';
+  *device_name = chars.begin();
+  return true;
+}
+
+bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
+                                 int* flags) {
+  static_assert(DeviceOptions::kMask == 0xf,
+                "needs update for new device options");
+  return true;
+}
+
+/* static */ port::Status GpuDriver::CreateContext(
+    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
+    GpuContext** context) {
+  *context = new GpuContext(device_ordinal);
+  return port::Status::OK();
+}
+/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
+  if (context == nullptr) {
+    return;
+  }
+  delete context;
+}
+
+/* static */ bool GpuDriver::FuncGetAttribute(hipDeviceAttribute_t attribute,
+                                              hipFunction_t func,
+                                              int* attribute_value) {
+  // TODO(ROCm) properly implement this feature in HIP
+  hipError_t res = hipSuccess;
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
+               << ", attribute: " << attribute;
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::FuncSetCacheConfig(hipFunction_t function,
+                                                hipFuncCache_t cache_config) {
+  hipError_t res = hipFuncSetCacheConfig(function, cache_config);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function
+               << ", config: " << cache_config << ", result: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<hipSharedMemConfig>
+GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
+  hipSharedMemConfig shared_mem_config;
+  ScopedActivateContext activation{context};
+  hipError_t result = hipDeviceGetSharedMemConfig(&shared_mem_config);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "failed to get ROCM device shared memory config. "
+               << "Context device ID: " << context->device_ordinal()
+               << ", result: " << ToString(result);
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get shared memory config: ", ToString(result))};
+  }
+  return shared_mem_config;
+}
+
+/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
+    GpuContext* context, hipSharedMemConfig shared_mem_config) {
+  ScopedActivateContext activation{context};
+  hipError_t result = hipDeviceSetSharedMemConfig(shared_mem_config);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "failed to set ROCM device shared memory config. "
+               << "Context device ID: " << context->device_ordinal()
+               << ", config: " << shared_mem_config
+               << ", result: " << ToString(result);
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to set shared memory config: ", ToString(result))};
+  }
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::LaunchKernel(
+    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
+    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
+    unsigned int block_dim_y, unsigned int block_dim_z,
+    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
+    void** extra) {
+  ScopedActivateContext activation{context};
+  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
+          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
+          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
+          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
+  hipError_t res = hipModuleLaunchKernel(
+      function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
+      block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to launch ROCM kernel: " << function
+               << "; result: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully launched kernel";
+  return true;
+}
+
+/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
+                                     const char* ptx_contents,
+                                     hipModule_t* module) {
+  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
+  return false;
+}
+
+/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
+                                               const char* cubin_bytes,
+                                               hipModule_t* module) {
+  return port::Status{port::error::INTERNAL,
+                      "Feature not supported on ROCm platform (LoadCubin)"};
+}
+
+/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
+                                       const char* hsaco_contents,
+                                       hipModule_t* module) {
+  port::Notification notification;
+  bool ret = true;
+  GetDriverExecutor()->Schedule(
+      [context, hsaco_contents, module, &ret, &notification]() {
+        ScopedActivateContext activation{context};
+        void* hsaco_data = const_cast<char*>(hsaco_contents);
+
+        hipError_t res = hipModuleLoadData(module, hsaco_data);
+
+        if (res != hipSuccess) {
+          LOG(ERROR) << "failed to load HSACO: " << ToString(res);
+          ret = false;
+          notification.Notify();
+        }
+
+        CHECK(module != nullptr);
+        notification.Notify();
+      });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
+                                                    hipDeviceptr_t location,
+                                                    uint8 value, size_t size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipMemset(location, value, size);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
+                                                     hipDeviceptr_t location,
+                                                     uint32 value,
+                                                     size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  void* pointer = absl::bit_cast<void*>(location);
+  unsigned char valueC = static_cast<unsigned char>(value);
+  uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
+  if (value32 != value) {
+    //  mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
+    LOG(ERROR) << "failed to memset memory";
+    return false;
+  }
+  hipError_t res =
+      hipMemset(pointer, static_cast<int>(value), uint32_count * 4);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
+                                                     hipDeviceptr_t location,
+                                                     uint8 value,
+                                                     size_t uint32_count,
+                                                     GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipMemsetAsync(location, value, uint32_count, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
+                                                      hipDeviceptr_t location,
+                                                      uint32 value,
+                                                      size_t uint32_count,
+                                                      GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  void* pointer = absl::bit_cast<void*>(location);
+
+  // FIXME - need to set a 32-bit value here
+  unsigned char valueC = static_cast<unsigned char>(value);
+  uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
+  if (value32 != value) {
+    // mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
+    LOG(ERROR) << "failed to memset memory";
+    return false;
+  }
+  hipError_t res = hipMemsetAsync(pointer, value, uint32_count * 4, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
+                                               GpuStreamHandle stream,
+                                               StreamCallback callback,
+                                               void* data) {
+  hipError_t res = hipStreamAddCallback(stream, (hipStreamCallback_t)callback,
+                                        data, 0 /* = flags */);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "unable to add host callback: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
+                                               hipModule_t module,
+                                               const char* kernel_name,
+                                               hipFunction_t* function) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && kernel_name != nullptr);
+  hipError_t res = hipModuleGetFunction(function, module, kernel_name);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get kernel \"" << kernel_name
+               << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
+                                             hipModule_t module,
+                                             const char* symbol_name,
+                                             hipDeviceptr_t* dptr,
+                                             size_t* bytes) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && symbol_name != nullptr &&
+        (dptr != nullptr || bytes != nullptr));
+  hipError_t res = hipModuleGetGlobal(dptr, bytes, module, symbol_name);
+  if (res != hipSuccess) {
+    // symbol may not be found in the current module, but it may reside in
+    // another module.
+    VLOG(2) << "failed to get symbol \"" << symbol_name
+            << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ void GpuDriver::UnloadModule(GpuContext* context,
+                                          hipModule_t module) {
+  ScopedActivateContext activated{context};
+  hipError_t res = hipModuleUnload(module);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to unload module " << module
+               << "; leaking: " << ToString(res);
+  }
+}
+
+/* static */ bool GpuDriver::CreateStream(GpuContext* context,
+                                          GpuStreamHandle* stream) {
+  ScopedActivateContext activated{context};
+  hipError_t res = hipStreamCreateWithFlags(
+      stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not allocate ROCM stream for device "
+               << context->device_ordinal() << ": " << ToString(res);
+    return false;
+  }
+
+  VLOG(2) << "successfully created stream " << *stream << " for device "
+          << context->device_ordinal() << " on thread";
+  return true;
+}
+
+/* static */ void GpuDriver::DestroyStream(GpuContext* context,
+                                           GpuStreamHandle* stream) {
+  if (*stream == nullptr) {
+    return;
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = hipStreamDestroy(*stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to destroy ROCM stream for device "
+               << context->device_ordinal() << ": " << ToString(res);
+  } else {
+    VLOG(2) << "successfully destroyed stream " << *stream << " for device "
+            << context->device_ordinal();
+    *stream = nullptr;
+  }
+}
+
+/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
+                                             uint64 bytes) {
+  ScopedActivateContext activated{context};
+  hipDeviceptr_t result = 0;
+  hipError_t res = hipMalloc(&result, bytes);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to allocate "
+               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
+               << " bytes) from device: " << ToString(res);
+    return nullptr;
+  }
+  void* ptr = reinterpret_cast<void*>(result);
+  VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
+          << " of " << bytes << " bytes";
+  return ptr;
+}
+
+/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
+                                              void* location) {
+  ScopedActivateContext activation{context};
+  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
+  hipError_t res = hipFree(pointer);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to free device memory at " << location
+               << "; result: " << ToString(res);
+  } else {
+    VLOG(2) << "deallocated " << location << " for device "
+            << context->device_ordinal();
+  }
+}
+
+/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
+                                                    uint64 bytes) {
+  ScopedActivateContext activated{context};
+
+  LOG(ERROR)
+      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
+  return nullptr;
+}
+
+/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
+                                                     void* location) {
+  LOG(ERROR)
+      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
+}
+
+/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation{context};
+  void* host_mem = nullptr;
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res = hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to alloc " << bytes
+               << " bytes on host: " << ToString(res);
+  }
+  return host_mem;
+}
+
+/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
+                                            void* location) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipHostFree(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error deallocating host memory at " << location << ": "
+               << ToString(res);
+  }
+}
+
+/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
+                                          uint64 bytes) {
+  ScopedActivateContext activation{context};
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res = hipHostRegister(location, bytes, hipHostRegisterPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error registering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
+                                            void* location) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipHostUnregister(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error unregistering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
+                                                  GpuEventHandle* event) {
+  if (*event == nullptr) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        "input event cannot be null"};
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = hipEventDestroy(*event);
+  *event = nullptr;
+
+  switch (res) {
+    case hipSuccess:
+      return port::Status::OK();
+    case hipErrorDeinitialized:
+    case hipErrorNotInitialized:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          absl::StrFormat("error destroying ROCM event in device %d: %s",
+                          context->device_ordinal(), ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INTERNAL,
+          absl::StrFormat("error destroying ROCM event in device %d: %s",
+                          context->device_ordinal(), ToString(res).c_str())};
+  }
+}
+
+/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
+                                                 GpuEventHandle event,
+                                                 GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  hipError_t res = hipEventRecord(event, stream);
+  switch (res) {
+    case hipSuccess:
+      return port::Status::OK();
+    case hipErrorDeinitialized:
+    case hipErrorNotInitialized:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
+                          ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INVALID_ARGUMENT,
+          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
+                          ToString(res).c_str())};
+  }
+}
+
+/* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
+    GpuContext* context, GpuEventHandle event) {
+  ScopedActivateContext activated{context};
+  hipError_t res = hipEventQuery(event);
+  if (res != hipSuccess && res != hipErrorNotReady) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
+  }
+
+  return res;
+}
+
+/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
+                                                 float* elapsed_milliseconds,
+                                                 GpuEventHandle start,
+                                                 GpuEventHandle stop) {
+  ScopedActivateContext activated{context};
+  // The stop event must have completed in order for hipEventElapsedTime to
+  // work.
+  hipError_t res = hipEventSynchronize(stop);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
+    return false;
+  }
+  res = hipEventElapsedTime(elapsed_milliseconds, start, stop);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get elapsed time between events: "
+               << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
+                                               GpuStreamHandle stream,
+                                               GpuEventHandle event) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipStreamWaitEvent(stream, event, 0 /* = flags */);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipDeviceSynchronize();
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
+                                                       GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  hipError_t res = hipStreamSynchronize(stream);
+  if (res != hipSuccess) {
+    port::Status status = port::InternalError(
+        absl::StrCat("could not synchronize on ROCM stream: ", ToString(res)));
+    LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
+    return status;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on device "
+          << context->device_ordinal();
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
+                                          GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  hipError_t res = hipStreamQuery(stream);
+  if (res == hipSuccess) {
+    return true;
+  }
+
+  if (res != hipErrorNotReady) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
+    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipMemcpyDtoH(host_dst, gpu_src, size);
+  if (res != hipSuccess) {
+    return port::InternalError(
+        absl::StrFormat("failed to synchronous memcpy from device to host: %s; "
+                        "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
+                        ToString(res).c_str(), host_dst,
+                        absl::bit_cast<void*>(gpu_src), size, size));
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
+    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
+    uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src), size);
+  if (res != hipSuccess) {
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
+        size));
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
+    GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
+    uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipMemcpyDtoD(gpu_dst, gpu_src, size);
+  if (res != hipSuccess) {
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; "
+        "Gpu src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
+        absl::bit_cast<void*>(gpu_src), size, size));
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
+                                                   void* host_dst,
+                                                   hipDeviceptr_t gpu_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res = hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
+        "Gpu src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
+          << host_dst << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
+                                                   hipDeviceptr_t gpu_dst,
+                                                   const void* host_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      hipMemcpyHtoDAsync(gpu_dst, const_cast<void*>(host_src), size, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
+        "host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
+          << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
+                                                   hipDeviceptr_t gpu_dst,
+                                                   hipDeviceptr_t gpu_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t result = hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  if (result != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from device to device: %s"
+        "; Gpu dst: %p on %s %s"
+        "; Gpu src: %p on %s %s"
+        "; can access? %s; size: %llu=0x%llx",
+        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
+        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
+        ROCMPointerToDeviceString(gpu_dst).c_str(),
+        absl::bit_cast<void*>(gpu_src),
+        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
+        ROCMPointerToDeviceString(gpu_src).c_str(),
+        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
+
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
+                                                 GpuEventHandle* event,
+                                                 EventFlags flags) {
+  int hipflags;
+  switch (flags) {
+    case EventFlags::kDefault:
+      hipflags = hipEventDefault;
+      break;
+    case EventFlags::kDisableTiming:
+      hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
+      break;
+    default:
+      LOG(FATAL) << "impossible event flags: " << int(hipflags);
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = hipEventCreateWithFlags(event, hipflags);
+
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  } else if (res == hipErrorMemoryAllocation) {
+    return port::Status{port::error::RESOURCE_EXHAUSTED,
+                        "could not create ROCM event: out of device memory"};
+  } else {
+    return port::Status{
+        port::error::FAILED_PRECONDITION,
+        absl::StrCat("could not create ROCM event: ", ToString(res))};
+  }
+}
+
+/* static */ int GpuDriver::GetDeviceCount() {
+  int device_count = 0;
+  hipError_t res = hipGetDeviceCount(&device_count);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
+    return 0;
+  }
+
+  if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
+    device_count = 1;
+  }
+  return device_count;
+}
+
+/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
+                                                          int* cc_minor,
+                                                          hipDevice_t device) {
+  return port::Status(
+      port::error::INTERNAL,
+      absl::StrFormat("failed to get compute capability for device: %d "
+                      "(unsupported API on AMD Gpus)",
+                      device));
+}
+
+/* static */ port::Status GpuDriver::GetPointerAddressRange(
+    hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
+  hipError_t result = hipMemGetAddressRange(base, size, dptr);
+  if (result == hipSuccess) {
+    return port::Status::OK();
+  } else if (result == hipErrorNotFound) {
+    // We differentiate between "this pointer is unknown" (return here) and
+    // "there was an internal error while performing this operation" (return
+    // below).
+    return port::Status{port::error::NOT_FOUND,
+                        absl::StrFormat("not a device pointer %p; %s",
+                                        reinterpret_cast<void*>(dptr),
+                                        ToString(result).c_str())};
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrFormat("failed to get pointer into for device pointer %p; %s",
+                      reinterpret_cast<void*>(dptr), ToString(result).c_str())};
+}
+
+/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
+    hipDeviceptr_t pointer) {
+  unsigned int value;
+  hipError_t result = hipSuccess;
+  if (result == hipSuccess) {
+    switch (value) {
+      case hipMemoryTypeDevice:
+        return MemorySpace::kDevice;
+      case hipMemoryTypeHost:
+        return MemorySpace::kHost;
+      default:
+        return port::Status{
+            port::error::INTERNAL,
+            absl::StrCat("unknown memory space provided by ROCM API: ", value)};
+    }
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrCat("failed to query device pointer for memory space: ",
+                   ToString(result))};
+}
+
+/* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
+    hipDeviceptr_t pointer) {
+  hipPointerAttribute_t pointerAttributes;
+  hipError_t result = hipPointerGetAttributes(&pointerAttributes, pointer);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get device for pointer: ", ToString(result))};
+  }
+
+  hipDevice_t device;
+  result = hipDeviceGet(&device, pointerAttributes.device);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get device for pointer: ", ToString(result))};
+  }
+
+  return device;
+}
+
+/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
+                                                      hipDevice_t device) {
+  hipDeviceProp_t props;
+  hipError_t result = hipGetDeviceProperties(&props, device);
+  if (result == hipSuccess) {
+    *version = props.gcnArch;
+    return port::Status::OK();
+  }
+  *version = 0;
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
+                      device)};
+}
+
+// Helper function that turns the integer output of hipDeviceGetAttribute to
+// type T and wraps it in a StatusOr.
+template <typename T>
+static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
+                                            hipDeviceAttribute_t attribute) {
+  int value = -1;
+  hipError_t result = hipDeviceGetAttribute(&value, attribute, device);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        absl::StrCat("could not retrieve ROCM device attribute (", attribute,
+                     "): ", ToString(result))};
+  }
+  T converted = value;
+  return converted;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(
+      device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxSharedMemoryPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(
+      device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxThreadsPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxRegistersPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
+}
+
+/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
+                                           hipDevice_t device) {
+  int value;
+  hipError_t res =
+      hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
+    return false;
+  }
+  *x = value;
+
+  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
+    return false;
+  }
+  *y = value;
+
+  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
+    return false;
+  }
+  *z = value;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
+  hipError_t res = hipDriverGetVersion(driver_version);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query driver version: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceProperties(
+    hipDeviceProp_t* device_properties, int device_ordinal) {
+  hipError_t res = hipGetDeviceProperties(device_properties, device_ordinal);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query device properties: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
+    hipDeviceAttribute_t attribute, hipDevice_t device) {
+  return GetSimpleAttribute<int>(device, attribute);
+}
+
+/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
+  int value = -1;
+  hipError_t res = hipSuccess;
+  // TODO(ROCm) implement this feature in HIP
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
+                                                 int64* free_out,
+                                                 int64* total_out) {
+  ScopedActivateContext activation{context};
+  size_t free = 0;
+  size_t total = 0;
+  hipError_t res = hipMemGetInfo(&free, &total);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
+    return false;
+  }
+
+  *free_out = free;
+  *total_out = total;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
+                                                  uint64* result) {
+  size_t value = -1;
+  hipError_t res = hipDeviceTotalMem(&value, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
+  string pci_bus_id;
+  static const int kBufferSize = 64;
+  absl::InlinedVector<char, 4> chars(kBufferSize);
+  chars[kBufferSize - 1] = '\0';
+  hipError_t res = hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
+    return pci_bus_id;
+  }
+  pci_bus_id = chars.begin();
+  return pci_bus_id;
+}
+
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
+                                                 GpuContext* to) {
+  if (from->device_ordinal() == to->device_ordinal()) {
+    return true;  // A device can always access its own memory.
+  }
+
+  int can_access_peer = -1;
+  hipError_t res = hipDeviceCanAccessPeer(
+      &can_access_peer, from->device_ordinal(), to->device_ordinal());
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+    return false;
+  }
+
+  return can_access_peer;
+}
+
+/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
+                                                      GpuContext* to) {
+  if (from->device_ordinal() == to->device_ordinal()) {
+    return port::Status::OK();  // A device can always access its own memory.
+  }
+
+  ScopedActivateContext activated{from};
+  hipError_t result =
+      hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */);
+  if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to enable peer access from %d to %d: %s",
+                        from->device_ordinal(), to->device_ordinal(),
+                        ToString(result).c_str())};
+  }
+
+  return port::Status::OK();
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
+    GpuContext* context, hipFunction_t kernel, int threads_per_block,
+    size_t dynamic_shared_memory_bytes) {
+  ScopedActivateContext activation{context};
+
+  int max_blocks = 0;
+  hipError_t result = hipSuccess;
+  // TODO(ROCm) implement this feature in HIP
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
+                        kernel, ToString(result).c_str())};
+  }
+
+  return max_blocks;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_event.cc b/tensorflow/stream_executor/rocm/rocm_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0ffd74c177bf5149f98cc045a51559b9acf1d94
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_event.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+Event::Status GpuEvent::PollForStatus() {
+  port::StatusOr<hipError_t> status =
+      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
+  if (!status.ok()) {
+    LOG(ERROR) << "Error polling for event status: "
+               << status.status().error_message();
+    return Event::Status::kError;
+  }
+
+  switch (status.ValueOrDie()) {
+    case hipSuccess:
+      return Event::Status::kComplete;
+    case hipErrorNotReady:
+      return Event::Status::kPending;
+    default:
+      LOG(INFO) << "Error condition returned for event status: "
+                << status.ValueOrDie();
+      return Event::Status::kError;
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..249c7870e7efe75e9d1bb9a05041876b10d4d2bb
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
@@ -0,0 +1,976 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+
+#include "absl/base/casts.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/mathutil.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/path.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/timer.h"
+
+#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
+#error \
+    "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
+#endif
+
+#ifdef __ROCM_RUNTIME_H__
+#error \
+    "ROCM runtime being included into ROCM GPU executor; should be driver only."
+#endif
+
+namespace stream_executor {
+namespace gpu {
+
+static GpuEvent* AsGpuEvent(Event* event) {
+  DCHECK(event != nullptr);
+  return static_cast<GpuEvent*>(event->implementation());
+}
+
+// Given a platform-independent timer datatype, returns the internal ROCM
+// platform implementation pointer.
+static GpuTimer* AsGpuTimer(Timer* timer) {
+  DCHECK(timer != nullptr);
+  return static_cast<GpuTimer*>(timer->implementation());
+}
+
+// Given const GPU memory, returns a librocm device pointer datatype, suitable
+// for passing directly to librocm APIs.
+//
+// N.B. we must lose constness in order to pass a suitable type to the existing
+// librocm APIs, so the caller should take care to only pass the result of const
+// GPU memory conversions to librocm functions which will honor constness.
+static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
+  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
+}
+
+// See description on const version above.
+static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
+  return AsROCmDevicePtr(*gpu_mem);
+}
+
+static GpuContext* GetGpuContext(Stream* stream) {
+  return static_cast<GpuExecutor*>(stream->parent()->implementation())
+      ->gpu_context();
+}
+
+GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
+  CHECK(rocm_exec != nullptr);
+  return rocm_exec->gpu_context();
+}
+
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
+  return static_cast<GpuExecutor*>(stream_exec->implementation());
+}
+
+GpuExecutor::~GpuExecutor() {
+  for (auto& it : disk_modules_) {
+    GpuDriver::UnloadModule(context_, it.second);
+  }
+  for (auto& it : in_memory_modules_) {
+    GpuDriver::UnloadModule(context_, it.second);
+  }
+  if (context_ != nullptr) {
+    GpuDriver::DestroyContext(context_);
+  }
+  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
+}
+bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
+  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(gpu_binary);
+}
+
+bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
+  auto module_it = gpu_binary_to_module_.find(gpu_binary);
+  if (gpu_binary_to_module_.end() == module_it) {
+    VLOG(3) << "No loaded  HSACO module for " << gpu_binary;
+    return false;
+  }
+  auto& module = module_it->second.first;
+  auto& refcount = module_it->second.second;
+  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
+  if (--refcount == 0) {
+    VLOG(3) << "Unloading  HSACO module " << module;
+    GpuDriver::UnloadModule(context_, module);
+    gpu_binary_to_module_.erase(module_it);
+  }
+  return true;
+}
+
+void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (UnloadKernel)";
+}
+
+port::Status GpuExecutor::Init(int device_ordinal,
+                               DeviceOptions device_options) {
+  device_ordinal_ = device_ordinal;
+
+  auto status = GpuDriver::Init();
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = GpuDriver::GetDevice(device_ordinal_, &device_);
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
+                                    &context_);
+  if (!status.ok()) {
+    return status;
+  }
+
+  return GpuDriver::GetGpuISAVersion(&version_, device_);
+}
+
+bool GpuExecutor::FindOnDiskForComputeCapability(
+    absl::string_view filename, absl::string_view canonical_suffix,
+    string* found_filename) const {
+  LOG(FATAL) << "Feature not supported on ROCM platform "
+                "(FindOnDiskForComputeCapability)";
+  return false;
+}
+
+bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
+                                          absl::string_view canonical_suffix,
+                                          string* found_filename) const {
+  if (version_ == 0) {
+    return false;
+  }
+
+  string cc_specific =
+      absl::StrCat(filename, ".cc", version_, canonical_suffix);
+  if (port::FileExists(cc_specific).ok()) {
+    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
+            << cc_specific;
+    *found_filename = cc_specific;
+    return true;
+  }
+
+  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
+          << cc_specific;
+  if (port::FileExists(string(filename)).ok()) {
+    *found_filename = string(filename);
+    return true;
+  }
+
+  return false;
+}
+
+// Returns the path to the running executable.
+// N.B. Derived from //knowledge/smalltalk/background_kb.cc
+// Arg: strip_exe: if true, remove the name of the executable itself from the
+//                 returned string. Example: calling this from /usr/bin/foo
+//                 would return /usr/bin.
+static string GetBinaryDir(bool strip_exe) {
+  char exe_path[PATH_MAX] = {0};
+  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  // Make sure it's null-terminated:
+  exe_path[sizeof(exe_path) - 1] = 0;
+
+  if (strip_exe) {
+    // The exe is the last component of the path, so remove one component.
+    string ret = exe_path;
+    std::vector<string> components = port::Split(exe_path, '/');
+    components.pop_back();
+    return port::Join(components, "/");
+  }
+  return exe_path;
+}
+
+bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
+                            KernelBase* kernel) {
+  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
+  hipModule_t module = nullptr;
+  const string* kernelname;
+
+  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
+  bool has_cubin = spec.has_cuda_cubin_on_disk();
+  if (has_cubin) {
+    on_disk_spec = &spec.cuda_cubin_on_disk();
+  }
+
+  if (on_disk_spec != nullptr) {
+    LOG(WARNING) << "loading ROCM kernel from disk is not supported";
+    return false;
+  } else if (spec.has_cuda_cubin_in_memory()) {
+    kernelname = &spec.cuda_cubin_in_memory().kernelname();
+
+    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
+    mutex_lock lock{in_memory_modules_mu_};
+    module = in_memory_modules_[hsaco];
+
+    if (module == nullptr) {
+      if (!GpuDriver::LoadHsaco(context_, hsaco, &module)) {
+        LOG(ERROR) << "failed to load HSACO\n";
+        return false;
+      }
+      in_memory_modules_[hsaco] = module;
+    }
+  } else {
+    LOG(WARNING) << "no method of loading ROCM kernel provided";
+    return false;
+  }
+
+  VLOG(2) << "getting function " << *kernelname << " from module " << module;
+  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
+                                    rocm_kernel->gpu_function_ptr())) {
+    return false;
+  }
+
+  // We have to trust the kernel loader spec arity because there doesn't appear
+  // to be a way to reflect on the number of expected arguments w/the ROCM API.
+  rocm_kernel->set_arity(spec.arity());
+
+  KernelMetadata kernel_metadata;
+  if (!GetKernelMetadata(rocm_kernel, &kernel_metadata)) {
+    LOG(WARNING) << "Unable to get metadata for kernel " << kernelname;
+  }
+  kernel->set_metadata(kernel_metadata);
+  kernel->set_name(*kernelname);
+  return true;
+}
+
+bool GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
+                                    KernelMetadata* kernel_metadata) {
+  int value = 0;
+  // TODO(ROCm) implement this feature in HIP
+  kernel_metadata->set_registers_per_thread(value);
+
+  // TODO(ROCm) implement this feature in HIP
+  kernel_metadata->set_shared_memory_bytes(value);
+
+  return true;
+}
+
+bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims, const KernelBase& kernel,
+                         const KernelArgsArrayBase& args) {
+  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
+  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
+  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
+  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();
+
+  // Only perform/print the occupancy check once.  Even just checking to see
+  // whether we've done an occupancy check on this kernel before isn't free
+  // (because we have to synchronize), so we only do this at -v 2+.
+  if (VLOG_IS_ON(2)) {
+    mutex_lock lock(launched_kernels_mu_);
+    if (!launched_kernels_.count(hipfunc)) {
+      VlogOccupancyInfo(kernel, thread_dims, block_dims);
+      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
+      // expose a kernel/module deallocation method.
+      launched_kernels_.insert(hipfunc);
+    }
+  }
+
+  if (rocm_kernel->GetPreferredCacheConfig() !=
+      KernelCacheConfig::kNoPreference) {
+    GpuDriver::FuncSetCacheConfig(hipfunc, rocm_kernel->GetGpuCacheConfig());
+  }
+
+  // prepare kernargs
+  // KernelArgsArrayBase keeps the pointer of arguments
+  // deference them here
+  std::vector<void*> kernargs;
+  KernelArgIterator iter = args.arg_iterator();
+  while (iter.has_next()) {
+    KernelArg arg = iter.next();
+    VLOG(2) << "*(arg.address): "
+            << reinterpret_cast<void*>(
+                   *static_cast<const uint64_t*>(arg.address));
+    kernargs.push_back(
+        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
+  }
+
+  size_t size = sizeof(void*) * kernargs.size();
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
+
+  if (!GpuDriver::LaunchKernel(
+          GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y,
+          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
+          args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config)) {
+    LOG(ERROR) << "failed to launch ROCM kernel with args: "
+               << args.number_of_arguments()
+               << "; thread dim: " << thread_dims.ToString()
+               << "; block dim: " << block_dims.ToString();
+    return false;
+  }
+
+  return true;
+}
+
+int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
+                                    uint64 registers_per_thread,
+                                    uint64 shared_memory_per_block,
+                                    const ThreadDim& thread_dims,
+                                    GpuFunctionHandle func) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
+  return 0;
+}
+
+int GpuExecutor::CompareOccupancy(int* initial_blocks,
+                                  const DeviceDescription& device_description,
+                                  uint64 registers_per_thread,
+                                  uint64 shared_memory_per_block,
+                                  const ThreadDim& thread_dims,
+                                  GpuFunctionHandle func) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
+  return 0;
+}
+
+bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
+                             ModuleHandle* module_handle) {
+  // In GpuExecutor we store the pointer to the  HSACO binary  as
+  // ModuleHandle::id().
+  hipModule_t hip_module = nullptr;
+  // TODO(ROCm): Need  generic term instead of cubin/cuda/ptx
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromHsaco(
+            reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
+            &hip_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void*>(
+        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
+    return true;
+  } else {
+    LOG(ERROR) << "No HSACO binary found \n";
+    return false;
+  }
+}
+
+bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, hipModule_t* module) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
+  return false;
+}
+
+bool GpuExecutor::LoadModuleFromPtx(const char* ptx, hipModule_t* module) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
+  return false;
+}
+
+bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, hipModule_t* module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];
+
+  if (*module == nullptr) {
+    if (!GpuDriver::LoadHsaco(context_, hsaco, module)) {
+      LOG(ERROR) << "failed to load : HSACO \n";
+      return false;
+    }
+    module_refcount = 1;
+    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
+            << " as module " << *module;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
+  return true;
+}
+
+// This is a non-essential operation; if there's a failure, proceed without
+// logging an error. It's nearly certain that in case of failures, we'd never
+// get here in the first place; these are very low-impact routines.
+void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
+                                    const ThreadDim& thread_dims,
+                                    const BlockDim& block_dims) {
+  // TODO(ROCm) implement this feature in HIP
+}
+
+void* GpuExecutor::Allocate(uint64 size) {
+  return GpuDriver::DeviceAllocate(context_, size);
+}
+
+void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                                     uint64 size_bytes) {
+  // offset and size are in bytes, so char* works as the pointer type.
+  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
+}
+
+void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
+  // ROCM "sub-buffers" are just pointer + offset, so no dealloc is necessary.
+  if (!mem->is_sub_buffer()) {
+    GpuDriver::DeviceDeallocate(context_, mem->opaque());
+  }
+}
+
+bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
+  if (location == nullptr || size == 0) {
+    LOG(WARNING) << "attempting to register null or zero-sized memory: "
+                 << location << "; size " << size;
+  }
+  VLOG(2) << "registering " << location << " size " << size;
+  return GpuDriver::HostRegister(context_, location, size);
+}
+
+bool GpuExecutor::HostMemoryUnregister(void* location) {
+  VLOG(2) << "unregistering " << location;
+  return GpuDriver::HostUnregister(context_, location);
+}
+
+bool GpuExecutor::SynchronizeAllActivity() {
+  return GpuDriver::SynchronizeContext(context_);
+}
+
+bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    return GpuDriver::SynchronousMemsetUint32(
+        context_, AsROCmDevicePtr(location), 0x0, size / 4);
+  }
+  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                           0x0, size);
+}
+
+bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
+                                    uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    // hipMemset reinterprets "value" as a uint8.
+    uint8 byte_value = static_cast<uint8>(value);
+    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
+                     (byte_value << 8) | byte_value;
+    return GpuDriver::SynchronousMemsetUint32(
+        context_, AsROCmDevicePtr(location), pattern, size / 4);
+  }
+  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                           value, size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                            const void* host_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
+                                         host_src, size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
+                                            const DeviceMemoryBase& gpu_src,
+                                            uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
+                                         AsROCmDevicePtr(gpu_src), size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
+    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
+                                         AsROCmDevicePtr(gpu_src), size);
+}
+
+bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
+                          uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    return Memset32(stream, location, 0x0, size);
+  } else {
+    return Memset(stream, location, 0x0, size);
+  }
+}
+
+bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
+                         uint8 pattern, uint64 size) {
+  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
+          << " at location " << location << " with size " << size
+          << " and pattern " << std::hex << pattern;
+  return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                            pattern, size,
+                                            AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
+                           uint32 pattern, uint64 size) {
+  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
+          << " at location " << location << " with size " << size
+          << " and pattern " << std::hex << pattern;
+  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+        size % 4 == 0);
+  return GpuDriver::AsynchronousMemsetUint32(
+      context_, AsROCmDevicePtr(location), pattern, size / 4,
+      AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
+                         const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
+                                          AsROCmDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
+                         const void* host_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
+                                          host_src, size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
+                                       DeviceMemoryBase* gpu_dst,
+                                       const DeviceMemoryBase& gpu_src,
+                                       uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
+                                          AsROCmDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::HostCallback(Stream* stream,
+                               std::function<port::Status()> callback) {
+  auto callback_ptr = new std::function<void()>([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
+  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
+                                      InternalHostCallback, callback_ptr);
+}
+
+/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
+                                                    hipError_t status,
+                                                    void* data) {
+  std::function<void()>* callback =
+      reinterpret_cast<std::function<void()>*>(data);
+  (*callback)();
+  delete callback;
+}
+
+port::Status GpuExecutor::AllocateEvent(Event* event) {
+  return AsGpuEvent(event)->Init();
+}
+
+port::Status GpuExecutor::DeallocateEvent(Event* event) {
+  return AsGpuEvent(event)->Destroy();
+}
+
+port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
+  return AsGpuEvent(event)->Record(AsGpuStream(stream));
+}
+
+port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
+  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
+                                   AsGpuEvent(event)->gpu_event())) {
+    return port::Status::OK();
+  } else {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("error recording waiting for ROCM event on stream %p",
+                        stream)};
+  }
+}
+
+Event::Status GpuExecutor::PollForEventStatus(Event* event) {
+  return AsGpuEvent(event)->PollForStatus();
+}
+
+bool GpuExecutor::AllocateStream(Stream* stream) {
+  return AsGpuStream(stream)->Init();
+}
+
+void GpuExecutor::DeallocateStream(Stream* stream) {
+  GpuStream* rocm_stream = AsGpuStream(stream);
+  if (!rocm_stream->IsIdle()) {
+    LOG(ERROR) << "Deallocating stream with pending work";
+  }
+  rocm_stream->Destroy();
+}
+
+bool GpuExecutor::AllocateTimer(Timer* timer) {
+  return AsGpuTimer(timer)->Init();
+}
+
+void GpuExecutor::DeallocateTimer(Timer* timer) {
+  AsGpuTimer(timer)->Destroy();
+}
+
+bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
+  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
+  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
+                                   AsGpuStreamValue(other))
+                .ok();
+  if (!ok) {
+    LOG(ERROR) << "failed to record completion event; "
+                  "therefore, failed to create inter-stream dependency";
+    return false;
+  }
+
+  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
+                                      other_completed_event);
+}
+
+bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
+}
+
+bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
+}
+
+port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
+  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
+}
+
+blas::BlasSupport* GpuExecutor::CreateBlas() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::BlasFactory> status =
+      registry->GetFactory<PluginRegistry::BlasFactory>(kROCmPlatformId,
+                                                        plugin_config_.blas());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve BLAS factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+dnn::DnnSupport* GpuExecutor::CreateDnn() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::DnnFactory> status =
+      registry->GetFactory<PluginRegistry::DnnFactory>(kROCmPlatformId,
+                                                       plugin_config_.dnn());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve DNN factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+fft::FftSupport* GpuExecutor::CreateFft() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::FftFactory> status =
+      registry->GetFactory<PluginRegistry::FftFactory>(kROCmPlatformId,
+                                                       plugin_config_.fft());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve FFT factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+rng::RngSupport* GpuExecutor::CreateRng() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::RngFactory> status =
+      registry->GetFactory<PluginRegistry::RngFactory>(kROCmPlatformId,
+                                                       plugin_config_.rng());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve RNG factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+// TODO(rspringer): Remove in b/18544742.
+bool GpuExecutor::SupportsDnn() const { return true; }
+
+bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
+}
+
+port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
+}
+
+SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
+  port::StatusOr<hipSharedMemConfig> rocm_config =
+      GpuDriver::ContextGetSharedMemConfig(context_);
+  if (!rocm_config.ok()) {
+    // Don't log; the failed call will log necessary output.
+    return SharedMemoryConfig::kDefault;
+  }
+
+  switch (rocm_config.ValueOrDie()) {
+    case hipSharedMemBankSizeDefault:
+      return SharedMemoryConfig::kDefault;
+    case hipSharedMemBankSizeFourByte:
+      return SharedMemoryConfig::kFourByte;
+    case hipSharedMemBankSizeEightByte:
+      return SharedMemoryConfig::kEightByte;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration returned: "
+                 << rocm_config.ValueOrDie();
+  }
+}
+
+port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
+    SharedMemoryConfig config) {
+  hipSharedMemConfig rocm_config;
+  switch (config) {
+    case SharedMemoryConfig::kDefault:
+      rocm_config = hipSharedMemBankSizeDefault;
+      break;
+    case SharedMemoryConfig::kFourByte:
+      rocm_config = hipSharedMemBankSizeFourByte;
+      break;
+    case SharedMemoryConfig::kEightByte:
+      rocm_config = hipSharedMemBankSizeEightByte;
+      break;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration specified: "
+                 << static_cast<int>(config);
+  }
+  return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
+}
+
+bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
+  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
+}
+
+bool GpuExecutor::GetSymbol(const string& symbol_name,
+                            ModuleHandle module_handle, void** mem,
+                            size_t* bytes) {
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{disk_modules_mu_};
+    for (auto& it : disk_modules_) {
+      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                     reinterpret_cast<hipDeviceptr_t*>(mem),
+                                     bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    for (auto& it : in_memory_modules_) {
+      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                     reinterpret_cast<hipDeviceptr_t*>(mem),
+                                     bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    if (static_cast<bool>(module_handle)) {
+      auto it = gpu_binary_to_module_.find(module_handle.id());
+      CHECK(it != gpu_binary_to_module_.end());
+      if (GpuDriver::GetModuleSymbol(
+              context_, it->second.first, symbol_name.c_str(),
+              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+        return true;
+      }
+    }
+
+    for (auto& it : gpu_binary_to_module_) {
+      if (GpuDriver::GetModuleSymbol(
+              context_, it.second.first, symbol_name.c_str(),
+              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+        return true;
+      }
+    }
+  }
+
+  LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
+  return false;
+}
+
+bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
+  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
+  // we use BlockDims to express the dimensions of blocks within a grid
+  // (as opposed to ThreadDim which expresses the dimensions of threads
+  // within a block).
+  int x, y, z;
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
+    return false;
+  }
+
+  block_dim_limit->x = x;
+  block_dim_limit->y = y;
+  block_dim_limit->z = z;
+  return true;
+}
+
+bool GpuExecutor::SupportsBlas() const { return true; }
+
+bool GpuExecutor::SupportsFft() const { return true; }
+
+bool GpuExecutor::SupportsRng() const { return true; }
+
+std::unique_ptr<internal::EventInterface>
+GpuExecutor::CreateEventImplementation() {
+  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
+}
+
+std::unique_ptr<internal::KernelInterface>
+GpuExecutor::CreateKernelImplementation() {
+  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
+}
+
+std::unique_ptr<internal::StreamInterface>
+GpuExecutor::GetStreamImplementation() {
+  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
+}
+
+std::unique_ptr<internal::TimerInterface>
+GpuExecutor::GetTimerImplementation() {
+  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
+}
+
+void* GpuExecutor::GpuContextHack() { return context_; }
+
+GpuContext* GpuExecutor::gpu_context() { return context_; }
+
+// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
+// of SysFS. Returns -1 if it cannot.
+//
+// For anything more complicated/prod-focused than this, you'll likely want to
+// turn to gsys' topology modeling.
+static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
+  // TODO(ROCm) implement this feature in HIP
+  return 1;
+}
+
+DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
+  internal::DeviceDescriptionBuilder builder;
+
+  {
+    int driver_version = 0;
+    (void)GpuDriver::GetDriverVersion(&driver_version);
+    string augmented_driver_version = absl::StrFormat(
+        "%d (%s)", driver_version,
+        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
+    builder.set_driver_version(augmented_driver_version);
+  }
+
+  {
+    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
+
+    // Lower the hex characters to match sysfs.
+    pci_bus_id = port::Lowercase(pci_bus_id);
+    builder.set_pci_bus_id(pci_bus_id);
+
+    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
+    builder.set_numa_node(numa_node);
+  }
+
+  hipDeviceProp_t prop;
+  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
+    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
+
+    ThreadDim thread_dim_limit;
+    thread_dim_limit.x = prop.maxThreadsDim[0];
+    thread_dim_limit.y = prop.maxThreadsDim[1];
+    thread_dim_limit.z = prop.maxThreadsDim[2];
+    builder.set_thread_dim_limit(thread_dim_limit);
+
+    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
+    builder.set_clock_rate_ghz(clock_rate_ghz);
+  }
+
+  {
+    bool ecc_enabled = false;
+    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
+    builder.set_ecc_enabled(ecc_enabled);
+  }
+
+  {
+    uint64 device_memory_size = -1;
+    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    builder.set_device_memory_size(device_memory_size);
+  }
+
+  {
+    BlockDim block_dim_limit;
+    FillBlockDimLimit(&block_dim_limit);
+    builder.set_block_dim_limit(block_dim_limit);
+  }
+
+  {
+    string device_name;
+    (void)GpuDriver::GetDeviceName(device_, &device_name);
+    builder.set_name(device_name);
+  }
+
+  builder.set_platform_version(
+      absl::StrCat("AMDGPU ISA version: gfx", version_));
+
+  // TODO(leary) should be a way to query this from the driver, but this is
+  // unlikely to change for us any time soon.
+  builder.set_device_address_bits(64);
+
+  builder.set_device_vendor("Advanced Micro Devices, Inc");
+  builder.set_rocm_amdgpu_isa_version(version_);
+  builder.set_shared_memory_per_core(
+      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+  builder.set_shared_memory_per_block(
+      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+  builder.set_core_count(
+      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
+  builder.set_threads_per_core_limit(
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+  builder.set_registers_per_block_limit(
+      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+  builder.set_threads_per_warp(
+      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
+  builder.set_registers_per_core_limit(64 * 1024);
+
+  auto built = builder.Build();
+  return built.release();
+}
+
+}  // namespace gpu
+
+void initialize_rocm_gpu_executor() {
+  *internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
+    return new gpu::GpuExecutor{config};
+  };
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
+  stream_executor::initialize_rocm_gpu_executor();
+});
diff --git a/tensorflow/stream_executor/rocm/rocm_kernel.cc b/tensorflow/stream_executor/rocm/rocm_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..162b2bdc71574e7dc30f5a3ed2d5a15a45d97206
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_kernel.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
+
+namespace stream_executor {
+namespace gpu {
+
+hipFuncCache_t GpuKernel::GetGpuCacheConfig() const {
+  switch (preferred_cache_config_) {
+    case KernelCacheConfig::kNoPreference:
+      return hipFuncCachePreferNone;
+    case KernelCacheConfig::kPreferShared:
+      return hipFuncCachePreferShared;
+    case KernelCacheConfig::kPreferL1:
+      return hipFuncCachePreferL1;
+    case KernelCacheConfig::kPreferEqual:
+      return hipFuncCachePreferEqual;
+    default:
+      LOG(FATAL) << "Unknown KernelCacheConfig"
+                 << static_cast<int32>(preferred_cache_config_);
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.cc b/tensorflow/stream_executor/rocm/rocm_platform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..113371dd5531a0b99351134907040bb8541ed94d
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform.cc
@@ -0,0 +1,180 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_platform.h"
+
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+namespace stream_executor {
+namespace gpu {
+
+ROCmPlatform::ROCmPlatform()
+    : name_("ROCM"), min_numa_node_(0), limit_numa_node_(0) {}
+
+ROCmPlatform::~ROCmPlatform() {}
+
+// Due to legacy issues in user code, we can't currently call InpectNumaNodes
+// at module initialization time, because non-GPU programs still include this
+// plugin via various methods, so instead, it has to be init-on-reference.
+void ROCmPlatform::InspectNumaNodes() {
+  // To get NUMA node information, we need to create all executors, so we can
+  // examine their device descriptions to see their bus assignments.
+  static bool initialized = false;
+  static mutex numa_mutex(LINKER_INITIALIZED);
+  mutex_lock lock(numa_mutex);
+  if (initialized) {
+    return;
+  }
+
+  StreamExecutorConfig config;
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    config.ordinal = i;
+    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+    if (i == 0) {
+      // NUMA nodes may not start at 0, so set the minimum node  based on the
+      // first executor we see.
+      min_numa_node_ = exec->GetDeviceDescription().numa_node();
+      limit_numa_node_ = min_numa_node_ + 1;
+    } else {
+      min_numa_node_ =
+          std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
+      limit_numa_node_ = std::max(limit_numa_node_,
+                                  exec->GetDeviceDescription().numa_node() + 1);
+    }
+  }
+  initialized = true;
+}
+
+int ROCmPlatform::BusCount() {
+  InspectNumaNodes();
+  return limit_numa_node_ - min_numa_node_;
+}
+
+int ROCmPlatform::DeviceToBus(int device_ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+  return exec->GetDeviceDescription().numa_node() - min_numa_node_;
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
+    int bus_ordinal) {
+  InspectNumaNodes();
+  CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range";
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    if (DeviceToBus(i) == bus_ordinal) {
+      StreamExecutorConfig config;
+      config.ordinal = i;
+      return GetExecutor(config).ValueOrDie();
+    }
+  }
+
+  return port::Status{
+      port::error::NOT_FOUND,
+      absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
+}
+
+Platform::Id ROCmPlatform::id() const { return kROCmPlatformId; }
+
+int ROCmPlatform::VisibleDeviceCount() const {
+  // Throw away the result - it logs internally, and this [containing] function
+  // isn't in the path of user control. It's safe to call this > 1x.
+
+  if (!gpu::GpuDriver::Init().ok()) {
+    return -1;
+  }
+
+  return GpuDriver::GetDeviceCount();
+}
+
+const string& ROCmPlatform::Name() const { return name_; }
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = PluginConfig();
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDeviceWithPluginConfig(
+    int device_ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  config.plugin_config = plugin_config;
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  return executor_cache_.GetOrCreate(
+      config, [&]() { return GetUncachedExecutor(config); });
+}
+
+port::StatusOr<std::unique_ptr<StreamExecutor>>
+ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<GpuExecutor>(config.plugin_config));
+  auto init_status = executor->Init(config.ordinal, config.device_options);
+  if (!init_status.ok()) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat(
+            "failed initializing StreamExecutor for ROCM device ordinal %d: %s",
+            config.ordinal, init_status.ToString().c_str())};
+  }
+
+  return std::move(executor);
+}
+
+void ROCmPlatform::RegisterTraceListener(
+    std::unique_ptr<TraceListener> listener) {
+  LOG(FATAL) << "not yet implemented: register ROCM trace listener";
+}
+
+void ROCmPlatform::UnregisterTraceListener(TraceListener* listener) {
+  LOG(FATAL) << "not yet implemented: unregister ROCM trace listener";
+}
+
+}  // namespace gpu
+
+static void InitializeROCmPlatform() {
+  // Disabling leak checking, MultiPlatformManager does not destroy its
+  // registered platforms.
+  auto status = MultiPlatformManager::PlatformWithName("ROCM");
+  if (!status.ok()) {
+    std::unique_ptr<gpu::ROCmPlatform> platform(new gpu::ROCmPlatform);
+    SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(rocm_platform,
+                            stream_executor::InitializeROCmPlatform());
+
+DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+// Note that module initialization sequencing is not supported in the
+// open-source project, so this will be a no-op there.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(rocm_platform, multi_platform_manager);
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.h b/tensorflow/stream_executor/rocm/rocm_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..d498e5fdb1e9ef1f31b2fea13625aba995d9acad
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform.h
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/stream_executor/executor_cache.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/trace_listener.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Opaque and unique identifier for the ROCM platform plugin.
+// This is needed so that plugins can refer to/identify this platform without
+// instantiating a ROCmPlatform object.
+extern const Platform::Id kROCmPlatformId;
+
+// ROCm-specific platform plugin, registered as a singleton value via module
+// initializer.
+class ROCmPlatform : public Platform {
+ public:
+  ROCmPlatform();
+  ~ROCmPlatform() override;
+
+  // ROCmPlatform-specific functionality
+  // Returns the number of distinct buses / NUMA nodes on the machine.
+  int BusCount();
+
+  // Returns the bus/NUMA node for the specified device ordinal.
+  int DeviceToBus(int device_ordinal);
+
+  // Returns the lowest-ordinal-number StreamExecutor on the specified bus.
+  port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal);
+
+  // Platform interface implementation:
+  // Returns the same value as kROCmPlatform above.
+  Platform::Id id() const override;
+
+  // Returns -1 as a sentinel on internal failure (and logs the error).
+  int VisibleDeviceCount() const override;
+
+  const string& Name() const override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+      int ordinal, const PluginConfig& config) override;
+
+  port::StatusOr<StreamExecutor*> GetExecutor(
+      const StreamExecutorConfig& config) override;
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      const StreamExecutorConfig& config) override;
+
+  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
+
+  void UnregisterTraceListener(TraceListener* listener) override;
+
+ private:
+  // Determines the number of NUMA nodes and the assignment of executor to each.
+  void InspectNumaNodes();
+
+  // This platform's name.
+  string name_;
+
+  // mutex that guards internal state.
+  mutable mutex mu_;
+
+  // Cache of created executors.
+  ExecutorCache executor_cache_;
+
+  // The smallest NUMA node value for any device managed by this machine
+  // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
+  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense./
+  int min_numa_node_;
+
+  // Larger than the NUMA node value for any device managed by this machine
+  // manager.
+  int limit_numa_node_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.cc b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
new file mode 100644
index 0000000000000000000000000000000000000000..daa42ab022af805d40c8d61973f87540167b9c1a
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLATFORM_DEFINE_ID(kROCmPlatformId);
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.h b/tensorflow/stream_executor/rocm/rocm_platform_id.h
new file mode 100644
index 0000000000000000000000000000000000000000..71c760b827717081a1618d3629015a5766fb1504
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
+
+#include "tensorflow/stream_executor/platform.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Opaque and unique identifier for the ROCm platform.
+// This is needed so that plugins can refer to/identify this platform without
+// instantiating a ROCmPlatform object.
+// This is broken out here to avoid a circular dependency between ROCmPlatform
+// and GpuExecutor.
+extern const Platform::Id kROCmPlatformId;
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_rng.cc b/tensorflow/stream_executor/rocm/rocm_rng.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2048c8ff644379ff19ffd65944f44e9d7f31a8da
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_rng.cc
@@ -0,0 +1,284 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "rocm/include/hiprand/hiprand.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rng.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+// Formats hiprandStatus_t to output prettified values into a log stream.
+std::ostream& operator<<(std::ostream& in, const hiprandStatus_t& status) {
+#define OSTREAM_HIPRAND_STATUS(__name) \
+  case HIPRAND_STATUS_##__name:        \
+    in << "HIPRAND_STATUS_" #__name;   \
+    return in;
+
+  switch (status) {
+    OSTREAM_HIPRAND_STATUS(SUCCESS)
+    OSTREAM_HIPRAND_STATUS(VERSION_MISMATCH)
+    OSTREAM_HIPRAND_STATUS(NOT_INITIALIZED)
+    OSTREAM_HIPRAND_STATUS(ALLOCATION_FAILED)
+    OSTREAM_HIPRAND_STATUS(TYPE_ERROR)
+    OSTREAM_HIPRAND_STATUS(OUT_OF_RANGE)
+    OSTREAM_HIPRAND_STATUS(LENGTH_NOT_MULTIPLE)
+    OSTREAM_HIPRAND_STATUS(LAUNCH_FAILURE)
+    OSTREAM_HIPRAND_STATUS(PREEXISTING_FAILURE)
+    OSTREAM_HIPRAND_STATUS(INITIALIZATION_FAILED)
+    OSTREAM_HIPRAND_STATUS(ARCH_MISMATCH)
+    OSTREAM_HIPRAND_STATUS(INTERNAL_ERROR)
+    default:
+      in << "hiprandStatus_t(" << static_cast<int>(status) << ")";
+      return in;
+  }
+}
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
+
+namespace wrap {
+
+#define PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(__name)                     \
+  struct WrapperShim__##__name {                                    \
+    template <typename... Args>                                     \
+    hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};               \
+      return ::__name(args...);                                     \
+    }                                                               \
+  } __name;
+
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandCreateGenerator);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandDestroyGenerator);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetStream);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniform);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniformDouble);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetGeneratorOffset);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormal);
+PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormalDouble);
+
+}  // namespace wrap
+
+GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
+
+GpuRng::~GpuRng() {
+  if (rng_ != nullptr) {
+    wrap::hiprandDestroyGenerator(parent_, rng_);
+  }
+}
+
+bool GpuRng::Init() {
+  mutex_lock lock{mu_};
+  CHECK(rng_ == nullptr);
+
+  hiprandStatus_t ret =
+      wrap::hiprandCreateGenerator(parent_, &rng_, HIPRAND_RNG_PSEUDO_DEFAULT);
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to create random number generator: " << ret;
+    return false;
+  }
+
+  CHECK(rng_ != nullptr);
+  return true;
+}
+
+bool GpuRng::SetStream(Stream* stream) {
+  hiprandStatus_t ret =
+      wrap::hiprandSetStream(parent_, rng_, AsGpuStreamValue(stream));
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set stream for random generation: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+// Returns true if std::complex stores its contents as two consecutive
+// elements. Tests int, float and double, as the last two are independent
+// specializations.
+constexpr bool ComplexIsConsecutiveFloats() {
+  return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 &&
+         sizeof(std::complex<double>) == 16;
+}
+
+template <typename T>
+bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
+  mutex_lock lock{mu_};
+  static_assert(ComplexIsConsecutiveFloats(),
+                "std::complex values are not stored as consecutive values");
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  // std::complex<T> is currently implemented as two consecutive T variables.
+  uint64 element_count = v->ElementCount();
+  if (std::is_same<T, std::complex<float>>::value ||
+      std::is_same<T, std::complex<double>>::value) {
+    element_count *= 2;
+  }
+
+  hiprandStatus_t ret;
+  if (std::is_same<T, float>::value ||
+      std::is_same<T, std::complex<float>>::value) {
+    ret = wrap::hiprandGenerateUniform(
+        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
+        element_count);
+  } else {
+    ret = wrap::hiprandGenerateUniformDouble(
+        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
+        element_count);
+  }
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
+               << " " << TypeString<T>() << "s at " << v->opaque() << ": "
+               << ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<float>>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<double>>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+template <typename ElemT, typename FuncT>
+bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
+                                            ElemT stddev,
+                                            DeviceMemory<ElemT>* v,
+                                            FuncT func) {
+  mutex_lock lock{mu_};
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  uint64 element_count = v->ElementCount();
+  hiprandStatus_t ret =
+      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
+
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
+               << " floats at " << v->opaque() << ": " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                                    DeviceMemory<float>* v) {
+  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
+                                        wrap::hiprandGenerateNormal);
+}
+
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                                    DeviceMemory<double>* v) {
+  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
+                                        wrap::hiprandGenerateNormalDouble);
+}
+
+bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
+  mutex_lock lock{mu_};
+  CHECK(rng_ != nullptr);
+
+  if (!CheckSeed(seed, seed_bytes)) {
+    return false;
+  }
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
+  // (which itself requires 16 for API consistency with host RNG fallbacks).
+  hiprandStatus_t ret = wrap::hiprandSetPseudoRandomGeneratorSeed(
+      parent_, rng_, *(reinterpret_cast<const uint64*>(seed)));
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set rng seed: " << ret;
+    return false;
+  }
+
+  ret = wrap::hiprandSetGeneratorOffset(parent_, rng_, 0);
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to reset rng position: " << ret;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+namespace se = ::stream_executor;
+
+REGISTER_MODULE_INITIALIZER(register_hiprand, {
+  se::port::Status status =
+      se::PluginRegistry::Instance()
+          ->RegisterFactory<se::PluginRegistry::RngFactory>(
+              se::gpu::kROCmPlatformId, se::gpu::kGpuRandPlugin, "hipRAND",
+              [](se::internal::StreamExecutorInterface* parent)
+                  -> se::rng::RngSupport* {
+                se::gpu::GpuExecutor* rocm_executor =
+                    dynamic_cast<se::gpu::GpuExecutor*>(parent);
+                if (rocm_executor == nullptr) {
+                  LOG(ERROR)
+                      << "Attempting to initialize an instance of the hipRAND "
+                      << "support library with a non-ROCM StreamExecutor";
+                  return nullptr;
+                }
+
+                se::gpu::GpuRng* rng = new se::gpu::GpuRng(rocm_executor);
+                if (!rng->Init()) {
+                  // Note: Init() will log a more specific error.
+                  delete rng;
+                  return nullptr;
+                }
+                return rng;
+              });
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to register hipRAND factory: "
+               << status.error_message();
+  }
+
+  se::PluginRegistry::Instance()->SetDefaultFactory(
+      se::gpu::kROCmPlatformId, se::PluginKind::kRng, se::gpu::kGpuRandPlugin);
+});
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 3edc66cde8045d7f6ae53095e8136d1697fb1d23..e7485ca426bc8108cc7a376906c6624c7cae5600 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
-#include "tensorflow/stream_executor/host_buffer.h"
 #include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -95,8 +94,6 @@ string ToVlogString(const void *ptr) {
   return out.str();
 }
 
-string ToVlogString(const HostBuffer &buffer) { return buffer.AsString(); }
-
 template <class T>
 string ToVlogString(const std::complex<T> &c) {
   // StrCat does not convert std::complex to text.
@@ -549,11 +546,17 @@ Stream &Stream::ThenConvolveWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckStatus(dnn->PrepareForConvolution(
+          dnn::ConvolutionKind::FORWARD, this, input_descriptor, input_data,
+          filter_descriptor, filter_data, output_descriptor, *output,
+          convolution_descriptor, dnn::AlgorithmConfig(), scratch_allocator,
+          &algorithm_desc, &scratch_memory));
       CheckError(dnn->DoConvolve(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
+          convolution_descriptor, output_descriptor, output, algorithm_desc,
+          &scratch_memory, nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
@@ -576,11 +579,17 @@ Stream &Stream::ThenConvolveWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckStatus(dnn->PrepareForConvolution(
+          dnn::ConvolutionKind::FORWARD, this, input_descriptor, input_data,
+          filter_descriptor, filter_data, output_descriptor, *output,
+          convolution_descriptor, dnn::AlgorithmConfig(), scratch_allocator,
+          &algorithm_desc, &scratch_memory));
       CheckError(dnn->DoConvolve(
           this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
+          convolution_descriptor, output_descriptor, output, algorithm_desc,
+          &scratch_memory, nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
@@ -758,10 +767,21 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::FORWARD, this, input_descriptor,
+                 input_data, filter_descriptor, filter_data, output_descriptor,
+                 *output, convolution_descriptor, algorithm_config,
+                 scratch_allocator, &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -789,10 +809,21 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::FORWARD, this, input_descriptor,
+                 input_data, filter_descriptor, filter_data, output_descriptor,
+                 *output, convolution_descriptor, algorithm_config,
+                 scratch_allocator, &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -820,10 +851,21 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::FORWARD, this, input_descriptor,
+                 input_data, filter_descriptor, filter_data, output_descriptor,
+                 *output, convolution_descriptor, algorithm_config,
+                 scratch_allocator, &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -969,10 +1011,18 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckStatus(dnn->PrepareForConvolution(
+          dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+          *backward_input_data, filter_descriptor, filter_data,
+          output_descriptor, backward_output_data, convolution_descriptor,
+          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
+          &scratch_memory));
       CheckError(dnn->DoConvolveBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
+          backward_input_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -999,11 +1049,23 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+                 *backward_input_data, filter_descriptor, filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1032,11 +1094,23 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+                 *backward_input_data, filter_descriptor, filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1065,11 +1139,23 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+                 *backward_input_data, filter_descriptor, filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1096,10 +1182,18 @@ Stream &Stream::ThenConvolveBackwardDataWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckStatus(dnn->PrepareForConvolution(
+          dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+          *backward_input_data, filter_descriptor, filter_data,
+          output_descriptor, backward_output_data, convolution_descriptor,
+          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
+          &scratch_memory));
       CheckError(dnn->DoConvolveBackwardData(
           this, filter_descriptor, filter_data, output_descriptor,
           backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
+          backward_input_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -1138,10 +1232,18 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckStatus(dnn->PrepareForConvolution(
+          dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+          input_data, filter_descriptor, *backward_filter_data,
+          output_descriptor, backward_output_data, convolution_descriptor,
+          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
+          &scratch_memory));
       CheckError(dnn->DoConvolveBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
+          backward_filter_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -1168,11 +1270,23 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+                 input_data, filter_descriptor, *backward_filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1201,11 +1315,23 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+                 input_data, filter_descriptor, *backward_filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1232,10 +1358,18 @@ Stream &Stream::ThenConvolveBackwardFilterWithScratch(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      CheckStatus(dnn->PrepareForConvolution(
+          dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+          input_data, filter_descriptor, *backward_filter_data,
+          output_descriptor, backward_output_data, convolution_descriptor,
+          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
+          &scratch_memory));
       CheckError(dnn->DoConvolveBackwardFilter(
           this, input_descriptor, input_data, output_descriptor,
           backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
+          backward_filter_data, algorithm_desc, &scratch_memory,
           /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -1262,11 +1396,23 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+                 input_data, filter_descriptor, *backward_filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1490,6 +1636,28 @@ Stream &Stream::ThenPoolForward(
   return *this;
 }
 
+Stream &Stream::ThenPoolForward(
+    const dnn::PoolingDescriptor &pooling_dimensions,
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<int8> &input_data,
+    const dnn::BatchDescriptor &output_dimensions,
+    DeviceMemory<int8> *output_data, ScratchAllocator *workspace_allocator) {
+  VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+            PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
+            PARAM(workspace_allocator));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
+                                    input_data, output_dimensions, output_data,
+                                    workspace_allocator));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenPoolBackward(
     const dnn::PoolingDescriptor &pooling_dimensions,
     const dnn::BatchDescriptor &input_dimensions,
@@ -1932,36 +2100,6 @@ Stream &Stream::ThenMemcpyH2DQuantized(
   return *this;
 }
 
-Stream &Stream::ThenCopyHostBuffer2Device(
-    HostBuffer *buffer_src, DeviceMemory<float> *gpu_unquantized_dst) {
-  VLOG_CALL(PARAM(*buffer_src), PARAM(gpu_unquantized_dst));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(
-          dnn->DoCopyHostBuffer2Device(this, buffer_src, gpu_unquantized_dst));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenCopyDevice2HostBuffer(
-    const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst) {
-  VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(*buffer_dst));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(
-          dnn->DoCopyDevice2HostBuffer(this, gpu_unquantized_src, buffer_dst));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream *Stream::GetOrCreateSubStream() {
   mutex_lock lock(mu_);
 
@@ -5507,4 +5645,13 @@ string Stream::DebugStreamPointers() const {
                       ",impl=", ToVlogString(implementation_.get()), "]");
 }
 
+void Stream::CheckStatus(port::Status status) {
+  if (status.ok()) {
+    return;
+  }
+  LOG(ERROR) << status;
+  mutex_lock lock(mu_);
+  ok_ = false;
+}
+
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 0fc90cf83d6b4e3e0ede84747f8149c1a25289ca..f698d50e2a8179cdc1a376c279a490d8c377cd8c 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -650,6 +650,13 @@ class Stream {
                           DeviceMemory<Eigen::half> *output_data,
                           ScratchAllocator *workspace_allocator = nullptr);
 
+  Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
+                          const dnn::BatchDescriptor &input_dimensions,
+                          const DeviceMemory<int8> &input_data,
+                          const dnn::BatchDescriptor &output_dimensions,
+                          DeviceMemory<int8> *output_data,
+                          ScratchAllocator *workspace_allocator = nullptr);
+
   Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
                            const dnn::BatchDescriptor &input_dimensions,
                            const DeviceMemory<double> &input_data,
@@ -2092,6 +2099,9 @@ class Stream {
     ok_ = false;
   }
 
+  // Checks the status and logs the error message, if any.
+  void CheckStatus(port::Status status) LOCKS_EXCLUDED(mu_);
+
   void SetError() { CheckError(false /* = operation_retcode */); }
 
   void SetErrorAndLogNoDnnSupport() {
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 341c6edccd3c1bfd314127c5356f03a15a85e1d3..46afedef3316bcd6b23c6f7b081af10db43d58f6 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -25,6 +25,13 @@ StreamExecutorFactory* MakeCUDAExecutorImplementation() {
   return &instance;
 }
 
+// -- ROCm
+
+StreamExecutorFactory* MakeROCMExecutorImplementation() {
+  static StreamExecutorFactory instance;
+  return &instance;
+}
+
 // -- OpenCL
 
 StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 0c2c33cfca227b2d67fcdc633dd94274a65b92bb..6138554e9e7f7233c8ae46866e433a1307922b4a 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -374,9 +374,11 @@ using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
 using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
 using KernelFactory = std::function<KernelInterface*()>;
 
-StreamExecutorFactory* MakeCUDAExecutorImplementation();
+StreamExecutorFactory *MakeCUDAExecutorImplementation();
 
-StreamExecutorFactory* MakeOpenCLExecutorImplementation();
+StreamExecutorFactory *MakeROCMExecutorImplementation();
+
+StreamExecutorFactory *MakeOpenCLExecutorImplementation();
 
 extern StreamExecutorFactory MakeHostExecutorImplementation;
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index ee3d2b6da0cb8e1f51ff18ff9b98312c406cbb2e..6f0ba51ec6c30c8e342de534b0b09df50be50ead 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -71,6 +71,9 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
     case PlatformKind::kCuda:
       factory = *internal::MakeCUDAExecutorImplementation();
       break;
+    case PlatformKind::kROCm:
+      factory = *internal::MakeROCMExecutorImplementation();
+      break;
     case PlatformKind::kOpenCL:
       factory = *internal::MakeOpenCLExecutorImplementation();
       break;
@@ -188,10 +191,14 @@ StreamExecutor::StreamExecutor(
       memory_limit_bytes_(GetMemoryLimitBytes()) {
   if (port::Lowercase(platform_->Name()) == "cuda") {
     platform_kind_ = PlatformKind::kCuda;
+  } else if (port::Lowercase(platform_->Name()) == "rocm") {
+    platform_kind_ = PlatformKind::kROCm;
   } else if (port::Lowercase(platform_->Name()) == "opencl") {
     platform_kind_ = PlatformKind::kOpenCL;
   } else if (port::Lowercase(platform_->Name()) == "host") {
     platform_kind_ = PlatformKind::kHost;
+  } else {
+    platform_kind_ = PlatformKind::kInvalid;
   }
 }
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index d93e0df5e44eb32145a7f966cc631ceefab7117c..874d9e872c5d9f0444ae7ad4e5b409ba7c430bc1 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -9,6 +9,7 @@ load(
     "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
     "tf_cuda_tests_tags",
+    "tf_exec_compatible_with",
     "tf_sycl_tests_tags",
 )
 load(
@@ -47,6 +48,18 @@ load(
 def register_extension_info(**kwargs):
     pass
 
+def if_v2(a):
+    return select({
+        clean_dep("//tensorflow:api_version_2"): a,
+        "//conditions:default": [],
+    })
+
+def if_not_v2(a):
+    return select({
+        clean_dep("//tensorflow:api_version_2"): [],
+        "//conditions:default": a,
+    })
+
 # if_cuda_is_configured def placeholder
 
 def if_cuda_is_configured_compat(x):
@@ -773,6 +786,7 @@ def tf_cc_test(
             ],
         ),
         data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        exec_compatible_with = tf_exec_compatible_with(kwargs),
         # Nested select() statements seem not to be supported when passed to
         # linkstatic, and we already have a cuda select() passed in to this
         # function.
@@ -885,6 +899,7 @@ def tf_cuda_only_cc_test(
         args = [],
         kernels = [],
         linkopts = []):
+    tags = tags + tf_cuda_tests_tags()
     native.cc_test(
         name = "%s%s" % (name, "_gpu"),
         srcs = srcs + tf_binary_additional_srcs(),
@@ -907,7 +922,8 @@ def tf_cuda_only_cc_test(
             clean_dep("//tensorflow:darwin"): 1,
             "//conditions:default": 0,
         }),
-        tags = tags + tf_cuda_tests_tags(),
+        tags = tags,
+        exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
     )
 
 register_extension_info(
@@ -971,6 +987,7 @@ def tf_cc_test_mkl(
             }) + _rpath_linkopts(src_to_test_name(src)),
             deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_deps(),
             data = data + tf_binary_dynamic_kernel_dsos(kernels),
+            exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
             linkstatic = linkstatic,
             tags = tags,
             size = size,
@@ -1632,7 +1649,9 @@ def tf_py_wrap_cc(
         swig_includes = [],
         deps = [],
         copts = [],
+        version_script = None,
         **kwargs):
+    """Builds a Python extension module."""
     module_name = name.split("/")[-1]
 
     # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
@@ -1651,6 +1670,11 @@ def tf_py_wrap_cc(
         toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
         deps = deps + extra_deps,
     )
+    if not version_script:
+        version_script = select({
+            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
+            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
+        })
     vscriptname = name + "_versionscript"
     _append_init_to_versionscript(
         name = vscriptname,
@@ -1659,10 +1683,7 @@ def tf_py_wrap_cc(
             "//conditions:default": True,
         }),
         module_name = module_name,
-        template_file = select({
-            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
-            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
-        }),
+        template_file = version_script,
     )
     extra_linkopts = select({
         "@local_config_cuda//cuda:darwin": [
@@ -1740,6 +1761,7 @@ def py_test(deps = [], data = [], kernels = [], **kwargs):
             "//conditions:default": [],
             clean_dep("//tensorflow:no_tensorflow_py_deps"): ["//tensorflow/tools/pip_package:win_pip_package_marker"],
         }) + tf_binary_dynamic_kernel_dsos(kernels),
+        exec_compatible_with = tf_exec_compatible_with(kwargs),
         **kwargs
     )
 
@@ -1781,10 +1803,20 @@ def tf_py_test(
         tags = [],
         shard_count = 1,
         additional_deps = [],
+        additional_visibility = [],
         kernels = [],
         flaky = 0,
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
+    """Create one or more python tests with extra tensorflow dependencies."""
+    xla_test_true_list = []
+
+    # xla_enable_strict_auto_jit is used to run Tensorflow unit tests with all XLA compilable
+    # kernels compiled with XLA.
+    if xla_enable_strict_auto_jit:
+        xla_enabled = True
+        xla_test_true_list += ["//tensorflow/python:is_xla_test_true"]
     if xla_enabled:
         additional_deps = additional_deps + tf_additional_xla_deps_py()
     if grpc_enabled:
@@ -1801,11 +1833,11 @@ def tf_py_test(
         shard_count = shard_count,
         srcs_version = "PY2AND3",
         tags = tags,
-        visibility = [clean_dep("//tensorflow:internal")],
+        visibility = [clean_dep("//tensorflow:internal")] + additional_visibility,
         deps = [
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
-        ] + additional_deps,
+        ] + additional_deps + xla_test_true_list,
     )
 
 register_extension_info(
@@ -1825,8 +1857,12 @@ def cuda_py_test(
         kernels = [],
         tags = [],
         flaky = 0,
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
+    # TODO(b/122522101): Don't ignore xla_enable_strict_auto_jit and enable additional
+    # XLA tests once enough compute resources are available.
+    _ignored = [xla_enable_strict_auto_jit]
     if main == None:
         main = name + ".py"
     for config in ["cpu", "gpu"]:
@@ -1849,6 +1885,7 @@ def cuda_py_test(
             shard_count = shard_count,
             tags = test_tags,
             xla_enabled = xla_enabled,
+            xla_enable_strict_auto_jit = False,
         )
 
 register_extension_info(
@@ -1902,6 +1939,7 @@ def py_tests(
         tags = [],
         shard_count = 1,
         prefix = "",
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
     for src in srcs:
@@ -1920,6 +1958,7 @@ def py_tests(
             shard_count = shard_count,
             tags = tags,
             xla_enabled = xla_enabled,
+            xla_enable_strict_auto_jit = xla_enable_strict_auto_jit,
         )
 
 def cuda_py_tests(
@@ -1932,8 +1971,12 @@ def cuda_py_tests(
         shard_count = 1,
         tags = [],
         prefix = "",
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
+    # TODO(b/122522101): Don't ignore xla_enable_strict_auto_jit and enable additional
+    # XLA tests once enough compute resources are available.
+    _ignored = [xla_enable_strict_auto_jit]
     test_tags = tags + tf_cuda_tests_tags()
     py_tests(
         name = name,
@@ -1947,6 +1990,7 @@ def cuda_py_tests(
         shard_count = shard_count,
         tags = test_tags,
         xla_enabled = xla_enabled,
+        xla_enable_strict_auto_jit = False,
     )
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index 9f6114f503467fc12fcfb5dae07e75d2113e410d..04632330c56c69a359d2e8fad424a1fb5afff74b 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -4,5 +4,4 @@
 *TF_*
 *TFE_*
 *nsync_*
-*pywrap_xla*
 *stream_executor*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 39d258c3b7edd1f5f7d0805c080e832aa1d6109a..563d178de7396fbae6127d9dcfbfa8cf00c65038 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -5,7 +5,6 @@ tensorflow {
     *TF_*;
     *TFE_*;
     *nsync_*;
-    *pywrap_xla*;
     *stream_executor*;
   local:
     *;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index a1083d732a1bb1b3212457f445323e5e868ef162..078f1028fd4e98c11481c6bb9e08303dd3c54c19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -26,7 +26,13 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_BOOL
     }
-     reserved_range {
+    field {
+      name: "collective_deterministic_sequential_execution"
+      number: 6
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    reserved_range {
       start: 2
       end: 3
     }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index b505d813509c2049fa6e3f60df553492d6f66613..d2ee0c4db668d0a1aa6190573f56f210a06e2653 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -149,6 +149,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_BOOL
       }
+      field {
+        name: "collective_deterministic_sequential_execution"
+        number: 6
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
index fee84d85307dffb675b507a31c4f1fda60de869d..5b47c718a5753905a4fa426b739dad4b01678c3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.IndexedSlices"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
index c0ed95653552f904acea1cc82bca00773ecb792c..481a8c73ac351cc0ef38ee3681d5134f06334421 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.RaggedTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dtype"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
index 3add49e90d7eb5094ad68d1474e834404549c988..64f7260369d7cbc656ad3d23b69cc9079e030f95 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index 62d8ea9208f7f5f031b80be168cedfd538f18a22..341ace07663032a836da9c4c6b5f9fccccfb7add 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.VariableV1\'>"
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce29615f72eee78525b8a1efbb4531215e6b72fe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.audio"
+tf_module {
+  member_method {
+    name: "encode_wav"
+    argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
index a71da113b4ffcaa9ff71e18df4a9263b141b42e6..359530f69c8a5ba61d3ab3531feb742987cbbede 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
@@ -9,10 +9,6 @@ tf_class {
     name: "AUTO_CONTROL_DEPS"
     mtype: "<enum \'Feature\'>"
   }
-  member {
-    name: "DECORATORS"
-    mtype: "<enum \'Feature\'>"
-  }
   member {
     name: "ERROR_REWRITING"
     mtype: "<enum \'Feature\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
index 12e23bc0c8fd0831471abcf56bcd8f07d3e6fe57..0baf6e03552f5b12e5f2e48f87cf1ec7332787bb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'level\', \'alsologtostdout\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "to_code"
     argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
@@ -12,4 +16,8 @@ tf_module {
     name: "to_graph"
     argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
   }
+  member_method {
+    name: "trace"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
index f1d760603e981a0b9a72fdc379dc81932ac71d67..95352dff3a6b9341857b3f3b82dcb6817e4553ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
@@ -32,6 +32,14 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
+  member_method {
+    name: "dimension_at_index"
+    argspec: "args=[\'shape\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dimension_value"
+    argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "forward_compatibility_horizon"
     argspec: "args=[\'year\', \'month\', \'day\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
index 4f0147a52381c748eccbfee29df0d3537ba5d14a..682a2b91b6187783eef74a4cd3672ae2ae2d47fb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.Iterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "initializer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 2d115904925eb96164484300baf628d41d3fcff4..abc98a74b64ab274ed8b2fc43876b7102f1c7201 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "bucket_by_sequence_length"
-    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "cardinality"
@@ -180,6 +180,10 @@ tf_module {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "take_while"
+    argspec: "args=[\'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "unbatch"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index aa474680592a1a3996ca3db970b814ba167cd801..272963382a009c837427176859994f5c603a05a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "make_initializable_iterator"
-    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'dataset\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "make_one_shot_iterator"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
index b06c73d12602b25426034f801be329fb88067011..89748f7713fd813ab56d0e07780da33ca8ff14bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -3,10 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -15,45 +11,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -64,33 +28,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -99,18 +47,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -119,20 +59,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
index df707e8920e4488ed6b40a7f93f56b5624188c84..c3b7991175769f473acf929d656cd52ccca7bf4f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "devices"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "distribution_strategy"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
@@ -26,6 +22,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "all_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "merge_call"
     argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
index 77706e57133e1186d9e98fcf9205ed4c91772eda..5c4f09075316150b3118f048091d3c68a60a232d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
@@ -50,6 +50,10 @@ tf_class {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_dataset"
+    argspec: "args=[\'self\', \'numpy_input\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "experimental_run_steps_on_iterator"
     argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
@@ -78,4 +82,8 @@ tf_class {
     name: "value_container"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variable_created_in_scope"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
index 9a1df5514261a47aae6f3d11be78b5a6fa6da919..6ed49d339d7af7b2d05dfa57121805a7dce48090 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -2,10 +2,6 @@ path: "tensorflow.distribute.Strategy"
 tf_class {
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -14,45 +10,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -63,33 +27,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -98,18 +46,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -118,20 +58,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index 741102466d9580eaf5081c8c92b5b1fe38bc7525..b1bd5a2661d44d9b36b965ba160874e6142628ea 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -40,4 +40,12 @@ tf_module {
     name: "stop_if_lower_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
   }
+  member_method {
+    name: "stop_if_no_decrease_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_decrease\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_no_increase_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_increase\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c364b0217a7ed10282dc8fc28797f3be1b92f867
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.experimental.Module"
+tf_class {
+  is_instance: "<class \'tensorflow.python.module.module.Module\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "owned_submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "owned_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "owned_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "no_name_scope"
+    argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
index 0c3f04e468c4c817cd474deb42149aee3021aa43..a7ee6d3e07d4387c4fc0bd9a5f6d9614473ca73c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.experimental"
 tf_module {
+  member {
+    name: "Module"
+    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
+  }
   member_method {
     name: "function_executor_type"
     argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6bf57a88fc1295da13e0b58671191c9d8ba8caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.io.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
index cfa3372b12bfe32eed4311c89b6448c0359c0913..a797c06ff337cffe503d89c09497996ea64c6ad2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.io.gfile"
 tf_module {
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "copy"
     argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index eced2e1cb0706153a9bfc2749847395d194fcb56..283cc6a735695b0b2d16af28f7688a7a077f19be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -175,11 +175,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 2acb90173f3242e8a92913728eec84ef5d455d1f..95e405aebaf61e3ccae268b474a006a3bca51343 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -180,11 +180,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index 8cd0c6ea5f027fa1f30b60a742450b651242d406..712e3ad6fcefc0f9f6a8ecc892e1dec81b673b45 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -124,6 +124,14 @@ tf_module {
     name: "ctc_label_dense_to_sparse"
     argspec: "args=[\'labels\', \'label_lengths\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -248,6 +256,10 @@ tf_module {
     name: "learning_phase"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learning_phase_scope"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index a5804d3bbcff401920ddd2b59bd5f094b3e1c628..0725f606e2923ff1bd5a8814febdfe7de8a2602c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
index bbc02c4d71f835497be74e771c5ae57682f5a5b3..14bfc3bedbfb5a379e28a0cb9cd2f7f744539fa1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
@@ -16,11 +16,11 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_predict_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 6182baf0a31e7027b685561fed5eeedc54a766a3..9812bad8f66f3d5afe365287feca748f9e6efd5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
index 9b1b068e225a5dae69672ecba70bdea48c6e6ae6..5aa739391ef894cdede1db17f903a50111f25eca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 92440188c81ee192df332cd89256233591b2d281..bf5bcb68df47ed8661509598d3bc59f01dfcefe6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -17,11 +17,11 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_predict_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index c10c236ad1990160be53ba5df7afeb64619bf260..5ae176017b3cf1ac019ecdc0f1c255f23b32fcec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 0db6b8d371b61db6fa565a93416dfc14eeae1d47..0fed6fd23670a16acd8d770269090c3dda0eee30 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index dac2049fe19426738368009822ce2dac8bc64467..71cf7f4a4922752c0ba154a8d3fe29b37c305675 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index 2834b74e8afbd5ee01eb77b8b14e75fc4e50f230..d5a59d870a390a6f5632332c12534f83c686e2dd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -17,11 +17,11 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_predict_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index b2ab5006dc4ac3571b4f9d01607adb6aa2c0be26..1bfd51cdcc3e783fcd24a80e189d3d73bf3a928d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index da212382c1a6a3c5d37afbd1ac895249b566a913..8a0b8eb46f006497472c1e9ce539e91db19bd260 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index c910db027e69f3ca21495c968ebeae691711c316..abb3c236948a7f46d64cad92ae922324446f9a99 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 8b7b33e98ce2673ffb5dcf951a8cd6a684d847af..b27db4e7f23499fd27430059f1cb556f341547b3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 5e3e41ba205c70413b7d015141b92c206ea26f32..50998ac9d63c9492523720d7dcc8041fd9efcab5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index e160b1015380fcf9f3a7a8f4a41df6877cbf9246..be17aeafb5ae383cba58b854808f6c9bc0e9696d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index b6b71358c869ff6210e9a704f79cbd63970b5dcd..7f21b444bc8832189b11cd8ff206e034bc89170c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 5c5ab1580eb3d6ce02498b1bc42aefc39784abf4..2ac86f152fad454fc0b09e2cb8814f23ad997c20 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index 489de2e4d31d8c631ea11f8a50c91498a70fa308..f6b1dd2f7e4244218b7c64868b773142c79695d6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 30fec249b838350ac4ef542dd0f1969b0ddd7588..3da1f43a92a3fb5a146bcf8fd16f26783487f129 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 0e983c9234597a17c6c9342eaa3b3a26158736fa..a7be5ac81814b28c93407cd5d1ca7c3f60822f0b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index ec50db71279b5e688ef36558941071fbba3c02f4..c5c29bead383da6b9c0c7436fb089e27413e72f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index cbbb000e25669a6a77c90c371d999983274e48bf..3af3c2a501d6b46821d2ef1b4e6a06e2820d4764 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 23153d42847ad6015ccc347b70d35b7f3b83dc03..880d18e1aae53512b2f587b5c8914babcd68566f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index 766c3f267f97f19d87cc39a24ae90dca796b4988..1eb0cf1a188b88d55b82297da715624c9e5a58f2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 898098227190498a5a752a493e3d9bccb431bf15..d9394e60f532465c1852b2cac46ca4cbd9125583 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index a74b8d29502f0493a99b16d8fdeccf77e205be0e..a0f6dc8097adfb896a8f3aa3b642c2997e257cf3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index b093f8ead94199ba2a4861d0453ff5248b2d7fb0..037b92f861b14720a1a638884752a4d3e1dbd9f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 0ce9f6fd591f127eb2874397abce21e8451ba3b4..6a0d027d47d999f5770e59299fc1206249bf9b43 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index c1f5bfae0d35683e4e718a73add8f57be9473c72..66b5bd75fc16c37aecaa65ef12fb2311925c252a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 4aa872c4f04c2f0a3cddc83bc7c64700cb97ca3c..e73133ff0731821407084cc1cd6160b2e9bf3d9c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 6e01f7c70c9987cfa651078175927edfaf1fd6ad..7af6b2b3c398473398a9d2e227a42ec96451b301 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index c002042d7703bccb0af37cebe453803c9e9009e3..baff492dfbd3c9ab6f2c269cb89632768e6b6c92 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index f5e5446d2b995c9ba2707cc16376e8c639576c76..63d30a61851cdae8daa8a5dc70fa733fd6b2ff11 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index d5f36f4bc3d1b517a7f2dfaf3fed490df66a5fba..7a29cbbec35b885792828828354ac8f9a29579b6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 346fec6056380842f4d5f40833cea82a540c088d..87c75c02243cd646502e12e2947555ad7c6913e9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f8fe9f05e0f577dca9e1f3225f3e14074fefa12..f69104ddfef17c8b5df36f4bc3e9b0ea3a986295 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index 68fb7382a71cc0e3215daa43e2f1ea0f6de26e16..aa05471933cc97a872480e0ac45213b49a882189 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index deda82f9b3f020589c9673e9070ec40713846b7f..d61f1ddc1d506ae2db992aaacbdc634964d53292 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 2eba3fb95450568a8e1611dda2564b764565cb3c..e2d05f8298ef4779f3c2678b24ea9f938a3889cd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 6ed13d37f2b511e09d3dbd4ade0ca29088e565ae..f650f48423b4496fe21abd215ac494018921df00 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 919aed5723c0464b8540ba1cfb971bb23bfef73a..06e8b6b314183b884e635f3b78e5bd5368e0962f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index f590ce1ef71200854f62baf3c8746deefbaf8e46..9fdf6f66d1160a49da302ffa8eeeade3009de048 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index db4261fadc76e2d953d477c472adcb422d48105e..cbe102065071a00596fb4b8f764b410737c638a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index 7369552b3b8733b6c586888c643c9596bebcdded..0efba09b272c8ffb2220ccfaad830c7fff98568c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index f643ef9de28eed6756073d84553a4986fb0d338f..b34c499eb2e603aa8e2a6c9c84ec752a41efd0de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index ce053ae8c44353815d9f6872d1f8ab72ec93c4f0..51dd853127f549c8ff370391f11cf7b8021af469 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index db9504307798cf5e51a28469a3df669dd77dc0b8..dcd18a9cedd53565fdf38d9787335e0afea9ad3d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index a6edba6b7efc631cc1057a8ddb7d4af19142ac6d..f029907ee86943fb8c04eada819e9cbfd6d01009 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index f8c0dbb27364db34f8357460376cde555a5a0063..278ae06cba9bd60b16716fdd0a38e87df2ee303e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index ac4bbe7d19625bdf1b11f8c3dfc9bdf1ad5eaaf4..15cbcfe8edffa92ef0514248e9dbc523dc6a49bd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 947e3170aea6cfb26e6604f1ef950293fa4cf4ad..865b898c4cc54253d85442f2db2f3f624ecfb817 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 17e202c5812f633e430a821dac5f424ae587ad47..3e17aca17cc4e636ea3f6235f04cd4b7f468ae28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 9772c5df9b525576a2b9702f238fc7d309b7561e..b160687a2a610714f5dd6c0cc7c7c92408d386df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index cd65075591d151c0e6538588af932a6cdab5c90c..70e8d51a5a782d5f473e1350d16d942998e58fbb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 0423de7a248c17b1232ea5b9689578f2d824cbdc..809dc8554b38af9486035f1f3b03aa58392812de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4471cba245469c636419209084d624d2138fd4d4..3fbce8cb714355c0898dcbdc6797394410e90253 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index c0e7fae4564f2f253df4377076b0ec64cf2b5cab..70e4103ea1abd5bda90811d127230105ae7bb941 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 6975a6e88d8822f5a817d4a178ab15104799b91a..000bf54c4523307d791db76d73e1cbe71cb46e4d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 56bd70db7e18f61f8af8cd9f9d4439b544d5b380..8ffbf07f9bc32cc9a3a83e2f57f5fe4d78fffd3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 656319920ec34891e22b7145da1f80f787681572..3803d2b0a8765f4832df34fc4876256d4dd2ea86 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index f815e669115eb21ca2f23909d6a36ede278ccbd2..28668224e01e04bbc4c14259d84a11ec72c826a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index f61f0e521b7962bbef1c916a5aa79c43e8ce4019..b83ed67723afd5544ec19c599437a57909d780c5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index c58c8ce63f50b6d5f2dc3428fd50726ddee720c6..e689d69140e36a94c731f4c3b4578919d31343f5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 0efe9a4297960644c20d16f097e816046bb2672c..bb6eddae7168b576bdaca91b6f7951fe7b65ee1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 5caa02e71a10d92a3c0d68f20628b5391f80e260..5fb3f9dd3aa9a3f3761c2bfadbcc19f46fe2303c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index f21c7e5b217cb5e3e9a8c30c31b6a0615d7d73b8..8eb6dd9f4a1799c9a6d90fff42490e29417cc24d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 381d5660b9846b9f2b90f630d724fb0561d6ca94..376bec0814880e3fa0091a41cd9a4ba0dcc4ab60 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index 36b0a86628b92c84c227eb59d55c9e9a12be053c..c5f91a6338cf5c0b8f017f6a6a87d0016ffbe999 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index b41662e63a8f9273062256ef7ee100d70900e22a..bde888735916a018647f681968241a583e0271ef 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index e4abfca91363887b9574b76894da24c9700102cf..16945f2c12a7be4eba8a67a9a58587d756888d12 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index cfcb92e293c59493c7e57ebdb30ac2f2ab35715b..f05741ffceb6a855f56731086619dcc621c8d71e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index e0721353d14d87d2d1e9e204eb9d5b4fe5902b3f..7885db4ed291afb8ea627cebe1dbae45723d4b2f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 0618fbeead02e74e645a2b6be1310f8fd0c00470..9380d26cf4c7b1c93a4c0ce2681e792381c42deb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 4af52ffec80937d32e8cc0e0b128a8db606fd94d..8eb8218df3f3532c6c108e9f43f6ff5a708c7fbc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index db9311ee58d441908fb5c4ce3d952bafdab9dfcd..0c96f86ed36d7cf99c396f863de6d9ef8f90adc5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index bfb15cb44789d9d8d134a5090bf27abf2f81eda2..0c6b230eb79aac1e719949b3f8331423b621d47b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 1db962dbb8c0ac2b0562ecce10354a76d3e74be4..eb7ca52fba97174a1c6869ae003beec8ffc328df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index f80d5267e79c4b74831b2b926beb84d479008e10..e724e9088f82f7ff7152cc4393af4a8f582136c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index cd772d4ac75e3ea4820a788543e15e3af3566b21..dafbd09ee28ef6a5ec933cddf3c246cf2f4f4480 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index 2bb6b3073ac79cf475c942b68ac351a18073c689..3122fbec1c7cad161d71fdf9970995adcedfdad7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index e1a1f0735524af6d3597dfff9ca64b3e7dbd5e2c..0527cda1f026e1ff9075e827c2902c45fd22db9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 66c4446572c2ac5930a8a0bc0d5de96e584aa94e..814e5a5d545f0d4b1276ef1639eddb72004b4d1b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index 0839554f434f64cf957b17c8f5863655fb427ee4..aa1731afb82698cb44375407fc717bf32ef634d6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index b10695f6f7965ab7d5dbca7128530348c8758179..9d7dd85fe0eef3733a86b9e918396e882f5812d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index b96500f710398514b37b5b6f32fe31c61aa99e44..e9bba298bb028851e6e9b9a17ff40a671d9132f2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index a27d93ec62002a9ade1a012c1bc9f8bc4f05e80f..3c783eb5129028b3eb5160c75dde2859541cfd32 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index 6dda24d3d27dcdbc88189e377ce20ec64d908dc9..b8e0882541c51209cca112c54197bbce305bd1b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 8a4ae8aaa7b91587c7f4e0a71eae6e0ac8598482..310f369ed6c7a9931af56016ae09db5d4bca15d5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index a083c1da2e3a0a450bb1f39dd12f3270bc49e1f6..df19d781c21e403b51d451d772cfba66a7383be7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 5d5b361f8272d273941e8beb1978d0ec8b406027..bf909509bd4b25507291839fff1ee0eaccee630f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 392c338d73d75e2af9b06be86d449d0ac3415c50..5d66bc6fb6334d99242009f59ccbe0e7aa2d4e89 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 1143604903523b286f24cc6ca20b97b68e473593..88e9300de912f3b12712bbc311ac156803ef35c4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 5a15f1a55fa0ce6db3357ab9a3e69d13846caaf9..9d81c6d4bc3139952e9f41113d05547b215cf571 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index c470d9c8e8d9281087e347881592c488f46212f7..712eb0c6ec3b706a9e396a532f58916140a2c606 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index d17d6495c09b0e43041e85b8eb99d9d47212606d..dfc4ca27052f919ea3866a489e525ae1202795f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 2d538b4734892b85034974887b7fa7dd024551b8..5e4f727f71d8be2496bee1abdb87c7050f1ca02e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index b70923601aeb843fe663734d45493fa97757915f..9d893cb30a066c4732ccab9d1520f5047a4d3a01 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index f453ddd50efb193accb2d9105fcaf8a130ca3b3f..a2ed954e4c0ce1d474b8c71b41ce1d585d42d665 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 5759169e07d26600a12b086edb8f945735782fed..8a0818e78ac766a624bdcce85591fe13e1d4ceac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index bfde1c35f65a603bd38e1cbab6c2d5eb49ac40f3..b5591b48265d3d09459bf9bb114a4a3149984eb9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index e7f59a9cc5143b337e539d28cd6d1ffd691b5e97..210e4fd4e6f0b2e8ba75d22e83134e2267fbece5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 0354149d4fea06d489be61391c46e84d8b6c369a..da2213a84fe2f6bc683630e5f8760acdf3239b19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index fff0e26bc16b863a1d86d3f735da009cedcaffd9..e2c303d506e0f8f99d8ca89f29979a5999382378 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index c49fa5663d91c4601062d7b207ce2257cec6dd2c..396e774c8a4a10c4996c56c208fc4f4d432e3135 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index c961699053a4fdd71f8a2782ae463970f243c88e..8b6418d514e61536f314da88f1586cea4f29cfc5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 1911e128eb2d7b0ffe6c4ff7eeb0b4927a731bca..e8fda4c71ada65aecef59eb8012120488b0f17c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 88be9143472ad00b12688059600890f67c6f4e92..50c52d270b684bcea5105e4c9813cc62103403f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 2bbb71ece2583d283cedec37b10eda7b693baa0e..84c6b78a2b4405fe0e2a1fbb3bba0635db06a21d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..476c597fe60ced2338a38aca4c5ddd5f21b0ccf2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.experimental.LayerNormalization"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'norm_axis\', \'params_axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'-1\', \'1e-12\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f229615461dc7b781c0ba2ec6f81692d65354bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.layers.experimental"
+tf_module {
+  member {
+    name: "LayerNormalization"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
index 9d7e5bb8c7808689bedd8abb835e61c1f38fdb1d..ad74ab3f120f4f7ed56850d7852ca09ddaaa7d90 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
@@ -396,6 +396,10 @@ tf_module {
     name: "ZeroPadding3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "Input"
     argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\'], "
@@ -412,6 +416,10 @@ tf_module {
     name: "concatenate"
     argspec: "args=[\'inputs\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'inputs\', \'axes\', \'normalize\'], varargs=None, keywords=kwargs, defaults=[\'False\'], "
@@ -428,6 +436,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "subtract"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ba9e57bed4100437c8b71d8b506cc2c928a9ac9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4952a76291c00bfdd73eed5412e7421887d1bab2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3c62d3bef0b9d200577f34cbe303fc7a094acc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2204f0b0f07fb2904067f4aa47576cefa2dfd272
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60706b4035e24414aff556c7749677a6b9822d22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0f1e80e8f34b00870c8d03f4c19976603676
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-loss.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..466e0b40066602b9887bf380a54f391d577fa65d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-loss.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.LogLoss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogLoss\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04a9cc94201a5472a7c6158acfc4bfd48d4f74db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..674258ed8bce833057de277b1f6259272a5c16fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15a59d16c20ef52168981be78bccf682556cc073
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f8badb2b6ec2ba8dd16136c32f5d27811a0d4d9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index 9e26ddbdca0c45df195dd566952379887dcfcff3..7381fe3918b565063b22dfc8e72f5e9466a4d63a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -8,6 +8,38 @@ tf_module {
     name: "CategoricalCrossentropy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogLoss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -24,6 +56,18 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -46,11 +90,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -58,11 +102,11 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8019d43c78fc62689bb4245e418518f251d0e5dc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.keras.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'AUCCurve.ROC\', \'AUCSummationMethod.INTERPOLATION\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
index bad488f59b99ccbe7c6424244c86288afba51f46..5f0079762fce727000e4fdeeda3dfb012840f3ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index a1e7601a5141152c6709c46bb50b331fda69afca..a0a3ae890036161d5ec8da0af6b81495c99f9146 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce9e042027eb5e1186de4ff7c01230fb0e038f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 5f2c2f980777a34ed5128d8090ea7e945d9004e7..587ffddac96220dad543be2e63a0fc202d60cb10 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cd173c7eb9f77d2731b790b068ce3a768f1c586
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..99414dd54ec1fd383f2f1a672f311a3b99213a00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b059e039deaaf699414c779584eff50b5b760537
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
index c153e9cf4d7932b1e4bf65bd02b8de2706d4b8be..5432f7f4006b165fefb9aa028bf7d36d8cbc38f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
index aae2bd99886fbe93086186864eb6040437b872d5..75541bf285d8989f867aabc7c7025e56cce1d05d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f45a57df41e56a52f1a4b784a8c8d140f6670b25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25bcf8591d45bf2237845a914122f5f9ca9c0ad7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb2df7f459753b08426449f026129d0960d21eaf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11538c6c09dc4f0ed7de48fbf95dcc253adb1e24
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..adec3feffa1f07d4d65b63281f1a87c7469eb4cb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47b550a9e73bdd33c5d592ff86f237d821866e09
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.keras.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1ec7cb51e545077737051ff3f06509882e4d32a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af960fa54329d7d80d23f24df798509b2d12bb85
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..417f92f2734329e2382d6d6bc1ff7c399c990704
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a5218fed4cadf155f29781ff7341675b64f22fb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.keras.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
index 904a2fa9caee882775701c53a97c9aac0fd8120e..b089109baa5ea49b588e6e9a54b56256d4e4e3a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.metrics.Mean"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2216043cec13086660c370d835209ff39ddaa324
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e91098f95bffa13fdc97406590d1ae71edd6887
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
index e81ecfe3f627f9d43ad1c673d41b70e81c783f13..9aeaa5627a9805579d6a6c4e09336a4d7994d1c5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -87,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
index f8470b94d7f52216d1c1e4342acabb404bbd8f74..748cec08668c461fcf80df6a50fd5192f99073b9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -87,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dd60bb772aa93b637fca287baf3f17104a2e16d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index b70ef32bcaf3cb243d5d22d93cdbd8188f56d4df..97aeb680be1c5c412a16e2a73e3f1bc2ca8ce6c0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 2e693269bf749260e143cf19c6e1f51a5242412f..5a7bef4714d5aa37dcd1c11048ffd370e8841f0b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea0f2a760ab7e34eb5f5b8c2ba63eea96ce63777
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index e62a2df0564a0eb4dba528dab575b7c08e41b913..85f80b062efe3d2d91104b211c8d9d75127c8c0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15895daf42bbfbbeda419d767844fd840ab4178e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ca1c6c8396e3d79a8f6250f34e7137870a23ac6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.keras.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cf1bf21cb5acd9d81c581eecfc2e64b78bd9e70
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
index 1a524d73c0d387fe603846b5f180916829d65435..4bc9383f6ffc90972416fa031d5515a149e70425 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
index b9b4f565c5eff9ece856255ffbe15af3fb97c2df..2eae4df0ae344656bb637bc27e806876304a86f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index 905021dd790205e64a6f9839218200db98941927..7ebf43b3625ed4d1bfc5db1b10e774f9f3d1bb9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Accuracy"
     mtype: "<type \'type\'>"
@@ -8,10 +12,26 @@ tf_module {
     name: "BinaryAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FalseNegatives"
     mtype: "<type \'type\'>"
@@ -20,10 +40,58 @@ tf_module {
     name: "FalsePositives"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -32,6 +100,10 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SensitivityAtSpecificity"
     mtype: "<type \'type\'>"
@@ -40,10 +112,26 @@ tf_module {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -78,7 +166,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -86,15 +174,15 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index 5885cd21c1976bd7b95f7ca5bbea59eeb40b2ce8..eb1ab1d9dd61b36ed8662e25700f12f82aadb502 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -175,11 +175,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index 935fa32f8c7f2d3b9c6b220a6b77a957d2c73f30..c69cf281742360d9ed4d1f7cbd35219cf04b1149 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -180,11 +180,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
index b9ce154bddef609e0aaf6627d6f59de551e51e3b..0a56293e804f583a949ecb413da0ba613e0bc876 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adadelta\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
index d0dc9e37a386a26143365eb443d5ba5fce8a87d9..14d0894e5622021c4961228d431d01516b752055 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adagrad\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
index 06815fa99a4a474ec131c29d0cbc78bb2b9cb72d..fdb1ea838c04f296c3d0ee7596d73052b6f4b6d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
index 47b55fdb44e79e976b6de13d760a7cf175323c6c..ece63ec168dac58f58286dbd9fd8a8151d0dc2dc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
index 8c63a7dda98568b24ea1b3cda15d4c840fbfd804..f952f88b6d203488ea0ec4f1794d7de79a25853a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Nadam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
index 53d64dae932e250b9d81b2767a833de3bac8c403..27bae902b0cb7f1f4e09737a83fadd95a83cc163 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,14 +1,35 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -18,6 +39,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -26,8 +55,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index a1e9b8cceb95e8f25ac5f414fadacf237be33cd9..e523443a0099b57942c73cafcd8a919503e8db38 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.RMSprop\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
index a67fefb1bafebd62db9f6108f0fe1847b5d2e0cb..d2721f8e92088c216ab748cae45e415553b9d4c1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.SGD"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.SGD\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'momentum\', \'decay\', \'nesterov\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index 85764cc8dcb46f5aa8f0d0050dae07cdbaae35f3..6d826a8f8e47e53bffd5f759c4af02c5f9d5b15c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 259da2ad3e8938bc65f3cb740f8599a29a7a9a17..9505c90aac52c3329861bf01ce3c40c50e557b10 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index ffda9334cf31ebd5329eab57fc0b0111b4bb6ab3..5b1b8f78dc5d98c059756122df24340bcbc2790b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index 56a3fc3de751b6b52cbb165f3f07cc935c33c054..ef4c57b6942f17bdd1524d3eb773fbfabd5a82a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index d72f24b3d574c2a5a59df57e00241804c6bd6cfc..b5ee2e7302d034c4b9d9ced7a1159c87297b1a06 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 72a7339368a64474bcb3ae70ac655c89f446abf4..57f6d7c7c0114d3ab5cd9cf4066979a6837d8e9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index 38a63df42d296d9c87c70be0f87a6894b7d1dcab..88c616bd17987acc2e766a26c4b14c62d6d4a3b1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index 29620561f7f3d244ac900c92565812cb20834853..b70a907907e5cb7e7a509e1712675bc9c9bd4cfc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index f1a2bcbb7268e32c213124887d5f635c91493339..33e8765ce6edfe3a14b7d6ff88be9a2ec2f07b32 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index d1e2d5757068f15e893c42631fcef12558d1f16f..1ac13b57912cd815c1b8de9b461d6cae2364ed9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index 92e40f6d96063d06d8b2e4dc63d69481171bfaa7..77faa3c2b9da7eb1c7f9cb086948997f6b2af02c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 087601a3c13a921fbcaee22cf92ec17dcb841d93..0b2631491b0d727b262df4ac05eef7bc64eedba3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index b052c6bb0a7c72c05102006fe3f413b53c0651ac..0a3414d20cfd554d1c5eb82d613fbc938f8f3600 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index 9444a1bc765814e49834a2c76cb5f8938861728b..ffc5cf1c8b76e84d30f6a7af22773b45feb1a02b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index 83dcb5e4e7d379c129483d507f07c4875d467092..ff2cf2ba90732bcb042c7f5ebc8a50483d37c8e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index eb26e2220bb2b96403fb50304e07e5ddc3a8579e..09c8a31a7be1162866e4457fda84a921de283377 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index 38d75e8bd54995c85ce0f403a0c5cc2fc167eebb..549e13a7ac6b595dfa665096aa15d7bb20df65e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index 90fc61cdfaebe4d03cc5422337899fbe853fecc7..169ecdece5ddc92908ed027c7f470d08cbd5a5e0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 773c74e64d13ca4a840b7f599fc2cbe9c161cd03..c7a50969b54e5efc4d338caa79dea76d86bffe8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 533544d21f2753f785113a30518f4fcbcff96cd7..3900c752c8527f68af2496f99083d80fc9d18106 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index e3926eb6d4714731d09ff9c5b75a89830c06e7c1..7b876099af6a28d9fca2e5c55aeae5e4610f82a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index ba209df7824a9cc076499458e35acd7dcf1eaf35..5bddba8e798618f5b1d0cdc61ddff9725a495fe0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 081fb0e08bcd1b35ab44459d1c8eb0857dd14956..62ba8bb59e8af14447fe570ba28c5d0eba7f6af8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index 2014a04301618c20af5cf6f1144eb4dbda2479e1..0803feeabd12acb7988459fe6da2748e19b70a5f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 9a87ae9687741090485bd8d4d0d07d359a2015e7..6def32864b9cc660b94d628ccd53dc48a566ea81 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index 33afb835ce1d524991c0024bfb87c29a72aac08e..dbf1ac82d33b81c63e5c356ac736f63262797ff0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -0,0 +1,142 @@
+path: "tensorflow.linalg.LinearOperatorInversion"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index a9078c8ab5cca078237a29febabdbbd4a8b6c89c..85d902b977ceddd405abb1154a086d7bd29e7848 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287..638d82a599248e547bcae86ebd6d8d8dc3f6aa4b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -115,6 +115,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index a87649133fd207ad59f2124c6b0b5aa44916e5a5..ab1b04bd3cb1b215b848019b6c578ce091f8f828 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 32656467840fbbc0c8708ea68aac5aa75c11a540..961969aac58b78e4edd53b47f2932f71f2d21fd5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index 49d8890c8942bc0021886ee6c9bc4e7625452655..e76738a9648123414159fdc9666a99b0577aa46e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index c89dc067b331603e227d9d578147e2dd1ee4a900..b35cd69da474a9665652f04f12b34a8d9f33fa8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 9f7b422fabcd55aed98bc93f01143d35698c0399..5e49b75c3131b989c765ab03659fb225cc23e26e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorInversion"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorKronecker"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index f34e2c2aa5a5b30e037157bc84894da5dce78538..2e298d8cb641e7a9333b4cd7a84ed4dd9eb213a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -264,6 +264,10 @@ tf_module {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "nextafter"
+    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index adffc552275554f888c398ac8beea730b851e293..4251206cda782be1a3a4c7f78fc0df705df88596 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 95746cc49c3c4e762e8559cf704572ef122a96ef..20af24633a45759c5b6e7b7758d09d87a024cacd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 3547b66d19ac6b64449860160774647df855a6de..3205c6a4dcaaa00591cd957021a463b77835343e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 7582fd52b63afdb8c6f2a5e7f0e6b26071232832..14cf5ce4569f18b326af1ba953a8edec2fee5706 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 7ec61661fde68ff102aeed8992891854a4028bb2..e43547b15428634f0f84ff0e01abdf4585e9d5db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 9617d07568ee70a7e6158fdbd33c956f8ae5e604..99381cd7e167223cea0fe4eaebbcff736be66054 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index b31886f73665d6e895ebbf25a33d61b4b95eba74..1fbde9df17cb83bffd46c82f11c99d2926859f77 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index c36ecaa4b2b2ce14292cd2c46a986bb1387294bd..8ba92fcc8dc89958b8395aa986c358a03fedd66d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 42128ebd17234fcee3b016bbd7f1964824d1a0b6..9de73076b1197ce7bee8a00dfd7bfcd1b48a35bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 6541952ccfdda15cbf7b3128869d9199bbafb5e2..016ae23890866ce3394806b0114ca0c9a4d70869 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -292,6 +292,10 @@ tf_module {
     name: "app"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "audio"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "autograph"
     mtype: "<type \'module\'>"
@@ -1080,6 +1084,10 @@ tf_module {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_v2_batch_normalization"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1124,6 +1132,10 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "enable_v2_batch_normalization"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "enable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1250,7 +1262,7 @@ tf_module {
   }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\', \'batch_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'0\'], "
   }
   member_method {
     name: "gather_nd"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
index 02e59a63e10b1a24bfe0c275044bf807b433f62e..a31689a58bceb91ccfb3fa91d8b778c6c25cc929 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.sparse.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index ad26ded10b4dc652574ce4b544cbadd98e57a013..6f4f4c2f31a70854de3e6da78a5f98e6a6394cb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.strings"
 tf_module {
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
   member_method {
     name: "format"
     argspec: "args=[\'template\', \'inputs\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'{}\', \'3\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
index 1f1d8b6f9e2cde4800cdef9c417191b1a0ce07b5..65a2b605d532c4a14d3d444a44a723c543af5026 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdadeltaOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index a7c05d484905a0af26c80a52d92623ef4a3eb6c4..179272d8a8a298ac374c19641068aca739bb9626 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradDAOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
index bc8b92389c6ed7dcb0fa23ff3abd86bb0d1c488a..15c2ef46c127543cb94690aade3c79b6e75981c2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
index 5d17be9378fd130b89e199544f85e03a23a71d3c..9c902e582f35ca44a6825727637fa3d76011e33c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdamOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
index 5be37200f368b1823093c67ad7042db534b0df93..42dcdac9e77a8efac875e4985f6a8f744e838ddb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.Checkpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
index d265fdeb01c38d8a1347e630d7f7bff111999634..f41d9f12d9fe65b128d216551870ec8c95834a6c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.FtrlOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
index c673e29cd4dd6cd3c01582abfbc306c092818892..7399750385f960133aa5cf071c57dc9fc716a18d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.GradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
index 8199f63b9b8c64c73a3d62294277838cdc240280..9bbaa14a6fd54dfcad37560142bebc7b3118601c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.MomentumOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
index 876bb35e391885e751066a415967af848280c714..448e17a44891781b1d6b0fe8e627cb91d098f1e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.train.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index 14349a74efb61124fc7b5568d5ec023f08b1b62f..eb1782e9cad73708de24f6565237830a29cfaf8b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalAdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index 7d982dc51f6edce1cf691671e31ddd07664f0dc1..eb9a86183e10775379efb84c693f7aa7ba573f2d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalGradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index 906384a2875bf7b05ac26fc43207f4ef9b5a7472..2cf4c2e7ea4879c48c1b3a43302f7fa4e9f689cb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.RMSPropOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
index 2c0fda3c72b7e1f02265827b9dc1929500935cd1..ecce08220d6bd9815fecd26a95f8ac6f745d9e33 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.SyncReplicasOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
deleted file mode 100644
index 3b75a1735be76fe77689736e492c42c54ab795c1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.Event"
-tf_proto {
-  descriptor {
-    name: "Event"
-    field {
-      name: "wall_time"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "step"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "file_version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "graph_def"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "summary"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary"
-      oneof_index: 0
-    }
-    field {
-      name: "log_message"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.LogMessage"
-      oneof_index: 0
-    }
-    field {
-      name: "session_log"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SessionLog"
-      oneof_index: 0
-    }
-    field {
-      name: "tagged_run_metadata"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TaggedRunMetadata"
-      oneof_index: 0
-    }
-    field {
-      name: "meta_graph_def"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "what"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
index fee84d85307dffb675b507a31c4f1fda60de869d..5b47c718a5753905a4fa426b739dad4b01678c3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.IndexedSlices"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
index c0ed95653552f904acea1cc82bca00773ecb792c..481a8c73ac351cc0ef38ee3681d5134f06334421 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.RaggedTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dtype"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
index 3add49e90d7eb5094ad68d1474e834404549c988..64f7260369d7cbc656ad3d23b69cc9079e030f95 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
deleted file mode 100644
index a66b74b315c6132e8f884bd52e7a3b5bd7f52ccd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.SummaryMetadata.PluginData"
-tf_proto {
-  descriptor {
-    name: "PluginData"
-    field {
-      name: "plugin_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "content"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
deleted file mode 100644
index c02575b9626c848e9b871d2cc6febb26a5142f08..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
+++ /dev/null
@@ -1,40 +0,0 @@
-path: "tensorflow.SummaryMetadata"
-tf_proto {
-  descriptor {
-    name: "SummaryMetadata"
-    field {
-      name: "plugin_data"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata.PluginData"
-    }
-    field {
-      name: "display_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "summary_description"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    nested_type {
-      name: "PluginData"
-      field {
-        name: "plugin_name"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "content"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
deleted file mode 100644
index 94f712073e0d0dda201fcf7adba849dd45a1229b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.Summary.Audio"
-tf_proto {
-  descriptor {
-    name: "Audio"
-    field {
-      name: "sample_rate"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "num_channels"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "length_frames"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "encoded_audio_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-    field {
-      name: "content_type"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
deleted file mode 100644
index fc1acb483b3051cba01f5d9bc8501a61965bbc37..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.Summary.Image"
-tf_proto {
-  descriptor {
-    name: "Image"
-    field {
-      name: "height"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "width"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "colorspace"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "encoded_image_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
deleted file mode 100644
index feb84b6ee996549ac58aa0e8a4ac560f947b6339..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.Summary.Value"
-tf_proto {
-  descriptor {
-    name: "Value"
-    field {
-      name: "node_name"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "metadata"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata"
-    }
-    field {
-      name: "simple_value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "obsolete_old_style_histogram"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "image"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Image"
-      oneof_index: 0
-    }
-    field {
-      name: "histo"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.HistogramProto"
-      oneof_index: 0
-    }
-    field {
-      name: "audio"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Audio"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
deleted file mode 100644
index b2bdff7171804aae114d1e3631e3074b1e4006ba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.Summary"
-tf_proto {
-  descriptor {
-    name: "Summary"
-    field {
-      name: "value"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Value"
-    }
-    nested_type {
-      name: "Image"
-      field {
-        name: "height"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "width"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "colorspace"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "encoded_image_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-    nested_type {
-      name: "Audio"
-      field {
-        name: "sample_rate"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-      }
-      field {
-        name: "num_channels"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "length_frames"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "encoded_audio_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-      field {
-        name: "content_type"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    nested_type {
-      name: "Value"
-      field {
-        name: "node_name"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tag"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "metadata"
-        number: 9
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SummaryMetadata"
-      }
-      field {
-        name: "simple_value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-        oneof_index: 0
-      }
-      field {
-        name: "obsolete_old_style_histogram"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-        oneof_index: 0
-      }
-      field {
-        name: "image"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Image"
-        oneof_index: 0
-      }
-      field {
-        name: "histo"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.HistogramProto"
-        oneof_index: 0
-      }
-      field {
-        name: "audio"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Audio"
-        oneof_index: 0
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-        oneof_index: 0
-      }
-      oneof_decl {
-        name: "value"
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index 6136c8fbe79ef8d3851c39b8f11ac3c33f6050f2..a80726d3bbc400b1ce8e640819ad370c3589be6c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce29615f72eee78525b8a1efbb4531215e6b72fe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.audio"
+tf_module {
+  member_method {
+    name: "encode_wav"
+    argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
index a71da113b4ffcaa9ff71e18df4a9263b141b42e6..359530f69c8a5ba61d3ab3531feb742987cbbede 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
@@ -9,10 +9,6 @@ tf_class {
     name: "AUTO_CONTROL_DEPS"
     mtype: "<enum \'Feature\'>"
   }
-  member {
-    name: "DECORATORS"
-    mtype: "<enum \'Feature\'>"
-  }
   member {
     name: "ERROR_REWRITING"
     mtype: "<enum \'Feature\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
index 12e23bc0c8fd0831471abcf56bcd8f07d3e6fe57..0baf6e03552f5b12e5f2e48f87cf1ec7332787bb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'level\', \'alsologtostdout\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "to_code"
     argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
@@ -12,4 +16,8 @@ tf_module {
     name: "to_graph"
     argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
   }
+  member_method {
+    name: "trace"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
index f1d760603e981a0b9a72fdc379dc81932ac71d67..95352dff3a6b9341857b3f3b82dcb6817e4553ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
@@ -32,6 +32,14 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
+  member_method {
+    name: "dimension_at_index"
+    argspec: "args=[\'shape\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dimension_value"
+    argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "forward_compatibility_horizon"
     argspec: "args=[\'year\', \'month\', \'day\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
index 00ec669b1685f3cbdacd676bac61755bebb9f6da..437131abb2d4512c547635117ee0f9c2e1c3b284 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.constant_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 2d115904925eb96164484300baf628d41d3fcff4..abc98a74b64ab274ed8b2fc43876b7102f1c7201 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "bucket_by_sequence_length"
-    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "cardinality"
@@ -180,6 +180,10 @@ tf_module {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "take_while"
+    argspec: "args=[\'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "unbatch"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
index b06c73d12602b25426034f801be329fb88067011..89748f7713fd813ab56d0e07780da33ca8ff14bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -3,10 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -15,45 +11,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -64,33 +28,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -99,18 +47,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -119,20 +59,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
index df707e8920e4488ed6b40a7f93f56b5624188c84..c3b7991175769f473acf929d656cd52ccca7bf4f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "devices"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "distribution_strategy"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
@@ -26,6 +22,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "all_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "merge_call"
     argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
index 77706e57133e1186d9e98fcf9205ed4c91772eda..5c4f09075316150b3118f048091d3c68a60a232d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
@@ -50,6 +50,10 @@ tf_class {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_dataset"
+    argspec: "args=[\'self\', \'numpy_input\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "experimental_run_steps_on_iterator"
     argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
@@ -78,4 +82,8 @@ tf_class {
     name: "value_container"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variable_created_in_scope"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
index 9a1df5514261a47aae6f3d11be78b5a6fa6da919..6ed49d339d7af7b2d05dfa57121805a7dce48090 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -2,10 +2,6 @@ path: "tensorflow.distribute.Strategy"
 tf_class {
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -14,45 +10,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -63,33 +27,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -98,18 +46,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -118,20 +58,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index 01b870a81639807489ec2a09dcc185137aae1665..956e4d93e57069b6936413a3a432d45a22e4ed1b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -112,10 +112,6 @@ tf_module {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "as_string"
-    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
-  }
   member_method {
     name: "cast"
     argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index 741102466d9580eaf5081c8c92b5b1fe38bc7525..b1bd5a2661d44d9b36b965ba160874e6142628ea 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -40,4 +40,12 @@ tf_module {
     name: "stop_if_lower_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
   }
+  member_method {
+    name: "stop_if_no_decrease_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_decrease\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_no_increase_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_increase\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c364b0217a7ed10282dc8fc28797f3be1b92f867
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.experimental.Module"
+tf_class {
+  is_instance: "<class \'tensorflow.python.module.module.Module\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "owned_submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "owned_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "owned_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "no_name_scope"
+    argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
index 0c3f04e468c4c817cd474deb42149aee3021aa43..a7ee6d3e07d4387c4fc0bd9a5f6d9614473ca73c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.experimental"
 tf_module {
+  member {
+    name: "Module"
+    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
+  }
   member_method {
     name: "function_executor_type"
     argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.glorot_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.glorot_uniform_initializer.pbtxt
deleted file mode 100644
index bb8540d0fd8b4a737bce8d23404616f3f51d2c79..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.glorot_uniform_initializer.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.glorot_uniform_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
deleted file mode 100644
index 607a5aae21ff7299fc96aee3b932c10d622f1127..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.constant"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt
deleted file mode 100644
index 4a81e52df966d0af93b097fe07ec642eb81f7edb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.initializers.glorot_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt
deleted file mode 100644
index 815dc81dff5d5c3f89bc6e1d39b8fa7c4c15c914..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.initializers.glorot_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
deleted file mode 100644
index 37fcab95997bb7299675a387d08184fc1387eee1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.identity"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
deleted file mode 100644
index 18481d48150d2dcf7d6908ab1914ab217da93c10..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.ones"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
deleted file mode 100644
index ff64efd60cf1197bb9032912eb5cba48a63609a0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.orthogonal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
deleted file mode 100644
index e3c63fe737ee655169c00c7c0b2882c84f566244..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
+++ /dev/null
@@ -1,67 +0,0 @@
-path: "tensorflow.initializers"
-tf_module {
-  member {
-    name: "constant"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "glorot_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "glorot_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "identity"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ones"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "orthogonal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "truncated_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "uniform_unit_scaling"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "variance_scaling"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "zeros"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "he_normal"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "he_uniform"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "lecun_normal"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "lecun_uniform"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt
deleted file mode 100644
index 133e61c1d9869bdd00948df3877be990b30b7cc3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.random_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt
deleted file mode 100644
index 0cfa0080f5a936bc80f69c2b5c15f671096ba350..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.random_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt
deleted file mode 100644
index 730390fba274f9dc25eea7a53bb8145a2ade8613..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.truncated_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt
deleted file mode 100644
index 13295ef375a4002f8fece5ebb5d2a5d5d26c68eb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.uniform_unit_scaling"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.UniformUnitScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'factor\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt
deleted file mode 100644
index 86340913e2506c96499aae05a3ed0d5273c93bba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.variance_scaling"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
deleted file mode 100644
index 7df4237bb6537b39f42f7b3894beb1bec6641f6f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.zeros"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6bf57a88fc1295da13e0b58671191c9d8ba8caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.io.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
index cfa3372b12bfe32eed4311c89b6448c0359c0913..a797c06ff337cffe503d89c09497996ea64c6ad2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.io.gfile"
 tf_module {
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "copy"
     argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index eced2e1cb0706153a9bfc2749847395d194fcb56..283cc6a735695b0b2d16af28f7688a7a077f19be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -175,11 +175,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 2acb90173f3242e8a92913728eec84ef5d455d1f..95e405aebaf61e3ccae268b474a006a3bca51343 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -180,11 +180,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index d200d3d26d7c1b7d54eda596a8056a66e29be0b6..49e3d1155c3cb711676c4e67b8c47a8ffbe7615d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -124,6 +124,14 @@ tf_module {
     name: "ctc_label_dense_to_sparse"
     argspec: "args=[\'labels\', \'label_lengths\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -244,6 +252,10 @@ tf_module {
     name: "learning_phase"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learning_phase_scope"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index a5804d3bbcff401920ddd2b59bd5f094b3e1c628..0725f606e2923ff1bd5a8814febdfe7de8a2602c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
index bbc02c4d71f835497be74e771c5ae57682f5a5b3..14bfc3bedbfb5a379e28a0cb9cd2f7f744539fa1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
@@ -16,11 +16,11 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_predict_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 6182baf0a31e7027b685561fed5eeedc54a766a3..9812bad8f66f3d5afe365287feca748f9e6efd5d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
index 9b1b068e225a5dae69672ecba70bdea48c6e6ae6..5aa739391ef894cdede1db17f903a50111f25eca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 92440188c81ee192df332cd89256233591b2d281..bf5bcb68df47ed8661509598d3bc59f01dfcefe6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -17,11 +17,11 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_predict_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index c10c236ad1990160be53ba5df7afeb64619bf260..5ae176017b3cf1ac019ecdc0f1c255f23b32fcec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 0db6b8d371b61db6fa565a93416dfc14eeae1d47..0fed6fd23670a16acd8d770269090c3dda0eee30 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index dac2049fe19426738368009822ce2dac8bc64467..71cf7f4a4922752c0ba154a8d3fe29b37c305675 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index 2834b74e8afbd5ee01eb77b8b14e75fc4e50f230..d5a59d870a390a6f5632332c12534f83c686e2dd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -17,11 +17,11 @@ tf_class {
   }
   member_method {
     name: "on_epoch_begin"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_epoch_end"
-    argspec: "args=[\'self\', \'epoch\', \'logs\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'train\'], "
+    argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "on_predict_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index b2ab5006dc4ac3571b4f9d01607adb6aa2c0be26..1bfd51cdcc3e783fcd24a80e189d3d73bf3a928d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
index cbaba78ed5a851c3d6e29ab67c89fdfd5db01754..71b5acc38fdf6a0246053f3260fc7e9c17d3f204 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.Constant"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..edff37e3a15b198839a6729d75f190e88491f057
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.GlorotNormal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc685ce0d58f5ec4afb058508886d8d14c393c24
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.GlorotUniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
index a5f7f348de9d9899d962e7647d7943ddb6a60604..e0f0f3a93dac6c5e63822bbddb0d88ffcc0cfa70 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.Identity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
index 8f10d1698e7b7b2afa9c2664c7dca38045eda85b..ae5ea9e48c9bf4fc478f60968cf1d83dd9c43762 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.initializers.Initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
index 2fbfa774f8ed020164e32bb3cfb69b8a235609ba..57c0b0917d1fd50b7817e575f55cfbb2e3e1781b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.initializers.Ones"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
index 874d320d73d1f1cdbd817db587ea9dcfea4d352b..b24844fa35c555294f899ebe68a2ee180de149cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.Orthogonal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
index 26784ce55d087d7d4fea6e6e0989d4490c95c6c1..0753827aa67434cd5670a41bd09e61ae7acb28dc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
@@ -1,12 +1,11 @@
 path: "tensorflow.keras.initializers.RandomNormal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
index 4110bda5f6d54eb6853a10b5e31123e369ce1514..280b0a0243d5c0e4f595f6d0f8b8bcda8202cb5c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
@@ -1,12 +1,11 @@
 path: "tensorflow.keras.initializers.RandomUniform"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
index 0451d0d73a0b3ed718c4a95eaaecabbe51448b63..4076aa595fe9be1c77e25be3f9a09469cae9298b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
@@ -1,12 +1,11 @@
 path: "tensorflow.keras.initializers.TruncatedNormal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.TruncatedNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
index 03f4064b9ef5093044a9cbb897043d643cf7f83e..a68219def66c5d68a189262408a7d1e16ac0c109 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.VarianceScaling"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
index b6ab68e5beb47c9bcfbc52f9808255bbb03d2dc0..129fa18c6171cd04fbd2d023fe8c67b75de4e542 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.initializers.Zeros"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
deleted file mode 100644
index bddc37b907e7573c9fff27a0c3a5f7e199b88a9a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.initializers.constant"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
deleted file mode 100644
index ef0815972d219e7fee1e2a02f5eb53d26a41c734..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.glorot_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
deleted file mode 100644
index 439b5ada9bb3ff1f6267922a8c755d8f097b004a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.glorot_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
deleted file mode 100644
index a4c5a6149047ffdaadde1243e4c80feae05cd77b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.initializers.identity"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt
deleted file mode 100644
index 8d0b5c242bd97f6b85b34408fd6d96fadec530e5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
deleted file mode 100644
index a89f78d1e1a47c7cd5a252cfd0a7b2fa23979e90..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.initializers.ones"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
deleted file mode 100644
index ee1e9bbae2b7130db5b96309e2d87719169d788a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.initializers.orthogonal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
index 1540c2915bff8b49ab1619223a54c67814c69551..7412cd130588a6a95538607b17b93be26492111a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
@@ -4,6 +4,14 @@ tf_module {
     name: "Constant"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "GlorotNormal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlorotUniform"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Identity"
     mtype: "<type \'type\'>"
@@ -40,54 +48,6 @@ tf_module {
     name: "Zeros"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "constant"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "glorot_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "glorot_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "identity"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ones"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "orthogonal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "truncated_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "zeros"
-    mtype: "<type \'type\'>"
-  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt
deleted file mode 100644
index bac8211a10a50a33f19f36bb3f6370f38518903f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.random_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt
deleted file mode 100644
index ab0d74d07171e3863be09b0d79045af7a7095587..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.random_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt
deleted file mode 100644
index 358cca2b9cf657f5db6533a5523bfb6393d1f36f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.truncated_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt
deleted file mode 100644
index e6c731361acde102dfc049a750637385555f9f43..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
deleted file mode 100644
index a262390687f31a5fb79822e69273306b9e1897b5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.initializers.zeros"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index da212382c1a6a3c5d37afbd1ac895249b566a913..8a0b8eb46f006497472c1e9ce539e91db19bd260 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index c910db027e69f3ca21495c968ebeae691711c316..abb3c236948a7f46d64cad92ae922324446f9a99 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 8b7b33e98ce2673ffb5dcf951a8cd6a684d847af..b27db4e7f23499fd27430059f1cb556f341547b3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 5e3e41ba205c70413b7d015141b92c206ea26f32..50998ac9d63c9492523720d7dcc8041fd9efcab5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index e160b1015380fcf9f3a7a8f4a41df6877cbf9246..be17aeafb5ae383cba58b854808f6c9bc0e9696d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index b6b71358c869ff6210e9a704f79cbd63970b5dcd..7f21b444bc8832189b11cd8ff206e034bc89170c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 5c5ab1580eb3d6ce02498b1bc42aefc39784abf4..2ac86f152fad454fc0b09e2cb8814f23ad997c20 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index 489de2e4d31d8c631ea11f8a50c91498a70fa308..f6b1dd2f7e4244218b7c64868b773142c79695d6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 30fec249b838350ac4ef542dd0f1969b0ddd7588..3da1f43a92a3fb5a146bcf8fd16f26783487f129 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 0e983c9234597a17c6c9342eaa3b3a26158736fa..a7be5ac81814b28c93407cd5d1ca7c3f60822f0b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index ec50db71279b5e688ef36558941071fbba3c02f4..c5c29bead383da6b9c0c7436fb089e27413e72f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index 36ea9d58519d1638ca25f31bc1ce3dcbcd51aeb3..b13f963a6fcaa8b4c2da541654564d620c710c20 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 23153d42847ad6015ccc347b70d35b7f3b83dc03..880d18e1aae53512b2f587b5c8914babcd68566f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index 766c3f267f97f19d87cc39a24ae90dca796b4988..1eb0cf1a188b88d55b82297da715624c9e5a58f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 898098227190498a5a752a493e3d9bccb431bf15..d9394e60f532465c1852b2cac46ca4cbd9125583 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index a74b8d29502f0493a99b16d8fdeccf77e205be0e..a0f6dc8097adfb896a8f3aa3b642c2997e257cf3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index b093f8ead94199ba2a4861d0453ff5248b2d7fb0..037b92f861b14720a1a638884752a4d3e1dbd9f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 0ce9f6fd591f127eb2874397abce21e8451ba3b4..6a0d027d47d999f5770e59299fc1206249bf9b43 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index c1f5bfae0d35683e4e718a73add8f57be9473c72..66b5bd75fc16c37aecaa65ef12fb2311925c252a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 4aa872c4f04c2f0a3cddc83bc7c64700cb97ca3c..e73133ff0731821407084cc1cd6160b2e9bf3d9c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 6e01f7c70c9987cfa651078175927edfaf1fd6ad..7af6b2b3c398473398a9d2e227a42ec96451b301 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index c002042d7703bccb0af37cebe453803c9e9009e3..baff492dfbd3c9ab6f2c269cb89632768e6b6c92 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index f5e5446d2b995c9ba2707cc16376e8c639576c76..63d30a61851cdae8daa8a5dc70fa733fd6b2ff11 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index d5f36f4bc3d1b517a7f2dfaf3fed490df66a5fba..7a29cbbec35b885792828828354ac8f9a29579b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 346fec6056380842f4d5f40833cea82a540c088d..87c75c02243cd646502e12e2947555ad7c6913e9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f8fe9f05e0f577dca9e1f3225f3e14074fefa12..f69104ddfef17c8b5df36f4bc3e9b0ea3a986295 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index 68fb7382a71cc0e3215daa43e2f1ea0f6de26e16..aa05471933cc97a872480e0ac45213b49a882189 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index deda82f9b3f020589c9673e9070ec40713846b7f..d61f1ddc1d506ae2db992aaacbdc634964d53292 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index ff00ca1bb24eab0e35f04c232b3eb5252d645edb..3caa3ff4edf352fa1baaf29092dd22340bdd12a4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.DenseFeatures"
 tf_class {
   is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 919aed5723c0464b8540ba1cfb971bb23bfef73a..06e8b6b314183b884e635f3b78e5bd5368e0962f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index f590ce1ef71200854f62baf3c8746deefbaf8e46..9fdf6f66d1160a49da302ffa8eeeade3009de048 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index db4261fadc76e2d953d477c472adcb422d48105e..cbe102065071a00596fb4b8f764b410737c638a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index 7369552b3b8733b6c586888c643c9596bebcdded..0efba09b272c8ffb2220ccfaad830c7fff98568c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index f643ef9de28eed6756073d84553a4986fb0d338f..b34c499eb2e603aa8e2a6c9c84ec752a41efd0de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index ce053ae8c44353815d9f6872d1f8ab72ec93c4f0..51dd853127f549c8ff370391f11cf7b8021af469 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index db9504307798cf5e51a28469a3df669dd77dc0b8..dcd18a9cedd53565fdf38d9787335e0afea9ad3d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index a6edba6b7efc631cc1057a8ddb7d4af19142ac6d..f029907ee86943fb8c04eada819e9cbfd6d01009 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index df2ea3fbe9a20987892a971499a671f7268c23e5..ac2d8c9aa3b74e5754c2a8014b4c093a610c5198 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index ac4bbe7d19625bdf1b11f8c3dfc9bdf1ad5eaaf4..15cbcfe8edffa92ef0514248e9dbc523dc6a49bd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 947e3170aea6cfb26e6604f1ef950293fa4cf4ad..865b898c4cc54253d85442f2db2f3f624ecfb817 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 17e202c5812f633e430a821dac5f424ae587ad47..3e17aca17cc4e636ea3f6235f04cd4b7f468ae28 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 9772c5df9b525576a2b9702f238fc7d309b7561e..b160687a2a610714f5dd6c0cc7c7c92408d386df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index cd65075591d151c0e6538588af932a6cdab5c90c..70e8d51a5a782d5f473e1350d16d942998e58fbb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 0423de7a248c17b1232ea5b9689578f2d824cbdc..809dc8554b38af9486035f1f3b03aa58392812de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4471cba245469c636419209084d624d2138fd4d4..3fbce8cb714355c0898dcbdc6797394410e90253 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index c0e7fae4564f2f253df4377076b0ec64cf2b5cab..70e4103ea1abd5bda90811d127230105ae7bb941 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 6975a6e88d8822f5a817d4a178ab15104799b91a..000bf54c4523307d791db76d73e1cbe71cb46e4d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 56bd70db7e18f61f8af8cd9f9d4439b544d5b380..8ffbf07f9bc32cc9a3a83e2f57f5fe4d78fffd3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 656319920ec34891e22b7145da1f80f787681572..3803d2b0a8765f4832df34fc4876256d4dd2ea86 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index f815e669115eb21ca2f23909d6a36ede278ccbd2..28668224e01e04bbc4c14259d84a11ec72c826a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index f61f0e521b7962bbef1c916a5aa79c43e8ce4019..b83ed67723afd5544ec19c599437a57909d780c5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index c58c8ce63f50b6d5f2dc3428fd50726ddee720c6..e689d69140e36a94c731f4c3b4578919d31343f5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 0efe9a4297960644c20d16f097e816046bb2672c..bb6eddae7168b576bdaca91b6f7951fe7b65ee1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 5caa02e71a10d92a3c0d68f20628b5391f80e260..5fb3f9dd3aa9a3f3761c2bfadbcc19f46fe2303c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 33082a6f06c17232a136e612adc00f284f2787ec..89dfc2a256da20cc65b7d18601dce240f5580a21 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 381d5660b9846b9f2b90f630d724fb0561d6ca94..376bec0814880e3fa0091a41cd9a4ba0dcc4ab60 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index 36b0a86628b92c84c227eb59d55c9e9a12be053c..c5f91a6338cf5c0b8f017f6a6a87d0016ffbe999 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index b41662e63a8f9273062256ef7ee100d70900e22a..bde888735916a018647f681968241a583e0271ef 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
index 5766528b31adfb27ddc5540232425610e737577d..c4726cf82430c800267f24032f2e02fc65e9499c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -180,11 +180,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index e4abfca91363887b9574b76894da24c9700102cf..16945f2c12a7be4eba8a67a9a58587d756888d12 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index cfcb92e293c59493c7e57ebdb30ac2f2ab35715b..f05741ffceb6a855f56731086619dcc621c8d71e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index e0721353d14d87d2d1e9e204eb9d5b4fe5902b3f..7885db4ed291afb8ea627cebe1dbae45723d4b2f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 0618fbeead02e74e645a2b6be1310f8fd0c00470..9380d26cf4c7b1c93a4c0ce2681e792381c42deb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 4af52ffec80937d32e8cc0e0b128a8db606fd94d..8eb8218df3f3532c6c108e9f43f6ff5a708c7fbc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index db9311ee58d441908fb5c4ce3d952bafdab9dfcd..0c96f86ed36d7cf99c396f863de6d9ef8f90adc5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index bfb15cb44789d9d8d134a5090bf27abf2f81eda2..0c6b230eb79aac1e719949b3f8331423b621d47b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 1db962dbb8c0ac2b0562ecce10354a76d3e74be4..eb7ca52fba97174a1c6869ae003beec8ffc328df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index f80d5267e79c4b74831b2b926beb84d479008e10..e724e9088f82f7ff7152cc4393af4a8f582136c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index cd772d4ac75e3ea4820a788543e15e3af3566b21..dafbd09ee28ef6a5ec933cddf3c246cf2f4f4480 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index 2bb6b3073ac79cf475c942b68ac351a18073c689..3122fbec1c7cad161d71fdf9970995adcedfdad7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index e1a1f0735524af6d3597dfff9ca64b3e7dbd5e2c..0527cda1f026e1ff9075e827c2902c45fd22db9d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 66c4446572c2ac5930a8a0bc0d5de96e584aa94e..814e5a5d545f0d4b1276ef1639eddb72004b4d1b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index 0839554f434f64cf957b17c8f5863655fb427ee4..aa1731afb82698cb44375407fc717bf32ef634d6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index b10695f6f7965ab7d5dbca7128530348c8758179..9d7dd85fe0eef3733a86b9e918396e882f5812d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index b96500f710398514b37b5b6f32fe31c61aa99e44..e9bba298bb028851e6e9b9a17ff40a671d9132f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index a27d93ec62002a9ade1a012c1bc9f8bc4f05e80f..3c783eb5129028b3eb5160c75dde2859541cfd32 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index 6dda24d3d27dcdbc88189e377ce20ec64d908dc9..b8e0882541c51209cca112c54197bbce305bd1b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 8a4ae8aaa7b91587c7f4e0a71eae6e0ac8598482..310f369ed6c7a9931af56016ae09db5d4bca15d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index a083c1da2e3a0a450bb1f39dd12f3270bc49e1f6..df19d781c21e403b51d451d772cfba66a7383be7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 5d5b361f8272d273941e8beb1978d0ec8b406027..bf909509bd4b25507291839fff1ee0eaccee630f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 392c338d73d75e2af9b06be86d449d0ac3415c50..5d66bc6fb6334d99242009f59ccbe0e7aa2d4e89 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 1143604903523b286f24cc6ca20b97b68e473593..88e9300de912f3b12712bbc311ac156803ef35c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 5a15f1a55fa0ce6db3357ab9a3e69d13846caaf9..9d81c6d4bc3139952e9f41113d05547b215cf571 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index c470d9c8e8d9281087e347881592c488f46212f7..712eb0c6ec3b706a9e396a532f58916140a2c606 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index d17d6495c09b0e43041e85b8eb99d9d47212606d..dfc4ca27052f919ea3866a489e525ae1202795f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 2d538b4734892b85034974887b7fa7dd024551b8..5e4f727f71d8be2496bee1abdb87c7050f1ca02e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index b70923601aeb843fe663734d45493fa97757915f..9d893cb30a066c4732ccab9d1520f5047a4d3a01 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index f453ddd50efb193accb2d9105fcaf8a130ca3b3f..a2ed954e4c0ce1d474b8c71b41ce1d585d42d665 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 5759169e07d26600a12b086edb8f945735782fed..8a0818e78ac766a624bdcce85591fe13e1d4ceac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index bfde1c35f65a603bd38e1cbab6c2d5eb49ac40f3..b5591b48265d3d09459bf9bb114a4a3149984eb9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index e7f59a9cc5143b337e539d28cd6d1ffd691b5e97..210e4fd4e6f0b2e8ba75d22e83134e2267fbece5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 0354149d4fea06d489be61391c46e84d8b6c369a..da2213a84fe2f6bc683630e5f8760acdf3239b19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index fff0e26bc16b863a1d86d3f735da009cedcaffd9..e2c303d506e0f8f99d8ca89f29979a5999382378 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index c49fa5663d91c4601062d7b207ce2257cec6dd2c..396e774c8a4a10c4996c56c208fc4f4d432e3135 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index c961699053a4fdd71f8a2782ae463970f243c88e..8b6418d514e61536f314da88f1586cea4f29cfc5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 1911e128eb2d7b0ffe6c4ff7eeb0b4927a731bca..e8fda4c71ada65aecef59eb8012120488b0f17c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 88be9143472ad00b12688059600890f67c6f4e92..50c52d270b684bcea5105e4c9813cc62103403f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 2bbb71ece2583d283cedec37b10eda7b693baa0e..84c6b78a2b4405fe0e2a1fbb3bba0635db06a21d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..476c597fe60ced2338a38aca4c5ddd5f21b0ccf2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.experimental.LayerNormalization"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'norm_axis\', \'params_axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'-1\', \'1e-12\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f229615461dc7b781c0ba2ec6f81692d65354bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.layers.experimental"
+tf_module {
+  member {
+    name: "LayerNormalization"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index e84c9a2a8f178f0acf8305a77f6ea06c406b9888..f878c460596ccda1ce24417f2a260f7a5e69b755 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -396,6 +396,10 @@ tf_module {
     name: "ZeroPadding3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "Input"
     argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\'], "
@@ -412,6 +416,10 @@ tf_module {
     name: "concatenate"
     argspec: "args=[\'inputs\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'inputs\', \'axes\', \'normalize\'], varargs=None, keywords=kwargs, defaults=[\'False\'], "
@@ -428,6 +436,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "subtract"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ba9e57bed4100437c8b71d8b506cc2c928a9ac9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4952a76291c00bfdd73eed5412e7421887d1bab2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3c62d3bef0b9d200577f34cbe303fc7a094acc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2204f0b0f07fb2904067f4aa47576cefa2dfd272
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60706b4035e24414aff556c7749677a6b9822d22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0f1e80e8f34b00870c8d03f4c19976603676
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..466e0b40066602b9887bf380a54f391d577fa65d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-loss.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.LogLoss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogLoss\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04a9cc94201a5472a7c6158acfc4bfd48d4f74db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..674258ed8bce833057de277b1f6259272a5c16fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15a59d16c20ef52168981be78bccf682556cc073
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f8badb2b6ec2ba8dd16136c32f5d27811a0d4d9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index c198096d252cd9a3706bcbf6f1e4a1199ec7a1f7..090ab7ee3fab6fe1b24c44293324e584db40e4bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -8,6 +8,38 @@ tf_module {
     name: "CategoricalCrossentropy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogLoss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -24,10 +56,22 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -50,11 +94,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -62,11 +106,11 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8019d43c78fc62689bb4245e418518f251d0e5dc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.keras.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'AUCCurve.ROC\', \'AUCSummationMethod.INTERPOLATION\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index bad488f59b99ccbe7c6424244c86288afba51f46..5f0079762fce727000e4fdeeda3dfb012840f3ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index a1e7601a5141152c6709c46bb50b331fda69afca..a0a3ae890036161d5ec8da0af6b81495c99f9146 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6ce9e042027eb5e1186de4ff7c01230fb0e038f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 5f2c2f980777a34ed5128d8090ea7e945d9004e7..587ffddac96220dad543be2e63a0fc202d60cb10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cd173c7eb9f77d2731b790b068ce3a768f1c586
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..99414dd54ec1fd383f2f1a672f311a3b99213a00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b059e039deaaf699414c779584eff50b5b760537
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CosineProximity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
index c153e9cf4d7932b1e4bf65bd02b8de2706d4b8be..5432f7f4006b165fefb9aa028bf7d36d8cbc38f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
index aae2bd99886fbe93086186864eb6040437b872d5..75541bf285d8989f867aabc7c7025e56cce1d05d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f45a57df41e56a52f1a4b784a8c8d140f6670b25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25bcf8591d45bf2237845a914122f5f9ca9c0ad7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb2df7f459753b08426449f026129d0960d21eaf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11538c6c09dc4f0ed7de48fbf95dcc253adb1e24
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..adec3feffa1f07d4d65b63281f1a87c7469eb4cb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47b550a9e73bdd33c5d592ff86f237d821866e09
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.keras.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1ec7cb51e545077737051ff3f06509882e4d32a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af960fa54329d7d80d23f24df798509b2d12bb85
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..417f92f2734329e2382d6d6bc1ff7c399c990704
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a5218fed4cadf155f29781ff7341675b64f22fb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.keras.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
index 904a2fa9caee882775701c53a97c9aac0fd8120e..b089109baa5ea49b588e6e9a54b56256d4e4e3a2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.metrics.Mean"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2216043cec13086660c370d835209ff39ddaa324
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e91098f95bffa13fdc97406590d1ae71edd6887
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
index e81ecfe3f627f9d43ad1c673d41b70e81c783f13..9aeaa5627a9805579d6a6c4e09336a4d7994d1c5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -87,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
index f8470b94d7f52216d1c1e4342acabb404bbd8f74..748cec08668c461fcf80df6a50fd5192f99073b9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -87,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dd60bb772aa93b637fca287baf3f17104a2e16d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index b70ef32bcaf3cb243d5d22d93cdbd8188f56d4df..97aeb680be1c5c412a16e2a73e3f1bc2ca8ce6c0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 2e693269bf749260e143cf19c6e1f51a5242412f..5a7bef4714d5aa37dcd1c11048ffd370e8841f0b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -109,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea0f2a760ab7e34eb5f5b8c2ba63eea96ce63777
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index e62a2df0564a0eb4dba528dab575b7c08e41b913..85f80b062efe3d2d91104b211c8d9d75127c8c0e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15895daf42bbfbbeda419d767844fd840ab4178e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ca1c6c8396e3d79a8f6250f34e7137870a23ac6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.keras.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cf1bf21cb5acd9d81c581eecfc2e64b78bd9e70
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
index 1a524d73c0d387fe603846b5f180916829d65435..4bc9383f6ffc90972416fa031d5515a149e70425 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
index b9b4f565c5eff9ece856255ffbe15af3fb97c2df..2eae4df0ae344656bb637bc27e806876304a86f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index 905021dd790205e64a6f9839218200db98941927..7ebf43b3625ed4d1bfc5db1b10e774f9f3d1bb9d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Accuracy"
     mtype: "<type \'type\'>"
@@ -8,10 +12,26 @@ tf_module {
     name: "BinaryAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineProximity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FalseNegatives"
     mtype: "<type \'type\'>"
@@ -20,10 +40,58 @@ tf_module {
     name: "FalsePositives"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -32,6 +100,10 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SensitivityAtSpecificity"
     mtype: "<type \'type\'>"
@@ -40,10 +112,26 @@ tf_module {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -78,7 +166,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -86,15 +174,15 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index 5885cd21c1976bd7b95f7ca5bbea59eeb40b2ce8..eb1ab1d9dd61b36ed8662e25700f12f82aadb502 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -175,11 +175,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 935fa32f8c7f2d3b9c6b220a6b77a957d2c73f30..c69cf281742360d9ed4d1f7cbd35219cf04b1149 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -180,11 +180,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
index 54262697932738810406380504fba217e736b1b7..0a56293e804f583a949ecb413da0ba613e0bc876 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
index c39fe6ba4f7355e24bdaa5d7592f1ee7bd6de67f..14d0894e5622021c4961228d431d01516b752055 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
index 05d46d380bf93631ea598efb0fce256f2e33a848..fdb1ea838c04f296c3d0ee7596d73052b6f4b6d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
index 78829def67d11e422aa33e06434e78d3048382d9..ece63ec168dac58f58286dbd9fd8a8151d0dc2dc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
index 8c63a7dda98568b24ea1b3cda15d4c840fbfd804..f952f88b6d203488ea0ec4f1794d7de79a25853a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Nadam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
index 58b7f274916f378a0893b2addc99c3f4b68d108f..27bae902b0cb7f1f4e09737a83fadd95a83cc163 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index 8de796edde56b4639f1b59963383ed9f35a39f58..e523443a0099b57942c73cafcd8a919503e8db38 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
index 393eeb3d6cab4ea4c9acf3f909edc0a929d51414..d2721f8e92088c216ab748cae45e415553b9d4c1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.SGD"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 773c74e64d13ca4a840b7f599fc2cbe9c161cd03..c7a50969b54e5efc4d338caa79dea76d86bffe8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 533544d21f2753f785113a30518f4fcbcff96cd7..3900c752c8527f68af2496f99083d80fc9d18106 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index e3926eb6d4714731d09ff9c5b75a89830c06e7c1..7b876099af6a28d9fca2e5c55aeae5e4610f82a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index ba209df7824a9cc076499458e35acd7dcf1eaf35..5bddba8e798618f5b1d0cdc61ddff9725a495fe0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 081fb0e08bcd1b35ab44459d1c8eb0857dd14956..62ba8bb59e8af14447fe570ba28c5d0eba7f6af8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index 2014a04301618c20af5cf6f1144eb4dbda2479e1..0803feeabd12acb7988459fe6da2748e19b70a5f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 9a87ae9687741090485bd8d4d0d07d359a2015e7..6def32864b9cc660b94d628ccd53dc48a566ea81 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index 33afb835ce1d524991c0024bfb87c29a72aac08e..dbf1ac82d33b81c63e5c356ac736f63262797ff0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -0,0 +1,142 @@
+path: "tensorflow.linalg.LinearOperatorInversion"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index a9078c8ab5cca078237a29febabdbbd4a8b6c89c..85d902b977ceddd405abb1154a086d7bd29e7848 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -95,6 +95,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287..638d82a599248e547bcae86ebd6d8d8dc3f6aa4b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -115,6 +115,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index a87649133fd207ad59f2124c6b0b5aa44916e5a5..ab1b04bd3cb1b215b848019b6c578ce091f8f828 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 32656467840fbbc0c8708ea68aac5aa75c11a540..961969aac58b78e4edd53b47f2932f71f2d21fd5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index 49d8890c8942bc0021886ee6c9bc4e7625452655..e76738a9648123414159fdc9666a99b0577aa46e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index c89dc067b331603e227d9d578147e2dd1ee4a900..b35cd69da474a9665652f04f12b34a8d9f33fa8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index 3e1e2e3d54de3e2442299a783f933a60dfd2db6d..f9119cdd5f728f3b35d83248daff17547a497aa2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorInversion"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorKronecker"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
index 233b1a0131a4d292574be161de2d547cb0060c23..36007d3ca6f63ccf06ef613aad584c8d1c63d627 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
@@ -4,24 +4,4 @@ tf_module {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'loss\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'losses\'], "
-  }
-  member_method {
-    name: "get_losses"
-    argspec: "args=[\'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'None\', \'losses\'], "
-  }
-  member_method {
-    name: "get_regularization_loss"
-    argspec: "args=[\'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'total_regularization_loss\'], "
-  }
-  member_method {
-    name: "get_regularization_losses"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_total_loss"
-    argspec: "args=[\'add_regularization_losses\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'total_loss\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index 4ac0484050054abee9496bcf09d90ff58bbfb9d7..a0b8e9e4013d4f69cc933f6f495bdcbbe478641e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -264,6 +264,10 @@ tf_module {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "nextafter"
+    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 3547b66d19ac6b64449860160774647df855a6de..3205c6a4dcaaa00591cd957021a463b77835343e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index c36ecaa4b2b2ce14292cd2c46a986bb1387294bd..8ba92fcc8dc89958b8395aa986c358a03fedd66d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 42128ebd17234fcee3b016bbd7f1964824d1a0b6..9de73076b1197ce7bee8a00dfd7bfcd1b48a35bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
index 210b56242b27fe4a832cfe50a53626d716d8877e..b271db6a659108031aab42a397068a4a13967551 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.ones_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 87d8e2ae7c8b644373b1a7f0d4306bd79c281506..92f4704b493caba790862174c17092c51f0eb54f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -8,10 +8,6 @@ tf_module {
     name: "DType"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "Event"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "GradientTape"
     mtype: "<type \'type\'>"
@@ -40,14 +36,6 @@ tf_module {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "Summary"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SummaryMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Tensor"
     mtype: "<type \'type\'>"
@@ -80,6 +68,10 @@ tf_module {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "audio"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "autograph"
     mtype: "<type \'module\'>"
@@ -164,10 +156,6 @@ tf_module {
     name: "float64"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "glorot_uniform_initializer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "graph_util"
     mtype: "<type \'module\'>"
@@ -180,10 +168,6 @@ tf_module {
     name: "image"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "initializers"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "int16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -332,10 +316,6 @@ tf_module {
     name: "train"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "truncated_normal_initializer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "uint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -444,10 +424,6 @@ tf_module {
     name: "atanh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "batch_gather"
-    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -618,7 +594,7 @@ tf_module {
   }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'axis\', \'batch_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "gather_nd"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
index 5993fdeb9c232ebc4090d9fffd8857da8ca6ada4..b1dfc444113d28e6dad160be8786872dabc70cc8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.random_normal_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
index a434ed1599ef8b99b6e0496be388aa0e44755249..d16924a07a5f6fc11bb71f1786a691255e5c67e6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.random_uniform_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
index 7721eed65b0bea9e47bc8a3e2e5d9e54f42bb187..7781337c826e01cdc820a65a288bf9ce7e251fd0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
index 02e59a63e10b1a24bfe0c275044bf807b433f62e..a31689a58bceb91ccfb3fa91d8b778c6c25cc929 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.sparse.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 962cf9a7239343e3b570d3a6d20edeeeb871b120..e2da65eee41905c7b7c67eade11e1775a2408ca0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.strings"
 tf_module {
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
   member_method {
     name: "format"
     argspec: "args=[\'template\', \'inputs\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'{}\', \'3\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
deleted file mode 100644
index eb99d0f5334457aa654fed0553af143839328dba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.summary.Event"
-tf_proto {
-  descriptor {
-    name: "Event"
-    field {
-      name: "wall_time"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "step"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "file_version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "graph_def"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "summary"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary"
-      oneof_index: 0
-    }
-    field {
-      name: "log_message"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.LogMessage"
-      oneof_index: 0
-    }
-    field {
-      name: "session_log"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SessionLog"
-      oneof_index: 0
-    }
-    field {
-      name: "tagged_run_metadata"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TaggedRunMetadata"
-      oneof_index: 0
-    }
-    field {
-      name: "meta_graph_def"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "what"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
deleted file mode 100644
index 2a5b63dceae3c0ac27b34c2e896ee3b90bbd7f75..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-path: "tensorflow.summary.FileWriterCache"
-tf_class {
-  is_instance: "<class \'tensorflow.python.summary.writer.writer_cache.FileWriterCache\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "clear"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get"
-    argspec: "args=[\'logdir\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
deleted file mode 100644
index 6b65b0ace3cf7740ab03390841c941592000d127..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.summary.FileWriter"
-tf_class {
-  is_instance: "<class \'tensorflow.python.summary.writer.writer.FileWriter\'>"
-  is_instance: "<class \'tensorflow.python.summary.writer.writer.SummaryToEventTransformer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_event"
-    argspec: "args=[\'self\', \'event\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_graph"
-    argspec: "args=[\'self\', \'graph\', \'global_step\', \'graph_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_meta_graph"
-    argspec: "args=[\'self\', \'meta_graph_def\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_run_metadata"
-    argspec: "args=[\'self\', \'run_metadata\', \'tag\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_session_log"
-    argspec: "args=[\'self\', \'session_log\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_summary"
-    argspec: "args=[\'self\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_logdir"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reopen"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
deleted file mode 100644
index 4a8b59cf02ed46ef70f22564f3134214840600fe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.summary.SummaryDescription"
-tf_proto {
-  descriptor {
-    name: "SummaryDescription"
-    field {
-      name: "type_hint"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
deleted file mode 100644
index 8b271cf58fc11c8666abd456021afeedc0b14c7a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.summary.Summary.Audio"
-tf_proto {
-  descriptor {
-    name: "Audio"
-    field {
-      name: "sample_rate"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "num_channels"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "length_frames"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "encoded_audio_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-    field {
-      name: "content_type"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
deleted file mode 100644
index dbbc02dd0506dbcebd1690602b5786b02c3ed4a0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.summary.Summary.Image"
-tf_proto {
-  descriptor {
-    name: "Image"
-    field {
-      name: "height"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "width"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "colorspace"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "encoded_image_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
deleted file mode 100644
index 4176171cd938e383fe5366153364d8e8e8c1a1ee..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.summary.Summary.Value"
-tf_proto {
-  descriptor {
-    name: "Value"
-    field {
-      name: "node_name"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "metadata"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata"
-    }
-    field {
-      name: "simple_value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "obsolete_old_style_histogram"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "image"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Image"
-      oneof_index: 0
-    }
-    field {
-      name: "histo"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.HistogramProto"
-      oneof_index: 0
-    }
-    field {
-      name: "audio"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Audio"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
deleted file mode 100644
index d6c5e3a87a115b9bdcfd044abe93177eda2af275..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.summary.Summary"
-tf_proto {
-  descriptor {
-    name: "Summary"
-    field {
-      name: "value"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Value"
-    }
-    nested_type {
-      name: "Image"
-      field {
-        name: "height"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "width"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "colorspace"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "encoded_image_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-    nested_type {
-      name: "Audio"
-      field {
-        name: "sample_rate"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-      }
-      field {
-        name: "num_channels"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "length_frames"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "encoded_audio_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-      field {
-        name: "content_type"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    nested_type {
-      name: "Value"
-      field {
-        name: "node_name"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tag"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "metadata"
-        number: 9
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SummaryMetadata"
-      }
-      field {
-        name: "simple_value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-        oneof_index: 0
-      }
-      field {
-        name: "obsolete_old_style_histogram"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-        oneof_index: 0
-      }
-      field {
-        name: "image"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Image"
-        oneof_index: 0
-      }
-      field {
-        name: "histo"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.HistogramProto"
-        oneof_index: 0
-      }
-      field {
-        name: "audio"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Audio"
-        oneof_index: 0
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-        oneof_index: 0
-      }
-      oneof_decl {
-        name: "value"
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
deleted file mode 100644
index 27c8873320403cb2e7402ef9f1bb0e7134d5f96b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.summary.TaggedRunMetadata"
-tf_proto {
-  descriptor {
-    name: "TaggedRunMetadata"
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "run_metadata"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 61670bd15122f65ef05d20ee5d023a3c326f7757..c59f1b8474302b5529895b8aa9216a2e197d958f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -1,33 +1,9 @@
 path: "tensorflow.summary"
 tf_module {
-  member {
-    name: "Event"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "FileWriter"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FileWriterCache"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Summary"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SummaryDescription"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "SummaryWriter"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "TaggedRunMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member_method {
     name: "create_file_writer"
     argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
index 5be37200f368b1823093c67ad7042db534b0df93..42dcdac9e77a8efac875e4985f6a8f744e838ddb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.Checkpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
deleted file mode 100644
index 7d982dc51f6edce1cf691671e31ddd07664f0dc1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.ProximalGradientDescentOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.0\', \'False\', \'ProximalGradientDescent\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index c72564e5987de36a95f7f44bae2b8122dcf256c4..8b39086ed1b2ae197668ff4c89f1e42ca8b50ab4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -60,10 +60,6 @@ tf_module {
     name: "JobDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "ProximalGradientDescentOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SequenceExample"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -140,8 +136,4 @@ tf_module {
     name: "sdca_shrink_l1"
     argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "summary_iterator"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt
deleted file mode 100644
index c1e1c230a9f79e87294eb6038f870726a0ba85a4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.truncated_normal_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
index e229b02ceec6739974d3b4ae2bb02ef273398c45..53c5ac89cf79527522ae2f1cc69b451c405c90d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.zeros_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 723fceef413d86675e885debd37e73e5facd7f7c..6b9fc7ddce17cd523e0b8ce240b5da6f63fb6fce 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -59,7 +59,7 @@ _UPDATE_GOLDENS_HELP = """
 # DEFINE_boolean, only_test_core_api, default False:
 _ONLY_TEST_CORE_API_HELP = """
     Some TF APIs are being moved outside of the tensorflow/ directory. There is
-    no garuntee which versions of these APIs will be present when running this
+    no guarantee which versions of these APIs will be present when running this
     test. Therefore, do not error out on API changes in non-core TF code
     if this flag is set.
 """
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index a4cad4b6c65c35651e58495c8f1b8b4c5b5f38d8..f5a28ff16352d5428ac698f2cc7f73b0b1ba3394 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
@@ -7,6 +7,12 @@ LABEL maintainer="Jan Prach <jendap@google.com>"
 RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
 
+# Installs TensorRT, which is not included in NVIDIA Docker containers.
+RUN apt-get update \
+        && apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0
+
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 ARG DEBIAN_FRONTEND=noninteractive
@@ -24,7 +30,7 @@ COPY install/.bazelrc /etc/bazel.bazelrc
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 
 # Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
+RUN mkdir /usr/local/cuda/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
index 7e5860aeec186d908e5d2884bd690b2e5e43cffa..500fb6e0b3a995a91f0faf6555e2e248babbfda1 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -1,3 +1,8 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cpu \
+#       --tag "gcr.io/tensorflow-testing/nosla-ubuntu16.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-ubuntu16.04
+
 FROM launcher.gcr.io/google/rbe-ubuntu16-04:r327695
 LABEL maintainer="Yu Yi <yiyu@google.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
index 4fe86066c91b2baa665070a6fd9d34ebc74bdab7..c6099c9e45115bfb84be6d3721fbf62088614801 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -1,9 +1,9 @@
 # To push a new version, run:
 # $ docker build -f Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 \
-#       --tag "gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04" .
-# $ docker push gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04
+#       --tag "gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04
 
-FROM ubuntu:14.04
+FROM gcr.io/clang-docker-builder/clang-ubuntu14_04
 LABEL maintainer="Manuel Klimek <klimek@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
@@ -19,7 +19,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates
 ENV CUDA_VERSION 10.0.130
 ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
 ENV CUDNN_VERSION 7.3.1.20
-ENV NCCL_VERSION 2.3.5
 ENV TENSORRT_VERSION 5.0.2
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
@@ -48,25 +47,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
-        libnccl2=$NCCL_VERSION-2+cuda10.0 \
-        libnccl-dev=$NCCL_VERSION-2+cuda10.0 \
         nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda10.0 && \
     apt-get update && apt-get install -y --no-install-recommends \
         libnvinfer5=$TENSORRT_VERSION-1+cuda10.0 \
         libnvinfer-dev=$TENSORRT_VERSION-1+cuda10.0 && \
     ln -s cuda-10.0 /usr/local/cuda && \
     apt-mark hold libcudnn7 && \
-    apt-mark hold libnccl2 && \
     rm -rf /var/lib/apt/lists/*
 
 # TODO(b/110903506): Provide a link to the SONAME of libcuda.so.
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
-# libnccl is resolved, delete this block.
-RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
- && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
+# Install a newer version of g++:
+# - we need a new libstdc++, because new clang versions do not work with a stock
+#   ubuntu 14.04 libstdc++.
+# - we want to compile with g++-7 to get ahead of LLVM dropping support for
+#   gcc 4.8.
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends g++-7 && \
+    rm -rf /var/lib/apt/lists/*
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index 60a23e1edbced8dbef738e290353cdfb60ea86a6..4ce4214065fbddd4769a4a35941e3b752aa49c9c 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -1,7 +1,7 @@
 # To push a new version, run:
 # $ docker build -f Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 \
-#       --tag "gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04" .
-# $ docker push gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04
+#       --tag "gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04
 #
 # TODO(klimek): Include clang in this image so we can also target clang
 # builds.
@@ -25,7 +25,6 @@ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0"
-ENV NCCL_VERSION 2.2.13
 ENV TENSORRT_VERSION 5.0.2
 ENV CUDNN_VERSION 7.1.4.18
 
@@ -45,14 +44,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cudart-$CUDA_PKG_VERSION \
         cuda-libraries-$CUDA_PKG_VERSION \
         cuda-cublas-9-0=9.0.176.4-1 \
-        libnccl2=$NCCL_VERSION-1+cuda9.0 \
         cuda-libraries-dev-$CUDA_PKG_VERSION \
         cuda-nvml-dev-$CUDA_PKG_VERSION \
         cuda-minimal-build-$CUDA_PKG_VERSION \
         cuda-command-line-tools-$CUDA_PKG_VERSION \
         cuda-core-9-0=9.0.176.3-1 \
         cuda-cublas-dev-9-0=9.0.176.4-1 \
-        libnccl-dev=$NCCL_VERSION-1+cuda9.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 \
         libcudnn7=$CUDNN_VERSION-1+cuda9.0 \
         nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda9.0 && \
@@ -60,7 +57,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libnvinfer5=$TENSORRT_VERSION-1+cuda9.0 \
         libnvinfer-dev=$TENSORRT_VERSION-1+cuda9.0 && \
     ln -s cuda-9.0 /usr/local/cuda && \
-    apt-mark hold libnccl2 && \
     apt-mark hold libcudnn7 libcudnn7-dev && \
     rm -rf /var/lib/apt/lists/*
 
@@ -71,11 +67,6 @@ RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
-# libnccl is resolved, delete this block.
-RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
- && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
-
 # Install a newer version of libstdc++, as new clang versions do not work
 # with the stock ubuntu 14.04 libstdc++.
 RUN apt-get update && \
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
index b65620583676f7ae2a4e849e33df05a18c4c9a24..c4912a65b65d61c6154be5083805d430d697f662 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Nick Lopez <ngiraldo@google.com>"
 
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index 7d5cf3f8439e223e0e8591333e727b2e58ca275c..a095633a22e8b24a4561ad3e13902a34424717ae 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -88,7 +88,8 @@ if [[ ${IS_GPU} == "1" ]]; then
   PIP_TEST_FILTER_TAG="-no_gpu,-no_pip_gpu,${PIP_TEST_FILTER_TAG}"
 fi
 if [[ ${IS_MAC} == "1" ]]; then
-  PIP_TEST_FILTER_TAG="-nomac,${PIP_TEST_FILTER_TAG}"
+  # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+  PIP_TEST_FILTER_TAG="-nomac,-no_mac,${PIP_TEST_FILTER_TAG}"
 fi
 
 # Bazel flags we need for all tests:
diff --git a/tensorflow/tools/ci_build/builds/test_tutorials.sh b/tensorflow/tools/ci_build/builds/test_tutorials.sh
index db335f14ca4f88ade7a540ffab7ed9de67f1248e..a12827a2d3c9d4bf643d26ae854f544e614934dd 100755
--- a/tensorflow/tools/ci_build/builds/test_tutorials.sh
+++ b/tensorflow/tools/ci_build/builds/test_tutorials.sh
@@ -33,7 +33,7 @@
 #
 
 # List of all tutorial tests to run, separated by spaces
-TUT_TESTS="mnist_softmax mnist_with_summaries word2vec estimator_abalone"
+TUT_TESTS="mnist_with_summaries word2vec"
 
 if [[ -z "${TUT_TESTS_BLACKLIST}" ]]; then
   TF_BUILD_TUT_TEST_BLACKLIST=""
@@ -212,16 +212,6 @@ test_word2vec() {
 }
 
 
-# -----------------------------------------------------------
-# Estimator: abalone
-test_estimator_abalone() {
-  LOG_FILE=$1
-
-  run_in_directory "${TEST_DIR}" "${LOG_FILE}" \
-    "tensorflow/examples/tutorials/estimators/abalone.py"
-}
-
-
 # -----------------------------------------------------------
 # ptb_word_lm
 test_ptb_word_lm() {
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index 25ecee472524d5346252772b3058a5e824eef217..9da9c3b881ed14c4cebd3dd641c23d9cfd6f6708 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -239,8 +239,15 @@ function run_op() {
   fi
 }
 
+printf "\nTesting execution of user-defined op under graph mode:\n\n"
 run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")"
-run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}).numpy())")" " in eager mode"
+
+if [[ "${IS_GPU}" == "0" ]]; then
+  printf "\nTesting execution of user-defined op under eager mode:\n\n"
+  run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}).numpy())")" " in eager mode"
+else
+  printf "\nSKIPPING the testing of execution of user-defined GPU kernel under eager mode. See b/122972785.\n\n"
+fi
 
 
 popd
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 435ec7ca68fc28362b9b546f977b24e003e55d2f..62c1e014d5eedecbbb07ed349914e8428c58785b 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -398,7 +398,8 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
         NEW_ITEM="${NEW_ITEM},-benchmark-test"
       fi
       if [[ ${IS_MAC} == "1" ]] && [[ ${NEW_ITEM} != *"nomac"* ]]; then
-        NEW_ITEM="${NEW_ITEM},-nomac"
+        # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+        NEW_ITEM="${NEW_ITEM},-nomac,-no_mac"
       fi
       EXTRA_ARGS="${EXTRA_ARGS} ${NEW_ITEM}"
     else
@@ -408,11 +409,13 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
 else
   EXTRA_ARGS="${EXTRA_ARGS} ${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
-    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+    # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac,-no_mac"
   fi
   EXTRA_ARGS="${EXTRA_ARGS} --build_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
-    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+    # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac,-no_mac"
   fi
 fi
 
@@ -610,6 +613,13 @@ if [[ "${DO_DOCKER}" == "1" ]]; then
   fi
 fi
 
+# Set a disk usage trap.
+function debug_disk_usage {
+    echo "Finished script... disk usage report in ${TMP_DIR}"
+    du -k -d 2 ${TMP_DIR} | sort -n -r
+}
+# trap debug_disk_usage EXIT
+
 chmod +x ${TMP_SCRIPT}
 
 # Map TF_BUILD container types to containers we actually have.
@@ -645,6 +655,8 @@ echo ""
 echo "Parameterized build ends with ${RESULT} at: $(date) "\
 "(Elapsed time: $((END_TIME - START_TIME)) s)"
 
+# Dump disk usage
+debug_disk_usage
 
 # Clean up temporary directory if it exists
 if [[ ! -z "${TMP_DIR}" ]]; then
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 148526492d25e9acebe036294175e2814b2ead12..40a744374564d3ad3e663de8453d4085202c4e0c 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -33,7 +33,7 @@ import tempfile
 import zipfile
 
 TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
-                    "\.\d.dev[\d]{0,8})-(.+)\.whl")
+                    r"\.\d.dev[\d]{0,8})-(.+)\.whl")
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
 
diff --git a/tensorflow/tools/ci_build/install/.bazelrc b/tensorflow/tools/ci_build/install/.bazelrc
index 2060babd4a450a0fc318f8b9ee5cb8536d57189c..4662e2e60a16e8dd675ee2131a94fb313b9d5b8b 100644
--- a/tensorflow/tools/ci_build/install/.bazelrc
+++ b/tensorflow/tools/ci_build/install/.bazelrc
@@ -5,7 +5,7 @@ startup --batch
 
 # Similarly, we need to workaround sandboxing issues:
 #   https://github.com/bazelbuild/bazel/issues/418
-build  --verbose_failures --spawn_strategy=standalone --genrule_strategy=standalone
+build  --verbose_failures --spawn_strategy=standalone --strategy=Genrule=standalone
 test --spawn_strategy=standalone
 
 # Force bazel output to use colors (good for jenkins) and print useful errors.
diff --git a/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh b/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh
index 9d8e3df3b5c3e192b987718318465c14184d4045..b6734e55226842fc54667fbdf3a349c321e45edd 100755
--- a/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh
+++ b/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh
@@ -25,5 +25,4 @@ apt-get install -y libpython3-all-dev:armhf
 echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
 curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add -
 apt-get update
-rm -rf /usr/local/bin/bazel
-apt-get install -y bazel python3 python3-numpy python3-dev python3-pip
+apt-get install -y python3 python3-numpy python3-dev python3-pip
diff --git a/tensorflow/tools/ci_build/install/install_pi_toolchain.sh b/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
index 03c43cc83805fbde8576b9d170c1d3d6c3993625..0238cc5895ff3e848f974ee464f77450f92a3f22 100755
--- a/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
+++ b/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
@@ -25,5 +25,4 @@ apt-get install -y libpython-all-dev:armhf
 echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
 curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add -
 apt-get update
-rm -rf /usr/local/bin/bazel
-apt-get install -y bazel python python-numpy python-dev python-pip
+apt-get install -y python python-numpy python-dev python-pip
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index eeadabaa737d500759b6c22bcccd016f49d0ab6a..3878452b695069c7fc6881c1825c06a74fe038e3 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -18,10 +18,10 @@ set -e
 
 # We don't apt-get install so that we can install a newer version of pip.
 # Only needed for Ubuntu 14.04 and 16.04; not needed for 18.04 and Debian 8,9?
-# Run easy_install before easy_install3, so that the default pip points to pip2,
+# Run easy_install after easy_install3, so that the default pip points to pip2,
 # to match the default python version of 2.7.
-easy_install3 -U pip==9.0.3
-easy_install -U pip==9.0.3
+easy_install3 -U pip==18.1
+easy_install -U pip==18.1
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
@@ -60,8 +60,8 @@ pip2 install --upgrade markdown==2.6.8
 pip3 install --upgrade markdown==2.6.8
 
 # Install protobuf.
-pip2 install --upgrade protobuf==3.6.0
-pip3 install --upgrade protobuf==3.6.0
+pip2 install --upgrade protobuf==3.6.1
+pip3 install --upgrade protobuf==3.6.1
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh
index 821d50baff325106fceca368d46042401d13c336..3cb100856706558cacf6d2b601e2b34fd194082e 100755
--- a/tensorflow/tools/ci_build/install/install_proto3.sh
+++ b/tensorflow/tools/ci_build/install/install_proto3.sh
@@ -17,7 +17,7 @@
 # Install protobuf3.
 
 # Select protobuf version.
-PROTOBUF_VERSION="3.6.0"
+PROTOBUF_VERSION="3.6.1"
 protobuf_ver_flat=$(echo $PROTOBUF_VERSION | sed 's/\.//g' | sed 's/^0*//g')
 local_protobuf_ver=$(protoc --version)
 local_protobuf_ver_flat=$(echo $local_protobuf_ver | sed 's/\.//g' | sed 's/^0*//g')
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 62e04df717316ffc8cf211a6887730be115623be..a58f49af2867812961675b7db61415b94febef39 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -52,7 +52,7 @@ pip3.5 install --upgrade absl-py
 pip3.5 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.5 install --upgrade protobuf==3.6.0
+pip3.5 install --upgrade protobuf==3.6.1
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -90,4 +90,7 @@ pip3.5 install keras_applications==1.0.6
 pip3.5 install keras_preprocessing==1.0.5
 pip3.5 install --upgrade h5py==2.8.0
 
+# Estimator
+pip3.5 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 48d556b1dd8e3e17b763b9c71e78e1d551554703..b1c2a0ab00a344df2dd26c74440bdb4a95ac410a 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -64,7 +64,7 @@ pip3 install --upgrade absl-py
 pip3 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3 install --upgrade protobuf==3.6.0
+pip3 install --upgrade protobuf==3.6.1
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -107,4 +107,7 @@ pip3 install --upgrade h5py==2.8.0
 pip3 install keras_applications==1.0.6
 pip3 install keras_preprocessing==1.0.5
 
+# Estimator
+pip3 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index 7be5f454ecd6344cc1b0b79789c2b18acefc448d..a8b73cbe0cfe7fda70483a8b10fee2a7648b138a 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -36,4 +36,4 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=cc,py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
     --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \
-    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... -//tensorflow/lite/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index 3efd994d783d8f47b3471cc5ce177293b1e017cc..1184d4acec61f36cc630df313d403d33d73e1e7a 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -31,6 +31,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
+# TODO(b/122370901): Fix nomac, no_mac inconsistency.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium --config=opt \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index adee0d3171fe13261f177a6f8a3b55aeb5789cc5..d39340b1d83dde254a00fea1ff6090e1df2d10ae 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -32,6 +32,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
+# TODO(b/122370901): Fix nomac, no_mac inconsistency.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 --config=opt \
     --announce_rc \
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 864278c6477b4b1e7e9bc3836e3e3d102d086530..987f0769b2d6da4631b6f408af4dbf62d9099f76 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -107,6 +107,7 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-funsafe-math-optimizations --copt=-ftree-vectorize \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
+  --define tensorflow_mkldnn_contraction_kernel=0 \
   --verbose_failures \
   //tensorflow:libtensorflow.so \
   //tensorflow:libtensorflow_framework.so \
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 4373d464b6a9f8cf6d498652d7afeed507a666ba..117627c458c3326735deb667b484c642b25a2ba9 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -84,19 +84,26 @@ class Version(object):
       identifier_string: extension string eg. (-rc0)
       version_type: version parameter ((REGULAR|NIGHTLY)_VERSION)
     """
-    self.string = "%s.%s.%s%s" % (major,
-                                  minor,
-                                  patch,
-                                  identifier_string)
     self.major = major
     self.minor = minor
     self.patch = patch
     self.identifier_string = identifier_string
     self.version_type = version_type
+    self._update_string()
+
+  def _update_string(self):
+    self.string = "%s.%s.%s%s" % (self.major,
+                                  self.minor,
+                                  self.patch,
+                                  self.identifier_string)
 
   def __str__(self):
     return self.string
 
+  def set_identifier_string(self, identifier_string):
+    self.identifier_string = identifier_string
+    self._update_string()
+
   @property
   def pep_440_str(self):
     if self.version_type == REGULAR_VERSION:
@@ -283,15 +290,14 @@ def main():
   """
 
   parser = argparse.ArgumentParser(description="Cherry picking automation.")
-  group = parser.add_mutually_exclusive_group(required=True)
 
   # Arg information
-  group.add_argument("--version",
-                     help="<new_major_ver>.<new_minor_ver>.<new_patch_ver>",
-                     default="")
-  group.add_argument("--nightly",
-                     help="disable the service provisioning step",
-                     action="store_true")
+  parser.add_argument("--version",
+                      help="<new_major_ver>.<new_minor_ver>.<new_patch_ver>",
+                      default="")
+  parser.add_argument("--nightly",
+                      help="disable the service provisioning step",
+                      action="store_true")
 
   args = parser.parse_args()
 
@@ -299,13 +305,17 @@ def main():
   old_version = get_current_semver_version()
 
   if args.nightly:
-    # Dev minor version is one ahead of official.
-    nightly_minor_ver = int(old_version.minor) + 1
-    new_version = Version(old_version.major,
-                          str(nightly_minor_ver),
-                          old_version.patch,
-                          "-dev" + time.strftime("%Y%m%d"),
-                          NIGHTLY_VERSION)
+    if args.version:
+      new_version = Version.parse_from_string(args.version, NIGHTLY_VERSION)
+      new_version.set_identifier_string("-dev" + time.strftime("%Y%m%d"))
+    else:
+      # Dev minor version is one ahead of official.
+      nightly_minor_ver = int(old_version.minor) + 1
+      new_version = Version(old_version.major,
+                            str(nightly_minor_ver),
+                            old_version.patch,
+                            "-dev" + time.strftime("%Y%m%d"),
+                            NIGHTLY_VERSION)
   else:
     new_version = Version.parse_from_string(args.version, REGULAR_VERSION)
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 7ec386a60ecc379218f5d67a89240191761b4120..9741fba1c8016b89fa17384faee0fc8738c72307 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -104,7 +104,11 @@ else
 fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
-  python tensorflow/tools/ci_build/update_version.py --nightly
+  if [[ ${PROJECT_NAME} == *"2.0_preview"* ]]; then
+    python tensorflow/tools/ci_build/update_version.py --version=2.0.0 --nightly
+  else
+    python tensorflow/tools/ci_build/update_version.py --nightly
+  fi
   if [ -z ${PROJECT_NAME} ]; then
     EXTRA_PIP_FLAGS="--nightly_flag"
   else
@@ -121,6 +125,10 @@ fi
 
 run_configure_for_cpu_build
 
+bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS}  \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
+  tensorflow/lite:framework tensorflow/lite/examples/minimal:minimal || exit $?
+
 bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS} \
   tensorflow/tools/pip_package:build_pip_package \
   --incompatible_remove_native_http_archive=false || exit $?
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index c076157b082ebc60784373be1305665fe0bc84af..efdd5f13c87e187c84e6e1d11770ebdb91e9df41 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -104,7 +104,11 @@ else
 fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
-  python tensorflow/tools/ci_build/update_version.py --nightly
+  if [[ ${PROJECT_NAME} == *"2.0_preview"* ]]; then
+    python tensorflow/tools/ci_build/update_version.py --version=2.0.0 --nightly
+  else
+    python tensorflow/tools/ci_build/update_version.py --nightly
+  fi
   if [ -z ${PROJECT_NAME} ]; then
     EXTRA_PIP_FLAGS="--nightly_flag"
   else
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 7dfee8f371b8c4795fe748d1fd02ee8d884f18f9..9c05db974b4e30c2997a9c0d11f792ae52587eb5 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt --copt=/arch:AVX --announce_rc \
+bazel --output_user_root=${TMPDIR} build -c opt --copt=/arch:AVX --announce_rc \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index a9902d77f5ec103fe2000a4a470d425e3998f45e..31dbc02963d60a4943f0683252c86ea0ba1610c0 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -1,17 +1,21 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_copts",  # @unused
     "tf_cc_test",  # @unused
 )
 
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
 py_library(
     name = "ast_edits",
     srcs = ["ast_edits.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "@pasta",
+        "@six_archive//:six",
+    ],
 )
 
 py_test(
@@ -65,6 +69,7 @@ py_library(
         ":ast_edits",
         ":renames_v2",
         ":reorders_v2",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index eac2150502d6511da127a42fbb46c92bea7fe364..eabb0be4e6cf62d866a620de9c0b1fcd125683bb 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import ast
-import collections
 import os
 import re
 import shutil
@@ -27,11 +26,42 @@ import sys
 import tempfile
 import traceback
 
+import pasta
+import six
+
 # Some regular expressions we will need for parsing
 FIND_OPEN = re.compile(r"^\s*(\[).*$")
 FIND_STRING_CHARS = re.compile(r"['\"]")
 
 
+INFO = "INFO"
+WARNING = "WARNING"
+ERROR = "ERROR"
+
+
+def full_name_node(name, ctx=ast.Load()):
+  """Make an Attribute or Name node for name.
+
+  Translate a qualified name into nested Attribute nodes (and a Name node).
+
+  Args:
+    name: The name to translate to a node.
+    ctx: What context this name is used in. Defaults to Load()
+
+  Returns:
+    A Name or Attribute node.
+  """
+  names = name.split(".")
+  names.reverse()
+  node = ast.Name(id=names.pop(), ctx=ast.Load())
+  while names:
+    node = ast.Attribute(value=node, attr=names.pop(), ctx=ast.Load())
+
+  # Change outermost ctx to the one given to us (inner ones should be Load).
+  node.ctx = ctx
+  return node
+
+
 class APIChangeSpec(object):
   """This class defines the transformations that need to happen.
 
@@ -44,262 +74,261 @@ class APIChangeSpec(object):
     notifications)
   * `function_reorders`: maps functions whose argument order has changed to the
     list of arguments in the new order
-  * `function_handle`: maps function names to custom handlers for the function
   * `function_warnings`: maps full names of functions to warnings that will be
     printed out if the function is used. (e.g. tf.nn.convolution())
-  * `unrestricted_function_warnings`: maps names of functions to warnings that
-    will be printed out when the function is used (e.g. foo.convolution()).
+  * `function_transformers`: maps function names to custom handlers
 
   For an example, see `TFAPIChangeSpec`.
   """
 
 
-class _FileEditTuple(
-    collections.namedtuple("_FileEditTuple",
-                           ["comment", "line", "start", "old", "new"])):
-  """Each edit that is recorded by a _FileEditRecorder.
-
-  Fields:
-    comment: A description of the edit and why it was made.
-    line: The line number in the file where the edit occurs (1-indexed).
-    start: The column number in the file where the edit occurs (0-indexed).
-    old: text string to remove (this must match what was in file).
-    new: text string to add in place of `old`.
-  """
-
-  __slots__ = ()
-
-
-class _FileEditRecorder(object):
-  """Record changes that need to be done to the file."""
-
-  def __init__(self, filename):
-    # all edits are lists of chars
-    self._filename = filename
-
-    self._line_to_edit = collections.defaultdict(list)
-    self._errors = []
-
-  def process(self, text):
-    """Process a list of strings, each corresponding to the recorded changes.
-
-    Args:
-      text: A list of lines of text (assumed to contain newlines)
-    Returns:
-      A tuple of the modified text and a textual description of what is done.
-    Raises:
-      ValueError: if substitution source location does not have expected text.
-    """
-
-    change_report = ""
-
-    # Iterate of each line
-    for line, edits in self._line_to_edit.items():
-      offset = 0
-      # sort by column so that edits are processed in order in order to make
-      # indexing adjustments cumulative for changes that change the string
-      # length
-      edits.sort(key=lambda x: x.start)
-
-      # Extract each line to a list of characters, because mutable lists
-      # are editable, unlike immutable strings.
-      char_array = list(text[line - 1])
-
-      # Record a description of the change
-      change_report += "%r Line %d\n" % (self._filename, line)
-      change_report += "-" * 80 + "\n\n"
-      for e in edits:
-        change_report += "%s\n" % e.comment
-      change_report += "\n    Old: %s" % (text[line - 1])
-
-      # Make underscore buffers for underlining where in the line the edit was
-      change_list = [" "] * len(text[line - 1])
-      change_list_new = [" "] * len(text[line - 1])
-
-      # Iterate for each edit
-      for e in edits:
-        # Create effective start, end by accounting for change in length due
-        # to previous edits
-        start_eff = e.start + offset
-        end_eff = start_eff + len(e.old)
-
-        # Make sure the edit is changing what it should be changing
-        old_actual = "".join(char_array[start_eff:end_eff])
-        if old_actual != e.old:
-          raise ValueError("Expected text %r but got %r" %
-                           ("".join(e.old), "".join(old_actual)))
-        # Make the edit
-        char_array[start_eff:end_eff] = list(e.new)
-
-        # Create the underline highlighting of the before and after
-        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
-        change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
-        # Keep track of how to generate effective ranges
-        offset += len(e.new) - len(e.old)
-
-      # Finish the report comment
-      change_report += "         %s\n" % "".join(change_list)
-      text[line - 1] = "".join(char_array)
-      change_report += "    New: %s" % (text[line - 1])
-      change_report += "         %s\n\n" % "".join(change_list_new)
-    return "".join(text), change_report, self._errors
-
-  def add(self, comment, line, start, old, new, error=None):
-    """Add a new change that is needed.
-
-    Args:
-      comment: A description of what was changed
-      line: Line number (1 indexed)
-      start: Column offset (0 indexed)
-      old: old text
-      new: new text
-      error: this "edit" is something that cannot be fixed automatically
-    Returns:
-      None
-    """
-
-    self._line_to_edit[line].append(
-        _FileEditTuple(comment, line, start, old, new))
-    if error:
-      self._errors.append("%s:%d: %s" % (self._filename, line, error))
-
-
-class _ASTCallVisitor(ast.NodeVisitor):
+class _PastaEditVisitor(ast.NodeVisitor):
   """AST Visitor that processes function calls.
 
   Updates function calls from old API version to new API version using a given
   change spec.
   """
 
-  def __init__(self, filename, lines, api_change_spec):
-    self._filename = filename
-    self._file_edit = _FileEditRecorder(filename)
-    self._lines = lines
+  def __init__(self, api_change_spec):
     self._api_change_spec = api_change_spec
+    self._log = []   # Holds 4-tuples: severity, line, col, msg.
+    self._stack = []  # Allow easy access to parents.
 
-  def process(self, lines):
-    return self._file_edit.process(lines)
+  # Overridden to maintain a stack of nodes to allow for parent access
+  def visit(self, node):
+    self._stack.append(node)
+    super(_PastaEditVisitor, self).visit(node)
+    self._stack.pop()
 
-  def generic_visit(self, node):
-    ast.NodeVisitor.generic_visit(self, node)
+  @property
+  def errors(self):
+    return [log for log in self._log if log[0] == ERROR]
 
-  def _rename_functions(self, node, full_name):
-    symbol_renames = self._api_change_spec.symbol_renames
-    try:
-      new_name = symbol_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
-                          node.lineno, node.col_offset, full_name, new_name)
-    except KeyError:
-      pass
+  @property
+  def warnings(self):
+    return [log for log in self._log if log[0] == WARNING]
 
-  def _print_warning_for_function(self, node, full_name):
-    function_warnings = self._api_change_spec.function_warnings
-    try:
-      warning_message = function_warnings[full_name]
-      warning_message = warning_message.replace("<function name>", full_name)
-      self._file_edit.add(warning_message,
-                          node.lineno, node.col_offset, full_name, full_name,
-                          error="%s requires manual check." % full_name)
-    except KeyError:
-      pass
+  @property
+  def warnings_and_errors(self):
+    return [log for log in self._log if log[0] in (WARNING, ERROR)]
 
-  def _print_warning_for_function_unrestricted(self, node):
-    """Print a warning when specific functions are called.
+  @property
+  def info(self):
+    return [log for log in self._log if log[0] == INFO]
 
-    The function _print_warning_for_function matches the full name of the called
-    function, e.g., tf.foo.bar(). This function matches the function name that
-    is called, as long as the function is an attribute. For example,
-    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
+  @property
+  def log(self):
+    return self._log
+
+  def add_log(self, severity, lineno, col, msg):
+    self._log.append((severity, lineno, col, msg))
+    print("%s line %d:%d: %s" % (severity, lineno, col, msg))
+
+  def add_logs(self, logs):
+    """Record a log and print it.
+
+    The log should be a tuple `(severity, lineno, col_offset, msg)`, which will
+    be printed and recorded. It is part of the log available in the `self.log`
+    property.
 
     Args:
-      node: ast.Call object
+      logs: The logs to add. Must be a list of tuples
+        `(severity, lineno, col_offset, msg)`.
     """
-    function_warnings = getattr(
-        self._api_change_spec, "unrestricted_function_warnings", {})
-    if isinstance(node.func, ast.Attribute):
-      function_name = node.func.attr
-      try:
-        warning_message = function_warnings[function_name]
-        self._file_edit.add(warning_message,
-                            node.lineno, node.col_offset, "", "",
-                            error="%s requires manual check." % function_name)
-      except KeyError:
-        pass
-
-  def _get_attribute_full_path(self, node):
-    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
+    self._log.extend(logs)
+    for log in logs:
+      print("%s line %d:%d: %s" % log)
+
+  def _get_applicable_entries(self, transformer_field, full_name, name):
+    """Get all list entries indexed by name that apply to full_name or name."""
+    # Transformers are indexed to full name, name, or no name
+    # as a performance optimization.
+    function_transformers = getattr(self._api_change_spec,
+                                    transformer_field, {})
+
+    glob_name = "*." + name if name else None
+    transformers = []
+    if full_name in function_transformers:
+      transformers.append(function_transformers[full_name])
+    if glob_name in function_transformers:
+      transformers.append(function_transformers[glob_name])
+    if "*" in function_transformers:
+      transformers.append(function_transformers["*"])
+    return transformers
+
+  def _get_applicable_dict(self, transformer_field, full_name, name):
+    """Get all dict entries indexed by name that apply to full_name or name."""
+    # Transformers are indexed to full name, name, or no name
+    # as a performance optimization.
+    function_transformers = getattr(self._api_change_spec,
+                                    transformer_field, {})
+
+    glob_name = "*." + name if name else None
+    transformers = function_transformers.get("*", {}).copy()
+    transformers.update(function_transformers.get(glob_name, {}))
+    transformers.update(function_transformers.get(full_name, {}))
+    return transformers
+
+  def _get_full_name(self, node):
+    """Traverse an Attribute node to generate a full name, e.g., "tf.foo.bar".
+
+    This is the inverse of `full_name_node`.
 
     Args:
       node: A Node of type Attribute.
 
     Returns:
-      a '.'-delimited full-name or None if the tree was not a simple form.
+      a '.'-delimited full-name or None if node was not Attribute or Name.
       i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
     """
     curr = node
     items = []
     while not isinstance(curr, ast.Name):
       if not isinstance(curr, ast.Attribute):
-        return None, None
+        return None
       items.append(curr.attr)
       curr = curr.value
     items.append(curr.id)
-    return ".".join(reversed(items)), items[0]
+    return ".".join(reversed(items))
 
-  def _find_true_position(self, node):
-    """Return correct line number and column offset for a given node.
+  def _maybe_add_warning(self, node, full_name):
+    """Adds an error to be printed about full_name at node."""
+    function_warnings = self._api_change_spec.function_warnings
+    if full_name in function_warnings:
+      warning_message = function_warnings[full_name]
+      warning_message = warning_message.replace("<function name>", full_name)
+      self.add_log(WARNING, node.lineno, node.col_offset,
+                   "%s requires manual check. %s" % (full_name,
+                                                     warning_message))
+      return True
+    else:
+      return False
 
-    This is necessary mainly because ListComp's location reporting reports
-    the next token after the list comprehension list opening.
+  def _maybe_add_call_warning(self, node, full_name, name):
+    """Print a warning when specific functions are called with selected args.
 
-    Returns:
-      lineno, offset for the given node
+    The function _print_warning_for_function matches the full name of the called
+    function, e.g., tf.foo.bar(). This function matches the function name that
+    is called, as long as the function is an attribute. For example,
+    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
 
     Args:
-      node: Node for which we wish to know the lineno and col_offset
+      node: ast.Call object
+      full_name: The precomputed full name of the callable, if one exists, None
+        otherwise.
+      name: The precomputed name of the callable, if one exists, None otherwise.
+
+    Returns:
+      Whether an error was recorded.
     """
-    if isinstance(node, ast.ListComp):
-      # Strangely, ast.ListComp returns the col_offset of the first token
-      # after the '[' token which appears to be a bug. Workaround by
-      # explicitly finding the real start of the list comprehension.
-      line = node.lineno
-      col = node.col_offset
-      # loop over lines
-      while 1:
-        # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line - 1]
-        reversed_preceding_text = text[:col][::-1]
-        # First find if a [ can be found with only whitespace between it and
-        # col.
-        m = FIND_OPEN.match(reversed_preceding_text)
-        if m:
-          new_col_offset = col - m.start(1) - 1
-          return line, new_col_offset
+    # Only look for *.-warnings here, the other will be handled by the Attribute
+    # visitor. Also, do not warn for bare functions, only if the call func is
+    # an attribute.
+    warned = False
+    if isinstance(node.func, ast.Attribute):
+      warned = self._maybe_add_warning(node, "*." + name)
+
+    # All arg warnings are handled here, since only we have the args
+    arg_warnings = self._get_applicable_dict("function_arg_warnings",
+                                             full_name, name)
+
+    used_args = [kw.arg for kw in node.keywords]
+    for (kwarg, arg), warning in arg_warnings.items():
+      if kwarg in used_args or len(node.args) > arg:
+        warned = True
+        warning_message = warning.replace("<function name>", full_name or name)
+        self.add_log(WARNING, node.lineno, node.col_offset,
+                     "%s called with %s argument requires manual check: %s." %
+                     (full_name or name, kwarg, warning_message))
+
+    return warned
+
+  def _maybe_rename(self, parent, node, full_name):
+    """Replace node (Attribute or Name) with a node representing full_name."""
+    new_name = self._api_change_spec.symbol_renames.get(full_name, None)
+    if new_name:
+      self.add_log(INFO, node.lineno, node.col_offset,
+                   "Renamed %r to %r" % (full_name, new_name))
+      new_node = full_name_node(new_name, node.ctx)
+      ast.copy_location(new_node, node)
+      pasta.ast_utils.replace_child(parent, node, new_node)
+      return True
+    else:
+      return False
+
+  def _maybe_change_to_function_call(self, parent, node, full_name):
+    """Wraps node (typically, an Attribute or Expr) in a Call."""
+    if full_name in self._api_change_spec.change_to_function:
+      if not isinstance(parent, ast.Call):
+        # ast.Call's constructor is really picky about how many arguments it
+        # wants, and also, it changed between Py2 and Py3.
+        if six.PY2:
+          new_node = ast.Call(node, [], [], None, None)
+        else:
+          new_node = ast.Call(node, [], [])
+        pasta.ast_utils.replace_child(parent, node, new_node)
+        ast.copy_location(new_node, node)
+        self.add_log(INFO, node.lineno, node.col_offset,
+                     "Changed %r to a function call" % full_name)
+        return True
+    return False
+
+  def _maybe_add_arg_names(self, node, full_name):
+    """Make args into keyword args if function called full_name requires it."""
+    function_reorders = self._api_change_spec.function_reorders
+
+    if full_name in function_reorders:
+      reordered = function_reorders[full_name]
+      new_keywords = []
+      for idx, arg in enumerate(node.args):
+        if sys.version_info[:2] >= (3, 5) and isinstance(arg, ast.Starred):
+          continue  # Can't move Starred to keywords
+        keyword_arg = reordered[idx]
+        keyword = ast.keyword(arg=keyword_arg, value=arg)
+        new_keywords.append(keyword)
+
+      if new_keywords:
+        self.add_log(INFO, node.lineno, node.col_offset,
+                     "Added keywords to args of function %r" % full_name)
+        node.args = []
+        node.keywords = new_keywords + (node.keywords or [])
+        return True
+    return False
+
+  def _maybe_modify_args(self, node, full_name, name):
+    """Rename keyword args if the function called full_name requires it."""
+    renamed_keywords = self._get_applicable_dict("function_keyword_renames",
+                                                 full_name, name)
+
+    if not renamed_keywords:
+      return False
+
+    modified = False
+    new_keywords = []
+    for keyword in node.keywords:
+      argkey = keyword.arg
+      if argkey in renamed_keywords:
+        modified = True
+        if renamed_keywords[argkey] is None:
+          lineno = getattr(keyword, "lineno", node.lineno)
+          col_offset = getattr(keyword, "col_offset", node.col_offset)
+          self.add_log(INFO, lineno, col_offset,
+                       "Removed argument %s for function %s" % (
+                           argkey, full_name or name))
         else:
-          if (reversed_preceding_text == "" or
-              reversed_preceding_text.isspace()):
-            line = line - 1
-            prev_line = self._lines[line - 1]
-            # TODO(aselle):
-            # this is poor comment detection, but it is good enough for
-            # cases where the comment does not contain string literal starting/
-            # ending characters. If ast gave us start and end locations of the
-            # ast nodes rather than just start, we could use string literal
-            # node ranges to filter out spurious #'s that appear in string
-            # literals.
-            comment_start = prev_line.find("#")
-            if comment_start == -1:
-              col = len(prev_line) - 1
-            elif FIND_STRING_CHARS.search(prev_line[comment_start:]) is None:
-              col = comment_start
-            else:
-              return None, None
-          else:
-            return None, None
-    # Most other nodes return proper locations (with notably does not), but
-    # it is not possible to use that in an argument.
-    return node.lineno, node.col_offset
+          keyword.arg = renamed_keywords[argkey]
+          lineno = getattr(keyword, "lineno", node.lineno)
+          col_offset = getattr(keyword, "col_offset", node.col_offset)
+          self.add_log(INFO, lineno, col_offset,
+                       "Renamed keyword argument for %s from %s to %s" % (
+                           full_name, argkey, renamed_keywords[argkey]))
+          new_keywords.append(keyword)
+      else:
+        new_keywords.append(keyword)
+
+    if modified:
+      node.keywords = new_keywords
+    return modified
 
   def visit_Call(self, node):  # pylint: disable=invalid-name
     """Handle visiting a call node in the AST.
@@ -307,104 +336,72 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Current Node
     """
-    self._print_warning_for_function_unrestricted(node)
-
-    # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name, name = self._get_attribute_full_path(node.func)
-
-    # Make sure the func is marked as being part of a call
-    node.func.is_function_for_call = True
+    assert self._stack[-1] is node
 
+    # Get the name for this call, so we can index stuff with it.
+    full_name = self._get_full_name(node.func)
     if full_name:
-      # Call special handlers
-      function_handles = self._api_change_spec.function_handle
-      glob_name = "*.{}".format(name)
-      if glob_name in function_handles:
-        function_handles[glob_name](self._file_edit, node)
-      if full_name in function_handles:
-        function_handles[full_name](self._file_edit, node)
-
-      # Examine any non-keyword argument and make it into a keyword argument
-      # if reordering required.
-      function_reorders = self._api_change_spec.function_reorders
-      function_keyword_renames = (
-          self._api_change_spec.function_keyword_renames)
-
-      if full_name in function_reorders:
-        reordered = function_reorders[full_name]
-        for idx, arg in enumerate(node.args):
-          lineno, col_offset = self._find_true_position(arg)
-          if lineno is None or col_offset is None:
-            self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r" %
-                (reordered[idx], full_name),
-                arg.lineno,
-                arg.col_offset,
-                "",
-                "",
-                error="A necessary keyword argument failed to be inserted.")
-          else:
-            keyword_arg = reordered[idx]
-            if (full_name in function_keyword_renames and
-                keyword_arg in function_keyword_renames[full_name]):
-              keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r" %
-                                (reordered[idx], full_name), lineno, col_offset,
-                                "", keyword_arg + "=")
-
-      # Examine each keyword argument and convert it to the final renamed form
-      renamed_keywords = ({} if full_name not in function_keyword_renames else
-                          function_keyword_renames[full_name])
-      for keyword in node.keywords:
-        argkey = keyword.arg
-        argval = keyword.value
-
-        if argkey in renamed_keywords:
-          argval_lineno, argval_col_offset = self._find_true_position(argval)
-          if argval_lineno is not None and argval_col_offset is not None:
-            # TODO(aselle): We should scan backward to find the start of the
-            # keyword key. Unfortunately ast does not give you the location of
-            # keyword keys, so we are forced to infer it from the keyword arg
-            # value.
-            key_start = argval_col_offset - len(argkey) - 1
-            key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
-                "="):
-              self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey,
-                                   renamed_keywords[argkey]), argval_lineno,
-                                  argval_col_offset - len(argkey) - 1,
-                                  argkey + "=", renamed_keywords[argkey] + "=")
-              continue
-          self._file_edit.add(
-              "Failed to rename keyword argument from %r to %r" %
-              (argkey, renamed_keywords[argkey]),
-              argval.lineno,
-              argval.col_offset - len(argkey) - 1,
-              "",
-              "",
-              error="Failed to find keyword lexographically. Fix manually.")
-
-    ast.NodeVisitor.generic_visit(self, node)
+      name = full_name.split(".")[-1]
+    elif isinstance(node.func, ast.Name):
+      name = node.func.id
+    elif isinstance(node.func, ast.Attribute):
+      name = node.func.attr
+    else:
+      name = None
+
+    # Call standard transformers for this node.
+    # Make sure warnings come first, since args or names triggering warnings
+    # may be removed by the other transformations.
+    self._maybe_add_call_warning(node, full_name, name)
+    # Make all args into kwargs
+    self._maybe_add_arg_names(node, full_name)
+    # Argument name changes or deletions
+    self._maybe_modify_args(node, full_name, name)
+
+    # Call transformers. These have the ability to modify the node, and if they
+    # do, will return the new node they created (or the same node if they just
+    # changed it). The are given the parent, but we will take care of
+    # integrating their changes into the parent if they return a new node.
+    #
+    # These are matched on the old name, since renaming is performed by the
+    # Attribute visitor, which happens later.
+    transformers = self._get_applicable_entries("function_transformers",
+                                                full_name, name)
+
+    parent = self._stack[-2]
+
+    for transformer in transformers:
+      logs = []
+      new_node = transformer(parent, node, full_name, name, logs)
+      self.add_logs(logs)
+      if new_node:
+        if new_node is not node:
+          pasta.ast_utils.replace_child(parent, node, new_node)
+          node = new_node
+          self._stack[-1] = node
+
+    self.generic_visit(node)
 
   def visit_Attribute(self, node):  # pylint: disable=invalid-name
-    """Handle bare Attributes i.e. [tf.foo, tf.bar].
+    """Handle bare Attributes i.e. [tf.foo, tf.bar]."""
+    assert self._stack[-1] is node
 
-    Args:
-      node: Node that is of type ast.Attribute
-    """
-    full_name, _ = self._get_attribute_full_path(node)
+    full_name = self._get_full_name(node)
     if full_name:
+      parent = self._stack[-2]
+
       # Make sure the warning comes first, otherwise the name may have changed
-      self._print_warning_for_function(node, full_name)
-      self._rename_functions(node, full_name)
-    if full_name in self._api_change_spec.change_to_function:
-      if not hasattr(node, "is_function_for_call"):
-        new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
-                            node.lineno, node.col_offset, full_name, new_text)
+      self._maybe_add_warning(node, full_name)
+
+      # Once we did a modification, node is invalid and not worth inspecting
+      # further. Also, we only perform modifications for simple nodes, so
+      # There'd be no point in descending further.
+      if self._maybe_rename(parent, node, full_name):
+        return
+      if self._maybe_change_to_function_call(parent, node, full_name):
+        return
 
-    ast.NodeVisitor.generic_visit(self, node)
+    self.generic_visit(node)
 
 
 class ASTCodeUpgrader(object):
@@ -427,16 +424,48 @@ class ASTCodeUpgrader(object):
     """
 
     # Write to a temporary file, just in case we are doing an implace modify.
+    # pylint: disable=g-backslash-continuation
     with open(in_filename, "r") as in_file, \
         tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
       ret = self.process_opened_file(in_filename, in_file, out_filename,
                                      temp_file)
+    # pylint: enable=g-backslash-continuation
 
     shutil.move(temp_file.name, out_filename)
     return ret
 
-  # Broad exceptions are required here because ast throws whatever it wants.
-  # pylint: disable=broad-except
+  def format_log(self, log, in_filename):
+    log_string = "%d:%d: %s: %s" % (log[1], log[2], log[0], log[3])
+    if in_filename:
+      return in_filename + ":" + log_string
+    else:
+      return log_string
+
+  def update_string_pasta(self, text, in_filename):
+    """Updates a file using pasta."""
+    try:
+      t = pasta.parse(text)
+    except (SyntaxError, ValueError, TypeError):
+      log = ["ERROR: Failed to parse.\n" + traceback.format_exc()]
+      return 0, "", log, []
+
+    visitor = _PastaEditVisitor(self._api_change_spec)
+    visitor.visit(t)
+
+    logs = [self.format_log(log, None) for log in visitor.log]
+    errors = [self.format_log(error, in_filename)
+              for error in visitor.warnings_and_errors]
+    return 1, pasta.dump(t), logs, errors
+
+  def _format_log(self, log, in_filename, out_filename):
+    text = "-" * 80 + "\n"
+    text += "Processing file %r\n outputting to %r\n" % (in_filename,
+                                                         out_filename)
+    text += "-" * 80 + "\n\n"
+    text += "\n".join(log)
+    text += "-" * 80 + "\n\n"
+    return text
+
   def process_opened_file(self, in_filename, in_file, out_filename, out_file):
     """Process the given python file for incompatible changes.
 
@@ -451,33 +480,19 @@ class ASTCodeUpgrader(object):
     Returns:
       A tuple representing number of files processed, log of actions, errors
     """
-    process_errors = []
-    text = "-" * 80 + "\n"
-    text += "Processing file %r\n outputting to %r\n" % (in_filename,
-                                                         out_filename)
-    text += "-" * 80 + "\n\n"
-
-    parsed_ast = None
     lines = in_file.readlines()
-    try:
-      parsed_ast = ast.parse("".join(lines))
-    except Exception:
-      text += "Failed to parse %r\n\n" % in_filename
-      text += traceback.format_exc()
-    if parsed_ast:
-      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
-      visitor.visit(parsed_ast)
-      out_text, new_text, process_errors = visitor.process(lines)
-      text += new_text
-      if out_file:
-        out_file.write(out_text)
-    text += "\n"
-    return 1, text, process_errors
-
-  # pylint: enable=broad-except
+    processed_file, new_file_content, log, process_errors = (
+        self.update_string_pasta("".join(lines), in_filename))
+
+    if out_file and processed_file:
+      out_file.write(new_file_content)
+
+    return (processed_file,
+            self._format_log(log, in_filename, out_filename),
+            process_errors)
 
   def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files):
+                   copy_other_files, in_place):
     """Processes upgrades on an entire tree of python files in place.
 
     Note that only Python files. If you have custom code in other languages,
@@ -487,11 +502,21 @@ class ASTCodeUpgrader(object):
       root_directory: Directory to walk and process.
       output_root_directory: Directory to use as base.
       copy_other_files: Copy files that are not touched by this converter.
+      in_place: Allow the conversion of an entire directory in place.
 
     Returns:
-      A tuple of files processed, the report string ofr all files, and errors
+      A tuple of files processed, the report string ofr all files, and a dict
+        mapping filenames to errors encountered in that file.
     """
 
+    if output_root_directory == root_directory:
+      if in_place:
+        return self.process_tree_inplace(root_directory)
+      else:
+        print("In order to copy a directory in place the `--inplace` input "
+              "arg must be set to `True`.")
+        sys.exit(1)
+
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
       print("Output directory %r must not already exist." %
@@ -528,7 +553,7 @@ class ASTCodeUpgrader(object):
           files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
-    tree_errors = []
+    tree_errors = {}
     report = ""
     report += ("=" * 80) + "\n"
     report += "Input tree: %r\n" % root_directory
@@ -540,7 +565,7 @@ class ASTCodeUpgrader(object):
         os.makedirs(output_directory)
       file_count += 1
       _, l_report, l_errors = self.process_file(input_path, output_path)
-      tree_errors += l_errors
+      tree_errors[input_path] = l_errors
       report += l_report
     for input_path, output_path in files_to_copy:
       output_directory = os.path.dirname(output_path)
@@ -548,3 +573,26 @@ class ASTCodeUpgrader(object):
         os.makedirs(output_directory)
       shutil.copy(input_path, output_path)
     return file_count, report, tree_errors
+
+  def process_tree_inplace(self, root_directory):
+    """Process a directory of python files in place."""
+    files_to_process = []
+    for dir_name, _, file_list in os.walk(root_directory):
+      py_files = [os.path.join(dir_name,
+                               f) for f in file_list if f.endswith(".py")]
+      files_to_process += py_files
+
+    file_count = 0
+    tree_errors = {}
+    report = ""
+    report += ("=" * 80) + "\n"
+    report += "Input tree: %r\n" % root_directory
+    report += ("=" * 80) + "\n"
+
+    for path in files_to_process:
+      file_count += 1
+      _, l_report, l_errors = self.process_file(path, path)
+      tree_errors[path] = l_errors
+      report += l_report
+
+    return file_count, report, tree_errors
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
index 99f20a026fcb9b60e0d4365dd2690946f0d833fc..704947915296d1e8991d71cac6153178139b2124 100644
--- a/tensorflow/tools/compatibility/ast_edits_test.py
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -39,7 +39,10 @@ following new APIs:
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import ast
 import six
+
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
 from tensorflow.tools.compatibility import ast_edits
@@ -54,7 +57,6 @@ class NoUpdateSpec(ast_edits.APIChangeSpec):
     self.function_keyword_renames = {}
     self.symbol_renames = {}
     self.function_warnings = {}
-    self.unrestricted_function_warnings = {}
     self.change_to_function = {}
 
 
@@ -191,6 +193,20 @@ class TestAstEdits(test_util.TensorFlowTestCase):
     _, new_text = self._upgrade(RenameKeywordSpec(), text)
     self.assertEqual(new_text, text)
 
+  def testKeywordReorderWithParens(self):
+    """Test that we get the expected result if there are parens around args."""
+    text = "f((a), ( ( b ) ))\n"
+    acceptable_outputs = [
+        # No change is a valid output
+        text,
+        # Also cases where all arguments are fully specified are allowed
+        "f(a=(a), b=( ( b ) ))\n",
+        # Making the parens canonical is ok
+        "f(a=(a), b=((b)))\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
   def testKeywordReorder(self):
     """Test that we get the expected result if kw2 is now before kw1."""
     text = "f(a, b, kw1=c, kw2=d)\n"
@@ -401,7 +417,8 @@ class TestAstEdits(test_util.TensorFlowTestCase):
 
       def __init__(self):
         NoUpdateSpec.__init__(self)
-        self.unrestricted_function_warnings = {"foo": "not good"}
+        self.function_warnings = {"*.foo": "not good"}
+
     texts = ["object.foo()", "get_object().foo()",
              "get_object().foo()", "object.foo().bar()"]
     for text in texts:
@@ -415,6 +432,13 @@ class TestAstEdits(test_util.TensorFlowTestCase):
       (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
       self.assertNotIn("not good", report)
 
+  def testFullNameNode(self):
+    t = ast_edits.full_name_node("a.b.c")
+    self.assertEquals(
+        ast.dump(t),
+        "Attribute(value=Attribute(value=Name(id='a', ctx=Load()), attr='b', "
+        "ctx=Load()), attr='c', ctx=Load())"
+    )
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index 6235eb3eedd081747fabd64a277c717f414a9d56..9c02c20f0f021b767bdc1157702ee6f52c76b089 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -34,6 +34,7 @@ renames = {
     'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
     'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
     'tf.Dimension': 'tf.compat.v1.Dimension',
+    'tf.Event': 'tf.compat.v1.Event',
     'tf.FIFOQueue': 'tf.queue.FIFOQueue',
     'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
     'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
@@ -73,6 +74,8 @@ renames = {
     'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
     'tf.SparseFeature': 'tf.io.SparseFeature',
     'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
+    'tf.Summary': 'tf.compat.v1.Summary',
+    'tf.SummaryMetadata': 'tf.compat.v1.SummaryMetadata',
     'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
     'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
     'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
@@ -118,7 +121,6 @@ renames = {
     'tf.container': 'tf.compat.v1.container',
     'tf.convert_to_tensor_or_indexed_slices': 'tf.compat.v1.convert_to_tensor_or_indexed_slices',
     'tf.convert_to_tensor_or_sparse_tensor': 'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
-    'tf.count_nonzero': 'tf.compat.v1.count_nonzero',
     'tf.count_up_to': 'tf.compat.v1.count_up_to',
     'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables',
     'tf.cross': 'tf.linalg.cross',
@@ -141,10 +143,11 @@ renames = {
     'tf.diag': 'tf.linalg.tensor_diag',
     'tf.diag_part': 'tf.linalg.tensor_diag_part',
     'tf.digamma': 'tf.math.digamma',
-    'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
-    'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.dimension_at_index': 'tf.compat.dimension_at_index',
+    'tf.dimension_value': 'tf.compat.dimension_value',
     'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
     'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_batch_normalization': 'tf.compat.v1.disable_v2_batch_normalization',
     'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
     'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
     'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
@@ -166,8 +169,10 @@ renames = {
     'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
     'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
     'tf.div': 'tf.compat.v1.div',
+    'tf.dtypes.as_string': 'tf.strings.as_string',
     'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
     'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_batch_normalization': 'tf.compat.v1.enable_v2_batch_normalization',
     'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
     'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
@@ -198,12 +203,13 @@ renames = {
     'tf.get_variable': 'tf.compat.v1.get_variable',
     'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
     'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
-    'tf.gfile.GFile': 'tf.compat.v1.gfile.GFile',
-    'tf.gfile.Open': 'tf.compat.v1.gfile.Open',
+    'tf.gfile.GFile': 'tf.io.gfile.GFile',
+    'tf.gfile.Open': 'tf.io.gfile.GFile',
     'tf.global_norm': 'tf.linalg.global_norm',
     'tf.global_variables': 'tf.compat.v1.global_variables',
     'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
-    'tf.glorot_normal_initializer': 'tf.keras.initializers.glorot_normal',
+    'tf.glorot_normal_initializer': 'tf.compat.v1.glorot_normal_initializer',
+    'tf.glorot_uniform_initializer': 'tf.compat.v1.glorot_uniform_initializer',
     'tf.graph_util.convert_variables_to_constants': 'tf.compat.v1.graph_util.convert_variables_to_constants',
     'tf.graph_util.extract_sub_graph': 'tf.compat.v1.graph_util.extract_sub_graph',
     'tf.graph_util.must_run_on_cpu': 'tf.compat.v1.graph_util.must_run_on_cpu',
@@ -218,17 +224,32 @@ renames = {
     'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
     'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
     'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
-    'tf.image.resize_images': 'tf.compat.v1.image.resize_images',
     'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
     'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
     'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
     'tf.initialize_all_variables': 'tf.compat.v1.initialize_all_variables',
     'tf.initialize_local_variables': 'tf.compat.v1.initialize_local_variables',
     'tf.initialize_variables': 'tf.compat.v1.initialize_variables',
+    'tf.initializers.constant': 'tf.compat.v1.initializers.constant',
     'tf.initializers.global_variables': 'tf.compat.v1.initializers.global_variables',
+    'tf.initializers.glorot_normal': 'tf.compat.v1.initializers.glorot_normal',
+    'tf.initializers.glorot_uniform': 'tf.compat.v1.initializers.glorot_uniform',
+    'tf.initializers.he_normal': 'tf.compat.v1.initializers.he_normal',
+    'tf.initializers.he_uniform': 'tf.compat.v1.initializers.he_uniform',
+    'tf.initializers.identity': 'tf.compat.v1.initializers.identity',
+    'tf.initializers.lecun_normal': 'tf.compat.v1.initializers.lecun_normal',
+    'tf.initializers.lecun_uniform': 'tf.compat.v1.initializers.lecun_uniform',
     'tf.initializers.local_variables': 'tf.compat.v1.initializers.local_variables',
+    'tf.initializers.ones': 'tf.compat.v1.initializers.ones',
+    'tf.initializers.orthogonal': 'tf.compat.v1.initializers.orthogonal',
+    'tf.initializers.random_normal': 'tf.compat.v1.initializers.random_normal',
+    'tf.initializers.random_uniform': 'tf.compat.v1.initializers.random_uniform',
     'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
+    'tf.initializers.truncated_normal': 'tf.compat.v1.initializers.truncated_normal',
+    'tf.initializers.uniform_unit_scaling': 'tf.compat.v1.initializers.uniform_unit_scaling',
     'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
+    'tf.initializers.variance_scaling': 'tf.compat.v1.initializers.variance_scaling',
+    'tf.initializers.zeros': 'tf.compat.v1.initializers.zeros',
     'tf.invert_permutation': 'tf.math.invert_permutation',
     'tf.io.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
     'tf.io.PriorityQueue': 'tf.queue.PriorityQueue',
@@ -242,9 +263,26 @@ renames = {
     'tf.is_numeric_tensor': 'tf.debugging.is_numeric_tensor',
     'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
-    'tf.keras.backend.get_session': 'tf.compat.v1.keras.backend.get_session',
-    'tf.keras.layers.CuDNNGRU': 'tf.compat.v1.keras.layers.CuDNNGRU',
-    'tf.keras.layers.CuDNNLSTM': 'tf.compat.v1.keras.layers.CuDNNLSTM',
+    'tf.keras.initializers.Identity': 'tf.compat.v1.keras.initializers.Identity',
+    'tf.keras.initializers.Orthogonal': 'tf.compat.v1.keras.initializers.Orthogonal',
+    'tf.keras.initializers.TruncatedNormal': 'tf.compat.v1.keras.initializers.TruncatedNormal',
+    'tf.keras.initializers.VarianceScaling': 'tf.compat.v1.keras.initializers.VarianceScaling',
+    'tf.keras.initializers.constant': 'tf.compat.v1.keras.initializers.constant',
+    'tf.keras.initializers.glorot_normal': 'tf.compat.v1.keras.initializers.glorot_normal',
+    'tf.keras.initializers.glorot_uniform': 'tf.compat.v1.keras.initializers.glorot_uniform',
+    'tf.keras.initializers.he_normal': 'tf.compat.v1.keras.initializers.he_normal',
+    'tf.keras.initializers.he_uniform': 'tf.compat.v1.keras.initializers.he_uniform',
+    'tf.keras.initializers.identity': 'tf.compat.v1.keras.initializers.identity',
+    'tf.keras.initializers.lecun_normal': 'tf.compat.v1.keras.initializers.lecun_normal',
+    'tf.keras.initializers.lecun_uniform': 'tf.compat.v1.keras.initializers.lecun_uniform',
+    'tf.keras.initializers.normal': 'tf.compat.v1.keras.initializers.normal',
+    'tf.keras.initializers.ones': 'tf.compat.v1.keras.initializers.ones',
+    'tf.keras.initializers.orthogonal': 'tf.compat.v1.keras.initializers.orthogonal',
+    'tf.keras.initializers.random_normal': 'tf.compat.v1.keras.initializers.random_normal',
+    'tf.keras.initializers.random_uniform': 'tf.compat.v1.keras.initializers.random_uniform',
+    'tf.keras.initializers.truncated_normal': 'tf.compat.v1.keras.initializers.truncated_normal',
+    'tf.keras.initializers.uniform': 'tf.compat.v1.keras.initializers.uniform',
+    'tf.keras.initializers.zeros': 'tf.compat.v1.keras.initializers.zeros',
     'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
     'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
     'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
@@ -313,8 +351,13 @@ renames = {
     'tf.logging.warning': 'tf.compat.v1.logging.warning',
     'tf.logical_xor': 'tf.math.logical_xor',
     'tf.losses.absolute_difference': 'tf.compat.v1.losses.absolute_difference',
+    'tf.losses.add_loss': 'tf.compat.v1.losses.add_loss',
     'tf.losses.compute_weighted_loss': 'tf.compat.v1.losses.compute_weighted_loss',
     'tf.losses.cosine_distance': 'tf.compat.v1.losses.cosine_distance',
+    'tf.losses.get_losses': 'tf.compat.v1.losses.get_losses',
+    'tf.losses.get_regularization_loss': 'tf.compat.v1.losses.get_regularization_loss',
+    'tf.losses.get_regularization_losses': 'tf.compat.v1.losses.get_regularization_losses',
+    'tf.losses.get_total_loss': 'tf.compat.v1.losses.get_total_loss',
     'tf.losses.hinge_loss': 'tf.compat.v1.losses.hinge_loss',
     'tf.losses.huber_loss': 'tf.compat.v1.losses.huber_loss',
     'tf.losses.log_loss': 'tf.compat.v1.losses.log_loss',
@@ -405,7 +448,7 @@ renames = {
     'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
     'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
     'tf.op_scope': 'tf.compat.v1.op_scope',
-    'tf.orthogonal_initializer': 'tf.keras.initializers.Orthogonal',
+    'tf.orthogonal_initializer': 'tf.compat.v1.orthogonal_initializer',
     'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
     'tf.parse_tensor': 'tf.io.parse_tensor',
     'tf.placeholder': 'tf.compat.v1.placeholder',
@@ -428,15 +471,13 @@ renames = {
     'tf.qr': 'tf.linalg.qr',
     'tf.quantize': 'tf.quantization.quantize',
     'tf.quantized_concat': 'tf.quantization.quantized_concat',
-    'tf.ragged.constant_value': 'tf.compat.v1.ragged.constant_value',
-    'tf.ragged.convert_to_tensor_or_ragged_tensor': 'tf.compat.v1.ragged.convert_to_tensor_or_ragged_tensor',
     'tf.ragged.RaggedTensorValue': 'tf.compat.v1.ragged.RaggedTensorValue',
+    'tf.ragged.constant_value': 'tf.compat.v1.ragged.constant_value',
     'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
     'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
     'tf.random_crop': 'tf.image.random_crop',
     'tf.random_gamma': 'tf.random.gamma',
     'tf.random_normal': 'tf.random.normal',
-    'tf.random_poisson': 'tf.compat.v1.random_poisson',
     'tf.random_shuffle': 'tf.random.shuffle',
     'tf.random_uniform': 'tf.random.uniform',
     'tf.read_file': 'tf.io.read_file',
@@ -456,7 +497,7 @@ renames = {
     'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
     'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
     'tf.saved_model.MAIN_OP_KEY': 'tf.compat.v1.saved_model.MAIN_OP_KEY',
-    'tf.saved_model.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.TRAINING': 'tf.saved_model.TRAINING',
     'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
     'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
     'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
@@ -497,7 +538,7 @@ renames = {
     'tf.saved_model.tag_constants.GPU': 'tf.saved_model.GPU',
     'tf.saved_model.tag_constants.SERVING': 'tf.saved_model.SERVING',
     'tf.saved_model.tag_constants.TPU': 'tf.saved_model.TPU',
-    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRAINING',
     'tf.saved_model.utils.build_tensor_info': 'tf.compat.v1.saved_model.utils.build_tensor_info',
     'tf.saved_model.utils.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info',
     'tf.scatter_add': 'tf.compat.v1.scatter_add',
@@ -569,7 +610,13 @@ renames = {
     'tf.string_strip': 'tf.strings.strip',
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
+    'tf.summary.Event': 'tf.compat.v1.summary.Event',
+    'tf.summary.FileWriter': 'tf.compat.v1.summary.FileWriter',
+    'tf.summary.FileWriterCache': 'tf.compat.v1.summary.FileWriterCache',
     'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
+    'tf.summary.Summary': 'tf.compat.v1.summary.Summary',
+    'tf.summary.SummaryDescription': 'tf.compat.v1.summary.SummaryDescription',
+    'tf.summary.TaggedRunMetadata': 'tf.compat.v1.summary.TaggedRunMetadata',
     'tf.summary.audio': 'tf.compat.v1.summary.audio',
     'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
     'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
@@ -618,6 +665,7 @@ renames = {
     'tf.train.Optimizer': 'tf.compat.v1.train.Optimizer',
     'tf.train.ProfilerHook': 'tf.estimator.ProfilerHook',
     'tf.train.ProximalAdagradOptimizer': 'tf.compat.v1.train.ProximalAdagradOptimizer',
+    'tf.train.ProximalGradientDescentOptimizer': 'tf.compat.v1.train.ProximalGradientDescentOptimizer',
     'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
     'tf.train.RMSPropOptimizer': 'tf.compat.v1.train.RMSPropOptimizer',
     'tf.train.Saver': 'tf.compat.v1.train.Saver',
@@ -674,12 +722,13 @@ renames = {
     'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
     'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
     'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
+    'tf.train.summary_iterator': 'tf.compat.v1.train.summary_iterator',
     'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
     'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
     'tf.train.write_graph': 'tf.io.write_graph',
     'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
     'tf.truncated_normal': 'tf.random.truncated_normal',
-    'tf.uniform_unit_scaling_initializer': 'tf.initializers.uniform_unit_scaling',
+    'tf.uniform_unit_scaling_initializer': 'tf.compat.v1.uniform_unit_scaling_initializer',
     'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
     'tf.unsorted_segment_mean': 'tf.math.unsorted_segment_mean',
     'tf.unsorted_segment_min': 'tf.math.unsorted_segment_min',
@@ -690,7 +739,7 @@ renames = {
     'tf.variable_op_scope': 'tf.compat.v1.variable_op_scope',
     'tf.variable_scope': 'tf.compat.v1.variable_scope',
     'tf.variables_initializer': 'tf.compat.v1.variables_initializer',
-    'tf.variance_scaling_initializer': 'tf.keras.initializers.VarianceScaling',
+    'tf.variance_scaling_initializer': 'tf.compat.v1.variance_scaling_initializer',
     'tf.verify_tensor_all_finite': 'tf.compat.v1.verify_tensor_all_finite',
     'tf.wrap_function': 'tf.compat.v1.wrap_function',
     'tf.write_file': 'tf.io.write_file',
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
index 1c9fb92db0efdec6996dc75ce45aba150776f813..f9b0e3f9d8e6107701b01768b9674680d0e4b64a 100644
--- a/tensorflow/tools/compatibility/reorders_v2.py
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -28,8 +28,10 @@ from __future__ import print_function
 reorders = {
     'tf.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
     'tf.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.batch_gather': ['params', 'indices', 'name'],
     'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
     'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
+    'tf.cond': ['pred', 'true_fn', 'false_fn', 'strict', 'name', 'fn1', 'fn2'],
     'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
     'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
     'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
@@ -64,6 +66,7 @@ reorders = {
     'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
     'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
     'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.softmax_cross_entropy_with_logits': ['_sentinel', 'labels', 'logits', 'dim', 'name'],
     'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
     'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
     'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
index 5ce4dd49adc940dbc56e19915a188cdb6b8de1d1..2663762aa70253f54037393c0cb3cd791a040d56 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -70,6 +70,15 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         [0],
         tf.argmin([[1, 3, 2]], name='abc', dimension=1))
 
+  @test_util.run_v1_only("b/120545219")
+  def testSoftmaxCrossEntropyWithLogits(self):
+    out = tf.nn.softmax_cross_entropy_with_logits(
+        logits=[0.1, 0.8], labels=[0, 1])
+    self.assertAllClose(out, 0.40318608)
+    out = tf.nn.softmax_cross_entropy_with_logits_v2(
+        logits=[0.1, 0.8], labels=[0, 1])
+    self.assertAllClose(out, 0.40318608)
+
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 287d1a5483c32379da1dc651aba62a86a3f6d0f9..241b08510f6b1c7b62ab3563752b042bd1366f99 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -175,26 +175,13 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.op_scope": ["values", "name", "default_name"],
     }
 
-    # Specially handled functions.
-    self.function_handle = {"tf.reverse": self._reverse_handler}
-
     # Warnings that should be printed if corresponding functions are used.
-    self.function_warnings = {}
-
-  @staticmethod
-  def _reverse_handler(file_edit_recorder, node):
-    # TODO(aselle): Could check for a literal list of bools and try to convert
-    # them to indices.
-    comment = ("ERROR: tf.reverse has had its argument semantics changed "
-               "significantly the converter cannot detect this reliably, so "
-               "you need to inspect this usage manually.\n")
-    file_edit_recorder.add(
-        comment,
-        node.lineno,
-        node.col_offset,
-        "tf.reverse",
-        "tf.reverse",
-        error="tf.reverse requires manual check.")
+    self.function_warnings = {
+        "tf.reverse":
+            "ERROR: tf.reverse has had its argument semantics changed "
+            "significantly. The converter cannot detect this reliably, so "
+            "you need to inspect this usage manually.\n",
+    }
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index 66325ea2ad36265c6c3779b414774abab8213a84..cf05575a9dd0cf6940a18e801fc76b667dbda233 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -112,7 +112,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     text = "tf.reverse(a, b)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, new_text)
-    self.assertEqual(errors, ["test.py:1: tf.reverse requires manual check."])
+    self.assertIn("tf.reverse requires manual check", errors[0])
 
   def testListComprehension(self):
     def _test(input, output):  # pylint: disable=redefined-builtin
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 2662889a60639f36a41fbdb3ca8d56b29cd2abcb..03ecf5f303f2fc9a698df571bf17c3b7f342f608 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -18,6 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import ast
+
+import pasta
+import six
+
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
 from tensorflow.tools.compatibility import reorders_v2
@@ -29,7 +34,27 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   def __init__(self):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
+    # If the new argument is None, it will be removed.
+    # Only keyword args are handled, so make sure to also put any function in
+    # function_reorders to ensure that all args are made into keywords first.
     self.function_keyword_renames = {
+        "tf.gradients": {
+            "colocate_gradients_with_ops": None,
+        },
+        "tf.hessians": {
+            "colocate_gradients_with_ops": None,
+        },
+        "*.minimize": {
+            "colocate_gradients_with_ops": None,
+        },
+        "*.compute_gradients": {
+            "colocate_gradients_with_ops": None,
+        },
+        "tf.cond": {
+            "strict": None,
+            "fn1": "true_fn",
+            "fn2": "false_fn"
+        },
         "tf.argmin": {
             "dimension": "axis",
         },
@@ -75,6 +100,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.convert_to_tensor": {
             "preferred_dtype": "dtype_hint"
         },
+        "tf.nn.softmax_cross_entropy_with_logits": {
+            "dim": "axis",
+            "_sentinel": None,
+        },
         "tf.nn.softmax_cross_entropy_with_logits_v2": {
             "dim": "axis"
         },
@@ -90,6 +119,11 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.load_file_system_library": {
             "library_filename": "library_location",
         },
+        "tf.count_nonzero": {
+            "input_tensor": "input",
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis",
+        },
         "tf.math.count_nonzero": {
             "input_tensor": "input",
             "keep_dims": "keepdims",
@@ -356,6 +390,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     self.manual_symbol_renames = {
         "tf.batch_to_space_nd":
             "tf.batch_to_space",
+        "tf.batch_gather":
+            "tf.gather",
         "tf.space_to_batch_nd":
             "tf.space_to_batch",
         "tf.nn.space_to_batch":
@@ -472,6 +508,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.data.experimental.unbatch",
         "tf.contrib.data.unique":
             "tf.data.experimental.unique",
+        "tf.contrib.saved_model.load_keras_model":
+            "tf.keras.experimental.load_from_saved_model",
+        "tf.contrib.saved_model.save_keras_model":
+            "tf.keras.experimental.export",
         "tf.contrib.rnn.RNNCell":
             "tf.nn.rnn_cell.RNNCell",
         "tf.contrib.rnn.LSTMStateTuple":
@@ -480,6 +520,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.sort",
         "tf.contrib.framework.argsort":
             "tf.argsort",
+        "tf.count_nonzero":
+            "tf.math.count_nonzero",
         "tf.manip.batch_to_space_nd":
             "tf.batch_to_space",
         "tf.quantize_v2":
@@ -556,6 +598,64 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         # changed significantly.
         "tf.nn.ctc_loss":
             "tf.compat.v1.nn.ctc_loss",
+        "tf.zeros_initializer":
+            "tf.compat.v1.initializers.zeros",
+        "tf.ones_initializer":
+            "tf.compat.v1.initializers.ones",
+        "tf.constant_initializer":
+            "tf.compat.v1.initializers.constant",
+        "tf.random_uniform_initializer":
+            "tf.compat.v1.initializers.random_uniform",
+        "tf.random_normal_initializer":
+            "tf.compat.v1.initializers.random_normal",
+        "tf.truncated_normal_initializer":
+            "tf.compat.v1.initializers.truncated_normal",
+        "tf.image.resize_images":
+            "tf.image.resize",
+        "tf.random_poisson":
+            "tf.random.poisson",
+        "tf.debugging.assert_greater":
+            "tf.compat.v1.debugging.assert_greater",
+        "tf.debugging.assert_greater_equal":
+            "tf.compat.v1.debugging.assert_greater_equal",
+        "tf.debugging.assert_integer":
+            "tf.compat.v1.debugging.assert_integer",
+        "tf.debugging.assert_less":
+            "tf.compat.v1.debugging.assert_less",
+        "tf.debugging.assert_less_equal":
+            "tf.compat.v1.debugging.assert_less_equal",
+        "tf.debugging.assert_near":
+            "tf.compat.v1.debugging.assert_near",
+        "tf.debugging.assert_negative":
+            "tf.compat.v1.debugging.assert_negative",
+        "tf.debugging.assert_non_negative":
+            "tf.compat.v1.debugging.assert_non_negative",
+        "tf.debugging.assert_non_positive":
+            "tf.compat.v1.debugging.assert_non_positive",
+        "tf.debugging.assert_none_equal":
+            "tf.compat.v1.debugging.assert_none_equal",
+        "tf.debugging.assert_type":
+            "tf.compat.v1.debugging.assert_type",
+        "tf.debugging.assert_positive":
+            "tf.compat.v1.debugging.assert_positive",
+        "tf.debugging.assert_equal":
+            "tf.compat.v1.debugging.assert_equal",
+        "tf.debugging.assert_scalar":
+            "tf.compat.v1.debugging.assert_scalar",
+        "tf.assert_equal":
+            "tf.compat.v1.assert_equal",
+        "tf.assert_less":
+            "tf.compat.v1.assert_less",
+        "tf.assert_greater":
+            "tf.compat.v1.assert_greater",
+        "tf.debugging.assert_rank":
+            "tf.compat.v1.debugging.assert_rank",
+        "tf.debugging.assert_rank_at_least":
+            "tf.compat.v1.debugging.assert_rank_at_least",
+        "tf.debugging.assert_rank_in":
+            "tf.compat.v1.debugging.assert_rank_in",
+        "tf.assert_rank":
+            "tf.compat.v1.assert_rank",
     }
     # pylint: enable=line-too-long
 
@@ -578,7 +678,9 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.io.serialize_many_sparse",
         "tf.argmax",
         "tf.argmin",
+        "tf.batch_gather",
         "tf.batch_to_space",
+        "tf.cond",
         "tf.nn.space_to_batch",
         "tf.boolean_mask",
         "tf.convert_to_tensor",
@@ -648,6 +750,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.norm",
         "tf.reverse_sequence",
         "tf.sparse_split",
+        # tf.nn.softmax_cross_entropy_with_logits *must* be called with
+        # keyword arguments. Add keyword arguments in rare case when they
+        # are not specified.
+        "tf.nn.softmax_cross_entropy_with_logits",
     }
 
     # Functions that were reordered should be changed to the new keyword args
@@ -655,18 +761,45 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     # positional arguments yourself, this could do the wrong thing.
     self.function_reorders = reorders_v2.reorders
 
-    # Specially handled functions.
-    self.function_handle = {
-        "tf.nn.dropout": self._dropout_handler,
-        "tf.gradients": self._colocate_handler("tf.gradients"),
-        "*.minimize": self._colocate_handler("Optimizer.minimize"),
-        "*.compute_gradients":
-            self._colocate_handler("Optimizer.compute_gradients"),
+    # Specially handled functions (pasta version)
+    # Each transformer is a callable which will be called with the arguments
+    #   transformer(parent, node, full_name, name, logs, errors)
+    # Where logs and errors are lists to which (line, col, msg) tuples can be
+    # appended, full_name is the FQN of the function called (or None if that is
+    # unknown), name is the name of the function called (or None is that is
+    # unknown). node is an ast.Call node representing this function call, and
+    # parent is its parent in the AST.
+    # The function may modify node (but not parent), and must return
+    # - none, if nothing was modified
+    # - node, if node was modified in place (make sure to use
+    #   pasta.ast_utils.replace_child to swap out children, otherwise formatting
+    #   may get messy)
+    # - a replacement for node, if the whole call node was replaced. The caller
+    #   will take care of changing parent.
+    self.function_transformers = {
+        "*.make_initializable_iterator": self._iterator_transformer,
+        "*.make_one_shot_iterator": self._iterator_transformer,
+        "tf.nn.dropout": self._dropout_transformer,
+        "tf.batch_gather": self._batch_gather_transformer,
+        "tf.to_bfloat16": self._cast_transformer,
+        "tf.to_complex128": self._cast_transformer,
+        "tf.to_complex64": self._cast_transformer,
+        "tf.to_double": self._cast_transformer,
+        "tf.to_float": self._cast_transformer,
+        "tf.to_int32": self._cast_transformer,
+        "tf.to_int64": self._cast_transformer,
+        "tf.nn.softmax_cross_entropy_with_logits":
+            self._softmax_cross_entropy_with_logits_transformer,
+        "tf.image.resize_area": self._image_resize_transformer,
+        "tf.image.resize_bicubic": self._image_resize_transformer,
+        "tf.image.resize_bilinear": self._image_resize_transformer,
+        "tf.image.resize_nearest_neighbor": self._image_resize_transformer,
+
     }
 
     decay_function_comment = (
-        "WARNING: <function name> has been changed to return a callable instead"
-        " of a tensor when graph building, but its functionality remains "
+        "<function name> has been changed to return a callable instead "
+        "of a tensor when graph building, but its functionality remains "
         "unchanged during eager execution (returns a callable like "
         "before). The converter cannot detect and fix this reliably, so "
         "this usage has been converted to compat.v1 (even though it may already"
@@ -675,26 +808,26 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
 
     # TODO(b/118888586): add default value change to update script.
     default_loss_reduction_changed = (
-        "WARNING: default value of loss_reduction has been changed to "
+        "default value of loss_reduction has been changed to "
         "SUM_OVER_BATCH_SIZE.\n"
     )
 
     assert_return_type_comment = (
-        "WARNING: assert_* functions have been changed to return None, the "
+        "assert_* functions have been changed to return None, the "
         "data argument has been removed, and arguments have been reordered."
         "\nThe calls have been converted to compat.v1 for safety (even though "
         " they may already have been correct)."
     )
 
     assert_rank_comment = (
-        "WARNING: assert_rank_* functions have been changed to return None, and"
+        "assert_rank_* functions have been changed to return None, and"
         " the data and summarize arguments have been removed."
         "\nThe calls have been converted to compat.v1 for safety (even though "
         " they may already have been correct)."
     )
 
     tf_01s_like_no_optimize_comment = (
-        "WARNING: tf.zeros_like and tf.ones_like no longer have the optimize "
+        "tf.zeros_like and tf.ones_like no longer have the optimize "
         "argument in TF 2.0 or after (also, `tensor' argument is renamed to "
         "`input')."
         "\nThe calls have been converted to compat.v1 for safety (even though "
@@ -702,23 +835,84 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     )
 
     deprecate_partition_strategy_comment = (
-        "WARNING: `partition_strategy` has been removed from `%s` "
+        "`partition_strategy` has been removed from `%s` "
         " The 'div' strategy is used by default.")
 
+    initializers_no_dtype_comment = (
+        "tf.initializers and tf.keras.initializers no longer have the "
+        "dtype argument in the constructor or partition_info argument in the "
+        "call method in TF 2.0 and after. The only API symbols are now "
+        "tf.keras.initializers.* or tf.initializers.*."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        "they may already have been correct).")
+
+    uniform_unit_scaling_initializer_comment = (
+        "uniform_unit_scaling_initializer has been removed. Please use"
+        " tf.initializers.variance_scaling instead with distribution=uniform "
+        "to get equivalent behaviour.")
+
+    metrics_comment = (
+        "tf.metrics have been converted to object oriented versions in"
+        " TF 2.0 and after. The metric function calls have been converted to "
+        "compat.v1 for backward compatibility. Please update these calls to "
+        "the TF 2.0 versions.")
+
+    losses_comment = (
+        "tf.losses have been converted to object oriented versions in"
+        " TF 2.0 and after. The loss function calls have been converted to "
+        "compat.v1 for backward compatibility. Please update these calls to "
+        "the TF 2.0 versions.")
+
+    export_saved_model_renamed = (
+        "(Manual edit required) Please rename the method export_savedmodel() "
+        "to export_saved_model(). Two things to note:\n\t(1) The argument "
+        "strip_default_attributes has been removed. The function will always "
+        "strip the default attributes from ops. If this breaks your code, "
+        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
+        "only effects core estimator. If you are using "
+        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
+    # You can use *. to add items which do not check the FQN, and apply to e.g.,
+    # methods.
     self.function_warnings = {
-        "tf.assert_greater":
-            assert_return_type_comment,
+        "*.export_savedmodel":
+            export_saved_model_renamed,
         "tf.assert_equal":
             assert_return_type_comment,
+        "tf.assert_none_equal":
+            assert_return_type_comment,
+        "tf.assert_negative":
+            assert_return_type_comment,
+        "tf.assert_positive":
+            assert_return_type_comment,
+        "tf.assert_non_negative":
+            assert_return_type_comment,
+        "tf.assert_non_positive":
+            assert_return_type_comment,
+        "tf.assert_near":
+            assert_return_type_comment,
         "tf.assert_less":
             assert_return_type_comment,
+        "tf.assert_less_equal":
+            assert_return_type_comment,
+        "tf.assert_greater":
+            assert_return_type_comment,
+        "tf.assert_greater_equal":
+            assert_return_type_comment,
+        "tf.assert_integer":
+            assert_return_type_comment,
+        "tf.assert_type":
+            assert_return_type_comment,
+        "tf.assert_scalar":
+            assert_return_type_comment,
         "tf.assert_rank":
             assert_rank_comment,
-        "tf.cond": "tf.cond no longer takes 'strict'. "
-                   "Now 'strict' defaults to True."
-                   "fn1/fn2 arguments are replaced by true_fn/false_fn.",
+        "tf.assert_rank_at_least":
+            assert_rank_comment,
+        "tf.assert_rank_in":
+            assert_rank_comment,
         "tf.debugging.assert_equal":
             assert_return_type_comment,
         "tf.debugging.assert_greater":
@@ -743,15 +937,20 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_return_type_comment,
         "tf.debugging.assert_positive":
             assert_return_type_comment,
+        "tf.debugging.assert_type":
+            assert_return_type_comment,
+        "tf.debugging.assert_scalar":
+            assert_return_type_comment,
         "tf.debugging.assert_rank":
             assert_rank_comment,
         "tf.debugging.assert_rank_at_least":
             assert_rank_comment,
         "tf.debugging.assert_rank_in":
             assert_rank_comment,
-        "tf.device": "tf.device no longer takes function as an argument. "
-                     "'devide_name_or_function' argument has been renamed to "
-                     "'device_name'.",
+        "tf.device":
+            "tf.device no longer takes function as an argument. "
+            "'device_name_or_function' argument has been renamed to "
+            "'device_name'.",
         "tf.flags":
             "tf.flags has been removed, please use the argparse or absl"
             " module if you need command line parsing.",
@@ -789,10 +988,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             default_loss_reduction_changed,
         "tf.estimator.BaselineRegressor":
             default_loss_reduction_changed,
-        "tf.hessians": "tf.hessians no longer takes "
-                       "'colocate_gradients_with_ops' argument. Also, "
-                       "arguments have been reordered so that 'name' is the "
-                       "last argument.",
         "tf.nn.conv1d":
             "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
             " was renamed to \"input\"",
@@ -839,13 +1034,206 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "'deterministic' arguments. Now it takes a single 'seed' arg. If "
             "'seed' is zero, the execution is random and deterministic "
             "otherwise",
-        "tf.nn.softmax_cross_entropy_with_logits":
-            "tf.nn.softmax_cross_entropy_with_logits behavior has changed. "
-            "'labels' needs to be wrapped with tf.stop_gradient to keep the "
-            "old behavior. Also, 'dim' argument has been renamed to 'axis'.",
         "tf.test.assert_equal_graph_def":
             "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
             "argument. 'checkpoint_v2' now defaults to True.",
+        "tf.keras.initializers.Zeros":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.zeros":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Ones":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.ones":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Constant":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.constant":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.VarianceScaling":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Orthogonal":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.orthogonal":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Identity":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.identity":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.glorot_uniform":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.glorot_normal":
+            initializers_no_dtype_comment,
+        "tf.initializers.zeros":
+            initializers_no_dtype_comment,
+        "tf.zeros_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.ones":
+            initializers_no_dtype_comment,
+        "tf.ones_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.constant":
+            initializers_no_dtype_comment,
+        "tf.constant_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.random_uniform":
+            initializers_no_dtype_comment,
+        "tf.random_uniform_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.random_normal":
+            initializers_no_dtype_comment,
+        "tf.random_normal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.truncated_normal":
+            initializers_no_dtype_comment,
+        "tf.truncated_normal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.variance_scaling":
+            initializers_no_dtype_comment,
+        "tf.variance_scaling_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.orthogonal":
+            initializers_no_dtype_comment,
+        "tf.orthogonal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.identity":
+            initializers_no_dtype_comment,
+        "tf.glorot_uniform_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.glorot_uniform":
+            initializers_no_dtype_comment,
+        "tf.glorot_normal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.glorot_normal":
+            initializers_no_dtype_comment,
+        "tf.initializers.uniform_unit_scaling":
+            uniform_unit_scaling_initializer_comment,
+        "tf.uniform_unit_scaling_initializer":
+            uniform_unit_scaling_initializer_comment,
+        "tf.losses.absolute_difference":
+            losses_comment,
+        "tf.losses.add_loss":
+            losses_comment,
+        "tf.losses.compute_weighted_loss":
+            losses_comment,
+        "tf.losses.cosine_distance":
+            losses_comment,
+        "tf.losses.get_losses":
+            losses_comment,
+        "tf.losses.get_regularization_loss":
+            losses_comment,
+        "tf.losses.get_regularization_losses":
+            losses_comment,
+        "tf.losses.get_total_loss":
+            losses_comment,
+        "tf.losses.hinge_loss":
+            losses_comment,
+        "tf.losses.huber_loss":
+            losses_comment,
+        "tf.losses.log_loss":
+            losses_comment,
+        "tf.losses.mean_pairwise_squared_error":
+            losses_comment,
+        "tf.losses.mean_squared_error":
+            losses_comment,
+        "tf.losses.sigmoid_cross_entropy":
+            losses_comment,
+        "tf.losses.softmax_cross_entropy":
+            losses_comment,
+        "tf.losses.sparse_softmax_cross_entropy":
+            losses_comment,
+        "tf.metrics.accuracy":
+            metrics_comment,
+        "tf.metrics.auc":
+            metrics_comment,
+        "tf.metrics.average_precision_at_k":
+            metrics_comment,
+        "tf.metrics.false_negatives":
+            metrics_comment,
+        "tf.metrics.false_negatives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.false_positives":
+            metrics_comment,
+        "tf.metrics.false_positives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.mean":
+            metrics_comment,
+        "tf.metrics.mean_absolute_error":
+            metrics_comment,
+        "tf.metrics.mean_cosine_distance":
+            metrics_comment,
+        "tf.metrics.mean_iou":
+            metrics_comment,
+        "tf.metrics.mean_per_class_accuracy":
+            metrics_comment,
+        "tf.metrics.mean_relative_error":
+            metrics_comment,
+        "tf.metrics.mean_squared_error":
+            metrics_comment,
+        "tf.metrics.mean_tensor":
+            metrics_comment,
+        "tf.metrics.percentage_below":
+            metrics_comment,
+        "tf.metrics.precision":
+            metrics_comment,
+        "tf.metrics.precision_at_k":
+            metrics_comment,
+        "tf.metrics.precision_at_thresholds":
+            metrics_comment,
+        "tf.metrics.precision_at_top_k":
+            metrics_comment,
+        "tf.metrics.recall":
+            metrics_comment,
+        "tf.metrics.recall_at_k":
+            metrics_comment,
+        "tf.metrics.recall_at_thresholds":
+            metrics_comment,
+        "tf.metrics.recall_at_top_k":
+            metrics_comment,
+        "tf.metrics.root_mean_squared_error":
+            metrics_comment,
+        "tf.metrics.sensitivity_at_specificity":
+            metrics_comment,
+        "tf.metrics.sparse_average_precision_at_k":
+            metrics_comment,
+        "tf.metrics.sparse_precision_at_k":
+            metrics_comment,
+        "tf.metrics.specificity_at_sensitivity":
+            metrics_comment,
+        "tf.metrics.true_negatives":
+            metrics_comment,
+        "tf.metrics.true_negatives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.true_positives":
+            metrics_comment,
+        "tf.metrics.true_positives_at_thresholds":
+            metrics_comment,
+    }
+
+    # Warnings that are emitted only if a specific arg is found.
+    self.function_arg_warnings = {
+        "tf.gradients": {
+            ("colocate_gradients_with_ops", 4):
+                "tf.gradients no longer takes "
+                "'colocate_gradients_with_ops' argument, it behaves as if it "
+                "was set to True.",
+        },
+        "*.minimize": {
+            ("colocate_gradients_with_ops", 5):
+                "Optimizer.minimize no longer takes "
+                "'colocate_gradients_with_ops' argument, it behaves as if it "
+                "was set to True.",
+        },
+        "*.compute_gradients": {
+            ("colocate_gradients_with_ops", 4):
+                "Optimizer.compute_gradients no "
+                "longer takes 'colocate_gradients_with_ops' argument, it "
+                "behaves as if it was set to True.",
+        },
+        "tf.cond": {
+            ("strict", 3):
+                "tf.cond no longer takes 'strict' argument, it behaves as "
+                "if was set to True."
+        },
     }
 
     self.symbol_renames = {
@@ -853,82 +1241,197 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         for name, new_name in self.symbol_renames.items()
     }
 
-    export_saved_model_renamed = (
-        "(Manual edit required) Please rename the method export_savedmodel() "
-        "to export_saved_model(). Two things to note:\n\t(1) The argument "
-        "strip_default_attributes has been removed. The function will always "
-        "strip the default attributes from ops. If this breaks your code, "
-        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
-        "only effects core estimator. If you are using "
-        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+  @staticmethod
+  def _iterator_transformer(parent, node, full_name, name, logs):
+    # First, check that node.func.value is not already something we like
+    # (tf.compat.v1.data), or something which is handled in the rename
+    # (tf.data). This transformer only handles the method call to function call
+    # conversion.
+    if full_name and (full_name.startswith("tf.compat.v1.data") or
+                      full_name.startswith("tf.data")):
+      return
 
-    make_initializable_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_initializable_iterator()` method has been "
-        "removed. If you are using the Estimator API, you can return a dataset "
-        "directly from your input functions without creating an iterator. "
-        "As a last resort, please replace calls to that method on `dataset` "
-        "with a call to "
-        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
-
-    make_one_shot_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
-        "removed. If you are using eager execution, you can iterate over "
-        "`dataset` using a Python `for` loop. If you are using the Estimator "
-        "API, you can return a dataset directly from your input functions "
-        "without creating an iterator. As a last resort, please replace calls "
-        "to that method on `dataset` with a call to "
-        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
-
-    # Specify warnings for functions that aren't restricted to the tf.x.y.z
-    # format. This should only be used for methods with unique names, e.g.
-    # export_savedmodel, which is only defined in Estimator objects.
-    self.unrestricted_function_warnings = {
-        "export_savedmodel": export_saved_model_renamed,
-        "make_initializable_iterator": make_initializable_iterator_deprecation,
-        "make_one_shot_iterator": make_one_shot_iterator_deprecation,
-    }
+    # This should never happen, since we're only called for Attribute nodes.
+    if not isinstance(node.func, ast.Attribute):
+      return
+
+    # Transform from x.f(y) to tf.compat.v1.data.f(x, y)
+    # Fortunately, node.func.value should already have valid position info
+    node.args = [node.func.value] + node.args
+    node.func.value = ast_edits.full_name_node("tf.compat.v1.data")
+
+    logs.append((ast_edits.WARNING, node.lineno, node.col_offset,
+                 "Changing dataset.%s() to tf.compat.v1.data.%s(dataset). "
+                 "Please check this transformation.\n" % (name, name)))
+
+    return node
 
   @staticmethod
-  def _dropout_handler(file_edit_recorder, node):
+  def _dropout_transformer(parent, node, full_name, name, logs):
+    def _replace_keep_prob_node(parent, old_value):
+      """Replaces old_value with 1-(old_value)."""
+      one = ast.Num(n=1)
+      one.lineno = 0
+      one.col_offset = 0
+      new_value = ast.BinOp(left=one, op=ast.Sub(),
+                            right=old_value)
+      # This copies the prefix and suffix on old_value to new_value.
+      pasta.ast_utils.replace_child(parent, old_value, new_value)
+      ast.copy_location(new_value, old_value)
+      # Put parentheses around keep_prob.value (and remove the old prefix/
+      # suffix, they should only be around new_value).
+      pasta.base.formatting.set(old_value, "prefix", "(")
+      pasta.base.formatting.set(old_value, "suffix", ")")
+
+    # Check if we have a keep_prob keyword arg
+    for keep_prob in node.keywords:
+      if keep_prob.arg == "keep_prob":
+        logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                     "Changing keep_prob arg of tf.nn.dropout to rate\n"))
+        keep_prob.arg = "rate"
+        _replace_keep_prob_node(keep_prob, keep_prob.value)
+        return node
+
+    # Maybe it was a positional arg
     if len(node.args) < 2:
-      comment = ("ERROR: tf.nn.dropout did not take arguments, so automatic "
-                 "transformation was disabled. tf.nn.dropout has changed "
-                 "the semantics of the second argument.")
-      file_edit_recorder.add(
-          comment,
-          node.lineno,
-          node.col_offset,
-          "tf.nn.dropout",
-          "tf.nn.dropout",
-          error="tf.nn.dropout requires manual check.")
+      logs.append((ast_edits.ERROR, node.lineno, node.col_offset,
+                   "tf.nn.dropout called without arguments, so "
+                   "automatic fix was disabled. tf.nn.dropout has changed "
+                   "the semantics of the second argument."))
     else:
-      comment = ("WARNING: tf.nn.dropout has changed the semantics of the "
-                 "second argument. Please check the transformation.\n")
-      file_edit_recorder.add(
-          comment,
-          node.args[1].lineno,
-          node.args[1].col_offset,
-          "",
-          "1 - ")
+      _replace_keep_prob_node(node, node.args[1])
+      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                   "Changing keep_prob arg of tf.nn.dropout to rate, and "
+                   "recomputing value.\n"))
+
+      return node
+
+  @staticmethod
+  def _cast_transformer(parent, node, full_name, name, logs):
+    """Transforms to_int and to_float to cast(..., dtype=...)."""
+
+    # Find out the dtype to cast to from the function name
+    dtype_str = name[3:]
+    # Special cases where the full dtype is not given
+    if dtype_str == "float":
+      dtype_str = "float32"
+    elif dtype_str == "double":
+      dtype_str = "float64"
+    new_arg = ast.keyword(arg="dtype",
+                          value=ast.Attribute(value=ast.Name(id="tf",
+                                                             ctx=ast.Load()),
+                                              attr=dtype_str, ctx=ast.Load()))
+    # Ensures a valid transformation when a positional name arg is given
+    if len(node.args) == 2:
+      name_arg = ast.keyword(arg="name",
+                             value=node.args[-1])
+      node.args = node.args[:-1]
+      node.keywords.append(name_arg)
+
+    # Python3 ast requires the args for the Attribute, but codegen will mess up
+    # the arg order if we just set them to 0.
+    new_arg.value.lineno = node.lineno
+    new_arg.value.col_offset = node.col_offset+100
+
+    node.keywords.append(new_arg)
+    if isinstance(node.func, ast.Attribute):
+      node.func.attr = "cast"
+    else:
+      assert isinstance(node.func, ast.Name)
+      node.func.id = "cast"
+
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Changed %s call to tf.cast(..., dtype=tf.%s)." % (full_name,
+                                                                    dtype_str)))
+    return node
 
   @staticmethod
-  def _colocate_handler(name):
-    def _helper(file_edit_recorder, node):
-      for keyword in node.keywords:
-        if keyword.arg == "colocate_gradients_with_ops":
-          # TODO(jhseu): Since ast_edit.py does string replacement, there's no
-          # straightforward way to remove the argument. Try to fix before 2.0 is
-          # final.
-          comment = ("For tf.gradients and tf.Optimizer.minimize, "
-                     "colocate_gradients_with_op has been removed and now "
-                     "defaults to True.")
-          file_edit_recorder.add(
-              comment,
-              node.lineno,
-              node.col_offset,
-              "",
-              "",
-              error="{} requires manual check.".format(name))
-    return _helper
+  def _softmax_cross_entropy_with_logits_transformer(
+      parent, node, full_name, name, logs):
+    def _wrap_label(parent, old_value):
+      """Wrap labels with tf.stop_gradient."""
+      if six.PY3:
+        new_value = ast.Call(
+            ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+            [old_value], [])
+      else:
+        new_value = ast.Call(
+            ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+            [old_value], [], None, None)
+
+      # This copies the prefix and suffix on old_value to new_value.
+      pasta.ast_utils.replace_child(parent, old_value, new_value)
+      ast.copy_location(new_value, old_value)
+
+    # Check if we have a labels keyword arg
+    for karg in node.keywords:
+      if karg.arg == "labels":
+        logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                     "Changing labels arg of "
+                     "tf.nn.softmax_cross_entropy_with_logits to "
+                     "tf.stop_gradient(labels). Please check this "
+                     "transformation.\n"))
+        _wrap_label(karg, karg.value)
+        return node
+    return node
+
+  @staticmethod
+  def _batch_gather_transformer(parent, node, full_name, name, logs):
+    # Check if the call already has a batch_dims argument
+    if any([kw.arg == "batch_dims" for kw in node.keywords]):
+      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                   "tf.batch_gather already has batch_dims argument. Neat."))
+      return None
+
+    minus_one = ast.Num(n=-1)
+    minus_one.lineno = 0
+    minus_one.col_offset = 0
+    new_arg = ast.keyword("batch_dims", minus_one)
+    node.keywords.append(new_arg)
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Added keyword argument batch_dims=-1 to tf.batch_gather."))
+    return node
+
+  @staticmethod
+  def _image_resize_transformer(parent, node, full_name, name, logs):
+    """Transforms image.resize_* to image.resize(..., method=*, ...)."""
+
+    resize_method = name[7:].upper()
+    new_arg = ast.keyword(arg="method",
+                          value=ast.Attribute(
+                              value=ast.Attribute(
+                                  value=ast.Attribute(
+                                      value=ast.Name(id="tf", ctx=ast.Load()),
+                                      attr="image", ctx=ast.Load()),
+                                  attr="ResizeMethod", ctx=ast.Load()),
+                              attr=resize_method, ctx=ast.Load()))
+
+    # Ensures a valid transformation when a positional name arg is given
+    if len(node.args) == 4:
+      pos_arg = ast.keyword(arg="preserve_aspect_ratio",
+                            value=node.args[-1])
+      node.args = node.args[:-1]
+      node.keywords.append(pos_arg)
+    if len(node.args) == 3:
+      pos_arg = ast.keyword(arg="align_corners",
+                            value=node.args[-1])
+      node.args = node.args[:-1]
+      node.keywords.append(pos_arg)
+
+    # Python3 ast requires the args for the Attribute, but codegen will mess up
+    # the arg order if we just set them to 0.
+    new_arg.value.lineno = node.lineno
+    new_arg.value.col_offset = node.col_offset+100
+
+    node.keywords.append(new_arg)
+    if isinstance(node.func, ast.Attribute):
+      node.func.attr = "resize"
+    else:
+      assert isinstance(node.func, ast.Name)
+      node.func.id = "resize"
+
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Changed %s call to tf.image.resize(..., "
+                 "method=tf.image.ResizeMethod.%s)." % (full_name,
+                                                        resize_method)))
+    return node
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
index 543d0786423f5b3f9bc59895c1325d19b6241cf7..b446452cfe382eefa4c1e900765c76361cf08e29 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -59,6 +59,14 @@ Simple usage:
             "copy the other files."),
       type=bool,
       default=True)
+  parser.add_argument(
+      "--inplace",
+      dest="in_place",
+      help=("If converting a whole tree of files, whether to "
+            "allow the conversion to be performed on the "
+            "files in the input tree."),
+      type=bool,
+      default=False)
   parser.add_argument(
       "--reportfile",
       dest="report_filename",
@@ -79,6 +87,7 @@ Simple usage:
           "single file.")
     files_processed, report_text, errors = upgrade.process_file(
         args.input_file, args.output_file)
+    errors = {args.input_file: errors}
     files_processed = 1
   elif args.input_tree:
     if not args.output_tree:
@@ -86,19 +95,34 @@ Simple usage:
           "--outtree=<output directory> argument is required when converting a "
           "file tree.")
     files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree, args.copy_other_files)
+        args.input_tree, args.output_tree, args.copy_other_files, args.in_place)
   else:
     parser.print_help()
   if report_text:
-    open(report_filename, "w").write(report_text)
-    print("TensorFlow 2.0 Upgrade Script")
-    print("-----------------------------")
-    print("Converted %d files\n" % files_processed)
-    print("Detected %d errors that require attention" % len(errors))
-    print("-" * 80)
-    print("\n".join(errors))
-    print("\nMake sure to read the detailed log %r\n" % report_filename)
+    num_errors = 0
+    report = []
+    for f in errors:
+      if errors[f]:
+        num_errors += len(errors[f])
+        report.append("-" * 80 + "\n")
+        report.append("File: %s\n" % f)
+        report.append("-" * 80 + "\n")
+        report.append("\n".join(errors[f]) + "\n")
+
+    report = ("TensorFlow 2.0 Upgrade Script\n"
+              "-----------------------------\n"
+              "Converted %d files\n" % files_processed +
+              "Detected %d errors that require attention" % num_errors + "\n" +
+              "-" * 80 + "\n") + "".join(report)
+    with open(report_filename, "w") as report_file:
+      report_file.write(report)
+      report_file.write("=" * 80 + "\n")
+      report_file.write("Detailed log follows:\n\n")
+      report_file.write("=" * 80 + "\n")
+      report_file.write(report_text)
 
+    print(report)
+    print("\nMake sure to read the detailed log %r\n" % report_filename)
 
 if __name__ == "__main__":
   main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 270d93065ecaf8ed450fe55cff4c6daf67ab8fcd..8ff5d01ae6d3fabeb7c7cd88115ad41e38df4b00 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -114,12 +114,12 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     self.assertTrue(report.find("Failed to parse") != -1)
 
   def testReport(self):
-    text = "tf.assert_near(a)\n"
+    text = "tf.angle(a)\n"
     _, report, unused_errors, unused_new_text = self._upgrade(text)
     # This is not a complete test, but it is a sanity test that a report
     # is generating information.
-    self.assertTrue(report.find("Renamed function `tf.assert_near` to "
-                                "`tf.debugging.assert_near`"))
+    self.assertTrue(report.find("Renamed function `tf.angle` to "
+                                "`tf.math.angle`"))
 
   def testRename(self):
     text = "tf.conj(a)\n"
@@ -239,8 +239,8 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     }
     function_warnings = (
         tf_upgrade_v2.TFAPIChangeSpec().function_warnings)
-    function_handles = (
-        tf_upgrade_v2.TFAPIChangeSpec().function_handle)
+    function_transformers = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_transformers)
     keyword_renames = (
         tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
 
@@ -255,7 +255,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
 
         for name in names_v1:
           tf_name = "tf.%s" % name
-          if tf_name in function_warnings or tf_name in function_handles:
+          if tf_name in function_warnings or tf_name in function_transformers:
             continue  # These require manual change
           if tf_name in v1_name_exceptions:
             continue
@@ -293,6 +293,24 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     visitor.private_map["tf.compat"] = ["v1", "v2"]
     traverse.traverse(tf.compat.v1, visitor)
 
+  def testPositionsMatchArgGiven(self):
+    full_dict = tf_upgrade_v2.TFAPIChangeSpec().function_arg_warnings
+    method_names = full_dict.keys()
+    for method_name in method_names:
+      args = full_dict[method_name].keys()
+      # special case for optimizer methods
+      if method_name.startswith("*."):
+        method = method_name.replace("*", "tf.train.Optimizer")
+      else:
+        method = method_name
+      method = get_symbol_for_name(tf, method)
+      arg_spec = tf_inspect.getfullargspec(method)
+      for (arg, pos) in args:
+        # to deal with the self argument on methods on objects
+        if method_name.startswith("*."):
+          pos += 1
+        self.assertEqual(arg_spec[0][pos], arg)
+
   def testReorderFileNeedsUpdate(self):
     reordered_function_names = (
         tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
@@ -362,17 +380,95 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
       text = "%s(a, b)\n" % decay
       _, report, errors, _ = self._upgrade(text)
-      self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
+      self.assertIn("%s requires manual check" % decay, errors[0])
       self.assertIn("%s has been changed" % decay, report)
 
   def testPiecewiseDecay(self):
     text = "tf.train.piecewise_constant_decay(a, b)\n"
     _, report, errors, _ = self._upgrade(text)
-    self.assertEqual(
-        errors,
-        ["test.py:1: tf.train.piecewise_constant_decay requires manual check."])
+    self.assertIn("tf.train.piecewise_constant_decay requires manual check",
+                  errors[0])
     self.assertIn("tf.train.piecewise_constant_decay has been changed", report)
 
+  def testMetrics(self):
+    metrics = [
+        "accuracy",
+        "auc",
+        "average_precision_at_k",
+        "false_negatives",
+        "false_negatives_at_thresholds",
+        "false_positives",
+        "false_positives_at_thresholds",
+        "mean",
+        "mean_absolute_error",
+        "mean_cosine_distance",
+        "mean_iou",
+        "mean_per_class_accuracy",
+        "mean_relative_error",
+        "mean_squared_error",
+        "mean_tensor",
+        "percentage_below",
+        "precision",
+        "precision_at_k",
+        "precision_at_thresholds",
+        "precision_at_top_k",
+        "recall",
+        "recall_at_k",
+        "recall_at_thresholds",
+        "recall_at_top_k",
+        "root_mean_squared_error",
+        "sensitivity_at_specificity",
+        "sparse_average_precision_at_k",
+        "sparse_precision_at_k",
+        "specificity_at_sensitivity",
+        "true_negatives",
+        "true_negatives_at_thresholds",
+        "true_positives",
+        "true_positives_at_thresholds",
+    ]
+    for m in metrics:
+      ns = "tf.metrics." + m
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1.metrics." + m + "(a, b)", new_text)
+      self.assertIn("%s requires manual check" % ns, errors[0])
+      self.assertIn(
+          "tf.metrics have been converted to object oriented"
+          " versions in TF 2.0 and after. The metric function calls have been "
+          "converted to compat.v1 for backward compatibility. Please update "
+          "these calls to the TF 2.0 versions.", report)
+
+  def testLosses(self):
+    losses = [
+        "absolute_difference",
+        "add_loss",
+        "compute_weighted_loss",
+        "cosine_distance",
+        "get_losses",
+        "get_regularization_loss",
+        "get_regularization_losses",
+        "get_total_loss",
+        "hinge_loss",
+        "huber_loss",
+        "log_loss",
+        "mean_pairwise_squared_error",
+        "mean_squared_error",
+        "sigmoid_cross_entropy",
+        "softmax_cross_entropy",
+        "sparse_softmax_cross_entropy",
+    ]
+    for l in losses:
+      ns = "tf.losses." + l
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1.losses." + l + "(a, b)", new_text)
+      self.assertIn("%s requires manual check" % ns, errors[0])
+      self.assertIn(
+          "tf.losses have been converted to object oriented"
+          " versions in TF 2.0 and after. The loss function calls have been "
+          "converted to compat.v1 for backward compatibility. Please update "
+          "these calls to the TF 2.0 versions.", report)
+
   def testEstimatorLossReductionChange(self):
     classes = [
         "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
@@ -384,7 +480,7 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
       text = ns + "(a, b)"
       _, report, errors, new_text = self._upgrade(text)
       self.assertEqual(text, new_text)
-      self.assertEqual(errors, ["test.py:1: %s requires manual check." % ns])
+      self.assertIn("%s requires manual check" % ns, errors[0])
       self.assertIn("loss_reduction has been changed", report)
 
   def testDropout(self):
@@ -392,18 +488,43 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(
         new_text,
-        "tf.nn.dropout(x, 1 - keep_prob, name=\"foo\")\n",
+        "tf.nn.dropout(x, 1 - (keep_prob), name=\"foo\")\n",
+    )
+
+    text = "tf.nn.dropout(x, keep_prob=.4, name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x, rate=1 - (.4), name=\"foo\")\n",
+    )
+
+    text = (
+        "tf.nn.dropout(x,  # Stuff before\n"
+        "              keep_prob=.4,  # Stuff after\n"
+        "              name=\"foo\")\n"
+    )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x,  # Stuff before\n"
+        "              rate=1 - (.4),  # Stuff after\n"
+        "              name=\"foo\")\n",
     )
 
     text = "tf.nn.dropout(x)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, text)
+    self.assertIn("tf.nn.dropout called without arguments", errors[0])
+
+  def testDropoutExpr(self):
+    text = "tf.nn.dropout(x, 1 - func(3 + 4.), name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(
-        errors,
-        ["test.py:1: tf.nn.dropout requires manual check."]
+        new_text,
+        "tf.nn.dropout(x, 1 - (1 - func(3 + 4.)), name=\"foo\")\n",
     )
 
-  def testCountNonZeroChanges(self):
+  def testMathCountNonZeroChanges(self):
     text = (
         "tf.math.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
         "reduction_indices=axis, keep_dims=keepdims)\n"
@@ -415,6 +536,18 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         )
     self.assertEqual(new_text, expected_text)
 
+  def testCountNonZeroChanges(self):
+    text = (
+        "tf.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
+        "reduction_indices=axis, keep_dims=keepdims)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.math.count_nonzero(input=input, dtype=dtype, name=name, "
+        "axis=axis, keepdims=keepdims)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
   def testRandomMultinomialToRandomCategorical(self):
     text = (
         "tf.random.multinomial(logits, samples, seed, name, output_dtype)\n"
@@ -436,6 +569,15 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         )
     self.assertEqual(new_text, expected_text)
 
+  def testRandomPoissonConversion(self):
+    text1 = "tf.random_poisson(lam, shape, dtype)"
+    text2 = "tf.random.poisson(lam, shape, dtype)"
+    expected_text = "tf.random.poisson(lam=lam, shape=shape, dtype=dtype)"
+    _, unused_report, unused_errors, new_text1 = self._upgrade(text1)
+    self.assertEqual(new_text1, expected_text)
+    _, unused_report, unused_errors, new_text2 = self._upgrade(text2)
+    self.assertEqual(new_text2, expected_text)
+
   def testConvolutionOpUpdate(self):
     text = (
         "tf.nn.convolution(input, filter, padding, strides, dilation_rate, "
@@ -464,9 +606,11 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
     text = "tf.gradients(a, colocate_gradients_with_ops=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors, ["test.py:1: tf.gradients requires manual check."])
+    self.assertEqual("tf.gradients(a)\n", new_text)
+    self.assertIn("tf.gradients", errors[0])
+    self.assertIn("requires manual check", errors[0])
 
+  def testColocateGradientsWithOpsMinimize(self):
     text = "optimizer.minimize(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
@@ -474,10 +618,11 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
     text = "optimizer.minimize(a, colocate_gradients_with_ops=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors,
-                     ["test.py:1: Optimizer.minimize requires manual check."])
+    self.assertEqual("optimizer.minimize(a)\n", new_text)
+    self.assertIn("requires manual check", errors[0])
+    self.assertIn("minimize", errors[0])
 
+  def testColocateGradientsWithOpsComputeGradients(self):
     text = "optimizer.compute_gradients(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
@@ -485,10 +630,9 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
 
     text = "optimizer.compute_gradients(a, colocate_gradients_with_ops=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors,
-                     ["test.py:1: Optimizer.compute_gradients "
-                      "requires manual check."])
+    self.assertEqual("optimizer.compute_gradients(a)\n", new_text)
+    self.assertIn("requires manual check", errors[0])
+    self.assertIn("compute_gradients", errors[0])
 
   def testExportSavedModelRename(self):
     text = "self.est.export_savedmodel(path)"
@@ -568,6 +712,16 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testKerasSavedModel(self):
+    text = (
+        "tf.contrib.saved_model.save_keras_model(model, './saved_models')\n"
+        "tf.contrib.saved_model.load_keras_model(saved_model_path)\n")
+    expected_text = (
+        "tf.keras.experimental.export(model, './saved_models')\n"
+        "tf.keras.experimental.load_from_saved_model(saved_model_path)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testStatelessMultinomial(self):
     text = (
         "tf.random.stateless_multinomial(logits, num_samples, seed, "
@@ -579,26 +733,32 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     self.assertEqual(new_text, expected_text)
 
   def testSoftMaxCrossEntropyWithLogitsV2(self):
-    text = "tf.nn.softmax_cross_entropy_with_logits_v2(labels, logits, dim=2)"
+    text = (
+        "tf.nn.softmax_cross_entropy_with_logits_v2("
+        "labels=labels, logits=logits, dim=2)")
     expected_text = (
-        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, axis=2)")
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=labels, logits=logits, axis=2)")
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
     self.assertFalse(errors)
 
   def testSoftMaxCrossEntropyWithLogits(self):
-    text = "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)"
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=labels, logits=logits, dim=2)")
     expected_text = (
-        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)")
-    _, report, errors, new_text = self._upgrade(text)
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=tf.stop_gradient(labels), logits=logits, axis=2)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
-    self.assertIn(
-        "tf.nn.softmax_cross_entropy_with_logits requires manual check.",
-        errors[0])
-    self.assertIn(
-        "tf.nn.softmax_cross_entropy_with_logits behavior has changed. ",
-        report)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo(bar))")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo(bar)))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
 
   def testSparseMatmul(self):
     text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
@@ -736,6 +896,163 @@ tf.print('abc')
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testBatchGather(self):
+    text = "tf.batch_gather(foo, bar)"
+    expected_text1 = "tf.gather(params=foo, indices=bar, batch_dims=-1)"
+    expected_text2 = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertIn(new_text, [expected_text1, expected_text2])
+
+    text = "tf.batch_gather(params=foo, indices=bar)"
+    expected_text1 = "tf.gather(params=foo, indices=bar, batch_dims=-1)"
+    expected_text2 = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertIn(new_text, [expected_text1, expected_text2])
+
+  def testIterators(self):
+    for (text, expected) in [
+        ("(expr + yielding(data)).make_one_shot_iterator()",
+         "tf.compat.v1.data.make_one_shot_iterator((expr + yielding(data)))"),
+        ("dataset.make_one_shot_iterator()",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("dataset.make_one_shot_iterator(shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("dataset.make_one_shot_iterator(x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("dataset.make_initializable_iterator()",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("ds.make_initializable_iterator(shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("dataset.make_initializable_iterator(x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)"),
+        ("tf.data.make_one_shot_iterator(dataset)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("tf.data.make_one_shot_iterator(dataset, shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("tf.data.make_one_shot_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("tf.data.make_initializable_iterator(dataset)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("tf.data.make_initializable_iterator(ds, shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("tf.data.make_initializable_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("tf.compat.v1.data.make_initializable_iterator(dataset)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)")]:
+      _, unused_report, unused_errors, actual = self._upgrade(text)
+      self.assertEqual(actual, expected)
+
+  def testCast(self):
+    for (name, dtype) in [("int32", "int32"),
+                          ("int64", "int64"),
+                          ("float", "float32"),
+                          ("double", "float64"),
+                          ("complex64", "complex64"),
+                          ("complex128", "complex128"),
+                          ("bfloat16", "bfloat16")]:
+      text = "tf.to_%s(x, name='test')" % name
+      expected_text = "tf.cast(x, name='test', dtype=tf.%s)" % dtype
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testCastPositionalSecondArgument(self):
+    for (name, dtype) in [("int32", "int32"),
+                          ("int64", "int64"),
+                          ("float", "float32"),
+                          ("double", "float64"),
+                          ("complex64", "complex64"),
+                          ("complex128", "complex128"),
+                          ("bfloat16", "bfloat16")]:
+      text = "tf.to_%s(x, 'test')" % name
+      expected_text = "tf.cast(x, name='test', dtype=tf.%s)" % dtype
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testImageResize(self):
+    for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
+      text = "tf.image.resize_%s(i, s)" % method
+      expected_text = ("tf.image.resize(i, s, "
+                       "method=tf.image.ResizeMethod.%s)" % method.upper())
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testImageResizeExtraPositionalArgs(self):
+    for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
+      text = "tf.image.resize_%s(i, s, a, p)" % method
+      expected_text = ["tf.image.resize(i, s, ", "align_corners=a, ",
+                       "preserve_aspect_ratio=p, ",
+                       "method=tf.image.ResizeMethod.%s)" % method.upper()]
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      for s in expected_text:
+        self.assertIn(s, new_text)
+
+  def testCond(self):
+    text = "tf.cond(a, b, c, True)"
+    expected_text = "tf.cond(pred=a, true_fn=b, false_fn=c)"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+    self.assertIn("tf.cond", errors[0])
+    self.assertIn("requires manual check", errors[0])
+
+  def testParens(self):
+    text = """
+def _log_prob(self, x):
+  return tf.reduce_logsumexp(
+      (self.mixture_distribution.logits + self.distribution.log_prob(
+          x[..., tf.newaxis])),
+          axis=-1)"""
+    expected_text = """
+def _log_prob(self, x):
+  return tf.reduce_logsumexp(
+      input_tensor=(self.mixture_distribution.logits + self.distribution.log_prob(
+          x[..., tf.newaxis])),
+          axis=-1)"""
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def testAssertStatements(self):
+    for name in ["assert_greater", "assert_equal", "assert_none_equal",
+                 "assert_less", "assert_negative", "assert_positive",
+                 "assert_non_negative", "assert_non_positive", "assert_near",
+                 "assert_less", "assert_less_equal", "assert_greater",
+                 "assert_greater_equal", "assert_integer", "assert_type",
+                 "assert_scalar"]:
+      text = "tf.%s(a)" % name
+      expected_text = "tf.compat.v1.%s(a)" % name
+      _, unused_report, errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("assert_* functions", errors[0])
+
+      text = "tf.debugging.%s(a)" % name
+      expected_text = "tf.compat.v1.debugging.%s(a)" % name
+      _, unused_report, errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("assert_* functions", errors[0])
+
+  def testAssertRankStatements(self):
+    for name in ["assert_rank", "assert_rank_at_least", "assert_rank_in"]:
+      text = "tf.%s(a)" % name
+      expected_text = "tf.compat.v1.%s(a)" % name
+      _, unused_report, errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("assert_rank_* functions", errors[0])
+
+      text = "tf.debugging.%s(a)" % name
+      expected_text = "tf.compat.v1.debugging.%s(a)" % name
+      _, unused_report, errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("assert_rank_* functions", errors[0])
+
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
@@ -754,4 +1071,3 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
 if __name__ == "__main__":
   test_lib.main()
-
diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md
index 1e29977788176477492a03c4683cc489ec9fae44..6e7769b42aef040b93e1ed17f89417604a56c863 100644
--- a/tensorflow/tools/dist_test/README.md
+++ b/tensorflow/tools/dist_test/README.md
@@ -12,9 +12,8 @@ For example:
 
     ./local_test.sh
 
-By default, local_test.sh runs the MNIST-with-replicas model as a test.
-However, you can use the --model_name flag to run the tf-learn/wide&deep
-cesnsu model:
+By default, local_test.sh runs the MNIST-with-replicas model as a test. However,
+you can use the --model_name flag to run the tf-learn/wide&deep census model:
 
     ./local_test.sh --model_name CENSUS_WIDENDEEP
 
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
index c570d1a9f834bd9df57df62088a0c4562be9512c..038a712d538fbaeb8d0d176287704993cff07799 100644
--- a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
+++ b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
@@ -195,7 +195,7 @@ def generate_RSA(bits=2048, exponent=65537):
 
 def get_change_ssh_port(use_hostnet, port):
   if use_hostnet == 1:
-    return "sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
+    return r"sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
 
   return ''
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1ad359ddccc71201553803140fa4efca06fbb5e1..e085ee7170c83729cb103811d5e2ba45e3d8cb96 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -15,8 +15,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
@@ -41,11 +39,6 @@ RUN apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
         apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
 
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
@@ -111,9 +104,6 @@ ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
 ENV TF_CUDA_VERSION=9.0
 ENV TF_CUDNN_VERSION=7
 
-# NCCL 2.x
-ENV TF_NCCL_VERSION=2
-
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
     tensorflow/tools/ci_build/builds/configured GPU \
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
index b42dd9fc0cda842def86af5be953002e11a1959a..50b0cc5870ecccc216336fe5cdcbdd1a2a0e94b0 100644
--- a/tensorflow/tools/dockerfiles/README.md
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -29,12 +29,13 @@ in the Dockerfile itself.
 After building the image with the tag `tf` (for example), use `docker run` to
 run the images.
 
-Note for new Docker users: the `-v` and `-u` flags share directories between
-the Docker container and your machine, and very important. Without
-`-v`, your work will be wiped once the container quits, and without `-u`, files
-created by the container will have the wrong file permissions on your host
-machine. If you are confused, check out the [Docker run
-documentation](https://docs.docker.com/engine/reference/run/).
+Note for new Docker users: the `-v` and `-u` flags share directories and
+permissions between the Docker container and your machine. Without `-v`, your
+work will be wiped once the container quits, and without `-u`, files created by
+the container will have the wrong file permissions on your host machine. Check
+out the
+[Docker run documentation](https://docs.docker.com/engine/reference/run/) for
+more info.
 
 ```bash
 # Volume mount (-v) is optional but highly recommended, especially for Jupyter.
@@ -83,7 +84,7 @@ $ alias asm_images="docker run --rm -v $(pwd):/tf -v /var/run/docker.sock:/var/r
 # If you're REBUILDING OR ADDING DOCKERFILES, remove docker.sock and add -u:
 $ alias asm_dockerfiles="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/tf tf-tools python3 assembler.py "
 
-# Check flags
+# Check assembler flags
 $ asm_dockerfiles --help
 
 # Assemble all of the Dockerfiles
@@ -92,6 +93,12 @@ $ asm_dockerfiles --release dockerfiles --construct_dockerfiles
 # Build all of the "nightly" images on your local machine:
 $ asm_images --release nightly --build_images
 
+# Save the list of built images to a file:
+$ asm_images --release nightly --build_images > tf-built.txt
+
 # Build version release for version 99.0, except "gpu" tags:
-$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '*.gpu.*'
+$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '.*gpu.*'
+
+# Test your changes to the devel images:
+$ asm_images --release nightly --build_images --run_tests_path=$(realpath tests) --only_tags_matching="^devel-gpu-py3$"
 ```
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 67a0320241d273bbb7a2439b2e09723905db0765..09537b7314491819d06d3bfda2f2446c5af93067 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -18,6 +18,9 @@
 - Builds images (and optionally runs image tests)
 - Pushes images to Docker Hub (provided with credentials)
 
+Logs are written to stderr; the list of successfully built images is
+written to stdout.
+
 Read README.md (in this directory) for instructions!
 """
 
@@ -49,7 +52,7 @@ flags.DEFINE_string('hub_username', None,
 flags.DEFINE_string(
     'hub_password', None,
     ('Dockerhub password, only used with --upload_to_hub. Use from an env param'
-     'so your password isn\'t in your history.'))
+     ' so your password isn\'t in your history.'))
 
 flags.DEFINE_integer('hub_timeout', 3600,
                      'Abort Hub upload if it takes longer than this.')
@@ -142,6 +145,10 @@ flags.DEFINE_multi_string(
      'args will print a warning).'),
     short_name='a')
 
+flags.DEFINE_boolean(
+    'nocache', False,
+    'Disable the Docker build cache; identical to "docker build --no-cache"')
+
 flags.DEFINE_string(
     'spec_file',
     './spec.yml',
@@ -513,6 +520,7 @@ def main(argv):
   # Each tag has a name ('tag') and a definition consisting of the contents
   # of its Dockerfile, its build arg list, etc.
   failed_tags = []
+  succeeded_tags = []
   for tag, tag_defs in all_tags.items():
     for tag_def in tag_defs:
       eprint('> Working on {}'.format(tag))
@@ -569,6 +577,7 @@ def main(argv):
           image, logs = dock.images.build(
               timeout=FLAGS.hub_timeout,
               path='.',
+              nocache=FLAGS.nocache,
               dockerfile=dockerfile,
               buildargs=tag_def['cli_args'],
               tag=repo_tag)
@@ -656,12 +665,20 @@ def main(argv):
               args=(FLAGS.hub_repository, dock, image, tag))
           p.start()
 
+      if not tag_failed:
+        succeeded_tags.append(tag)
+
   if failed_tags:
     eprint(
         '> Some tags failed to build or failed testing, check scrollback for '
         'errors: {}'.format(','.join(failed_tags)))
     exit(1)
 
+  eprint('> Writing built{} tags to standard out.'.format(
+      ' and tested' if FLAGS.run_tests_path else ''))
+  for tag in succeeded_tags:
+    print('{}:{}'.format(FLAGS.repository, tag))
+
 
 if __name__ == '__main__':
   app.run(main)
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index b072853a4ec298ce5c15afc1307a966ecefb743f..cc106b5955ba07f4f166638ba51699060788e6ae 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -81,7 +81,7 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
-py_binary(
+py_library(
     name = "generate_lib",
     srcs = ["generate_lib.py"],
     srcs_version = "PY2AND3",
@@ -155,7 +155,7 @@ py_test(
         "optonly",
     ],
     deps = [
-        ":generate2",
+        ":generate2_lib",
     ],
 )
 
@@ -163,7 +163,17 @@ py_binary(
     name = "generate2",
     srcs = ["generate2.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = [":generate2_lib"],
+)
+
+py_library(
+    name = "generate2_lib",
+    srcs = ["generate2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:util",
+    ],
 )
 
 py_library(
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
index fba909d26defffad2d7dbaffa4463695685ae50c..0a50eb6c2392b37932705b6481055d49d66417b2 100644
--- a/tensorflow/tools/docs/generate2.py
+++ b/tensorflow/tools/docs/generate2.py
@@ -31,10 +31,22 @@ from os import path
 
 from absl import app
 from absl import flags
-
 import tensorflow as tf
 
+from tensorflow_docs.api_generator import doc_generator_visitor
 from tensorflow_docs.api_generator import generate_lib
+from tensorflow_docs.api_generator import parser
+
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# Use tensorflow's `tf_inspect`, which is aware of `tf_decorator`.
+parser.tf_inspect = tf_inspect
+
+# `tf` has an `__all__` that doesn't list important things like `keras`.
+# The doc generator recognizes `__all__` as the list of public symbols.
+# So patch `tf.__all__` to list everything.
+tf.__all__ = [item_name for item_name, value in tf_inspect.getmembers(tf)]
 
 FLAGS = flags.FLAGS
 
@@ -50,6 +62,28 @@ flags.DEFINE_string(
 flags.DEFINE_bool("search_hints", True,
                   "Include meta-data search hints at the top of each file.")
 
+flags.DEFINE_string("site_path", "",
+                    "The prefix ({site-path}/api_docs/python/...) used in the "
+                    "`_toc.yaml` and `_redirects.yaml` files")
+
+
+# The doc generator isn't aware of tf_export.
+# So prefix the score tuples with -1 when this is the canonical name, +1
+# otherwise. The generator chooses the name with the lowest score.
+class TfExportAwareDocGeneratorVisitor(
+    doc_generator_visitor.DocGeneratorVisitor):
+  """A `tf_export` aware doc_visitor."""
+
+  def _score_name(self, name):
+    canonical = tf_export.get_canonical_name_for_symbol(self._index[name])
+
+    canonical_score = 1
+    if canonical is not None and name == "tf." + canonical:
+      canonical_score = -1
+
+    scores = super(TfExportAwareDocGeneratorVisitor, self)._score_name(name)
+    return (canonical_score,) + scores
+
 
 def build_docs(output_dir, code_url_prefix, search_hints=True):
   """Build api docs for tensorflow v2.
@@ -66,7 +100,8 @@ def build_docs(output_dir, code_url_prefix, search_hints=True):
       base_dir=base_dir,
       search_hints=search_hints,
       code_url_prefix=code_url_prefix,
-      site_path="api_docs/")
+      site_path=FLAGS.site_path,
+      visitor_cls=TfExportAwareDocGeneratorVisitor)
 
   doc_generator.build(output_dir)
 
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 8e7cd9b10415740a554445edbb634706dd97857c..c2449da9239df74eac5c6b1cd91df666e170a108 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -29,8 +29,8 @@ from __future__ import print_function
 import argparse
 import json
 import os
-import subprocess
 import shutil
+import subprocess
 
 
 def parse_branch_ref(filename):
@@ -159,12 +159,14 @@ def get_git_version(git_base_path, git_tag_override):
   """
   unknown_label = b"unknown"
   try:
+    # Force to bytes so this works on python 2 and python 3
     val = bytes(subprocess.check_output([
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    version_separator = b"-"
     if git_tag_override and val:
-      split_val = val.split("-")
+      split_val = val.split(version_separator)
       if len(split_val) < 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
@@ -172,8 +174,8 @@ def get_git_version(git_base_path, git_tag_override):
       # There might be "-" in the tag name. But we can be sure that the final
       # two "-" are those inserted by the git describe command.
       abbrev_commit = split_val[-1]
-      val = bytes(
-          "-".join([git_tag_override, "0", abbrev_commit]))
+      val = version_separator.join(
+          [bytes(git_tag_override, "utf-8"), b"0", abbrev_commit])
     return val if val else unknown_label
   except (subprocess.CalledProcessError, OSError):
     return unknown_label
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index eb1ed1f2ca859df42809084c1ea47a6f3b21012e..41ed31a5c1b282a962f38df83392d5d9f1d6c90e 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -12,6 +12,7 @@ load(
     "tf_cc_binary",
     "tf_cc_test",
     "tf_py_test",
+    "if_not_v2",
 )
 
 exports_files(["LICENSE"])
@@ -131,12 +132,26 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
-        "//tensorflow/contrib/rnn:gru_ops_op_lib",
-        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
         "//tensorflow/core/kernels:quantization_utils",
     ] + if_not_windows([
         "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
         "//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
+        "//tensorflow/core:user_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
+        "//tensorflow/core:string_ops_op_lib",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:manip_ops_op_lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:candidate_sampling_ops_op_lib",
+        "//tensorflow/core:array_ops_op_lib",
+    ]) + if_not_v2([
+        "//tensorflow/contrib/rnn:gru_ops_op_lib",
+        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
     ]),
     alwayslink = 1,
 )
@@ -173,6 +188,7 @@ tf_cc_test(
         ":transforms_lib",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 435f46c107cd9b0a6d64d4c0d52607ec5f41eb4f..6c7174926d06460556ce673a5fe738901134543d 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -291,7 +291,7 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     std::vector<Tensor> fused_outputs;
     TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
 
-    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+    test::ExpectClose(original_outputs[0], fused_outputs[0]);
 
     for (const NodeDef& node : fused_graph_def.node()) {
       EXPECT_NE("FusedBatchNorm", node.op());
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 1186189844aa887ba011b532df3a73d89ffe52b8..86bd5107924ec4627b955264b179a06231ef8532 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -162,6 +162,7 @@ genrule(
         "//conditions:default": [],
     }) + if_cuda([
         "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
@@ -232,6 +233,7 @@ genrule(
         "//conditions:default": [],
     }) + if_cuda([
         "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
diff --git a/tensorflow/tools/optimization/BUILD b/tensorflow/tools/optimization/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..aa6c850b0b3abb3351e3225e0c3a66ab4272846e
--- /dev/null
+++ b/tensorflow/tools/optimization/BUILD
@@ -0,0 +1,52 @@
+# Description:
+#   Utilities that perform useful transformations on graphs
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cuda_library",
+)
+
+exports_files(["LICENSE"])
+
+tf_cuda_library(
+    name = "optimization_pass_runner_lib",
+    srcs = ["optimization_pass_runner.cc"],
+    hdrs = ["optimization_pass_runner.h"],
+    deps = [
+        "//tensorflow/contrib:contrib_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+tf_cc_binary(
+    name = "gpu_optimization_pass_runner",
+    srcs = ["gpu_optimization_pass_runner_main.cc"],
+    deps = [
+        ":optimization_pass_runner_lib",
+        "//tensorflow/compiler/jit:xla_cpu_jit",
+        "//tensorflow/compiler/jit:xla_gpu_jit",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/contrib:contrib_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d9f26cd5a42f7315cc1d074e8b6ec19caa75f30
--- /dev/null
+++ b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file creates a binary that can run any registered optimization pass.
+// ./xla_gpu_opt  --input_file_path=/tmp/input.pbtxt
+// --output_file_path=/tmp/output.pbtxt
+// --optimization_pass=NameOfGraphOptimizationPass
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/optimization/optimization_pass_runner.h"
+
+int main(int argc, char** argv) {
+  tensorflow::OptimizationPassRunner runner;
+  // Add fake devices for CPU, GPU, and XLA to ensure we have all devices we
+  // need.
+  // Most machines in our servers currently use 8 gpus. There is nothing special
+  // about this number and it can be decreased or increased to test other
+  // configurations.
+  int num_gpus_per_machine = 8;
+  for (int i = 0; i < num_gpus_per_machine; i++) {
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i),
+        tensorflow::DEVICE_CPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i),
+        tensorflow::DEVICE_GPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_CPU:", i),
+        tensorflow::DEVICE_XLA_CPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_GPU:", i),
+        tensorflow::DEVICE_XLA_GPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU_XLA_JIT:", i),
+        tensorflow::DEVICE_CPU_XLA_JIT));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU_XLA_JIT:", i),
+        tensorflow::DEVICE_GPU_XLA_JIT));
+  }
+  // This binary is used to test TF:XLA behavior, so turn on auto_jit.
+  TF_CHECK_OK(runner.SetJitLevel(tensorflow::OptimizerOptions::GlobalJitLevel::
+                                     OptimizerOptions_GlobalJitLevel_ON_2));
+  // Run the actual "main" function.
+  TF_CHECK_OK(runner.RunMain(argc, argv));
+}
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.cc b/tensorflow/tools/optimization/optimization_pass_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..231ff083813870819c23729e4308e0215661afcd
--- /dev/null
+++ b/tensorflow/tools/optimization/optimization_pass_runner.cc
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file creates a library that can run any registered optimization pass.
+// The binary that uses this will be run in a form similar to:
+// ./optimization_pass_runner  --input_file_path=/tmp/input.pbtxt
+// --output_file_path=/tmp/output.pbtxt
+// --optimization_pass=NameOfGraphOptimizationPass
+#include "tensorflow/tools/optimization/optimization_pass_runner.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+
+namespace {
+// A fake device used to populate a DeviceSet.
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override;
+  static std::unique_ptr<Device> Make(const string& name, const string& type);
+};
+
+Status FakeDevice::Sync() {
+  return errors::Unimplemented("FakeDevice::Sync()");
+}
+
+std::unique_ptr<Device> FakeDevice::Make(const string& name,
+                                         const string& type) {
+  DeviceAttributes device_attributes;
+  device_attributes.set_name(name);
+  device_attributes.set_device_type(DeviceType(type).type());
+  return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+}
+}  // namespace
+
+Status OptimizationPassRunner::RunMain(int argc, char** argv) {
+  string input_file_path;
+  string output_file_path;
+  string optimization_pass;
+
+  const std::vector<Flag> flag_list = {
+      Flag("input_file_path", &input_file_path, "Location of the input graph."),
+      Flag("output_file_path", &output_file_path,
+           "Location to write the resulting graph."),
+      // For now only a single optimization pass can be run.
+      Flag("optimization_pass", &optimization_pass,
+           "Which optimization pass to run."),
+  };
+  if (!Flags::Parse(&argc, argv, flag_list)) {
+    return errors::FailedPrecondition("Invalid flags passed");
+  }
+  port::InitMain(argv[0], &argc, &argv);
+
+  if (input_file_path.empty()) {
+    return errors::FailedPrecondition("input_file_path is a required flag.");
+  }
+  if (output_file_path.empty()) {
+    return errors::FailedPrecondition("output_file_path is a required flag.");
+  }
+  if (optimization_pass.empty()) {
+    return errors::FailedPrecondition("optimization_pass is a required flag.");
+  }
+
+  // Turn on XLA Auto-Jit.
+  auto session_options = absl::make_unique<SessionOptions>();
+  session_options->config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_global_jit_level(jit_level_);
+  FunctionDefLibrary flib;
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+
+  GraphOptimizationPassOptions options;
+  options.session_options = session_options.release();
+  options.graph = &graph;
+  options.flib_def =
+      new FunctionLibraryDefinition((*options.graph)->op_registry(), flib);
+
+  // Grab the data
+  GraphDef graphdef;
+  GraphConstructorOptions graph_opts;
+  graph_opts.expect_device_spec = true;
+  graph_opts.allow_internal_ops = true;
+  TF_RETURN_IF_ERROR(ReadTextProto(Env::Default(), input_file_path, &graphdef));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(graph_opts, graphdef, options.graph->get()));
+
+  // Add all devices that were previously configured with AddDevice.
+  DeviceSet device_set;
+  for (auto& device : devices_) {
+    device_set.AddDevice(device.get());
+  }
+  options.device_set = &device_set;
+
+  Status result = errors::NotFound(
+      "An OptimizationPass was not found with the desired name.");
+
+  // Run the optimization pass specified by the command line flag.
+  for (const auto& groups_and_passes :
+       OptimizationPassRegistry::Global()->groups()) {
+    for (const auto& phase_and_passes : groups_and_passes.second) {
+      for (const auto& pass : phase_and_passes.second) {
+        if (pass->name() == optimization_pass) {
+          result = pass->Run(options);
+        }
+      }
+    }
+  }
+
+  TF_RETURN_IF_ERROR(result);
+
+  // Write out the result.
+  options.graph->get()->ToGraphDef(&graphdef);
+  TF_RETURN_IF_ERROR(
+      WriteTextProto(Env::Default(), output_file_path, graphdef));
+  return Status::OK();
+}
+
+Status OptimizationPassRunner::SetJitLevel(
+    OptimizerOptions::GlobalJitLevel jit_level) {
+  jit_level_ = jit_level;
+  return Status::OK();
+}
+
+Status OptimizationPassRunner::AddDevice(const string& name,
+                                         const string& type) {
+  devices_.push_back(FakeDevice::Make(name, type));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.h b/tensorflow/tools/optimization/optimization_pass_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b26f64bcfb86e5e7fd6b6fe31b20cf75f931da1
--- /dev/null
+++ b/tensorflow/tools/optimization/optimization_pass_runner.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
+#define TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+// OptimizationPassRunner can be initialized, populated with devices, then run
+// to test individual Tensorflow Optimization passes.
+class OptimizationPassRunner {
+ public:
+  explicit OptimizationPassRunner()
+      : jit_level_(OptimizerOptions::GlobalJitLevel::
+                       OptimizerOptions_GlobalJitLevel_DEFAULT) {}
+
+  // Add a fake device to the (initially empty) DeviceSet used for optimization.
+  // Names are of the form: "/job:localhost/replica:0/task:0/device:CPU:0"
+  Status AddDevice(const string& name, const string& type);
+
+  // Increasing the Jit level will cause XLA to compile parts of the tensorflow
+  // graph that it is able to.
+  Status SetJitLevel(OptimizerOptions::GlobalJitLevel jit_level);
+
+  // This can be called after adding devices and setting the jit level to parse
+  // command line flags and run the specified job. All 3 flags are required:
+  // input_file_path, output_file_path, optimization_pass.
+  //
+  // If this library becomes heavily used, the caller should be responsible for
+  // parsing any command line flags desired rather than this Method handling the
+  // work of a main() function.
+  Status RunMain(int argc, char** argv);
+
+ private:
+  OptimizerOptions::GlobalJitLevel jit_level_;
+  std::vector<std::unique_ptr<Device>> devices_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2de00ea957f7b15ac2b4ba04c2cb1350fa94a567..bc970d0f763159c24e94d5237d98e1fe0fd5ba1f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -18,13 +18,6 @@ load(
     "if_ngraph",
 )
 
-# This flag specifies whether TensorFlow 2.0 API should be built instead
-# of 1.* API. Note that TensorFlow 2.0 API is currently under development.
-config_setting(
-    name = "api_version_2",
-    define_values = {"tf_api_version": "2"},
-)
-
 # This returns a list of headers of all public header libraries (e.g.,
 # framework, lib), and all of the transitive dependencies of those
 # public headers.  Not all of the headers returned by the filegroup
@@ -66,7 +59,6 @@ COMMON_PIP_DEPS = [
     "setup.py",
     ":included_headers",
     "//tensorflow:tensorflow_py",
-    "//tensorflow/examples/tutorials/mnist:package",
     "//tensorflow/lite/python:interpreter_test_data",
     "//tensorflow/lite/python:tflite_convert",
     "//tensorflow/lite/toco/python:toco_from_protos",
@@ -90,6 +82,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/random:util",
     "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
     "//tensorflow/python/ops/ragged:ragged_test_util",
@@ -107,6 +100,7 @@ COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
     "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
     "//tensorflow/contrib/compiler:xla",
     "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
+    "//tensorflow/contrib/distribute/python:distribute_test_lib_pip",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
@@ -130,6 +124,7 @@ COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
     "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
     "//tensorflow/contrib/timeseries:timeseries_pip",
     "//tensorflow/contrib/tpu",
+    "//tensorflow/examples/tutorials/mnist:package",
 ]
 
 # On Windows, python binary is a zip file of runfiles tree.
@@ -139,8 +134,8 @@ py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
     data = select({
-        "api_version_2": COMMON_PIP_DEPS,
         "//conditions:default": COMMON_PIP_DEPS_V1,
+        "//tensorflow:api_version_2": COMMON_PIP_DEPS,
     }) + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"],
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
@@ -153,7 +148,11 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl:LICENSE",
+        "@absl_py//absl/logging:LICENSE",
         "@absl_py//absl/flags:LICENSE",
+        "@absl_py//absl/testing:LICENSE",
+        "@absl_py//absl/third_party/unittest3_backport:LICENSE",
         "@arm_neon_2_x86_sse//:LICENSE",
         "@astor_archive//:LICENSE",
         "@boringssl//:LICENSE",
@@ -162,6 +161,7 @@ filegroup(
         "@curl//:COPYING",
         "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
+        "@enum34_archive//:LICENSE",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
         "@flatbuffers//:LICENSE.txt",
@@ -176,6 +176,7 @@ filegroup(
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pasta//:LICENSE",
         "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
@@ -242,7 +243,7 @@ sh_binary(
     name = "build_pip_package",
     srcs = ["build_pip_package.sh"],
     data = select({
-               "api_version_2": COMMON_PIP_DEPS,
+               "//tensorflow:api_version_2": COMMON_PIP_DEPS,
                "//conditions:default": COMMON_PIP_DEPS_V1,
            }) +
            select({
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index 272ff4735c34b319589bd9302fcdb5cd91b6d1ec..c304e8cf6ebe1739c1cc9011dafd8f89cae9baac 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -6,7 +6,6 @@ recursive-include * *.so
 recursive-include * *.dll
 recursive-include * *.lib
 recursive-include * *.csv
-recursive-include tensorflow/aux-bin *
 recursive-include tensorflow/include/tensorflow *.h
 recursive-include tensorflow/include/Eigen *
 recursive-include tensorflow/include/external *
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 439b5428b3b7bff651689e08e783bf7875f16319..27815491d23a6ec294f08b1b5eee5ed2d11e9766 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -118,9 +118,6 @@ function prepare_src() {
         fi
       fi
     fi
-    mkdir "${TMPDIR}/tensorflow/aux-bin"
-    # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/lite/python/tflite_convert ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index ff821b864300c1eeb2f9d290ae47a25ce87a0884..952c71c61580fba72dbf1a4b2e1bd836816b1420 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -30,14 +30,19 @@ os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
 PIP_PACKAGE_QUERY_EXPRESSION = (
     "deps(//tensorflow/tools/pip_package:build_pip_package)")
 
+# List of file paths containing BUILD files that should not be included for the
+# pip smoke test.
+BUILD_BLACKLIST = [
+    "tensorflow/lite/examples/android",
+    "tensorflow/lite/experimental/swift",
+]
 
 def GetBuild(dir_base):
   """Get the list of BUILD file all targets recursively startind at dir_base."""
   items = []
   for root, _, files in os.walk(dir_base):
     for name in files:
-      if (name == "BUILD" and
-          root.find("tensorflow/lite/examples/android") == -1):
+      if (name == "BUILD" and root not in BUILD_BLACKLIST):
         items.append("//" + root + ":all")
   return items
 
@@ -67,9 +72,9 @@ def BuildPyTestDependencies():
 
 PYTHON_TARGETS, PY_TEST_QUERY_EXPRESSION = BuildPyTestDependencies()
 
-# Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
-BLACKLIST = [
+# List of dependencies that should not included in the pip package.
+DEPENDENCY_BLACKLIST = [
     "//tensorflow/python:extra_py_tests_deps",
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
@@ -82,9 +87,7 @@ BLACKLIST = [
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
-    # contrib
-    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
-    "//tensorflow/contrib/keras:testing_utils",
+    # lite
     "//tensorflow/lite/experimental/examples/lstm:tflite_lstm",
     "//tensorflow/lite/experimental/examples/lstm:tflite_lstm.py",
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test",  # pylint:disable=line-too-long
@@ -93,6 +96,9 @@ BLACKLIST = [
     "//tensorflow/lite/python:interpreter_test",
     "//tensorflow/lite/python:interpreter.py",
     "//tensorflow/lite/python:interpreter_test.py",
+    # contrib
+    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
+    "//tensorflow/contrib/keras:testing_utils",
     "//tensorflow/contrib/ffmpeg:test_data",
     "//tensorflow/contrib/fused_conv:fused_conv2d_bias_activation_op_test_base",
     "//tensorflow/contrib/hadoop:test_data",
@@ -102,6 +108,7 @@ BLACKLIST = [
     "//tensorflow/contrib/framework:checkpoint_ops_testdata",
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
     "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/saved_model:reader",  # Not present in v2
     "//tensorflow/contrib/timeseries/examples:predict",
     "//tensorflow/contrib/timeseries/examples:multivariate",
     "//tensorflow/contrib/timeseries/examples:known_anomaly",
@@ -148,8 +155,8 @@ def main():
   # File extensions and endings to ignore
   ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
 
-  ignored_files = 0
-  blacklisted_files = len(BLACKLIST)
+  ignored_files_count = 0
+  blacklisted_dependencies_count = len(DEPENDENCY_BLACKLIST)
   # Compare dependencies
   for dependency in tf_py_test_dependencies_list:
     if dependency and dependency.startswith("//tensorflow"):
@@ -157,16 +164,16 @@ def main():
       # Ignore extensions
       if any(dependency.endswith(ext) for ext in ignore_extensions):
         ignore = True
-        ignored_files += 1
+        ignored_files_count += 1
 
-      # Check if the dependency is in the pip package, the blacklist, or
-      # should be ignored because of its file extension
+      # Check if the dependency is in the pip package, the dependency blacklist,
+      # or should be ignored because of its file extension.
       if not (ignore or dependency in pip_package_dependencies_list or
-              dependency in BLACKLIST):
+              dependency in DEPENDENCY_BLACKLIST):
         missing_dependencies.append(dependency)
 
-  print("Ignored files: %d" % ignored_files)
-  print("Blacklisted files: %d" % blacklisted_files)
+  print("Ignored files count: %d" % ignored_files_count)
+  print("Blacklisted dependencies count: %d" % blacklisted_dependencies_count)
   if missing_dependencies:
     print("Missing the following dependencies from pip_packages:")
     for missing_dependency in missing_dependencies:
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 3927540cc79ef8b827ce4d7e60e884c2237f8e9d..368cb171ba99933dc3194e13d4925b3d59417cc5 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -51,13 +51,14 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
+    'google_pasta >= 0.1.1',
     'keras_applications >= 1.0.6',
     'keras_preprocessing >= 1.0.5',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.6.1',
     'tensorboard >= 1.12.0, < 1.13.0',
-    'tensorflow_estimator >= 1.10.0',
+    'tensorflow_estimator >= 1.13.0rc0, < 1.14.0rc0',
     'termcolor >= 1.1.0',
 ]
 
@@ -248,7 +249,7 @@ setup(
     url='https://www.tensorflow.org/',
     download_url='https://github.com/tensorflow/tensorflow/tags',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     # Contained modules and scripts.
     packages=find_packages(),
     entry_points={
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 700aa065b1e39edc2cccc028e3c0002306129b6d..655f0fdb8495e8e5853a7bd9c7aaf4dd21770e60 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -10,6 +10,7 @@ load("//third_party/py:python_configure.bzl", "python_configure")
 
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
+load("//third_party/toolchains/remote:configure.bzl", "remote_execution_configure")
 load("//third_party/toolchains/clang6:repo.bzl", "clang6_configure")
 load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
 load("//third_party:repo.bzl", "tf_http_archive")
@@ -29,6 +30,7 @@ load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
 load("//third_party/kissfft:workspace.bzl", kissfft = "repo")
 load("//third_party/keras_applications_archive:workspace.bzl", keras_applications = "repo")
+load("//third_party/pasta:workspace.bzl", pasta = "repo")
 
 def initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
@@ -41,6 +43,7 @@ def initialize_third_party():
     kissfft()
     jpeg()
     nasm()
+    pasta()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -62,6 +65,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     syslibs_configure(name = "local_config_syslibs")
     python_configure(name = "local_config_python")
     rocm_configure(name = "local_config_rocm")
+    remote_execution_configure(name = "local_config_remote_execution")
 
     initialize_third_party()
 
@@ -136,11 +140,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "753fbb58d0a49b6bcbcfb126ebfa2e21fc97f7471529ba835a096008ce588d8a",
-        strip_prefix = "eigen-eigen-9f48e814419e",
+        patch_file = clean_dep("//third_party/eigen3:gebp_neon.patch"),
+        sha256 = "48678550a32665331d729be87076e576f2502fff325f5b6c2c78ebf7b1b22c7b",
+        strip_prefix = "eigen-eigen-bcc817c0ba98",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/bcc817c0ba98.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/bcc817c0ba98.tar.gz",
         ],
     )
 
@@ -181,15 +186,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "44eee8bd47cbd5ff192e895b45f9f913e2e117f10fdb9af0fd3b1a87a7b53bc3",
-        strip_prefix = "google-cloud-cpp-0.4.0",
+        sha256 = "886bcba3616d5f362838a2d86ae0198dd3670a84a84c82291cda6c30e14779fc",
+        strip_prefix = "google-cloud-cpp-0.5.0",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz",
+            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.5.0.tar.gz",
+            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.5.0.tar.gz",
         ],
     )
 
@@ -283,7 +288,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         system_build_file = clean_dep("//third_party/systemlibs:astor.BUILD"),
         urls = [
             "https://mirror.bazel.build/pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
-            "https://files.pythonhosted.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
+            "https://pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
         ],
     )
 
@@ -313,19 +318,30 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "absl_py",
-        sha256 = "95160f778a62c7a60ddeadc7bf2d83f85a23a27359814aca12cf949e896fa82c",
-        strip_prefix = "abseil-py-pypi-v0.2.2",
+        sha256 = "595726be4bf3f7e6d64a1a255fa03717b693c01b913768abd52649cbb7ddf2bd",
+        strip_prefix = "abseil-py-pypi-v0.7.0",
         system_build_file = clean_dep("//third_party/systemlibs:absl_py.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:absl_py.absl.flags.BUILD": "absl/flags/BUILD",
             "//third_party/systemlibs:absl_py.absl.testing.BUILD": "absl/testing/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
-            "https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.7.0.tar.gz",
+            "https://github.com/abseil/abseil-py/archive/pypi-v0.7.0.tar.gz",
         ],
     )
 
+    tf_http_archive(
+        name = "enum34_archive",
+        urls = [
+            "https://mirror.bazel.build/pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz",
+            "https://pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz",
+        ],
+        sha256 = "8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1",
+        build_file = clean_dep("//third_party:enum34.BUILD"),
+        strip_prefix = "enum34-1.1.6/enum",
+    )
+
     tf_http_archive(
         name = "org_python_pypi_backports_weakref",
         build_file = clean_dep("//third_party:backports_weakref.BUILD"),
@@ -393,22 +409,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "nsync",
-        sha256 = "692f9b30e219f71a6371b98edd39cef3cbda35ac3abc4cd99ce19db430a5591a",
-        strip_prefix = "nsync-1.20.1",
+        sha256 = "704be7f58afa47b99476bbac7aafd1a9db4357cef519db361716f13538547ffd",
+        strip_prefix = "nsync-1.20.2",
         system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.1.tar.gz",
-            "https://github.com/google/nsync/archive/1.20.1.tar.gz",
+            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.2.tar.gz",
+            "https://github.com/google/nsync/archive/1.20.2.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_googletest",
-        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
-        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
+        sha256 = "ff7a82736e158c077e76188232eac77913a15dac0b22508c390ab3f88e6d6d86",
+        strip_prefix = "googletest-b6cd405286ed8635ece71c72f118e659f4ade3fb",
         urls = [
-            "https://mirror.bazel.build/github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
-            "https://github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
+            "https://mirror.bazel.build/github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip",
+            "https://github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip",
         ],
     )
 
@@ -498,11 +514,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "65b48c80eba736ab834a9790b78a72cd0e3919b6dace44a96259d3e6936624ec",
-        strip_prefix = "llvm-cfa2cf74cd9ba0e759974ce11bfd7b9e051dd8ff",
+        sha256 = "d32baf8e3ccfdc689704ee8c9af5f17b9d2b571d3d5b25e857c375d28a435193",
+        strip_prefix = "llvm-ec3165a24ab126615b1766f6c48c0b0de844e4cb",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/cfa2cf74cd9ba0e759974ce11bfd7b9e051dd8ff.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/cfa2cf74cd9ba0e759974ce11bfd7b9e051dd8ff.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/ec3165a24ab126615b1766f6c48c0b0de844e4cb.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/ec3165a24ab126615b1766f6c48c0b0de844e4cb.tar.gz",
         ],
     )
 
diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel
index 5426f79e4650a1ce4dcb4a8408691310c864f06c..fde3072403f428830763d3b11542b8b39436b4ca 100644
--- a/third_party/aws/BUILD.bazel
+++ b/third_party/aws/BUILD.bazel
@@ -54,6 +54,11 @@ cc_library(
     hdrs = [
         "aws-cpp-sdk-core/include/aws/core/SDKConfig.h",
     ],
+    copts = [
+        "-DAWS_SDK_VERSION_MAJOR=1",
+        "-DAWS_SDK_VERSION_MINOR=5",
+        "-DAWS_SDK_VERSION_PATCH=8",
+    ],
     defines = select({
         "@org_tensorflow//tensorflow:linux_x86_64": [
             "PLATFORM_LINUX",
diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl
index c2166381549a5cf6fb44912081ae9479bff69645..1d269f4d43ec4cc9d39f3c89ff40e07b4e4947c4 100644
--- a/third_party/aws/workspace.bzl
+++ b/third_party/aws/workspace.bzl
@@ -2,14 +2,17 @@
 
 load("//third_party:repo.bzl", "third_party_http_archive")
 
+# NOTE: version updates here should also update the major, minor, and patch variables declared in
+# the  copts field of the //third_party/aws:aws target
+
 def repo():
     third_party_http_archive(
         name = "aws",
         urls = [
-            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
-            "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz",
+            "https://github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz",
         ],
-        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
-        strip_prefix = "aws-sdk-cpp-1.3.15",
+        sha256 = "89905075fe50aa13e0337ff905c2e8c1ce9caf77a3504484a7cda39179120ffc",
+        strip_prefix = "aws-sdk-cpp-1.5.8",
         build_file = "//third_party/aws:BUILD.bazel",
     )
diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index 7ced9027473e39ad9870ce138b64c7f7ec64ad01..20ac3a8d5a4bd243151484da7cbcdf31b5dc3a36 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder):
 
     # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
     # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-    CLANG_REVISION = "347933"
+    CLANG_REVISION = "348507"
     CLANG_SUB_REVISION = 1
 
     package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION)
 
     checksums = {
-        "Linux_x64": "cae3643fdf5d46fc9bc8731212bb37573547148d90b64b083165e090133d11b0",
-        "Mac": "083a0e91a38c06e568652313ac7372b17a101268f7d65533d721ca30413442b4",
-        "Win": "43160487cfc7e88076a369a2b6e8e4a0f42e104c28d8903f3aaa62d630aba949",
+        "Linux_x64": "85a24f215737af91e0054d3a1cb435bd8ff06178cef14241c029c8a04ff16a79",
+        "Mac": "16a96a3c4b599d0418e812307087a223d5fee2ee3c7fd96f5cbc2a9e5bf8607d",
+        "Win": "4c144f24d3a82d546845c680f5b029ff02dd4de7614e93d1b21cfc6e20a26dad",
     }
 
     platform_folder = _get_platform_folder(repo_ctx.os.name)
diff --git a/third_party/eigen3/gebp_neon.patch b/third_party/eigen3/gebp_neon.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d0022e94ccd40c8efe1423926b418531b0fb3eba
--- /dev/null
+++ b/third_party/eigen3/gebp_neon.patch
@@ -0,0 +1,11 @@
+--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h	2019-01-22 20:46:51.000000000 -0800
++++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h	2019-01-25 13:48:49.000000000 -0800
+@@ -1031,7 +1031,7 @@
+ 
+   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+   {
+-    c += a * b;
++    c = vfmaq_n_f32(c, a, b);
+   }
+ 
+   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
diff --git a/third_party/enum34.BUILD b/third_party/enum34.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..85262b07f6478bc91cfe8eb178a85c963feb4b79
--- /dev/null
+++ b/third_party/enum34.BUILD
@@ -0,0 +1,13 @@
+# Description:
+#   enum34 provides a backport of the enum module for Python 2.
+
+licenses(["notice"])  # MIT
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "enum",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/examples/eager/spinn/BUILD b/third_party/examples/eager/spinn/BUILD
index 0e39d4696fb5b4efafc94b4b96965d232ae4e473..640bcb230c8f3bcdf6f4c905e45cf32768b32418 100644
--- a/third_party/examples/eager/spinn/BUILD
+++ b/third_party/examples/eager/spinn/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # 3-clause BSD.
 
-py_binary(
+py_library(
     name = "spinn",
     srcs = ["spinn.py"],
     srcs_version = "PY2AND3",
diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl
index 5e6b47f44f5b80c34c58de6ed37478ea71ee8321..db76306ffbe9244a59d2e28e8e7c2a2f03e56f49 100644
--- a/third_party/gpus/crosstool/BUILD.tpl
+++ b/third_party/gpus/crosstool/BUILD.tpl
@@ -23,7 +23,11 @@ cc_toolchain_suite(
         "darwin|compiler": ":cc-compiler-darwin",
         "x64_windows|msvc-cl": ":cc-compiler-windows",
         "x64_windows": ":cc-compiler-windows",
+        "arm": ":cc-compiler-local",
         "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "ppc": ":cc-compiler-local",
+        "darwin": ":cc-compiler-darwin",
     },
 )
 
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
index 5ca9b2deb4f3e39ab1b78bf695d7b75100d1fac6..1a13ac844caa4b46f030ef904537b3295a017418 100644
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL.tpl
@@ -2,31 +2,6 @@ major_version: "local"
 minor_version: ""
 default_target_cpu: "same_as_host"
 
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "darwin"
-  toolchain_identifier: "local_darwin"
-}
-default_toolchain {
-  cpu: "ppc"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "local_windows"
-}
-
 toolchain {
   abi_version: "local"
   abi_libc_version: "local"
diff --git a/third_party/gpus/crosstool/remote.BUILD.tpl b/third_party/gpus/crosstool/remote.BUILD.tpl
deleted file mode 100644
index b2316331db257a39086bdd5ca02b5ca6848cebcb..0000000000000000000000000000000000000000
--- a/third_party/gpus/crosstool/remote.BUILD.tpl
+++ /dev/null
@@ -1,10 +0,0 @@
-# Description:
-#   Template for crosstool Build file to use a pre-generated config.
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(
-    name = "toolchain",
-    actual = "%{remote_cuda_repo}:toolchain",
-)
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index f6b497f813185f82108de470ae39fac60d5d9f34..1921ef7c1fa0f13d336d3dae9e0eddf59ae25b44 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -202,4 +202,4 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-%{cuda_include_genrules}
+%{copy_rules}
diff --git a/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/gpus/cuda/BUILD.windows.tpl
index 325d18b9cb8a7c7c18c3df9e0630e67a9a28a937..3ed4fd415c33d3719307e3520084956f44430b0b 100644
--- a/third_party/gpus/cuda/BUILD.windows.tpl
+++ b/third_party/gpus/cuda/BUILD.windows.tpl
@@ -161,4 +161,4 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-%{cuda_include_genrules}
+%{copy_rules}
diff --git a/third_party/gpus/cuda/remote.BUILD.tpl b/third_party/gpus/cuda/remote.BUILD.tpl
deleted file mode 100644
index 100c7bb7c41bd3f2a4e7e0eba865573d30422b45..0000000000000000000000000000000000000000
--- a/third_party/gpus/cuda/remote.BUILD.tpl
+++ /dev/null
@@ -1,110 +0,0 @@
-# Description:
-#   Template for cuda Build file to use a pre-generated config.
-licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
-
-package(default_visibility = ["//visibility:public"])
-
-config_setting(
-    name = "using_nvcc",
-    values = {
-        "define": "using_cuda_nvcc=true",
-    },
-)
-
-config_setting(
-    name = "using_clang",
-    values = {
-        "define": "using_cuda_clang=true",
-    },
-)
-
-# Equivalent to using_clang && -c opt.
-config_setting(
-    name = "using_clang_opt",
-    values = {
-        "define": "using_cuda_clang=true",
-        "compilation_mode": "opt",
-    },
-)
-
-config_setting(
-    name = "darwin",
-    values = {"cpu": "darwin"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "freebsd",
-    values = {"cpu": "freebsd"},
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cuda_headers",
-    actual = "%{remote_cuda_repo}/cuda:cuda_headers",
-)
-
-alias(
-    name = "cudart_static",
-    actual = "%{remote_cuda_repo}/cuda:cudart_static",
-)
-
-alias(
-    name = "cuda_driver",
-    actual = "%{remote_cuda_repo}/cuda:cuda_driver",
-)
-
-alias(
-    name = "cudart",
-    actual = "%{remote_cuda_repo}/cuda:cudart",
-)
-
-alias(
-    name = "cublas",
-    actual = "%{remote_cuda_repo}/cuda:cublas",
-)
-
-alias(
-    name = "cusolver",
-    actual = "%{remote_cuda_repo}/cuda:cusolver",
-)
-
-alias(
-    name = "cudnn",
-    actual = "%{remote_cuda_repo}/cuda:cudnn",
-)
-
-alias(
-    name = "cudnn_header",
-    actual = "%{remote_cuda_repo}/cuda:cudnn_header",
-)
-
-alias(
-    name = "cufft",
-    actual = "%{remote_cuda_repo}/cuda:cufft",
-)
-
-alias(
-    name = "curand",
-    actual = "%{remote_cuda_repo}/cuda:curand",
-)
-
-alias(
-    name = "cuda",
-    actual = "%{remote_cuda_repo}/cuda:cuda",
-)
-
-alias(
-    name = "cupti_headers",
-    actual = "%{remote_cuda_repo}/cuda:cupti_headers",
-)
-
-alias(
-    name = "cupti_dsos",
-    actual = "%{remote_cuda_repo}/cuda:cupti_dsos",
-)
-
-alias(
-    name = "libdevice_root",
-    actual = "%{remote_cuda_repo}/cuda:libdevice_root",
-)
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 8aa5b89cddb336380d35f85a6ecd3ebdf6589e88..40c5e373ede149494918643509f424d4f352ebc5 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -400,7 +400,7 @@ def _cuda_include_path(repository_ctx, cuda_config):
   return "\n".join(inc_entries)
 
 
-def _enable_cuda(repository_ctx):
+def enable_cuda(repository_ctx):
   if "TF_NEED_CUDA" in repository_ctx.os.environ:
     enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
     return enable_cuda == "1"
@@ -643,9 +643,7 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
     auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
                          "TF_CUDNN_VERSION (%s)") %
                         (str(cudnn_h_path), full_version, environ_version),)
-
-  # We only use the major version since we use the libcudnn libraries that are
-  # only versioned with the major version (e.g. libcudnn.so.5).
+  # Only use the major version to match the SONAME of the library.
   version = major_version
   if cpu_value == "Windows":
     version = "64_" + version
@@ -691,11 +689,11 @@ def _is_windows(repository_ctx):
   return get_cpu_value(repository_ctx) == "Windows"
 
 
-def _lib_name(lib, cpu_value, version = "", static = False):
+def lib_name(base_name, cpu_value, version = None, static = False):
   """Constructs the platform-specific name of a library.
 
     Args:
-      lib: The name of the library, such as "cudart"
+      base_name: The name of the library, such as "cudart"
       cpu_value: The name of the host operating system.
       version: The version of the library.
       static: True the library is static or False if it is a shared object.
@@ -703,31 +701,55 @@ def _lib_name(lib, cpu_value, version = "", static = False):
     Returns:
       The platform-specific name of the library.
     """
+  version = "" if not version else "." + version
   if cpu_value in ("Linux", "FreeBSD"):
     if static:
-      return "lib%s.a" % lib
-    else:
-      if version:
-        version = ".%s" % version
-      return "lib%s.so%s" % (lib, version)
+      return "lib%s.a" % base_name
+    return "lib%s.so%s" % (base_name, version)
   elif cpu_value == "Windows":
-    return "%s.lib" % lib
+    return "%s.lib" % base_name
   elif cpu_value == "Darwin":
     if static:
-      return "lib%s.a" % lib
-    elif version:
-      version = ".%s" % version
-    return "lib%s%s.dylib" % (lib, version)
+      return "lib%s.a" % base_name
+    return "lib%s%s.dylib" % (base_name, version)
   else:
     auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
 
+def find_lib(repository_ctx, paths, check_soname = True):
+  """
+    Finds a library among a list of potential paths.
+
+    Args:
+      paths: List of paths to inspect.
+
+    Returns:
+      Returns the first path in paths that exist.
+  """
+  objdump = repository_ctx.which("objdump")
+  mismatches = []
+  for path in [repository_ctx.path(path) for path in paths]:
+    if not path.exists:
+      continue
+    if check_soname and objdump != None:
+      output = repository_ctx.execute([objdump, "-p", str(path)]).stdout
+      output = [line for line in output.splitlines() if "SONAME" in line]
+      sonames = [line.strip().split(" ")[-1] for line in output]
+      if not any([soname == path.basename for soname in sonames]):
+        mismatches.append(str(path))
+        continue
+    return path
+  if mismatches:
+    auto_configure_fail(
+        "None of the libraries match their SONAME: " + ", ".join(mismatches))
+  auto_configure_fail("No library found under: " + ", ".join(paths))
+
 
 def _find_cuda_lib(
         lib,
         repository_ctx,
         cpu_value,
         basedir,
-        version = "",
+        version,
         static = False):
   """Finds the given CUDA or cuDNN library on the system.
 
@@ -740,16 +762,12 @@ def _find_cuda_lib(
       static: True if static library, False if shared object.
 
     Returns:
-      Returns a struct with the following fields:
-        file_name: The basename of the library found on the system.
-        path: The full path to the library.
+      Returns the path to the library.
     """
-  file_name = _lib_name(lib, cpu_value, version, static)
-  for relative_path in CUDA_LIB_PATHS:
-    path = repository_ctx.path("%s/%s%s" % (basedir, relative_path, file_name))
-    if path.exists:
-      return struct(file_name=file_name, path=str(path.realpath))
-  auto_configure_fail("Cannot find cuda library %s" % file_name)
+  file_name = lib_name(lib, cpu_value, version, static)
+  return find_lib(repository_ctx, [
+      "%s/%s%s" % (basedir, path, file_name) for path in CUDA_LIB_PATHS
+  ], check_soname = version and not static)
 
 
 def _find_cupti_header_dir(repository_ctx, cuda_config):
@@ -785,23 +803,17 @@ def _find_cupti_lib(repository_ctx, cuda_config):
       cuda_config: The cuda configuration as returned by _get_cuda_config.
 
     Returns:
-      Returns a struct with the following fields:
-        file_name: The basename of the library found on the system.
-        path: The full path to the library.
+      Returns the path to the library.
     """
-  file_name = _lib_name(
+  file_name = lib_name(
       "cupti",
       cuda_config.cpu_value,
       cuda_config.cuda_version,
   )
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for relative_path in CUPTI_LIB_PATHS:
-    path = repository_ctx.path(
-        "%s/%s%s" % (cuda_toolkit_path, relative_path, file_name),)
-    if path.exists:
-      return struct(file_name=file_name, path=str(path.realpath))
-
-  auto_configure_fail("Cannot find cupti library %s" % file_name)
+  basedir = cuda_config.cuda_toolkit_path
+  return find_lib(repository_ctx, [
+      "%s/%s%s" % (basedir, path, file_name) for path in CUPTI_LIB_PATHS
+  ])
 
 
 def _find_libs(repository_ctx, cuda_config):
@@ -817,8 +829,12 @@ def _find_libs(repository_ctx, cuda_config):
   cpu_value = cuda_config.cpu_value
   return {
       "cuda":
-          _find_cuda_lib("cuda", repository_ctx, cpu_value,
-                         cuda_config.cuda_toolkit_path),
+          _find_cuda_lib(
+              "cuda",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              None),
       "cudart":
           _find_cuda_lib(
               "cudart",
@@ -1035,9 +1051,9 @@ def _create_dummy_repository(repository_ctx):
       "cuda:BUILD",
       {
           "%{cuda_driver_lib}":
-              _lib_name("cuda", cpu_value),
+              lib_name("cuda", cpu_value),
           "%{cudart_static_lib}":
-              _lib_name(
+              lib_name(
                   "cudart_static",
                   cpu_value,
                   static=True,
@@ -1045,20 +1061,20 @@ def _create_dummy_repository(repository_ctx):
           "%{cudart_static_linkopt}":
               _cudart_static_linkopt(cpu_value),
           "%{cudart_lib}":
-              _lib_name("cudart", cpu_value),
+              lib_name("cudart", cpu_value),
           "%{cublas_lib}":
-              _lib_name("cublas", cpu_value),
+              lib_name("cublas", cpu_value),
           "%{cusolver_lib}":
-              _lib_name("cusolver", cpu_value),
+              lib_name("cusolver", cpu_value),
           "%{cudnn_lib}":
-              _lib_name("cudnn", cpu_value),
+              lib_name("cudnn", cpu_value),
           "%{cufft_lib}":
-              _lib_name("cufft", cpu_value),
+              lib_name("cufft", cpu_value),
           "%{curand_lib}":
-              _lib_name("curand", cpu_value),
+              lib_name("curand", cpu_value),
           "%{cupti_lib}":
-              _lib_name("cupti", cpu_value),
-          "%{cuda_include_genrules}":
+              lib_name("cupti", cpu_value),
+          "%{copy_rules}":
               "",
           "%{cuda_headers}":
               "",
@@ -1067,20 +1083,20 @@ def _create_dummy_repository(repository_ctx):
 
   # Create dummy files for the CUDA toolkit since they are still required by
   # tensorflow/core/platform/default/build_config:cuda.
-  repository_ctx.file("cuda/cuda/include/cuda.h", "")
-  repository_ctx.file("cuda/cuda/include/cublas.h", "")
-  repository_ctx.file("cuda/cuda/include/cudnn.h", "")
-  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
+  repository_ctx.file("cuda/cuda/include/cuda.h")
+  repository_ctx.file("cuda/cuda/include/cublas.h")
+  repository_ctx.file("cuda/cuda/include/cudnn.h")
+  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h")
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value))
   repository_ctx.file(
-      "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
+      "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value))
 
   # Set up cuda_config.h, which is used by
   # tensorflow/stream_executor/dso_loader.cc.
@@ -1148,65 +1164,37 @@ def _norm_path(path):
     path = path[:-1]
   return path
 
-
-def symlink_genrule_for_dir(
-        repository_ctx,
-        src_dir,
-        dest_dir,
-        genrule_name,
-        src_files = [],
-        dest_files = []):
-  """Returns a genrule to symlink(or copy if on Windows) a set of files.
-
-    If src_dir is passed, files will be read from the given directory; otherwise
-    we assume files are in src_files and dest_files
-    """
-  if src_dir != None:
-    src_dir = _norm_path(src_dir)
-    dest_dir = _norm_path(dest_dir)
-    files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
-
-    # Create a list with the src_dir stripped to use for outputs.
-    dest_files = files.replace(src_dir, "").splitlines()
-    src_files = files.splitlines()
-  command = []
-  if not _is_windows(repository_ctx):
-    # We clear folders that might have been generated previously to avoid
-    # undesired inclusions
-    command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
-    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
-    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
-    command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
-  outs = []
-  for i in range(len(dest_files)):
-    if dest_files[i] != "":
-      # If we have only one file to link we do not want to use the dest_dir, as
-      # $(@D) will include the full path to the file.
-      dest = "$(@D)/" + dest_dir + dest_files[i] if len(
-          dest_files) != 1 else "$(@D)/" + dest_files[i]
-
-      # Copy the headers to create a sandboxable setup.
-      cmd = "cp -f"
-      command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-      outs.append('        "' + dest_dir + dest_files[i] + '",')
-  genrule = _genrule(
-      src_dir,
-      genrule_name,
-      " && ".join(command),
-      "\n".join(outs),
-  )
-  return genrule
-
-
-def _genrule(src_dir, genrule_name, command, outs):
-  """Returns a string with a genrule.
-
-    Genrule executes the given command and produces the given outputs.
-    """
-  return (
-      "genrule(\n" + '    name = "' + genrule_name + '",\n' + "    outs = [\n" +
-      outs + "\n    ],\n" + '    cmd = """\n' + command + '\n   """,\n' + ")\n")
-
+def make_copy_files_rule(repository_ctx, name, srcs, outs):
+  """Returns a rule to copy a set of files."""
+  cmds = []
+  # Copy files.
+  for src, out in zip(srcs, outs):
+    cmds.append('cp -f "%s" $(location %s)' % (src, out))
+  outs = [('        "%s",' % out) for out in outs]
+  return """genrule(
+    name = "%s",
+    outs = [
+%s
+    ],
+    cmd = \"""%s \""",
+)""" % (name, "\n".join(outs), " && ".join(cmds))
+
+def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir):
+  """Returns a rule to recursively copy a directory."""
+  src_dir = _norm_path(src_dir)
+  out_dir = _norm_path(out_dir)
+  outs = _read_dir(repository_ctx, src_dir)
+  outs = [('        "%s",' % out.replace(src_dir, out_dir)) for out in outs]
+  # '@D' already contains the relative path for a single file, see
+  # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables
+  out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)"
+  return """genrule(
+    name = "%s",
+    outs = [
+%s
+    ],
+    cmd = \"""cp -rLf "%s/." "%s/" \""",
+)""" % (name, "\n".join(outs), src_dir, out_dir)
 
 def _read_dir(repository_ctx, src_dir):
   """Returns a string with all files in a directory.
@@ -1233,7 +1221,7 @@ def _read_dir(repository_ctx, src_dir):
         empty_stdout_fine=True,
     )
     result = find_result.stdout
-  return result
+  return sorted(result.splitlines())
 
 
 def _flag_enabled(repository_ctx, flag_name):
@@ -1272,69 +1260,59 @@ def _create_local_cuda_repository(repository_ctx):
   cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
   nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
 
-  # Set up symbolic links for the cuda toolkit by creating genrules to do
-  # symlinking. We create one genrule for each directory we want to track under
-  # cuda_toolkit_path
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  genrules = [
-      symlink_genrule_for_dir(
+  # Create genrule to copy files from the installed CUDA toolkit into execroot.
+  copy_rules = [
+      make_copy_dir_rule(
           repository_ctx,
-          cuda_include_path,
-          "cuda/include",
-          "cuda-include",
-      )
-  ]
-  genrules.append(
-      symlink_genrule_for_dir(
+          name = "cuda-include",
+          src_dir = cuda_include_path,
+          out_dir = "cuda/include",
+      ),
+      make_copy_dir_rule(
           repository_ctx,
-          nvvm_libdevice_dir,
-          "cuda/nvvm/libdevice",
-          "cuda-nvvm",
-      ))
-  genrules.append(
-      symlink_genrule_for_dir(
+          name = "cuda-nvvm",
+          src_dir = nvvm_libdevice_dir,
+          out_dir = "cuda/nvvm/libdevice",
+      ),
+      make_copy_dir_rule(
           repository_ctx,
-          cupti_header_dir,
-          "cuda/extras/CUPTI/include",
-          "cuda-extras",
-      ))
+          name = "cuda-extras",
+          src_dir = cupti_header_dir,
+          out_dir = "cuda/extras/CUPTI/include",
+      ),
+  ]
 
   cuda_libs = _find_libs(repository_ctx, cuda_config)
-  cuda_lib_src = []
-  cuda_lib_dest = []
-  for lib in cuda_libs.values():
-    cuda_lib_src.append(lib.path)
-    cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-  genrules.append(
-      symlink_genrule_for_dir(
-          repository_ctx,
-          None,
-          "",
-          "cuda-lib",
-          cuda_lib_src,
-          cuda_lib_dest,
-      ))
+  cuda_lib_srcs = []
+  cuda_lib_outs = []
+  for path in cuda_libs.values():
+    cuda_lib_srcs.append(str(path))
+    cuda_lib_outs.append("cuda/lib/" + path.basename)
+  copy_rules.append(make_copy_files_rule(
+      repository_ctx,
+      name = "cuda-lib",
+      srcs = cuda_lib_srcs,
+      outs = cuda_lib_outs,
+  ))
+
+  copy_rules.append(make_copy_dir_rule(
+      repository_ctx,
+      name = "cuda-bin",
+      src_dir = cuda_config.cuda_toolkit_path + "/bin",
+      out_dir = "cuda/bin"
+  ))
 
-  # Set up the symbolic links for cudnn if cndnn was not installed to
-  # CUDA_TOOLKIT_PATH.
-  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
-      cuda_include_path,
-      "",
-  ).splitlines()
-  if "/cudnn.h" not in included_files:
-    genrules.append(
-        symlink_genrule_for_dir(
-            repository_ctx,
-            None,
-            "cuda/include/",
-            "cudnn-include",
-            [cudnn_header_dir + "/cudnn.h"],
-            ["cudnn.h"],
-        ))
+  # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH.
+  included_files = _read_dir(repository_ctx, cuda_include_path)
+  if not any([file.endswith("cudnn.h") for file in included_files]):
+    copy_rules.append(make_copy_files_rule(
+        repository_ctx,
+        name = "cudnn-include",
+        srcs = [cudnn_header_dir + "/cudnn.h"],
+        outs = ["cuda/include/cudnn.h"],
+    ))
   else:
-    genrules.append(
-        "filegroup(\n" + '    name = "cudnn-include",\n' + "    srcs = [],\n" +
-        ")\n",)
+    copy_rules.append("filegroup(name = 'cudnn-include')\n")
 
   # Set up BUILD file for cuda/
   _tpl(
@@ -1355,27 +1333,27 @@ def _create_local_cuda_repository(repository_ctx):
       "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
       {
           "%{cuda_driver_lib}":
-              cuda_libs["cuda"].file_name,
+              cuda_libs["cuda"].basename,
           "%{cudart_static_lib}":
-              cuda_libs["cudart_static"].file_name,
+              cuda_libs["cudart_static"].basename,
           "%{cudart_static_linkopt}":
               _cudart_static_linkopt(cuda_config.cpu_value,),
           "%{cudart_lib}":
-              cuda_libs["cudart"].file_name,
+              cuda_libs["cudart"].basename,
           "%{cublas_lib}":
-              cuda_libs["cublas"].file_name,
+              cuda_libs["cublas"].basename,
           "%{cusolver_lib}":
-              cuda_libs["cusolver"].file_name,
+              cuda_libs["cusolver"].basename,
           "%{cudnn_lib}":
-              cuda_libs["cudnn"].file_name,
+              cuda_libs["cudnn"].basename,
           "%{cufft_lib}":
-              cuda_libs["cufft"].file_name,
+              cuda_libs["cufft"].basename,
           "%{curand_lib}":
-              cuda_libs["curand"].file_name,
+              cuda_libs["curand"].basename,
           "%{cupti_lib}":
-              cuda_libs["cupti"].file_name,
-          "%{cuda_include_genrules}":
-              "\n".join(genrules),
+              cuda_libs["cupti"].basename,
+          "%{copy_rules}":
+              "\n".join(copy_rules),
           "%{cuda_headers}": ('":cuda-include",\n' + '        ":cudnn-include",'
                              ),
       },
@@ -1530,22 +1508,26 @@ def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
               ),
       },
   )
-  _tpl(
-      repository_ctx,
-      "cuda:remote.BUILD",
-      {
-          "%{remote_cuda_repo}": remote_config_repo,
-      },
+  repository_ctx.template(
       "cuda/BUILD",
+      Label(remote_config_repo + "/cuda:BUILD"),
+      {},
+  )
+  repository_ctx.template(
+      "cuda/build_defs.bzl",
+      Label(remote_config_repo + "/cuda:build_defs.bzl"),
+      {},
+  )
+  repository_ctx.template(
+      "cuda/cuda/cuda_config.h",
+      Label(remote_config_repo + "/cuda:cuda/cuda_config.h"),
+      {},
   )
-  _tpl(repository_ctx, "crosstool:remote.BUILD", {
-      "%{remote_cuda_repo}": remote_config_repo,
-  }, "crosstool/BUILD")
 
 
 def _cuda_autoconf_impl(repository_ctx):
   """Implementation of the cuda_autoconf repository rule."""
-  if not _enable_cuda(repository_ctx):
+  if not enable_cuda(repository_ctx):
     _create_dummy_repository(repository_ctx)
   elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
     _create_remote_cuda_repository(
diff --git a/third_party/gpus/rocm/BUILD.tpl b/third_party/gpus/rocm/BUILD.tpl
index 8258bb35897ac47c2e95c84a14089c73a075335d..502b6b8de2f520650c54f1ff01b1146b00a5c3f5 100644
--- a/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/gpus/rocm/BUILD.tpl
@@ -18,6 +18,7 @@ cc_library(
     includes = [
         ".",
         "rocm/include",
+        "rocm/include/rocrand",
     ],
     visibility = ["//visibility:public"],
 )
@@ -96,4 +97,4 @@ cc_library(
     ],
 )
 
-%{rocm_include_genrules}
+%{copy_rules}
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 6df6799bd7696d5dbcc70345bf7b5e19f709b8d4..f67fb8ae38bce23b55149a5a134f88ff879946f5 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -14,6 +14,12 @@
     `gfx803,gfx900`.
 """
 
+load(
+    ":cuda_configure.bzl",
+    "make_copy_dir_rule",
+    "make_copy_files_rule",
+)
+
 _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
 _ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
 _TF_ROCM_VERSION = "TF_ROCM_VERSION"
@@ -445,7 +451,7 @@ def _create_dummy_repository(repository_ctx):
             "%{miopen_lib}": _lib_name("miopen", cpu_value),
             "%{rocfft_lib}": _lib_name("rocfft", cpu_value),
             "%{hiprand_lib}": _lib_name("hiprand", cpu_value),
-            "%{rocm_include_genrules}": "",
+            "%{copy_rules}": "",
             "%{rocm_headers}": "",
         },
     )
@@ -510,51 +516,6 @@ def _norm_path(path):
         path = path[:-1]
     return path
 
-def _symlink_genrule_for_dir(
-        repository_ctx,
-        src_dir,
-        dest_dir,
-        genrule_name,
-        src_files = [],
-        dest_files = []):
-    """Returns a genrule to symlink(or copy if on Windows) a set of files.
-
-    If src_dir is passed, files will be read from the given directory; otherwise
-    we assume files are in src_files and dest_files
-    """
-    if src_dir != None:
-        src_dir = _norm_path(src_dir)
-        dest_dir = _norm_path(dest_dir)
-        files = _read_dir(repository_ctx, src_dir)
-
-        # Create a list with the src_dir stripped to use for outputs.
-        dest_files = files.replace(src_dir, "").splitlines()
-        src_files = files.splitlines()
-    command = []
-
-    # We clear folders that might have been generated previously to avoid
-    # undesired inclusions
-    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
-    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
-    outs = []
-    for i in range(len(dest_files)):
-        if dest_files[i] != "":
-            # If we have only one file to link we do not want to use the dest_dir, as
-            # $(@D) will include the full path to the file.
-            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
-
-            # On Windows, symlink is not supported, so we just copy all the files.
-            cmd = "ln -s"
-            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-            outs.append('        "' + dest_dir + dest_files[i] + '",')
-    genrule = _genrule(
-        src_dir,
-        genrule_name,
-        " && ".join(command),
-        "\n".join(outs),
-    )
-    return genrule
-
 def _genrule(src_dir, genrule_name, command, outs):
     """Returns a string with a genrule.
 
@@ -601,55 +562,49 @@ def _create_local_rocm_repository(repository_ctx):
     """Creates the repository containing files set up to build with ROCm."""
     rocm_config = _get_rocm_config(repository_ctx)
 
-    # Set up symbolic links for the rocm toolkit by creating genrules to do
-    # symlinking. We create one genrule for each directory we want to track under
+    # Copy header and library files to execroot.
     # rocm_toolkit_path
     rocm_toolkit_path = rocm_config.rocm_toolkit_path
-    rocm_include_path = rocm_toolkit_path + "/include"
-    genrules = [_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_include_path,
-        "rocm/include",
-        "rocm-include",
-    )]
-    genrules.append(_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_toolkit_path + "/rocfft/include",
-        "rocm/include/rocfft",
-        "rocfft-include",
-    ))
-    genrules.append(_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_toolkit_path + "/rocblas/include",
-        "rocm/include/rocblas",
-        "rocblas-include",
-    ))
-    genrules.append(_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_toolkit_path + "/miopen/include",
-        "rocm/include/miopen",
-        "miopen-include",
-    ))
+    copy_rules = [
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rocm-include",
+            src_dir = rocm_toolkit_path + "/include",
+            out_dir = "rocm/include",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rocfft-include",
+            src_dir = rocm_toolkit_path + "/rocfft/include",
+            out_dir = "rocm/include/rocfft",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rocblas-include",
+            src_dir = rocm_toolkit_path + "/rocblas/include",
+            out_dir = "rocm/include/rocblas",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "miopen-include",
+            src_dir = rocm_toolkit_path + "/miopen/include",
+            out_dir = "rocm/include/miopen",
+        ),
+    ]
 
     rocm_libs = _find_libs(repository_ctx, rocm_config)
-    rocm_lib_src = []
-    rocm_lib_dest = []
+    rocm_lib_srcs = []
+    rocm_lib_outs = []
     for lib in rocm_libs.values():
-        rocm_lib_src.append(lib.path)
-        rocm_lib_dest.append("rocm/lib/" + lib.file_name)
-    genrules.append(_symlink_genrule_for_dir(
+        rocm_lib_srcs.append(lib.path)
+        rocm_lib_outs.append("rocm/lib/" + lib.file_name)
+    copy_rules.append(make_copy_files_rule(
         repository_ctx,
-        None,
-        "",
-        "rocm-lib",
-        rocm_lib_src,
-        rocm_lib_dest,
+        name = "rocm-lib",
+        srcs = rocm_lib_srcs,
+        outs = rocm_lib_outs,
     ))
 
-    included_files = _read_dir(repository_ctx, rocm_include_path).replace(
-        rocm_include_path,
-        "",
-    ).splitlines()
 
     # Set up BUILD file for rocm/
     _tpl(
@@ -672,7 +627,7 @@ def _create_local_rocm_repository(repository_ctx):
             "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
             "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
             "%{miopen_lib}": rocm_libs["miopen"].file_name,
-            "%{rocm_include_genrules}": "\n".join(genrules),
+            "%{copy_rules}": "\n".join(copy_rules),
             "%{rocm_headers}": ('":rocm-include",\n' +
                                 '":rocfft-include",\n' +
                                 '":rocblas-include",\n' +
diff --git a/third_party/hwloc/BUILD.bazel b/third_party/hwloc/BUILD.bazel
index b73267d6680077aa855cab5d3af727e0763e0788..1f29301645cce4ee3d82d931b1ada683d8731231 100644
--- a/third_party/hwloc/BUILD.bazel
+++ b/third_party/hwloc/BUILD.bazel
@@ -8,6 +8,9 @@ licenses(["notice"])
 
 exports_files(["LICENSE"])
 
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
+
 COMMON_INCLUDE_COPTS = [
     "-I.",
     "-Ihwloc",
@@ -23,6 +26,176 @@ VAR_SETTINGS_COPTS = [
     "-DRUNSTATEDIR=",
 ]
 
+template_rule(
+    name = "include_hwloc_autogen_config_h",
+    src = "include/hwloc/autogen/config.h.in",
+    out = "include/hwloc/autogen/config.h",
+    substitutions = {
+        "#undef HWLOC_VERSION": "#define HWLOC_VERSION \"2.0.3\"",
+        "#undef HWLOC_VERSION_MAJOR": "#define HWLOC_VERSION_MAJOR 2",
+        "#undef HWLOC_VERSION_MINOR": "#define HWLOC_VERSION_MINOR 0",
+        "#undef HWLOC_VERSION_RELEASE": "#define HWLOC_VERSION_RELEASE 3",
+        "#undef HWLOC_VERSION_GREEK": "#define HWLOC_VERSION_GREEK \"\"",
+        "#undef HWLOC_LINUX_SYS": "#define HWLOC_LINUX_SYS 1",
+        "#undef hwloc_pid_t": "#define hwloc_pid_t pid_t",
+        "#undef hwloc_thread_t": "#define hwloc_thread_t pthread_t",
+        "#  undef HWLOC_HAVE_STDINT_H": "#  define HWLOC_HAVE_STDINT_H 1 ",
+        "#undef HWLOC_SYM_TRANSFORM": "#define HWLOC_SYM_TRANSFORM 0",
+        "#undef HWLOC_SYM_PREFIX": "#define HWLOC_SYM_PREFIX hwloc_",
+        "#undef HWLOC_SYM_PREFIX_CAPS": "#define HWLOC_SYM_PREFIX_CAPS HWLOC_",
+    },
+)
+
+template_rule(
+    name = "include_private_hwloc_autogen__config_h",
+    src = "include/private/autogen/config.h.in",
+    out = "include/private/autogen/config.h",
+    substitutions = {
+        "#undef HAVE_CLOCK_GETTIME": "#define HAVE_CLOCK_GETTIME 1",
+        "#undef HAVE_CTYPE_H": "#define HAVE_CTYPE_H 1",
+        "#undef HAVE_DECL_CTL_HW": "#define HAVE_DECL_CTL_HW 0",
+        "#undef HAVE_DECL_FABSF": "#define HAVE_DECL_FABSF 1",
+        "#undef HAVE_DECL_GETEXECNAME": "#define HAVE_DECL_GETEXECNAME 0",
+        "#undef HAVE_DECL_GETMODULEFILENAME": "#define HAVE_DECL_GETMODULEFILENAME 0",
+        "#undef HAVE_DECL_GETPROGNAME": "#define HAVE_DECL_GETPROGNAME 0",
+        "#undef HAVE_DECL_HW_NCPU": "#define HAVE_DECL_HW_NCPU 0",
+        "#undef HAVE_DECL_MODFF": "#define HAVE_DECL_MODFF 1",
+        "#undef HAVE_DECL_PTHREAD_GETAFFINITY_NP": "#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1",
+        "#undef HAVE_DECL_PTHREAD_SETAFFINITY_NP": "#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1",
+        "#undef HAVE_DECL_RUNNING_ON_VALGRIND": "#define HAVE_DECL_RUNNING_ON_VALGRIND 0",
+        "#undef HAVE_DECL_SCHED_GETCPU": "#define HAVE_DECL_SCHED_GETCPU 1",
+        "#undef HAVE_DECL_SNPRINTF": "#define HAVE_DECL_SNPRINTF 1",
+        "#undef HAVE_DECL_STRTOULL": "#define HAVE_DECL_STRTOULL 1",
+        "#undef HAVE_DECL__PUTENV": "#define HAVE_DECL__PUTENV 0",
+        "#undef HAVE_DECL__SC_LARGE_PAGESIZE": "#define HAVE_DECL__SC_LARGE_PAGESIZE 0",
+        "#undef HAVE_DECL__SC_NPROCESSORS_CONF": "#define HAVE_DECL__SC_NPROCESSORS_CONF 1",
+        "#undef HAVE_DECL__SC_NPROCESSORS_ONLN": "#define HAVE_DECL__SC_NPROCESSORS_ONLN 1",
+        "#undef HAVE_DECL__SC_NPROC_CONF": "#define HAVE_DECL__SC_NPROC_CONF 0",
+        "#undef HAVE_DECL__SC_NPROC_ONLN": "#define HAVE_DECL__SC_NPROC_ONLN 0",
+        "#undef HAVE_DECL__SC_PAGESIZE": "#define HAVE_DECL__SC_PAGESIZE 1",
+        "#undef HAVE_DECL__SC_PAGE_SIZE": "#define HAVE_DECL__SC_PAGE_SIZE 1",
+        "#undef HAVE_DECL__STRDUP": "#define HAVE_DECL__STRDUP 0",
+        "#undef HAVE_DIRENT_H": "#define HAVE_DIRENT_H 1",
+        "#undef HAVE_DLFCN_H": "#define HAVE_DLFCN_H 1",
+        "#undef HAVE_FFS": "#define HAVE_FFS 1",
+        "#undef HAVE_FFSL": "#define HAVE_FFSL 1",
+        "#undef HAVE_GETPAGESIZE": "#define HAVE_GETPAGESIZE 1",
+        "#undef HAVE_INTTYPES_H": "#define HAVE_INTTYPES_H 1",
+        "#undef HAVE_LANGINFO_H": "#define HAVE_LANGINFO_H 1",
+        "#undef HAVE_LOCALE_H": "#define HAVE_LOCALE_H 1",
+        "#undef HAVE_MALLOC_H": "#define HAVE_MALLOC_H 1",
+        "#undef HAVE_MEMALIGN": "#define HAVE_MEMALIGN 1",
+        "#undef HAVE_MEMORY_H": "#define HAVE_MEMORY_H 1",
+        "#undef HAVE_MKSTEMP": "#define HAVE_MKSTEMP 1",
+        "#undef HAVE_NL_LANGINFO": "#define HAVE_NL_LANGINFO 1",
+        "#undef HAVE_OPENAT": "#define HAVE_OPENAT 1",
+        "#undef HAVE_POSIX_MEMALIGN": "#define HAVE_POSIX_MEMALIGN 1",
+        "#undef HAVE_PROGRAM_INVOCATION_NAME": "#define HAVE_PROGRAM_INVOCATION_NAME 1",
+        "#undef HAVE_PTHREAD_T": "#define HAVE_PTHREAD_T 1",
+        "#undef HAVE_PUTWC": "#define HAVE_PUTWC 1",
+        "#undef HAVE_SETLOCALE": "#define HAVE_SETLOCALE 1",
+        "#undef HAVE_SSIZE_T": "#define HAVE_SSIZE_T 1",
+        "#undef HAVE_STDINT_H": "#define HAVE_STDINT_H 1",
+        "#undef HAVE_STDLIB_H": "#define HAVE_STDLIB_H 1",
+        "#undef HAVE_STRCASECMP": "#define HAVE_STRCASECMP 1",
+        "#undef HAVE_STRFTIME": "#define HAVE_STRFTIME 1",
+        "#undef HAVE_STRINGS_H": "#define HAVE_STRINGS_H 1",
+        "#undef HAVE_STRING_H": "#define HAVE_STRING_H 1",
+        "#undef HAVE_STRNCASECMP": "#define HAVE_STRNCASECMP 1",
+        "#undef HAVE_SYS_MMAN_H": "#define HAVE_SYS_MMAN_H 1",
+        "#undef HAVE_SYS_PARAM_H": "#define HAVE_SYS_PARAM_H 1",
+        "#undef HAVE_SYS_STAT_H": "#define HAVE_SYS_STAT_H 1",
+        "#undef HAVE_SYS_SYSCTL_H": "#define HAVE_SYS_SYSCTL_H 1",
+        "#undef HAVE_SYS_TYPES_H": "#define HAVE_SYS_TYPES_H 1",
+        "#undef HAVE_SYS_UTSNAME_H": "#define HAVE_SYS_UTSNAME_H 1",
+        "#undef HAVE_TIME_H": "#define HAVE_TIME_H 1",
+        "#undef HAVE_UNAME": "#define HAVE_UNAME 1",
+        "#undef HAVE_UNISTD_H": "#define HAVE_UNISTD_H 1",
+        "#undef HAVE_USELOCALE": "#define HAVE_USELOCALE 1",
+        "#undef HAVE_WCHAR_T": "#define HAVE_WCHAR_T 1",
+        "#undef HAVE_X11_KEYSYM_H": "#define HAVE_X11_KEYSYM_H 1",
+        "#undef HAVE_X11_XLIB_H": "#define HAVE_X11_XLIB_H 1",
+        "#undef HAVE_X11_XUTIL_H": "#define HAVE_X11_XUTIL_H 1",
+        "#undef HAVE_XLOCALE_H": "#define HAVE_XLOCALE_H 1",
+        "#undef HAVE___PROGNAME": "#define HAVE___PROGNAME 1",
+        "#undef HWLOC_C_HAVE_VISIBILITY": "#define HWLOC_C_HAVE_VISIBILITY 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE": "#define HWLOC_HAVE_ATTRIBUTE 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_ALIGNED": "#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE": "#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_COLD": "#define HWLOC_HAVE_ATTRIBUTE_COLD 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_CONST": "#define HWLOC_HAVE_ATTRIBUTE_CONST 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_DEPRECATED": "#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_FORMAT": "#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_HOT": "#define HWLOC_HAVE_ATTRIBUTE_HOT 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_MALLOC": "#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS": "#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_NONNULL": "#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_NORETURN": "#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION": "#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_PACKED": "#define HWLOC_HAVE_ATTRIBUTE_PACKED 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_PURE": "#define HWLOC_HAVE_ATTRIBUTE_PURE 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_SENTINEL": "#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_UNUSED": "#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT": "#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1",
+        "#undef HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS": "#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1",
+        "#undef HWLOC_HAVE_CPU_SET": "#define HWLOC_HAVE_CPU_SET 1",
+        "#undef HWLOC_HAVE_CPU_SET_S": "#define HWLOC_HAVE_CPU_SET_S 1",
+        "#undef HWLOC_HAVE_DECL_FFS": "#define HWLOC_HAVE_DECL_FFS 1",
+        "#undef HWLOC_HAVE_DECL_FFSL": "#define HWLOC_HAVE_DECL_FFSL 1",
+        "#undef HWLOC_HAVE_DECL_STRCASECMP": "#define HWLOC_HAVE_DECL_STRCASECMP 1",
+        "#undef HWLOC_HAVE_DECL_STRNCASECMP": "#define HWLOC_HAVE_DECL_STRNCASECMP 1",
+        "#undef HWLOC_HAVE_FFS": "#define HWLOC_HAVE_FFS 1",
+        "#undef HWLOC_HAVE_FFSL": "#define HWLOC_HAVE_FFSL 1",
+        "#undef HWLOC_HAVE_LIBTERMCAP": "#define HWLOC_HAVE_LIBTERMCAP 1",
+        "#undef HWLOC_HAVE_LINUXIO": "#define HWLOC_HAVE_LINUXIO 1",
+        "#undef HWLOC_HAVE_PTHREAD_MUTEX": "#define HWLOC_HAVE_PTHREAD_MUTEX 1",
+        "#undef HWLOC_HAVE_SCHED_SETAFFINITY": "#define HWLOC_HAVE_SCHED_SETAFFINITY 1",
+        "#undef HWLOC_HAVE_STDINT_H": "#define HWLOC_HAVE_STDINT_H 1",
+        "#undef HWLOC_HAVE_SYSCALL": "#define HWLOC_HAVE_SYSCALL 1",
+        "#undef HWLOC_HAVE_X11_KEYSYM": "#define HWLOC_HAVE_X11_KEYSYM 1",
+        "#undef HWLOC_HAVE_X86_CPUID": "#define HWLOC_HAVE_X86_CPUID 1",
+        "#undef HWLOC_LINUX_SYS": "#define HWLOC_LINUX_SYS 1",
+        "#undef HWLOC_SIZEOF_UNSIGNED_INT": "#define HWLOC_SIZEOF_UNSIGNED_INT 4",
+        "#undef HWLOC_SIZEOF_UNSIGNED_LONG": "#define HWLOC_SIZEOF_UNSIGNED_LONG 8",
+        "#undef HWLOC_SYM_PREFIX": "#define HWLOC_SYM_PREFIX hwloc_",
+        "#undef HWLOC_SYM_PREFIX_CAPS": "#define HWLOC_SYM_PREFIX_CAPS HWLOC_",
+        "#undef HWLOC_SYM_TRANSFORM": "#define HWLOC_SYM_TRANSFORM 0",
+        "#undef HWLOC_USE_NCURSES": "#define HWLOC_USE_NCURSES 1",
+        "#undef HWLOC_VERSION": "#define HWLOC_VERSION \"2.0.3\"",
+        "#undef HWLOC_VERSION_GREEK": "#define HWLOC_VERSION_GREEK \"\"",
+        "#undef HWLOC_VERSION_MAJOR": "#define HWLOC_VERSION_MAJOR 2",
+        "#undef HWLOC_VERSION_MINOR": "#define HWLOC_VERSION_MINOR 0",
+        "#undef HWLOC_VERSION_RELEASE": "#define HWLOC_VERSION_RELEASE 3",
+        "#undef HWLOC_X86_64_ARCH": "#define HWLOC_X86_64_ARCH 1",
+        "#undef LT_OBJDIR": "#define LT_OBJDIR \".libs/\"",
+        "#undef PACKAGE": "#define PACKAGE \"hwloc\"",
+        "#undef PACKAGE_BUGREPORT": "#define PACKAGE_BUGREPORT \"http://github.com/open-mpi/hwloc/i",
+        "#undef PACKAGE_NAME": "#define PACKAGE_NAME \"hwloc\"",
+        "#undef PACKAGE_STRING": "#define PACKAGE_STRING \"hwloc 2.0.3\"",
+        "#undef PACKAGE_TARNAME": "#define PACKAGE_TARNAME \"hwloc\"",
+        "#undef PACKAGE_URL": "#define PACKAGE_URL \"\"",
+        "#undef PACKAGE_VERSION": "#define PACKAGE_VERSION \"2.0.3\"",
+        "#undef SIZEOF_UNSIGNED_INT": "#define SIZEOF_UNSIGNED_INT 4",
+        "#undef SIZEOF_UNSIGNED_LONG": "#define SIZEOF_UNSIGNED_LONG 8",
+        "#undef SIZEOF_VOID_P": "#define SIZEOF_VOID_P 8",
+        "#undef STDC_HEADERS": "#define STDC_HEADERS 1",
+        "# undef _HPUX_SOURCE": "# define _HPUX_SOURCE 1",
+        "# undef _ALL_SOURCE": "# define _ALL_SOURCE 1",
+        "# undef _GNU_SOURCE": "# define _GNU_SOURCE 1",
+        "# undef _POSIX_PTHREAD_SEMANTICS": "# define _POSIX_PTHREAD_SEMANTICS 1",
+        "# undef _TANDEM_SOURCE": "# define _TANDEM_SOURCE 1",
+        "# undef __EXTENSIONS__": "# define __EXTENSIONS__ 1",
+        "#undef VERSION": "#define VERSION \"2.0.3\"",
+        "#undef _HPUX_SOURCE": "#define _HPUX_SOURCE 1",
+        "#undef hwloc_pid_t": "#define hwloc_pid_t pid_t",
+        "#undef hwloc_thread_t": "#define hwloc_thread_t pthread_t",
+    } + if_cuda({
+        "#undef HAVE_CUDA": "#undef HAVE_CUDA 1",
+        "#undef HAVE_CUDA_H": "#undef HAVE_CUDA_H 1",
+        "#undef HAVE_CUDA_RUNTIME_API_H": "#undef HAVE_CUDA_RUNTIME_API_H 1",
+    }),
+)
+
 cc_library(
     name = "hwloc",
     srcs = [
diff --git a/third_party/mkl/mkl.BUILD b/third_party/mkl/mkl.BUILD
index c3a71e4ff9b33a06a87f0f90978eaf3a718c7de6..3f3c9e9336afa551ed904150319a92e378288a5f 100644
--- a/third_party/mkl/mkl.BUILD
+++ b/third_party/mkl/mkl.BUILD
@@ -12,7 +12,7 @@ filegroup(
 
 cc_library(
     name = "mkl_headers",
-    srcs = glob(["include/*"]),
+    srcs = glob(["include/*(.cc|.cpp|.cxx|.c++|.C|.c|.h|.hh|.hpp|.ipp|.hxx|.inc|.S|.s|.asm|.a|.lib|.pic.a|.lo|.lo.lib|.pic.lo|.so|.dylib|.dll|.o|.obj|.pic.o)"]),
     includes = ["include"],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index 7a08f97ef328a7a731d7c76de8bda70c8d004dac..5901c6b296fa0f4da8061b2b44daed18cd0b3558 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -1,174 +1,106 @@
 # NVIDIA NCCL 2
 # A package of optimized primitives for collective multi-GPU communication.
 
-licenses(["restricted"])
+licenses(["notice"])
 
 exports_files(["LICENSE.txt"])
 
 load(
     "@local_config_nccl//:build_defs.bzl",
-    "gen_nccl_h",
-    "nccl_library",
-    "rdc_copts",
-    "rdc_library",
-)
-load(
-    "@local_config_cuda//cuda:build_defs.bzl",
-    "cuda_default_copts",
-)
-
-# Generate the nccl.h header file.
-gen_nccl_h(
-    name = "nccl_h",
-    output = "src/nccl.h",
-    template = "src/nccl.h.in",
+    "cuda_rdc_library",
+    "gen_device_srcs",
+    "process_srcs",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
 
-nccl_library(
+cc_library(
     name = "src_hdrs",
-    hdrs = [
-        "src/nccl.h",
-        # src/include/common_coll.h #includes "collectives/collectives.h".
-        # All other #includes of collectives.h are patched in process_srcs.
+    hdrs = process_srcs([
         "src/collectives/collectives.h",
-    ],
-    strip_include_prefix = "src",
+        "src/nccl.h.in",
+    ]),
 )
 
-nccl_library(
+cc_library(
     name = "include_hdrs",
-    hdrs = glob(["src/include/*.h"]),
-    strip_include_prefix = "src/include",
-)
-
-filegroup(
-    name = "device_hdrs",
-    srcs = glob(["src/collectives/device/*.h"]),
+    hdrs = process_srcs(glob(["src/include/*.h"])),
+    strip_include_prefix = "include",
 )
 
-filegroup(
-    name = "device_srcs",
-    srcs = [
-        "src/collectives/device/all_gather.cu",
-        "src/collectives/device/all_reduce.cu",
-        "src/collectives/device/broadcast.cu",
-        "src/collectives/device/reduce.cu",
-        "src/collectives/device/reduce_scatter.cu",
-    ],
-)
+device_srcs = process_srcs([
+    "src/collectives/device/all_gather.cu",
+    "src/collectives/device/all_reduce.cu",
+    "src/collectives/device/broadcast.cu",
+    "src/collectives/device/reduce.cu",
+    "src/collectives/device/reduce_scatter.cu",
+])
 
-nccl_library(
+# NCCL compiles the same source files with different NCCL_OP defines. RDC
+# compilation requires that each compiled module has a unique ID. Clang derives
+# the module ID from the path only so we need to rename the files to get
+# different IDs for different parts of compilation. NVCC does not have that
+# problem because it generates IDs based on preprocessed content.
+gen_device_srcs(
     name = "sum",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=0"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "sum_",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 0,
 )
 
-nccl_library(
+gen_device_srcs(
     name = "prod",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=1"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "_prod",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 1,
 )
 
-nccl_library(
+gen_device_srcs(
     name = "min",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=2"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "min_",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 2,
 )
 
-nccl_library(
+gen_device_srcs(
     name = "max",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=3"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "max_",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 3,
 )
 
-nccl_library(
-    name = "functions",
+cuda_rdc_library(
+    name = "device",
     srcs = [
-        "src/collectives/device/functions.cu",
-        ":device_hdrs",
-    ],
-    copts = rdc_copts(),
-    linkstatic = True,
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
-)
-
-rdc_library(
-    name = "device_code",
-    deps = [
-        ":functions",
         ":max",
         ":min",
         ":prod",
         ":sum",
+    ] + process_srcs(glob([
+        "src/collectives/device/*.h",
+        "src/collectives/device/functions.cu",
+    ])),
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
     ],
 )
 
 # Primary NCCL target.
-nccl_library(
+cc_library(
     name = "nccl",
-    srcs = glob(
+    srcs = process_srcs(glob(
         include = ["src/**/*.cu"],
         # Exclude device-library code.
         exclude = ["src/collectives/device/**"],
-    ) + [
+    )) + [
         # Required for header inclusion checking (see
         # http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
-        # Files in src/ which #include "nccl.h" load it from there rather than
-        # from the virtual includes directory.
-        "src/nccl.h",
+        "nccl.h",
+        "collectives/collectives.h",
     ],
-    hdrs = ["src/nccl.h"],
-    copts = cuda_default_copts(),
+    hdrs = ["nccl.h"],
+    copts = cuda_default_copts() + ["-Wno-vla"],
     include_prefix = "third_party/nccl",
-    strip_include_prefix = "src",
     visibility = ["//visibility:public"],
     deps = [
-        ":device_code",
+        ":device",
         ":include_hdrs",
-        ":src_hdrs",
+        "@local_config_cuda//cuda:cudart_static",
     ],
 )
diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl
index 42de79c411c844d48982c47753337102b915aefd..245f180a91b6f2661fc35d834aa13f9347b1f330 100644
--- a/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -1,87 +1,97 @@
 """Repository rule for NCCL."""
 
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
 
-def _gen_nccl_h_impl(ctx):
-    """Creates nccl.h from a template."""
+def _process_src_impl(ctx):
+    """Applies various patches to the NCCL source."""
+    substitutions = {
+        "\"collectives.h": "\"collectives/collectives.h",
+        "\"../collectives.h": "\"collectives/collectives.h",
+        # Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
+        # TODO(csigg): Apply substitutions upstream and remove here.
+        "#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
+        "#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
+        "#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
+        "#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
+        "nullptr_t": "std::nullptr_t",
+    }
+    if ctx.file.src.basename == "nccl.h.in":
+        substitutions.update({
+          "${nccl:Major}": "2",
+          "${nccl:Minor}": "3",
+          "${nccl:Patch}": "5",
+          "${nccl:Suffix}": "",
+          "${nccl:Version}": "2305",
+        })
+    if ctx.file.src.basename == "function.cu":
+        substitutions.update({
+            # Don't try to initialize the host shadow copy of this device-side
+            # global variable. There is no host pointer to a device-side
+            # function, which confuses clang.
+            # TODO(csigg): remove when fixed in clang.
+            "NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
+            "NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
+        })
     ctx.actions.expand_template(
-        output = ctx.outputs.output,
-        template = ctx.file.template,
-        substitutions = {
-            "${nccl:Major}": "2",
-            "${nccl:Minor}": "3",
-            "${nccl:Patch}": "5",
-            "${nccl:Suffix}": "",
-            "${nccl:Version}": "2305",
-        },
+        output = ctx.outputs.out,
+        template = ctx.file.src,
+        substitutions = substitutions,
     )
 
-gen_nccl_h = rule(
-    implementation = _gen_nccl_h_impl,
+_process_src = rule(
+    implementation = _process_src_impl,
     attrs = {
-        "template": attr.label(allow_single_file = True),
-        "output": attr.output(),
+        "src": attr.label(allow_single_file = True),
+        "out": attr.output(),
     },
 )
-"""Creates the NCCL header file."""
+"""Processes one NCCL source file so it can be compiled with bazel and clang."""
+
+def _out(src):
+    if not src.startswith("src/"):
+      fail("Source file not under src/...:", src)
+    src = src[4:]  # Strip 'src/'
+    if src == "nccl.h.in":
+      return "nccl.h"
+    if src.endswith(".cu"):
+      return src + ".cc"
+    return src
+
+def process_srcs(srcs):
+    """Processes files under src/ and copies them to the parent directory."""
+    [_process_src(
+      name = "_" + src,
+      src = src,
+      out = _out(src),
+    ) for src in srcs]
+    return ["_" + src for src in srcs]
 
-def _process_srcs_impl(ctx):
-    """Appends .cc to .cu files, patches include directives."""
+def _gen_device_srcs_impl(ctx):
     files = []
     for src in ctx.files.srcs:
-        if not src.is_source:
-            # Process only once, specifically "src/nccl.h".
-            files.append(src)
-            continue
-        name = src.basename
-        if src.extension == "cu":
-            name = ctx.attr.prefix + name + ".cc"
+        name = "%s_%s" % (ctx.attr.name, src.basename)
         file = ctx.actions.declare_file(name, sibling = src)
         ctx.actions.expand_template(
             output = file,
             template = src,
             substitutions = {
-                "\"collectives.h": "\"collectives/collectives.h",
-                "\"../collectives.h": "\"collectives/collectives.h",
-                "#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
-                # Substitutions are applied in order.
-                "std::nullptr_t": "nullptr_t",
-                "nullptr_t": "std::nullptr_t",
+                "#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
             },
         )
         files.append(file)
     return [DefaultInfo(files = depset(files))]
 
-_process_srcs = rule(
-    implementation = _process_srcs_impl,
+gen_device_srcs = rule(
+    implementation = _gen_device_srcs_impl,
     attrs = {
         "srcs": attr.label_list(allow_files = True),
-        "prefix": attr.string(default = ""),
+        "NCCL_OP": attr.int(),
     },
 )
-"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
-
-def nccl_library(name, srcs = None, hdrs = None, prefix = None, **kwargs):
-    """Processes the srcs and hdrs and creates a cc_library."""
-
-    _process_srcs(
-        name = name + "_srcs",
-        srcs = srcs,
-        prefix = prefix,
-    )
-    _process_srcs(
-        name = name + "_hdrs",
-        srcs = hdrs,
-    )
+"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
 
-    native.cc_library(
-        name = name,
-        srcs = [name + "_srcs"] if srcs else [],
-        hdrs = [name + "_hdrs"] if hdrs else [],
-        **kwargs
-    )
-
-def rdc_copts():
+def _rdc_copts():
     """Returns copts for compiling relocatable device code."""
 
     # The global functions can not have a lower register count than the
@@ -100,118 +110,255 @@ def rdc_copts():
             "-fcuda-rdc",
             "-Xcuda-ptxas",
             maxrregcount,
+            # Work around for clang bug (fixed in r348662), declaring
+            # '__device__ operator delete(void*, std::size_t)' non-inline.
+            # TODO(csigg): Only add this option for older clang versions.
+            "-std=gnu++11",
         ],
         "//conditions:default": [],
-    }) + ["-fvisibility=hidden"]
+    })
 
-def _filter_impl(ctx):
-    suffix = ctx.attr.suffix
-    files = [src for src in ctx.files.srcs if src.path.endswith(suffix)]
-    return [DefaultInfo(files = depset(files))]
+def _lookup_file(filegroup, path):
+    """Extracts file at (relative) path in filegroup."""
+    for file in filegroup.files:
+        if file.path.endswith(path):
+            return file
+    return None
 
-_filter = rule(
-    implementation = _filter_impl,
-    attrs = {
-        "srcs": attr.label_list(allow_files = True),
-        "suffix": attr.string(),
-    },
-)
-"""Filters the srcs to the ones ending with suffix."""
+def _pic_only(files):
+    """Returns the PIC files if there are any in 'files', otherwise 'files'."""
+    pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
+    return pic_only if pic_only else files
+
+def _device_link_impl(ctx):
+    if not ctx.attr.gpu_archs:
+        fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")
+
+    inputs = []
+    for dep in ctx.attr.deps:
+        inputs += dep.files.to_list()
+    inputs = _pic_only(inputs)
 
-def _gen_link_src_impl(ctx):
+    # Device-link to cubins for each architecture.
+    name = ctx.attr.name
+    register_h = None
+    cubins = []
+    images = []
+    for arch in ctx.attr.gpu_archs:
+        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
+        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
+        ctx.actions.run(
+            outputs = [register_h, cubin],
+            inputs = inputs,
+            executable = ctx.file._nvlink,
+            arguments = ctx.attr.nvlink_args + [
+                "--arch=%s" % arch,
+                "--register-link-binaries=%s" % register_h.path,
+                "--output-file=%s" % cubin.path,
+            ] + [file.path for file in inputs],
+            mnemonic = "nvlink",
+        )
+        cubins.append(cubin)
+        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
+
+    # Generate fatbin header from all cubins.
+    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
+    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
+    bin2c = ctx.file._bin2c
+    ctx.actions.run(
+        outputs = [tmp_fatbin, fatbin_h],
+        inputs = cubins,
+        executable = ctx.file._fatbinary,
+        arguments = [
+            "-64",
+            "--cmdline=--compile-only",
+            "--link",
+            "--compress-all",
+            "--bin2c-path=%s" % bin2c.dirname,
+            "--create=%s" % tmp_fatbin.path,
+            "--embedded-fatbin=%s" % fatbin_h.path,
+        ] + images,
+        tools = [bin2c],
+        mnemonic = "fatbinary",
+    )
+
+    # Generate the source file #including the headers generated above.
     ctx.actions.expand_template(
-        output = ctx.outputs.output,
-        template = ctx.file.template,
+        output = ctx.outputs.out,
+        template = ctx.file._link_stub,
         substitutions = {
-            "REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
-            "FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
+            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
+            "FATBINFILE": '"%s"' % fatbin_h.short_path,
         },
     )
 
-_gen_link_src = rule(
-    implementation = _gen_link_src_impl,
+    return [DefaultInfo(files = depset([register_h, fatbin_h]))]
+
+_device_link = rule(
+    implementation = _device_link_impl,
     attrs = {
-        "register_hdr": attr.label(allow_single_file = True),
-        "fatbin_hdr": attr.label(allow_single_file = True),
-        "template": attr.label(allow_single_file = True),
-        "output": attr.output(),
+        "deps": attr.label_list(),
+        "out": attr.output(mandatory = True),
+        "gpu_archs": attr.string_list(),
+        "nvlink_args": attr.string_list(),
+        "_nvlink": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
+        ),
+        "_fatbinary": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
+        ),
+        "_bin2c": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
+        ),
+        "_link_stub": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
+            allow_single_file = True,
+        ),
     },
 )
-"""Patches the include directives for the link.stub file."""
-
-def rdc_library(name, deps):
-    """Produces a cc_library from deps containing relocatable device code."""
-
-    # From .a and .pic.a archives, just use the latter. Otherwise we get
-    # multiply defined symbols.
-    # TODO(csigg): C++ Sandwich once available should allow passing this target
-    # to a cc_library dependency, which would avoid the linking order issue.
-    _filter(
-        name = name + "_deps_a",
-        srcs = deps,
-        suffix = ".pic.a",
+"""Links device code and generates source code for kernel registration."""
+
+def _merge_archive_impl(ctx):
+    # Generate an mri script to the merge archives in srcs and pass it to 'ar'.
+    # See https://stackoverflow.com/a/23621751.
+    files = _pic_only(ctx.files.srcs)
+    mri_script = "create " + ctx.outputs.out.path
+    for f in files:
+        mri_script += "\\naddlib " + f.path
+    mri_script += "\\nsave\\nend"
+
+    cc_toolchain = find_cpp_toolchain(ctx)
+    ctx.actions.run_shell(
+        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
+        outputs = [ctx.outputs.out],
+        command = ("printf \"%s\" " % mri_script +
+                   "| %s -M" % cc_toolchain.ar_executable),
     )
 
-    # Device-link to cubins for each architecture.
-    images = []
-    cubins = []
-    for arch in %{gpu_architectures}:
-        cubin = "%s_%s.cubin" % (name, arch)
-        register_hdr = "%s_%s.h" % (name, arch)
-        nvlink = "@local_config_nccl//:nvlink"
-        cmd = ("$(location %s) " % nvlink +
-               select({
-                   # NCCL is only supported on Linux.
-                   "@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
-                   "@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
-                   "//conditions:default": "",
-               }) +
-               "--arch=%s $(SRCS) " % arch +
-               "--register-link-binaries=$(location %s) " % register_hdr +
-               "--output-file=$(location %s)" % cubin)
-        native.genrule(
-            name = "%s_%s" % (name, arch),
-            outs = [register_hdr, cubin],
-            srcs = [name + "_deps_a"],
-            cmd = cmd,
-            tools = [nvlink],
-        )
-        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
-        cubins.append(cubin)
+_merge_archive = rule(
+    implementation = _merge_archive_impl,
+    attrs = {
+        "srcs": attr.label_list(mandatory = True, allow_files = True),
+        "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
+        # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
+    },
+    outputs = {"out": "lib%{name}.a"},
+)
+"""Merges srcs into a single archive."""
 
-    # Generate fatbin header from all cubins.
-    fatbin_hdr = name + ".fatbin.h"
-    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
-    bin2c = "@local_config_nccl//:cuda/bin/bin2c"
-    cmd = ("$(location %s) -64 --cmdline=--compile-only " % fatbinary +
-           "--link --bin2c-path $$(dirname $(location %s)) " % bin2c +
-           "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
-           "--embedded-fatbin=$@")
-    native.genrule(
-        name = name + "_fatbin_h",
-        outs = [fatbin_hdr],
-        srcs = cubins,
-        cmd = cmd,
-        tools = [fatbinary, bin2c],
+def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
+    """Produces a cuda_library using separate compilation and linking.
+
+    CUDA separate compilation and linking allows device function calls across
+    translation units. This is different from the normal whole program
+    compilation where each translation unit contains all device code. For more
+    background, see
+    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
+    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
+
+    During separate compilation, the different CUDA source files are compiled
+    to 'relocatable device code' (RDC) and embedded in the host object files.
+    When using nvcc, linking the device code for each supported GPU
+    architecture and generating kernel registration code for the CUDA runtime
+    is handled automatically. Clang supports generating relocatable device
+    code, but it can't link it. We therefore rely on tools provided by the CUDA
+    SDK to link the device code and generate the host code to register the
+    kernels.
+
+    The nvlink tool extracts the RDC code from the object files and links it
+    into cubin files, one per GPU architecture. It also produces a header file
+    with a list of kernel names to register. The cubins are merged into a
+    binary blob using the fatbinary tool, and converted to a C header file with
+    the help of the bin2c tool. The registration header file, the fatbinary
+    header file, and the link.stub file (shipped with the CUDA SDK) are
+    compiled as ordinary host code.
+
+    Here is a diagram of the CUDA separate compilation trajectory:
+
+     x.cu.cc    y.cu.cc
+           \    /            cc_library (compile RDC and archive)
+            xy.a
+           /    \            * nvlink
+    register.h  xy.cubin
+          :      |           * fatbinary and bin2c
+          :     xy.fatbin.h
+          :      :           * #include
+          dlink.cc           * Expanded from crt/dlink.stub template
+             |               cc_library (host compile and archive)
+          dlink.a
+
+    The steps marked with '*' are implemented in the _device_link rule.
+
+    The object files in both xy.a and dlink.a reference symbols defined in the
+    other archive. The separate archives are a side effect of using two
+    cc_library targets to implement a single compilation trajectory. We could
+    fix this once bazel supports C++ sandwich. For now, we just merge the two
+    archives to avoid unresolved symbols:
+
+    xy.a      dlink.a
+        \    /           merge archive
+      xy_dlink.a
+           |             cc_library (or alternatively, cc_import)
+     final target
+
+    Another complication is that cc_library produces (depending on the
+    configuration) both PIC and non-PIC archives, but the distinction
+    is hidden from Starlark until C++ sandwich becomes available. We work
+    around this by dropping the non-PIC files if PIC files are available.
+
+    Args:
+      name: Target name.
+      hdrs: Header files.
+      copts: Compiler options.
+      linkstatic: Must be true.
+      **kwargs: Any other arguments.
+    """
+
+    if not hdrs:
+        hdrs = []
+    if not copts:
+        copts = []
+
+    # Compile host and device code into library.
+    lib = name + "_lib"
+    native.cc_library(
+        name = lib,
+        hdrs = hdrs,
+        copts = _rdc_copts() + copts,
+        linkstatic = linkstatic,
+        **kwargs
     )
 
-    # Generate the source file #including the headers generated above.
-    _gen_link_src(
-        name = name + "_dlink_src",
-        # Include just the last one, they are equivalent.
-        register_hdr = register_hdr,
-        fatbin_hdr = fatbin_hdr,
-        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
-        output = name + ".cc",
+    # Generate source file containing linked device code.
+    dlink_hdrs = name + "_dlink_hdrs"
+    dlink_cc = name + "_dlink.cc"
+    _device_link(
+        name = dlink_hdrs,
+        deps = [lib],
+        out = dlink_cc,
+        gpu_archs = %{gpu_architectures},
+        nvlink_args = select({
+            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
+            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
+            "//conditions:default": [],
+        }),
     )
 
-    # Compile the source file into the cc_library.
+    # Compile the source file into a library.
+    dlink = name + "_dlink"
     native.cc_library(
-        name = name + "_dlink_a",
-        srcs = [
-            name + "_dlink_src",
-        ],
-        textual_hdrs = [register_hdr, fatbin_hdr],
+        name = dlink,
+        srcs = [dlink_cc],
+        textual_hdrs = [dlink_hdrs],
         deps = [
             "@local_config_cuda//cuda:cuda_headers",
         ],
@@ -222,31 +369,22 @@ def rdc_library(name, deps):
             "__NV_EXTRA_INITIALIZATION=",
             "__NV_EXTRA_FINALIZATION=",
         ],
-        linkstatic = True,
+        linkstatic = linkstatic,
     )
 
-    # Repackage deps into a single archive. This avoid unresolved symbols when
-    # the archives happen to be linked in the wrong order. For more details, see
+    # Repackage the two libs into a single archive. This is required because
+    # both libs reference symbols defined in the other one. For details, see
     # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
-    native.genrule(
-        name = name + "_a",
-        srcs = [
-            name + "_deps_a",
-            name + "_dlink_a",
-        ],
-        outs = [name + ".a"],
-        # See https://stackoverflow.com/a/23621751
-        cmd = """
-addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
-printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
-""",
+    archive = name + "_a"
+    _merge_archive(
+        name = archive,
+        srcs = [lib, dlink],
     )
 
+    # Create cc target from archive.
     native.cc_library(
         name = name,
-        srcs = [name + "_a"],
-        deps = [
-            "@local_config_cuda//cuda:cudart_static",
-        ],
-        linkstatic = True,
+        srcs = [archive],
+        hdrs = hdrs,
+        linkstatic = linkstatic,
     )
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index 1e6422b49ef4d7ce97b3b38f3b3964281a158b7c..3a836fadc336cc100d1e9f62696bd8980efabc71 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -13,7 +13,9 @@ load(
     "auto_configure_fail",
     "compute_capabilities",
     "cuda_toolkit_path",
+    "enable_cuda",
     "find_cuda_define",
+    "get_cpu_value",
     "matches_version",
 )
 
@@ -22,7 +24,7 @@ _NCCL_HDR_PATH = "NCCL_HDR_PATH"
 _NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
 _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
 _TF_NCCL_VERSION = "TF_NCCL_VERSION"
-_TF_NCCL_CONFIG_REPO = "TF_NCCL_CONFIG_REPO"
+_TF_NEED_CUDA = "TF_NEED_CUDA"
 
 _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
 _DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
@@ -41,13 +43,6 @@ cc_library(
 """
 
 _NCCL_ARCHIVE_BUILD_CONTENT = """
-exports_files([
-    "cuda/bin/crt/link.stub",
-    "cuda/bin/fatbinary",
-    "cuda/bin/bin2c",
-    "nvlink",
-])
-
 filegroup(
   name = "LICENSE",
   data = ["@nccl_archive//:LICENSE.txt"],
@@ -116,26 +111,24 @@ def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_v
     header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
     if not matches_version(nccl_version, header_version):
         auto_configure_fail(
-            ("NCCL library version detected from %s/nccl.h (%s) does not match " +
-             "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+            ("NCCL library version detected from %s/nccl.h (%s) does not " +
+             "match TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
             (header_dir, header_version, nccl_version),
         )
 
 def _nccl_configure_impl(repository_ctx):
     """Implementation of the nccl_configure repository rule."""
-    if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+    if not enable_cuda(repository_ctx) or \
+       get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD"):
         # Add a dummy build file to make bazel query happy.
         repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
         return
 
-    if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
-        # Forward to the pre-configured remote repository.
-        repository_ctx.template("BUILD", _label("remote.BUILD.tpl"), {
-            "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
-        })
-        return
+    nccl_version = ""
+    if _TF_NCCL_VERSION in repository_ctx.os.environ:
+        nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+        nccl_version = nccl_version.split(".")[0]
 
-    nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
         repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
@@ -151,15 +144,6 @@ def _nccl_configure_impl(repository_ctx):
         repository_ctx.template("build_defs.bzl", _label("build_defs.bzl.tpl"), {
             "%{gpu_architectures}": str(gpu_architectures),
         })
-
-        repository_ctx.symlink(cuda_toolkit_path(repository_ctx), "cuda")
-
-        # Temporary work-around for setups which symlink ptxas to a newer
-        # version. The versions of nvlink and ptxas need to agree, so we find
-        # nvlink next to the real location of ptxas. This is only temporary and
-        # will be removed again soon.
-        nvlink_dir = repository_ctx.path("cuda/bin/ptxas").realpath.dirname
-        repository_ctx.symlink(nvlink_dir.get_child("nvlink"), "nvlink")
     else:
         # Create target for locally installed NCCL.
         nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
@@ -179,7 +163,7 @@ nccl_configure = repository_rule(
         _NCCL_INSTALL_PATH,
         _TF_NCCL_VERSION,
         _TF_CUDA_COMPUTE_CAPABILITIES,
-        _TF_NCCL_CONFIG_REPO,
+        _TF_NEED_CUDA,
     ],
 )
 """Detects and configures the NCCL configuration.
diff --git a/third_party/nccl/remote.BUILD.tpl b/third_party/nccl/remote.BUILD.tpl
deleted file mode 100644
index d66fc5563d16edc81c9d883984e438f82e6820ae..0000000000000000000000000000000000000000
--- a/third_party/nccl/remote.BUILD.tpl
+++ /dev/null
@@ -1,6 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(name="LICENSE", actual = "%{target}:LICENSE")
-alias(name = "nccl", actual = "%{target}:nccl")
diff --git a/third_party/pasta/BUILD b/third_party/pasta/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9bd256a57939c402a1f2240f2ddc53f97794c56b
--- /dev/null
+++ b/third_party/pasta/BUILD
@@ -0,0 +1 @@
+# Empty BUILD file to force build system to see this directory at all.
diff --git a/third_party/pasta/BUILD.bazel b/third_party/pasta/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..ade681b606953b1df3e0140f83d714a39384c221
--- /dev/null
+++ b/third_party/pasta/BUILD.bazel
@@ -0,0 +1,30 @@
+# Description:
+#   AST-based python refactoring.
+load("@//third_party/pasta:build_defs.bzl", "copy_srcs")
+
+licenses(["notice"])  # Apache2
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "pasta",
+    srcs = copy_srcs([
+        "__init__.py",
+        "augment/__init__.py",
+        "augment/errors.py",
+        "augment/import_utils.py",
+        "augment/inline.py",
+        "augment/rename.py",
+        "base/__init__.py",
+        "base/annotate.py",
+        "base/ast_constants.py",
+        "base/ast_utils.py",
+        "base/codegen.py",
+        "base/formatting.py",
+        "base/scope.py",
+        "base/test_utils.py",
+        "base/token_generator.py",
+    ]),
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/pasta/BUILD.system b/third_party/pasta/BUILD.system
new file mode 100644
index 0000000000000000000000000000000000000000..6adc953c5abdc4bc5495fdf1bceef242a7bac61a
--- /dev/null
+++ b/third_party/pasta/BUILD.system
@@ -0,0 +1,13 @@
+# Description: Pasta, AST based python refactoring.
+
+licenses(["notice"])  # Apache2
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "pasta",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/pasta/build_defs.bzl b/third_party/pasta/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..0a5316de402b8cb6d59ba271400bf4d9bee9f033
--- /dev/null
+++ b/third_party/pasta/build_defs.bzl
@@ -0,0 +1,12 @@
+"""Skylark makros for building pasta."""
+
+def copy_srcs(srcs):
+    """Copies srcs from 'pasta' to parent directory."""
+    for src in srcs:
+        native.genrule(
+            name = src.replace(".", "_"),
+            srcs = ["pasta/" + src],
+            outs = [src],
+            cmd = "mkdir -p $$(dirname $@); cp $< $@",
+        )
+    return srcs
diff --git a/third_party/pasta/workspace.bzl b/third_party/pasta/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..e46cc4a45e42cc8e9da0e8c8401f05673286686d
--- /dev/null
+++ b/third_party/pasta/workspace.bzl
@@ -0,0 +1,16 @@
+"""Loads pasta python package."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "pasta",
+        urls = [
+            "https://mirror.bazel.build/github.com/google/pasta/archive/c3d72cdee6fc806251949e912510444d58d7413c.tar.gz",
+            "https://github.com/google/pasta/archive/c3d72cdee6fc806251949e912510444d58d7413c.tar.gz",
+        ],
+        strip_prefix = "pasta-c3d72cdee6fc806251949e912510444d58d7413c",
+        sha256 = "b5905f9cecc4b28363c563f3c4cb0545288bd35f7cc72c55066e97e53befc084",
+        build_file = "//third_party/pasta:BUILD.bazel",
+        system_build_file = "//third_party/pasta:BUILD.system",
+    )
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index 53264630a1618e448182c946707fb5336930cf6d..9a7581c246d9d4468f5264cb8975c7260012e2a1 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -294,9 +294,7 @@ def _create_local_python_repository(repository_ctx):
 def _create_remote_python_repository(repository_ctx, remote_config_repo):
   """Creates pointers to a remotely configured repo set up to build with Python.
   """
-  _tpl(repository_ctx, "remote.BUILD", {
-      "%{REMOTE_PYTHON_REPO}": remote_config_repo,
-  }, "BUILD")
+  repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
 
 
 def _python_autoconf_impl(repository_ctx):
diff --git a/third_party/py/remote.BUILD.tpl b/third_party/py/remote.BUILD.tpl
deleted file mode 100644
index edcac41ec6fdd80151caa894aa50ba3f4f2aa536..0000000000000000000000000000000000000000
--- a/third_party/py/remote.BUILD.tpl
+++ /dev/null
@@ -1,13 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(
-    name = "python_headers",
-    actual = "%{REMOTE_PYTHON_REPO}:python_headers",
-)
-
-alias(
-    name = "numpy_headers",
-    actual = "%{REMOTE_PYTHON_REPO}:numpy_headers",
-)
diff --git a/third_party/systemlibs/absl_py.absl.testing.BUILD b/third_party/systemlibs/absl_py.absl.testing.BUILD
index c1b794c1e9cd43fbb4b9a2bd49000ae79d88531a..7629509ebb6aa0bb525081ab8eaae11639415ba6 100644
--- a/third_party/systemlibs/absl_py.absl.testing.BUILD
+++ b/third_party/systemlibs/absl_py.absl.testing.BUILD
@@ -2,6 +2,10 @@ licenses(["notice"])  # Apache 2.0
 
 py_library(
     name = "parameterized",
-    testonly = 1,
     visibility = ["//visibility:public"],
 )
+
+py_library(
+    name = "absltest",
+    visibility = ["//visiblity:public"],
+)
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index a2c30b8b94ab9d1e511a235d875931e19d479b2b..a41ab808c70cbe8f69653794afe5a7651f514252 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -16,9 +16,9 @@ cc_library(
 )
 
 cc_library(
-    name = "nv_infer",
-    srcs = [%{nv_infer}],
-    data = [%{nv_infer}],
+    name = "tensorrt",
+    srcs = %{tensorrt_libs},
+    data = %{tensorrt_libs},
     copts= cuda_default_copts(),
     deps = [
         "@local_config_cuda//cuda:cuda",
@@ -28,6 +28,5 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-
-%{tensorrt_genrules}
+%{copy_rules}
 
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
index 0dc3a7ba2d239cbeca5181ba20d0c98edb26bb94..6d00513827b3804c49ad1cb93e952c0338b886e9 100644
--- a/third_party/tensorrt/build_defs.bzl.tpl
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -2,6 +2,4 @@
 
 def if_tensorrt(if_true, if_false=[]):
   """Tests whether TensorRT was enabled during the configure process."""
-  if %{tensorrt_is_configured}:
-    return if_true
-  return if_false
+  return %{if_tensorrt}
diff --git a/third_party/tensorrt/remote.BUILD.tpl b/third_party/tensorrt/remote.BUILD.tpl
deleted file mode 100644
index 7598e7aa4bb32702307fe073a33903184b2dc70e..0000000000000000000000000000000000000000
--- a/third_party/tensorrt/remote.BUILD.tpl
+++ /dev/null
@@ -1,7 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//visibility:public"])
-
-alias(name="LICENSE", actual = "%{target}:LICENSE")
-alias(name = "tensorrt_headers", actual = "%{target}:tensorrt_headers")
-alias(name = "nv_infer", actual = "%{target}:nv_infer")
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 77ee6622d17c77c4c55e4bcb6a645e8598e6497b..c6de25b33e3fab545d845986231c6880632babeb 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -12,8 +12,11 @@ load(
     "auto_configure_fail",
     "get_cpu_value",
     "find_cuda_define",
+    "find_lib",
+    "lib_name",
     "matches_version",
-    "symlink_genrule_for_dir",
+    "make_copy_dir_rule",
+    "make_copy_files_rule",
 )
 
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
@@ -93,10 +96,11 @@ def _trt_lib_version(repository_ctx, trt_install_path):
         ("TensorRT library version detected from %s/%s (%s) does not match " +
          "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
         (trt_header_dir, "NvInfer.h", full_version, environ_version))
-  return environ_version
+  # Only use the major version to match the SONAME of the library.
+  return major_version
 
 
-def _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version):
+def _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version):
   """Finds the given TensorRT library on the system.
 
   Adapted from code contributed by Sami Kama (https://github.com/samikama).
@@ -108,30 +112,13 @@ def _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version):
       by _trt_lib_version.
 
   Returns:
-    Map of library names to structs with the following fields:
-      src_file_path: The full path to the library found on the system.
-      dst_file_name: The basename of the target library.
+    The path to the library.
   """
-  objdump = repository_ctx.which("objdump")
   result = {}
   for lib in _TF_TENSORRT_LIBS:
-    dst_file_name = "lib%s.so.%s" % (lib, trt_lib_version)
-    src_file_path = repository_ctx.path("%s/%s" % (trt_install_path,
-                                                   dst_file_name))
-    if not src_file_path.exists:
-      auto_configure_fail(
-          "Cannot find TensorRT library %s" % str(src_file_path))
-    if objdump != None:
-      objdump_out = repository_ctx.execute([objdump, "-p", str(src_file_path)])
-      for line in objdump_out.stdout.splitlines():
-        if "SONAME" in line:
-          dst_file_name = line.strip().split(" ")[-1]
-    result.update({
-        lib:
-            struct(
-                dst_file_name=dst_file_name,
-                src_file_path=str(src_file_path.realpath))
-    })
+    file_name = lib_name("nvinfer", cpu_value, trt_lib_version)
+    path = find_lib(repository_ctx, ["%s/%s" % (trt_install_path, file_name)])
+    result[file_name] = path
   return result
 
 
@@ -142,33 +129,33 @@ def _tpl(repository_ctx, tpl, substitutions):
 
 def _create_dummy_repository(repository_ctx):
   """Create a dummy TensorRT repository."""
-  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "False"})
-  substitutions = {
-      "%{tensorrt_genrules}": "",
-      "%{tensorrt_headers}": "",
-  }
-  for lib in _TF_TENSORRT_LIBS:
-    k = "%%{%s}" % lib.replace("nv", "nv_")
-    substitutions.update({k: ""})
-  _tpl(repository_ctx, "BUILD", substitutions)
+  _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_false"})
 
+  _tpl(repository_ctx, "BUILD", {
+      "%{tensorrt_genrules}": "",
+      "%{tensorrt_headers}": "[]",
+      "%{tensorrt_libs}": "[]"
+  })
 
 def _tensorrt_configure_impl(repository_ctx):
   """Implementation of the tensorrt_configure repository rule."""
   if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
     # Forward to the pre-configured remote repository.
-    repository_ctx.template("BUILD", Label("//third_party/tensorrt:remote.BUILD.tpl"), {
-        "%{target}": repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO],
-    })
-    # Set up config file.
-    _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
+    remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO]
+    repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
+    repository_ctx.template(
+        "build_defs.bzl",
+        Label(remote_config_repo + ":build_defs.bzl"),
+        {},
+    )
     return
 
   if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
     _create_dummy_repository(repository_ctx)
     return
 
-  if (get_cpu_value(repository_ctx) != "Linux"):
+  cpu_value = get_cpu_value(repository_ctx)
+  if (cpu_value != "Linux"):
     auto_configure_fail("TensorRT is supported only on Linux.")
   if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
     auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
@@ -177,42 +164,46 @@ def _tensorrt_configure_impl(repository_ctx):
     auto_configure_fail(
         "Cannot find TensorRT install path %s." % trt_install_path)
 
-  # Set up the symbolic links for the library files.
+  # Copy the library files.
   trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
-  trt_libs = _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version)
-  trt_lib_src = []
-  trt_lib_dest = []
-  for lib in trt_libs.values():
-    trt_lib_src.append(lib.src_file_path)
-    trt_lib_dest.append(lib.dst_file_name)
-  genrules = [
-      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/lib/",
-                              "tensorrt_lib", trt_lib_src, trt_lib_dest)
-  ]
-
-  # Set up the symbolic links for the header files.
+  trt_libs = _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version)
+  trt_lib_srcs = []
+  trt_lib_outs = []
+  for path in trt_libs.values():
+    trt_lib_srcs.append(str(path))
+    trt_lib_outs.append("tensorrt/lib/" + path.basename)
+  copy_rules = [make_copy_files_rule(
+      repository_ctx,
+      name = "tensorrt_lib",
+      srcs = trt_lib_srcs,
+      outs = trt_lib_outs,
+  )]
+
+  # Copy the header files header files.
   trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
-  src_files = [
+  trt_header_srcs = [
       "%s/%s" % (trt_header_dir, header) for header in _TF_TENSORRT_HEADERS
   ]
-  dest_files = _TF_TENSORRT_HEADERS
-  genrules.append(
-      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/include/",
-                              "tensorrt_include", src_files, dest_files))
+  trt_header_outs = [
+      "tensorrt/include/" + header for header in _TF_TENSORRT_HEADERS
+  ]
+  copy_rules.append(
+      make_copy_files_rule(
+          repository_ctx,
+          name = "tensorrt_include",
+          srcs = trt_header_srcs,
+          outs = trt_header_outs,
+  ))
 
   # Set up config file.
-  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
+  _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_true"})
 
   # Set up BUILD file.
-  substitutions = {
-      "%{tensorrt_genrules}": "\n".join(genrules),
+  _tpl(repository_ctx, "BUILD", {
+      "%{copy_rules}": "\n".join(copy_rules),
       "%{tensorrt_headers}": '":tensorrt_include"',
-  }
-  for lib in _TF_TENSORRT_LIBS:
-    k = "%%{%s}" % lib.replace("nv", "nv_")
-    v = '"tensorrt/lib/%s"' % trt_libs[lib].dst_file_name
-    substitutions.update({k: v})
-  _tpl(repository_ctx, "BUILD", substitutions)
+      "%{tensorrt_libs}": str(trt_lib_outs),
+  })
 
 
 tensorrt_configure = repository_rule(
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index 9da417fd5fe18619de6dc51032b8e3cde21b6ffb..0e067708a8b27c07c16b7848a426e45f6e6bb605 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -4,10 +4,20 @@ package(default_visibility = ["//visibility:public"])
 
 load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
 
-# Platform for use with remote execution with
-# custom container based off RBE Ubuntu16_04
-# http://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04
-# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+# Constraint used for platforms below so we can force certain rules to be executed
+# on specific platforms.
+constraint_setting(name = "custom_platforms")
+
+# Constraint for platforms that allow GPU testing (i.e. have a GPU available).
+# This is used in exec_compatible_with of rules that need GPU access.
+constraint_value(
+    name = "gpu_test",
+    constraint_setting = ":custom_platforms",
+)
+
+# TODO(b/122347293): This is the RBE config based on the CPU configuration / image provided
+# in the asci-toolchain setup. Delete this once we switched CPU remote builds to the
+# new platform below.
 platform(
     name = "rbe_ubuntu16_04-tf",
     constraint_values = [
@@ -20,9 +30,39 @@ platform(
         properties: {
             name: "container-image"
             value:"docker://gcr.io/asci-toolchain/nosla-ubuntu16_04-tf@sha256:63a0e981a4e7ce5da2a851cf063e430f72947fd999d9336b7e54e2eebe8e0bf5"
-        }""",
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """,
+)
+
+# Remote build platforms.
+# Each of the platform rules here provide a platform definition that is bound to a docker image.
+# The result of the skylark configuration is checked into
+# //tensorflow/third_party/toolchains/preconfig.
+
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu.
+platform(
+    name = "rbe_ubuntu16.04",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value:"docker://gcr.io/tensorflow-testing/nosla-ubuntu16.04@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["ubuntu16.04"],
 )
 
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda9.0-cudnn7-ubuntu14.04",
     constraint_values = [
@@ -32,10 +72,16 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
-        }""" % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
+            value:"docker://gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
 )
 
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda10.0-cudnn7-ubuntu14.04",
     constraint_values = [
@@ -45,6 +91,35 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
-        }""" % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+            value:"docker://gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+)
+
+# The above platform with GPU support.
+platform(
+    name = "rbe_cuda10.0-cudnn7-ubuntu14.04-gpu",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+        ":gpu_test",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value: "docker://gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+        }
+        properties: {
+            name: "dockerRuntime"
+            value: "nvidia"
+        }
+        properties: {
+            name: "Pool"
+            value: "gpu-pool"
+        }
+        """ % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
 )
diff --git a/third_party/toolchains/cpus/arm/BUILD b/third_party/toolchains/cpus/arm/BUILD
index a565e1e55fe6aa526b1a441ae196296792b51491..efed6972395c7e6f496f797e5e7463f2f2778438 100644
--- a/third_party/toolchains/cpus/arm/BUILD
+++ b/third_party/toolchains/cpus/arm/BUILD
@@ -6,6 +6,10 @@ cc_toolchain_suite(
         "armeabi|compiler": ":cc-compiler-armeabi",
         "local|compiler": ":cc-compiler-local",
         "armeabi": ":cc-compiler-armeabi",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "arm": ":cc-compiler-local",
+        "s390x": ":cc-compiler-local",
     },
 )
 
diff --git a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
index f0e17d1fe065703e3ff5574cd1d1d94d322a66a8..8d51e9b0c6f9eb875d37b502a99327667f5078cc 100644
--- a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+++ b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
@@ -2,14 +2,6 @@ major_version: "local"
 minor_version: ""
 default_target_cpu: "same_as_host"
 
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
 default_toolchain {
   cpu: "darwin"
   toolchain_identifier: "local_darwin"
@@ -18,14 +10,6 @@ default_toolchain {
   cpu: "freebsd"
   toolchain_identifier: "local_freebsd"
 }
-default_toolchain {
-  cpu: "armeabi"
-  toolchain_identifier: "arm-linux-gnueabihf"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
 default_toolchain {
   cpu: "x64_windows"
   toolchain_identifier: "local_windows_msys64"
@@ -34,10 +18,6 @@ default_toolchain {
   cpu: "x64_windows_msvc"
   toolchain_identifier: "vc_14_0_x64"
 }
-default_toolchain {
-  cpu: "s390x"
-  toolchain_identifier: "local_linux"
-}
 
 toolchain {
   abi_version: "armeabi"
diff --git a/third_party/toolchains/gpus/cuda/BUILD b/third_party/toolchains/gpus/cuda/BUILD
index f63a0ea81925783085b1b551aab778d41ba1fb2c..8bb22c0269b5c4bfc21ea60c6605ac75ba072595 100644
--- a/third_party/toolchains/gpus/cuda/BUILD
+++ b/third_party/toolchains/gpus/cuda/BUILD
@@ -85,8 +85,8 @@ cc_library(
 
 cc_library(
     name = "cudart",
-    srcs = ["cuda/lib/libcudart.so.9.0"],
-    data = ["cuda/lib/libcudart.so.9.0"],
+    srcs = ["cuda/lib/libcudart.so.10.0"],
+    data = ["cuda/lib/libcudart.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -97,8 +97,8 @@ cc_library(
 
 cc_library(
     name = "cublas",
-    srcs = ["cuda/lib/libcublas.so.9.0"],
-    data = ["cuda/lib/libcublas.so.9.0"],
+    srcs = ["cuda/lib/libcublas.so.10.0"],
+    data = ["cuda/lib/libcublas.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -109,8 +109,8 @@ cc_library(
 
 cc_library(
     name = "cusolver",
-    srcs = ["cuda/lib/libcusolver.so.9.0"],
-    data = ["cuda/lib/libcusolver.so.9.0"],
+    srcs = ["cuda/lib/libcusolver.so.10.0"],
+    data = ["cuda/lib/libcusolver.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -143,8 +143,8 @@ cc_library(
 
 cc_library(
     name = "cufft",
-    srcs = ["cuda/lib/libcufft.so.9.0"],
-    data = ["cuda/lib/libcufft.so.9.0"],
+    srcs = ["cuda/lib/libcufft.so.10.0"],
+    data = ["cuda/lib/libcufft.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -155,8 +155,8 @@ cc_library(
 
 cc_library(
     name = "curand",
-    srcs = ["cuda/lib/libcurand.so.9.0"],
-    data = ["cuda/lib/libcurand.so.9.0"],
+    srcs = ["cuda/lib/libcurand.so.10.0"],
+    data = ["cuda/lib/libcurand.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -193,7 +193,7 @@ cc_library(
 
 cc_library(
     name = "cupti_dsos",
-    data = ["cuda/lib/libcupti.so.9.0"],
+    data = ["cuda/lib/libcupti.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -1193,7 +1193,7 @@ genrule(
         "cuda/include/vector_types.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-9.0/include/cudnn.h" "$(@D)/cuda/include/cudnn.h" && cp "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-10.0/include/cudnn.h" "$(@D)/cuda/include/cudnn.h" && cp "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-10.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-10.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-10.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-10.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-10.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-10.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
    """,
 )
 
@@ -1203,7 +1203,7 @@ genrule(
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
    """,
 )
 
@@ -1240,7 +1240,7 @@ genrule(
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
    """,
 )
 
@@ -1248,17 +1248,17 @@ genrule(
     name = "cuda-lib",
     outs = [
         "cuda/lib/libcuda.so",
-        "cuda/lib/libcudart.so.9.0",
+        "cuda/lib/libcudart.so.10.0",
         "cuda/lib/libcudart_static.a",
-        "cuda/lib/libcublas.so.9.0",
-        "cuda/lib/libcusolver.so.9.0",
-        "cuda/lib/libcurand.so.9.0",
-        "cuda/lib/libcufft.so.9.0",
+        "cuda/lib/libcublas.so.10.0",
+        "cuda/lib/libcusolver.so.10.0",
+        "cuda/lib/libcurand.so.10.0",
+        "cuda/lib/libcufft.so.10.0",
         "cuda/lib/libcudnn.so.7",
-        "cuda/lib/libcupti.so.9.0",
+        "cuda/lib/libcupti.so.10.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.176" "$(@D)/cuda/lib/libcudart.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.480" "$(@D)/cuda/lib/libcublas.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.176" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.176" "$(@D)/cuda/lib/libcurand.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.176" "$(@D)/cuda/lib/libcufft.so.10.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.10.0.176" "$(@D)/cuda/lib/libcupti.so.10.0"
    """,
 )
 
diff --git a/third_party/toolchains/gpus/cuda/cuda/cuda_config.h b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
index 7cdaf144ada77c93119f7412df93e8f3423872ee..b05bfb732651360581d2ef9d353f16b6f9e2d9a6 100644
--- a/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
+++ b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
@@ -19,9 +19,9 @@ limitations under the License.
 
 #define TF_CUDA_CAPABILITIES CudaVersion("3.0")
 
-#define TF_CUDA_VERSION "9.0"
+#define TF_CUDA_VERSION "10.0"
 #define TF_CUDNN_VERSION "7"
 
-#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-9.0"
+#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0"
 
 #endif  // CUDA_CUDA_CONFIG_H_
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
index b4c98dc94de7a0368efbce712e8a3b48c49f7841..7bc5f2bb6057d40038445f99ae519a31b477b742 100644
--- a/third_party/toolchains/preconfig/generate/BUILD
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -2,6 +2,12 @@ licenses(["restricted"])
 
 load(":generate.bzl", "tensorflow_rbe_config")
 
+tensorflow_rbe_config(
+    name = "ubuntu16.04-py3-clang",
+    compiler = "clang",
+    python_version = "3",
+)
+
 tensorflow_rbe_config(
     name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-tensorrt5",
     compiler = "gcc",
@@ -20,6 +26,15 @@ tensorflow_rbe_config(
     tensorrt_version = "5",
 )
 
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc7-cuda10.0-cudnn7-tensorrt5",
+    compiler = "gcc-7",
+    cuda_version = "10.0",
+    cudnn_version = "7",
+    python_version = "3",
+    tensorrt_version = "5",
+)
+
 tensorflow_rbe_config(
     name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-tensorrt5",
     compiler = "gcc",
diff --git a/third_party/toolchains/preconfig/generate/archives.bzl b/third_party/toolchains/preconfig/generate/archives.bzl
index 0850893589ba428c42a5faee9546686f049a46cf..bafc7d49434f48dfaad5b421e2ead472fd7ebc5f 100644
--- a/third_party/toolchains/preconfig/generate/archives.bzl
+++ b/third_party/toolchains/preconfig/generate/archives.bzl
@@ -2,26 +2,12 @@ load("//tensorflow:version_check.bzl", "parse_bazel_version")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 def bazel_toolchains_archive():
-    # Not all bazel versions have set native.bazel_version - if it is not set,
-    # fall back to the more compatible version of the toolchains archive.
-    if native.bazel_version and parse_bazel_version(native.bazel_version) >= parse_bazel_version("0.19"):
-        # This version of the toolchains repo is incompatible with older bazel
-        # versions - we can remove this once TensorFlow drops support for bazel
-        # before 0.19.
-        http_archive(
-            name = "bazel_toolchains",
-            sha256 = "41c48a189be489e2d15dec40e0057ea15b95ee5b39cc2a7e6cf663e31432c75e",
-            strip_prefix = "bazel-toolchains-3f8c58fe530fedc446de04673bc1e32985887dea",
-            urls = [
-                "https://github.com/nlopezgi/bazel-toolchains/archive/3f8c58fe530fedc446de04673bc1e32985887dea.tar.gz",
-            ],
-        )
-    else:
-        http_archive(
-            name = "bazel_toolchains",
-            sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb",
-            strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b",
-            urls = [
-                "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz",
-            ],
-        )
+    http_archive(
+      name = "bazel_toolchains",
+      sha256 = "ee854b5de299138c1f4a2edb5573d22b21d975acfc7aa938f36d30b49ef97498",
+      strip_prefix = "bazel-toolchains-37419a124bdb9af2fec5b99a973d359b6b899b61",
+      urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/37419a124bdb9af2fec5b99a973d359b6b899b61.tar.gz",
+        "https://github.com/bazelbuild/bazel-toolchains/archive/37419a124bdb9af2fec5b99a973d359b6b899b61.tar.gz",
+      ],
+    )
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index c56c6f3346ac64d516fa08f02ba9a206571a35e3..a86261328eb2c6a90236fb429d71fe3dcb9fddf9 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -1,4 +1,5 @@
 container_digests = {
-    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c43ed5341dd765042e0bbd1bf50fadeedd649d1e0c34d81999cb6ce30916cb95",
-    "cuda10.0-cudnn7-ubuntu14.04": "sha256:919e75247743ae1244d5d72ee9f18090379d4a9035e5853010f6d59d87cd2e8b",
+    "ubuntu16.04": "sha256:d0d98c53111c3ec071aa81632a2b0d6f210e5c2411c5172e31f99002125ec4de",
+    "cuda9.0-cudnn7-ubuntu14.04": "sha256:006a76ee1838122ff7f21ebac85f24c1ef350d4dd79b3ceff0e4fe649ed90d33",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756",
 }
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
index 75deea41b819d0deaf35af71587322f41ff095c0..40e0957cf2e2e36ee26e05bcda8fb0c873a7a40e 100644
--- a/third_party/toolchains/preconfig/generate/generate.bzl
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -3,30 +3,40 @@ load(
     "docker_toolchain_autoconfig",
 )
 
-def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler, tensorrt_version):
-    docker_toolchain_autoconfig(
-        name = name,
-        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version),
-        bazel_version = "0.19.2",
+def _tensorflow_rbe_config(name, compiler, python_version, cuda_version = None, cudnn_version = None, tensorrt_version = None):
+    base = "@ubuntu16.04//image"
+    config_repos = [
+        "local_config_python",
+        "local_config_cc",
+    ]
+    env = {
+        "ABI_VERSION": "gcc",
+        "ABI_LIBC_VERSION": "glibc_2.19",
+        "BAZEL_COMPILER": compiler,
+        "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
+        "BAZEL_TARGET_LIBC": "glibc_2.19",
+        "BAZEL_TARGET_CPU": "k8",
+        "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
+        "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
+        "CC": compiler,
+        "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+        "CLEAR_CACHE": "1",
+        "HOST_CXX_COMPILER": compiler,
+        "HOST_C_COMPILER": compiler,
+    }
+
+    if cuda_version != None:
+        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version)
+        # The cuda toolchain currently contains its own C++ toolchain definition,
+        # so we do not fetch local_config_cc.
         config_repos = [
-            "local_config_cuda",
             "local_config_python",
+            "local_config_cuda",
             "local_config_tensorrt",
-        ],
-        env = {
-            "ABI_VERSION": "gcc",
-            "ABI_LIBC_VERSION": "glibc_2.19",
-            "BAZEL_COMPILER": compiler,
-            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
-            "BAZEL_TARGET_LIBC": "glibc_2.19",
-            "BAZEL_TARGET_CPU": "k8",
-            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
-            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
-            "CC": compiler,
-            "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+        ]
+        env.update({
             "TF_NEED_CUDA": "1",
             "TF_CUDA_CLANG": "1" if compiler == "clang" else "0",
-            "CLEAR_CACHE": "1",
             "TF_CUDA_COMPUTE_CAPABILITIES": "3.0",
             "TF_ENABLE_XLA": "1",
             "TF_CUDNN_VERSION": cudnn_version,
@@ -35,7 +45,15 @@ def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, co
             "TF_NEED_TENSORRT" : "1",
             "TF_TENSORRT_VERSION": tensorrt_version,
             "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-        },
+            "GCC_HOST_COMPILER_PATH": compiler if compiler != "clang" else "",
+        })
+
+    docker_toolchain_autoconfig(
+        name = name,
+        base = base,
+        bazel_version = "0.21.0",
+        config_repos = config_repos,
+        env = env,
         mount_project = "$(mount_project)",
         tags = ["manual"],
         incompatible_changes_off = True,
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
index 523c232ee1c9885c3318222138811faa76ec1346..e39daa6a30baf1c91a5b00f4b7491620fc4cf5dd 100755
--- a/third_party/toolchains/preconfig/generate/generate.sh
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -37,8 +37,16 @@ TENSORRT_VERSION="${PLATFORM[5]}"
 
 # TODO(klimek): Put this into the name.
 
-if [[ "${COMPILER}" == "gcc" ]]; then
-  COMPILER="gcc-nvcc-${CUDA_VERSION}"
+if [[ -n "${CUDA_VERSION}" ]]; then
+  if [[ "${COMPILER}" == gcc* ]]; then
+    COMPILER="${COMPILER}-nvcc-${CUDA_VERSION}"
+  fi
+  # Currently we create a special toolchain for clang when compiling with
+  # cuda enabled. We can get rid of this once the default toolchain bazel
+  # provides supports cuda.
+  if [[ "${COMPILER}" == "clang" ]]; then
+    COMPILER="cuda-clang"
+  fi
 fi
 
 echo "OS: ${OS}"
@@ -52,9 +60,12 @@ bazel build --define=mount_project="${PWD}" "${PKG}/generate:${TARGET}"
 cd "${TEMPDIR}"
 tar xvf "${ROOT}/bazel-bin/${PKG}/generate/${TARGET}_outputs.tar"
 
-# Other than @local_config_tensorrt, the remote config repo is a subpackage of
-# @org_tensorflow and we need to add '-iquote <package_path>' manually.
-buildozer "set strip_include_prefix package_name()" //local_config_tensorrt:%cc_library
+# TODO(klimek): The skylark config rules should copy the files instead of
+# creating aliases.
+# Other than in @local_config_tensorrt, the header files in the remote config
+# repo are not relative to the repository root. Add a dummy include_prefix to
+# make them available as virtual includes.
+buildozer 'set include_prefix ""' //local_config_tensorrt:%cc_library
 
 # Delete all empty files: configurations leave empty files around when they are
 # unnecessary.
@@ -73,14 +84,19 @@ mkdir "${OS}"
 # Python:
 mv local_config_python "${OS}/${PY_VERSION}"
 
-# Compiler:
-mv local_config_cuda/crosstool "${OS}/${COMPILER}"
+if [[ -n "${CUDA_VERSION}" ]]; then
+  # Compiler:
+  mv local_config_cuda/crosstool "${OS}/${COMPILER}"
 
-# CUDA:
-mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
+  # CUDA:
+  mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
 
-# TensorRT:
-mv local_config_tensorrt "${OS}/${TENSORRT_VERSION}"
+  # TensorRT:
+  mv local_config_tensorrt "${OS}/${TENSORRT_VERSION}"
+else
+  # Compiler:
+  mv local_config_cc "${OS}/${COMPILER}"
+fi
 
 # Cleanup for copybara.
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
index f30c2f1ae6318c645e174617a74b8fdadac1598e..0495173786328367b1a74d00653da58f759d963c 100644
--- a/third_party/toolchains/preconfig/generate/workspace.bzl
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -8,17 +8,24 @@ load(":containers.bzl", "container_digests")
 def _remote_config_workspace():
     container_repositories()
 
+    container_pull(
+        name = "ubuntu16.04",
+        registry = "gcr.io",
+        repository = "tensorflow-testing/nosla-ubuntu16.04",
+        digest = container_digests["ubuntu16.04"],
+    )
+
     container_pull(
         name = "cuda9.0-cudnn7-ubuntu14.04",
         registry = "gcr.io",
-        repository = "asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04",
+        repository = "tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04",
         digest = container_digests["cuda9.0-cudnn7-ubuntu14.04"],
     )
 
     container_pull(
         name = "cuda10.0-cudnn7-ubuntu14.04",
         registry = "gcr.io",
-        repository = "asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04",
+        repository = "tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04",
         digest = container_digests["cuda10.0-cudnn7-ubuntu14.04"],
     )
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
index c813efccf9b82578984b33d04fd513030c83e0b1..13429d7b88be691eeddad77e9de1aede2819816f 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
@@ -1192,9 +1192,7 @@ genrule(
         "cuda/include/vector_functions.hpp",
         "cuda/include/vector_types.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_egl_interop.h" "$(@D)/cuda/include/cuda_egl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-10.0/include/cudart_platform.h" "$(@D)/cuda/include/cudart_platform.h" && cp -f "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-10.0/include/nvjpeg.h" "$(@D)/cuda/include/nvjpeg.h" && cp -f "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExt.h" "$(@D)/cuda/include/nvtx3/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCuda.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtOpenCL.h" "$(@D)/cuda/include/nvtx3/nvToolsExtOpenCL.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtSync.h" "$(@D)/cuda/include/nvtx3/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImpl.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImpl.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCore.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInit.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInit.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDecls.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDefs.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxTypes.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxTypes.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/alignment.h" "$(@D)/cuda/include/thrust/detail/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/preprocessor.h" "$(@D)/cuda/include/thrust/detail/preprocessor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """,
 )
 
 genrule(
@@ -1202,9 +1200,7 @@ genrule(
     outs = [
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-10.0/nvvm/libdevice/." "$(@D)/" """,
 )
 
 genrule(
@@ -1241,9 +1237,7 @@ genrule(
         "cuda/extras/CUPTI/include/openmp/cupti_openmp.h",
         "cuda/extras/CUPTI/include/openmp/ompt.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/cupti_openmp.h" "$(@D)/cuda/extras/CUPTI/include/openmp/cupti_openmp.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/ompt.h" "$(@D)/cuda/extras/CUPTI/include/openmp/ompt.h"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
 )
 
 genrule(
@@ -1259,9 +1253,31 @@ genrule(
         "cuda/lib/libcudnn.so.7",
         "cuda/lib/libcupti.so.10.0",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.130" "$(@D)/cuda/lib/libcudart.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.130" "$(@D)/cuda/lib/libcublas.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.130" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.130" "$(@D)/cuda/lib/libcurand.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.145" "$(@D)/cuda/lib/libcufft.so.10.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.3.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0.130" "$(@D)/cuda/lib/libcupti.so.10.0"
-   """,
+    cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" $(location cuda/lib/libcudart.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" $(location cuda/lib/libcublas.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" $(location cuda/lib/libcusolver.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" $(location cuda/lib/libcurand.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" $(location cuda/lib/libcufft.so.10.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" $(location cuda/lib/libcupti.so.10.0) """,
+)
+
+genrule(
+    name = "cuda-bin",
+    outs = [
+        "cuda/bin/bin2c",
+        "cuda/bin/crt/link.stub",
+        "cuda/bin/crt/prelink.stub",
+        "cuda/bin/cuda-gdb",
+        "cuda/bin/cuda-gdbserver",
+        "cuda/bin/cuda-memcheck",
+        "cuda/bin/cudafe++",
+        "cuda/bin/cuobjdump",
+        "cuda/bin/fatbinary",
+        "cuda/bin/gpu-library-advisor",
+        "cuda/bin/nvcc",
+        "cuda/bin/nvcc.profile",
+        "cuda/bin/nvdisasm",
+        "cuda/bin/nvlink",
+        "cuda/bin/nvprof",
+        "cuda/bin/nvprune",
+        "cuda/bin/ptxas",
+    ],
+    cmd = """cp -rf "/usr/local/cuda-10.0/bin/." "$(@D)/cuda/bin/" """,
 )
 
 genrule(
@@ -1269,7 +1285,5 @@ genrule(
     outs = [
         "cuda/include/cudnn.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
-   """,
+    cmd = """cp -f "/usr/include/cudnn.h" $(location cuda/include/cudnn.h) """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
index c6930904b564bf2cce70b484a0e7b0759f13b7c9..cfd0a08e93a2654f266b4a7e647cc03062074cd2 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
@@ -1187,9 +1187,7 @@ genrule(
         "cuda/include/vector_functions.hpp",
         "cuda/include/vector_types.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp -f "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp -f "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp -f "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-9.0/include/." "$(@D)/cuda/include/" """,
 )
 
 genrule(
@@ -1197,9 +1195,7 @@ genrule(
     outs = [
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-9.0/nvvm/libdevice/." "$(@D)/" """,
 )
 
 genrule(
@@ -1234,9 +1230,7 @@ genrule(
         "cuda/extras/CUPTI/include/generated_nvtx_meta.h",
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-9.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
 )
 
 genrule(
@@ -1252,9 +1246,32 @@ genrule(
         "cuda/lib/libcudnn.so.7",
         "cuda/lib/libcupti.so.9.0",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
-   """,
+    cmd = """cp -f "/usr/local/cuda-9.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-9.0/lib64/libcudart.so.9.0" $(location cuda/lib/libcudart.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-9.0/lib64/libcublas.so.9.0" $(location cuda/lib/libcublas.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcusolver.so.9.0" $(location cuda/lib/libcusolver.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcurand.so.9.0" $(location cuda/lib/libcurand.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcufft.so.9.0" $(location cuda/lib/libcufft.so.9.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0" $(location cuda/lib/libcupti.so.9.0) """,
+)
+
+genrule(
+    name = "cuda-bin",
+    outs = [
+        "cuda/bin/bin2c",
+        "cuda/bin/crt/link.stub",
+        "cuda/bin/crt/prelink.stub",
+        "cuda/bin/cuda-gdb",
+        "cuda/bin/cuda-gdbserver",
+        "cuda/bin/cuda-memcheck",
+        "cuda/bin/cudafe",
+        "cuda/bin/cudafe++",
+        "cuda/bin/cuobjdump",
+        "cuda/bin/fatbinary",
+        "cuda/bin/gpu-library-advisor",
+        "cuda/bin/nvcc",
+        "cuda/bin/nvcc.profile",
+        "cuda/bin/nvdisasm",
+        "cuda/bin/nvlink",
+        "cuda/bin/nvprof",
+        "cuda/bin/nvprune",
+        "cuda/bin/ptxas",
+    ],
+    cmd = """cp -rf "/usr/local/cuda-9.0/bin/." "$(@D)/cuda/bin/" """,
 )
 
 genrule(
@@ -1262,7 +1279,5 @@ genrule(
     outs = [
         "cuda/include/cudnn.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
-   """,
+    cmd = """cp -f "/usr/include/cudnn.h" $(location cuda/include/cudnn.h) """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
index 00483951af966e0085e6f2b1d74290d9ee872963..426b9ca86746c3ef92299435d7de4e6191e4b664 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
index 859b3196d5dba9afadeae56f34be04247b00fe09..b0b4a53a805cba4e1be3b6b5438ca725a3599e78 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
index 63893d3722f6b43579758e5f747076b1f1e73ed7..192314137d4f5ca178e350894550132d045d7a2b 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -49,9 +49,9 @@ import pipes
 CPU_COMPILER = ('/usr/bin/gcc')
 GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
 
-NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+NVCC_PATH = '/usr/local/cuda/bin/nvcc'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
-NVCC_VERSION = '9.0'
+NVCC_VERSION = '10.0'
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
index 859b3196d5dba9afadeae56f34be04247b00fe09..b0b4a53a805cba4e1be3b6b5438ca725a3599e78 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..2b84b761ff9fb0c8a803a8cae8d1f9b89c210008
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
@@ -0,0 +1,96 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+        "x64_windows": ":cc-compiler-windows",
+        "arm": ":cc-compiler-local",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "ppc": ":cc-compiler-local",
+        "darwin": ":cc-compiler-darwin",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+    toolchain_identifier = "local_linux",
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+    toolchain_identifier = "local_darwin",
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "local_windows",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..059842f58cf748e972635db1a82d6ed4ef580f6c
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/7"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/7"
+  cxx_builtin_include_directory: "/usr/include/c++/7/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/7"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/7"
+  cxx_builtin_include_directory: "/usr/include/c++/7/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # TODO(klimek): Previously we were using a .bat file to start python to run
+  # the python script that can redirect to nvcc - unfortunately .bat files
+  # have a rather short maximum length for command lines (8k). Instead, we
+  # now use the python binary as the compiler and pass the python script to
+  # it at the start of the command line. Investigate different possibilities
+  # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+  # a small C++ wrapper to redirect.
+  feature {
+    name: "redirector"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      flag_group {
+        flag: "-B"
+        flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+      }
+    }
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..07742839ca5eabdeb7acd902aefa8ece4201347b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc-7')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc-7')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '10.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..cb1385dccdc7a9b0d40533c273825250edd8a13b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc-7')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc-7')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+NVCC_VERSION = '10.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
deleted file mode 100755
index 96ed60d3cfe2e6e16b33b884c9be8749d7fd0a4f..0000000000000000000000000000000000000000
--- a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-filegroup(
-    name = "LICENSE",
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "nccl",
-    srcs = ["libnccl.so.2"],
-    hdrs = ["nccl.h"],
-    include_prefix = "third_party/nccl",
-    visibility = ["//visibility:public"],
-    deps = [
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
-)
-
-genrule(
-    name = "nccl-files",
-    outs = [
-        "libnccl.so.2",
-        "nccl.h",
-    ],
-    cmd = """cp "/usr/include/nccl.h" "$(@D)/nccl.h" &&
-           cp "/usr/lib/libnccl.so.2" "$(@D)/libnccl.so.2" """,
-)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/nccl2/WORKSPACE
deleted file mode 100644
index 1e6662ac91669df808c82391c68a76292c1cd23d..0000000000000000000000000000000000000000
--- a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/WORKSPACE
+++ /dev/null
@@ -1,2 +0,0 @@
-# DO NOT EDIT: automatically generated WORKSPACE file for nccl_configure rule
-workspace(name = "local_config_nccl")
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
index da16bb31b61235365f548f551418a9417ff03378..75e2b7fd3361c079668e76db9fb416c324553e83 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
@@ -12,17 +12,17 @@ package(default_visibility = ["//visibility:public"])
 cc_library(
     name = "tensorrt_headers",
     hdrs = [":tensorrt_include"],
-    strip_include_prefix = package_name(),
+    include_prefix = "",
     visibility = ["//visibility:public"],
 )
 
 cc_library(
-    name = "nv_infer",
+    name = "tensorrt",
     srcs = ["tensorrt/lib/libnvinfer.so.5"],
     copts = cuda_default_copts(),
     data = ["tensorrt/lib/libnvinfer.so.5"],
+    include_prefix = "",
     linkstatic = 1,
-    strip_include_prefix = package_name(),
     visibility = ["//visibility:public"],
     deps = [
         ":tensorrt_headers",
@@ -35,9 +35,7 @@ genrule(
     outs = [
         "tensorrt/lib/libnvinfer.so.5",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer.so.5.0.2" "$(@D)/libnvinfer.so.5"
-   """,
+    cmd = """cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer.so.5" $(location tensorrt/lib/libnvinfer.so.5) """,
 )
 
 genrule(
@@ -46,7 +44,5 @@ genrule(
         "tensorrt/include/NvInfer.h",
         "tensorrt/include/NvUtils.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/x86_64-linux-gnu/NvInfer.h" "$(@D)/tensorrt/include/NvInfer.h" && cp -f "/usr/include/x86_64-linux-gnu/NvUtils.h" "$(@D)/tensorrt/include/NvUtils.h"
-   """,
+    cmd = """cp -f "/usr/include/x86_64-linux-gnu/NvInfer.h" $(location tensorrt/include/NvInfer.h) && cp -f "/usr/include/x86_64-linux-gnu/NvUtils.h" $(location tensorrt/include/NvUtils.h) """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
index 5c1c40361da2a20f4c504ec066784a615c454d12..527be93834197de6d72fbc70a6dd25e4fb893900 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
@@ -2,6 +2,4 @@
 
 def if_tensorrt(if_true, if_false = []):
     """Tests whether TensorRT was enabled during the configure process."""
-    if True:
-        return if_true
-    return if_false
+    return if_true
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..5a0c52f66ab2224c0b021875d0447ee638e833c4
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
@@ -0,0 +1,111 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This becomes the BUILD file for @local_config_cc// under non-FreeBSD unixes.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "malloc",
+)
+
+cc_library(
+    name = "stl",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "cc_wrapper",
+    srcs = ["cc_wrapper.sh"],
+)
+
+filegroup(
+    name = "compiler_deps",
+    srcs = glob(["extra_tools/**"]) + [":empty"],
+)
+
+# This is the entry point for --crosstool_top.  Toolchains are found
+# by lopping off the name of --crosstool_top and searching for
+# the "${CPU}" entry in the toolchains attribute.
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "k8|clang": ":cc-compiler-k8",
+        "k8": ":cc-compiler-k8",
+        "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-k8",
+    all_files = ":compiler_deps",
+    compiler_files = ":compiler_deps",
+    cpu = "k8",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":compiler_deps",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "linux_gnu_x86",
+)
+
+toolchain(
+    name = "cc-toolchain-k8",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    toolchain = ":cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+cc_toolchain(
+    name = "cc-compiler-armeabi-v7a",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "stub_armeabi-v7a",
+)
+
+toolchain(
+    name = "cc-toolchain-armeabi-v7a",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:arm",
+        "@bazel_tools//platforms:android",
+    ],
+    toolchain = ":cc-compiler-armabi-v7a",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..48f82eb35d5b2268a758bb0ebb36e243663ca372
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL
@@ -0,0 +1,1209 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+major_version: "local"
+minor_version: ""
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+toolchain {
+  abi_version: "armeabi-v7a"
+  abi_libc_version: "armeabi-v7a"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "armeabi-v7a"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "armeabi-v7a"
+  target_cpu: "armeabi-v7a"
+  target_system_name: "armeabi-v7a"
+  toolchain_identifier: "stub_armeabi-v7a"
+
+  tool_path { name: "ar" path: "/bin/false" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "/bin/false" }
+  tool_path { name: "dwp" path: "/bin/false" }
+  tool_path { name: "gcc" path: "/bin/false" }
+  tool_path { name: "gcov" path: "/bin/false" }
+  tool_path { name: "ld" path: "/bin/false" }
+
+  tool_path { name: "nm" path: "/bin/false" }
+  tool_path { name: "objcopy" path: "/bin/false" }
+  tool_path { name: "objdump" path: "/bin/false" }
+  tool_path { name: "strip" path: "/bin/false" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "linux_gnu_x86"
+  abi_version: "gcc"
+  abi_libc_version: "glibc_2.19"
+  builtin_sysroot: ""
+  compiler: "clang"
+  host_system_name: "i686-unknown-linux-gnu"
+  needsPic: true
+  supports_gold_linker: true
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: true
+  target_libc: "glibc_2.19"
+  target_cpu: "k8"
+  target_system_name: "x86_64-unknown-linux-gnu"
+  cxx_flag: "-std=c++0x"
+  linker_flag: "-fuse-ld=gold"
+  linker_flag: "-Wl,-no-as-needed"
+  linker_flag: "-Wl,-z,relro,-z,now"
+  linker_flag: "-B/usr/local/bin"
+  linker_flag: "-lstdc++"
+  linker_flag: "-lm"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/local/lib/clang/7.0.0/include"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9/backward"
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-fstack-protector"
+  compiler_flag: "-Wall"
+  compiler_flag: "-Wthread-safety"
+  compiler_flag: "-Wself-assign"
+  compiler_flag: "-fcolor-diagnostics"
+  compiler_flag: "-fno-omit-frame-pointer"
+  tool_path {name: "ar" path: "/usr/bin/ar" }
+  tool_path {name: "ld" path: "/usr/bin/ld" }
+  tool_path {name: "cpp" path: "/usr/bin/cpp" }
+  tool_path {name: "gcc" path: "/usr/local/bin/clang" }
+  tool_path {name: "dwp" path: "/usr/bin/dwp" }
+  tool_path {name: "gcov" path: "None" }
+  tool_path {name: "nm" path: "/usr/bin/nm" }
+  tool_path {name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path {name: "objdump" path: "/usr/bin/objdump" }
+  tool_path {name: "strip" path: "/usr/bin/strip" }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "-g0"
+    compiler_flag: "-O2"
+    compiler_flag: "-D_FORTIFY_SOURCE=1"
+    compiler_flag: "-DNDEBUG"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+
+
+    feature {
+      name: 'coverage'
+      provides: 'profile'
+      flag_set {
+        action: 'preprocess-assemble'
+        action: 'c-compile'
+        action: 'c++-compile'
+        action: 'c++-header-parsing'
+        action: 'c++-module-compile'
+        flag_group {
+        flag: '--coverage'
+      }
+      }
+      flag_set {
+        action: 'c++-link-dynamic-library'
+        action: 'c++-link-nodeps-dynamic-library'
+        action: 'c++-link-executable'
+        flag_group {
+        flag: '--coverage'
+      }
+      }
+    }
+  
+
+  feature {
+    name: 'fdo_optimize'
+    provides: 'profile'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      expand_if_all_available: 'fdo_profile_path'
+      flag_group {
+        flag: '-fprofile-use=%{fdo_profile_path}'
+        flag: '-fprofile-correction',
+      }
+    }
+  }
+}
+
+toolchain {
+  toolchain_identifier: "msys_x64_mingw"
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "mingw-gcc"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "mingw"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+
+
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "msvc_x64"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+  default_python_version: "python2.7"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_gold_linker: false
+  supports_start_end_lib: false
+  supports_interface_shared_objects: true
+  supports_incremental_linker: false
+  supports_normalizing_ar: true
+  needsPic: false
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0601"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  artifact_name_pattern {
+     category_name: 'object_file'
+     prefix: ''
+     extension: '.obj'
+  }
+
+  artifact_name_pattern {
+     category_name: 'static_library'
+     prefix: ''
+     extension: '.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'alwayslink_static_library'
+     prefix: ''
+     extension: '.lo.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+  artifact_name_pattern {
+     category_name: 'dynamic_library'
+     prefix: ''
+     extension: '.dll'
+  }
+
+  artifact_name_pattern {
+     category_name: 'interface_library'
+     prefix: ''
+     extension: '.if.lib'
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+    implies: 'msvc_compile_env'
+    implies: 'msvc_link_env'
+  }
+
+  feature {
+    name: "msvc_compile_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_link_env"
+    env_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: ""
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: ""
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2" # Implies /Og /Oi /Ot /Oy /Ob2 /Gs /GF /Gy
+      }
+    }
+    implies: 'frame_pointer'
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  # Must come after /O1, /O2 and /Ox.
+  feature {
+    name: "frame_pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "/Oy-"
+      }
+    }
+  }
+
+  # Remove assert/DCHECKs in opt mode.
+  # You can have them back with --features=-disable_assertions.
+  feature {
+    name: 'disable_assertions'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        # TODO: detect clang on Windows and use "-Wno-builtin-macro-redefined"
+        flag: "/wd4117" # Trying to define or undefine a predefined macro
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: 'treat_warnings_as_errors'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/WX"
+      }
+    }
+  }
+
+  # Trade slower build time for smaller binary
+  feature {
+    name: 'smaller_binary'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/Gy" # Enable function-level linking (-ffunction-sections)
+        flag: "/Gw" # Optimize global data (-fdata-sections)
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library',
+      action: 'c++-link-nodeps-dynamic-library'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: '/OPT:ICF' # Fold identical functions
+        flag: '/OPT:REF' # Eliminate unreferenced functions and data
+      }
+    }
+  }
+
+  # Suppress warnings that most users do not care
+  feature {
+    name: 'ignore_noisy_warnings'
+    enabled: true
+    flag_set {
+      action: 'c++-link-static-library'
+      flag_group {
+        # Suppress 'object file does not define any public symbols' warning
+        flag: '/ignore:4221'
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..bc05b4c36ff49949e18a9c6f08b03d541149ede1
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for cc_autoconf rule
+workspace(name = "local_config_cc")
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh b/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh
new file mode 100755
index 0000000000000000000000000000000000000000..42a751dccfb0d9c7115ef5ed5483335c0e0f129b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Copyright 2015 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Ship the environment to the C++ action
+#
+set -eu
+
+# Set-up the environment
+
+
+# Call the C++ compiler
+/usr/local/bin/clang "$@"
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl b/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..45c0285d232806672e93cb6d9b860b2693e75d3d
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
@@ -0,0 +1,23 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Skylark rule that stubs a toolchain."""
+
+def _dummy_toolchain_impl(ctx):
+    ctx = ctx  # unused argument
+    toolchain = platform_common.ToolchainInfo()
+    return [toolchain]
+
+dummy_toolchain = rule(_dummy_toolchain_impl, attrs = {})
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc b/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc
new file mode 100755
index 0000000000000000000000000000000000000000..c272dabaeb6829b5ded592b4b37194ef3af364dd
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc
@@ -0,0 +1 @@
+int main() {}
\ No newline at end of file
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..77eaa4d5121c32f2a4d58f3bb0fb470b72c9f0f6
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
@@ -0,0 +1,205 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,
+)
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],
+    includes = ["python_include"],
+    deps = select({
+        ":windows": [":python_lib"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "python_include",
+    outs = [
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
+        "python_include/accu.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
+        "python_include/bitset.h",
+        "python_include/bltinmodule.h",
+        "python_include/boolobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
+        "python_include/classobject.h",
+        "python_include/code.h",
+        "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
+        "python_include/dtoa.h",
+        "python_include/dynamic_annotations.h",
+        "python_include/enumobject.h",
+        "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/fileutils.h",
+        "python_include/floatobject.h",
+        "python_include/frameobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
+        "python_include/graminit.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
+        "python_include/intrcheck.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
+        "python_include/modsupport.h",
+        "python_include/moduleobject.h",
+        "python_include/namespaceobject.h",
+        "python_include/node.h",
+        "python_include/numpy/__multiarray_api.h",
+        "python_include/numpy/__ufunc_api.h",
+        "python_include/numpy/_neighborhood_iterator_imp.h",
+        "python_include/numpy/_numpyconfig.h",
+        "python_include/numpy/arrayobject.h",
+        "python_include/numpy/arrayscalars.h",
+        "python_include/numpy/halffloat.h",
+        "python_include/numpy/multiarray_api.txt",
+        "python_include/numpy/ndarrayobject.h",
+        "python_include/numpy/ndarraytypes.h",
+        "python_include/numpy/noprefix.h",
+        "python_include/numpy/npy_1_7_deprecated_api.h",
+        "python_include/numpy/npy_3kcompat.h",
+        "python_include/numpy/npy_common.h",
+        "python_include/numpy/npy_cpu.h",
+        "python_include/numpy/npy_endian.h",
+        "python_include/numpy/npy_interrupt.h",
+        "python_include/numpy/npy_math.h",
+        "python_include/numpy/npy_no_deprecated_api.h",
+        "python_include/numpy/npy_os.h",
+        "python_include/numpy/numpyconfig.h",
+        "python_include/numpy/old_defines.h",
+        "python_include/numpy/oldnumeric.h",
+        "python_include/numpy/ufunc_api.txt",
+        "python_include/numpy/ufuncobject.h",
+        "python_include/numpy/utils.h",
+        "python_include/object.h",
+        "python_include/objimpl.h",
+        "python_include/odictobject.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pyatomic.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
+        "python_include/pydebug.h",
+        "python_include/pyerrors.h",
+        "python_include/pyexpat.h",
+        "python_include/pyfpe.h",
+        "python_include/pygetopt.h",
+        "python_include/pyhash.h",
+        "python_include/pylifecycle.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymacro.h",
+        "python_include/pymath.h",
+        "python_include/pymem.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrhex.h",
+        "python_include/pystrtod.h",
+        "python_include/pythonrun.h",
+        "python_include/pythread.h",
+        "python_include/pytime.h",
+        "python_include/rangeobject.h",
+        "python_include/setobject.h",
+        "python_include/sliceobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
+        "python_include/token.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/typeslots.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
+    ],
+    cmd = """
+cp -f "/usr/include/python3.5m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.5m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.5m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.5m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.5m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.5m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.5m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.5m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.5m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.5m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.5m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.5m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.5m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.5m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.5m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.5m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.5m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.5m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.5m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.5m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.5m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.5m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.5m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.5m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.5m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.5m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.5m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.5m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.5m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.5m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.5m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.5m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.5m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.5m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.5m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.5m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.5m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.5m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.5m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.5m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.5m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.5m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.5m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.5m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.5m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.5m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.5m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.5m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.5m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.5m/numpy/__multiarray_api.h" "$(@D)/python_include/numpy/__multiarray_api.h" && cp -f "/usr/include/python3.5m/numpy/__ufunc_api.h" "$(@D)/python_include/numpy/__ufunc_api.h" && cp -f "/usr/include/python3.5m/numpy/_neighborhood_iterator_imp.h" "$(@D)/python_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/include/python3.5m/numpy/_numpyconfig.h" "$(@D)/python_include/numpy/_numpyconfig.h" && cp -f "/usr/include/python3.5m/numpy/arrayobject.h" "$(@D)/python_include/numpy/arrayobject.h" && cp -f "/usr/include/python3.5m/numpy/arrayscalars.h" "$(@D)/python_include/numpy/arrayscalars.h" && cp -f "/usr/include/python3.5m/numpy/halffloat.h" "$(@D)/python_include/numpy/halffloat.h" && cp -f "/usr/include/python3.5m/numpy/multiarray_api.txt" "$(@D)/python_include/numpy/multiarray_api.txt" && cp -f "/usr/include/python3.5m/numpy/ndarrayobject.h" "$(@D)/python_include/numpy/ndarrayobject.h" && cp -f "/usr/include/python3.5m/numpy/ndarraytypes.h" "$(@D)/python_include/numpy/ndarraytypes.h" && cp -f "/usr/include/python3.5m/numpy/noprefix.h" "$(@D)/python_include/numpy/noprefix.h" && cp -f "/usr/include/python3.5m/numpy/npy_1_7_deprecated_api.h" "$(@D)/python_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/include/python3.5m/numpy/npy_3kcompat.h" "$(@D)/python_include/numpy/npy_3kcompat.h" && cp -f "/usr/include/python3.5m/numpy/npy_common.h" "$(@D)/python_include/numpy/npy_common.h" && cp -f "/usr/include/python3.5m/numpy/npy_cpu.h" "$(@D)/python_include/numpy/npy_cpu.h" && cp -f "/usr/include/python3.5m/numpy/npy_endian.h" "$(@D)/python_include/numpy/npy_endian.h" && cp -f "/usr/include/python3.5m/numpy/npy_interrupt.h" "$(@D)/python_include/numpy/npy_interrupt.h" && cp -f "/usr/include/python3.5m/numpy/npy_math.h" "$(@D)/python_include/numpy/npy_math.h" && cp -f "/usr/include/python3.5m/numpy/npy_no_deprecated_api.h" "$(@D)/python_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/include/python3.5m/numpy/npy_os.h" "$(@D)/python_include/numpy/npy_os.h" && cp -f "/usr/include/python3.5m/numpy/numpyconfig.h" "$(@D)/python_include/numpy/numpyconfig.h" && cp -f "/usr/include/python3.5m/numpy/old_defines.h" "$(@D)/python_include/numpy/old_defines.h" && cp -f "/usr/include/python3.5m/numpy/oldnumeric.h" "$(@D)/python_include/numpy/oldnumeric.h" && cp -f "/usr/include/python3.5m/numpy/ufunc_api.txt" "$(@D)/python_include/numpy/ufunc_api.txt" && cp -f "/usr/include/python3.5m/numpy/ufuncobject.h" "$(@D)/python_include/numpy/ufuncobject.h" && cp -f "/usr/include/python3.5m/numpy/utils.h" "$(@D)/python_include/numpy/utils.h" && cp -f "/usr/include/python3.5m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.5m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.5m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "/usr/include/python3.5m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.5m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.5m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.5m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.5m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.5m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.5m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.5m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.5m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.5m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.5m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.5m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.5m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.5m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.5m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.5m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.5m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.5m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.5m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "/usr/include/python3.5m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.5m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.5m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.5m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.5m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.5m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.5m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.5m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "/usr/include/python3.5m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.5m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.5m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.5m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.5m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.5m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.5m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.5m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.5m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.5m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.5m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.5m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.5m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.5m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.5m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.5m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.5m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.5m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.5m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+   """,
+)
+
+genrule(
+    name = "numpy_include",
+    outs = [
+        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/__ufunc_api.h",
+        "numpy_include/numpy/_neighborhood_iterator_imp.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
+        "numpy_include/numpy/ndarraytypes.h",
+        "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
+        "numpy_include/numpy/npy_3kcompat.h",
+        "numpy_include/numpy/npy_common.h",
+        "numpy_include/numpy/npy_cpu.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
+        "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
+    ],
+    cmd = """
+cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..1d298fefa3bf40b2c02605960d69c5974e9de7b7
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule
+workspace(name = "local_config_python")
diff --git a/third_party/toolchains/remote/BUILD b/third_party/toolchains/remote/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/toolchains/remote/BUILD.tpl b/third_party/toolchains/remote/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/toolchains/remote/configure.bzl b/third_party/toolchains/remote/configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..cc5b9842648e74a7aa5ac01721719abbd0752123
--- /dev/null
+++ b/third_party/toolchains/remote/configure.bzl
@@ -0,0 +1,43 @@
+"""Repository rule for remote GPU autoconfiguration.
+
+This rule creates the starlark file
+//third_party/toolchains/remote:execution.bzl
+providing the function `gpu_test_tags`.
+
+`gpu_test_tags` will return:
+
+  * `local`: if `REMOTE_GPU_TESTING` is false, allowing CPU tests to run
+    remotely and GPU tests to run locally in the same bazel invocation.
+  * `remote-gpu`: if `REMOTE_GPU_TESTING` is true; this allows rules to
+    set an execution requirement that enables a GPU-enabled remote platform.
+"""
+
+_REMOTE_GPU_TESTING = "REMOTE_GPU_TESTING"
+
+def _flag_enabled(repository_ctx, flag_name):
+    if flag_name not in repository_ctx.os.environ:
+        return False
+    return repository_ctx.os.environ[flag_name].strip() == "1"
+
+def _remote_execution_configure(repository_ctx):
+    # If we do not support remote gpu test execution, mark them as local, so we
+    # can combine remote builds with local gpu tests.
+    gpu_test_tags = "\"local\""
+    if _flag_enabled(repository_ctx, _REMOTE_GPU_TESTING):
+        gpu_test_tags = "\"remote-gpu\""
+    repository_ctx.template(
+        "remote_execution.bzl",
+        Label("//third_party/toolchains/remote:execution.bzl.tpl"),
+        {
+            "%{gpu_test_tags}": gpu_test_tags,
+        },
+    )
+    repository_ctx.template(
+        "BUILD",
+        Label("//third_party/toolchains/remote:BUILD.tpl"),
+    )
+
+remote_execution_configure = repository_rule(
+    implementation = _remote_execution_configure,
+    environ = [_REMOTE_GPU_TESTING],
+)
diff --git a/third_party/toolchains/remote/execution.bzl.tpl b/third_party/toolchains/remote/execution.bzl.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..18858cc0dc01fa50b70eb735768de613388dddda
--- /dev/null
+++ b/third_party/toolchains/remote/execution.bzl.tpl
@@ -0,0 +1,2 @@
+def gpu_test_tags():
+    return [%{gpu_test_tags}]